In [None]:
# ====================================================================
# STANDALONE EDA DATA PREPARATION
# ====================================================================

# Import required packages for comprehensive EDA
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from scipy.stats import skew, kurtosis
from sklearn.manifold import TSNE
from typing import Optional
import os
import pickle
import warnings
warnings.filterwarnings('ignore')

# Load data using pyreadr
import pyreadr

# Configuration
# NOTE(review): OUTPUT_PATH, VERSION and FAULT_INJECTION_STARTING_POINT are assigned
# again, with different values, by the following configuration cell. On a
# top-to-bottom run the later values are the ones the helper functions read.
# Base directory; the save_* helpers create "<OUTPUT_PATH>/data/..." beneath it.
OUTPUT_PATH = "../../output"
# Interpolated into output file names after a literal "v" (this value would yield "vv2").
VERSION = "v2"
# Column used to select a single simulation run.
SIMULATION_RUN_COLUMN_NAME = "simulationRun"
# Column holding the fault label; the plotting helpers select fault-free rows with value 0.
TARGET_VARIABLE_COLUMN_NAME = "faultNumber"
# Presumably the sample index at which the fault is injected -- TODO confirm against
# the dataset used (training vs. testing); not read by any code in this section.
FAULT_INJECTION_STARTING_POINT = 160

print("=== STANDALONE EDA DATA PREPARATION ===")
print("Loading raw data directly for exploratory analysis...")

# The EDA works on the untouched training sets, so both RData files are read as-is.
# The result of read_r is used below as a mapping from object name to DataFrame.
fault_free_training_dict = pyreadr.read_r(
    "../../code/Tennessee Eastman/data/TEP_FaultFree_Training.RData"
)
faulty_training_dict = pyreadr.read_r(
    "../../code/Tennessee Eastman/data/TEP_Faulty_Training.RData"
)

# Each file is expected to hold its table as the first stored object.
faulty_key = list(faulty_training_dict)[0]
faultfree_key = list(fault_free_training_dict)[0]

DF_F_TRAINING_RAW = faulty_training_dict[faulty_key]
DF_FF_TRAINING_RAW = fault_free_training_dict[faultfree_key]

# Short sanity report so a reader can verify what was loaded.
print("✓ Raw data loaded successfully!")
print(f"✓ Faulty training data shape: {DF_F_TRAINING_RAW.shape}")
print(f"✓ Fault-free training data shape: {DF_FF_TRAINING_RAW.shape}")
print(f"✓ Available columns: {DF_F_TRAINING_RAW.columns[:10].tolist()}...")  # first 10 only
print(f"✓ Unique fault numbers: {sorted(DF_F_TRAINING_RAW[TARGET_VARIABLE_COLUMN_NAME].unique())}")
print("=== EDA DATA PREPARATION COMPLETE ===\n")

In [None]:
# Configuration
# NOTE(review): this cell re-assigns VERSION, OUTPUT_PATH and
# FAULT_INJECTION_STARTING_POINT that the data-preparation cell above already set
# to different values; whichever cell ran last wins.
# Interpolated into output file names as "v<VERSION>".
VERSION = "1.00"
# Base directory for everything written by the save_* helpers (relative to the CWD).
OUTPUT_PATH = "output"
# Column holding the fault label (0 is used for fault-free rows by the plot helpers).
TARGET_VARIABLE_COLUMN_NAME = "faultNumber"
# Column identifying the simulation run.
SIMULATION_RUN_COLUMN_NAME = "simulationRun"
# The constants below are not read by any code visible in this section; their
# descriptions are inferred from the names only -- TODO confirm against the
# preprocessing code. The misspelled names (SKIPED, FAUTS) are left untouched
# because other cells or modules may reference them.
COLUMNS_TO_REMOVE = ["simulationRun", "sample"]
SKIPED_FAULTS = []
FAULTS_TO_BE_MERGED_TOGETHER = [3, 8, 9, 18, 15]
MERGE_FAUTS_TO_NUMBER = 3
# Presumably the sample index at which the fault is injected -- TODO confirm
# (the data-preparation cell uses 160 for the same name).
FAULT_INJECTION_STARTING_POINT = 25

In [None]:
def save_plot(plot_name: str,
              suffix: str = "",
              plot_path: str = "EDA") -> None:
    """Write the current matplotlib figure as a PNG below ``OUTPUT_PATH/data``.

    Parameters:
    - plot_name: Leading part of the file name.
    - suffix: Optional second part of the file name; omitted when empty.
    - plot_path: Sub-directory of ``OUTPUT_PATH/data`` that receives the file
      (created if missing).

    The file is named ``<plot_name>[_<suffix>]_v<VERSION>_<timestamp>.png``.
    """
    # NOTE(review): the timestamp is always empty, so every name ends in "_.png"
    # and repeated calls overwrite the previous file.
    stamp = ""
    target_dir = os.path.join(OUTPUT_PATH, "data", plot_path)
    os.makedirs(target_dir, exist_ok=True)

    stem = f"{plot_name}_{suffix}" if suffix else f"{plot_name}"
    target_file = os.path.join(target_dir, f"{stem}_v{VERSION}_{stamp}.png")

    plt.savefig(target_file, bbox_inches="tight", dpi=300)
    print(f"Plot saved: {target_file}")


def save_dataframe(df: pd.DataFrame, name: str, suffix: str = "") -> None:
    """Write ``df`` (including its index) as a CSV file below ``OUTPUT_PATH/data``.

    Parameters:
    - df: Table to persist.
    - name: Leading part of the file name.
    - suffix: Optional second part of the file name; omitted when empty.

    The file is named ``<name>[_<suffix>]_v<VERSION>_<timestamp>.csv``.
    """
    # NOTE(review): the timestamp is always empty, so every name ends in "_.csv".
    stamp = ""
    target_dir = os.path.join(OUTPUT_PATH, "data")
    os.makedirs(target_dir, exist_ok=True)

    stem = f"{name}_{suffix}" if suffix else f"{name}"
    target_file = os.path.join(target_dir, f"{stem}_v{VERSION}_{stamp}.csv")

    df.to_csv(target_file, index=True)
    print(f"Data saved: {target_file}")


def save_pickle(obj, name: str, suffix: str = "") -> None:
    """Serialise ``obj`` with pickle into a file below ``OUTPUT_PATH/data``.

    Parameters:
    - obj: Any picklable object.
    - name: Leading part of the file name.
    - suffix: Optional second part of the file name; omitted when empty.

    The file is named ``<name>[_<suffix>]_v<VERSION>_<timestamp>.pkl``.
    """
    # NOTE(review): the timestamp is always empty, so every name ends in "_.pkl".
    stamp = ""
    target_dir = os.path.join(OUTPUT_PATH, "data")
    os.makedirs(target_dir, exist_ok=True)

    stem = f"{name}_{suffix}" if suffix else f"{name}"
    target_file = os.path.join(target_dir, f"{stem}_v{VERSION}_{stamp}.pkl")

    with open(target_file, "wb") as handle:
        pickle.dump(obj, handle)
    print(f"Results saved: {target_file}")

In [None]:
def calculate_statistics_per_fault_run(
        df_faulty: pd.DataFrame, df_normal: pd.DataFrame) -> pd.DataFrame:
    """Compute summary statistics per feature for every (fault, simulation run) pair.

    The fault-free and faulty frames are concatenated and grouped by the fault
    label and simulation-run columns. For each group and each feature column the
    mean, standard deviation (pandas default), skewness and kurtosis (scipy
    defaults) and the lag-1 autocorrelation are recorded in long format.

    Parameters:
    - df_faulty: Faulty data; its columns define which features are analysed
      (everything except the fault label, the simulation run, "time" and "sample").
    - df_normal: Fault-free data with the same columns.

    Returns:
    - DataFrame with one row per (fault, run, feature). The columns are the fault
      label, the simulation run, "feature", "mean", "std", "skewness", "kurtosis"
      and "autocorr_1". The columns are present even when no row was produced.

    Side effects:
    - The result is written through ``save_dataframe`` (name "fault_statistics",
      suffix "EDA"). Nothing is cached: every call recomputes from its inputs.
    """
    meta_columns = {
        TARGET_VARIABLE_COLUMN_NAME, SIMULATION_RUN_COLUMN_NAME, "time", "sample"
    }
    features: list[str] = [
        col for col in df_faulty.columns if col not in meta_columns
    ]
    result_columns: list[str] = [
        TARGET_VARIABLE_COLUMN_NAME, SIMULATION_RUN_COLUMN_NAME, "feature",
        "mean", "std", "skewness", "kurtosis", "autocorr_1",
    ]

    df_all: pd.DataFrame = pd.concat([df_normal, df_faulty], ignore_index=True)

    rows: list[dict] = []
    grouped = df_all.groupby(
        [TARGET_VARIABLE_COLUMN_NAME, SIMULATION_RUN_COLUMN_NAME])
    for (fault, run), group in grouped:
        for feature in features:
            values: pd.Series = group[feature].dropna()
            if values.empty:
                # Feature has no observations in this run; skip instead of
                # emitting a row of NaNs.
                continue
            rows.append({
                TARGET_VARIABLE_COLUMN_NAME: fault,
                SIMULATION_RUN_COLUMN_NAME: run,
                "feature": feature,
                "mean": values.mean(),
                "std": values.std(),
                "skewness": skew(values),
                "kurtosis": kurtosis(values),
                # Autocorrelation needs at least two points.
                "autocorr_1": values.autocorr(lag=1) if len(values) > 1 else np.nan,
            })

    # Passing the column list keeps the schema stable when `rows` is empty, so
    # downstream filters on these columns do not fail on an empty result.
    result_df: pd.DataFrame = pd.DataFrame(rows, columns=result_columns)

    save_dataframe(df=result_df, name="fault_statistics", suffix="EDA")
    return result_df


# Compute the statistics on a reduced subset (simulation runs 1 <= run < 4).
# NOTE(review): the same computation is repeated in the
# "CALCULATING COMPREHENSIVE STATISTICS" cell further down.
statics_f_df = DF_F_TRAINING_RAW.loc[
    lambda frame: (frame[SIMULATION_RUN_COLUMN_NAME] >= 1)
    & (frame[SIMULATION_RUN_COLUMN_NAME] < 4)
]
statics_ff_df = DF_FF_TRAINING_RAW.loc[
    lambda frame: (frame[SIMULATION_RUN_COLUMN_NAME] >= 1)
    & (frame[SIMULATION_RUN_COLUMN_NAME] < 4)
]

stats_df = calculate_statistics_per_fault_run(
    df_faulty=statics_f_df,
    df_normal=statics_ff_df,
)
print(f"Statistics calculated for {len(stats_df)} feature-fault-run combinations")
stats_df.head()

In [None]:
def create_comprehensive_boxplots(df_fault: pd.DataFrame, df_normal: pd.DataFrame,
                                fault_number: int = 3, simulation_run: int = 10,
                                plot_type: str = "normal"):
    """
    Draw one boxplot per feature for a single simulation run and save the grid.

    Parameters:
    - df_fault: Faulty data (only read for plot_type "fault" and "combined").
    - df_normal: Normal data. Its columns from position 3 onward are taken as
      the feature columns for every plot type.
    - fault_number: Which fault to analyze.
    - simulation_run: Which simulation run to use.
    - plot_type: "normal" (fault-free rows only), "fault" (rows of the selected
      fault only) or "combined" (both, side by side in each panel).

    Raises:
    - ValueError: if plot_type is not one of the three supported values.

    The figure is written through save_plot as
    "boxplots_all_features_<title in lower snake case>". When the selection
    contains no rows, or there are no feature columns, a message is printed and
    nothing is drawn.
    """
    if plot_type not in ("normal", "fault", "combined"):
        raise ValueError(
            f"plot_type must be 'normal', 'fault' or 'combined', got {plot_type!r}")

    # (tick label, rows) pairs; only the frames the requested mode needs are touched.
    groups = []
    if plot_type in ("normal", "combined"):
        normal_rows = df_normal[
            (df_normal[TARGET_VARIABLE_COLUMN_NAME] == 0)
            & (df_normal[SIMULATION_RUN_COLUMN_NAME] == simulation_run)]
        groups.append(("Normal", normal_rows))
    if plot_type in ("fault", "combined"):
        fault_rows = df_fault[
            (df_fault[TARGET_VARIABLE_COLUMN_NAME] == fault_number)
            & (df_fault[SIMULATION_RUN_COLUMN_NAME] == simulation_run)]
        groups.append((f"Fault {fault_number}", fault_rows))

    if plot_type == "normal":
        title_suffix = "Normal Operation"
    elif plot_type == "fault":
        title_suffix = f"Fault {fault_number}"
    else:
        title_suffix = f"Normal vs Fault {fault_number}"

    missing = [label for label, rows in groups if rows.empty]
    if missing:
        print(f"No rows found for {', '.join(missing)} in simulation run "
              f"{simulation_run}; nothing to plot.")
        return

    # Positional selection: assumes the first three columns are metadata
    # (fault label, simulation run, sample) -- TODO confirm for other inputs.
    feature_columns = df_normal.columns[3:]
    num_features = len(feature_columns)
    if num_features == 0:
        print("No feature columns available; nothing to plot.")
        return

    # Grid with at most 4 columns; squeeze=False always yields a 2-D axes array,
    # so no special-casing of single-row / single-column layouts is needed.
    num_cols = min(4, num_features)
    num_rows = int(np.ceil(num_features / num_cols))
    fig, axes = plt.subplots(num_rows,
                             num_cols,
                             figsize=(4 * num_cols, 3 * num_rows),
                             squeeze=False)
    flat_axes = axes.ravel()
    fig.suptitle(f"Boxplots of All Features - {title_suffix}", fontsize=16, y=1.02)

    for ax, col in zip(flat_axes, feature_columns):
        # NaNs would turn the box statistics into NaN and blank the panel.
        samples = [rows[col].dropna().to_numpy() for _, rows in groups]
        ax.boxplot(
            samples,
            patch_artist=True,
            boxprops=dict(facecolor="lightblue", color="navy"),
            medianprops=dict(color="red"),
            whiskerprops=dict(color="gray"),
            capprops=dict(color="gray"),
            flierprops=dict(marker="o", color="darkorange", alpha=0.5),
        )
        if len(groups) > 1:
            ax.set_xticklabels([label for label, _ in groups])

        ax.grid(True, linestyle="--", alpha=0.5)
        ax.set_ylabel(col)
        ax.set_title(col.replace("_", " "))

    # Drop the unused panels of the last grid row.
    for ax in flat_axes[num_features:]:
        fig.delaxes(ax)

    plt.tight_layout()
    save_plot(f"boxplots_all_features_{title_suffix.lower().replace(' ', '_')}")
    plt.show()

In [None]:
def create_distribution_plots(df_fault: pd.DataFrame, df_normal: pd.DataFrame,
                             fault_number: int = 3, simulation_run: int = 1,
                             max_features: int = 10):
    """
    Create side-by-side distribution plots comparing fault vs normal data.

    Each selected feature gets one figure row: the left panel shows the rows of
    the chosen fault, the right panel the fault-free rows (label 0) of the same
    simulation run. Histograms are normalised to a density so that the y-axis
    matches its "Density" label and both panels are comparable.

    Parameters:
    - df_fault: Faulty data
    - df_normal: Normal data
    - fault_number: Which fault to analyze
    - simulation_run: Which simulation run to use
    - max_features: Maximum number of features to plot (to avoid overcrowding)

    The figure is written through save_plot. When the selection is empty or no
    feature is selected, a message is printed and nothing is drawn.
    """
    fault_rows = df_fault[
        (df_fault[TARGET_VARIABLE_COLUMN_NAME] == fault_number)
        & (df_fault[SIMULATION_RUN_COLUMN_NAME] == simulation_run)]
    normal_rows = df_normal[
        (df_normal[TARGET_VARIABLE_COLUMN_NAME] == 0)
        & (df_normal[SIMULATION_RUN_COLUMN_NAME] == simulation_run)]

    if fault_rows.empty or normal_rows.empty:
        print(f"No rows for fault {fault_number} / normal data in simulation run "
              f"{simulation_run}; nothing to plot.")
        return

    # Positional selection: assumes the first three columns are metadata
    # (fault label, simulation run, sample) -- TODO confirm for other inputs.
    selected_features = fault_rows.columns[3:3 + max_features]
    if len(selected_features) == 0:
        print("No features selected; nothing to plot.")
        return

    # squeeze=False keeps `axes` two-dimensional even for a single feature.
    fig, axes = plt.subplots(len(selected_features), 2,
                             figsize=(12, 6 * len(selected_features)),
                             squeeze=False)

    panels = (
        ("Faulty Data", fault_rows, "red"),
        ("Fault-Free Data", normal_rows, "blue"),
    )
    for row_axes, feature in zip(axes, selected_features):
        for ax, (label, rows, color) in zip(row_axes, panels):
            sns.histplot(
                rows[feature],
                kde=True,
                stat="density",
                ax=ax,
                color=color,
                label=label,
            )
            ax.set_title(f"{label} - {feature}")
            ax.set_xlabel(feature)
            ax.set_ylabel("Density")
            ax.legend()

    plt.tight_layout()
    save_plot(f"distribution_comparison_fault_{fault_number}_run_{simulation_run}")
    plt.show()

In [None]:
def plot_tsne_visualization(x_data: np.ndarray, y_labels: np.ndarray,
                           step: int = 50, title: str = "t-SNE Visualization"):
    """
    Visualize class separation and structure in high-dimensional process data using t-SNE embedding to 2D.

    Parameters:
    - x_data: 2-D feature matrix (rows are samples).
    - y_labels: 1D label array corresponding to the rows of x_data.
    - step: Downsampling factor to reduce computational load (default=50);
      every step-th row is kept.
    - title: Plot title

    Raises:
    - ValueError: if fewer than two samples remain after downsampling or the
      number of labels does not match the number of samples.

    The figure is written through save_plot as "tsne_visualization".
    NOTE(review): a same-named function defined later in the notebook replaces
    this one when the notebook is run top to bottom.
    """
    # Plain arrays avoid index alignment between pandas labels and the
    # positional embedding coordinates inside seaborn.
    x_down = np.asarray(x_data)[::step, :]
    y_down = np.asarray(y_labels)[::step]

    n_samples = x_down.shape[0]
    if n_samples < 2:
        raise ValueError(
            f"t-SNE needs at least 2 samples after downsampling, got {n_samples}")
    if len(y_down) != n_samples:
        raise ValueError(
            f"x_data and y_labels disagree after downsampling: {n_samples} vs {len(y_down)}")

    print(f"Applying t-SNE to {n_samples} samples with {x_down.shape[1]} features...")

    # scikit-learn requires perplexity < n_samples; keep its default of 30
    # whenever enough samples are available.
    perplexity = min(30.0, float(n_samples - 1))
    x_embedded = TSNE(n_components=2, learning_rate="auto", init="random",
                      random_state=42, perplexity=perplexity).fit_transform(x_down)

    fig, ax = plt.subplots(figsize=(12, 8))
    sns.scatterplot(
        x=x_embedded[:, 0],
        y=x_embedded[:, 1],
        hue=y_down,
        style=y_down,
        palette="bright",
        edgecolor="black",
        alpha=0.7,
        ax=ax,
    )
    ax.legend(bbox_to_anchor=(1.1, 1.05))
    ax.set_title(title)
    ax.set_xlabel("t-SNE Component 1")
    ax.set_ylabel("t-SNE Component 2")
    save_plot("tsne_visualization")
    plt.show()

In [None]:
def plot_fault_injection_segment(
    df: pd.DataFrame,
    fault_injection_index: int,
    features: list[str],
    fault_number: int,
    point_start: int = 0,
    point_end: int = 500,
) -> None:
    """
    Plot feature time series before and after fault injection.

    One panel is drawn per feature. Each panel shows the signal inside the
    window, a vertical marker at the injection index and, when both sides of
    the marker contain data, the median before and after the injection.

    Parameters:
    - df: DataFrame containing time series (expected to hold one simulation
      run; rows of the selected fault are taken in their stored order).
    - fault_injection_index: Time step where fault is injected.
    - features: List of feature names to plot.
    - fault_number: Fault number to analyze
    - point_start: Starting time point (row position, assumed non-negative)
    - point_end: Ending time point (exclusive)

    The figure is written through save_plot. When no feature is given or the
    window contains no rows, a message is printed and nothing is drawn.
    """
    # Filter only the rows with the selected fault number
    df_filtered = df[df[TARGET_VARIABLE_COLUMN_NAME] == fault_number].reset_index(drop=True)
    df_window = df_filtered.iloc[point_start:point_end]

    n_features = len(features)
    if n_features == 0 or df_window.empty:
        print(f"No data to plot for fault {fault_number} in window "
              f"[{point_start}, {point_end}).")
        return

    # Build the x positions from the rows actually available, so a frame that is
    # shorter than the requested window still plots instead of raising on a
    # length mismatch.
    x_range = np.arange(point_start, point_start + len(df_window))
    before_mask = x_range < fault_injection_index
    # Every 20th time index gets a faint guide line for readability.
    guide_positions = x_range[x_range % 20 == 0]

    fig, axes = plt.subplots(n_features, 1, figsize=(15, 3 * n_features),
                             sharex=True, squeeze=False)
    axes = axes[:, 0]

    for i, feature in enumerate(features):
        ax = axes[i]
        y = df_window[feature].to_numpy()
        ax.plot(x_range, y, label=feature, color="black")

        for x in guide_positions:
            ax.axvline(x, color="gray", linestyle=":", linewidth=0.5, alpha=0.7)

        # Vertical fault injection marker
        ax.axvline(
            fault_injection_index,
            color="red",
            linestyle="--",
            linewidth=2,
            label="Fault Injected",
        )

        # Annotation: x in data coordinates, y as a fraction of the axes height.
        # Placing it at a multiple of the upper y-limit would put the text
        # outside the panel for signals that are not close to zero.
        ax.text(
            fault_injection_index,
            0.9,
            f"Fault Injection\nt={fault_injection_index}",
            transform=ax.get_xaxis_transform(),
            color="red",
            fontsize=10,
            ha="center",
            va="top",
            bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.8)
        )

        # Median before/after the injection, only when both sides have data
        before = y[before_mask]
        after = y[~before_mask]
        if len(before) > 0 and len(after) > 0:
            ax.axhline(np.median(before), color="blue", linestyle=":", alpha=0.7,
                       label="Median Before")
            ax.axhline(np.median(after), color="green", linestyle=":", alpha=0.7,
                       label="Median After")

        ax.set_ylabel(feature)
        ax.grid(True, alpha=0.3)

        # One legend is enough; every panel uses the same styling.
        if i == 0:
            ax.legend(loc="best")

    axes[-1].set_xlabel("Time Index")
    fig.suptitle(f"Feature Response Around Fault {fault_number} Injection", fontsize=14)
    plt.tight_layout()
    save_plot(f"fault_injection_analysis_fault_{fault_number}")
    plt.show()


def plot_boxplots_all_features(df, title_suffix=""):
    """Plot one boxplot per feature column of ``df`` in a grid and save it.

    Parameters:
    - df: DataFrame to plot. When it has more than three columns, the first
      three are skipped (assumed to be metadata -- TODO confirm for other
      inputs); otherwise all columns are plotted.
    - title_suffix: Text appended to the figure title; with spaces replaced by
      underscores it is also used as the file-name suffix.

    The figure is written through save_plot as "boxplots_all_features". A frame
    without columns prints a message and draws nothing.
    """
    feature_columns = df.columns[3:] if len(df.columns) > 3 else df.columns
    num_features = len(feature_columns)
    if num_features == 0:
        print("No feature columns available; nothing to plot.")
        return

    # Grid with at most 4 columns; squeeze=False always yields a 2-D axes array.
    num_cols = min(4, num_features)
    num_rows = int(np.ceil(num_features / num_cols))

    fig, axes = plt.subplots(num_rows,
                             num_cols,
                             figsize=(4 * num_cols, 3 * num_rows),
                             squeeze=False)
    flat_axes = axes.ravel()
    fig.suptitle(f"Boxplots of All Features {title_suffix}",
                 fontsize=16,
                 y=1.02)

    for ax, col in zip(flat_axes, feature_columns):
        ax.boxplot(
            # NaNs would turn the box statistics into NaN and blank the panel.
            df[col].dropna().to_numpy(),
            patch_artist=True,
            boxprops=dict(facecolor="lightblue", color="navy"),
            medianprops=dict(color="red"),
            whiskerprops=dict(color="gray"),
            capprops=dict(color="gray"),
            flierprops=dict(marker="o", color="darkorange", alpha=0.5),
        )

        ax.grid(True, linestyle="--", alpha=0.5)
        ax.set_ylabel(col)
        ax.set_title(col.replace("_", " "))

    # Remove the unused panels of the last grid row.
    for ax in flat_axes[num_features:]:
        fig.delaxes(ax)

    plt.tight_layout()
    save_plot("boxplots_all_features", title_suffix.replace(" ", "_"))
    plt.show()

In [None]:
def plot_fault_vs_normal_segment(
    df_fault: pd.DataFrame,
    df_normal: pd.DataFrame,
    features: list[str],
    point_start: int,
    point_end: int,
    fault_number: int,
    fault_injection_index: int = 160,
) -> None:
    """
    Compare fault vs normal time series for selected features.

    One panel is drawn per feature with both signals, their medians inside the
    window, and a marker at the injection index when it lies inside the window.

    Parameters:
    - df_fault: DataFrame with faults (should include faultNumber column).
    - df_normal: Normal (non-faulty) baseline DataFrame. It is sliced by row
      position without any filtering, so it should already contain a single
      simulation run.
    - features: List of feature names to compare.
    - point_start: Start index of window (row position, assumed non-negative).
    - point_end: End index of window (exclusive).
    - fault_number: Fault number to extract from df_fault.
    - fault_injection_index: When the fault was injected. NOTE(review): the
      default of 160 is hard-coded here and is not tied to
      FAULT_INJECTION_STARTING_POINT -- confirm it suits the dataset passed in.

    The figure is written through save_plot. When no feature is given or either
    window contains no rows, a message is printed and nothing is drawn.
    """
    df_fault_filtered = df_fault[df_fault[TARGET_VARIABLE_COLUMN_NAME] == fault_number].reset_index(drop=True)
    df_fault_window = df_fault_filtered.iloc[point_start:point_end]
    df_normal_window = df_normal.iloc[point_start:point_end]

    n_features = len(features)
    if n_features == 0 or df_fault_window.empty or df_normal_window.empty:
        print(f"No data to plot for fault {fault_number} in window "
              f"[{point_start}, {point_end}).")
        return

    # Each series gets x positions matching its own length, so frames shorter
    # than the requested window (or of different lengths) still plot.
    x_fault = np.arange(point_start, point_start + len(df_fault_window))
    x_normal = np.arange(point_start, point_start + len(df_normal_window))
    x_span = x_fault if len(x_fault) >= len(x_normal) else x_normal
    # Every 10th index gets a faint guide line for reference.
    guide_positions = x_span[x_span % 10 == 0]
    # point_end is exclusive, so an injection exactly at point_end is outside.
    marker_in_window = point_start <= fault_injection_index < point_end

    fig, axes = plt.subplots(n_features, 1, figsize=(15, 3 * n_features),
                             sharex=True, squeeze=False)
    axes = axes[:, 0]

    for i, feature in enumerate(features):
        ax = axes[i]

        y_fault = df_fault_window[feature].to_numpy()
        y_normal = df_normal_window[feature].to_numpy()

        ax.plot(x_fault, y_fault, label=f"Fault {fault_number}", color="red", linewidth=2)
        ax.plot(x_normal, y_normal, label="Normal", color="blue", linewidth=2)

        # Median lines
        ax.axhline(
            np.median(y_normal),
            color="blue",
            linestyle="--",
            alpha=0.7,
            label="Normal Median",
        )
        ax.axhline(
            np.median(y_fault),
            color="red",
            linestyle="--",
            alpha=0.7,
            label="Fault Median",
        )

        # Fault injection marker
        if marker_in_window:
            ax.axvline(
                fault_injection_index,
                color="green",
                linestyle="-",
                linewidth=3,
                alpha=0.8,
                label="Fault Injected",
            )

        for x in guide_positions:
            ax.axvline(x, color="gray", linestyle=":", linewidth=0.5, alpha=0.5)

        ax.set_ylabel(feature)
        ax.grid(True, alpha=0.3)

        # One legend is enough; every panel uses the same styling.
        if i == 0:
            ax.legend(loc="best")

    axes[-1].set_xlabel("Time Index")
    fig.suptitle(f"Fault {fault_number} vs Normal Comparison", fontsize=14)
    plt.tight_layout()
    save_plot(f"fault_vs_normal_comparison_fault_{fault_number}")
    plt.show()


def compare_faults_stat(
    stats_df: pd.DataFrame,
    fault_a: int,
    fault_b: int,
    run_id: int,
    stat: str = "mean",
) -> pd.DataFrame:
    """
    Compare statistical measures between two faults for a specific run.

    Parameters:
    - stats_df: Long-format statistics table with the columns "faultNumber",
      "simulationRun", "feature" and the column named by `stat`.
    - fault_a: First fault number.
    - fault_b: Second fault number.
    - run_id: Simulation run both faults are taken from.
    - stat: Name of the statistic column to compare (default "mean").

    Returns:
    - DataFrame sorted by feature with the columns "feature", "fault_<a>",
      "fault_<b>" and "delta" (= fault_a value minus fault_b value). Features
      present for only one fault are kept; their missing value and delta are
      NaN. When both fault numbers are equal, the statistic column appears once
      and "delta" is 0 wherever the statistic is defined.

    Raises:
    - KeyError: if `stat` or one of the required columns is missing.
    """
    col_a = f"fault_{fault_a}"
    col_b = f"fault_{fault_b}"
    in_run = stats_df["simulationRun"] == run_id

    def _stat_for(fault: int, column: str) -> pd.DataFrame:
        # Rows of one fault in the selected run, statistic renamed to `column`.
        rows = stats_df.loc[in_run & (stats_df["faultNumber"] == fault),
                            ["feature", stat]]
        return rows.rename(columns={stat: column})

    df_a = _stat_for(fault_a, col_a)
    if col_a == col_b:
        # Merging two frames whose value columns share a name would yield
        # "<name>_x"/"<name>_y" and break the delta lookup below. A fault
        # compared with itself needs no merge; sorting mirrors the key order
        # an outer merge produces.
        merged = df_a.sort_values("feature").reset_index(drop=True)
    else:
        merged = pd.merge(df_a, _stat_for(fault_b, col_b), on="feature", how="outer")

    merged["delta"] = merged[col_a] - merged[col_b]
    return merged

# Comprehensive Exploratory Data Analysis (EDA)

## Dataset Overview

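**The TEP variables (columns 4 to 55) were sampled every 3 minutes for a total duration of 25 hours in the training datasets and 48 hours in the testing datasets.
The faults were introduced 1 hour into the Faulty Training dataset and 8 hours into the Faulty Testing dataset, i.e. at sample 20 and sample 160 respectively at the 3-minute sampling rate.**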

In [None]:
# Basic dataset information
# Prints the shapes, the leading column names and the distinct fault labels and
# simulation runs of both raw training frames, then shows the first rows of each.
print("=== DATASET OVERVIEW ===")
print(f"Faulty training data shape: {DF_F_TRAINING_RAW.shape}")
print(f"Fault-free training data shape: {DF_FF_TRAINING_RAW.shape}")
print(f"\nColumn names: {list(DF_F_TRAINING_RAW.columns[:10])}...")  # Show first 10 columns

# Distinct labels / runs in the faulty set (sorted for a stable, readable listing)
print(f"\nUnique fault numbers in faulty data: {sorted(DF_F_TRAINING_RAW[TARGET_VARIABLE_COLUMN_NAME].unique())}")
print(f"Unique simulation runs in faulty data: {sorted(DF_F_TRAINING_RAW[SIMULATION_RUN_COLUMN_NAME].unique())}")

# Same listing for the fault-free set; the plotting helpers expect label 0 here
print(f"\nFault-free data fault numbers: {sorted(DF_FF_TRAINING_RAW[TARGET_VARIABLE_COLUMN_NAME].unique())}")
print(f"Fault-free simulation runs: {sorted(DF_FF_TRAINING_RAW[SIMULATION_RUN_COLUMN_NAME].unique())}")

# Display first few rows
# `display` is the IPython rich-display function available inside notebooks.
print("\n=== FIRST FEW ROWS OF FAULTY DATA ===")
display(DF_F_TRAINING_RAW.head())

print("\n=== FIRST FEW ROWS OF FAULT-FREE DATA ===")
display(DF_FF_TRAINING_RAW.head())

In [None]:
# Per-fault-run statistics on a reduced subset (simulation runs 1 <= run < 4)
# to keep the runtime short.
print("=== CALCULATING COMPREHENSIVE STATISTICS ===")
statics_f_df = DF_F_TRAINING_RAW.loc[
    lambda frame: (frame[SIMULATION_RUN_COLUMN_NAME] >= 1)
    & (frame[SIMULATION_RUN_COLUMN_NAME] < 4)
]
statics_ff_df = DF_FF_TRAINING_RAW.loc[
    lambda frame: (frame[SIMULATION_RUN_COLUMN_NAME] >= 1)
    & (frame[SIMULATION_RUN_COLUMN_NAME] < 4)
]

print(f"Reduced faulty data shape: {statics_f_df.shape}")
print(f"Reduced fault-free data shape: {statics_ff_df.shape}")

# Long-format table: one row per (fault, run, feature)
stats_df = calculate_statistics_per_fault_run(
    df_faulty=statics_f_df,
    df_normal=statics_ff_df,
)

print(f"\nStatistics dataframe shape: {stats_df.shape}")
print("Sample of calculated statistics:")
display(stats_df.head(10))

# No explicit save here: calculate_statistics_per_fault_run already writes the
# table through save_dataframe.

# Spot check: the rows belonging to fault 3 in simulation run 1
print("\n=== STATISTICS FOR FAULT 3, RUN 1 ===")
display(stats_df.query("faultNumber == 3 and simulationRun == 1").head())

## Boxplot Analysis

Comparing the distribution of features between normal and faulty operations.

In [None]:
# Boxplot grid for the fault-free rows of simulation run 10
print("Creating boxplots for normal operation...")
create_comprehensive_boxplots(
    df_fault=DF_F_TRAINING_RAW,
    df_normal=DF_FF_TRAINING_RAW,
    fault_number=3,
    simulation_run=10,
    plot_type="normal",
)

In [None]:
# Boxplot grid for the fault 3 rows of simulation run 10
print("Creating boxplots for fault 3...")
create_comprehensive_boxplots(
    df_fault=DF_F_TRAINING_RAW,
    df_normal=DF_FF_TRAINING_RAW,
    fault_number=3,
    simulation_run=10,
    plot_type="fault",
)

## Distribution Analysis

Comparing the distribution patterns between fault and fault-free data.

In [None]:
# Histograms of the first 6 features: fault 3 vs fault-free, simulation run 1
print("Creating distribution plots for fault 3 vs normal...")
create_distribution_plots(
    df_fault=DF_F_TRAINING_RAW,
    df_normal=DF_FF_TRAINING_RAW,
    fault_number=3,
    simulation_run=1,
    max_features=6,
)

# Exploratory Data Analysis (EDA)

This notebook performs a comprehensive exploratory data analysis of the Tennessee Eastman Process dataset. It is standalone: the raw data is loaded directly inside the notebook.

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from scipy.stats import skew, kurtosis
import pickle
import os
import pyreadr
from pathlib import Path
from ipywidgets import Button, HBox, VBox, Output, Dropdown
import sys
from datetime import datetime
from typing import Union, List, Tuple
from numpy.typing import NDArray

In [None]:
# Configuration
# NOTE(review): this cell repeats, value for value, the configuration cell near
# the top of the notebook; keeping a single copy would avoid the two drifting apart.
# Interpolated into output file names as "v<VERSION>".
VERSION = "1.00"
# Base directory for everything written by the save_* helpers (relative to the CWD).
OUTPUT_PATH = "output"
# Column holding the fault label (0 is used for fault-free rows by the plot helpers).
TARGET_VARIABLE_COLUMN_NAME = "faultNumber"
# Column identifying the simulation run.
SIMULATION_RUN_COLUMN_NAME = "simulationRun"
# The constants below are not read by any code visible in this section; their
# meaning is inferred from the names only -- TODO confirm. The misspelled names
# (SKIPED, FAUTS) are left untouched because other code may reference them.
COLUMNS_TO_REMOVE = ["simulationRun", "sample"]
SKIPED_FAULTS = []
FAULTS_TO_BE_MERGED_TOGETHER = [3, 8, 9, 18, 15]
MERGE_FAUTS_TO_NUMBER = 3
# Presumably the sample index at which the fault is injected -- TODO confirm.
FAULT_INJECTION_STARTING_POINT = 25

## Load Prepared Data

In [None]:
# # Data is already loaded above in the EDA data preparation section
# # Use the raw data that was loaded earlier

# print(f"Raw fault-free training shape: {DF_FF_TRAINING_RAW.shape}")
# print(f"Raw faulty training shape: {DF_F_TRAINING_RAW.shape}")

# # Create a simple combined dataset for EDA analysis
# print("Preparing data for EDA analysis...")

# # Select only a subset for EDA to speed up processing
# feature_cols = [col for col in DF_F_TRAINING_RAW.columns 
#                 if col not in [TARGET_VARIABLE_COLUMN_NAME, SIMULATION_RUN_COLUMN_NAME, "sample"]]

# # Use limited simulation runs for EDA
# X_TRAIN = DF_F_TRAINING_RAW[(DF_F_TRAINING_RAW[SIMULATION_RUN_COLUMN_NAME] >= 1) & 
#                            (DF_F_TRAINING_RAW[SIMULATION_RUN_COLUMN_NAME] < 3)][feature_cols].to_numpy()
# Y_TRAIN_DF = DF_F_TRAINING_RAW[(DF_F_TRAINING_RAW[SIMULATION_RUN_COLUMN_NAME] >= 1) & 
#                                (DF_F_TRAINING_RAW[SIMULATION_RUN_COLUMN_NAME] < 3)][TARGET_VARIABLE_COLUMN_NAME]

# print(f"EDA analysis features shape: {X_TRAIN.shape}")
# print(f"EDA analysis labels shape: {Y_TRAIN_DF.shape}")
# print("✓ Data prepared for EDA analysis")

## Statistical Analysis

## Visualization Functions

In [None]:
def plot_distribution_comparison(df_faulty, df_normal, fault_number=3, simulation_run=1):
    """Plot distribution comparison between faulty and normal data.

    For up to five feature columns (positions 3 to 7 of the selected faulty
    rows) one figure row is drawn: the left panel shows the rows of the chosen
    fault, the right panel the fault-free rows (label 0) of the same simulation
    run. Histograms are normalised to a density so the y-axis matches its
    "Density" label and both panels are comparable.

    Parameters:
    - df_faulty: Faulty data.
    - df_normal: Fault-free data.
    - fault_number: Which fault to analyze.
    - simulation_run: Which simulation run to use.

    The figure is written through save_plot. When the selection is empty or no
    feature column exists, a message is printed and nothing is drawn.
    """
    fault_rows = df_faulty[
        (df_faulty[TARGET_VARIABLE_COLUMN_NAME] == fault_number) &
        (df_faulty[SIMULATION_RUN_COLUMN_NAME] == simulation_run)
    ]
    normal_rows = df_normal[
        (df_normal[TARGET_VARIABLE_COLUMN_NAME] == 0) &
        (df_normal[SIMULATION_RUN_COLUMN_NAME] == simulation_run)
    ]

    if fault_rows.empty or normal_rows.empty:
        print(f"No rows for fault {fault_number} / normal data in simulation run "
              f"{simulation_run}; nothing to plot.")
        return

    # Positional selection: assumes the first three columns are metadata
    # (fault label, simulation run, sample) -- TODO confirm for other inputs.
    selected_features = fault_rows.columns[3:8]  # first 5 features for demo
    if len(selected_features) == 0:
        print("No features selected; nothing to plot.")
        return

    # squeeze=False keeps `axes` two-dimensional even for a single feature.
    fig, axes = plt.subplots(len(selected_features), 2,
                             figsize=(12, 6 * len(selected_features)),
                             squeeze=False)

    panels = (
        ("Faulty Data", fault_rows, "red"),
        ("Fault-Free Data", normal_rows, "blue"),
    )
    for row_axes, feature in zip(axes, selected_features):
        for ax, (label, rows, color) in zip(row_axes, panels):
            sns.histplot(
                rows[feature],
                kde=True,
                stat="density",
                ax=ax,
                color=color,
                label=label,
            )
            ax.set_title(f"{label} - {feature}")
            ax.set_xlabel(feature)
            ax.set_ylabel("Density")
            ax.legend()

    plt.tight_layout()
    save_plot("distribution_comparison", f"fault_{fault_number}_run_{simulation_run}")
    plt.show()

In [None]:
def plot_tsne_visualization(x_train: np.ndarray, y_labeled_train: np.ndarray, step: int = 50,
                            title: str = "t-SNE Visualization of Labeled Data") -> None:
    """Visualize class separation using a 2-D t-SNE embedding.

    Parameters:
    - x_train: 2-D feature matrix (rows are samples).
    - y_labeled_train: 1-D labels matching the rows of x_train.
    - step: Downsampling factor; every step-th row is embedded.
    - title: Plot title.

    Raises:
    - ValueError: if fewer than two samples remain after downsampling or the
      number of labels does not match the number of samples.

    The figure is written through save_plot as "tsne_visualization".
    NOTE(review): this definition replaces the same-named function defined
    earlier in the notebook, whose first two parameters are named differently.
    """
    # Plain arrays avoid index alignment between pandas labels and the
    # positional embedding coordinates inside seaborn.
    x_down = np.asarray(x_train)[::step, :]
    y_down = np.asarray(y_labeled_train)[::step]

    n_samples = x_down.shape[0]
    if n_samples < 2:
        raise ValueError(
            f"t-SNE needs at least 2 samples after downsampling, got {n_samples}")
    if len(y_down) != n_samples:
        raise ValueError(
            f"x_train and y_labeled_train disagree after downsampling: {n_samples} vs {len(y_down)}")

    # scikit-learn requires perplexity < n_samples; keep its default of 30
    # whenever enough samples are available.
    perplexity = min(30.0, float(n_samples - 1))
    x_embedded = TSNE(n_components=2, learning_rate="auto", init="random",
                      random_state=42, perplexity=perplexity).fit_transform(x_down)

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.scatterplot(
        x=x_embedded[:, 0],
        y=x_embedded[:, 1],
        hue=y_down,
        style=y_down,
        palette="bright",
        edgecolor="black",
        ax=ax,
    )
    ax.legend(bbox_to_anchor=(1.1, 1.05))
    ax.set_title(title)
    save_plot("tsne_visualization")
    plt.show()

In [None]:
def _draw_correlation_heatmap(corr, title: str, figsize: tuple, plot_suffix: str) -> None:
    """Draw the lower triangle of ``corr`` as an annotated heatmap, then save and show it."""
    # Hide the upper triangle (including the diagonal): it mirrors the lower one.
    mask = np.triu(np.ones_like(corr, dtype=bool))
    fig, ax = plt.subplots(figsize=figsize)
    sns.heatmap(
        corr,
        mask=mask,
        cmap="coolwarm",
        annot=True,
        fmt=".2f",
        square=True,
        linewidths=0.5,
        cbar_kws={"shrink": 0.5},
        ax=ax,
    )
    ax.set_title(title)
    # Lay out before saving so the file on disk matches what is displayed.
    fig.tight_layout()
    save_plot("correlation_matrix", plot_suffix)
    plt.show()


def plot_correlation_matrix(df, threshold: float = 0.95):
    """Plot correlation matrix with threshold filtering.

    If ``SIMULATION_RUN_COLUMN_NAME`` is a column of ``df``, only the rows of
    simulation run 1 are used and the first three columns are dropped by
    position (presumably the bookkeeping columns - confirm against the data
    layout); otherwise ``df`` is used unchanged.

    Args:
        df: DataFrame whose pairwise column correlations are plotted.
        threshold: When below 1.0, only features having at least one
            off-diagonal correlation with ``|corr| > threshold`` are shown.
            When 1.0 or above, the full matrix is plotted.

    Returns:
        None. The figure is saved through ``save_plot`` and shown. If no
        feature passes the threshold, a message is printed and nothing is
        drawn.
    """
    data = df[df[SIMULATION_RUN_COLUMN_NAME] == 1].iloc[:, 3:] if SIMULATION_RUN_COLUMN_NAME in df.columns else df
    corr = data.corr()

    if threshold >= 1.0:
        _draw_correlation_heatmap(corr, "Full Correlation Matrix", (20, 20), "full")
        return

    # Work on an explicit writable copy: mutating ``DataFrame.values`` in place
    # relies on it being a writable view, which pandas does not guarantee.
    abs_corr = corr.abs().to_numpy(copy=True)
    np.fill_diagonal(abs_corr, 0.0)  # ignore each feature's self-correlation

    has_strong_partner = (abs_corr > threshold).any(axis=0)
    selected_features_list = sorted(set(corr.columns[has_strong_partner]))

    if not selected_features_list:
        # Return early so no empty figure is created and shown.
        print(f"No correlations above threshold {threshold} found.")
        return

    filtered_corr = corr.loc[selected_features_list, selected_features_list]
    _draw_correlation_heatmap(
        filtered_corr,
        f"Correlation Matrix (|corr| > {threshold})",
        (12, 10),
        f"threshold_{threshold}",
    )

## Generate EDA Visualizations

In [None]:
# 1. Boxplots for fault-free data (fault number 0, simulation run 1)
fault_free_sample = DF_FF_TRAINING_RAW.loc[
    lambda frame: (frame[TARGET_VARIABLE_COLUMN_NAME] == 0)
    & (frame[SIMULATION_RUN_COLUMN_NAME] == 1)
]
plot_boxplots_all_features(fault_free_sample, "Fault_Free")

In [None]:
# 2. Distribution comparison: fault 3 next to fault-free data, simulation run 1
plot_distribution_comparison(
    DF_F_TRAINING_RAW,
    DF_FF_TRAINING_RAW,
    fault_number=3,
    simulation_run=1,
)

In [None]:
# 3. t-SNE visualization (warning: computationally intensive)
# NOTE(review): X_TRAIN / Y_TRAIN_DF are expected to come from an earlier cell -
# confirm they exist after Restart Kernel -> Run All.
print("Running t-SNE visualization... This may take a few minutes.")
plot_tsne_visualization(
    x_train=X_TRAIN,
    y_labeled_train=Y_TRAIN_DF,
    step=100,  # coarser stride than the default of 50, for faster computation
)

In [None]:
# 4. Scatter plot of first two features, coloured by fault label
scatter_fig, scatter_ax = plt.subplots(figsize=(10, 6))
scatter_points = scatter_ax.scatter(
    X_TRAIN[:, 0],
    X_TRAIN[:, 1],
    c=Y_TRAIN_DF,
    cmap="viridis",
    alpha=0.5,
)
scatter_fig.colorbar(scatter_points, ax=scatter_ax, label="Fault Number")
scatter_ax.set_title("Scatter Plot of First Two Features")
scatter_ax.set_xlabel("Feature 1")
scatter_ax.set_ylabel("Feature 2")
scatter_ax.grid()
save_plot("scatter_first_two_features")
plt.show()

In [None]:
# 5. Correlation analysis: show only features with some |corr| above 0.95
print("Generating correlation matrix with high correlations...")
plot_correlation_matrix(
    df=DF_FF_TRAINING_RAW,
    threshold=0.95,
)

## Time Series Analysis

In [None]:
def plot_time_series_sample(df, fault_number=0, simulation_run=1, features_to_plot=5):
    """Plot time series for selected features.

    Rows of ``df`` matching ``fault_number`` and ``simulation_run`` are
    selected, and the ``features_to_plot`` columns starting at position 3 are
    each drawn in their own subplot. The x-axis uses the ``"sample"`` column
    when present, otherwise the positional row index. The figure is saved
    through ``save_plot`` and shown.

    Args:
        df: DataFrame containing ``TARGET_VARIABLE_COLUMN_NAME`` and
            ``SIMULATION_RUN_COLUMN_NAME`` columns.
        fault_number: Value of the target column to select.
        simulation_run: Value of the simulation-run column to select.
        features_to_plot: Number of columns to plot, counted from position 3.

    Returns:
        None. If the selection yields no feature columns, a message is
        printed and no figure is created.
    """
    df_sample = df[
        (df[TARGET_VARIABLE_COLUMN_NAME] == fault_number) &
        (df[SIMULATION_RUN_COLUMN_NAME] == simulation_run)
    ]

    feature_columns = df_sample.columns[3:3 + features_to_plot]  # First few features
    n_features = len(feature_columns)
    if n_features == 0:
        # plt.subplots(0, 1) would raise; nothing to draw in this case.
        print("No feature columns available to plot.")
        return

    # The x-axis choice does not depend on the feature, so resolve it once.
    has_sample_column = "sample" in df_sample.columns
    x_values = df_sample["sample"] if has_sample_column else range(len(df_sample))
    x_label = "Sample" if has_sample_column else "Index"

    # squeeze=False always yields a 2-D axes array, so one feature needs no special case.
    fig, axes = plt.subplots(n_features, 1, figsize=(15, 3 * n_features), squeeze=False)

    for ax, col in zip(axes[:, 0], feature_columns):
        ax.plot(x_values, df_sample[col])
        ax.set_xlabel(x_label)
        ax.set_ylabel(col)
        ax.set_title(f"{str(col).replace('_', ' ')} - Fault {fault_number}, Run {simulation_run}")
        ax.grid(True)

    plt.tight_layout()
    save_plot("time_series", f"fault_{fault_number}_run_{simulation_run}")
    plt.show()

# Plot time series for normal operation (only when a "sample" column exists)
if "sample" in DF_FF_TRAINING_RAW.columns:
    plot_time_series_sample(
        df=DF_FF_TRAINING_RAW,
        fault_number=0,
        simulation_run=1,
    )

## Summary Statistics Export

In [None]:
# Generate and save summary statistics
# Feature columns are chosen by name, never by position: dropping "the first
# three columns" removes a real feature whenever one of the bookkeeping
# columns (e.g. "sample") is absent from the frame.
metadata_columns = {TARGET_VARIABLE_COLUMN_NAME, SIMULATION_RUN_COLUMN_NAME, "sample", "time"}
feature_cols = [col for col in DF_FF_TRAINING_RAW.columns if col not in metadata_columns]

summary_stats_normal = DF_FF_TRAINING_RAW[feature_cols].describe()
save_dataframe(summary_stats_normal, "summary_statistics_normal", "EDA")

# Same feature list applied to the faulty frame.
summary_stats_faulty = DF_F_TRAINING_RAW[feature_cols].describe()
save_dataframe(summary_stats_faulty, "summary_statistics_faulty", "EDA")

# Class distribution: number of rows per fault number, ordered by fault number
class_distribution = DF_F_TRAINING_RAW[TARGET_VARIABLE_COLUMN_NAME].value_counts().sort_index()
class_dist_df = pd.DataFrame({
    'Fault_Number': class_distribution.index,
    'Count': class_distribution.values
})
save_dataframe(class_dist_df, "class_distribution", "EDA")

print("\n=== EDA Summary ===")
print(f"Normal data shape: {DF_FF_TRAINING_RAW.shape}")
print(f"Faulty data shape: {DF_F_TRAINING_RAW.shape}")
print(f"Number of features: {len(feature_cols)}")
print("\nFault distribution:")
print(class_dist_df)

print("\n=== EDA Complete ===")
print(f"All plots and statistics saved to: {OUTPUT_PATH}/{VERSION}/")
print("Files generated:")
print("- fault_statistics_EDA_*.csv")
print("- summary_statistics_*.csv")
print("- class_distribution_*.csv")
print("- Various plots in EDA/ subfolder")