# Data Loading and Preparation

This notebook handles data loading, preprocessing, and standardization for the Tennessee Eastman Process dataset.

In [None]:
import sys

def check_python_version() -> None:
    """Raise RuntimeError unless the running interpreter is Python 3.11.

    Only the major and minor components are compared; the micro component
    is used solely to build the error message.

    Raises:
        RuntimeError: if the interpreter's major.minor version is not 3.11.
    """
    wanted: tuple[int, int] = (3, 11)
    major, minor, micro = sys.version_info[:3]

    # Guard clause: nothing to report when the version matches.
    if (major, minor) == wanted:
        return

    raise RuntimeError(
        f"Python {wanted[0]}.{wanted[1]}.xx is required, "
        f"but you are using {major}.{minor}.{micro}"
    )

# Run the version check immediately: it raises RuntimeError unless the kernel is Python 3.11.
check_python_version()

In [None]:
# Import libraries
import pandas as pd
import pyreadr
import numpy as np
import os
from datetime import datetime
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
import pickle

In [None]:
# Global Configuration
# Version tag embedded in the name of every exported file.
VERSION = "1.00"
# Root directory for exports; the save helpers write into its "data" subfolder.
OUTPUT_PATH = "output"
# Column holding the class label; the value 0 is mapped to the "normal" class
# when the binary anomaly labels are built.
TARGET_VARIABLE_COLUMN_NAME = "faultNumber"
# Column used to select individual simulation runs when reducing the data.
SIMULATION_RUN_COLUMN_NAME = "simulationRun"
# Columns dropped from the reduced frames after the run/sample filtering.
COLUMNS_TO_REMOVE = ["simulationRun", "sample"]
# Fault numbers excluded from the faulty sets (an empty list keeps every fault).
# NOTE: the name is misspelled ("SKIPED"); kept because other cells reference it.
SKIPED_FAULTS = []
# Fault labels that are replaced by MERGE_FAUTS_TO_NUMBER in the merged-faults dataset.
FAULTS_TO_BE_MERGED_TOGETHER = [3, 8, 9, 18, 15]
# Label assigned to every fault listed above.
# NOTE: the name is misspelled ("FAUTS"); kept because other cells reference it.
MERGE_FAUTS_TO_NUMBER = 3
# Faulty rows are kept only when their "sample" value is greater than this threshold.
FAULT_INJECTION_STARTING_POINT = 25

In [None]:
def save_dataframe(df: pd.DataFrame, name: str, suffix: str = "") -> None:
    """Write ``df`` as CSV into ``<OUTPUT_PATH>/data`` using a versioned file name.

    The file name is ``{name}_{suffix}_v{VERSION}_{timestamp}.csv``; the
    ``_{suffix}`` part is left out when ``suffix`` is empty.  The timestamp
    component is currently an empty string, so the name ends with ``_.csv``.

    Args:
        df: frame to export (the index is written as well).
        name: base name of the file.
        suffix: optional extra tag placed after ``name``.
    """
    stamp: str = ""
    target_dir: str = os.path.join(OUTPUT_PATH, "data")
    os.makedirs(target_dir, exist_ok=True)

    # Build the "name[_suffix]" stem first, then append version and timestamp.
    stem: str = f"{name}_{suffix}" if suffix else name
    csv_path: str = os.path.join(target_dir, f"{stem}_v{VERSION}_{stamp}.csv")

    df.to_csv(csv_path, index=True)
    print(f"Data saved: {csv_path}")

def save_pickle(obj, name: str, suffix: str = "") -> None:
    """Pickle ``obj`` into ``<OUTPUT_PATH>/data`` using a versioned file name.

    The file name is ``{name}_{suffix}_v{VERSION}_{timestamp}.pkl``; the
    ``_{suffix}`` part is left out when ``suffix`` is empty.  The timestamp
    component is currently an empty string, so the name ends with ``_.pkl``.

    Args:
        obj: any picklable object.
        name: base name of the file.
        suffix: optional extra tag placed after ``name``.
    """
    stamp: str = ""
    target_dir: str = os.path.join(OUTPUT_PATH, "data")
    os.makedirs(target_dir, exist_ok=True)

    # Build the "name[_suffix]" stem first, then append version and timestamp.
    stem: str = f"{name}_{suffix}" if suffix else name
    pkl_path: str = os.path.join(target_dir, f"{stem}_v{VERSION}_{stamp}.pkl")

    with open(pkl_path, "wb") as handle:
        pickle.dump(obj, handle)
    print(f"Saved: {pkl_path}")

## Load Raw Data

In [None]:
# Load each .RData file and pull the DataFrame out of the returned mapping.
# Paths are relative to the notebook's working directory.
fault_free_training_dict = pyreadr.read_r("data/TEP_FaultFree_Training.RData")
DF_FF_TRAINING_RAW = fault_free_training_dict["fault_free_training"]

fault_free_testing_dict = pyreadr.read_r("data/TEP_FaultFree_Testing.RData")
DF_FF_TEST_RAW = fault_free_testing_dict["fault_free_testing"]

faulty_training_dict = pyreadr.read_r("data/TEP_Faulty_Training.RData")
DF_F_TRAINING_RAW = faulty_training_dict["faulty_training"]

faulty_testing_dict = pyreadr.read_r("data/TEP_Faulty_Testing.RData")
DF_F_TEST_RAW = faulty_testing_dict["faulty_testing"]

# Report the (rows, columns) shape of every raw frame.
for raw_label, raw_frame in (
    ("Fault-free training", DF_FF_TRAINING_RAW),
    ("Fault-free testing", DF_FF_TEST_RAW),
    ("Faulty training", DF_F_TRAINING_RAW),
    ("Faulty testing", DF_F_TEST_RAW),
):
    print(f"{raw_label}: {raw_frame.shape}")

## Data Preprocessing

In [None]:
# Skip specified faults: drop every row whose label is listed in SKIPED_FAULTS
# and renumber the remaining rows from 0.
DF_F_TRAIN_SKIPPED_FAULTS = DF_F_TRAINING_RAW[~DF_F_TRAINING_RAW[TARGET_VARIABLE_COLUMN_NAME].isin(SKIPED_FAULTS)].reset_index(drop=True)
DF_F_TEST_SKIPPED_FAULTS = DF_F_TEST_RAW[~DF_F_TEST_RAW[TARGET_VARIABLE_COLUMN_NAME].isin(SKIPED_FAULTS)].reset_index(drop=True)


def _select_simulation_run(
    df: pd.DataFrame,
    lower: float,
    upper: float,
    min_sample: float | None = None,
) -> pd.DataFrame:
    """Return the rows of ``df`` whose simulation run lies strictly between two bounds.

    Args:
        df: frame containing the SIMULATION_RUN_COLUMN_NAME and "sample" columns.
        lower: exclusive lower bound on the simulation-run value.
        upper: exclusive upper bound on the simulation-run value.
        min_sample: when given, additionally keep only rows whose "sample"
            value is strictly greater than this threshold.

    Returns:
        A new frame with the selected rows and without COLUMNS_TO_REMOVE.
        The original row labels are preserved.
    """
    run_id = df[SIMULATION_RUN_COLUMN_NAME]
    keep = (run_id > lower) & (run_id < upper)
    if min_sample is not None:
        keep = keep & (df["sample"] > min_sample)
    # `columns=` already identifies the axis, so no `axis=` argument is needed.
    return df[keep].drop(columns=COLUMNS_TO_REMOVE)


# Reduce data for development (keeping balanced)
# Fault-free sets keep a single run; faulty sets keep a single run and only the
# samples after FAULT_INJECTION_STARTING_POINT.
# NOTE(review): the same sample cutoff is applied to the training and the
# testing files - confirm that the fault onset is identical in both.
DF_FF_TRAINING_REDUCED = _select_simulation_run(DF_FF_TRAINING_RAW, 0, 2)
DF_F_TRAINING_REDUCED = _select_simulation_run(
    DF_F_TRAIN_SKIPPED_FAULTS, 4, 6, min_sample=FAULT_INJECTION_STARTING_POINT
)

DF_FF_TEST_REDUCED = _select_simulation_run(DF_FF_TEST_RAW, 2, 4)
DF_F_TEST_REDUCED = _select_simulation_run(
    DF_F_TEST_SKIPPED_FAULTS, 5, 7, min_sample=FAULT_INJECTION_STARTING_POINT
)

print(f"Reduced fault-free training: {DF_FF_TRAINING_REDUCED.shape}")
print(f"Reduced faulty training: {DF_F_TRAINING_REDUCED.shape}")
print(f"Reduced fault-free testing: {DF_FF_TEST_REDUCED.shape}")
print(f"Reduced faulty testing: {DF_F_TEST_REDUCED.shape}")

In [None]:
def check_balance_difference(df1: pd.DataFrame, df2: pd.DataFrame, threshold: int = 100) -> None:
    """Print the row-count gap between two frames and fail if it is too large.

    Args:
        df1: first frame.
        df2: second frame.
        threshold: largest tolerated absolute difference in row count.

    Raises:
        ValueError: if the absolute row-count difference exceeds ``threshold``.
    """
    row_gap: int = abs(len(df1) - len(df2))
    print(f"Data difference: {row_gap}")
    if row_gap <= threshold:
        return
    raise ValueError(f"Data imbalance too large: difference = {row_gap} rows")

# Check balance: compare the fault-free row count with the row count of a single
# fault class (label 1) in the reduced faulty set, for training and for testing.
# The label column is addressed through TARGET_VARIABLE_COLUMN_NAME instead of a
# hard-coded name so the check follows the configuration.
check_balance_difference(
    DF_FF_TRAINING_REDUCED,
    DF_F_TRAINING_REDUCED[DF_F_TRAINING_REDUCED[TARGET_VARIABLE_COLUMN_NAME] == 1],
)
check_balance_difference(
    DF_FF_TEST_REDUCED,
    DF_F_TEST_REDUCED[DF_F_TEST_REDUCED[TARGET_VARIABLE_COLUMN_NAME] == 1],
)

## Prepare Datasets for Different Tasks

In [None]:
# 1. Supervised Classification Dataset
# Stack the fault-free rows on top of the faulty rows; the original row labels are kept.
DF_TRAINING_REDUCED_CONCATED = pd.concat([DF_FF_TRAINING_REDUCED, DF_F_TRAINING_REDUCED])
DF_TEST_REDUCED_CONCATED = pd.concat([DF_FF_TEST_REDUCED, DF_F_TEST_REDUCED])

# Split each combined frame into feature columns and the label column.
train_features_df = DF_TRAINING_REDUCED_CONCATED.drop(columns=[TARGET_VARIABLE_COLUMN_NAME])
test_features_df = DF_TEST_REDUCED_CONCATED.drop(columns=[TARGET_VARIABLE_COLUMN_NAME])
Y_TRAIN_DF = DF_TRAINING_REDUCED_CONCATED[TARGET_VARIABLE_COLUMN_NAME]
Y_TEST_REDUCED_DF = DF_TEST_REDUCED_CONCATED[TARGET_VARIABLE_COLUMN_NAME]

# Standardize data: statistics come from the training features only and are
# then applied unchanged to the test features.
sc = StandardScaler()
X_TRAIN = sc.fit_transform(train_features_df)
X_TEST_REDUCED = sc.transform(test_features_df)

# Encode labels as consecutive integers (fitted on the training labels).
le = LabelEncoder()
Y_TRAIN = le.fit_transform(Y_TRAIN_DF)
Y_TEST_REDUCED = le.transform(Y_TEST_REDUCED_DF)

# One-hot encode the raw fault numbers (fitted on the training labels).
encoder_1 = OneHotEncoder(sparse_output=False)
Y_reshaped = Y_TRAIN_DF.to_numpy().reshape(-1, 1)
Y_ENC_TRAIN = encoder_1.fit_transform(Y_reshaped)

Y_test_reshaped = Y_TEST_REDUCED_DF.to_numpy().reshape(-1, 1)
Y_ENC_TEST_REDUCED = encoder_1.transform(Y_test_reshaped)

print(f"Training features shape: {X_TRAIN.shape}")
print(f"Training labels shape: {Y_TRAIN.shape}")
print(f"Test features shape: {X_TEST_REDUCED.shape}")
print(f"Test labels shape: {Y_TEST_REDUCED.shape}")
print(f"Unique fault numbers in training: {Y_TRAIN_DF.unique()}")

In [None]:
# 2. Anomaly Detection Dataset (binary: normal vs fault)
# Training data: the scaler is fitted on the fault-free training rows only.
X_INCONTROL_TRAIN_REDUCED_DF = DF_FF_TRAINING_REDUCED.drop(columns=[TARGET_VARIABLE_COLUMN_NAME])
sc_anomaly = StandardScaler()
X_INCONTROL_TRAIN_REDUCED = sc_anomaly.fit_transform(X_INCONTROL_TRAIN_REDUCED_DF)

# Binary labels for anomaly detection: 0 where the fault number is 0, 1 otherwise.
# NOTE(review): the labels are derived from the combined (fault-free + faulty)
# frames, so they have more rows than X_INCONTROL_TRAIN_REDUCED - confirm which
# feature matrix they are meant to be paired with.
Y_TRAIN_ANOMALY_REDUCED_DF = DF_TRAINING_REDUCED_CONCATED[TARGET_VARIABLE_COLUMN_NAME].ne(0).astype("int64")
encoder_2 = OneHotEncoder(sparse_output=False)
Y_reshaped_anomaly = Y_TRAIN_ANOMALY_REDUCED_DF.to_numpy().reshape(-1, 1)
Y_ENC_ANOMALY_TRAIN_REDUCED = encoder_2.fit_transform(Y_reshaped_anomaly)

# Test data: fault-free and faulty rows are scaled separately with the same scaler.
X_INCONTROL_TEST_REDUCED_DF = DF_FF_TEST_REDUCED.drop(columns=[TARGET_VARIABLE_COLUMN_NAME])
X_INCONTROL_TEST_REDUCED = sc_anomaly.transform(X_INCONTROL_TEST_REDUCED_DF)

X_OUT_OF_CONTROL_TEST_REDUCED_DF = DF_F_TEST_REDUCED.drop(columns=[TARGET_VARIABLE_COLUMN_NAME])
X_OUT_OF_CONTROL_TEST_REDUCED = sc_anomaly.transform(X_OUT_OF_CONTROL_TEST_REDUCED_DF)

Y_TEST_ANOMALY_REDUCED_DF = DF_TEST_REDUCED_CONCATED[TARGET_VARIABLE_COLUMN_NAME].ne(0).astype("int64")
Y_test_reshaped_anomaly = Y_TEST_ANOMALY_REDUCED_DF.to_numpy().reshape(-1, 1)
Y_ENC_ANOMALY_TEST_REDUCED = encoder_2.transform(Y_test_reshaped_anomaly)

# Summary: report the arrays produced by this cell. The previous version printed
# X_TEST_REDUCED here, which is scaled with the classification scaler `sc`.
print(f"Anomaly detection - In-control training: {X_INCONTROL_TRAIN_REDUCED.shape}")
print(f"Anomaly detection - In-control test: {X_INCONTROL_TEST_REDUCED.shape}")
print(f"Anomaly detection - Out-of-control test: {X_OUT_OF_CONTROL_TEST_REDUCED.shape}")
print(f"Anomaly detection - Binary labels: {Y_TRAIN_ANOMALY_REDUCED_DF.value_counts()}")

In [None]:
# 3. Merged Faults Dataset
def _merge_fault_label(fault_number):
    """Return MERGE_FAUTS_TO_NUMBER for faults listed in FAULTS_TO_BE_MERGED_TOGETHER, else the input."""
    if fault_number in FAULTS_TO_BE_MERGED_TOGETHER:
        return MERGE_FAUTS_TO_NUMBER
    return fault_number


# Training labels: relabel, then fit the one-hot encoder on the merged labels.
Y_TRAIN_MERGED_FAULTS_REDUCED_DF = DF_TRAINING_REDUCED_CONCATED[TARGET_VARIABLE_COLUMN_NAME].apply(_merge_fault_label)
encoder_3 = OneHotEncoder(sparse_output=False)
Y_reshaped_train_merged = Y_TRAIN_MERGED_FAULTS_REDUCED_DF.to_numpy().reshape(-1, 1)
Y_ENC_MERGED_TRAIN_REDUCED = encoder_3.fit_transform(Y_reshaped_train_merged)

# Test labels: same relabelling, encoded with the encoder fitted above.
Y_TEST_MERGED_FAULTS_DF = DF_TEST_REDUCED_CONCATED[TARGET_VARIABLE_COLUMN_NAME].apply(_merge_fault_label)
Y_reshaped_test_merged = Y_TEST_MERGED_FAULTS_DF.to_numpy().reshape(-1, 1)
Y_ENC_MERGED_TEST_REDUCED = encoder_3.transform(Y_reshaped_test_merged)

for split_name, merged_labels in (
    ("training", Y_reshaped_train_merged),
    ("testing", Y_reshaped_test_merged),
):
    print(f"Merged faults - Unique values in {split_name}: {np.unique(merged_labels)}")

## Export Prepared Data

In [None]:
# Export every frame as CSV through save_dataframe.
# Order: raw frames, then reduced frames, then the combined train/test frames.
frames_to_export = [
    # Raw dataframes
    (DF_FF_TRAINING_RAW, "raw_fault_free_training"),
    (DF_FF_TEST_RAW, "raw_fault_free_test"),
    (DF_F_TRAINING_RAW, "raw_faulty_training"),
    (DF_F_TEST_RAW, "raw_faulty_test"),
    # Reduced dataframes
    (DF_FF_TRAINING_REDUCED, "fault_free_training_reduced"),
    (DF_FF_TEST_REDUCED, "fault_free_test_reduced"),
    (DF_F_TRAINING_REDUCED, "faulty_training_reduced"),
    (DF_F_TEST_REDUCED, "faulty_test_reduced"),
    # Combined dataframes
    (DF_TRAINING_REDUCED_CONCATED, "training_combined"),
    (DF_TEST_REDUCED_CONCATED, "test_combined"),
]

for frame_to_save, export_name in frames_to_export:
    save_dataframe(frame_to_save, export_name)

In [None]:
# Bundle the prepared arrays, label series and fitted transformers per task.

# Classification: scaled features, integer and one-hot labels, fitted scaler/encoders.
data_classification = {
    "X_TRAIN": X_TRAIN,
    "Y_TRAIN": Y_TRAIN,
    "Y_TRAIN_DF": Y_TRAIN_DF,
    "X_TEST_REDUCED": X_TEST_REDUCED,
    "Y_TEST_REDUCED": Y_TEST_REDUCED,
    "Y_TEST_REDUCED_DF": Y_TEST_REDUCED_DF,
    "Y_ENC_TRAIN": Y_ENC_TRAIN,
    "Y_ENC_TEST_REDUCED": Y_ENC_TEST_REDUCED,
    "scaler": sc,
    "label_encoder": le,
    "onehot_encoder": encoder_1,
}

# Anomaly detection: features scaled with the anomaly scaler plus binary labels.
data_anomaly = {
    "X_INCONTROL_TRAIN_REDUCED": X_INCONTROL_TRAIN_REDUCED,
    "X_INCONTROL_TEST_REDUCED": X_INCONTROL_TEST_REDUCED,
    "X_OUT_OF_CONTROL_TEST_REDUCED": X_OUT_OF_CONTROL_TEST_REDUCED,
    "Y_TRAIN_ANOMALY_REDUCED_DF": Y_TRAIN_ANOMALY_REDUCED_DF,
    "Y_TEST_ANOMALY_REDUCED_DF": Y_TEST_ANOMALY_REDUCED_DF,
    "Y_ENC_ANOMALY_TRAIN_REDUCED": Y_ENC_ANOMALY_TRAIN_REDUCED,
    "Y_ENC_ANOMALY_TEST_REDUCED": Y_ENC_ANOMALY_TEST_REDUCED,
    "scaler_anomaly": sc_anomaly,
    "onehot_encoder_anomaly": encoder_2,
}

# Merged faults: relabelled targets and their one-hot encoder (labels only).
data_merged = {
    "Y_TRAIN_MERGED_FAULTS_REDUCED_DF": Y_TRAIN_MERGED_FAULTS_REDUCED_DF,
    "Y_TEST_MERGED_FAULTS_DF": Y_TEST_MERGED_FAULTS_DF,
    "Y_ENC_MERGED_TRAIN_REDUCED": Y_ENC_MERGED_TRAIN_REDUCED,
    "Y_ENC_MERGED_TEST_REDUCED": Y_ENC_MERGED_TEST_REDUCED,
    "onehot_encoder_merged": encoder_3,
}

# Write the bundles in the same order as before: classification, anomaly, merged.
for bundle, bundle_name in (
    (data_classification, "data_classification"),
    (data_anomaly, "data_anomaly"),
    (data_merged, "data_merged"),
):
    save_pickle(bundle, bundle_name)

print("All data preparation complete!")