## Configuration

In [1]:
import pyreadr
from sklearn.preprocessing import StandardScaler
import pandas as pd
import os
import numpy as np


fault_free_training_dict = pyreadr.read_r("data/TEP_FaultFree_Training.RData")
fault_free_testing_dict = pyreadr.read_r("data/TEP_FaultFree_Testing.RData")

faulty_training_dict = pyreadr.read_r("data/TEP_Faulty_Training.RData")
faulty_testing_dict = pyreadr.read_r("data/TEP_Faulty_Testing.RData")

## Full data export

In [2]:
TARGET_VARIABLE_COLUMN_NAME = "faultNumber"
SIMULATION_RUN_COLUMN_NAME = "simulationRun"
COLUMNS_TO_REMOVE = ["faultNumber","simulationRun", "sample"]
export_dir="data/exported_csv"

In [3]:
# Read raw data
DF_FF_TRAINING_RAW = fault_free_training_dict["fault_free_training"]
DF_FF_TEST_RAW = fault_free_testing_dict["fault_free_testing"]

DF_F_TRAINING_RAW = faulty_training_dict["faulty_training"]
DF_F_TEST_RAW = faulty_testing_dict["faulty_testing"]

In [4]:
# In-control X training data
scaler_incontrol = StandardScaler()
X_INCONTROL_TRAIN_FULL_DF = DF_FF_TRAINING_RAW.drop(columns=COLUMNS_TO_REMOVE, axis=1)

scaler_incontrol.fit(X_INCONTROL_TRAIN_FULL_DF)
X_INCONTROL_TRAIN_FULL_SCALED = scaler_incontrol.transform(X_INCONTROL_TRAIN_FULL_DF)
print("In-control training data shape:", X_INCONTROL_TRAIN_FULL_SCALED.shape)

# In-control X testing data
X_INCONTROL_TEST_FULL_DF = DF_F_TEST_RAW.drop(columns=COLUMNS_TO_REMOVE, axis=1)
X_INCONTROL_TEST_FULL_SCALED = scaler_incontrol.fit_transform(X_INCONTROL_TEST_FULL_DF)
print("In-control testing data shape:", X_INCONTROL_TEST_FULL_SCALED.shape)

In-control training data shape: (250000, 52)
In-control testing data shape: (9600000, 52)


In [5]:
# Out-of-control X training data (use same scaler from in-control training)
X_OUT_OF_CONTROL_TRAIN_FULL_DF = DF_F_TRAINING_RAW.drop(columns=COLUMNS_TO_REMOVE, axis=1)
X_OUT_OF_CONTROL_TRAIN_FULL_SCALED = scaler_incontrol.transform(X_OUT_OF_CONTROL_TRAIN_FULL_DF)
print("Out-of-control training data shape:", X_OUT_OF_CONTROL_TRAIN_FULL_SCALED.shape)

# Out-of-control X testing data (use same scaler from in-control training)
X_OUT_OF_CONTROL_TEST_FULL_DF = DF_F_TEST_RAW.drop(columns=COLUMNS_TO_REMOVE, axis=1)
X_OUT_OF_CONTROL_TEST_FULL_SCALED = scaler_incontrol.transform(X_OUT_OF_CONTROL_TEST_FULL_DF)
print("Out-of-control testing data shape:", X_OUT_OF_CONTROL_TEST_FULL_SCALED.shape)

# Extract y labels (faultNumber)
y_INCONTROL_TRAIN_FULL = DF_FF_TRAINING_RAW[TARGET_VARIABLE_COLUMN_NAME].values
y_INCONTROL_TEST_FULL = DF_FF_TEST_RAW[TARGET_VARIABLE_COLUMN_NAME].values
y_OUT_OF_CONTROL_TRAIN_FULL = DF_F_TRAINING_RAW[TARGET_VARIABLE_COLUMN_NAME].values
y_OUT_OF_CONTROL_TEST_FULL = DF_F_TEST_RAW[TARGET_VARIABLE_COLUMN_NAME].values

print("\ny labels shapes:")
print("In-control training labels shape:", y_INCONTROL_TRAIN_FULL.shape)
print("In-control testing labels shape:", y_INCONTROL_TEST_FULL.shape)
print("Out-of-control training labels shape:", y_OUT_OF_CONTROL_TRAIN_FULL.shape)
print("Out-of-control testing labels shape:", y_OUT_OF_CONTROL_TEST_FULL.shape)

Out-of-control training data shape: (5000000, 52)
Out-of-control testing data shape: (9600000, 52)

y labels shapes:
In-control training labels shape: (250000,)
In-control testing labels shape: (480000,)
Out-of-control training labels shape: (5000000,)
Out-of-control testing labels shape: (9600000,)


In [6]:
datasets = {
    'X_INCONTROL_TRAIN_FULL_SCALED': X_INCONTROL_TRAIN_FULL_SCALED,
    'X_INCONTROL_TEST_FULL_SCALED': X_INCONTROL_TEST_FULL_SCALED,
    'X_OUT_OF_CONTROL_TRAIN_FULL_SCALED': X_OUT_OF_CONTROL_TRAIN_FULL_SCALED,
    'X_OUT_OF_CONTROL_TEST_FULL_SCALED': X_OUT_OF_CONTROL_TEST_FULL_SCALED,
    'y_INCONTROL_TRAIN_FULL': y_INCONTROL_TRAIN_FULL,
    'y_INCONTROL_TEST_FULL': y_INCONTROL_TEST_FULL,
    'y_OUT_OF_CONTROL_TRAIN_FULL': y_OUT_OF_CONTROL_TRAIN_FULL,
    'y_OUT_OF_CONTROL_TEST_FULL': y_OUT_OF_CONTROL_TEST_FULL,
}

# export
for var_name, data in datasets.items():
    # Convert to DataFrame if it's a numpy array
    if hasattr(data, 'shape'):
        df = pd.DataFrame(data)
    else:
        df = data
    
    # Create filename with variable name, fault number, and simulation run
    filename = f"{var_name}.csv"
    filepath = os.path.join(export_dir, filename)
    
    # Export to CSV
    df.to_csv(filepath, index=False)
    print(f"Exported: {filename} (shape: {data.shape})")

Exported: X_INCONTROL_TRAIN_FULL_SCALED.csv (shape: (250000, 52))
Exported: X_INCONTROL_TEST_FULL_SCALED.csv (shape: (9600000, 52))
Exported: X_OUT_OF_CONTROL_TRAIN_FULL_SCALED.csv (shape: (5000000, 52))
Exported: X_OUT_OF_CONTROL_TEST_FULL_SCALED.csv (shape: (9600000, 52))
Exported: y_INCONTROL_TRAIN_FULL.csv (shape: (250000,))
Exported: y_INCONTROL_TEST_FULL.csv (shape: (480000,))
Exported: y_OUT_OF_CONTROL_TRAIN_FULL.csv (shape: (5000000,))
Exported: y_OUT_OF_CONTROL_TEST_FULL.csv (shape: (9600000,))


## Selected data export

In [7]:
simulation_run = 1
fault_number = 2

DF_FF_TRAINING_SELECTED_RUN = DF_FF_TRAINING_RAW.query("simulationRun == @simulation_run")
DF_FF_TEST_SELECTED_RUN = DF_FF_TEST_RAW.query("simulationRun == @simulation_run")

DF_F_TRAINING_SELECTED_RUN_AND_FAULT = DF_F_TRAINING_RAW.query("faultNumber == @fault_number and simulationRun == @simulation_run")
DF_F_TESTDF_F_TRAINING_SELECTED_RUN_AND_FAULT = DF_F_TEST_RAW.query("faultNumber == @fault_number and simulationRun == @simulation_run")


# In-control selected data (transform using the same scaler already fit on full training data)
X_INCONTROL_TRAIN_SELECTED_DF = DF_FF_TRAINING_SELECTED_RUN.drop(columns=COLUMNS_TO_REMOVE, axis=1)
X_INCONTROL_TEST_SELECTED_DF = DF_FF_TEST_SELECTED_RUN.drop(columns=COLUMNS_TO_REMOVE, axis=1)

# Use the existing scaler_incontrol (already fit on full training data) - DO NOT re-fit
X_INCONTROL_TRAIN_SELECTED_SCALED = scaler_incontrol.transform(X_INCONTROL_TRAIN_SELECTED_DF)
X_INCONTROL_TEST_SELECTED_SCALED = scaler_incontrol.transform(X_INCONTROL_TEST_SELECTED_DF)

# Out-of-control selected data (transform using the same scaler)
X_OUT_OF_CONTROL_TEST_SELECTED_DF = DF_F_TESTDF_F_TRAINING_SELECTED_RUN_AND_FAULT.drop(columns=COLUMNS_TO_REMOVE, axis=1)
X_OUT_OF_CONTROL_TRAIN_SELECTED_DF = DF_F_TRAINING_SELECTED_RUN_AND_FAULT.drop(columns=COLUMNS_TO_REMOVE, axis=1)

X_OUT_OF_CONTROL_TEST_SELECTED_SCALED = scaler_incontrol.transform(X_OUT_OF_CONTROL_TEST_SELECTED_DF)
X_OUT_OF_CONTROL_TRAIN_SELECTED_SCALED = scaler_incontrol.transform(X_OUT_OF_CONTROL_TRAIN_SELECTED_DF)

# Extract y labels for selected data
y_INCONTROL_TRAIN_SELECTED = DF_FF_TRAINING_SELECTED_RUN[TARGET_VARIABLE_COLUMN_NAME].values
y_INCONTROL_TEST_SELECTED = DF_FF_TEST_SELECTED_RUN[TARGET_VARIABLE_COLUMN_NAME].values
y_OUT_OF_CONTROL_TRAIN_SELECTED = DF_F_TRAINING_SELECTED_RUN_AND_FAULT[TARGET_VARIABLE_COLUMN_NAME].values
y_OUT_OF_CONTROL_TEST_SELECTED = DF_F_TESTDF_F_TRAINING_SELECTED_RUN_AND_FAULT[TARGET_VARIABLE_COLUMN_NAME].values

print("In-control training data shape:", X_INCONTROL_TRAIN_SELECTED_SCALED.shape)
print("In-control test data shape:", X_INCONTROL_TEST_SELECTED_SCALED.shape)
print("Out-of-control training data shape:", X_OUT_OF_CONTROL_TRAIN_SELECTED_SCALED.shape)
print("Out-of-control test data shape:", X_OUT_OF_CONTROL_TEST_SELECTED_SCALED.shape)

print("\nSelected y labels shapes:")
print("In-control training labels shape:", y_INCONTROL_TRAIN_SELECTED.shape)
print("In-control testing labels shape:", y_INCONTROL_TEST_SELECTED.shape)
print("Out-of-control training labels shape:", y_OUT_OF_CONTROL_TRAIN_SELECTED.shape)
print("Out-of-control testing labels shape:", y_OUT_OF_CONTROL_TEST_SELECTED.shape)

In-control training data shape: (500, 52)
In-control test data shape: (960, 52)
Out-of-control training data shape: (500, 52)
Out-of-control test data shape: (960, 52)

Selected y labels shapes:
In-control training labels shape: (500,)
In-control testing labels shape: (960,)
Out-of-control training labels shape: (500,)
Out-of-control testing labels shape: (960,)


In [8]:
# Create directory for exported data if it doesn't exist
export_dir = "data/exported_csv"
os.makedirs(export_dir, exist_ok=True)

# Get fault_number and simulation_run from your variables
# Adjust these variable names if they're different in your notebook
try:
    fault_num = fault_number
except NameError:
    fault_num = input("Enter fault_number: ")

try:
    sim_run = simulation_run
except NameError:
    sim_run = input("Enter simulation_run: ")

# Export each variable as CSV


datasets_selected = {
    'X_INCONTROL_TRAIN_SELECTED_SCALED': X_INCONTROL_TRAIN_SELECTED_SCALED,
    'X_INCONTROL_TEST_SELECTED_SCALED': X_INCONTROL_TEST_SELECTED_SCALED,
    'X_OUT_OF_CONTROL_TRAIN_SELECTED_SCALED': X_OUT_OF_CONTROL_TRAIN_SELECTED_SCALED,
    'X_OUT_OF_CONTROL_TEST_SELECTED_SCALED': X_OUT_OF_CONTROL_TEST_SELECTED_SCALED,
    'y_INCONTROL_TRAIN_SELECTED': y_INCONTROL_TRAIN_SELECTED,
    'y_INCONTROL_TEST_SELECTED': y_INCONTROL_TEST_SELECTED,
    'y_OUT_OF_CONTROL_TRAIN_SELECTED': y_OUT_OF_CONTROL_TRAIN_SELECTED,
    'y_OUT_OF_CONTROL_TEST_SELECTED': y_OUT_OF_CONTROL_TEST_SELECTED,
}

for var_name, data in datasets_selected.items():
    # Convert to DataFrame if it's a numpy array
    if hasattr(data, 'shape'):
        df = pd.DataFrame(data)
    else:
        df = data
    
    # Create filename with variable name, fault number, and simulation run
    filename = f"{var_name}_fault_{fault_num}_sim_{sim_run}.csv"
    filepath = os.path.join(export_dir, filename)
    
    # Export to CSV
    df.to_csv(filepath, index=False)
    print(f"Exported: {filename} (shape: {data.shape})")



print(f"\nAll files exported to: {export_dir}")


Exported: X_INCONTROL_TRAIN_SELECTED_SCALED_fault_2_sim_1.csv (shape: (500, 52))
Exported: X_INCONTROL_TEST_SELECTED_SCALED_fault_2_sim_1.csv (shape: (960, 52))
Exported: X_OUT_OF_CONTROL_TRAIN_SELECTED_SCALED_fault_2_sim_1.csv (shape: (500, 52))
Exported: X_OUT_OF_CONTROL_TEST_SELECTED_SCALED_fault_2_sim_1.csv (shape: (960, 52))
Exported: y_INCONTROL_TRAIN_SELECTED_fault_2_sim_1.csv (shape: (500,))
Exported: y_INCONTROL_TEST_SELECTED_fault_2_sim_1.csv (shape: (960,))
Exported: y_OUT_OF_CONTROL_TRAIN_SELECTED_fault_2_sim_1.csv (shape: (500,))
Exported: y_OUT_OF_CONTROL_TEST_SELECTED_fault_2_sim_1.csv (shape: (960,))

All files exported to: data/exported_csv


## Import again as a test

In [9]:
# Import the CSV files and convert them back to numpy arrays
# Set the fault number and simulation run to match the exported files
fault_num = 2
sim_run = 1

# Import each CSV file for X data
X_INCONTROL_TRAIN_FULL_SCALED_imported = pd.read_csv(
    f"data/exported_csv/X_INCONTROL_TRAIN_FULL_SCALED.csv"
).values

X_INCONTROL_TEST_FULL_SCALED_imported = pd.read_csv(
    f"data/exported_csv/X_INCONTROL_TEST_FULL_SCALED.csv"
).values

X_OUT_OF_CONTROL_TRAIN_FULL_SCALED_imported = pd.read_csv(
    f"data/exported_csv/X_OUT_OF_CONTROL_TRAIN_FULL_SCALED.csv"
).values

X_OUT_OF_CONTROL_TEST_FULL_SCALED_imported = pd.read_csv(
    f"data/exported_csv/X_OUT_OF_CONTROL_TEST_FULL_SCALED.csv"
).values

X_INCONTROL_TRAIN_SELECTED_SCALED_imported = pd.read_csv(
    f"data/exported_csv/X_INCONTROL_TRAIN_SELECTED_SCALED_fault_{fault_num}_sim_{sim_run}.csv"
).values

X_INCONTROL_TEST_SELECTED_SCALED_imported = pd.read_csv(
    f"data/exported_csv/X_INCONTROL_TEST_SELECTED_SCALED_fault_{fault_num}_sim_{sim_run}.csv"
).values

X_OUT_OF_CONTROL_TRAIN_SELECTED_SCALED_imported = pd.read_csv(
    f"data/exported_csv/X_OUT_OF_CONTROL_TRAIN_SELECTED_SCALED_fault_{fault_num}_sim_{sim_run}.csv"
).values

X_OUT_OF_CONTROL_TEST_SELECTED_SCALED_imported = pd.read_csv(
    f"data/exported_csv/X_OUT_OF_CONTROL_TEST_SELECTED_SCALED_fault_{fault_num}_sim_{sim_run}.csv"
).values

# Import each CSV file for y data
y_INCONTROL_TRAIN_FULL_imported = pd.read_csv(
    f"data/exported_csv/y_INCONTROL_TRAIN_FULL.csv"
).values.flatten()

y_INCONTROL_TEST_FULL_imported = pd.read_csv(
    f"data/exported_csv/y_INCONTROL_TEST_FULL.csv"
).values.flatten()

y_OUT_OF_CONTROL_TRAIN_FULL_imported = pd.read_csv(
    f"data/exported_csv/y_OUT_OF_CONTROL_TRAIN_FULL.csv"
).values.flatten()

y_OUT_OF_CONTROL_TEST_FULL_imported = pd.read_csv(
    f"data/exported_csv/y_OUT_OF_CONTROL_TEST_FULL.csv"
).values.flatten()

y_INCONTROL_TRAIN_SELECTED_imported = pd.read_csv(
    f"data/exported_csv/y_INCONTROL_TRAIN_SELECTED_fault_{fault_num}_sim_{sim_run}.csv"
).values.flatten()

y_INCONTROL_TEST_SELECTED_imported = pd.read_csv(
    f"data/exported_csv/y_INCONTROL_TEST_SELECTED_fault_{fault_num}_sim_{sim_run}.csv"
).values.flatten()

y_OUT_OF_CONTROL_TRAIN_SELECTED_imported = pd.read_csv(
    f"data/exported_csv/y_OUT_OF_CONTROL_TRAIN_SELECTED_fault_{fault_num}_sim_{sim_run}.csv"
).values.flatten()

y_OUT_OF_CONTROL_TEST_SELECTED_imported = pd.read_csv(
    f"data/exported_csv/y_OUT_OF_CONTROL_TEST_SELECTED_fault_{fault_num}_sim_{sim_run}.csv"
).values.flatten()

# Verify the shapes match the original data
print("=== X Data Shapes ===")
print("Imported In-control full training data shape:", X_INCONTROL_TRAIN_FULL_SCALED_imported.shape)
print("Imported In-control full test data shape:", X_INCONTROL_TEST_FULL_SCALED_imported.shape)
print("Imported Out-of-control full training data shape:", X_OUT_OF_CONTROL_TRAIN_FULL_SCALED_imported.shape)
print("Imported Out-of-control full test data shape:", X_OUT_OF_CONTROL_TEST_FULL_SCALED_imported.shape)

print("\nImported In-control training data shape:", X_INCONTROL_TRAIN_SELECTED_SCALED_imported.shape)
print("Imported In-control test data shape:", X_INCONTROL_TEST_SELECTED_SCALED_imported.shape)
print("Imported Out-of-control training data shape:", X_OUT_OF_CONTROL_TRAIN_SELECTED_SCALED_imported.shape)
print("Imported Out-of-control test data shape:", X_OUT_OF_CONTROL_TEST_SELECTED_SCALED_imported.shape)

print("\n=== y Data Shapes ===")
print("Imported In-control full training labels shape:", y_INCONTROL_TRAIN_FULL_imported.shape)
print("Imported In-control full test labels shape:", y_INCONTROL_TEST_FULL_imported.shape)
print("Imported Out-of-control full training labels shape:", y_OUT_OF_CONTROL_TRAIN_FULL_imported.shape)
print("Imported Out-of-control full test labels shape:", y_OUT_OF_CONTROL_TEST_FULL_imported.shape)

print("\nImported In-control training labels shape:", y_INCONTROL_TRAIN_SELECTED_imported.shape)
print("Imported In-control test labels shape:", y_INCONTROL_TEST_SELECTED_imported.shape)
print("Imported Out-of-control training labels shape:", y_OUT_OF_CONTROL_TRAIN_SELECTED_imported.shape)
print("Imported Out-of-control test labels shape:", y_OUT_OF_CONTROL_TEST_SELECTED_imported.shape)

# Verify the data matches exactly
print("\n=== Verifying X Data Integrity ===")
print("In-control full train match:", np.allclose(X_INCONTROL_TRAIN_FULL_SCALED, X_INCONTROL_TRAIN_FULL_SCALED_imported))
print("In-control full test match:", np.allclose(X_INCONTROL_TEST_FULL_SCALED, X_INCONTROL_TEST_FULL_SCALED_imported))
print("Out-of-control full train match:", np.allclose(X_OUT_OF_CONTROL_TRAIN_FULL_SCALED, X_OUT_OF_CONTROL_TRAIN_FULL_SCALED_imported))
print("Out-of-control full test match:", np.allclose(X_OUT_OF_CONTROL_TEST_FULL_SCALED, X_OUT_OF_CONTROL_TEST_FULL_SCALED_imported))

print("\nIn-control train match:", np.allclose(X_INCONTROL_TRAIN_SELECTED_SCALED, X_INCONTROL_TRAIN_SELECTED_SCALED_imported))
print("In-control test match:", np.allclose(X_INCONTROL_TEST_SELECTED_SCALED, X_INCONTROL_TEST_SELECTED_SCALED_imported))
print("Out-of-control train match:", np.allclose(X_OUT_OF_CONTROL_TRAIN_SELECTED_SCALED, X_OUT_OF_CONTROL_TRAIN_SELECTED_SCALED_imported))
print("Out-of-control test match:", np.allclose(X_OUT_OF_CONTROL_TEST_SELECTED_SCALED, X_OUT_OF_CONTROL_TEST_SELECTED_SCALED_imported))

print("\n=== Verifying y Data Integrity ===")
print("In-control full train labels match:", np.allclose(y_INCONTROL_TRAIN_FULL, y_INCONTROL_TRAIN_FULL_imported))
print("In-control full test labels match:", np.allclose(y_INCONTROL_TEST_FULL, y_INCONTROL_TEST_FULL_imported))
print("Out-of-control full train labels match:", np.allclose(y_OUT_OF_CONTROL_TRAIN_FULL, y_OUT_OF_CONTROL_TRAIN_FULL_imported))
print("Out-of-control full test labels match:", np.allclose(y_OUT_OF_CONTROL_TEST_FULL, y_OUT_OF_CONTROL_TEST_FULL_imported))

print("\nIn-control train labels match:", np.allclose(y_INCONTROL_TRAIN_SELECTED, y_INCONTROL_TRAIN_SELECTED_imported))
print("In-control test labels match:", np.allclose(y_INCONTROL_TEST_SELECTED, y_INCONTROL_TEST_SELECTED_imported))
print("Out-of-control train labels match:", np.allclose(y_OUT_OF_CONTROL_TRAIN_SELECTED, y_OUT_OF_CONTROL_TRAIN_SELECTED_imported))
print("Out-of-control test labels match:", np.allclose(y_OUT_OF_CONTROL_TEST_SELECTED, y_OUT_OF_CONTROL_TEST_SELECTED_imported))

=== X Data Shapes ===
Imported In-control full training data shape: (250000, 52)
Imported In-control full test data shape: (9600000, 52)
Imported Out-of-control full training data shape: (5000000, 52)
Imported Out-of-control full test data shape: (9600000, 52)

Imported In-control training data shape: (500, 52)
Imported In-control test data shape: (960, 52)
Imported Out-of-control training data shape: (500, 52)
Imported Out-of-control test data shape: (960, 52)

=== y Data Shapes ===
Imported In-control full training labels shape: (250000,)
Imported In-control full test labels shape: (480000,)
Imported Out-of-control full training labels shape: (5000000,)
Imported Out-of-control full test labels shape: (9600000,)

Imported In-control training labels shape: (500,)
Imported In-control test labels shape: (960,)
Imported Out-of-control training labels shape: (500,)
Imported Out-of-control test labels shape: (960,)

=== Verifying X Data Integrity ===
In-control full train match: True
In-con