# Tennessee Eastman Process (TEP) - Quick Start Notebook

This notebook runs a minimal, fast workflow:
- Import refactored modules
- Load and explore data
- Preprocess features
- Train and evaluate a small RandomForest on a sample (training is skipped if the sampled train or test data contains only one class)

You can expand it later for deeper experimentation.


In [None]:
import os, sys
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Local imports
# os.path.abspath('.') is the current working directory, so this appends the
# *parent* of the working directory to sys.path before the project imports below.
# NOTE(review): presumably te_data_loader.py and te_models.py live in that parent
# directory — confirm, because the result depends on where the kernel was started.
sys.path.append(os.path.dirname(os.path.abspath('.')))
from te_data_loader import TEPDataLoader
# NOTE(review): TEPModelTrainer is not referenced in the cells visible here.
from te_models import TEPModelTrainer

# Reached only when every import above succeeded.
print('✅ Imports OK')


✅ Imports OK


In [None]:
# Load the train/test frames through the project loader; `faultNumber` is the
# target column used by the later cells.
loader = TEPDataLoader(data_dir='data/')
train_df, test_df = loader.load_data()

# Report the dimensions of both splits.
for shape_label, split_df in (('Train shape:', train_df), ('Test shape:', test_df)):
    print(shape_label, split_df.shape)

# Show only the first eight column names to keep the output short.
leading_columns = train_df.columns[:8].tolist()
print('Columns:', leading_columns, '...')

# Quick target check: the most frequent target values in the training split.
target_counts = train_df['faultNumber'].value_counts()
print('Target value counts (train):')
print(target_counts.head())


Loading TEP data...
Training data loaded: (250000, 55)
Testing data loaded: (480000, 55)
Train shape: (250000, 55)
Test shape: (480000, 55)
Columns: ['faultNumber', 'simulationRun', 'sample', 'xmeas_1', 'xmeas_2', 'xmeas_3', 'xmeas_4', 'xmeas_5'] ...
Target value counts (train):
faultNumber
0.0    250000
Name: count, dtype: int64


In [None]:
# Preprocess (normalize features); sample to run fast
proc_train, proc_test = loader.preprocess_data(normalize=True, apply_pca=False)

# Demo-speed knobs: upper bound on rows drawn from each split, and the seed
# that makes the draw reproducible.
MAX_SAMPLE_ROWS = 20000
SAMPLE_SEED = 42

# Size each sample from its OWN frame. DataFrame.sample (without replacement)
# raises ValueError when n exceeds the number of rows, so reusing the
# train-derived size for the test frame would fail whenever the test split
# is smaller than the cap.
sample_n = min(MAX_SAMPLE_ROWS, len(proc_train))
test_sample_n = min(MAX_SAMPLE_ROWS, len(proc_test))

proc_train_sample = proc_train.sample(n=sample_n, random_state=SAMPLE_SEED)
proc_test_sample = proc_test.sample(n=test_sample_n, random_state=SAMPLE_SEED)

print('Processed train sample:', proc_train_sample.shape)
print('Processed test sample:', proc_test_sample.shape)


Preprocessing data...
Normalizing features...
Data preprocessing completed
Processed train sample: (20000, 55)
Processed test sample: (20000, 55)


In [None]:
# Prepare data
# Columns that must not be fed to the model: the target itself plus the
# bookkeeping columns that appear alongside it in the loaded frame
# ('simulationRun', 'sample'). NOTE(review): these look like a run identifier
# and a within-run row index rather than process measurements — confirm against
# the dataset description.
NON_FEATURE_COLUMNS = {'faultNumber', 'simulationRun', 'sample'}
features = [c for c in proc_train_sample.columns if c not in NON_FEATURE_COLUMNS]

# Binary target: 0 where faultNumber is 0 (or below), 1 for any positive fault number.
X_train = proc_train_sample[features].values
y_train = (proc_train_sample['faultNumber'] > 0).astype(int).values
X_test = proc_test_sample[features].values
y_test = (proc_test_sample['faultNumber'] > 0).astype(int).values

# np.bincount lists counts for labels 0..max(label); a single entry means
# only class 0 is present in the sample.
print('Class balance (train):', np.bincount(y_train))


Class balance (train): [20000]


In [None]:
# Fit and score a small RandomForest, but only when both the train and the
# test labels contain at least two distinct classes; otherwise report the
# class counts and skip training.
train_classes = np.unique(y_train)
test_classes = np.unique(y_test)
both_splits_have_two_classes = train_classes.size >= 2 and test_classes.size >= 2

if both_splits_have_two_classes:
    rf = RandomForestClassifier(
        n_estimators=100,
        max_depth=None,
        n_jobs=-1,  # use all available cores
        random_state=42,
    )
    rf.fit(X_train, y_train)

    # Evaluate on the sampled test split.
    y_pred = rf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='binary', zero_division=0)

    print(f'Accuracy: {acc:.4f}\nF1: {f1:.4f}')
    print('\nClassification report (binary 0=normal,1=fault):')
    print(classification_report(y_test, y_pred, zero_division=0))
else:
    # Single-class fallback: nothing meaningful to fit or score.
    print('⚠️ Only one class present in data. Skipping model training.')
    print('Class distribution (train):', np.bincount(y_train))
    print('Class distribution (test):', np.bincount(y_test))


⚠️ Only one class present in data. Skipping model training.
Class distribution (train): [20000]
Class distribution (test): [20000]
