In [1]:
import pickle

# Load preprocessed data
with open('preprocessed_data.pkl', 'rb') as f:
    X_train, X_test, y_train, y_test = pickle.load(f)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)


X_train shape: (8000, 8)
X_test shape: (2000, 8)


In [3]:
import re

def clean_column_names(df):
    df = df.copy()
    df.columns = [re.sub(r'[\[\]<>]', '', col).replace(' ', '_') for col in df.columns]
    return df

# After loading your data
with open('preprocessed_data.pkl', 'rb') as f:
    X_train, X_test, y_train, y_test = pickle.load(f)

# Clean the column names
X_train = clean_column_names(X_train)
X_test = clean_column_names(X_test)

print("Cleaned X_train columns:", X_train.columns)


Cleaned X_train columns: Index(['Air_temperature_K', 'Process_temperature_K', 'Rotational_speed_rpm',
       'Torque_Nm', 'Tool_wear_min', 'Torque_per_Wear', 'Type_L', 'Type_M'],
      dtype='object')


In [4]:
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Base learners
xgb_model = XGBClassifier(random_state=42, learning_rate=0.2, max_depth=6, n_estimators=300, 
                          use_label_encoder=False, eval_metric='logloss')
rf_model = RandomForestClassifier(random_state=42, n_estimators=200, class_weight='balanced')
lr_model = LogisticRegression(max_iter=1000, class_weight='balanced')


In [5]:
from sklearn.ensemble import StackingClassifier

# Define base estimators as a list of tuples (name, model)
estimators = [
    ('xgb', xgb_model),
    ('rf', rf_model),
    ('lr', lr_model)
]

# Create the stacking classifier with a logistic regression as the final estimator
stacking_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(max_iter=1000, class_weight='balanced'),
    cv=5,
    n_jobs=-1
)


In [6]:
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

ensemble_pipeline = Pipeline([
    ('smote', SMOTE(random_state=42)),
    ('stack', stacking_clf)
])


In [7]:
stacking_clf.fit(X_train, y_train)


In [9]:
ensemble_pipeline.fit(X_train, y_train)


In [10]:
from sklearn.metrics import classification_report

y_pred_stack = stacking_clf.predict(X_test)
print("Classification Report for Stacking Ensemble:")
print(classification_report(y_test, y_pred_stack))


Classification Report for Stacking Ensemble:
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      1939
           1       0.57      0.77      0.66        61

    accuracy                           0.98      2000
   macro avg       0.78      0.88      0.82      2000
weighted avg       0.98      0.98      0.98      2000



In [11]:
y_pred_ensemble = ensemble_pipeline.predict(X_test)
print("Classification Report for Stacking Ensemble with SMOTE:")
print(classification_report(y_test, y_pred_ensemble))


Classification Report for Stacking Ensemble with SMOTE:
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      1939
           1       0.57      0.77      0.66        61

    accuracy                           0.98      2000
   macro avg       0.78      0.88      0.82      2000
weighted avg       0.98      0.98      0.98      2000



In [13]:
from sklearn.calibration import CalibratedClassifierCV
# Calibrate the stacking ensemble
calibrated_clf = CalibratedClassifierCV(estimator=stacking_clf, cv=5, method='sigmoid')

# Fit the calibrated classifier on the training data
calibrated_clf.fit(X_train, y_train)

from sklearn.metrics import classification_report

# Predict using the calibrated classifier
y_pred_calibrated = calibrated_clf.predict(X_test)

# Print the classification report for the calibrated model
print("Classification Report for Calibrated Stacking Ensemble:")
print(classification_report(y_test, y_pred_calibrated))


Classification Report for Calibrated Stacking Ensemble:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1939
           1       0.80      0.61      0.69        61

    accuracy                           0.98      2000
   macro avg       0.90      0.80      0.84      2000
weighted avg       0.98      0.98      0.98      2000



In [14]:
# Get calibrated predicted probabilities for the failure class
y_prob_calibrated = calibrated_clf.predict_proba(X_test)[:, 1]
print("Calibrated probabilities (first 10):", y_prob_calibrated[:10])


Calibrated probabilities (first 10): [0.00747762 0.00987755 0.00518673 0.0046924  0.00953585 0.28698452
 0.00666263 0.0287966  0.0042167  0.00363404]


In [15]:
import numpy as np

thresholds = [0.3, 0.4, 0.5, 0.6]
for t in thresholds:
    y_pred_tuned = np.where(y_prob_calibrated >= t, 1, 0)
    print(f"\nClassification Report for Threshold {t}:")
    print(classification_report(y_test, y_pred_tuned))



Classification Report for Threshold 0.3:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1939
           1       0.69      0.69      0.69        61

    accuracy                           0.98      2000
   macro avg       0.84      0.84      0.84      2000
weighted avg       0.98      0.98      0.98      2000


Classification Report for Threshold 0.4:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1939
           1       0.73      0.67      0.70        61

    accuracy                           0.98      2000
   macro avg       0.86      0.83      0.85      2000
weighted avg       0.98      0.98      0.98      2000


Classification Report for Threshold 0.5:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1939
           1       0.80      0.61      0.69        61

    accuracy                           0.98      2000
  