In [1]:
import pickle

# Load preprocessed data
with open('preprocessed_data.pkl', 'rb') as f:
    X_train, X_test, y_train, y_test = pickle.load(f)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)


X_train shape: (8000, 8)
X_test shape: (2000, 8)


In [3]:
import re

def clean_column_names(df):
    df = df.copy()
    df.columns = [re.sub(r'[\[\]<>]', '', col).replace(' ', '_') for col in df.columns]
    return df

# After loading your data
with open('preprocessed_data.pkl', 'rb') as f:
    X_train, X_test, y_train, y_test = pickle.load(f)

# Clean the column names
X_train = clean_column_names(X_train)
X_test = clean_column_names(X_test)

print("Cleaned X_train columns:", X_train.columns)


Cleaned X_train columns: Index(['Air_temperature_K', 'Process_temperature_K', 'Rotational_speed_rpm',
       'Torque_Nm', 'Tool_wear_min', 'Torque_per_Wear', 'Type_L', 'Type_M'],
      dtype='object')


In [23]:
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier

# Base learners
xgb_model = XGBClassifier(random_state=42, learning_rate=0.2, max_depth=6, n_estimators=300, 
                          use_label_encoder=False, eval_metric='logloss')
rf_model = RandomForestClassifier(random_state=42, n_estimators=200, class_weight='balanced')
lr_model = LogisticRegression(max_iter=1000, class_weight='balanced')
lgbm_model = LGBMClassifier(random_state=42)


In [24]:
from sklearn.ensemble import StackingClassifier

# Define base estimators as a list of tuples (name, model)
estimators_updated = [
    ('xgb', xgb_model),
    ('rf', rf_model),
    ('lr', lr_model),
    ('lgbm', lgbm_model)
]

# Create the stacking classifier with a logistic regression as the final estimator
stacking_clf_updated = StackingClassifier(
    estimators=estimators_updated,
    final_estimator=LogisticRegression(max_iter=1000, class_weight='balanced'),
    cv=5,
    n_jobs=-1
)


In [25]:
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

ensemble_pipeline_updated = Pipeline([
    ('smote', SMOTE(random_state=42)),
    ('stack', stacking_clf_updated)
])


In [26]:
ensemble_pipeline_updated.fit(X_train, y_train)


In [27]:
from sklearn.calibration import CalibratedClassifierCV

calibrated_ensemble_updated = CalibratedClassifierCV(estimator=ensemble_pipeline_updated, cv=5, method='sigmoid')
calibrated_ensemble_updated.fit(X_train, y_train)


found 0 physical cores < 1
  File "C:\Users\kealankuar.wh\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


In [28]:
from sklearn.metrics import classification_report

# Predict using the calibrated ensemble
y_pred_updated = calibrated_ensemble_updated.predict(X_test)
print("Classification Report for Updated Calibrated Ensemble with LightGBM:")
print(classification_report(y_test, y_pred_updated))


Classification Report for Updated Calibrated Ensemble with LightGBM:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1939
           1       0.66      0.62      0.64        61

    accuracy                           0.98      2000
   macro avg       0.82      0.81      0.81      2000
weighted avg       0.98      0.98      0.98      2000



In [32]:
from sklearn.ensemble import VotingClassifier
from imblearn.pipeline import Pipeline
from imblearn.combine import SMOTETomek

# Define your base models 
voting_estimators = [
    ('xgb', xgb_model),     # XGBoost model
    ('rf', rf_model),       # RandomForest model
    ('lr', lr_model),       # Logistic Regression model
    ('lgbm', lgbm_model)    # LightGBM model
]

# Create a weighted voting classifier (using soft voting for probabilities)
voting_clf = VotingClassifier(
    estimators=voting_estimators,
    voting='soft', 
    weights=[3, 1, 1, 1]  # Adjust weights based on model performance
)

#include a sampling step (SMOTE) in pipeline:
voting_pipeline = Pipeline([
    ('smotetomek', SMOTE(random_state=42)),
    ('voting', voting_clf)
])

# Fit the voting ensemble on your training data
voting_pipeline.fit(X_train, y_train)

# Evaluate on the test set
y_pred_voting = voting_pipeline.predict(X_test)
print("Classification Report for Weighted Voting Ensemble:")
print(classification_report(y_test, y_pred_voting))


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 7722, number of negative: 7722
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000879 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1534
[LightGBM] [Info] Number of data points in the train set: 15444, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Classification Report for Weighted Voting Ensemble:
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      1939
           1       0.52      0.75      0.62        61

    accuracy                           0.97      2000
   macro avg       0.76      0.87      0.80      2000
weighted avg       0.98      0.97      0.97      2000

