In [18]:
# Data wrangling
import pandas as pd
import numpy as np
# Data visualization
import matplotlib.pyplot as plt 
import seaborn as sns
import plotly.express as px
# Off FutureWarnings
import warnings 
warnings.filterwarnings('ignore')
#Resampling
from imblearn.over_sampling import SMOTENC 
from sklearn.utils import class_weight
#Dimension Reduction
from sklearn.decomposition import PCA
# Preprocessing
from sklearn.preprocessing import StandardScaler, PowerTransformer, OrdinalEncoder, OneHotEncoder 
from sklearn.preprocessing import LabelEncoder
# Models
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
# Models Pipelines
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
# Model evaluation
from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, balanced_accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import confusion_matrix
# Save model
import pickle

In [19]:
## Upload df
df = pd.read_csv('cleaned_data.csv') 
df

Unnamed: 0.1,Unnamed: 0,Type,Air_temperature,Process_temperature,Rotational_speed,Torque,Tool_wear,Machine_failure,TWF,HDF,PWF,OSF,RNF,Failure_type
0,0,Medium,298.1,308.6,1551,42.8,0,0,0,0,0,0,0,NF
1,1,Low,298.2,308.7,1408,46.3,3,0,0,0,0,0,0,NF
2,2,Low,298.1,308.5,1498,49.4,5,0,0,0,0,0,0,NF
3,3,Low,298.2,308.6,1433,39.5,7,0,0,0,0,0,0,NF
4,4,Low,298.2,308.7,1408,40.0,9,0,0,0,0,0,0,NF
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9971,9995,Medium,298.8,308.4,1604,29.5,14,0,0,0,0,0,0,NF
9972,9996,High,298.9,308.4,1632,31.8,17,0,0,0,0,0,0,NF
9973,9997,Medium,299.0,308.6,1645,33.4,22,0,0,0,0,0,0,NF
9974,9998,High,299.0,308.7,1408,48.5,25,0,0,0,0,0,0,NF


In [20]:
from sklearn.metrics import f1_score 

def get_metrics(y_true, y_pred, unique_classes):
    # Calculating F1 scores for each class
    f1_scores_per_class = f1_score(y_true, y_pred, average=None, labels=unique_classes)
    recall_scores_per_class = recall_score(y_true, y_pred, average=None, labels=unique_classes)
    precision_scores_per_class = precision_score(y_true, y_pred, average=None, labels=unique_classes)
    class_f1_scores = dict(zip(unique_classes, f1_scores_per_class))
    class_recall_scores = dict(zip(unique_classes, recall_scores_per_class))
    class_precision_scores = dict(zip(unique_classes, precision_scores_per_class))
    dict_metrics = {
    'Accuracy': accuracy_score(y_true, y_pred),
    'Balanced Accuracy': balanced_accuracy_score(y_true, y_pred),
    'Macro Recall': recall_score(y_true, y_pred, average='macro'), 
    'Macro Precision': precision_score(y_true, y_pred, average='macro'), 
    'Macro F1': f1_score(y_true, y_pred, average='macro'),
    'F1 Scores per Class': class_f1_scores,
    'Recall Scores per Class': class_recall_scores,
    'Precision Scores per Class': class_precision_scores
    }
    return dict_metrics

In [21]:
df

Unnamed: 0.1,Unnamed: 0,Type,Air_temperature,Process_temperature,Rotational_speed,Torque,Tool_wear,Machine_failure,TWF,HDF,PWF,OSF,RNF,Failure_type
0,0,Medium,298.1,308.6,1551,42.8,0,0,0,0,0,0,0,NF
1,1,Low,298.2,308.7,1408,46.3,3,0,0,0,0,0,0,NF
2,2,Low,298.1,308.5,1498,49.4,5,0,0,0,0,0,0,NF
3,3,Low,298.2,308.6,1433,39.5,7,0,0,0,0,0,0,NF
4,4,Low,298.2,308.7,1408,40.0,9,0,0,0,0,0,0,NF
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9971,9995,Medium,298.8,308.4,1604,29.5,14,0,0,0,0,0,0,NF
9972,9996,High,298.9,308.4,1632,31.8,17,0,0,0,0,0,0,NF
9973,9997,Medium,299.0,308.6,1645,33.4,22,0,0,0,0,0,0,NF
9974,9998,High,299.0,308.7,1408,48.5,25,0,0,0,0,0,0,NF


In [22]:
NUMERIC_FEATURES = ['Air_temperature', 'Process_temperature', 'Rotational_speed', 'Torque', 'Tool_wear']
CATEGORIC_FEATURES = ['Type']

In [23]:
# Create preprocessor ColumnTransformer to do OneHotEncoder for CATEGORIC_FEATURES and StandardScaler() for NUMERIC_FEATURES
# Define the pipelines for numeric and categorical transformations
num_pipeline = Pipeline([
    ('num_features', StandardScaler()) 
    ])
cat_pipeline = Pipeline([ 
    ('cat_features', OneHotEncoder())
    ])
# Create the ColumnTransformer
preprocessor = ColumnTransformer(transformers=[
    ('num_trans', num_pipeline, NUMERIC_FEATURES),
    ('cat_trans', cat_pipeline, CATEGORIC_FEATURES) ])
# Fit and transform the data
df_transformed = preprocessor.fit_transform(df)
# Converting the transformed data back to a dataframe for easier visualization
# The transformed data will have new column names, especially for the one hot encoded categories 
encoded_feature_names = preprocessor.named_transformers_['cat_trans'].get_feature_names_out(CATEGORIC_FEATURES) 
new_column_names = list(NUMERIC_FEATURES) + list(encoded_feature_names)
df_transformed = pd.DataFrame(df_transformed, columns=new_column_names)
df_transformed.head()


Unnamed: 0,Air_temperature,Process_temperature,Rotational_speed,Torque,Tool_wear,Type_High,Type_Low,Type_Medium
0,-0.951551,-0.946692,0.065483,0.289789,-1.695147,0.0,0.0,1.0
1,-0.901538,-0.879314,-0.732576,0.643119,-1.647949,0.0,1.0,0.0
2,-0.951551,-1.014071,-0.230301,0.956069,-1.616484,0.0,1.0,0.0
3,-0.901538,-0.946692,-0.593055,-0.043351,-1.585019,0.0,1.0,0.0
4,-0.901538,-0.879314,-0.732576,0.007125,-1.553553,0.0,1.0,0.0


In [24]:
# df = df[df['Failure_type'] != 'TWF']

In [25]:
df_model = df.copy()
X = df_model[NUMERIC_FEATURES + CATEGORIC_FEATURES]
y = df['Failure_type']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2, stratify=y)

In [26]:
import xgboost as xgb
from xgboost import XGBClassifier

In [27]:
unique_classes = np.unique(y_train)
unique_classes

array(['HDF', 'NF', 'OSF', 'PWF', 'TWF'], dtype=object)

In [28]:
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

### Class Weights

In [29]:
# Creating pipeline without PCA analysis and balanced class with parameter by model 
pip_model_no_pca = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', XGBClassifier(random_state=2023)) 
])

# Fit pipeline with sample weights
weights = class_weight.compute_sample_weight(class_weight='balanced', y=y_train_encoded) 
pip_model_no_pca.fit(X_train, y_train_encoded, model__sample_weight=weights)

# Step 1: Generate Predictions
y_pred_encoded = pip_model_no_pca.predict(X_test)
y_pred = label_encoder.inverse_transform(y_pred_encoded)

# Step 2: Evaluate Metrics
metrics = get_metrics(y_test, y_pred, unique_classes)

# Step 3: View Results metrics
metrics

{'Accuracy': 0.9819639278557114,
 'Balanced Accuracy': 0.7658684016242157,
 'Macro Recall': 0.7658684016242157,
 'Macro Precision': 0.6616253259147585,
 'Macro F1': 0.7054009505090526,
 'F1 Scores per Class': {'HDF': 0.9047619047619048,
  'NF': 0.9906639004149378,
  'OSF': 0.7894736842105263,
  'PWF': 0.8421052631578947,
  'TWF': 0.0},
 'Recall Scores per Class': {'HDF': 0.9047619047619048,
  'NF': 0.9870801033591732,
  'OSF': 0.9375,
  'PWF': 1.0,
  'TWF': 0.0},
 'Precision Scores per Class': {'HDF': 0.9047619047619048,
  'NF': 0.9942738157209786,
  'OSF': 0.6818181818181818,
  'PWF': 0.7272727272727273,
  'TWF': 0.0}}

In [17]:
fine_tune_params = {
    'model__n_estimators': [100],
    'model__max_depth': [None, 2, 6, 20],
    'model__lambda': [0, 0.001, 0.01, 0.1, 1, 10, 100],
    'model__alpha': [0, 0.001, 0.01, 0.1, 1, 10, 100],
    'model__eta': [0.2, 0.3, 0.4],
}
# Running a new GridSearchCV for fine-tuning
fine_tune_grid = GridSearchCV(pip_model_no_pca, fine_tune_params, cv=3, scoring='f1_macro', n_jobs=-1, verbose=1)
fine_tune_grid.fit(X_train, y_train_encoded, model__sample_weight=weights)

# Collecting and printing the fine-tuned results
fine_tuned_results = pd.DataFrame(fine_tune_grid.cv_results_) 
fine_tuned_best_index = fine_tuned_results['mean_test_score'].idxmax()
fine_tuned_best_params = fine_tuned_results.loc[fine_tuned_best_index, 'params']

# Print best model parameters
print("Best fine-tuned model parameters:") 
print(fine_tuned_best_params)
# Finding the best estimator paramaters 
tuned_model = fine_tune_grid.best_estimator_ 
y_pred_encoded = tuned_model.predict(X_test)
y_pred = label_encoder.inverse_transform(y_pred_encoded)
# View new perfomance (focus on F1-score) 
get_metrics(y_test, y_pred, unique_classes)

Fitting 3 folds for each of 588 candidates, totalling 1764 fits


KeyboardInterrupt: 

Training Performance

In [30]:
# Creating pipeline without PCA analysis and balanced class with parameter by model 
pip_model_no_pca = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', XGBClassifier(random_state=2023)) 
])

# Fit pipeline with sample weights
weights = class_weight.compute_sample_weight(class_weight='balanced', y=y_train_encoded) 
tuned_model.fit(X_train, y_train_encoded, model__sample_weight=weights)

# Step 1: Generate Predictions
y_pred_encoded = tuned_model.predict(X_train)
y_pred = label_encoder.inverse_transform(y_pred_encoded)

# Step 2: Evaluate Metrics
tuned_metrics = get_metrics(y_train, y_pred, unique_classes)

# Step 3: View Results metrics
tuned_metrics

NameError: name 'tuned_model' is not defined

Testing Performance

In [None]:
# Creating pipeline without PCA analysis and balanced class with parameter by model 
pip_model_near_miss = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', XGBClassifier(random_state=2023)) 
])

# Fit pipeline with sample weights
weights = class_weight.compute_sample_weight(class_weight='balanced', y=y_train_encoded) 
tuned_model.fit(X_train, y_train_encoded, model__sample_weight=weights)

# Step 1: Generate Predictions
y_pred_encoded = tuned_model.predict(X_test)
y_pred = label_encoder.inverse_transform(y_pred_encoded)

# Step 2: Evaluate Metrics
tuned_metrics = get_metrics(y_test, y_pred, unique_classes)

# Step 3: View Results metrics
tuned_metrics

{'Accuracy': 0.9839679358717435,
 'Balanced Accuracy': 0.766281838316722,
 'Macro Recall': 0.766281838316722,
 'Macro Precision': 0.6626252587991719,
 'Macro F1': 0.7061134766834248,
 'F1 Scores per Class': {'HDF': 0.9047619047619048,
  'NF': 0.9917098445595854,
  'OSF': 0.7692307692307693,
  'PWF': 0.8648648648648649,
  'TWF': 0.0},
 'Recall Scores per Class': {'HDF': 0.9047619047619048,
  'NF': 0.9891472868217054,
  'OSF': 0.9375,
  'PWF': 1.0,
  'TWF': 0.0},
 'Precision Scores per Class': {'HDF': 0.9047619047619048,
  'NF': 0.9942857142857143,
  'OSF': 0.6521739130434783,
  'PWF': 0.7619047619047619,
  'TWF': 0.0}}

### Undersampling

In [102]:
from imblearn.under_sampling import NearMiss, RandomUnderSampler, TomekLinks


Nearmiss

In [103]:

# Creating pipeline without PCA analysis and balanced class with parameter by model 
pip_model_undersample = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('nearmiss', NearMiss(version=3, n_neighbors=30)),
    ('model', XGBClassifier(random_state=2023))
])

# Fit pipeline with sample weights
pip_model_undersample.fit(X_train, y_train_encoded)

# Step 1: Generate Predictions
y_pred_encoded = pip_model_undersample.predict(X_test)
y_pred = label_encoder.inverse_transform(y_pred_encoded)

# Step 2: Evaluate Metrics
metrics = get_metrics(y_test, y_pred, unique_classes)

# Step 3: View Results metrics
metrics

{'Accuracy': 0.41583166332665333,
 'Balanced Accuracy': 0.8229605020302696,
 'Macro Recall': 0.8229605020302696,
 'Macro Precision': 0.3446258400877103,
 'Macro F1': 0.3322653158193071,
 'F1 Scores per Class': {'HDF': 0.38961038961038963,
  'NF': 0.5719557195571956,
  'OSF': 0.5423728813559322,
  'PWF': 0.03140333660451423,
  'TWF': 0.12598425196850394},
 'Recall Scores per Class': {'HDF': 0.7142857142857143,
  'NF': 0.4005167958656331,
  'OSF': 1.0,
  'PWF': 1.0,
  'TWF': 1.0},
 'Precision Scores per Class': {'HDF': 0.26785714285714285,
  'NF': 1.0,
  'OSF': 0.37209302325581395,
  'PWF': 0.015952143569292122,
  'TWF': 0.06722689075630252}}

RandomUndersample

In [109]:
sampling_strategy = {
    0: 30,   # HDF 
    1: 30,   # NF 
    2: 30,   # OSF
    3: 30,   # PWF
    4: 30    # TWF
}


In [111]:

# Creating pipeline without PCA analysis and balanced class with parameter by model 
pip_model_undersample = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('undersampler', RandomUnderSampler(sampling_strategy={1: 30}, random_state=2023)),
    ('model', XGBClassifier(random_state=2023))
])

# Fit pipeline with sample weights
pip_model_undersample.fit(X_train, y_train_encoded)

# Step 1: Generate Predictions
y_pred_encoded = pip_model_undersample.predict(X_test)
y_pred = label_encoder.inverse_transform(y_pred_encoded)

# Step 2: Evaluate Metrics
metrics = get_metrics(y_test, y_pred, unique_classes)

# Step 3: View Results metrics
metrics

{'Accuracy': 0.8086172344689379,
 'Balanced Accuracy': 0.9416758951642674,
 'Macro Recall': 0.9416758951642674,
 'Macro Precision': 0.3220734664396636,
 'Macro F1': 0.38370990579115555,
 'F1 Scores per Class': {'HDF': 0.30158730158730157,
  'NF': 0.8911174785100286,
  'OSF': 0.4,
  'PWF': 0.2191780821917808,
  'TWF': 0.10666666666666667},
 'Recall Scores per Class': {'HDF': 0.9047619047619048,
  'NF': 0.8036175710594315,
  'OSF': 1.0,
  'PWF': 1.0,
  'TWF': 1.0},
 'Precision Scores per Class': {'HDF': 0.18095238095238095,
  'NF': 1.0,
  'OSF': 0.25,
  'PWF': 0.12307692307692308,
  'TWF': 0.056338028169014086}}

In [34]:
cm = confusion_matrix(y_test, y_pred)
cm

array([[  20,    0,    1,    0,    0],
       [  42, 1721,   48,   22,  102],
       [   0,    0,   16,    0,    0],
       [   0,    0,    0,   16,    0],
       [   0,    2,    0,    0,    6]])

In [56]:
# import matplotlib.pyplot as plt
# from collections import Counter
# from imblearn.under_sampling import RandomUnderSampler
# from imblearn.pipeline import Pipeline as ImbPipeline
# from xgboost import XGBClassifier
# from sklearn.compose import ColumnTransformer
# from sklearn.preprocessing import StandardScaler, OneHotEncoder

# # Define the plot function for class distribution
# def plot_class_distribution(y, title):
#     counter = Counter(y)
#     classes = list(counter.keys())
#     values = list(counter.values())

#     plt.figure(figsize=(10, 6))
#     plt.bar(classes, values, color='skyblue')
#     plt.xlabel('Class')
#     plt.ylabel('Frequency')
#     plt.title(title)
#     plt.show()

# # Assuming you have preprocessor defined
# # Example preprocessor:
# # numerical_features = ['feature1', 'feature2', ...]
# # categorical_features = ['feature3', 'feature4', ...]
# # preprocessor = ColumnTransformer(
# #     transformers=[
# #         ('num', StandardScaler(), numerical_features),
# #         ('cat', OneHotEncoder(), categorical_features)
# #     ]
# # )

# # Visualize the original class distribution
# plot_class_distribution(y_train_encoded, 'Original Class Distribution')

# # Define the undersampler
# undersampler = RandomUnderSampler(sampling_strategy={1: 30}, random_state=2023)

# # Apply the undersampling only to visualize
# X_resampled, y_resampled = undersampler.fit_resample(X_train, y_train_encoded)

# # Visualize the resampled class distribution
# plot_class_distribution(y_resampled, 'Resampled Class Distribution')

# # Create a new pipeline with RandomUnderSampler
# pip_model_undersample = ImbPipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('undersampler', undersampler),
#     ('model', XGBClassifier(random_state=2023))
# ])

# # Fit the pipeline
# pip_model_undersample.fit(X_train, y_train_encoded)

# # Step 1: Generate Predictions
# y_pred_encoded = pip_model_undersample.predict(X_test)
# y_pred = label_encoder.inverse_transform(y_pred_encoded)

# # Step 2: Evaluate Metrics
# metrics = get_metrics(y_test, y_pred, unique_classes)

# # Step 3: View Results metrics
# print(metrics)


TomekLinks

In [43]:
from imblearn.combine import SMOTETomek

In [128]:
sampling_strategy = {
    0: 7735,   # HDF 
    1: 7735,   # NF 
    2: 7735,   # OSF
    3: 7735,   # PWF
    4: 100000    # TWF
}

In [129]:

# Creating pipeline without PCA analysis and balanced class with parameter by model 
pip_model_undersample = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote_tomek', SMOTETomek(sampling_strategy=sampling_strategy, random_state=2023)),  # Adjust sampling_strategy as needed
    ('model', XGBClassifier(random_state=2023, objective='multi:softmax'))
])

# Fit pipeline with sample weights
pip_model_undersample.fit(X_train, y_train_encoded)

# Step 1: Generate Predictions
y_pred_encoded = pip_model_undersample.predict(X_test)
y_pred = label_encoder.inverse_transform(y_pred_encoded)

# Step 2: Evaluate Metrics
metrics = get_metrics(y_test, y_pred, unique_classes)

# Step 3: View Results metrics
metrics

{'Accuracy': 0.9769539078156313,
 'Balanced Accuracy': 0.8583656330749354,
 'Macro Recall': 0.8583656330749354,
 'Macro Precision': 0.6815806235430918,
 'Macro F1': 0.7457694465723662,
 'F1 Scores per Class': {'HDF': 0.9545454545454546,
  'NF': 0.9880083420229405,
  'OSF': 0.8648648648648649,
  'PWF': 0.75,
  'TWF': 0.17142857142857143},
 'Recall Scores per Class': {'HDF': 1.0,
  'NF': 0.979328165374677,
  'OSF': 1.0,
  'PWF': 0.9375,
  'TWF': 0.375},
 'Precision Scores per Class': {'HDF': 0.9130434782608695,
  'NF': 0.9968437664387164,
  'OSF': 0.7619047619047619,
  'PWF': 0.625,
  'TWF': 0.1111111111111111}}

In [135]:
fine_tune_params = {
    'model__n_estimators': [100],
    'model__max_depth': [None],
    'model__lambda': [0, 0.001, 0.01, 0.1, 1, 10, 100],
    'model__alpha': [0, 0.001, 0.01, 0.1, 1, 10, 100],
    'model__eta': [0.2],
}
# Running a new GridSearchCV for fine-tuning
fine_tune_grid = GridSearchCV(pip_model_undersample, fine_tune_params, cv=3, scoring='f1_macro', n_jobs=-1, verbose=1)
fine_tune_grid.fit(X_train, y_train_encoded)

# Collecting and printing the fine-tuned results
fine_tuned_results = pd.DataFrame(fine_tune_grid.cv_results_) 
fine_tuned_best_index = fine_tuned_results['mean_test_score'].idxmax()
fine_tuned_best_params = fine_tuned_results.loc[fine_tuned_best_index, 'params']

# Print best model parameters
print("Best fine-tuned model parameters:") 
print(fine_tuned_best_params)
# Finding the best estimator paramaters 
tuned_model = fine_tune_grid.best_estimator_ 
y_pred_encoded = tuned_model.predict(X_test)
y_pred = label_encoder.inverse_transform(y_pred_encoded)
# View new perfomance (focus on F1-score) 
get_metrics(y_test, y_pred, unique_classes)

Fitting 3 folds for each of 49 candidates, totalling 147 fits
Best fine-tuned model parameters:
{'model__alpha': 0.001, 'model__eta': 0.2, 'model__lambda': 0.01, 'model__max_depth': None, 'model__n_estimators': 100}


{'Accuracy': 0.9774549098196392,
 'Balanced Accuracy': 0.8584689922480621,
 'Macro Recall': 0.8584689922480621,
 'Macro Precision': 0.6792641900686065,
 'Macro F1': 0.7459309913010922,
 'F1 Scores per Class': {'HDF': 0.9333333333333333,
  'NF': 0.9882720875684128,
  'OSF': 0.8421052631578947,
  'PWF': 0.7894736842105263,
  'TWF': 0.17647058823529413},
 'Recall Scores per Class': {'HDF': 1.0,
  'NF': 0.9798449612403101,
  'OSF': 1.0,
  'PWF': 0.9375,
  'TWF': 0.375},
 'Precision Scores per Class': {'HDF': 0.875,
  'NF': 0.9968454258675079,
  'OSF': 0.7272727272727273,
  'PWF': 0.6818181818181818,
  'TWF': 0.11538461538461539}}

In [136]:
# Example for XGBoost classifier
importance = pip_model_undersample.named_steps['model'].feature_importances_

# Match feature importance with feature names if available
feature_names = pip_model_undersample.named_steps['preprocessor'].transformers_[0][1].get_feature_names_out()

# Print feature importance scores
for feature_name, importance_score in zip(feature_names, importance):
    print(f"{feature_name}: {importance_score}")


Air_temperature: 0.07484892010688782
Process_temperature: 0.014372119680047035
Rotational_speed: 0.1466379016637802
Torque: 0.29766470193862915
Tool_wear: 0.41233253479003906


In [130]:
cm = confusion_matrix(y_test, y_pred)
cm

array([[  21,    0,    0,    0,    0],
       [   2, 1895,    5,    9,   24],
       [   0,    0,   16,    0,    0],
       [   0,    1,    0,   15,    0],
       [   0,    5,    0,    0,    3]])

In [133]:
cm = confusion_matrix(y_train, y_pred)
cm

array([[  85,    0,    0,    0,    0],
       [   0, 7735,    0,    0,    0],
       [   0,    0,   62,    0,    0],
       [   0,    0,    0,   64,    0],
       [   0,    0,    0,    0,   34]])

In [82]:
y_train.value_counts()

Failure_type
NF     7735
HDF      85
PWF      64
OSF      62
TWF      34
Name: count, dtype: int64

In [90]:
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

print("Label Encoder Mapping:")
print(label_mapping)

Label Encoder Mapping:
{'HDF': 0, 'NF': 1, 'OSF': 2, 'PWF': 3, 'TWF': 4}


In [101]:

# Creating pipeline without PCA analysis and balanced class with parameter by model 
sampling_strategy = {
    0: 7735,   # HDF 
    1: 7735,   # NF 
    2: 7735,   # OSF
    3: 7735,   # PWF
    4: 100000    # TWF
}

pip_model_undersample = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote_tomek', SMOTETomek(sampling_strategy=sampling_strategy, random_state=2023)),
    ('model', XGBClassifier(random_state=2023, objective='multi:softmax'))
])

# Fit pipeline with sample weights
pip_model_undersample.fit(X_train, y_train_encoded)

# Step 1: Generate Predictions
y_pred_encoded = pip_model_undersample.predict(X_test)
y_pred = label_encoder.inverse_transform(y_pred_encoded)

# Step 2: Evaluate Metrics
metrics = get_metrics(y_test, y_pred, unique_classes)

# Step 3: View Results metrics
metrics

{'Accuracy': 0.9769539078156313,
 'Balanced Accuracy': 0.8583656330749354,
 'Macro Recall': 0.8583656330749354,
 'Macro Precision': 0.6815806235430918,
 'Macro F1': 0.7457694465723662,
 'F1 Scores per Class': {'HDF': 0.9545454545454546,
  'NF': 0.9880083420229405,
  'OSF': 0.8648648648648649,
  'PWF': 0.75,
  'TWF': 0.17142857142857143},
 'Recall Scores per Class': {'HDF': 1.0,
  'NF': 0.979328165374677,
  'OSF': 1.0,
  'PWF': 0.9375,
  'TWF': 0.375},
 'Precision Scores per Class': {'HDF': 0.9130434782608695,
  'NF': 0.9968437664387164,
  'OSF': 0.7619047619047619,
  'PWF': 0.625,
  'TWF': 0.1111111111111111}}

In [73]:

categorical_features_indices = [i for i, feature in enumerate(NUMERIC_FEATURES + CATEGORIC_FEATURES) if feature in CATEGORIC_FEATURES]

# Create the pipeline with SMOTETomek and XGBClassifier
pip_model_undersample = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote_tomek', SMOTETomek(sampling_strategy='auto', random_state=2023, categorical_features=categorical_features_indices)),  # Adjust sampling_strategy as needed
    ('model', XGBClassifier(random_state=2023))
])

# Fit pipeline
pip_model_undersample.fit(X_train, y_train_encoded)

# Obtain transformed data after preprocessing
X_train_transformed = preprocessor.fit_transform(X_train)

# Apply SMOTETomek separately for visualization
X_resampled, y_resampled = SMOTETomek(sampling_strategy='auto', random_state=2023).fit_resample(X_train_transformed, y_train)

# Apply PCA to visualize in 2D
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_transformed)
X_resampled_pca = pca.transform(X_resampled)

# Visualize the data before and after sampling
plt.figure(figsize=(12, 6))

plt.subplot(1, 2, 1)
plt.title('Before SMOTETomek (PCA)')
sns.scatterplot(x=X_train_pca[:, 0], y=X_train_pca[:, 1], hue=y_train, palette='Set1', alpha=0.7)
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(loc='best')

plt.subplot(1, 2, 2)
plt.title('After SMOTETomek (PCA)')
sns.scatterplot(x=X_resampled_pca[:, 0], y=X_resampled_pca[:, 1], hue=y_resampled, palette='Set1', alpha=0.7)
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(loc='best')

plt.tight_layout()
plt.show()


TypeError: SMOTETomek.__init__() got an unexpected keyword argument 'categorical_features'

In [67]:
from imblearn.combine import SMOTEENN
# Creating pipeline without PCA analysis and balanced class with parameter by model 
pip_model_undersample = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote_tomek', SMOTEENN(random_state=2023)),  # Adjust sampling_strategy as needed
    ('model', XGBClassifier(random_state=2023, eval_metric='mlogloss', objective='multi:softmax'))
])

# Fit pipeline with sample weights
pip_model_undersample.fit(X_train, y_train_encoded)

# Step 1: Generate Predictions
y_pred_encoded = pip_model_undersample.predict(X_test)
y_pred = label_encoder.inverse_transform(y_pred_encoded)

# Step 2: Evaluate Metrics
metrics = get_metrics(y_test, y_pred, unique_classes)

# Step 3: View Results metrics
metrics

{'Accuracy': 0.9664328657314629,
 'Balanced Accuracy': 0.8436950904392765,
 'Macro Recall': 0.8436950904392765,
 'Macro Precision': 0.5987454556290601,
 'Macro F1': 0.6856955960032054,
 'F1 Scores per Class': {'HDF': 0.8235294117647058,
  'NF': 0.982437745740498,
  'OSF': 0.8,
  'PWF': 0.7272727272727273,
  'TWF': 0.09523809523809523},
 'Recall Scores per Class': {'HDF': 1.0,
  'NF': 0.9684754521963824,
  'OSF': 1.0,
  'PWF': 1.0,
  'TWF': 0.25},
 'Precision Scores per Class': {'HDF': 0.7,
  'NF': 0.9968085106382979,
  'OSF': 0.6666666666666666,
  'PWF': 0.5714285714285714,
  'TWF': 0.058823529411764705}}