# Data Import

In [None]:
pip install scikit-learn==1.2.2 imbalanced-learn==0.8.0

Collecting imbalanced-learn==0.8.0
  Downloading imbalanced_learn-0.8.0-py3-none-any.whl (206 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m206.5/206.5 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: imbalanced-learn
  Attempting uninstall: imbalanced-learn
    Found existing installation: imbalanced-learn 0.10.1
    Uninstalling imbalanced-learn-0.10.1:
      Successfully uninstalled imbalanced-learn-0.10.1
Successfully installed imbalanced-learn-0.8.0


In [None]:
# Mount Google Drive at /content/drive so files stored there can be read
# by path from this runtime (this import is only available on Google Colab).
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np

Mounted at /content/drive


In [None]:
# Load the encoded dataset from the mounted Drive folder.
# NOTE(review): the preview shows an 'Unnamed: 0' column, presumably an index
# written when the file was saved; it is not used as a feature below — confirm
# whether it should be dropped or read with index_col=0.
df = pd.read_csv("/content/drive/MyDrive/Thesis/encoded_A14I_dataset")
# Show the first rows as the cell output.
df.head()

Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure
0,1,298.1,308.6,1551,42.8,0,0
1,0,298.2,308.7,1408,46.3,3,0
2,0,298.1,308.5,1498,49.4,5,0
3,0,298.2,308.6,1433,39.5,7,0
4,0,298.2,308.7,1408,40.0,9,0


In [None]:
# Predictor columns passed to every model.
feature_columns = [
    'Type',
    'Air temperature [K]',
    'Process temperature [K]',
    'Rotational speed [rpm]',
    'Torque [Nm]',
    'Tool wear [min]',
]
# Label column to predict.
target_column = 'Machine failure'

# Plain NumPy arrays, so rows can be selected with the positional index
# arrays produced by the cross-validation splitters.
X = df.loc[:, feature_columns].to_numpy()
Y = df.loc[:, target_column].to_numpy()

# Implement models

## Set up functions

In [None]:
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.metrics import make_scorer
from sklearn.metrics import f1_score

In [None]:
def custom_f1(Y, Y_pred):
    """Return the weighted-average F1 score over all classes.

    ``Y`` is passed to ``f1_score`` as the reference labels and ``Y_pred``
    as the predicted labels, with ``average="weighted"``.
    """
    weighted_f1 = f1_score(y_true=Y, y_pred=Y_pred, average="weighted")
    return weighted_f1

In [None]:
def custom_f1_minority(Y, Y_pred):
    """Return the weighted-average F1 score restricted to labels 1, 2, 3 and 4.

    ``Y`` is passed to ``f1_score`` as the reference labels and ``Y_pred``
    as the predicted labels; only the listed labels enter the average.
    """
    minority_labels = [1, 2, 3, 4]
    score = f1_score(Y, Y_pred, labels=minority_labels, average="weighted")
    return score

In [None]:
def custom_f1_majority(Y, Y_pred):
    """Return the F1 score restricted to label 0.

    ``Y`` is passed to ``f1_score`` as the reference labels and ``Y_pred``
    as the predicted labels; with a single label in ``labels`` the weighted
    average is that label's F1.
    """
    majority_labels = [0]
    score = f1_score(Y, Y_pred, labels=majority_labels, average="weighted")
    return score

In [None]:
def nested_cv_model(X, Y, grid, classifier, random_state=None, n_splits=5):
    """Run nested stratified cross-validation and summarise F1 scores.

    For every outer fold, a SMOTE -> MinMaxScaler -> ``classifier`` pipeline is
    tuned with ``RandomizedSearchCV`` on the outer-training part, using an
    inner stratified CV and weighted F1 as the selection metric. The tuned
    search object is then used to predict on the outer-training and
    outer-test parts, and F1 is computed for label 0 ("majority"),
    labels 1-4 ("minority") and all labels (weighted average).

    Parameters
    ----------
    X : array
        Feature matrix; must support selection with integer index arrays.
    Y : array
        Labels aligned with ``X``; must support the same selection.
    grid : dict
        Parameter distributions for ``RandomizedSearchCV``; keys address
        pipeline steps (e.g. ``'classifier__C'``).
    classifier : estimator
        Final step of the pipeline.
    random_state : int or None, default None
        Seed forwarded to both CV splitters, SMOTE and the randomized search.
        ``None`` keeps the original unseeded behaviour. The classifier's own
        seed, if any, remains the caller's responsibility.
    n_splits : int, default 5
        Number of folds for both the outer and the inner CV.

    Returns
    -------
    tuple of four dicts
        ``{'Parameters': ...}`` with the best parameters of the outer fold
        whose inner CV score was highest, followed by the majority, minority
        and weighted-average score dictionaries (means over outer folds).
    """
    outer_cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # The scorer does not depend on the fold, so build it once.
    weighted_f1_scorer = make_scorer(custom_f1)

    train_score_majority = []
    test_score_majority = []

    train_score_minority = []
    test_score_minority = []

    train_wa = []
    test_wa = []
    cv_wa = []

    aggregated_results = []

    for train_index, test_index in outer_cv.split(X, Y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = Y[train_index], Y[test_index]

        # NOTE(review): SMOTE runs before scaling, so synthetic samples are
        # generated on the raw feature scales. Consider scaling first; the
        # order is kept here so results stay comparable with earlier runs.
        pipeline = imbpipeline(steps=[('smote', SMOTE(random_state=random_state)),
                                      ('scaler', MinMaxScaler()),
                                      ('classifier', classifier)])

        # Inner CV for hyperparameter tuning on the outer-training part only.
        inner_cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
        grid_search = RandomizedSearchCV(pipeline,
                                         grid,
                                         scoring=weighted_f1_scorer,
                                         cv=inner_cv,
                                         n_jobs=-1,
                                         random_state=random_state)

        grid_search.fit(X_train, y_train)

        Y_pred_train = grid_search.predict(X_train)
        Y_pred_test = grid_search.predict(X_test)

        # The scoring helpers take (true labels, predictions) in that order.
        # Swapping them would weight the minority-class average by predicted
        # counts instead of true counts.
        train_score_majority.append(custom_f1_majority(y_train, Y_pred_train))
        train_score_minority.append(custom_f1_minority(y_train, Y_pred_train))

        test_score_majority.append(custom_f1_majority(y_test, Y_pred_test))
        test_score_minority.append(custom_f1_minority(y_test, Y_pred_test))

        train_wa.append(custom_f1(y_train, Y_pred_train))
        test_wa.append(custom_f1(y_test, Y_pred_test))
        cv_wa.append(grid_search.best_score_)

        aggregated_results.append((grid_search.best_params_, grid_search.best_score_))

    # Parameters of the outer fold with the highest inner-CV score.
    best_params_global = max(aggregated_results, key=lambda x: x[1])[0]

    # Average the scores across all outer folds.
    average_train_score_majority = np.mean(train_score_majority)
    average_test_score_majority = np.mean(test_score_majority)

    average_train_score_minority = np.mean(train_score_minority)
    average_test_score_minority = np.mean(test_score_minority)

    average_train_wa = np.mean(train_wa)
    average_test_wa = np.mean(test_wa)
    average_cv_wa = np.mean(cv_wa)

    return (
        {'Parameters': best_params_global},
        {'majority train': average_train_score_majority,
         'majority test': average_test_score_majority},
        {'minority train': average_train_score_minority,
         'minority test': average_test_score_minority},
        {'weighted average train': average_train_wa,
         'weighted average cv': average_cv_wa,
         'weighted average test': average_test_wa},
    )

## Logistic Regression

In [None]:
from scipy.stats import uniform
from sklearn.linear_model import LogisticRegression

In [None]:
# Search space for the logistic-regression step; keys address the pipeline
# step named 'classifier'.
param_grid_lr = {
    # Pass the distribution itself, not five values pre-drawn with .rvs().
    # RandomizedSearchCV then samples C from [0.1, 10.1] for each candidate,
    # and the draw is controlled by the search's own random_state instead of
    # an unseeded draw made when this cell runs.
    'classifier__C': uniform(loc=0.1, scale=10),
    'classifier__max_iter': [100, 200, 300, 400],
    'classifier__tol': np.logspace(-5, -2, 5),
    'classifier__solver': ['lbfgs', 'newton-cg', 'sag', 'saga']
}
logistic_regression = LogisticRegression(multi_class='multinomial')

In [None]:
# Run nested CV for logistic regression. The four returned dicts hold the
# selected parameters and the majority / minority / weighted-average F1 summaries.
lr_parameters, lr_majority, lr_minority, lr_average = nested_cv_model(X,Y, param_grid_lr, logistic_regression)
# Display the selected hyperparameters and the weighted-average scores.
lr_parameters, lr_average

({'Parameters': {'classifier__tol': 1e-05,
   'classifier__solver': 'newton-cg',
   'classifier__max_iter': 100,
   'classifier__C': 8.642628161954384}},
 {'weighted average train': 0.6604072336364917,
  'weighted average cv': 0.6685273268979357,
  'weighted average test': 0.6574097457171133})

In [None]:
# Display the label-0 ("majority") and labels-1-4 ("minority") F1 summaries.
lr_majority, lr_minority

({'majority train': 0.6682877299226251, 'majority test': 0.6653646301425098},
 {'minority train': 0.29339350750873144, 'minority test': 0.289055421965851})

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Candidate values for the decision-tree step; keys address the pipeline
# step named 'classifier'.
param_grid_dt = {
    'classifier__criterion': ['gini', 'entropy'],
    'classifier__max_depth': [3, 5, 8],
    'classifier__min_samples_leaf': [2, 4, 6, 8],
    'classifier__ccp_alpha': [0.0001, 0.001, 0.01],
}

# Estimator with default settings; the search overrides the parameters above.
decision_tree = DecisionTreeClassifier()

In [None]:
# Run nested CV for the decision tree. The four returned dicts hold the
# selected parameters and the majority / minority / weighted-average F1 summaries.
dt_parameters, dt_majority, dt_minority, dt_average = nested_cv_model(X,Y, param_grid_dt, decision_tree)
# Display the selected hyperparameters and the weighted-average scores.
dt_parameters, dt_average

({'Parameters': {'classifier__min_samples_leaf': 6,
   'classifier__max_depth': 8,
   'classifier__criterion': 'entropy',
   'classifier__ccp_alpha': 0.0001}},
 {'weighted average train': 0.7751270749484824,
  'weighted average cv': 0.7732649682568231,
  'weighted average test': 0.7704896944014527})

In [None]:
# Display the label-0 ("majority") and labels-1-4 ("minority") F1 summaries.
dt_majority, dt_minority

({'majority train': 0.7803282742304755, 'majority test': 0.776853308573559},
 {'minority train': 0.3669647671228404, 'minority test': 0.3200632310994583})

## Random Forest (RF)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Candidate values for the random-forest step; keys address the pipeline
# step named 'classifier'.
param_grid_rf = {
    'classifier__n_estimators': [10, 30, 50],
    'classifier__max_depth': [3, 5, 8],
    'classifier__min_samples_leaf': [2, 4, 6, 8],
    'classifier__bootstrap': [True, False],
}

# Estimator with default settings; the search overrides the parameters above.
random_forest = RandomForestClassifier()

In [None]:
# Run nested CV for the random forest. The four returned dicts hold the
# selected parameters and the majority / minority / weighted-average F1 summaries.
rf_parameters, rf_majority, rf_minority, rf_average = nested_cv_model(X,Y, param_grid_rf, random_forest)
# Display the selected hyperparameters and the weighted-average scores.
rf_parameters, rf_average

({'Parameters': {'classifier__n_estimators': 30,
   'classifier__min_samples_leaf': 8,
   'classifier__max_depth': 8,
   'classifier__bootstrap': False}},
 {'weighted average train': 0.8233118978175593,
  'weighted average cv': 0.8432052288901215,
  'weighted average test': 0.8179819066592058})

In [None]:
# Display the label-0 ("majority") and labels-1-4 ("minority") F1 summaries.
rf_majority, rf_minority

({'majority train': 0.8329958137151969, 'majority test': 0.8283985608854575},
 {'minority train': 0.34645878593253826, 'minority test': 0.32267710651271053})