In [1]:
import os
import time
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import make_scorer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [2]:
# Locate the processed-data folder inside the current working directory.
path_parent = os.getcwd()
saveFolder = os.path.join(path_parent, 'Data_CNC_Processed')
print(saveFolder)
filename = 'pred_machining_process'
compression = 'gzip'
# Join the file name with os.path.join instead of a literal backslash so the
# path resolves on every operating system, not only on Windows.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it was produced by a trusted pipeline.
df = pd.read_pickle(os.path.join(saveFolder, f'{filename}.pkl'), compression=compression)

c:\Users\nnnpo\Desktop\Data_CNC_Processed


In [3]:
# Feature columns are every column except the target. The comparison must be
# an exact (in)equality: `item not in 'Machining_Process'` is a *substring*
# test and would silently drop any column whose name happens to be contained
# in the target's name.
colsX = np.array([item for item in df.columns if item != 'Machining_Process'])
X = df[colsX].values
y = df['Machining_Process'].values
# Stratified 70/30 hold-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)

In [4]:
# Scorers passed to cross_validate: plain accuracy plus macro-averaged
# precision, recall and F1.
scorer_ACC = make_scorer(accuracy_score)
scorer_PRE, scorer_REC, scorer_F1 = (
    make_scorer(metric, average='macro')
    for metric in (precision_score, recall_score, f1_score)
)

# cross_validate reports each entry under the key 'test_<name>'.
scorings = dict(
    accuracy=scorer_ACC,
    precision=scorer_PRE,
    recall=scorer_REC,
    f1=scorer_F1,
)

In [5]:
def calculate_scores(clf, name, data):
    """Score one classifier by 10-fold cross-validation and on the hold-out split.

    Parameters
    ----------
    clf : estimator
        Classifier accepted by ``cross_validate`` and exposing ``fit`` /
        ``predict``. It is fitted on the training split as a side effect.
    name : str
        Label used to build the ``clf`` column (``"<name> (Train)"`` and
        ``"<name> (Test)"``).
    data : tuple
        ``(X_train, X_test, y_train, y_test)``.

    Returns
    -------
    pandas.DataFrame
        Two rows with columns ``clf, ACC, PRE, REC, F1, Type``. The "Train"
        row holds the mean of each metric over the 10 cross-validation folds
        of the training split; the "Test" row holds the metrics of the model
        refitted on the whole training split and evaluated on the test split.
        All metric values are floats, so both rows share one numeric dtype.

    Notes
    -----
    Relies on the notebook-level ``scorings`` dict for the cross-validation
    metrics.
    """
    X_train, X_test, y_train, y_test = data
    columns = ['clf', 'ACC', 'PRE', 'REC', 'F1', 'Type']

    # Cross-validated scores on the training split.
    cv_scores = cross_validate(estimator=clf,
                               X=X_train,
                               y=y_train,
                               cv=10,
                               scoring=scorings,
                               n_jobs=-1)

    # Keep the fold means as floats (not pre-formatted strings) so the column
    # stays numeric and matches the test row; formatting is left to display.
    train_row = {'clf': f"{name} (Train)",
                 'ACC': float(cv_scores['test_accuracy'].mean()),
                 'PRE': float(cv_scores['test_precision'].mean()),
                 'REC': float(cv_scores['test_recall'].mean()),
                 'F1': float(cv_scores['test_f1'].mean()),
                 'Type': 'Train'}

    # Testing scores: refit on the full training split, evaluate on hold-out.
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    test_row = {'clf': f"{name} (Test)",
                'ACC': accuracy_score(y_true=y_test, y_pred=y_pred),
                'PRE': precision_score(y_true=y_test, y_pred=y_pred, average='macro'),
                'REC': recall_score(y_true=y_test, y_pred=y_pred, average='macro'),
                'F1': f1_score(y_true=y_test, y_pred=y_pred, average='macro'),
                'Type': 'Test'}

    # Build the frame once from the two records; DataFrame.append was removed
    # in pandas 2.0.
    return pd.DataFrame([train_row, test_row], columns=columns)

In [6]:
# Classifier
# Single decision tree with unlimited depth.
tree = DecisionTreeClassifier(criterion='entropy', max_depth=None, random_state=1)

# Depth-1 tree (decision stump) used as AdaBoost's weak learner.
base = DecisionTreeClassifier(criterion='entropy',
                              max_depth=1,
                              random_state=1)

# The weak learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases (and the old
# name later removed), while the first positional slot works on both.
ada = AdaBoostClassifier(base,
                         n_estimators=500,
                         learning_rate=0.1,
                         random_state=1)


# Gradient Boosting Machine
# `loss` is left at its default, which is the same log-loss that the literal
# 'deviance' selected; that literal is rejected by newer scikit-learn.
# random_state is fixed like the other scikit-learn models for reproducibility.
gbm = GradientBoostingClassifier(learning_rate=0.1,
                                n_estimators=100,
                                min_samples_split=2, #The minimum number of samples required to split an internal node
                                min_samples_leaf=1, #The minimum number of samples required to be at a leaf node.
                                max_depth=3, #The maximum depth of the individual regression estimators.
                                random_state=1
                                )

# XGBoost
# NOTE(review): `use_label_encoder` is only meaningful on older xgboost
# releases; confirm against the installed version before removing it.
xgb = XGBClassifier(eval_metric="logloss", use_label_encoder=False, n_estimators=100)

# LightGBM
lgb = LGBMClassifier(n_estimators=100)

# CatBoost
cat = CatBoostClassifier(verbose=0, n_estimators=100)

In [7]:
# Classifiers to benchmark, keyed by the label shown in the score table.
clfs = {
    'Tree': tree,
    'AdaBoost': ada,
    'GBM-SKL': gbm,
    'XGBoost': xgb,
    'LightGBM': lgb,
    'CatBoost': cat,
}

# Same train/test split for every classifier.
data = (X_train, X_test, y_train, y_test)

# One two-row score frame per classifier, in dict order.
df_all = []
for name in clfs:
    clf = clfs[name]
    # Time cross-validation plus the final fit/predict for this classifier.
    tic = time.perf_counter()
    df_out = calculate_scores(clf, name, data)
    toc = time.perf_counter()
    ep_time = toc - tic
    print(f'Calculating {name}, Elapse Time: {ep_time:0.4f} seconds')
    df_all.append(df_out)

Calculating Tree, Elapse Time: 3.2926 seconds
Calculating AdaBoost, Elapse Time: 54.3321 seconds
Calculating GBM-SKL, Elapse Time: 151.0059 seconds
Calculating XGBoost, Elapse Time: 46.4320 seconds
Calculating LightGBM, Elapse Time: 11.2594 seconds
Calculating CatBoost, Elapse Time: 24.9130 seconds


In [8]:
# Stack the per-classifier score frames and index the table by classifier label.
# NOTE(review): this rebinds `df`, which previously held the loaded dataset;
# re-running the feature/target split cell after this one will fail.
df = pd.concat(df_all, axis=0).set_index('clf')
display(df)

Unnamed: 0_level_0,ACC,PRE,REC,F1,Type
clf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Tree (Train),0.631,0.628,0.628,0.627,Train
Tree (Test),0.657791,0.654832,0.656254,0.65534,Test
AdaBoost (Train),0.216,0.203,0.184,0.166,Train
AdaBoost (Test),0.221592,0.224202,0.19044,0.174247,Test
GBM-SKL (Train),0.333,0.351,0.311,0.317,Train
GBM-SKL (Test),0.343,0.36759,0.320241,0.326516,Test
XGBoost (Train),0.528,0.536,0.516,0.522,Train
XGBoost (Test),0.566306,0.572127,0.551802,0.558534,Test
LightGBM (Train),0.546,0.557,0.534,0.541,Train
LightGBM (Test),0.570129,0.581637,0.556599,0.564734,Test
