In [None]:
'''
All Rights Reserved - Pouyan Dinarvand
date: 23/07/2021
-> Runs ML classification algorithms on the AM dataset to predict Good/Bad part quality
'''

# import libraries
!pip install hvplot
import numpy as np
import pandas as pd
import panel as pn
from panel.interact import interact

import hvplot.pandas 
import holoviews as hv

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.tree import export_text
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import GridSearchCV

from joblib import dump, load
from sklearn.pipeline import Pipeline


# Load the process/property table from a CSV file into a DataFrame.
# The path is relative, so the file must be in the current working directory.
df = pd.read_csv(
    filepath_or_buffer='AM_Ti6Al4V_process_property_parameters_one_Hot_encoded_relative_density.csv'
)


#@title ML Classification - Model Training {run:'auto'}

# NOTE: the '#@title', '#@markdown' and '#@param' comments are notebook form
# annotations; keep them on the same lines as the values they describe.

#@markdown >  Classification
algorithm = "KNN" #@param ["NeuralNetwork","KNN", "Tree","RandomForest", "AdaBoost", "GradientBoosting", "GaussianNB", "GaussianProcess", "SVM"]
test_train_ratio = 0.3 #@param {type:"slider", min:0, max:1, step:0.05}
scale_train_test = True #@param {type:"boolean"}

use_PCA = False #@param {type:"boolean"}
include_machine_type_and_layer_tickness = False #@param {type:"boolean"}
threshold_good_bad_quality_AM = 99.5 #@param {type:"slider", min:0, max:100, step:0.1}
print_params = True #@param {type:"boolean"}


#@markdown >  Optimization
cv =  2 #@param {type:"integer"}
only_print_results_best_model = True #@param {type:"boolean"}
#@markdown >  Saving ML Model
save_model = True #@param {type:"boolean"}

# Echo the selected settings so the configuration is recorded with the output.
if print_params:
    print('*************** Classification Parameters ****************')
    for _label, _value in (
        ('algorithm', algorithm),
        ('test_train_ratio', test_train_ratio),
        ('scale_train_test', scale_train_test),
        ('use_PCA', use_PCA),
        ('include_machine_type_and_layer_tickness', include_machine_type_and_layer_tickness),
        ('threshold_good_bad_quality_AM', threshold_good_bad_quality_AM),
    ):
        print(_label + ' = ', _value)

    print('*************** Optimization Parameters ***************')
    for _label, _value in (
        ('cv', cv),
        ('save_model', save_model),
    ):
        print(_label + ' = ', _value)


######################################### Feature Selection #########################################
# Choose the predictor columns. 'Layer_Thickness' and the one-hot
# 'Machine_Type_*' columns are only used when the form switch is on.
if include_machine_type_and_layer_tickness:
    feature_names_list = ['Laser_Power', 'Laser_Speed', 'Hatch_Spacing', 'Layer_Thickness', 'Powder_Size', 'Laser_Energy',
                          'Machine_Type_Concept Laser M2', 'Machine_Type_Concept laser M',
                          'Machine_Type_Concept laser M3', 'Machine_Type_EOS M270',
                          'Machine_Type_SLM', 'Machine_Type_SLM 125 HL',
                          'Machine_Type_SLM 250 HL', 'Machine_Type_SLM 280 HL',
                          'Machine_Type_Self-developed SLM', 'Machine_Type_Trumpf LF250']
else:
    feature_names_list = ['Laser_Power', 'Laser_Speed', 'Hatch_Spacing', 'Powder_Size', 'Laser_Energy']

# Feature matrix as a NumPy array (one column per entry of feature_names_list).
# No DataFrame copy is taken here: X and y are only handed on via np.copy().
X = df[feature_names_list].values

# Binarize the target for classification:
# 1 = good quality (Relative_Density >= threshold), 0 = bad quality.
# NOTE(review): a missing Relative_Density compares False and would be
# labelled 0 - confirm the CSV has no missing values in that column.
y = np.where(df['Relative_Density'].values >= threshold_good_bad_quality_AM, 1, 0)

# Optional class-balance diagnostics (disabled):
# if(print_params):
#     print('number of good quality (label 1) samples = ', sum(y))
#     print('number of bad quality (label 0) samples = ', len(y) - sum(y))
#     print('Ratio ones (good quality parts) in the dataset = ', sum(np.copy(y)/len(y)))

#print(X)
#print(y)

# Report the mutual information between each feature and the binary target.
# random_state is fixed to the same seed used elsewhere in this script so the
# printed table is identical from run to run.
if print_params:
    print('*********************  Mutual Information *********************')
    MI_df = pd.DataFrame({
        'Features': feature_names_list,
        'MI': mutual_info_classif(X=np.copy(X), y=np.copy(y), random_state=123),
    })
    print(MI_df)
    print('****************************************************************')



# Hold out a test set; test_train_ratio is passed as test_size and the
# seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    np.copy(X),
    np.copy(y),
    test_size=test_train_ratio,
    random_state=123,
)

# Ordered list of (name, transformer) steps; the estimator is appended to
# this list further down, before the Pipeline is built.
pipeline_steps = []

# Optional feature standardisation.
if scale_train_test:
    pipeline_steps.append(('scaler', StandardScaler()))
    print('StandardScaler added to the pipeline.')

# Optional dimensionality reduction via PCA. The feature names are replaced by
# PC1..PCn because the model then sees principal components, not raw columns.
if use_PCA:
    n_components_pca = 3
    feature_names_list = ['PC{}'.format(idx) for idx in range(1, n_components_pca + 1)]
    pipeline_steps.append(('pca', PCA(n_components=n_components_pca)))
    print('PCA with n_components = {} added to the pipeline.'.format(n_components_pca))

if print_params:
    print('Name of features:')
    print(feature_names_list)

    # Full array dumps (disabled):
    # print('X_train :', X_train, 'X_test :', X_test)
    # print('y_train :', y_train,'y_test :',  y_test)
    print('X_train shape :', X_train.shape, 'X_test shape :', X_test.shape)
    print('y_train shape :', y_train.shape, 'y_test shape :',  y_test.shape)

############################# Model training ###################################
# Pick the estimator named by `algorithm`, append it to the pipeline as the
# 'model' step, and define the hyper-parameter grid to search. Grid keys use
# the 'model__' prefix so GridSearchCV routes them to that pipeline step.
# An unrecognised name raises ValueError instead of leaving `param_grid`
# undefined (or stale from an earlier run) and the pipeline without a model.

# KNN
if algorithm == 'KNN':

    param_grid = [{'model__n_neighbors' : np.arange(1,25),
                   'model__p': [1,2,3,5,10,100],
                   'model__weights': ['uniform', 'distance'],
                   }] # [{exp1},{exp2}]

    pipeline_steps.append(('model', KNeighborsClassifier()))


# NeuralNetwork
elif algorithm == 'NeuralNetwork':

    # Wider grid kept for reference (disabled):
    # param_grid = [{'model__activation' : ['identity', 'logistic', 'tanh', 'relu'],
    #                 'model__solver': ['lbfgs', 'sgd', 'adam'],
    #                 'model__alpha': [0.001,0.01,0.1,1,10],
    #                 'model__hidden_layer_sizes': [ (2,) ,(5,), (10,), (100,), (2,2), (5,5), (10,10), (2,2,2,2,2), (5,5,5,5,5), (10,10,10)],
    #                 'model__learning_rate_init': [0.0001,0.01,0.1,1],
    #                 'model__max_iter': [50,500],
    #                 'model__learning_rate' : ['constant', 'invscaling', 'adaptive'],
    #                 'model__early_stopping': [True, False],
    #               }]
    # NOTE(review): with solver 'lbfgs', learning_rate_init / learning_rate /
    # early_stopping are documented as applying only to 'sgd'/'adam', so these
    # entries likely enlarge the grid without changing the fitted model -
    # confirm before trimming.
    param_grid = [{'model__activation' : ['tanh'],
                    'model__solver': ['lbfgs'],
                    'model__alpha': [0.001,0.01,0.1,1,10],
                    'model__hidden_layer_sizes': [ (100,), (100,100),  (100,100,100)],
                    'model__learning_rate_init': [0.0001,0.01,0.1,1],
                    'model__max_iter': [50,500],
                    'model__learning_rate' : ['constant'],
                    'model__early_stopping': [True],
                  }]

    pipeline_steps.append(('model', MLPClassifier(verbose = False, random_state = 123)))


# Tree
elif algorithm == 'Tree':

    param_grid = [{'model__max_depth' : [None, 2,5, 10, 25,50,100],
                   'model__criterion': ['entropy', 'gini'],
                   'model__splitter' : ['best', 'random'],
                   }] # [{exp1},{exp2}]

    pipeline_steps.append(('model', tree.DecisionTreeClassifier(random_state = 123)))


# RandomForest
elif algorithm == 'RandomForest':

    # Wider grid kept for reference (disabled):
    # param_grid = [{'model__n_estimators' :[100,1000,5000],
    #                'model__criterion': ['entropy', 'gini'],
    #                'model__max_depth': [None, 2, 5, 10, 25, 50, 100],
    #               }]
    param_grid = [{'model__n_estimators' :[5000],
                   'model__criterion': ['entropy'],
                   'model__max_depth': [None],
                  }]

    pipeline_steps.append(('model', RandomForestClassifier(random_state = 123)))


# AdaBoost
elif algorithm == 'AdaBoost':

    param_grid = [{'model__n_estimators' :[100,1000,5000],
                   'model__learning_rate': [0.0001, 0.01,0.1,1,10],
                  }]
    pipeline_steps.append(('model', AdaBoostClassifier(random_state = 123)))


# GradientBoosting
elif algorithm == 'GradientBoosting':

    # NOTE(review): 'mse' and 'mae' were deprecated and later removed as
    # criterion values in newer scikit-learn releases ('squared_error'
    # replaces 'mse') - confirm against the installed version.
    param_grid = [{'model__n_estimators' :[100, 1000,5000],
                   'model__learning_rate': [0.0001, 0.01,0.1,1,10],
                   'model__max_depth': [None, 2, 5, 10, 25, 50, 100],
                   'model__criterion': ['friedman_mse', 'mse', 'mae'],
                  }]
    pipeline_steps.append(('model', GradientBoostingClassifier(random_state = 123)))


# GaussianNB
elif algorithm == 'GaussianNB':

    # Nothing to tune: a single empty grid entry means one fit with defaults.
    param_grid = [{
                  }]
    pipeline_steps.append(('model', GaussianNB()))


# GaussianProcess
elif algorithm == 'GaussianProcess':

    param_grid = [{'model__kernel' :[1 * RBF(10), 0.1 * RBF(10), 0.001 * RBF(10) ,1 * RBF(1), 0.1 * RBF(1), 0.001 * RBF(1), 1 * RBF(0.01), 0.1 * RBF(0.01), 0.001 * RBF(0.01),],
                  }]
    pipeline_steps.append(('model', GaussianProcessClassifier(random_state = 123)))


# SVM
elif algorithm == 'SVM':

    param_grid = [{'model__C' :[0.0001,0.001,0.01,0.1,1,10,100,1000],
                   'model__gamma' :['scale', 'auto'],
                   'model__kernel' :['linear', 'poly', 'rbf', 'sigmoid'],
                  }]
    pipeline_steps.append(('model', SVC(random_state = 123, verbose = False))) # set probability=True to get prob estimates


# Unknown algorithm name: fail early with an explicit message.
else:
    raise ValueError(
        "Unknown algorithm '" + str(algorithm) + "'. Expected one of: "
        "NeuralNetwork, KNN, Tree, RandomForest, AdaBoost, GradientBoosting, "
        "GaussianNB, GaussianProcess, SVM."
    )
    

#########################################  Fit Pipeline #########################################
# Build the pipeline from the collected steps, grid-search its hyper-parameters
# with `cv`-fold cross-validation on the training set, and report the outcome.
print(algorithm + ' added to the pipeline.')
my_pipeline = Pipeline(pipeline_steps)

print('****************************************************************')

print('param_grid:')
print(param_grid)

print('Optimization in process ... ')

# n_jobs=-1 asks for all available cores.
model = GridSearchCV(estimator=my_pipeline, param_grid=param_grid, n_jobs=-1, cv=cv).fit(X_train, y_train)

if print_params:
    print('Best hyper-parameters found in the hyper-parameters grid:')
    print(model.best_params_)

    print('~~~~~~~~~~~~~~~~~~~~ Best Algorithm hyper-parameters ~~~~~~~~~~~~~~~~~~~~')
    # Call get_params() so the parameter dict is printed, not the bound method.
    print(model.best_estimator_.get_params())
    print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')

    print('Optimization cv results:')
    cv_results = model.cv_results_
    # Either report only the winning candidate (located via best_index_)
    # or every candidate in the grid, in grid order.
    if only_print_results_best_model:
        shown_candidates = [model.best_index_]
    else:
        shown_candidates = range(len(cv_results['params']))
    for idx in shown_candidates:
        print('- params: ', cv_results['params'][idx])
        print('std cv score = ', np.round(cv_results['std_test_score'][idx], 2),
              ' , mean cv score = ', np.round(cv_results['mean_test_score'][idx], 2))

    # Extra output for the tree-based models, which expose feature_importances_.
    if algorithm in ('Tree', 'GradientBoosting', 'AdaBoost', 'RandomForest'):
        fitted_estimator = model.best_estimator_['model']

        if algorithm == 'Tree':
            text_tree = export_text(fitted_estimator, feature_names=feature_names_list)
            print('Best Tree Text: ')
            print(text_tree)

        print('***** Feature Importances *****')
        ft_imp = pd.DataFrame({'Features': feature_names_list,
                               'Importance': fitted_estimator.feature_importances_})
        print(ft_imp)

######################################### Prediction on test set #########################################
# Score the tuned model on the held-out test set only; the train-set
# diagnostics below are kept for reference but disabled.
class_labels = [0, 1]

#y_pred_train = model.predict(np.copy(X_train))
y_pred_test = model.predict(np.copy(X_test))

# print( 'accuracy_score on train set = ', accuracy_score(y_true = np.copy(y_train), y_pred = np.copy(y_pred_train)) )
# print( 'accuracy_score on test set = ', accuracy_score(y_true = np.copy(y_test), y_pred = np.copy(y_pred_test)) )
print('****************************************************************')

# print('Classification Report on the train set:')
# print(classification_report(y_true = np.copy(y_train), y_pred = np.copy(y_pred_train), labels=[0, 1]))

print('Classification Report on the test set:')
test_report = classification_report(
    y_true=np.copy(y_test),
    y_pred=np.copy(y_pred_test),
    labels=class_labels,
)
print(test_report)

print('Confusion Matrix')
cm = confusion_matrix(
    y_true=np.copy(y_test),
    y_pred=np.copy(y_pred_test),
    labels=class_labels,
)
cm_plot = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
cm_plot.plot()

########################################## Save Model (pickle) #########################################
# Persist the best pipeline found by the grid search (not the search object
# itself) to 'model.joblib' in the current working directory.
if save_model:
    dump(value=model.best_estimator_, filename='model.joblib')
    print('model saved as model.joblib')

########################################## Clear Memory #########################################
# Release the large objects created by this script. Each name is removed
# independently, so a name that was never defined does not stop the remaining
# ones from being cleared.
for _name in ('my_pipeline', 'model', 'X', 'y', 'param_grid'):
    globals().pop(_name, None)
del _name