### Helper Functions

In [299]:
# helper functions for models-comparison
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, auc, confusion_matrix


def get_metrics(y_true, y_pred, print_metrics=True):
    """
    Get accuracy, precision, recall, f1-score, auc, confusion matrix
    """
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, average='weighted')
    rec = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    conf_mat = confusion_matrix(y_true, y_pred)

    if print_metrics:
        print('Accuracy: {:.2f}'.format(acc*100))
        print('Precision: {:.2f}'.format(prec))
        print('Recall: {:.2f}'.format(rec))
        print('F1-score: {:.2f}'.format(f1))
        print('Confusion matrix:')
        print(pd.DataFrame(conf_mat))

    return acc, prec, rec, f1, conf_mat


# take the model and the train and test data and return the metrics
def get_metrics_model(model, X_train, y_train, X_test, y_test, print_metrics=True):
    """
    Get accuracy, precision, recall, f1-score, auc, confusion matrix
    """
    # get model name
    if print_metrics:
       print("training model: {}".format(model.__class__.__name__))
    model.fit(X_train, y_train)
    if print_metrics:
        print("predicting model: {}".format(model.__class__.__name__))
    y_pred = model.predict(X_test)
    if print_metrics:
        print("evaluating model: {}".format(model.__class__.__name__))
    acc, prec, rec, f1, conf_mat = get_metrics(y_test, y_pred, print_metrics)

    if print_metrics:
        print('saving model: {}'.format(model.__class__.__name__))
    
    # # save the trained model
    # model_name = model.__class__.__name__
    # model.save('models/{}.h5'.format(model_name))

    return acc, prec, rec, f1, conf_mat


# compute the metrics for all the models and return a dataframe with the results
def get_metrics_all_models(models, X_train, y_train, X_test, y_test):
    """
    Get accuracy, precision, recall, f1-score, auc, confusion matrix for all models
    """
    metrics = []
    for model in models:
        print('Model: {}'.format(model.__class__.__name__))
        acc, prec, rec, f1, conf_mat = get_metrics_model(model, X_train, y_train, X_test, y_test, print_metrics=False)
        metrics.append([acc, prec, rec, f1, conf_mat])

    df_metrics = pd.DataFrame(metrics, columns=['Accuracy', 'Precision', 'Recall', 'F1-score', 'Confusion matrix'])
    df_metrics.index = [str(model) for model in models]

    return df_metrics


In [71]:
from sklearn.model_selection import train_test_split
# Define the number of samples and features
num_samples = 1000
num_features = 6
# Create a random feature matrix
X = np.random.rand(num_samples, num_features)
# Create corresponding labels 6 classes (0, 1, 2, 3, 4, 5)
Y = np.random.randint(6, size=num_samples)

# split the data into training (80%) and testing (20%) sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# check the size of the training and testing sets
print("Training set size: ", X_train.shape[0])
print("Testing set size: ", X_test.shape[0])

Training set size:  800
Testing set size:  200


# - Models

In [313]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import  GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from xgboost import XGBClassifier

models = [svm.SVC(kernel='linear'), RandomForestClassifier(n_estimators=1), KNeighborsClassifier(n_neighbors=14),DecisionTreeClassifier(max_depth=3), GaussianNB(),
          LogisticRegression(), GradientBoostingClassifier(n_estimators=100, learning_rate=.1, max_depth=3, random_state=0), AdaBoostClassifier(n_estimators=1000, random_state=0), 
          ExtraTreesClassifier(n_estimators=1, random_state=0), XGBClassifier(n_estimators=1000, learning_rate=0.008, max_depth=3, random_state=0)]


### Linear SVM Model

In [315]:
acc, prec, rec, f1, conf_mat = get_metrics_model(models[0], X_train, Y_train, X_test, Y_test)


training model: SVC
predicting model: SVC
evaluating model: SVC
Accuracy: 17.50
Precision: 0.15
Recall: 0.17
F1-score: 0.14
Confusion matrix:
   0   1  2   3  4   5
0  1  12  8  10  1   5
1  0  11  5  11  1   4
2  0   8  4   7  1   5
3  0   9  6  13  0   6
4  1  10  3   5  0  10
5  3  12  9  13  0   6


### Random Forest

In [318]:
acc, prec, rec, f1, conf_mat = get_metrics_model(models[1], X_train, Y_train, X_test, Y_test) # check n_estimators, max_depth, random_state


training model: RandomForestClassifier
predicting model: RandomForestClassifier
evaluating model: RandomForestClassifier
Accuracy: 17.00
Precision: 0.18
Recall: 0.17
F1-score: 0.17
Confusion matrix:
   0  1  2  3  4   5
0  6  4  5  4  9   9
1  3  6  2  7  3  11
2  1  8  2  5  5   4
3  3  5  6  7  7   6
4  2  7  3  3  5   9
5  5  5  7  9  9   8


### KNN

In [319]:
acc, prec, rec, f1, conf_mat = get_metrics_model(models[2], X_train, Y_train, X_test, Y_test) # check n_neighbors


training model: KNeighborsClassifier
predicting model: KNeighborsClassifier
evaluating model: KNeighborsClassifier
Accuracy: 21.00
Precision: 0.21
Recall: 0.21
F1-score: 0.21
Confusion matrix:
    0   1  2   3  4  5
0   4   8  5   9  9  2
1   2   7  7   5  4  7
2   5   6  4   4  2  4
3   6   5  4  14  3  2
4   4   6  6   2  7  4
5  10  12  5   7  3  6


### Decision Tree

In [321]:
acc, prec, rec, f1, conf_mat = get_metrics_model(models[3], X_train, Y_train, X_test, Y_test) # check max_depth, random_state


training model: DecisionTreeClassifier
predicting model: DecisionTreeClassifier
evaluating model: DecisionTreeClassifier
Accuracy: 18.00
Precision: 0.31
Recall: 0.18
F1-score: 0.09
Confusion matrix:
   0   1  2  3  4  5
0  0  32  0  5  0  0
1  0  31  1  0  0  0
2  0  23  1  1  0  0
3  0  31  0  3  0  0
4  0  28  1  0  0  0
5  0  38  2  2  0  1


  _warn_prf(average, modifier, msg_start, len(result))


### Naive Bayes 

In [322]:
acc, prec, rec, f1, conf_mat = get_metrics_model(models[4], X_train, Y_train, X_test, Y_test) 


training model: GaussianNB
predicting model: GaussianNB
evaluating model: GaussianNB
Accuracy: 15.50
Precision: 0.14
Recall: 0.15
F1-score: 0.14
Confusion matrix:
   0   1  2   3  4   5
0  2  16  5   6  2   6
1  0   9  7   9  2   5
2  1   7  4   5  1   7
3  1   9  8  11  0   5
4  2   8  3   6  0  10
5  6  12  8   7  5   5


### Logistic Regression

In [323]:
acc, prec, rec, f1, conf_mat = get_metrics_model(models[5], X_train, Y_train, X_test, Y_test)


training model: LogisticRegression
predicting model: LogisticRegression
evaluating model: LogisticRegression
Accuracy: 17.50
Precision: 0.18
Recall: 0.17
F1-score: 0.16
Confusion matrix:
   0   1  2   3  4  5
0  2  11  7   8  3  6
1  0  10  7  10  2  3
2  0   8  4   5  2  6
3  0   6  7  13  1  7
4  1  11  3   4  1  9
5  4  13  8  10  3  5


### Gradient Boosting

In [324]:
acc, prec, rec, f1, conf_mat = get_metrics_model(models[6], X_train, Y_train, X_test, Y_test) # check n_estimators, learning_rate, max_depth, random_state


training model: GradientBoostingClassifier
predicting model: GradientBoostingClassifier
evaluating model: GradientBoostingClassifier
Accuracy: 17.50
Precision: 0.18
Recall: 0.17
F1-score: 0.18
Confusion matrix:
   0  1  2   3  4   5
0  4  7  6   9  7   4
1  3  7  6  11  1   4
2  4  6  4   7  1   3
3  3  9  4   5  4   9
4  3  7  3   7  4   5
5  6  6  7   7  6  11


### AdaBoost

In [325]:
acc, prec, rec, f1, conf_mat = get_metrics_model(models[7], X_train, Y_train, X_test, Y_test) # check n_estimators, random_state


training model: AdaBoostClassifier
predicting model: AdaBoostClassifier
evaluating model: AdaBoostClassifier
Accuracy: 16.50
Precision: 0.17
Recall: 0.17
F1-score: 0.16
Confusion matrix:
   0   1  2  3  4   5
0  7   6  3  6  8   7
1  2   9  1  7  3  10
2  5   5  4  5  2   4
3  3   6  3  6  5  11
4  5  10  3  7  1   3
5  8   8  5  8  8   6


### Extra Trees

In [326]:
acc, prec, rec, f1, conf_mat = get_metrics_model(models[8], X_train, Y_train, X_test, Y_test) # check n_estimators, random_state


training model: ExtraTreesClassifier
predicting model: ExtraTreesClassifier
evaluating model: ExtraTreesClassifier
Accuracy: 19.00
Precision: 0.20
Recall: 0.19
F1-score: 0.19
Confusion matrix:
   0   1   2  3  4  5
0  7  13   5  4  4  4
1  3  12   3  4  5  5
2  4   5   3  4  6  3
3  5   7   7  5  4  6
4  4   7   5  4  5  4
5  5   5  17  4  6  6


### XGBoost

In [328]:
acc, prec, rec, f1, conf_mat = get_metrics_model(models[9], X_train, Y_train, X_test, Y_test) # check n_estimators, learning_rate, max_depth, random_state


training model: XGBClassifier
predicting model: XGBClassifier
evaluating model: XGBClassifier
Accuracy: 20.50
Precision: 0.21
Recall: 0.20
F1-score: 0.20
Confusion matrix:
   0  1  2   3  4   5
0  5  8  6  11  5   2
1  4  9  6   6  3   4
2  1  7  6   7  2   2
3  2  8  2   7  6   9
4  5  8  3   4  3   6
5  8  8  8   6  2  11


### Models Comparison

In [310]:
df_metrics = get_metrics_all_models(models, X_train, Y_train, X_test, Y_test)

Model: SVC
Model: RandomForestClassifier
Model: KNeighborsClassifier
Model: DecisionTreeClassifier
Model: GaussianNB
Model: LogisticRegression
Model: GradientBoostingClassifier


  _warn_prf(average, modifier, msg_start, len(result))


Model: AdaBoostClassifier
Model: ExtraTreesClassifier
Model: XGBClassifier


In [332]:
# print the metrics for all the models in a dataframe format excluding the confusion matrix
df_metrics.drop('Confusion matrix', axis=1)



Unnamed: 0,Accuracy,Precision,Recall,F1-score
SVC(kernel='linear'),0.175,0.152964,0.175,0.143108
RandomForestClassifier(n_estimators=1),0.15,0.155626,0.15,0.151507
KNeighborsClassifier(n_neighbors=14),0.21,0.211353,0.21,0.206117
DecisionTreeClassifier(max_depth=3),0.18,0.313467,0.18,0.086912
GaussianNB(),0.155,0.139515,0.155,0.137228
LogisticRegression(),0.175,0.180009,0.175,0.155284
GradientBoostingClassifier(random_state=0),0.175,0.184897,0.175,0.17655
"AdaBoostClassifier(n_estimators=1000, random_state=0)",0.165,0.165197,0.165,0.163117
"ExtraTreesClassifier(n_estimators=1, random_state=0)",0.19,0.199047,0.19,0.18852
"XGBClassifier(base_score=None, booster=None, callbacks=None,\n colsample_bylevel=None, colsample_bynode=None,\n colsample_bytree=None, early_stopping_rounds=None,\n enable_categorical=False, eval_metric=None, feature_types=None,\n gamma=None, gpu_id=None, grow_policy=None, importance_type=None,\n interaction_constraints=None, learning_rate=0.008, max_bin=None,\n max_cat_threshold=None, max_cat_to_onehot=None,\n max_delta_step=None, max_depth=3, max_leaves=None,\n min_child_weight=None, missing=nan, monotone_constraints=None,\n n_estimators=1000, n_jobs=None, num_parallel_tree=None,\n objective='multi:softprob', predictor=None, ...)",0.205,0.210491,0.205,0.203186


In [336]:
# print the confusion matrix for all the models in a table format
for i in range(len(models)):
    print('Model: {}'.format(models[i].__class__.__name__))
    print(df_metrics.iloc[i]['Confusion matrix'])
    print('')

Model: SVC
[[ 1 12  8 10  1  5]
 [ 0 11  5 11  1  4]
 [ 0  8  4  7  1  5]
 [ 0  9  6 13  0  6]
 [ 1 10  3  5  0 10]
 [ 3 12  9 13  0  6]]

Model: RandomForestClassifier
[[ 4  7  9 10  3  4]
 [ 6  4  2  9  4  7]
 [ 5  2  4  7  6  1]
 [ 6  2  6  6  6  8]
 [ 8  5  3  4  4  5]
 [10 11  4  7  3  8]]

Model: KNeighborsClassifier
[[ 4  8  5  9  9  2]
 [ 2  7  7  5  4  7]
 [ 5  6  4  4  2  4]
 [ 6  5  4 14  3  2]
 [ 4  6  6  2  7  4]
 [10 12  5  7  3  6]]

Model: DecisionTreeClassifier
[[ 0 32  0  5  0  0]
 [ 0 31  1  0  0  0]
 [ 0 23  1  1  0  0]
 [ 0 31  0  3  0  0]
 [ 0 28  1  0  0  0]
 [ 0 38  2  2  0  1]]

Model: GaussianNB
[[ 2 16  5  6  2  6]
 [ 0  9  7  9  2  5]
 [ 1  7  4  5  1  7]
 [ 1  9  8 11  0  5]
 [ 2  8  3  6  0 10]
 [ 6 12  8  7  5  5]]

Model: LogisticRegression
[[ 2 11  7  8  3  6]
 [ 0 10  7 10  2  3]
 [ 0  8  4  5  2  6]
 [ 0  6  7 13  1  7]
 [ 1 11  3  4  1  9]
 [ 4 13  8 10  3  5]]

Model: GradientBoostingClassifier
[[ 4  7  6  9  7  4]
 [ 3  7  6 11  1  4]
 [ 4  6  4  7