# Import libraries

In [1]:
import warnings
warnings.filterwarnings( 'ignore' )

import os
import pickle
import joblib
import numpy    as np
from sklearn import metrics

# Data

In [2]:
# Load data
# Open the archive once and close it deterministically (np.load on an .npz
# returns a file-backed object that should be closed after use).
# NOTE(review): allow_pickle=True lets NumPy unpickle object arrays, which can
# execute arbitrary code -- only load archives from a trusted source.
with np.load('Data/data.npz', allow_pickle=True) as data:
    X = data['X']
    y = data['y']

# Label encoding for target class
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

# Persist the fitted encoder. Create the output directory if it is missing and
# use a context manager so the file handle is flushed and closed.
os.makedirs('Model', exist_ok=True)
with open('Model/Label_encoder.pkl', 'wb') as f:
    pickle.dump(le, f)

## Prediction model

In [3]:
import xgboost

# Parameters
# NOTE(review): 'max_leaves': 2 next to 'max_depth': 10 looks contradictory --
# confirm that capping the number of leaves this low is intended.
params = {
    'n_estimators'      : 500,
    'learning_rate'     : 1e-1,
    'max_depth'         : 10,
    'reg_alpha'         : 20,
    'reg_lambda'        : 1.0,
    'gamma'             : 1,
    'min_child_weight'  : 2,
    'max_leaves'        : 2,
    'eval_metric'       : ['auc', 'aucpr', 'mlogloss']
}


# Setup model
model = xgboost.XGBClassifier(objective           = 'multi:softmax',
                              n_jobs              = -1,
                              validate_parameters = True,
                              verbosity           = 1,
                              tree_method         = 'hist',
                              **params)

# Train model
# Inverse-frequency weighting: every sample of class c receives
# n_samples / count(c). A single np.unique call yields the per-class counts
# and, for each sample, the position of its class in the sorted unique labels,
# so the weights are built without Python-level loops and do not rely on the
# labels being contiguous integers starting at 0.
_, class_index, class_count = np.unique(y, return_inverse=True, return_counts=True)
weights = y.shape[0] / class_count
sample_weight = weights[class_index]

# The evaluation set is the training data itself, so the metrics logged every
# 50 rounds are training metrics, not a held-out estimate.
model.fit(X, y,
          eval_set = [ (X, y) ],
          sample_weight = sample_weight,
          verbose = 50);

[0]	validation_0-auc:0.58062	validation_0-aucpr:0.16563	validation_0-mlogloss:2.00307
[50]	validation_0-auc:0.72916	validation_0-aucpr:0.61725	validation_0-mlogloss:1.00277
[100]	validation_0-auc:0.76837	validation_0-aucpr:0.67483	validation_0-mlogloss:0.82584
[150]	validation_0-auc:0.78600	validation_0-aucpr:0.70294	validation_0-mlogloss:0.74900
[200]	validation_0-auc:0.79545	validation_0-aucpr:0.71742	validation_0-mlogloss:0.70725
[250]	validation_0-auc:0.80221	validation_0-aucpr:0.72691	validation_0-mlogloss:0.67992
[300]	validation_0-auc:0.80782	validation_0-aucpr:0.73465	validation_0-mlogloss:0.65898
[350]	validation_0-auc:0.81204	validation_0-aucpr:0.74021	validation_0-mlogloss:0.64326
[400]	validation_0-auc:0.81569	validation_0-aucpr:0.74518	validation_0-mlogloss:0.63078
[450]	validation_0-auc:0.81880	validation_0-aucpr:0.75021	validation_0-mlogloss:0.62034
[499]	validation_0-auc:0.82136	validation_0-aucpr:0.75340	validation_0-mlogloss:0.61210


In [4]:
# Predict on the same samples the model was fitted on
pred = model.predict(X)

# Score the predictions: confusion matrix, macro-averaged precision/recall,
# and accuracy expressed as a percentage
CM = metrics.confusion_matrix(y, pred)
Precision = metrics.precision_score(y, pred, average='macro')
Recall = metrics.recall_score(y, pred, average='macro')
Accuracy = 100 * metrics.accuracy_score(y, pred)

# Report the scalar metrics, then show the confusion matrix as the cell output
print(f'Accuracy: {Accuracy:.3f}%',
      f'Precision: {Precision:.3f}',
      f'Recall: {Recall:.3f}',
      sep='\n')
CM

Accuracy: 74.363%
Precision: 0.645
Recall: 0.832


array([[   641,    239,      7,      4,     83,      0,      0,      0],
       [   208,    593,      3,      2,     61,      0,      0,      0],
       [   267,    691, 149448,  40243,   2176,      2,      7,      3],
       [   146,    215,  50096, 128516,   2021,      0,     21,      5],
       [     6,     12,      1,      6,    119,      0,      0,      0],
       [     0,      0,      0,      0,      0,     14,      0,      0],
       [     0,      0,      0,      0,      0,      0,    408,      0],
       [     0,      0,      0,      0,      0,      0,      0,    236]],
      dtype=int64)

### Model 1: Random forest (max_depth=5)

In [5]:
from sklearn.ensemble import RandomForestClassifier

# Shallow random forest with class-balanced weights.
# random_state is fixed so that the fitted forest -- and therefore the metrics
# printed below -- are the same on every Restart & Run All (the value itself
# is arbitrary).
model = RandomForestClassifier(n_estimators=300,
                               n_jobs=-1,
                               max_depth=5,
                               class_weight='balanced',
                               random_state=42)

model.fit(X, y)

# Get predictions (on the same data the model was fitted on)
pred = model.predict(X)

# Model Evaluation: accuracy as a percentage, macro-averaged recall/precision
Accuracy = 100*metrics.accuracy_score(y, pred)
Recall = metrics.recall_score(y, pred, average='macro')
Precision = metrics.precision_score(y, pred, average='macro')
CM = metrics.confusion_matrix(y, pred)

print(f'Accuracy: {Accuracy:.3f}%')
print(f'Precision: {Precision:.3f}')
print(f'Recall: {Recall:.3f}')
CM

Accuracy: 73.216%
Precision: 0.579
Recall: 0.816


array([[   687,    244,     15,      7,     18,      0,      3,      0],
       [   181,    651,      9,      5,     21,      0,      0,      0],
       [   455,    625, 152688,  37594,   1282,      0,    151,     42],
       [   298,    185,  58569, 120889,    844,      0,    217,     18],
       [    14,     22,      7,      7,     94,      0,      0,      0],
       [     0,      0,      0,      0,      0,     14,      0,      0],
       [     0,      0,      0,      0,      0,      0,    408,      0],
       [     0,      0,      2,      0,      0,      0,      7,    227]],
      dtype=int64)

### Model 2: Random forest (max_depth=15)

In [6]:
from sklearn.ensemble import RandomForestClassifier

# Deeper random forest with class-balanced weights.
# random_state is fixed so that the fitted forest is the same on every
# Restart & Run All (the value itself is arbitrary).
model = RandomForestClassifier(n_estimators=300,
                               n_jobs=-1,
                               max_depth=15,
                               class_weight='balanced',
                               random_state=42)

model.fit(X, y)

In [7]:
# Predict on the same samples the model was fitted on
pred = model.predict(X)

# Score the predictions: confusion matrix, macro-averaged precision/recall,
# and accuracy expressed as a percentage
CM = metrics.confusion_matrix(y, pred)
Precision = metrics.precision_score(y, pred, average='macro')
Recall = metrics.recall_score(y, pred, average='macro')
Accuracy = 100 * metrics.accuracy_score(y, pred)

# Report the scalar metrics, then show the confusion matrix as the cell output
print(f'Accuracy: {Accuracy:.3f}%',
      f'Precision: {Precision:.3f}',
      f'Recall: {Recall:.3f}',
      sep='\n')
CM

Accuracy: 82.778%
Precision: 0.957
Recall: 0.956


array([[   974,      0,      0,      0,      0,      0,      0,      0],
       [     0,    866,      1,      0,      0,      0,      0,      0],
       [     0,      0, 168184,  24653,      0,      0,      0,      0],
       [     0,      0,  40188, 140832,      0,      0,      0,      0],
       [     0,      0,      0,      0,    144,      0,      0,      0],
       [     0,      0,      0,      0,      0,     14,      0,      0],
       [     0,      0,      0,      0,      0,      0,    408,      0],
       [     0,      0,      0,      0,      0,      0,      0,    236]],
      dtype=int64)

In [8]:
# Serialize whichever estimator `model` currently refers to, using
# compression level 3
joblib.dump(value=model, filename="Model/model.joblib", compress=3)