# Import libraries

In [8]:
import warnings
warnings.filterwarnings( 'ignore' )

import os
import pickle
import numpy    as np
from sklearn import metrics

# Data

In [9]:
# Load data: read the feature matrix and the raw targets from a single open of the
# archive; the context manager closes the underlying file once both arrays are read.
# NOTE(review): allow_pickle=True unpickles objects while loading, so Data/data.npz
# must come from a trusted source.
with np.load('Data/data.npz', allow_pickle=True) as data:
    X = data['X']
    y = data['y']

# Label encoding for target class: replace the raw labels with integer codes
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

# Persist the fitted encoder under Model/. Create the directory if it is missing and
# use a context manager so the file is flushed and closed deterministically.
os.makedirs('Model', exist_ok=True)
with open('Model/Label_encoder.pkl', 'wb') as encoder_file:
    pickle.dump(le, encoder_file)

### Model 1: Random Forest

In [10]:
from sklearn.ensemble import RandomForestClassifier

# Depth-limited random forest with class_weight='balanced' to reweight classes by
# their frequency in y.
# random_state pins the bootstrap samples and per-split feature draws, so a
# Restart & Run All rebuilds the same forest and reproduces the same scores.
model = RandomForestClassifier(n_estimators=300,
                               n_jobs=-1,
                               max_depth=5,
                               class_weight='balanced',
                               random_state=42)

model.fit(X, y)

In [11]:
# Predict on X, i.e. the same samples the model was fitted on (training-set scores)
pred = model.predict(X)

# Confusion matrix, accuracy as a percentage, and macro-averaged precision / recall
CM = metrics.confusion_matrix(y, pred)
Accuracy = metrics.accuracy_score(y, pred) * 100
Precision, Recall = (
    score(y, pred, average='macro')
    for score in (metrics.precision_score, metrics.recall_score)
)

# Print the scalar scores one per line, then leave the matrix as the cell's output
print(
    f'Accuracy: {Accuracy:.3f}%',
    f'Precision: {Precision:.3f}',
    f'Recall: {Recall:.3f}',
    sep='\n',
)
CM

Accuracy: 85.366%
Precision: 0.693
Recall: 0.858


array([[  1078,    607,    170,    104,     89,      0,      0,      0],
       [  3290,  13181,   1731,   1021,    213,      0,      1,      1],
       [  1488,   1507, 256234,  26871,      4,      0,     14,      0],
       [  2015,   1900,  47658, 246271,     12,      1,     64,      1],
       [     0,      0,      2,      1,     97,      0,      0,      0],
       [     0,      0,      0,      0,      0,     15,      0,      0],
       [     0,      0,      0,      0,      0,      0,    641,      0],
       [     1,      0,      2,      2,      0,      3,      2,    325]],
      dtype=int64)

### Model 2: XGBoost

In [12]:
import xgboost

# Parameters
# NOTE(review): max_leaves=2 is set together with max_depth=10 -- confirm which of the
# two is meant to bound the tree size.
params = {
    'n_estimators'      : 500,
    'learning_rate'     : 1e-1,
    'max_depth'         : 10,
    'reg_alpha'         : 20,
    'reg_lambda'        : 1.0,
    'gamma'             : 1,
    'min_child_weight'  : 2,
    'max_leaves'        : 2,
    'eval_metric'       : ['auc', 'aucpr', 'mlogloss']
}


# Setup model
model = xgboost.XGBClassifier(objective           = 'multi:softmax',
                              n_jobs              = -1,
                              validate_parameters = True,
                              verbosity           = 1,
                              tree_method         = 'hist',
                              **params)

# Per-class weight = n_samples / class_count, so rarer classes weigh more.
# np.unique also returns, for every sample, the position of its class among the sorted
# unique labels; indexing the class weights with it yields the per-sample weights in
# one vectorised step (no Python-level loop over y) and does not rely on the labels
# being exactly 0..K-1.
classes, class_index, class_counts = np.unique(y, return_inverse=True, return_counts=True)
weights = y.shape[0] / class_counts

# Train model
# The eval_set is the training data itself, so the logged 'validation_0' metrics are
# training metrics, not held-out ones.
model.fit(X, y,
          eval_set = [ (X, y) ],
          sample_weight = weights[class_index],
          verbose = 50);

[0]	validation_0-auc:0.65337	validation_0-aucpr:0.17928	validation_0-mlogloss:2.02775
[50]	validation_0-auc:0.91622	validation_0-aucpr:0.67403	validation_0-mlogloss:1.05192
[100]	validation_0-auc:0.94565	validation_0-aucpr:0.71602	validation_0-mlogloss:0.77394
[150]	validation_0-auc:0.95853	validation_0-aucpr:0.73790	validation_0-mlogloss:0.63872
[200]	validation_0-auc:0.96664	validation_0-aucpr:0.75317	validation_0-mlogloss:0.55205
[250]	validation_0-auc:0.97190	validation_0-aucpr:0.76523	validation_0-mlogloss:0.49008
[300]	validation_0-auc:0.97536	validation_0-aucpr:0.77412	validation_0-mlogloss:0.44422
[350]	validation_0-auc:0.97794	validation_0-aucpr:0.78069	validation_0-mlogloss:0.40838
[400]	validation_0-auc:0.97985	validation_0-aucpr:0.78740	validation_0-mlogloss:0.37909
[450]	validation_0-auc:0.98148	validation_0-aucpr:0.79330	validation_0-mlogloss:0.35497
[499]	validation_0-auc:0.98272	validation_0-aucpr:0.79853	validation_0-mlogloss:0.33501


In [14]:
# Predict on X, i.e. the same samples the model was fitted on (training-set scores)
pred = model.predict(X)

# Confusion matrix, accuracy as a percentage, and macro-averaged precision / recall
CM = metrics.confusion_matrix(y, pred)
Accuracy = metrics.accuracy_score(y, pred) * 100
Precision, Recall = (
    score(y, pred, average='macro')
    for score in (metrics.precision_score, metrics.recall_score)
)

# Print the scalar scores one per line, then leave the matrix as the cell's output
print(
    f'Accuracy: {Accuracy:.3f}%',
    f'Precision: {Precision:.3f}',
    f'Recall: {Recall:.3f}',
    sep='\n',
)
CM

Accuracy: 92.400%
Precision: 0.716
Recall: 0.898


array([[  1277,    514,     22,     18,    217,      0,      0,      0],
       [  3897,  14124,    261,    166,    990,      0,      0,      0],
       [   353,    404, 266187,  18674,    496,      1,      2,      1],
       [   427,    793,  17070, 277839,   1779,      4,      6,      4],
       [     1,      0,      0,      2,     97,      0,      0,      0],
       [     0,      0,      0,      0,      0,     15,      0,      0],
       [     0,      0,      0,      0,      0,      0,    641,      0],
       [     0,      0,      0,      0,      0,      0,      0,    335]],
      dtype=int64)

In [13]:
# Serialize the trained model. The context manager flushes and closes the file even
# if pickling raises, instead of leaving the open handle to the garbage collector.
with open('Model/model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)