## Training Stage (Step-by-Step)

In [1]:
import pickle

import numpy as np
import pandas as pd
import sklearn
import sklearn.ensemble
import sklearn.model_selection
import sklearn.neural_network
import sklearn.svm
import sklearn.tree

from model_optimizer import ModelSelector
from trainer import VotingTrainer
from voting import VotingClassifier

*Note*: The ModelSelector class, located in <i>"model_optimizer.py"</i>, is responsible for applying grid/random hyper-parameter search based on the given data and model parameters.

In [None]:
# Read the transformed dataset and convert it to a plain NumPy array.
data = pd.read_csv('data/transform_data.csv').to_numpy()

# The first 50 columns are used as features; the last column holds the labels.
X = data[:, :50]
# The `-1:` slice keeps y two-dimensional, i.e. a column vector of shape (n, 1).
y = data[:, -1:].astype(np.int32)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

Defining grid parameters for hyperparameter search process

### Decision Tree

Checking the shapes of the train/test splits of the transformed data imported from the CSV file

In [26]:
# Show the shape of each split in the order: train features, train labels, test features, test labels.
print(*(split.shape for split in (X_train, y_train, X_test, y_test)))

(15057, 50) (15057, 1) (3765, 50) (3765, 1)


Defining grid parameters for hyperparameter search process

In [27]:
# Base estimator whose hyper-parameters are tuned by the search below.
DT = sklearn.tree.DecisionTreeClassifier(random_state=0)

# Depth candidates: every depth in 2..14, every second depth in 15..29,
# and every fifth depth in 30..95.
dt_depth_candidates = np.concatenate(
    (np.arange(2, 15, 1), np.arange(15, 30, 2), np.arange(30, 100, 5))
)

# Key order is kept as-is because the dictionary is printed below.
# NOTE(review): 'auto' for max_features is deprecated/removed in newer
# scikit-learn releases — confirm the pinned version before re-running.
grid_params = dict(
    criterion=['gini', 'entropy'],
    splitter=['best', 'random'],
    max_depth=dt_depth_candidates,
    min_samples_split=np.arange(2, 10, 1),
    max_features=['auto', 'sqrt', 'log2'],
    random_state=[0],
)
print(grid_params)

{'criterion': ['gini', 'entropy'], 'splitter': ['best', 'random'], 'max_depth': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 17, 19, 21,
       23, 25, 27, 29, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90,
       95]), 'min_samples_split': array([2, 3, 4, 5, 6, 7, 8, 9]), 'max_features': ['auto', 'sqrt', 'log2'], 'random_state': [0]}


In [28]:
# Wrap the decision tree and the training split in the project's ModelSelector helper.
model_selector = ModelSelector(DT, X_train, y_train)
# Run the hyper-parameter search over grid_params with 4 parallel workers.
# NOTE(review): the positional False presumably selects exhaustive grid search
# rather than random search — confirm against model_optimizer.py.
search_cv = model_selector.parameter_search(grid_params, False, n_jobs=4)

Hyper parameter search started
Fitting 5 folds for each of 3360 candidates, totalling 16800 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  48 tasks      | elapsed:    5.6s
[Parallel(n_jobs=4)]: Done 644 tasks      | elapsed:   16.2s
[Parallel(n_jobs=4)]: Done 1644 tasks      | elapsed:   38.5s
[Parallel(n_jobs=4)]: Done 3044 tasks      | elapsed:  1.4min
[Parallel(n_jobs=4)]: Done 4844 tasks      | elapsed:  2.6min
[Parallel(n_jobs=4)]: Done 7044 tasks      | elapsed:  4.1min
[Parallel(n_jobs=4)]: Done 9644 tasks      | elapsed:  5.6min
[Parallel(n_jobs=4)]: Done 10448 tasks      | elapsed:  6.3min
[Parallel(n_jobs=4)]: Done 11298 tasks      | elapsed:  7.3min
[Parallel(n_jobs=4)]: Done 12248 tasks      | elapsed:  8.5min
[Parallel(n_jobs=4)]: Done 13298 tasks      | elapsed: 10.0min
[Parallel(n_jobs=4)]: Done 14448 tasks      | elapsed: 11.6min
[Parallel(n_jobs=4)]: Done 15698 tasks      | elapsed: 13.4min
[Parallel(n_jobs=4)]: Done 16800 out of 16800 | elapsed: 15.0min finished


In [29]:
# Best estimator found by the search and its cross-validated score.
print(model_selector.best_estimator)
print(search_cv.best_score_)

# Classification report on the training split first, then on the held-out test split.
for split_features, split_labels in ((X_train, y_train), (X_test, y_test)):
    print(model_selector.best_report(split_labels, search_cv.best_estimator_.predict(split_features)))

DecisionTreeClassifier(max_depth=8, max_features='auto', min_samples_split=7,
                       random_state=0)
0.42212939452198955
              precision    recall  f1-score   support

           0       0.56      0.51      0.53      4715
           1       0.47      0.71      0.57      5374
           2       0.51      0.28      0.36      4968

    accuracy                           0.50     15057
   macro avg       0.51      0.50      0.49     15057
weighted avg       0.51      0.50      0.49     15057

              precision    recall  f1-score   support

           0       0.50      0.46      0.48      1223
           1       0.41      0.61      0.49      1320
           2       0.37      0.21      0.27      1222

    accuracy                           0.43      3765
   macro avg       0.43      0.43      0.41      3765
weighted avg       0.43      0.43      0.41      3765



In [30]:
# Serialize the best decision tree to disk.
with open('models/DT.pkl', 'wb') as dt_pickle:
    pickle.dump(model_selector.best_estimator, dt_pickle)

### SVM

In [2]:
# Load the transformed dataset as a float array, skipping the CSV header row.
Data = np.loadtxt('data/transform_data.csv', skiprows=1, delimiter=',')

# Every column except the last is a feature; the last column is the integer label (1-D).
X = Data[:, :-1]
y = Data[:, -1].astype('int')

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Trainer built on the training split; it is used again in the VotingClassifier section.
trainer = VotingTrainer(X_train, y_train)

#### *Linear kernel*

In [5]:
# Regularisation strengths to try: 0.1, 1, 10, 100.
c = np.logspace(-1, 2, 4)
grid_params = [{'C': c}]

# Linear-kernel SVM tuned over C only.
svm = sklearn.svm.LinearSVC(random_state=0)
model_selector = ModelSelector(svm, X_train, y_train)
search_cv = model_selector.parameter_search(grid_params, False, n_jobs=6)

# Best estimator, its cross-validated score, and the report on the held-out test split.
print(model_selector.best_estimator)
print(search_cv.best_score_)
print(model_selector.best_report(y_test, search_cv.best_estimator_.predict(X_test)))

# Serialize the best linear SVM to disk.
with open('models/LinearSVM.pkl', 'wb') as linear_svm_pickle:
    pickle.dump(model_selector.best_estimator, linear_svm_pickle)

Hyper parameter search started
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 out of  20 | elapsed:  1.8min finished


LinearSVC(random_state=0)
0.45274670166891384
              precision    recall  f1-score   support

           0       0.47      0.66      0.55      1223
           1       0.48      0.53      0.50      1320
           2       0.47      0.23      0.31      1222

    accuracy                           0.47      3765
   macro avg       0.47      0.47      0.45      3765
weighted avg       0.47      0.47      0.45      3765





#### *Polynomial kernel*
For a reason that could not be identified, the training process gets stuck for days. Therefore, the polynomial SVM has not been trained.

In [None]:
# Regularisation strengths to try: 0.1, 1, 10, 100.
c = np.logspace(-1, 2, 4)
df_shape = ['ovo']

# Degree-2 polynomial kernel; coef0 is searched over 1, 10, 100.
poly_grid = {
    'kernel': ['poly'],
    'C': c,
    'degree': [2],
    'coef0': np.logspace(0, 2, 3),
    'decision_function_shape': df_shape,
}
grid_params = [poly_grid]

svm = sklearn.svm.SVC(random_state=0)
model_selector = ModelSelector(svm, X_train, y_train)
search_cv = model_selector.parameter_search(grid_params, False, n_jobs=6)

Hyper parameter search started
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed: 17.1min


#### *RBF kernel*

In [4]:
# Regularisation strengths to try: 0.1, 1, 10, 100.
c = np.logspace(-1, 2, 4)

# RBF kernel; gamma is searched over 0.01, 0.1, 1, 10.
rbf_grid = {
    'kernel': ['rbf'],
    'gamma': np.logspace(-2, 1, 4),
    'C': c,
}
grid_params = [rbf_grid]

svm = sklearn.svm.SVC(random_state=0)
model_selector = ModelSelector(svm, X_train, y_train)
search_cv = model_selector.parameter_search(grid_params, False, n_jobs=6)

Hyper parameter search started
Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:  8.8min
[Parallel(n_jobs=6)]: Done  80 out of  80 | elapsed: 23.2min finished


In [123]:
# Best estimator found by the search and its cross-validated score.
print(model_selector.best_estimator)
print(search_cv.best_score_)

# Classification report on the held-out test split.
svm_test_pred = search_cv.best_estimator_.predict(X_test)
print(model_selector.best_report(y_test, svm_test_pred))

# Serialize the best SVM to disk.
with open('models/svm.pkl', 'wb') as svm_pickle:
    pickle.dump(model_selector.best_estimator, svm_pickle)

SVC(decision_function_shape='ovo', gamma=0.01, random_state=0)
0.4797109359528563
              precision    recall  f1-score   support

           0       0.63      0.53      0.58      1223
           1       0.46      0.65      0.54      1320
           2       0.45      0.32      0.38      1222

    accuracy                           0.51      3765
   macro avg       0.51      0.50      0.50      3765
weighted avg       0.51      0.51      0.50      3765



### Random Forest

In [9]:
# Base estimator for the search. The class is referenced through the
# sklearn.ensemble module imported at the top of the notebook; the bare name
# RandomForestClassifier is never imported, so it raises NameError on a fresh kernel.
random_forest = sklearn.ensemble.RandomForestClassifier(random_state=0)

# Five evenly spaced forest sizes between 10 and 20 (truncated to int).
n_estimators = [int(x) for x in np.linspace(start=10, stop=20, num=5)]
# NOTE(review): 'auto' for max_features is deprecated/removed in newer
# scikit-learn releases — confirm the pinned version before re-running.
max_features = ['auto', 'sqrt', 'log2']
# Ten evenly spaced depth limits between 2 and 30 (truncated to int).
max_depth = [int(x) for x in np.linspace(2, 30, num=10)]
min_samples_split = [2, 5]
bootstrap = [True, False]
criterion = ['gini', 'entropy']

grid_params = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'bootstrap': bootstrap,
               'criterion': criterion}
# Display the grid as the cell output.
grid_params

{'n_estimators': [10, 12, 15, 17, 20],
 'max_features': ['auto', 'sqrt', 'log2'],
 'max_depth': [2, 5, 8, 11, 14, 17, 20, 23, 26, 30],
 'min_samples_split': [2, 5],
 'bootstrap': [True, False],
 'criterion': ['gini', 'entropy']}

In [10]:
# Wrap the random forest and the training split in the project's ModelSelector helper.
model_selector = ModelSelector(random_forest,X_train,y_train)
# Run the hyper-parameter search over grid_params with 10 parallel workers.
# NOTE(review): the positional False presumably selects exhaustive grid search
# rather than random search — confirm against model_optimizer.py.
search_cv = model_selector.parameter_search(grid_params,False,n_jobs=10)

Hyper parameter search started
Fitting 5 folds for each of 1200 candidates, totalling 6000 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    6.4s
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:   11.8s
[Parallel(n_jobs=10)]: Done 430 tasks      | elapsed:   30.8s
[Parallel(n_jobs=10)]: Done 780 tasks      | elapsed:  1.4min
[Parallel(n_jobs=10)]: Done 1230 tasks      | elapsed:  2.8min
[Parallel(n_jobs=10)]: Done 1780 tasks      | elapsed:  4.2min
[Parallel(n_jobs=10)]: Done 2430 tasks      | elapsed:  8.0min
[Parallel(n_jobs=10)]: Done 3180 tasks      | elapsed: 12.1min
[Parallel(n_jobs=10)]: Done 4030 tasks      | elapsed: 15.6min
[Parallel(n_jobs=10)]: Done 4980 tasks      | elapsed: 20.4min
[Parallel(n_jobs=10)]: Done 6000 out of 6000 | elapsed: 32.4min finished


In [8]:
# Best estimator found by the search and its cross-validated score.
print(model_selector.best_estimator)
print(search_cv.best_score_)

# Classification report on the training split first, then on the held-out test split.
for split_features, split_labels in ((X_train, y_train), (X_test, y_test)):
    print(model_selector.best_report(split_labels, search_cv.best_estimator_.predict(split_features)))

RandomForestClassifier(bootstrap=False, criterion='entropy', max_depth=14,
                       max_features='log2', n_estimators=20, random_state=0)
0.4558673310742417
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      4715
           1       0.99      0.99      0.99      5374
           2       0.99      0.99      0.99      4968

    accuracy                           0.99     15057
   macro avg       0.99      0.99      0.99     15057
weighted avg       0.99      0.99      0.99     15057

              precision    recall  f1-score   support

           0       0.64      0.43      0.51      1223
           1       0.43      0.55      0.48      1320
           2       0.39      0.40      0.39      1222

    accuracy                           0.46      3765
   macro avg       0.49      0.46      0.46      3765
weighted avg       0.48      0.46      0.46      3765



In [9]:
# Serialize the best random forest to disk.
with open('models/RF.pkl', 'wb') as rf_pickle:
    pickle.dump(model_selector.best_estimator, rf_pickle)

### Adaboost 

In [2]:
# Base estimator whose hyper-parameters are tuned by the search below.
adaboost_clf = sklearn.ensemble.AdaBoostClassifier(random_state=0)

# Number of boosting rounds to try: 500, 600, ..., 1000.
estimators_range = np.arange(500, 1001, 100)
# NOTE(review): lr_range is defined but not used in the search below;
# the learning rate is fixed to 0.01.
lr_range = np.logspace(-2, 2, 4)
algorithms = ['SAMME', 'SAMME.R']

# Weak learners: decision trees with depth 1, 2 and 3.
base_estimator = [
    sklearn.tree.DecisionTreeClassifier(max_depth=depth, random_state=0)
    for depth in (1, 2, 3)
]

dist_params = dict(
    n_estimators=estimators_range,
    learning_rate=[0.01],
    algorithm=algorithms,
    base_estimator=base_estimator,
)

# n_jobs=-1 is passed straight through to the search.
model_selector = ModelSelector(adaboost_clf, X_train, y_train)
search_cv = model_selector.parameter_search(dist_params, False, n_jobs=-1)

Hyper parameter search started
Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 14.1min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 107.1min finished


In [6]:
# Best estimator found by the search and its cross-validated score.
print(model_selector.best_estimator)
print(search_cv.best_score_)

# Classification report on the training split.
adaboost_train_pred = search_cv.best_estimator_.predict(X_train)
print(model_selector.best_report(y_train, adaboost_train_pred))

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=3,
                                                         random_state=0),
                   learning_rate=0.01, n_estimators=900, random_state=0)
0.4664948530906816
              precision    recall  f1-score   support

           0       0.67      0.53      0.59      4715
           1       0.51      0.69      0.59      5374
           2       0.49      0.40      0.44      4968

    accuracy                           0.54     15057
   macro avg       0.56      0.54      0.54     15057
weighted avg       0.55      0.54      0.54     15057



In [10]:
# Classification report on the held-out test split.
adaboost_test_pred = search_cv.best_estimator_.predict(X_test)
print(model_selector.best_report(y_test, adaboost_test_pred))

              precision    recall  f1-score   support

           0       0.60      0.46      0.52      1223
           1       0.46      0.62      0.53      1320
           2       0.42      0.35      0.38      1222

    accuracy                           0.48      3765
   macro avg       0.49      0.48      0.48      3765
weighted avg       0.49      0.48      0.48      3765



In [7]:
# Serialize the best AdaBoost model to disk.
with open('models/adaboost.pkl', 'wb') as adaboost_pickle:
    pickle.dump(model_selector.best_estimator, adaboost_pickle)

### MLP Classifier

In [5]:
# Load the pre-split training and test sets, skipping the CSV header row.
train_data = np.loadtxt('training.csv',skiprows=1,delimiter=',')
test_data = np.loadtxt('test.csv',skiprows=1,delimiter=',')

# Every column except the last is a feature; the last column is the integer label.
X_train,y_train = train_data[:,:-1],train_data[:,-1].astype('int')
# Fixed typo: the test features were previously bound to `X_tset`, so the
# report cell silently reused a stale `X_test` from an earlier cell.
X_test,y_test = test_data[:,:-1],test_data[:,-1].astype('int')

# Base estimator whose hyper-parameters are tuned by the search below.
mlp_clf3 = sklearn.neural_network.MLPClassifier(random_state=0)

# Candidate architectures and optimiser settings:
# 8 * 3 * 3 * 2 * 2 * 2 = 576 combinations.
dist_params3 = {
    'hidden_layer_sizes': [(25,12,6),(25,10),(25,),(60,30,10),
                            (50,50,25,25),(25,10,5),(50,10,3),(40,25,10,6)],
    'activation': ['tanh', 'relu','logistic'],
    'solver': ['sgd', 'adam','lbfgs'],
    'alpha': [0.0001, 0.001],
    'learning_rate': ['constant','adaptive'],
    'max_iter':[500,1000]
            }

# NOTE(review): the positional False presumably selects exhaustive grid search
# rather than random search — confirm against model_optimizer.py.
model_selector_3 = ModelSelector(mlp_clf3,X_train,y_train)
search_cv_3 = model_selector_3.parameter_search(dist_params3,False,n_jobs=-1)

Hyper parameter search started
Fitting 5 folds for each of 576 candidates, totalling 2880 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  9.3min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed: 31.0min
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed: 57.9min
[Parallel(n_jobs=-1)]: Done 1226 tasks      | elapsed: 87.5min
[Parallel(n_jobs=-1)]: Done 1776 tasks      | elapsed: 125.0min
[Parallel(n_jobs=-1)]: Done 2426 tasks      | elapsed: 153.8min
[Parallel(n_jobs=-1)]: Done 2880 out of 2880 | elapsed: 171.1min finished


In [9]:
# Classification report on the training split, then on the held-out test split.
print(model_selector_3.best_report(y_train,model_selector_3.best_estimator.predict(X_train)))
print(model_selector_3.best_report(y_test,model_selector_3.best_estimator.predict(X_test)))

# Persist the tuned MLP like every other model section does. The
# VotingClassifier section loads 'models/MLPClassifier.pkl', which was
# otherwise never written by this notebook.
with open('models/MLPClassifier.pkl','wb') as file:
    pickle.dump(model_selector_3.best_estimator, file)

              precision    recall  f1-score   support

           0       0.59      0.56      0.57      4715
           1       0.52      0.58      0.55      5374
           2       0.46      0.41      0.44      4968

    accuracy                           0.52     15057
   macro avg       0.52      0.52      0.52     15057
weighted avg       0.52      0.52      0.52     15057

              precision    recall  f1-score   support

           0       0.57      0.55      0.56      1223
           1       0.50      0.56      0.52      1320
           2       0.42      0.38      0.40      1222

    accuracy                           0.50      3765
   macro avg       0.50      0.50      0.50      3765
weighted avg       0.50      0.50      0.50      3765



### VotingClassifier

In [3]:
# Pickled estimators to reload, listed in the order they are read from disk.
# NOTE: pickle.load executes arbitrary code — only load files produced by this project.
pickle_paths = (
    ('adaboost', 'models/adaboost.pkl'),
    ('dt', 'models/DT.pkl'),
    ('mlp', 'models/MLPClassifier.pkl'),
    ('svm', 'models/svm.pkl'),
)
fitted = {}
for label, path in pickle_paths:
    with open(path, 'rb') as handle:
        fitted[label] = pickle.load(handle)

# Keep the individual estimators available under their usual names.
adaboost_clf = fitted['adaboost']
dt_clf = fitted['dt']
mlp_clf = fitted['mlp']
svm_clf = fitted['svm']

# (name, estimator) pairs handed to the voting trainer, in this fixed order.
models = [(label, fitted[label]) for label in ('adaboost', 'mlp', 'dt', 'svm')]

# NOTE(review): the meaning of the positional arguments 100 and 1500 is
# defined in trainer.py — confirm before changing them.
trainer.train_vote_clf(models, 100, 1500)

# Training report, then the report on the held-out test split; the test labels
# are mapped back through the trainer's `le` attribute via inverse_transform.
print(trainer.get_report())
print(trainer.get_test_report(X_test, trainer.le.inverse_transform(y_test)))

Making random weight matrix
              precision    recall  f1-score   support

       brand       0.76      0.75      0.75      4715
      female       0.71      0.83      0.76      5374
        male       0.81      0.66      0.73      4968

    accuracy                           0.75     15057
   macro avg       0.76      0.75      0.75     15057
weighted avg       0.76      0.75      0.75     15057

              precision    recall  f1-score   support

       brand       0.63      0.53      0.58      1223
      female       0.46      0.65      0.54      1320
        male       0.45      0.32      0.38      1222

    accuracy                           0.51      3765
   macro avg       0.51      0.50      0.50      3765
weighted avg       0.51      0.51      0.50      3765

