## `Imports`

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle

from imblearn.over_sampling import SMOTE
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.linear_model import LassoCV, LogisticRegression, RidgeCV
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder, StandardScaler
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.math import confusion_matrix
from xgboost import XGBClassifier

In [2]:
# setting a random seed
np.random.seed(24)

## `DataFrame`

In [3]:
X_train = pd.read_csv('../data/X_train_b_cleaned02.csv')
X_test = pd.read_csv('../data/X_test_b_cleaned02.csv')
y_train = np.load('../data/y_train.npy')
y_test = np.load('../data/y_test.npy')

In [4]:
print(f'Number of features: {len(X_train.columns)}')

Number of features: 534


## `Modeling`

In [5]:
mo_list = []
train_scores = []
test_scores = []

#### RFC

In [6]:
pipe = Pipeline([
    ('ss', StandardScaler()),
    ('rfc', RandomForestClassifier(random_state=24))
])

pipe.fit(X_train, y_train);

In [7]:
mo = 'RFC'
tr_s = pipe.score(X_train, y_train)
te_s = pipe.score(X_test, y_test)

In [8]:
mo_list.append(mo)
train_scores.append(tr_s)
test_scores.append(te_s)

print((tr_s, te_s))

(1.0, 0.8813821272837666)


In [376]:
with open('../models/rfc.pkl', 'wb') as pickle_out:
    pickle.dump(pipe, pickle_out)

#### ETC

In [9]:
pipe = Pipeline([
    ('ss', StandardScaler()),
    ('etc', ExtraTreesClassifier(random_state=24))
])

pipe.fit(X_train, y_train);

In [10]:
mo = 'ETC'
tr_s = pipe.score(X_train, y_train)
te_s = pipe.score(X_test, y_test)

In [11]:
mo_list.append(mo)
train_scores.append(tr_s)
test_scores.append(te_s)

print((tr_s, te_s))

(1.0, 0.8803333885301099)


#### GBC

In [15]:
pipe = Pipeline([
    ('ss', StandardScaler()),
    ('gbc', GradientBoostingClassifier())
])

pipe.fit(X_train, y_train);

In [16]:
mo = 'GBC'
tr_s = pipe.score(X_train, y_train)
te_s = pipe.score(X_test, y_test)

In [17]:
mo_list.append(mo)
train_scores.append(tr_s)
test_scores.append(te_s)

print((tr_s, te_s))

(0.8925443438580997, 0.8748137108792846)


#### ABC

In [12]:
pipe = Pipeline([
    ('ss', StandardScaler()),
    ('abc', AdaBoostClassifier())
])

pipe.fit(X_train, y_train);

In [13]:
mo = 'ABC'
tr_s = pipe.score(X_train, y_train)
te_s = pipe.score(X_test, y_test)

In [14]:
mo_list.append(mo)
train_scores.append(tr_s)
test_scores.append(te_s)

print((tr_s, te_s))

(0.8807131817178185, 0.8814925208367831)


#### Log Reg (Ridge)

In [18]:
pipe = Pipeline([
    ('ss', StandardScaler()),
    ('logreg', LogisticRegression(max_iter=4000, random_state=24))
])

pipe.fit(X_train, y_train);

In [19]:
mo = 'Log Reg'
tr_s = pipe.score(X_train, y_train)
te_s = pipe.score(X_test, y_test)

In [20]:
mo_list.append(mo)
train_scores.append(tr_s)
test_scores.append(te_s)

print((tr_s, te_s))

(0.8822771767130345, 0.8807197659656676)


#### Log Reg (Lasso) 

In [None]:
# pipe = Pipeline([
#     ('ss', StandardScaler()),
#     ('logreg', LogisticRegression(penalty='l1', max_iter=4000, random_state=24, solver='liblinear'))
# ])

# pipe.fit(X_train, y_train)
# pipe.score(X_train, y_train), pipe.score(X_test, y_test)

* Takes too long to run

#### XGBC

In [21]:
pipe = Pipeline([
    ('ss', StandardScaler()),
    ('xbg', XGBClassifier(use_label_encoder=False))
])

pipe.fit(X_train, y_train);



In [22]:
mo = 'XBGC'
tr_s = pipe.score(X_train, y_train)
te_s = pipe.score(X_test, y_test)

In [23]:
mo_list.append(mo)
train_scores.append(tr_s)
test_scores.append(te_s)

print((tr_s, te_s))

(0.9024067122985207, 0.8802781917536016)


#### KNN

In [24]:
pipe = Pipeline([
    ('ss', StandardScaler()),
    ('knc', KNeighborsClassifier())
])

pipe.fit(X_train, y_train);

In [25]:
mo = 'KNN'
tr_s = pipe.score(X_train, y_train)
te_s = pipe.score(X_test, y_test)

In [26]:
mo_list.append(mo)
train_scores.append(tr_s)
test_scores.append(te_s)

print((tr_s, te_s))

(0.8818723780083904, 0.8619528619528619)


#### NN

In [27]:
from tensorflow.random import set_seed
set_seed(24)

In [28]:
# scale, fit & transform data
ss = StandardScaler()

X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

In [29]:
model = Sequential()

model.add(Dense(units=400, input_shape=X_train_sc[0].shape, activation='relu'))
model.add(Dropout(.2))
model.add(Dense(128, activation='relu'))
model.add(Dropout(.25))
model.add(Dense(64, activation='relu'))
model.add(Dropout(.2))
model.add(Dense(units=12, activation='softmax'))

early_stop = EarlyStopping(monitor='val_loss', patience=5, verbose=1)

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['acc'])

In [30]:
history = model.fit(X_train_sc, y_train, validation_data=(X_test_sc, y_test), epochs=40, batch_size=32, callbacks=[early_stop])

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 00010: early stopping


In [31]:
mo = 'NN'
tr_s = .8812
te_s = .8815

In [32]:
mo_list.append(mo)
train_scores.append(tr_s)
test_scores.append(te_s)

print((tr_s, te_s))

(0.8812, 0.8815)


In [33]:
model_df = pd.DataFrame()
model_df['Model'] = mo_list
model_df['Train'] = train_scores
model_df['Test'] = test_scores

In [39]:
model_df.sort_values(by='Test', ascending=False)

Unnamed: 0,Model,Train,Test
3,ABC,0.881707,0.881658
7,NN,0.8812,0.8815
0,RFC,0.999982,0.881493
1,ETC,1.0,0.880775
5,XBGC,0.895709,0.880775
2,GBC,0.893336,0.875476
4,Log Reg,0.886951,0.871888
6,KNN,0.882204,0.864878
