## UFC Fight-level dataset SVM, SGD & NN Notebook
(there is no quick rule as to which kernel performs best in every scenario; testing & learning is key)

Kernel trick reference:
https://towardsdatascience.com/understanding-support-vector-machine-part-2-kernel-trick-mercers-theorem-e1e6848c6c4d

#### Import necessary modules

In [419]:
import os
import sys
import pandas as pd
import numpy as np
import pickle
import tensorflow as tf
from tensorflow import keras
from sklearn.metrics import classification_report, confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from keras.layers import Input, Lambda, Dense, Dropout
from keras.models import Model, Sequential
from keras.optimizers import Adam
from keras.wrappers.scikit_learn import KerasClassifier
import matplotlib.pyplot as plt
%matplotlib inline

#### Helper functions

In [420]:
def svc_parameter_optimization(X, y, nfolds):
    """Grid-search ``C`` and ``gamma`` for an RBF-kernel SVC and return the best pair.

    Parameters
    ----------
    X : feature matrix, passed unchanged to ``GridSearchCV.fit``.
    y : target labels, passed unchanged to ``GridSearchCV.fit``.
    nfolds : forwarded to ``GridSearchCV`` as its ``cv`` argument.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search, i.e. the winning ``{'C': ..., 'gamma': ...}``.
    """
    # C trades off low training error against the ability to generalize
    Cs = [0.001, 0.01, 0.1, 1, 10]
    # gamma is the free parameter of the Gaussian radial basis function
    gammas = [0.001, 0.01, 0.1, 1]
    param_grid = {'C': Cs, 'gamma': gammas}

    # exhaustive search over every (C, gamma) combination
    grid_search = GridSearchCV(SVC(kernel='rbf'), param_grid, cv=nfolds)
    grid_search.fit(X, y)
    return grid_search.best_params_

#### Set working directory

In [421]:
# Make the helper-script folder the working directory; later cells open files relative to it.
# NOTE(review): absolute, machine-specific path -- the notebook will not run elsewhere without editing it.
os.chdir(r'/Users/colella2/Google Drive/Graduate School/MScA/Courses/31008 Data Mining Principles/Final_Project/msca31008/fun/')

In [422]:
# Show the current working directory. Plain Python replaces the IPython automagic `pwd`,
# so the cell also works when automagic is off or the notebook is exported as a script.
os.getcwd()

'/Users/colella2/Google Drive/Graduate School/MScA/Courses/31008 Data Mining Principles/Final_Project/msca31008/fun'

In [423]:
# The working directory is already .../fun, so '../fun/' resolves to that same folder.
os.listdir('../fun/') # confirm items in function folder

['.DS_Store',
 'r.py',
 'tonum.py',
 'hierarchical_imputation.py',
 'fillna-kmeans.py',
 'settings.py',
 'pkl.py',
 'spl.py',
 'untitled',
 'ddict.py']

In [424]:
# Smoke test: run r.py (resolved against the working directory) inside the notebook namespace.
# NOTE(review): the handle returned by open() is never explicitly closed.
exec(open('r.py').read()) # test ability to read .py script from function folder

In [425]:
# Read in all the helper functions: execute every .py script in the function folder
# so that its definitions land in the notebook namespace.
fun_dir = '../fun/'
for filename in os.listdir(fun_dir):
    if not filename.endswith('.py'):
        continue  # skip non-script entries such as .DS_Store
    # Open the script in the folder that was listed, not in whatever the cwd happens to be,
    # and close the handle as soon as the source has been read.
    with open(os.path.join(fun_dir, filename)) as script:
        exec(script.read())

#### Open file of interest

In [426]:
# Switch the working directory to the output folder.
# NOTE(review): absolute, machine-specific path.
os.chdir(r'/Users/colella2/Google Drive/Graduate School/MScA/Courses/31008 Data Mining Principles/Final_Project/msca31008/out')

# Unpickle the fight-level dataset into `data` (a dict -- it is iterated with .items() further down).
# NOTE(review): pickle.load can execute arbitrary code; only open files from a trusted source.
with open('d3-fight-level-transform.pkl', 'rb') as f:
    data = pickle.load(f)

In [427]:
# NOTE(review): `load` is neither defined nor imported in this notebook -- presumably it comes from one of
# the helper scripts exec'd earlier and places X / y in the global namespace; confirm against that script.
load( '../out/d3-fight-level-transform.pkl' )
print( X.shape )

(4368, 165)


#### Examine key-value pairs in dict

In [428]:
# walk the loaded dict and print every entry next to its name
for name in data:
    print(name, data[name])

X [[ 2.91491784e-01 -9.60590063e-02  2.50234064e-01 ... -7.94744912e-03
   3.10245107e-03 -1.87224280e-03]
 [ 2.70915901e-01 -1.24214845e-01  2.32611941e-01 ...  7.75837707e-03
   1.68645662e-03 -2.72968561e-04]
 [-9.60671224e-03 -1.16270731e-02 -9.73710726e-03 ... -2.94963859e-01
   6.74519642e-04  2.24984217e-04]
 ...
 [-1.24919035e-02 -1.80251541e-01 -1.85625621e-01 ... -1.63732606e-03
   2.16095113e-03 -1.86365288e-03]
 [-1.20995932e-02 -1.73834821e-01 -1.79162807e-01 ... -1.52725608e-04
   7.74126643e-04 -3.98965666e-04]
 [-1.29588989e-02 -1.83460266e-01 -1.89608127e-01 ... -2.01230682e-03
   1.44181940e-03 -1.33855060e-03]]
y 0       1
1       1
2       1
3       0
4       0
       ..
4363    1
4364    1
4365    1
4366    1
4367    1
Name: Winner, Length: 4368, dtype: int64
cols Index(['title_bout', 'weight_class', 'no_of_rounds', 'date_year', 'date_month', 'date_dayofmonth', 'date_dayofweek', 'date_frisat', 'womens', 'weight_class_catch_weight',
       ...
       'Mean_avg_opp_H

In [429]:
# feature matrix dimensions as (rows, columns)
print(X.shape) # view feature shape; 4368 rows, 165 columns

(4368, 165)


In [430]:
print(y.shape) # view target shape; 4368 labels in a 1-D Series (there is no column axis)

(4368,)


#### Train-test split

In [431]:
# hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=718)

### SVM

#### Fit model (linear kernel)
(find decision boundary for linearly separable data)

In [158]:
# linear-kernel SVC with library defaults for everything else
svclassifier_linear = SVC(kernel = 'linear')
svclassifier_linear.fit(X = X_train, y = y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [159]:
# per-class precision / recall / F1 (plus overall accuracy) on the training split
print(classification_report(y_true = y_train, y_pred = svclassifier_linear.predict(X = X_train)))

              precision    recall  f1-score   support

           0       0.61      0.16      0.26       981
           1       0.71      0.95      0.81      2076

    accuracy                           0.70      3057
   macro avg       0.66      0.56      0.53      3057
weighted avg       0.68      0.70      0.63      3057



In [160]:
# per-class precision / recall / F1 (plus overall accuracy) on the held-out test split
print(classification_report(y_true = y_test, y_pred = svclassifier_linear.predict(X = X_test)))

              precision    recall  f1-score   support

           0       0.49      0.10      0.16       459
           1       0.66      0.94      0.78       852

    accuracy                           0.65      1311
   macro avg       0.57      0.52      0.47      1311
weighted avg       0.60      0.65      0.56      1311



In [161]:
# keep the test-set predictions, then tabulate actual labels against predicted labels
y_pred_linear = svclassifier_linear.predict(X = X_test)
print(confusion_matrix(y_true = y_test, y_pred = y_pred_linear))

[[ 45 414]
 [ 47 805]]


### Begin section for non-linear investigation

#### Fit model (poly kernel)

In [162]:
# degree-8 polynomial kernel; every other setting is the library default
svclassifier_poly = SVC(degree = 8, kernel = 'poly')
svclassifier_poly.fit(X = X_train, y = y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=8, gamma='auto_deprecated',
    kernel='poly', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [163]:
# per-class precision / recall / F1 (plus overall accuracy) on the training split
print(classification_report(y_true = y_train, y_pred = svclassifier_poly.predict(X = X_train)))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       981
           1       0.68      1.00      0.81      2076

    accuracy                           0.68      3057
   macro avg       0.34      0.50      0.40      3057
weighted avg       0.46      0.68      0.55      3057



  'precision', 'predicted', average, warn_for)


In [164]:
# per-class precision / recall / F1 (plus overall accuracy) on the held-out test split
print(classification_report(y_true = y_test, y_pred = svclassifier_poly.predict(X = X_test)))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       459
           1       0.65      1.00      0.79       852

    accuracy                           0.65      1311
   macro avg       0.32      0.50      0.39      1311
weighted avg       0.42      0.65      0.51      1311



  'precision', 'predicted', average, warn_for)


In [165]:
# keep the test-set predictions, then tabulate actual labels against predicted labels
y_pred_poly = svclassifier_poly.predict(X = X_test)
print(confusion_matrix(y_true = y_test, y_pred = y_pred_poly))

[[  0 459]
 [  0 852]]


#### Fit model (Gaussian kernel)
(the Gaussian kernel is the radial basis function (RBF) kernel; scikit-learn selects it with `kernel='rbf'`)

In [166]:
# RBF (Gaussian) kernel with library-default C and gamma
svclassifier_gaus = SVC(kernel = 'rbf')
svclassifier_gaus.fit(X = X_train, y = y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [167]:
# per-class precision / recall / F1 (plus overall accuracy) on the training split
print(classification_report(y_true = y_train, y_pred = svclassifier_gaus.predict(X = X_train)))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       981
           1       0.68      1.00      0.81      2076

    accuracy                           0.68      3057
   macro avg       0.34      0.50      0.40      3057
weighted avg       0.46      0.68      0.55      3057



  'precision', 'predicted', average, warn_for)


In [168]:
# per-class precision / recall / F1 (plus overall accuracy) on the held-out test split
print(classification_report(y_true = y_test, y_pred = svclassifier_gaus.predict(X = X_test)))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       459
           1       0.65      1.00      0.79       852

    accuracy                           0.65      1311
   macro avg       0.32      0.50      0.39      1311
weighted avg       0.42      0.65      0.51      1311



  'precision', 'predicted', average, warn_for)


In [169]:
# keep the test-set predictions, then tabulate actual labels against predicted labels
y_pred_gaus = svclassifier_gaus.predict(X = X_test)
print(confusion_matrix(y_true = y_test, y_pred = y_pred_gaus))

[[  0 459]
 [  0 852]]


#### After first running simply (as was done above), commence optimization...

In [170]:
# 5-fold grid search for C and gamma, run on the training split only
svc_parameter_optimization(X = X_train, y = y_train, nfolds = 5)

{'C': 1, 'gamma': 1}

In [171]:
# rebuild the RBF SVC with the values the grid search reported (C = 1, gamma = 1) and fit it
svclassifier_gaus_optim = SVC(C = 1, gamma = 1, kernel = 'rbf')
svclassifier_gaus_optim.fit(X = X_train, y = y_train)

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf', max_iter=-1,
    probability=False, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [172]:
# tuned RBF model: per-class precision / recall / F1 on the training split
print(classification_report(y_true = y_train, y_pred = svclassifier_gaus_optim.predict(X = X_train)))

              precision    recall  f1-score   support

           0       0.91      0.48      0.63       981
           1       0.80      0.98      0.88      2076

    accuracy                           0.82      3057
   macro avg       0.86      0.73      0.75      3057
weighted avg       0.84      0.82      0.80      3057



In [173]:
# tuned RBF model: per-class precision / recall / F1 on the held-out test split
print(classification_report(y_true = y_test, y_pred = svclassifier_gaus_optim.predict(X = X_test)))

              precision    recall  f1-score   support

           0       0.47      0.13      0.20       459
           1       0.66      0.92      0.77       852

    accuracy                           0.64      1311
   macro avg       0.57      0.53      0.49      1311
weighted avg       0.60      0.64      0.57      1311



In [174]:
# keep the test-set predictions, then tabulate actual labels against predicted labels
y_pred_gaus_optim = svclassifier_gaus_optim.predict(X = X_test)
print(confusion_matrix(y_true = y_test, y_pred = y_pred_gaus_optim))

[[ 60 399]
 [ 67 785]]


#### Fit model (Sigmoid kernel)
(the sigmoid kernel mirrors the sigmoid activation function used in neural networks; the problem here is binary classification, so predictions are 0 or 1)

In [175]:
# sigmoid kernel with library defaults for everything else
svclassifier_sig = SVC(kernel = 'sigmoid')
svclassifier_sig.fit(X = X_train, y = y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='sigmoid', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [176]:
# per-class precision / recall / F1 (plus overall accuracy) on the training split
print(classification_report(y_true = y_train, y_pred = svclassifier_sig.predict(X = X_train)))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       981
           1       0.68      1.00      0.81      2076

    accuracy                           0.68      3057
   macro avg       0.34      0.50      0.40      3057
weighted avg       0.46      0.68      0.55      3057



  'precision', 'predicted', average, warn_for)


In [177]:
# per-class precision / recall / F1 (plus overall accuracy) on the held-out test split
print(classification_report(y_true = y_test, y_pred = svclassifier_sig.predict(X = X_test)))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       459
           1       0.65      1.00      0.79       852

    accuracy                           0.65      1311
   macro avg       0.32      0.50      0.39      1311
weighted avg       0.42      0.65      0.51      1311



  'precision', 'predicted', average, warn_for)


In [178]:
# keep the test-set predictions, then tabulate actual labels against predicted labels
y_pred_sig = svclassifier_sig.predict(X = X_test)
print(confusion_matrix(y_true = y_test, y_pred = y_pred_sig))

[[  0 459]
 [  0 852]]


#### Conclusion
Linear SVM performs similarly to optimized Gaussian RBF on test sets.

### SGD

In [179]:
# instantiate SGD classifier; only the seed is fixed, every other hyper-parameter is the library default
linear_sgd_classifier = SGDClassifier(random_state = 0)

In [180]:
# candidate hyper-parameters: regularisation strength, loss function and penalty type
params = {
    'alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1],
    'loss': ['hinge', 'log'],
    'penalty': ['l1', 'l2'],
}

# 5-fold cross-validated search over every combination
grid_search = GridSearchCV(estimator = linear_sgd_classifier, param_grid = params, cv = 5)

grid_search.fit(X = X_train, y = y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=SGDClassifier(alpha=0.0001, average=False,
                                     class_weight=None, early_stopping=False,
                                     epsilon=0.1, eta0=0.0, fit_intercept=True,
                                     l1_ratio=0.15, learning_rate='optimal',
                                     loss='hinge', max_iter=1000,
                                     n_iter_no_change=5, n_jobs=None,
                                     penalty='l2', power_t=0.5, random_state=0,
                                     shuffle=True, tol=0.001,
                                     validation_fraction=0.1, verbose=0,
                                     warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'alpha': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1],
                         'loss': ['hinge', 'log'], 'penalty': ['l1', 'l2']},
             pre_dispatch='2*n_jobs', refit=True, ret

In [181]:
# best parameters & the corresponding score
# best_score_ is the cross-validated score of the winning combination; score() evaluates the
# refit best estimator on the held-out test split
print("Best CV params", grid_search.best_params_)
print("Best CV accuracy", grid_search.best_score_)
print("Test accuracy of best grid search hypers:", grid_search.score(X_test, y_test))

Best CV params {'alpha': 0.001, 'loss': 'log', 'penalty': 'l2'}
Best CV accuracy 0.6810598626104023
Test accuracy of best grid search hypers: 0.6559877955758963


In [182]:
%%time

# fit SGD classifier
# NOTE(review): this fits the estimator created with only random_state=0 (default hyper-parameters),
# not the best parameters found by the grid search -- confirm that is the intended comparison
linear_sgd_classifier.fit(X_train, y_train)

CPU times: user 62.1 ms, sys: 914 µs, total: 63 ms
Wall time: 62.4 ms


SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=0, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [183]:
# per-class precision / recall / F1 (plus overall accuracy) on the training split
print(classification_report(y_true = y_train, y_pred = linear_sgd_classifier.predict(X = X_train)))

              precision    recall  f1-score   support

           0       0.61      0.32      0.42       981
           1       0.74      0.90      0.81      2076

    accuracy                           0.72      3057
   macro avg       0.67      0.61      0.62      3057
weighted avg       0.70      0.72      0.69      3057



In [184]:
# per-class precision / recall / F1 (plus overall accuracy) on the held-out test split
print(classification_report(y_true = y_test, y_pred = linear_sgd_classifier.predict(X = X_test)))

              precision    recall  f1-score   support

           0       0.51      0.22      0.30       459
           1       0.68      0.88      0.77       852

    accuracy                           0.65      1311
   macro avg       0.59      0.55      0.54      1311
weighted avg       0.62      0.65      0.61      1311



As one can see, the accuracy on the test set is 0.65, matching Gaussian-RBF and linear SVM. It also has better precision and recall than optimized Gaussian-RBF and matches or betters linear SVM. It matches linear SVM on recall.

### Neural Network
ReLU activation function for hidden layers.
Sigmoid activation function for binary classification. CAUTION: Large neural nets trained on relatively small datasets can overfit the training data. Generalization error increases due to overfitting.

#### Instance 1
3 layers of 55 nodes/neurons/perceptrons mapping to 1 target variable

In [332]:
# start from an empty layer stack
classifier = Sequential()

# sizing heuristic: a hidden layer is usually somewhere between the input and output sizes;
# the rule used here is 165 input features / 3 hidden layers = 55 nodes per layer

# ReLU rests on the principle that linear models are easier to optimize
# the random-normal initializer draws the starting weights from a normal distribution
relu_layer = dict(activation = 'relu', kernel_initializer = 'random_normal')

# hidden layer 1 -- 55 neurons; the only layer that declares the 165-feature input
classifier.add(Dense(55, input_dim = 165, **relu_layer))

# hidden layers 2 and 3 -- same width and activation
for _hidden in range(2):
    classifier.add(Dense(55, **relu_layer))

# output layer
# a single node suffices for binary classification; softmax would need one node per class label
# (sigmoid is the two-class special case of softmax)
classifier.add(Dense(1, activation = 'sigmoid', kernel_initializer = 'random_normal'))

In [333]:
# compile NN
# Adam ("adaptive moment estimation") combines RMSProp with momentum
classifier.compile(
    loss = 'binary_crossentropy',
    optimizer = 'adam',
    metrics = ['accuracy'],
)

In [334]:
%%time

# fit training data to model
# batch_size = 55: 55 samples are shown to the network before each weight update
# (it also bounds how many samples are held in memory at a time)
# epochs = 60: the whole training set is iterated over 60 times
classifier.fit(x = X_train, y = y_train, epochs = 60, batch_size = 55)

Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60
Epoch 58/60
Epoch 59/60
Epoch 60/60
CPU times: user 28.1 s, sys: 3.24 s, total: 31.4 s
Wall time: 15.7 s


<keras.callbacks.History at 0x1a79d183c8>

In [335]:
# loss and accuracy on the training split
# accuracy near 100% on training data is a sign of overfitting
classifier.evaluate(x = X_train, y = y_train)



[0.013447447802407175, 0.9970559371933267]

In [336]:
# predict on test data and threshold in one step:
# outputs above 0.5 become True (class 1), everything else False (class 0)
y_pred = classifier.predict(X_test) > 0.5

In [337]:
# accuracy on test: ~60% for the run shown
confusion_matrix(y_test, y_pred) # TN + TP = 161 + 621 = 782; 782/1311 total = ~60% accurate

array([[161, 298],
       [231, 621]])

#### Instance 2
5 layers of 33 nodes mapping to 1 target variable

2 additional layers introduced b/c, empirically, deep NN's perform better

In [216]:
# start classifier 2 from an empty layer stack
classifier_2 = Sequential()

relu_layer = dict(activation = 'relu', kernel_initializer = 'random_normal')

# hidden layer 1 -- the only layer that declares the 165-feature input
classifier_2.add(Dense(33, input_dim = 165, **relu_layer))

# hidden layers 2 to 5 -- same width and activation
for _hidden in range(4):
    classifier_2.add(Dense(33, **relu_layer))

# output layer
classifier_2.add(Dense(1, activation = 'sigmoid', kernel_initializer = 'random_normal'))

In [217]:
# compile NN
classifier_2.compile(
    loss = 'binary_crossentropy',
    optimizer = 'adam',
    metrics = ['accuracy'],
)

In [218]:
%%time

# fit training data to model: 70 passes over the training set, 15 samples per weight update
classifier_2.fit(x = X_train, y = y_train, epochs = 70, batch_size = 15)

Epoch 1/70
Epoch 2/70
Epoch 3/70
Epoch 4/70
Epoch 5/70
Epoch 6/70
Epoch 7/70
Epoch 8/70
Epoch 9/70
Epoch 10/70
Epoch 11/70
Epoch 12/70
Epoch 13/70
Epoch 14/70
Epoch 15/70
Epoch 16/70
Epoch 17/70
Epoch 18/70
Epoch 19/70
Epoch 20/70
Epoch 21/70
Epoch 22/70
Epoch 23/70
Epoch 24/70
Epoch 25/70
Epoch 26/70
Epoch 27/70
Epoch 28/70
Epoch 29/70
Epoch 30/70
Epoch 31/70
Epoch 32/70
Epoch 33/70
Epoch 34/70
Epoch 35/70
Epoch 36/70
Epoch 37/70
Epoch 38/70
Epoch 39/70
Epoch 40/70
Epoch 41/70
Epoch 42/70
Epoch 43/70
Epoch 44/70
Epoch 45/70
Epoch 46/70
Epoch 47/70
Epoch 48/70
Epoch 49/70
Epoch 50/70
Epoch 51/70
Epoch 52/70
Epoch 53/70
Epoch 54/70
Epoch 55/70
Epoch 56/70
Epoch 57/70
Epoch 58/70
Epoch 59/70
Epoch 60/70
Epoch 61/70
Epoch 62/70
Epoch 63/70
Epoch 64/70
Epoch 65/70
Epoch 66/70
Epoch 67/70
Epoch 68/70
Epoch 69/70
Epoch 70/70
CPU times: user 1min 19s, sys: 9.64 s, total: 1min 29s
Wall time: 35.3 s


<keras.callbacks.History at 0x1a6f7f3400>

In [338]:
# loss and accuracy on the training split
# accuracy near 94% on training data is also a sign of overfitting
classifier_2.evaluate(x = X_train, y = y_train)



[0.16292443052592293, 0.9421000981549247]

In [339]:
# predict on test data and threshold in one step:
# outputs above 0.5 become True (class 1), everything else False (class 0)
y_pred_2 = classifier_2.predict(X_test) > 0.5

In [340]:
# accuracy on test: ~63% for the run shown
confusion_matrix(y_test, y_pred_2) # TN + TP = 237 + 584 = 821; 821/1311 total = ~63% accurate

array([[237, 222],
       [268, 584]])

#### Instance 3 -- tackle overfitting during model training
5 layers of 33 nodes mapping to 1 target variable

In [348]:
# instantiate classifier 3 (empty layer stack; layers are added in the next cell)
classifier_3 = Sequential()

Implementation of dropout regularization to approximate training a large # of NN's w/ unique architectures in parallel, which we don't have the time or compute for.

In [349]:
# NOTE: this cell appends to the existing classifier_3, so re-running it stacks the layers again
relu_layer = dict(activation = 'relu', kernel_initializer = 'random_normal')

# hidden layer 1 -- the only layer that declares the 165-feature input; followed by 10% dropout
classifier_3.add(Dense(33, input_dim = 165, **relu_layer))
classifier_3.add(Dropout(0.1))

# hidden layers 2 to 5 -- same width and activation, each followed by 10% dropout
for _hidden in range(4):
    classifier_3.add(Dense(33, **relu_layer))
    classifier_3.add(Dropout(0.1))

# output layer
classifier_3.add(Dense(1, activation = 'sigmoid', kernel_initializer = 'random_normal'))

In [350]:
# compile NN
classifier_3.compile(
    loss = 'binary_crossentropy',
    optimizer = 'adam',
    metrics = ['accuracy'],
)

In [351]:
%%time

# fit training data to model: 60 passes over the training set, 15 samples per weight update
classifier_3.fit(x = X_train, y = y_train, epochs = 60, batch_size = 15)

Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60
Epoch 58/60
Epoch 59/60
Epoch 60/60
CPU times: user 1min 50s, sys: 12.1 s, total: 2min 2s
Wall time: 54.6 s


<keras.callbacks.History at 0x1a7b27aa90>

In [352]:
# evaluate loss & metrics values for model
classifier_3.evaluate(X_train, y_train) # accuracy near 97% on training data for the run shown



[0.09575048808007214, 0.9744847890283299]

In [353]:
# predict on test data and threshold in one step:
# outputs above 0.5 become True (class 1), everything else False (class 0)
y_pred_3 = classifier_3.predict(X_test) > 0.5

In [354]:
# accuracy on test: ~63% for the run shown
confusion_matrix(y_test, y_pred_3) # TN + TP = 160 + 667 = 827; 827/1311 total = ~63% accurate

array([[160, 299],
       [185, 667]])

#### Instance 4
11 layers of 15 nodes mapping to 1 target variable

In [398]:
# instantiate classifier 4 (empty layer stack; layers are added in the next cell)
classifier_4 = Sequential()

In [399]:
# NOTE: this cell appends to the existing classifier_4, so re-running it stacks the layers again
relu_layer = dict(activation = 'relu', kernel_initializer = 'random_normal')

# hidden layer 1 -- the only layer that declares the 165-feature input; followed by 10% dropout
classifier_4.add(Dense(15, input_dim = 165, **relu_layer))
classifier_4.add(Dropout(0.1))

# hidden layers 2 to 11 -- same width and activation, each followed by 10% dropout
for _hidden in range(10):
    classifier_4.add(Dense(15, **relu_layer))
    classifier_4.add(Dropout(0.1))

# output layer
classifier_4.add(Dense(1, activation = 'sigmoid', kernel_initializer = 'random_normal'))

In [400]:
# compile NN
classifier_4.compile(
    loss = 'binary_crossentropy',
    optimizer = 'adam',
    metrics = ['accuracy'],
)

In [401]:
%%time

# fit training data to model: 200 passes over the training set, 15 samples per weight update
classifier_4.fit(x = X_train, y = y_train, epochs = 200, batch_size = 15)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

Epoch 83/200
Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/200
Epoch 154/200
Epoch 155

Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200
CPU times: user 9min, sys: 1min, total: 10min 1s
Wall time: 4min 16s


<keras.callbacks.History at 0x1a87806160>

In [402]:
# evaluate loss & metrics values for model
classifier_4.evaluate(X_train, y_train) # accuracy near 81% on training data for the run shown



[0.3397591671095122, 0.812234216571673]

In [403]:
# predict on test data and threshold in one step:
# outputs above 0.5 become True (class 1), everything else False (class 0)
y_pred_4 = classifier_4.predict(X_test) > 0.5

In [404]:
# accuracy on test: ~62% for the run shown
confusion_matrix(y_test, y_pred_4) # TN + TP = 295 + 514 = 809; 809/1311 total = ~62% accurate

array([[295, 164],
       [338, 514]])

#### Instance 5

Grid Search w/ Keras Classifier & 10-fold CV

In [436]:
# function to create model

def classifier_model(n_hidden=5, units=33, dropout=0.1, input_dim=165):
    """Build and compile a fully connected binary classifier.

    Called with no arguments it produces the original architecture: five
    ReLU hidden layers of 33 nodes, each followed by 10% dropout, feeding a
    single sigmoid output node.

    Parameters
    ----------
    n_hidden : int
        Number of hidden Dense + Dropout pairs (at least 1).
    units : int
        Nodes in every hidden layer.
    dropout : float
        Rate passed to each ``Dropout`` layer.
    input_dim : int
        Number of input features, declared on the first hidden layer only.

    Returns
    -------
    The compiled ``Sequential`` model (binary cross-entropy loss, Adam
    optimizer, accuracy metric).

    Raises
    ------
    ValueError
        If ``n_hidden`` is smaller than 1.
    """
    if n_hidden < 1:
        raise ValueError('n_hidden must be at least 1')

    # create model
    model = Sequential()

    for layer_idx in range(n_hidden):
        # only the first layer needs to know the width of the input
        first_layer_kwargs = {'input_dim': input_dim} if layer_idx == 0 else {}
        model.add(Dense(units, activation = 'relu', kernel_initializer = 'random_normal',
                        **first_layer_kwargs))
        model.add(Dropout(dropout))

    # output layer: a single sigmoid node for the binary target
    model.add(Dense(1, activation = 'sigmoid', kernel_initializer = 'random_normal'))

    # compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

In [437]:
# wrap the model factory so scikit-learn can treat the network as an estimator; training output is silenced
model = KerasClassifier(verbose = 0, build_fn = classifier_model)

In [438]:
# create grid search parameters

# candidate batch sizes and epoch counts for the search
batches = [10, 15, 20, 25, 30]
epochs = [25, 50, 100, 150, 200]

# keys are the argument names the search will vary
params = {'batch_size': batches, 'epochs': epochs}

In [439]:
%%time

# instantiate the GridSearchCV object and run search
# 10-fold CV over every batch_size / epochs pair, using all available cores
grid_search_2 = GridSearchCV(estimator = model, param_grid = params, n_jobs = -1, cv = 10)
# fit the search object defined on the line above (the original called an undefined name, `grid`);
# fit() returns the fitted search itself, so both names refer to the same object
grid_search_2_fit = grid_search_2.fit(X_train, y_train)

CPU times: user 8min 2s, sys: 47.3 s, total: 8min 49s
Wall time: 2h 4min 53s


In [443]:
# summarize results: best cross-validated score, then the batch_size / epochs pair that achieved it
print(grid_search_2_fit.best_score_, grid_search_2_fit.best_params_)

0.6774615658959572 {'batch_size': 10, 'epochs': 200}


In [445]:
# score the best network found by the search on the held-out test split
grid_search_2_fit.score(X = X_test, y = y_test)

0.6536994710803304