In [None]:
%load_ext autoreload
%load_ext autotime
%autoreload 2

## Imports

In [None]:
# importing packages
import os, sys
import pandas as pd
import numpy as np
import warnings

# setting project path
gparent = os.path.join(os.pardir, os.pardir)
sys.path.append(gparent)

from src import helper_functions as f
from src import visualizations as v
from src import class_Database as d
from src import class_Harness as h

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.pipeline import make_pipeline 
from sklearn.metrics import (accuracy_score, recall_score, precision_score,
                             f1_score, make_scorer, confusion_matrix)
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC 
from sklearn.tree import DecisionTreeClassifier, export_graphviz, plot_tree
from sklearn.ensemble import (BaggingClassifier, AdaBoostClassifier,
                              RandomForestClassifier, VotingClassifier)
import xgboost as xgb
from xgboost import XGBClassifier, plot_importance

from keras import layers, models, optimizers, losses, metrics
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier

import matplotlib.pyplot as plt
import seaborn as sns
import graphviz
%matplotlib inline

sns.set_theme('talk')
plt.style.use('fivethirtyeight')
sns.set_palette("winter_r")
pd.options.display.max_rows = 300

## Creating The Database

In [None]:
# creating database & database class instance
database_name = 'outcomes.db'
db = d.Database(database_name)

**Run The Cell Below Once And Then Comment Out**

In [None]:
# file_name = 'anonymisedData.zip'
# db.populate(file_name, database_name)

### Database ERD

In [None]:
db.erd

## Preprocessing and Harness Objects

In [None]:
# Column selectors: one picks string (object) columns, the other numeric ones.
string_selector = make_column_selector(dtype_include='object')
number_selector = make_column_selector(dtype_include='number', dtype_exclude='object')

# One-hot encoding of the string columns only; no transformer is assigned
# to the numeric columns here.
preprocessing_ohe = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), string_selector),
)

# One-hot encoding plus standardisation of the numeric columns.
preprocessing_ss = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), string_selector),
    (StandardScaler(), number_selector),
)

# One-hot encoding plus min-max scaling of the numeric columns.
preprocessing_mm = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), string_selector),
    (MinMaxScaler(), number_selector),
)

In [None]:
# Scorers taken from the project's helper module.
# NOTE(review): none of these four names is referenced again in this
# notebook; the Harness below is configured with metric names given as
# strings instead -- confirm whether these assignments are still needed.

# accuracy scorer
accuracy = f.acc_score

# f1 scorer
f1 = f.f1

# recall scorer
recall = f.recall

# precision scorer
precision = f.precision

# Harness instance behind every `modeling.report(...)` call and the
# `modeling.history` summaries further down.
modeling = h.Harness(('accuracy', 'f1', 'recall', 'precision'))

# STUDENTINFO Table

In [None]:
db.table_info('STUDENTINFO')

## Creating STUDENTINFO Data Frame

In [None]:
df = db.student_info()

In [None]:
df.info()

In [None]:
df.target.value_counts(normalize=True)

## Creating Holdout Data

In [None]:
# splitting data & target
X, y = f.X_y(df)

In [None]:
# splitting training and holdout data
X_training, X_holdout, y_training, y_holdout = f.test_train(X, y)

## Creating Training Data

In [None]:
X_train, X_test, y_train, y_test = f.test_train(X_training, y_training)

## Baseline Model: Dummy Classifier

In [None]:
dummy = DummyClassifier(strategy='most_frequent')

In [None]:
baseline = make_pipeline(preprocessing_ohe, dummy)

### Fitting Model

In [None]:
baseline.fit(X_train, y_train)

### Saving Model

In [None]:
# f.pickle_model(baseline, 'baseline')

### Cross Val Scores

In [None]:
# Report cross-val scores for the baseline pipeline; warnings are silenced
# only for the duration of this call.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    # The model name is passed as 'baseline' (no stray leading space) so it
    # matches the identifier used for the same model in the pickle and
    # confusion-report cells.
    modeling.report(baseline, X_train, y_train, 'baseline', 'Baseline DummyCFL')

### Test Data Confusion Matrix

In [None]:
%%capture --no-display
f.confusion_report(baseline, X_test, y_test, 'baseline')

## First Simple Model: Naive Bayes Classifier

In [None]:
mnb = MultinomialNB()

In [None]:
fsm = make_pipeline(preprocessing_ohe, mnb)

### Fitting Model

In [None]:
fsm.fit(X_train, y_train)

### Saving Model

In [None]:
# f.pickle_model(fsm, 'fsm')

### Cross Val Scores

In [None]:
# checking cross val scores
# (model name passed without the stray leading space, consistent with the
# other report calls in this notebook)
modeling.report(fsm, X_train, y_train, 'fsm', 'FSM Naive Bayes')

### Test Data Confusion Matrix

In [None]:
f.confusion_report(fsm, X_test, y_test, 'fsm')

## KNeighborsClassifier

In [None]:
KNN = KNeighborsClassifier(n_jobs=-1)

In [None]:
knn = make_pipeline(preprocessing_ohe, KNN)

### Fitting Model

In [None]:
knn.fit(X_train, y_train)

In [None]:
# f.pickle_model(knn, 'knn')

### Cross Val Scores

In [None]:
# checking cross val scores
modeling.report(knn, X_train, y_train, 'knn', 'KNN')

### Test Data Confusion Matrix

In [None]:
f.confusion_report(knn, X_test, y_test)

## Logistic Regression Classifier

In [None]:
LR = LogisticRegression(max_iter=1000, n_jobs=-1, random_state = 2021, verbose=0)

In [None]:
logreg = make_pipeline(preprocessing_ohe, LR)

### Fitting Model

In [None]:
logreg.fit(X_train, y_train)

In [None]:
# f.pickle_model(logreg, 'logreg')

### Cross Val Scores

In [None]:
# checking cross val scores
modeling.report(logreg, X_train, y_train, 'logreg', 'Logistic Regression')

### Test Data Confusion Matrix

In [None]:
f.confusion_report(logreg, X_test, y_test)

## Support Vector Machine

In [None]:
# The name `SVC` is rebound from the imported class to an estimator instance
# (later cells pass `SVC` to make_pipeline). Instantiating through an alias
# keeps this cell re-runnable: the original `SVC = SVC()` raises
# "TypeError: 'SVC' object is not callable" on a second execution.
from sklearn.svm import SVC as SupportVectorClassifier
SVC = SupportVectorClassifier()

In [None]:
svc = make_pipeline(preprocessing_ohe, SVC)

### Fitting Model

In [None]:
svc.fit(X_train, y_train)

In [None]:
# f.pickle_model(svc, 'svc')

### Cross Val Scores

In [None]:
# checking cross val scores
modeling.report(svc, X_train, y_train, 'svc', 'Support Vector Classifier')

### Test Data Confusion Matrix

In [None]:
f.confusion_report(svc, X_test, y_test)

## Linear Support Vector Machine

In [None]:
# Seeded like the other estimators in this notebook (random_state=2021) so
# the solver's data shuffling is reproducible between runs.
LSVC = LinearSVC(random_state=2021)

In [None]:
lsvc = make_pipeline(preprocessing_ohe, LSVC)

### Fitting Model

In [None]:
lsvc.fit(X_train, y_train)

In [None]:
# f.pickle_model(lsvc, 'lsvc')

### Cross Val Scores

In [None]:
# checking cross val scores
modeling.report(lsvc, X_train, y_train, 'lsvc', 'Linear SVC')

### Test Data Confusion Matrix

In [None]:
f.confusion_report(lsvc, X_test, y_test)

## Decision Tree

In [None]:
DT = DecisionTreeClassifier(random_state=2021)

In [None]:
dt = make_pipeline(preprocessing_ohe, DT)

### Fitting Model

In [None]:
dt.fit(X_train, y_train)

In [None]:
# f.pickle_model(dt, 'dt')

### Cross Val Scores

In [None]:
# checking cross val scores
modeling.report(dt, X_train, y_train, 'dt', 'Decision Tree')

### Test Data Confusion Matrix

In [None]:
f.confusion_report(dt, X_test, y_test)

## Bagging Classifier

In [None]:
# Bagging ensemble built on the decision tree defined above.
# The base estimator is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2, while the first
# positional parameter is the base estimator in both old and new releases.
BCLF = BaggingClassifier(DT, random_state=2021)

In [None]:
bclf = make_pipeline(preprocessing_ohe, BCLF)

### Fitting Model

In [None]:
bclf.fit(X_train, y_train)

In [None]:
# f.pickle_model(bclf, 'bclf')

### Cross Val Scores

In [None]:
# checking cross val scores
modeling.report(bclf, X_train, y_train, 'bclf', 'Bagging CLF')

### Test Data Confusion Matrix

In [None]:
f.confusion_report(bclf, X_test, y_test)

## Random Forest Classifier

In [None]:
RF = RandomForestClassifier(n_jobs=-1, random_state=2021)

In [None]:
rf = make_pipeline(preprocessing_ohe, RF)

### Fitting Model

In [None]:
rf.fit(X_train, y_train)

In [None]:
# f.pickle_model(rf, 'rf')

### Cross Val Scores

In [None]:
# checking cross val scores
modeling.report(rf, X_train, y_train, 'rf', 'Random Forest')

### Test Data Confusion Matrix

In [None]:
f.confusion_report(rf, X_test, y_test)

## AdaBoost

In [None]:
# AdaBoost ensemble built on the decision tree defined above.
# The base estimator is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2, while the first
# positional parameter is the base estimator in both old and new releases.
ADA = AdaBoostClassifier(DT, random_state=2021)

In [None]:
ada = make_pipeline(preprocessing_ohe, ADA)

### Fitting Model

In [None]:
ada.fit(X_train, y_train)

In [None]:
# f.pickle_model(ada, 'ada')

### Cross Val Scores

In [None]:
# checking cross val scores
modeling.report(ada, X_train, y_train, 'ada', 'AdaBoost')

### Test Data Confusion Matrix

In [None]:
f.confusion_report(ada, X_test, y_test)

## XGBoost

In [None]:
XGB = XGBClassifier(use_label_encoder=False)

In [None]:
# NOTE: this rebinds `xgb`, which the imports cell bound to the xgboost
# module (`import xgboost as xgb`). From here on `xgb` is this pipeline and
# the module alias is no longer reachable under that name.
xgb = make_pipeline(preprocessing_ohe, XGB)

### Fitting Model

In [None]:
%%capture --no-display
xgb.fit(X_train, y_train)

In [None]:
# f.pickle_model(xgb, 'xgb')

### Cross Val Scores

In [None]:
%%capture --no-display
# checking cross val scores
modeling.report(xgb, X_train, y_train, 'xgb', 'XGBoost')

### Test Data Confusion Matrix

In [None]:
%%capture --no-display
f.confusion_report(xgb, X_test, y_test)

## Model Summary 1

In [None]:
modeling.history

# STUDENTINFO & STUDENTVLE

In [None]:
df = db.sv_si()

In [None]:
df.info()

In [None]:
df.head()

In [None]:
df.describe()

## Creating Holdout Data

In [None]:
# splitting data & target
X, y = f.X_y(df)

In [None]:
# splitting training and holdout data
X_training, X_holdout, y_training, y_holdout = f.test_train(X, y)

## Creating Training Data

In [None]:
# train test split
X_train, X_test, y_train, y_test = f.test_train(X_training, y_training)

## MinMax Scaling
Since we've added numerical columns with very different scales to the data, we'll implement some scaling in the pipeline. K-Nearest Neighbors in particular is sensitive to differences in feature scale, and scaling can also help speed up convergence in certain algorithms.

## Naive Bayes Classifier

In [None]:
# Naive Bayes on the combined data, with min-max scaled numeric columns.
# NOTE(review): `mnb` is the same MultinomialNB instance already used as the
# final step of the `fsm` pipeline, and make_pipeline does not copy its steps.
# Fitting `mnb_svsi` therefore refits that shared object, so `fsm` no longer
# holds its original fit afterwards. Wrap the estimator in
# sklearn.base.clone(...) if the earlier pipeline must stay usable.
mnb_svsi = make_pipeline(preprocessing_mm, mnb)

### Fitting Model

In [None]:
mnb_svsi.fit(X_train, y_train)

### Saving Model

In [None]:
# f.pickle_model(mnb_svsi, 'mnb_svsi')

### Cross Val Scores

In [None]:
# checking cross val scores
# (model name passed without the stray leading space, consistent with the
# other report calls in this notebook)
modeling.report(mnb_svsi, X_train, y_train, 'mnb_svsi', 'Naive Bayes SVSI/MinMax()')

### Test Data Confusion Matrix

In [None]:
f.confusion_report(mnb_svsi, X_test, y_test)

## KNeighborsClassifier

In [None]:
knn_svsi = make_pipeline(preprocessing_mm, KNN)

### Fitting Model

In [None]:
knn_svsi.fit(X_train, y_train)

In [None]:
# f.pickle_model(knn_svsi, 'knn_svsi')

### Cross Val Scores

In [None]:
# checking cross val scores
modeling.report(knn_svsi, X_train, y_train, 'knn_svsi', 'KNN SVSI/MinMax()')

### Test Data Confusion Matrix

In [None]:
f.confusion_report(knn_svsi, X_test, y_test)

## Logistic Regression Classifier

In [None]:
logreg_svsi = make_pipeline(preprocessing_mm, LR)

### Fitting Model

In [None]:
logreg_svsi.fit(X_train, y_train)

In [None]:
# f.pickle_model(logreg_svsi, 'logreg_svsi')

### Cross Val Scores

In [None]:
# checking cross val scores
modeling.report(logreg_svsi, X_train, y_train, 'logreg_svsi', 'LogReg SVSI/MinMax()')

### Test Data Confusion Matrix

In [None]:
f.confusion_report(logreg_svsi, X_test, y_test)

## Support Vector Machine

In [None]:
svc_svsi= make_pipeline(preprocessing_mm, SVC)

### Fitting Model

In [None]:
svc_svsi.fit(X_train, y_train)

In [None]:
# f.pickle_model(svc_svsi, 'svc_svsi')

### Cross Val Scores

In [None]:
# checking cross val scores
modeling.report(svc_svsi, X_train, y_train, 'svc_svsi', 'Support Vector CLF SVSI/MinMax()')

### Test Data Confusion Matrix

In [None]:
f.confusion_report(svc_svsi, X_test, y_test)

## Linear Support Vector Machine

In [None]:
lsvc_svsi = make_pipeline(preprocessing_mm, LSVC)

### Fitting Model

In [None]:
lsvc_svsi.fit(X_train, y_train)

In [None]:
# f.pickle_model(lsvc_svsi, 'lsvc_svsi')

### Cross Val Scores

In [None]:
# checking cross val scores
modeling.report(lsvc_svsi, X_train, y_train, 'lsvc_svsi', 'Linear SVC SVSI/MinMax()')

### Test Data Confusion Matrix

In [None]:
f.confusion_report(lsvc_svsi, X_test, y_test)

## Decision Tree

In [None]:
dt_svsi = make_pipeline(preprocessing_mm, DT)

### Fitting Model

In [None]:
dt_svsi.fit(X_train, y_train)

In [None]:
# f.pickle_model(dt_svsi, 'dt_svsi')

### Cross Val Scores

In [None]:
# checking cross val scores
modeling.report(dt_svsi, X_train, y_train, 'dt_svsi', 'Decision Tree CLF SVSI/MinMax()')

### Test Data Confusion Matrix

In [None]:
f.confusion_report(dt_svsi, X_test, y_test)

## Bagging Classifier

In [None]:
bclf_svsi = make_pipeline(preprocessing_mm, BCLF)

### Fitting Model

In [None]:
bclf_svsi.fit(X_train, y_train)

In [None]:
# f.pickle_model(bclf_svsi, 'bclf_svsi')

### Cross Val Scores

In [None]:
# checking cross val scores
modeling.report(bclf_svsi, X_train, y_train, 'bclf_svsi', 'Bagging CLF SVSI/MinMax()')

### Test Data Confusion Matrix

In [None]:
f.confusion_report(bclf_svsi, X_test, y_test)

## Random Forest Classifier

In [None]:
rf_svsi = make_pipeline(preprocessing_mm, RF)

### Fitting Model

In [None]:
rf_svsi.fit(X_train, y_train)

In [None]:
# f.pickle_model(rf_svsi, 'rf_svsi')

### Cross Val Scores

In [None]:
# checking cross val scores
modeling.report(rf_svsi, X_train, y_train, 'rf_svsi', 'Random Forest SVSI/MinMax()')

### Test Data Confusion Matrix

In [None]:
f.confusion_report(rf_svsi, X_test, y_test)

## AdaBoost

In [None]:
ada_svsi = make_pipeline(preprocessing_mm, ADA)

### Fitting Model

In [None]:
ada_svsi.fit(X_train, y_train)

In [None]:
# f.pickle_model(ada_svsi, 'ada_svsi')

### Cross Val Scores

In [None]:
# checking cross val scores
modeling.report(ada_svsi, X_train, y_train, 'ada_svsi', 'AdaBoost SVSI/MinMax()')

### Test Data Confusion Matrix

In [None]:
f.confusion_report(ada_svsi, X_test, y_test)

## XGBoost

In [None]:
xgb_svsi = make_pipeline(preprocessing_mm, XGB)

### Fitting Model

In [None]:
%%capture --no-display
xgb_svsi.fit(X_train, y_train)

In [None]:
# f.pickle_model(xgb_svsi, 'xgb_svsi')

### Cross Val Scores

In [None]:
%%capture --no-display
# checking cross val scores
modeling.report(xgb_svsi, X_train, y_train, 'xgb_svsi', 'XGBoost SVSI/MinMax()')

### Test Data Confusion Matrix

In [None]:
%%capture --no-display
f.confusion_report(xgb_svsi, X_test, y_test)

## Model Summary 2

In [None]:
modeling.history

## Voting Classifier
Using the top three models in a voting classifier.

In [None]:
# Members of the voting ensemble: logistic regression, random forest, XGBoost.
clf1 = LR
clf2 = RF
clf3 = XGB

# Exactly one entry per model. The original list registered the logistic
# regression twice (as 'lsvs' and 'logreg'), giving it two of the four hard
# votes and making ties possible.
estimators = [('logreg', clf1), ('rf', clf2), ('xgb', clf3)]

# Hard voting: the predicted class is the majority label across the members.
VC = VotingClassifier(estimators=estimators, voting='hard', n_jobs=-1)

In [None]:
vc_svsi = make_pipeline(preprocessing_mm, VC)

### Fitting Model

In [None]:
%%capture --no-display
vc_svsi.fit(X_train, y_train)

In [None]:
# f.pickle_model(vc_svsi, 'vc_svsi')

### Cross Val Scores

In [None]:
%%capture --no-display
# checking cross val scores
modeling.report(vc_svsi, X_train, y_train, 'vc_svsi', 'Voting CLF SVSI/MinMax()')

### Test Data Confusion Matrix

In [None]:
%%capture --no-display
f.confusion_report(vc_svsi, X_test, y_test)

## Model Summary 3

In [None]:
modeling.history

## Neural Network

### Creating Validation Data

### Create Model

In [None]:
def create_model(input_dim=49):
    """Build and compile the feed-forward binary classifier.

    Architecture: Dense(64) -> Dense(32) -> Dense(16) x 3, all ReLU,
    followed by a single sigmoid output unit. The model is compiled with
    binary cross-entropy loss, the Adam optimizer and accuracy as metric.

    Parameters
    ----------
    input_dim : int, default 49
        Number of input features expected by the first layer. The default
        keeps the previously hard-coded width (presumably the column count
        produced by the preprocessing step -- confirm if the features change).

    Returns
    -------
    A compiled keras ``Sequential`` model.
    """
    model = Sequential()
    # The input layer fixes the expected feature width.
    model.add(Dense(64, input_dim=input_dim, activation='relu'))
    # Hidden ReLU layers, narrowing towards the output.
    for units in (32, 16, 16, 16):
        model.add(Dense(units, activation='relu'))
    # A single sigmoid unit outputs the positive-class probability.
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

In [None]:
# Wrap the Keras model factory in the scikit-learn style classifier so it can
# serve as the final step of a pipeline; `create_model` is passed as the
# build function, together with the epochs / batch_size / verbose settings.
NN = KerasClassifier(build_fn=create_model,
                     epochs=10,
                     batch_size=32,
                     verbose=0)

### Pipeline

In [None]:
nn_1 = make_pipeline(preprocessing_mm, NN)

### Fitting Models

In [None]:
%%capture --no-display
nn_1.fit(X_train, y_train)

### Cross Val Scores

In [None]:
%%capture --no-display
modeling.report(nn_1, X_train, y_train, 'nn_1_svsi', 'Neural Net SVSI/MinMax()')

### Test Data Confusion Matrix

In [None]:
%%capture --no-display
f.confusion_report_nn(nn_1, X_test, y_test)

## Model Summary 4

In [None]:
modeling.history

In [None]:
# df = modeling.history
# f.df_plot(df, 'modeling_history')