In [1]:
#Import packages
import os #Allows us to get operating system information in python.
#In artemis video, he did not import os package

#Data Handling
import pandas as pd, numpy as np

#Time
import time

#Plotting
import matplotlib.pyplot as plt, seaborn as sns, scipy.stats, pylab

#Saving data
import pickle

#train and test split
from sklearn.model_selection import train_test_split

#Scalers
from sklearn import preprocessing

#TomekLinks and RandomUnderSampler
from imblearn.under_sampling import TomekLinks, RandomUnderSampler

#Hyperparameter optimization
import optuna

#Metrics
from sklearn.metrics import f1_score, balanced_accuracy_score, recall_score, roc_auc_score

#General Management
import gc as gc
gc.enable()
from joblib import dump, load
from warnings import filterwarnings

#Notebook configurations
filterwarnings('ignore')

In [2]:
#FINAL MODELS 
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

In [4]:
#IMPORT ORIGINAL DATA
# The context manager guarantees the file handle is closed even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
with open('CCF_ProcessedData.pckl','rb') as f:
    pickle_list = pickle.load(f)

# The pickled object is indexed positionally; the order below must match the order
# used when the file was written.
tomek_modeling_data = pickle_list[0]
y = pickle_list[1]
rus_tomek_modeling_data = pickle_list[2]
y2 = pickle_list[3]
test = pickle_list[4] #The most important variable we are importing in this script

In [3]:
#IMPORT THE DATA THAT WE WILL USE TO TRAIN FINAL MODELS BEFORE TESTING ON TEST DATA
def get_tomek_data():
    """Load and return the object pickled in 'tomek_data.pckl'.

    The file is read from the current working directory. The caller indexes the
    result positionally (train/dev features and labels).

    Raises:
        OSError: if the file cannot be opened.
        pickle.UnpicklingError (or similar): if the file is not a valid pickle.
    """
    # `with` closes the handle even when pickle.load raises.
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open('tomek_data.pckl','rb') as f:
        return pickle.load(f)

def get_rus_data():
    """Load and return the object pickled in 'rus_data.pckl'.

    The file is read from the current working directory. The caller indexes the
    result positionally (train/dev features and labels).

    Raises:
        OSError: if the file cannot be opened.
        pickle.UnpicklingError (or similar): if the file is not a valid pickle.
    """
    # `with` closes the handle even when pickle.load raises.
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open('rus_data.pckl','rb') as f:
        return pickle.load(f)

### FINAL MODEL FOR TOMEK DATA 
Gaussian Naïve Bayes with var_smoothing = 9.125860889745052 * (10**-9)

In [5]:
# Load the Tomek data, which is already split into train and dev sets.
tomek_data = get_tomek_data()

# Positions 0-3 hold, in order: train features, dev features, train labels, dev labels.
tomek_X_train, tomek_X_dev, tomek_y_train, tomek_y_dev = (
    tomek_data[idx] for idx in range(4)
)

# Build (but do not yet fit) the final Gaussian Naive Bayes model.
vs_final = 9.125860889745052e-09
fmodel_tomek = GaussianNB(var_smoothing=vs_final)

In [6]:
# Fit the final Tomek model on the training split; as the last expression in the
# cell, the fitted estimator's repr is displayed.
fmodel_tomek.fit(X=tomek_X_train, y=tomek_y_train)

GaussianNB(var_smoothing=9.125860889745052e-09)

In [8]:
# Evaluate the final Tomek model on the dev split.
# Predict once and reuse the labels for both metrics instead of calling predict() twice.
tomek_dev_pred = fmodel_tomek.predict(tomek_X_dev)

# Both metrics use scikit-learn's default positive label (pos_label=1), rounded to 3 decimals.
recall_tomek = round(recall_score(y_true=tomek_y_dev, y_pred=tomek_dev_pred),3)
f1_tomek = round(f1_score(y_true=tomek_y_dev, y_pred=tomek_dev_pred),3)

print("Recall of Final Tomek Model: {}".format(recall_tomek))
print("F1 Score of Final Tomek Model: {}".format(f1_tomek))

# NOTE(review): the recorded run prints recall 0.998 but F1 0.005, which implies
# precision well below 1% (almost every dev row predicted positive) - confirm this
# model is really the intended final choice.

Recall of Final Tomek Model: 0.998
F1 Score of Final Tomek Model: 0.005


### FINAL MODEL FOR TOMEK+RUS DATA 
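Random Forest Classifier with the following tuned parameters (each base/exp pair appears to combine as base × 10^exp, truncated to an integer, giving the n_estimators = 9841 and min_samples_leaf = 9566 used in the code below):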
 'rfc_num_exp': 3,
 'rfc_num_base': 9.8410100557842,
 'rfc_maxdepth': 5,
 'rfc_ml_exp': 3,
 'rfc_ml_base': 9.56629896082272,
 'rfc_bootstrap': True,
 'rfc_maxsamples': 0.9767807421240824

In [12]:
# Load the Tomek+RUS data, which is already split into train and dev sets.
rus_data = get_rus_data()

# Positions 0-3 hold, in order: train features, dev features, train labels, dev labels.
rus_X_train, rus_X_dev, rus_y_train, rus_y_dev = (
    rus_data[idx] for idx in range(4)
)

#FINAL VALUES
rfc_num, rfc_maxdepth, rfc_minleaf = 9841, 5, 9566
rfc_bootstrap, rfc_maxsamples = True, 0.9767807421240824

# NOTE(review): an integer min_samples_leaf is an absolute row count per leaf. If the
# undersampled training set has fewer than 2 * 9566 rows, no split can satisfy it and
# every tree stays a single node - confirm against len(rus_X_train).
# random_state is fixed so the forest is reproducible.
fmodel_rus = RandomForestClassifier(
    n_estimators=rfc_num,
    max_depth=rfc_maxdepth,
    min_samples_leaf=rfc_minleaf,
    bootstrap=rfc_bootstrap,
    max_samples=rfc_maxsamples,
    random_state=10,
)

In [13]:
# Fit the final Tomek+RUS model on the training split; as the last expression in the
# cell, the fitted estimator's repr is displayed.
fmodel_rus.fit(X=rus_X_train, y=rus_y_train)

RandomForestClassifier(max_depth=5, max_samples=0.9767807421240824,
                       min_samples_leaf=9566, n_estimators=9841,
                       random_state=10)

In [15]:
# Evaluate the final Tomek+RUS model on the dev split.
# Predict once and reuse the labels for both metrics instead of calling predict() twice
# (a 9841-tree forest makes the duplicate prediction pass needlessly slow).
rus_dev_pred = fmodel_rus.predict(rus_X_dev)

# Both metrics use scikit-learn's default positive label (pos_label=1), rounded to 3 decimals.
recall_rus = round(recall_score(y_true=rus_y_dev, y_pred=rus_dev_pred),3)
f1_rus = round(f1_score(y_true=rus_y_dev, y_pred=rus_dev_pred),3)

print("Recall of Final Tomek+RUS Model: {}".format(recall_rus))
print("F1 Score of Final Tomek+RUS Model: {}".format(f1_rus))

# NOTE(review): the recorded run prints recall 1.0 and F1 0.667, which implies
# precision of about 0.5 - consistent with predicting every dev row as positive on a
# balanced set. Confirm the model is not degenerate.

Recall of Final Tomek+RUS Model: 1.0
F1 Score of Final Tomek+RUS Model: 0.667
