In [8]:
#Import packages
import os #Allows us to get operating system information in python.
#In artemis video, he did not import os package

#Data Handling
import pandas as pd, numpy as np

#Time
import time

#Plotting
import matplotlib.pyplot as plt, seaborn as sns, scipy.stats, pylab

#Saving data
import pickle

#train and test split
from sklearn.model_selection import train_test_split

#Scalers
from sklearn import preprocessing

#TomekLinks and RandomUnderSampler
from imblearn.under_sampling import TomekLinks, RandomUnderSampler

#Metrics
from sklearn.metrics import f1_score, balanced_accuracy_score, recall_score, roc_auc_score

#General Management
import gc as gc
gc.enable()
from joblib import dump, load
from warnings import filterwarnings

#Notebook configurations
filterwarnings('ignore')

In [2]:
#MODELS
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB 

  import pandas.util.testing as tm


In [3]:
#Load the pickled list of modeling data.
#NOTE(review): pickle.load can execute arbitrary code - only load files you created yourself.
with open('CCF_ProcessedData.pckl','rb') as f: #The with-block closes the file even if loading fails
    pickle_list = pickle.load(f)

#pickle_list = [tomek_modeling_data, y, rus_tomek_modeling_data, y2, test]

tomek_modeling_data = pickle_list[0]
y = pickle_list[1]
rus_tomek_modeling_data = pickle_list[2]
y2 = pickle_list[3]
test = pickle_list[4]

In [5]:
#TEST
#print(tomek_modeling_data.head())
#print(rus_tomek_modeling_data.head())

#Double check column values before moving forward

#tomek_modeling_data - GOOD!
# for col in tomek_modeling_data.columns:
#     print(col, tomek_modeling_data[col].dtype) 
    
#rus_tomek_modeling_data - GOOD!
# for col in rus_tomek_modeling_data.columns:
#     print(col, rus_tomek_modeling_data[col].dtype)

# modeling_cat_cols = [col for col in modeling_data.columns if col not in ['Age', 'Flight Distance', 'Departure Delay in Minutes','Arrival Delay in Minutes', 'satisfaction']]
# t_mcol = [col for col in tomek_modeling_data.columns if col not in ['amount_4root','oldbalanceOrig_4root','newbalanceOrig_4root','oldbalanceDest_4root','newbalanceDest_4root']]
# t_rus_mcol = [col for col in rus_tomek_modeling_data.columns if col not in ['amount_4root','oldbalanceOrig_4root','newbalanceOrig_4root','oldbalanceDest_4root','newbalanceDest_4root']]

# # for col in modeling_cat_cols:
# #     modeling_data[col] = modeling_data[col].astype('category')

# #Tomek
# for col in t_mcol:
#     tomek_modeling_data[col] = tomek_modeling_data[col].astype('category')
    
# #Tomek + RUS
# for col in t_rus_mcol:
#     rus_tomek_modeling_data[col] = rus_tomek_modeling_data[col].astype('category')

Unnamed: 0,isFraud
0,0
1,0
2,0
3,0
4,0


# MODELING

In [4]:
#List of the 13 classifiers we will test
#Every estimator that accepts a seed uses random_state=1 so repeated runs are comparable.
#NOTE(review): the XGBClassifier previously received categorical_features=True, which is not an
#XGBClassifier parameter (the scikit-learn API flag is enable_categorical). XGBoost only forwarded
#it as an unused booster parameter, so it is dropped here. Confirm whether categorical handling
#was actually intended before enabling enable_categorical.
classifiers = [XGBClassifier(random_state=1),
               LGBMClassifier(random_state=1,is_unbalance=True),
               RandomForestClassifier(random_state=1),
               ExtraTreesClassifier(random_state=1),
               GradientBoostingClassifier(random_state=1),
               DecisionTreeClassifier(random_state=1),
               ExtraTreeClassifier(random_state=1),
               LogisticRegression(random_state=1),
               RidgeClassifier(random_state=1),
               SGDClassifier(random_state=1),
               KNeighborsClassifier(n_neighbors=10),
               GaussianNB(),
               MultinomialNB()]

In [12]:
#Function that will test the classifiers
def test_classifiers(train_data, train_classes, test_data, test_classes, classifiers):
    results = {} #Save the results in a dictionary
    
    #Loop through the different classifiers in the list
    for clf in classifiers:
        name = clf.__class__.__name__ #Grab the name of the classifier
        
        print("Now training {}...".format(name)) #Lets us know what classifier we are on
        
        start_time = time.time() #Start keeping track of time
        clf.fit(train_data, train_classes) #Fit the training data to the classifier
        predict = clf.predict(test_data) #Make predictions on test data
        
        #METRICS - Compare test data predictions with actual values
        f1 = round(f1_score(y_true=test_classes, y_pred=predict, pos_label=1), 3)
        bal_acc = round(balanced_accuracy_score(test_classes, predict), 3)
        recall = round(recall_score(test_classes, predict, pos_label=1), 3)
        #roc_auc = round(roc_auc_score(test_classes,clf.predict_proba(test_data)), 3)
        
        stop_time = time.time() #Stop keeping track of time
        runtime = round(stop_time - start_time, 3) #Calculate run time
        
        print("{} trained in {} with \n  F1: {} \n  Balanced Accuracy: {} \n  Recall: {}".format(name,runtime,f1,bal_acc,recall))
        
        results[name] = (f1, bal_acc, recall, runtime)
        
    return results

In [11]:
#Split tomek_modeling_data & y: 10% held out as the dev set, stratified on y
Xtrain_tomek, Xdev_tomek, ytrain_tomek, ydev_tomek = train_test_split(
    tomek_modeling_data,
    y,
    test_size=0.1,
    random_state=5,
    stratify=y,
)

#Split rus_tomek_modeling_data & y2: 30% held out as the dev set, stratified on y2
Xtrain_rus, Xdev_rus, ytrain_rus, ydev_rus = train_test_split(
    rus_tomek_modeling_data,
    y2,
    test_size=0.3,
    random_state=5,
    stratify=y2,
)

### Tomek Modeling Data

In [13]:
#The training call used to be commented out, which left classifier_results_tomek undefined on a
#fresh kernel. Reuse the saved results when the pickle exists; otherwise run the (long) training.
if os.path.exists('classifier_results_tomek.pckl'):
    with open('classifier_results_tomek.pckl','rb') as f:
        classifier_results_tomek = pickle.load(f)[0] #The file stores a one-element list holding the dict
else:
    classifier_results_tomek = test_classifiers(Xtrain_tomek, ytrain_tomek, Xdev_tomek, ydev_tomek, classifiers)

Now training XGBClassifier...
XGBClassifier trained in 941.981 with 
  F1: 0.888 
  Balanced Accuracy: 0.913 
  Recall: 0.825
Now training LGBMClassifier...
LGBMClassifier trained in 45.119 with 
  F1: 0.121 
  Balanced Accuracy: 0.963 
  Recall: 0.943
Now training RandomForestClassifier...
RandomForestClassifier trained in 1551.686 with 
  F1: 0.864 
  Balanced Accuracy: 0.89 
  Recall: 0.78
Now training ExtraTreesClassifier...
ExtraTreesClassifier trained in 386.59 with 
  F1: 0.876 
  Balanced Accuracy: 0.897 
  Recall: 0.793
Now training GradientBoostingClassifier...
GradientBoostingClassifier trained in 1663.791 with 
  F1: 0.528 
  Balanced Accuracy: 0.71 
  Recall: 0.421
Now training DecisionTreeClassifier...
DecisionTreeClassifier trained in 56.936 with 
  F1: 0.904 
  Balanced Accuracy: 0.95 
  Recall: 0.9
Now training ExtraTreeClassifier...
ExtraTreeClassifier trained in 9.967 with 
  F1: 0.797 
  Balanced Accuracy: 0.895 
  Recall: 0.791
Now training LogisticRegression...
Lo

In [14]:
#5/18/23
#Save the dictionary so we do not have to re-run above code again
pickle_list = [classifier_results_tomek] #Stored as a one-element list
with open('classifier_results_tomek.pckl','wb') as f: #The with-block closes the file even if dump fails
    pickle.dump(pickle_list,f)

### Tomek + RUS Modeling Data

In [16]:
#The training call used to be commented out, which left classifier_results_rus undefined on a
#fresh kernel. Reuse the saved results when the pickle exists; otherwise run the training.
if os.path.exists('classifier_results_rus.pckl'):
    with open('classifier_results_rus.pckl','rb') as f:
        classifier_results_rus = pickle.load(f)[0] #The file stores a one-element list holding the dict
else:
    classifier_results_rus = test_classifiers(Xtrain_rus, ytrain_rus, Xdev_rus, ydev_rus, classifiers)

Now training XGBClassifier...
XGBClassifier trained in 0.939 with 
  F1: 0.994 
  Balanced Accuracy: 0.994 
  Recall: 0.997
Now training LGBMClassifier...
LGBMClassifier trained in 0.425 with 
  F1: 0.994 
  Balanced Accuracy: 0.994 
  Recall: 0.998
Now training RandomForestClassifier...
RandomForestClassifier trained in 0.881 with 
  F1: 0.992 
  Balanced Accuracy: 0.992 
  Recall: 0.997
Now training ExtraTreesClassifier...
ExtraTreesClassifier trained in 0.599 with 
  F1: 0.99 
  Balanced Accuracy: 0.99 
  Recall: 0.993
Now training GradientBoostingClassifier...
GradientBoostingClassifier trained in 1.657 with 
  F1: 0.988 
  Balanced Accuracy: 0.988 
  Recall: 0.995
Now training DecisionTreeClassifier...
DecisionTreeClassifier trained in 0.058 with 
  F1: 0.991 
  Balanced Accuracy: 0.991 
  Recall: 0.993
Now training ExtraTreeClassifier...
ExtraTreeClassifier trained in 0.026 with 
  F1: 0.971 
  Balanced Accuracy: 0.971 
  Recall: 0.97
Now training LogisticRegression...
LogisticRe

In [17]:
#5/18/23
#Save the dictionary so we do not have to re-run above code again
pickle_list = [classifier_results_rus] #Stored as a one-element list
with open('classifier_results_rus.pckl','wb') as f: #The with-block closes the file even if dump fails
    pickle.dump(pickle_list,f)