In [1]:
#Import packages
import os #Allows us to get operating system information in python.
#In artemis video, he did not import os package

#Data Handling
import pandas as pd, numpy as np

#Time
import time

#Plotting
import matplotlib.pyplot as plt, seaborn as sns, scipy.stats, pylab

#Saving data
import pickle

#train and test split
from sklearn.model_selection import train_test_split

#Scalers
from sklearn import preprocessing

#TomekLinks and RandomUnderSampler
from imblearn.under_sampling import TomekLinks, RandomUnderSampler

#Hyperparameter optimization
import optuna

#Metrics
from sklearn.metrics import f1_score, balanced_accuracy_score, recall_score, roc_auc_score

#General Management
import gc as gc
gc.enable()
from joblib import dump, load
from warnings import filterwarnings

#Notebook configurations
filterwarnings('ignore')

In [2]:
#FINAL MODELS 
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

In [3]:
#IMPORT ORIGINAL DATA
#NOTE: pickle.load can execute arbitrary code, so only load files produced by
#this project's own preprocessing step.
#The context manager closes the file even if unpickling raises.
with open('CCF_ProcessedData.pckl','rb') as f:
    pickle_list = pickle.load(f)

#The pickled object is indexed positionally; the five entries are unpacked below
tomek_modeling_data = pickle_list[0]
y = pickle_list[1]
rus_tomek_modeling_data = pickle_list[2]
y2 = pickle_list[3]
test = pickle_list[4] #The most important variable we are importing in this script

In [4]:
#IMPORT THE DATA THAT WE WILL USE TO TRAIN FINAL MODELS BEFORE TESTING ON TEST DATA
def get_tomek_data():
    """Load and return the object stored in ``tomek_data.pckl``.

    The file is looked up relative to the current working directory.

    Returns
    -------
    object
        Whatever was pickled into the file. The notebook indexes the result
        at positions 0-3 as X_train, X_dev, y_train and y_dev.

    Raises
    ------
    FileNotFoundError
        If ``tomek_data.pckl`` does not exist in the working directory.
    """
    #NOTE: pickle.load can execute arbitrary code; only use on trusted files.
    #The context manager closes the file even if unpickling fails.
    with open('tomek_data.pckl','rb') as f:
        return pickle.load(f)

def get_rus_data():
    """Load and return the object stored in ``rus_data.pckl``.

    The file is looked up relative to the current working directory.

    Returns
    -------
    object
        Whatever was pickled into the file. The notebook indexes the result
        at positions 0-3 as X_train, X_dev, y_train and y_dev.

    Raises
    ------
    FileNotFoundError
        If ``rus_data.pckl`` does not exist in the working directory.
    """
    #NOTE: pickle.load can execute arbitrary code; only use on trusted files.
    #The context manager closes the file even if unpickling fails.
    with open('rus_data.pckl','rb') as f:
        return pickle.load(f)

### FINAL MODEL FOR TOMEK DATA 
Gaussian Naïve Bayes with `var_smoothing = 9.125860889745052 * (10**-9)` (i.e. 9.125860889745052e-09)

In [5]:
#Load the pre-split Tomek data; positions 0-3 are read as X_train, X_dev, y_train, y_dev
tomek_data = get_tomek_data()
tomek_X_train, tomek_X_dev, tomek_y_train, tomek_y_dev = (
    tomek_data[0],
    tomek_data[1],
    tomek_data[2],
    tomek_data[3],
)

#Build (but do not yet fit) the final Gaussian Naive Bayes model
vs_final = 9.125860889745052 * (10**-9)
fmodel_tomek = GaussianNB(var_smoothing=vs_final)

In [6]:
#Fit the final Gaussian NB model on the Tomek training split
fmodel_tomek.fit(tomek_X_train,tomek_y_train)

GaussianNB(var_smoothing=9.125860889745052e-09)

In [7]:
#Predict on the dev split once and reuse the result for both metrics
#(previously predict() was run separately for each score)
tomek_dev_pred = fmodel_tomek.predict(tomek_X_dev)
recall_tomek = round(recall_score(tomek_y_dev, tomek_dev_pred),3)
f1_tomek = round(f1_score(y_true=tomek_y_dev, y_pred=tomek_dev_pred),3)

print("Recall of Final Tomek Model: {}".format(recall_tomek))
print("F1 Score of Final Tomek Model: {}".format(f1_tomek))

Recall of Final Tomek Model: 0.998
F1 Score of Final Tomek Model: 0.005


### FINAL MODEL FOR TOMEK+RUS DATA 
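Random Forest Classifier with the following tuned parameters:
 'rfc_num_exp': 3,
 'rfc_num_base': 9.8410100557842,
 'rfc_maxdepth': 5,
 'rfc_ml_exp': 3,
 'rfc_ml_base': 9.56629896082272,
 'rfc_bootstrap': True,
 'rfc_maxsamples': 0.9767807421240824

The base/exponent pairs appear to combine as int(base * 10**exp), which gives the values used in the code below: n_estimators = 9841 and min_samples_leaf = 9566.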

In [8]:
#Load the pre-split Tomek+RUS data; positions 0-3 are read as X_train, X_dev, y_train, y_dev
rus_data = get_rus_data()
rus_X_train, rus_X_dev, rus_y_train, rus_y_dev = (
    rus_data[0],
    rus_data[1],
    rus_data[2],
    rus_data[3],
)

#FINAL VALUES
rfc_num = 9841
rfc_maxdepth = 5
#NOTE(review): a split is only considered if it leaves at least min_samples_leaf
#rows on each side. If the undersampled training split has fewer than ~2 * 9566
#rows, no tree can split and the forest predicts a single class. The recorded
#scores (dev recall 1.0 / F1 0.667, test recall 1.0 / F1 0.003) look like that
#pattern - TODO confirm against rus_X_train.shape.
rfc_minleaf = 9566
rfc_bootstrap = True
rfc_maxsamples = 0.9767807421240824

#Build (but do not yet fit) the final random forest; the seed is fixed for reproducibility
fmodel_rus = RandomForestClassifier(
    n_estimators=rfc_num,
    max_depth=rfc_maxdepth,
    min_samples_leaf=rfc_minleaf,
    bootstrap=rfc_bootstrap,
    max_samples=rfc_maxsamples,
    random_state=10,
)

In [9]:
#Fit the final random forest on the Tomek+RUS training split
fmodel_rus.fit(rus_X_train,rus_y_train)

RandomForestClassifier(max_depth=5, max_samples=0.9767807421240824,
                       min_samples_leaf=9566, n_estimators=9841,
                       random_state=10)

In [10]:
#Predict on the dev split once and reuse the result for both metrics
#(previously predict() was run separately for each score, doubling the cost
#of evaluating the 9841-tree forest)
rus_dev_pred = fmodel_rus.predict(rus_X_dev)
recall_rus = round(recall_score(rus_y_dev, rus_dev_pred),3)
f1_rus = round(f1_score(y_true=rus_y_dev, y_pred=rus_dev_pred),3)

print("Recall of Final Tomek+RUS Model: {}".format(recall_rus))
print("F1 Score of Final Tomek+RUS Model: {}".format(f1_rus))

Recall of Final Tomek+RUS Model: 1.0
F1 Score of Final Tomek+RUS Model: 0.667


# TEST DATA 

In [11]:
#Quick look at the raw test set before it is transformed
#test.head(10)
#NOTE: only the last expression of a cell is rendered, so the dtypes result
#below is evaluated but not shown; only the shape appears in the output
test.dtypes
test.shape

(63627, 10)

In [12]:
#Grab the column order of the Tomek training data; the test set is reordered to match it later
final_col_order = tomek_X_train.columns

### STEP 1 EXTRACT THE LABELS (CAST TO INT) + IDENTIFY NUMERICAL AND CATEGORICAL COLUMNS

In [13]:
#Pull the label out as a one-column integer frame, then sort the remaining columns by dtype
test_y = test['isFraud'].to_frame(name='isFraud').astype('int')

#Numerical = every non-object column except the label and the step counter
_not_numeric_features = ('isFraud', 'step')
test_numerical_columns = []
for _name in test.columns:
    if test[_name].dtype != 'object' and _name not in _not_numeric_features:
        test_numerical_columns.append(_name)

#Categorical = everything else apart from the label
#(note that 'step' lands in this list because it was excluded from the numerical one)
_numeric_lookup = set(test_numerical_columns)
test_categorical_columns = [
    _name for _name in test.columns
    if _name != 'isFraud' and _name not in _numeric_lookup
]

In [14]:
#Drop the isFraud column now that the labels are held in test_y
#NOTE: this is an in-place drop, so re-running the cell without reloading `test` raises a KeyError
test.drop('isFraud',axis=1,inplace=True)

### STEP 2 TRANSFORM THE DATA

In [15]:
#Remove the step column in place (it is not part of the final feature set), then preview the frame
test.drop(columns=['step'], inplace=True)
test.head(n=10)

Unnamed: 0,type,amount,nameOrig,oldbalanceOrig,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest
0,PAYMENT,1196.56,C667413222,11773.02,10576.47,M1671085229,0.0,0.0
1,PAYMENT,17978.48,C576954285,52372.86,34394.39,M25637026,0.0,0.0
2,CASH_OUT,45902.16,C1297817053,341.0,0.0,C289286626,1995692.55,2041594.71
3,CASH_OUT,124490.28,C864011357,103679.0,0.0,C450289878,205868.9,330359.18
4,CASH_OUT,197171.59,C1454967690,0.0,0.0,C1048883071,397656.96,594828.55
5,PAYMENT,782.27,C1075029826,0.0,0.0,M2021412368,0.0,0.0
6,CASH_IN,168252.45,C900644650,292159.65,460412.1,C1344105048,490679.31,322426.86
7,CASH_OUT,1382.36,C1663808065,0.0,0.0,C42847118,99275.94,100658.29
8,CASH_OUT,3658.95,C624361723,0.0,0.0,C1318673216,1011338.73,1014997.69
9,PAYMENT,5045.14,C915578899,16016.0,10970.86,M591114958,0.0,0.0


In [16]:
#Also lets import the OneHotEncoder from the previous script
#NOTE: pickle.load can execute arbitrary code, so only load files produced by
#this project's own preprocessing step.
#The context manager closes the file even if unpickling raises.
with open('OHE.pckl','rb') as f:
    pickle_list = pickle.load(f)

#The fitted encoder is the first entry of the pickled list
ohe = pickle_list[0]

#### NUMERICAL COLUMNS

In [17]:
#Add a fourth-root version of each monetary column (new column name -> source column)
_fourth_root_sources = {
    'amount_4root': 'amount',
    'oldbalanceOrig_4root': 'oldbalanceOrig',
    'newbalanceOrig_4root': 'newbalanceOrig',
    'oldbalanceDest_4root': 'oldbalanceDest',
    'newbalanceDest_4root': 'newbalanceDest',
}
for _new_name, _source_name in _fourth_root_sources.items():
    test[_new_name] = test[_source_name]**(1/4)

#Lets remove the old columns
test.drop(columns=test_numerical_columns, inplace=True)

#Reorganize the order of the columns: the five *_4root columns were appended
#last, so move them to the front and keep everything else in its current order
_current_cols = test.columns.tolist()
new_cols = _current_cols[-5:] + _current_cols[:-5]
test = test[new_cols]

#Double check the columns are in the right order
test.head(10)

Unnamed: 0,amount_4root,oldbalanceOrig_4root,newbalanceOrig_4root,oldbalanceDest_4root,newbalanceDest_4root,type,nameOrig,nameDest
0,5.881439,10.416504,10.141103,0.0,0.0,PAYMENT,C667413222,M1671085229
1,11.579458,15.127828,13.618269,0.0,0.0,PAYMENT,C576954285,M25637026
2,14.637209,4.29723,0.0,37.585766,37.800051,CASH_OUT,C1297817053,C289286626
3,18.783818,17.944142,0.0,21.300887,23.974336,CASH_OUT,C864011357,C450289878
4,21.072258,0.0,0.0,25.11176,27.771411,CASH_OUT,C1454967690,C1048883071
5,5.288581,0.0,0.0,0.0,0.0,PAYMENT,C1075029826,M2021412368
6,20.253046,23.249042,26.048738,26.466678,23.829109,CASH_IN,C900644650,C1344105048
7,6.097549,0.0,0.0,17.750517,17.811988,CASH_OUT,C1663808065,C42847118
8,7.777484,0.0,0.0,31.712038,31.740683,CASH_OUT,C624361723,C1318673216
9,8.427879,11.249637,10.234348,0.0,0.0,PAYMENT,C915578899,M591114958


#### CATEGORICAL COLUMNS

In [18]:
#Encode the type variable - Need to do transform instead of fit_transform,
#because the encoder was already fitted in the previous script.
#The encoded frame reuses test's index: pd.concat(axis=1) aligns rows on index
#labels, so a default RangeIndex here would misalign rows (and introduce NaNs)
#whenever test is not indexed 0..n-1.
type_encoded = pd.DataFrame(
    data=ohe.transform(test['type'].array.reshape(-1,1)),
    columns=ohe.get_feature_names_out(),
    index=test.index,
)

#Grab the column names from the encoded data
e_col = type_encoded.columns.tolist()

#Convert every encoded column to a boolean in one step
type_encoded = type_encoded.astype('bool')

#Put the encoded columns in front of the existing ones and drop the raw type column
test = pd.concat([type_encoded,test],axis=1).drop(columns='type')

In [19]:
#Create the function that transforms the name column into the two columns that we need

def name_transform(df, colname, new_colname1, new_colname2):
    """Replace a name column with two boolean feature columns.

    Each name is treated as a one-character prefix followed by an ID
    (for example ``'C667413222'``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``colname``. It is modified in place.
    colname : str
        Column of string names to transform; it is dropped from ``df``.
    new_colname1 : str
        Column to create: True where the name starts with ``'C'``, False
        otherwise (including empty or missing names).
    new_colname2 : str
        Column to create: True where the ID part (everything after the first
        character) occurs exactly once in ``colname``, False where it repeats.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the two new columns appended and
        ``colname`` removed.

    Notes
    -----
    Uniqueness is judged only within the frame passed in. No temporary column
    is written to ``df``, so an existing column of any name is left untouched.
    """
    names = df[colname]

    #1 if 'C' & '0' if M; na=False makes empty or missing names count as False
    starts_with_c = names.str.startswith('C', na=False)

    #The ID is everything after the first character. keep=False marks every
    #occurrence of a repeated ID, so negating it leaves True only for IDs seen once.
    id_is_unique = ~names.str[1:].duplicated(keep=False)

    #Assign plain arrays so values are placed by position, whatever df's index looks like
    df[new_colname1] = starts_with_c.to_numpy(dtype=bool)
    df[new_colname2] = id_is_unique.to_numpy(dtype=bool)

    #Drop the old column
    df.drop(colname,axis=1,inplace=True)

    return df

In [20]:
#Turn each name column into its (client flag, unique-ID flag) pair: nameOrig first, then nameDest
_name_specs = (
    ('nameOrig', 'ClientOrig', 'NO_ID_unique'),
    ('nameDest', 'ClientDest', 'ND_ID_unique'),
)
for _name_col, _client_col, _unique_col in _name_specs:
    test = name_transform(test, _name_col, _client_col, _unique_col)
test.head()

Unnamed: 0,x0_CASH_OUT,x0_DEBIT,x0_PAYMENT,x0_TRANSFER,amount_4root,oldbalanceOrig_4root,newbalanceOrig_4root,oldbalanceDest_4root,newbalanceDest_4root,ClientOrig,NO_ID_unique,ClientDest,ND_ID_unique
0,False,False,True,False,5.881439,10.416504,10.141103,0.0,0.0,True,True,False,True
1,False,False,True,False,11.579458,15.127828,13.618269,0.0,0.0,True,True,False,True
2,True,False,False,False,14.637209,4.29723,0.0,37.585766,37.800051,True,True,True,False
3,True,False,False,False,18.783818,17.944142,0.0,21.300887,23.974336,True,True,True,True
4,True,False,False,False,21.072258,0.0,0.0,25.11176,27.771411,True,True,True,True


In [21]:
#Reorder the columns to match final_col_order (captured earlier from tomek_X_train.columns).
#Selecting with that Index raises a KeyError if any training column is missing from test.
test = test[final_col_order]
test.head()

Unnamed: 0,x0_CASH_OUT,x0_DEBIT,x0_PAYMENT,x0_TRANSFER,NO_ID_unique,ND_ID_unique,ClientOrig,ClientDest,amount_4root,oldbalanceOrig_4root,newbalanceOrig_4root,oldbalanceDest_4root,newbalanceDest_4root
0,False,False,True,False,True,True,True,False,5.881439,10.416504,10.141103,0.0,0.0
1,False,False,True,False,True,True,True,False,11.579458,15.127828,13.618269,0.0,0.0
2,True,False,False,False,True,False,True,True,14.637209,4.29723,0.0,37.585766,37.800051
3,True,False,False,False,True,True,True,True,18.783818,17.944142,0.0,21.300887,23.974336
4,True,False,False,False,True,True,True,True,21.072258,0.0,0.0,25.11176,27.771411


### STEP 3 TESTING THE MODELS

#### TOMEK MODEL

In [22]:
#Score the Tomek-trained model on the transformed test set
test_pred = fmodel_tomek.predict(test)

recall_test1 = round(recall_score(y_true=test_y, y_pred=test_pred), 3)
f1_test1 = round(f1_score(test_y, test_pred), 3)

#Report both scores, recall first
for _template, _score in (
    ("Recall of Final Tomek Model: {}", recall_test1),
    ("F1 Score of Final Tomek Model: {}", f1_test1),
):
    print(_template.format(_score))

Recall of Final Tomek Model: 1.0
F1 Score of Final Tomek Model: 0.005


#### TOMEK + RUS MODEL

In [23]:
#Score the Tomek+RUS-trained model on the transformed test set
test_pred2 = fmodel_rus.predict(test)

recall_test2 = round(recall_score(y_true=test_y, y_pred=test_pred2), 3)
f1_test2 = round(f1_score(test_y, test_pred2), 3)

#Report both scores, recall first
for _template, _score in (
    ("Recall of Final Tomek+RUS Model: {}", recall_test2),
    ("F1 Score of Final Tomek+RUS Model: {}", f1_test2),
):
    print(_template.format(_score))

Recall of Final Tomek+RUS Model: 1.0
F1 Score of Final Tomek+RUS Model: 0.003
