## Bank Account Fraud Supplementary Code
Using this as supplementary testing for Bank Account Fraud main notebook. Will run logistic regression and SMOTE.

In [1]:
import pandas as pd
import numpy as np
import itertools
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline



Load the Bank Account Fraud file into a data frame. Only the columns with numeric values are kept later on.

In [2]:
# Read the Bank Account Fraud base file into a DataFrame
baf_base = pd.read_csv(filepath_or_buffer='files/Base.csv')

# Quick look at the first five rows
baf_base.head(n=5)

Unnamed: 0,fraud_bool,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,payment_type,zip_count_4w,...,has_other_cards,proposed_credit_limit,foreign_request,source,session_length_in_minutes,device_os,keep_alive_session,device_distinct_emails_8w,device_fraud_count,month
0,0,0.3,0.986506,-1,25,40,0.006735,102.453711,AA,1059,...,0,1500.0,0,INTERNET,16.224843,linux,1,1,0,0
1,0,0.8,0.617426,-1,89,20,0.010095,-0.849551,AD,1658,...,0,1500.0,0,INTERNET,3.363854,other,1,1,0,0
2,0,0.8,0.996707,9,14,40,0.012316,-1.490386,AB,1095,...,0,200.0,0,INTERNET,22.730559,windows,0,1,0,0
3,0,0.6,0.4751,11,14,30,0.006991,-1.863101,AB,3483,...,0,200.0,0,INTERNET,15.215816,linux,1,1,0,0
4,0,0.9,0.842307,-1,29,40,5.742626,47.152498,AA,2339,...,0,200.0,0,INTERNET,3.743048,other,0,1,0,0


## Exploratory Data Analysis (EDA)

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the whole dataset
baf_base.describe()

Unnamed: 0,fraud_bool,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,zip_count_4w,velocity_6h,...,phone_mobile_valid,bank_months_count,has_other_cards,proposed_credit_limit,foreign_request,session_length_in_minutes,keep_alive_session,device_distinct_emails_8w,device_fraud_count,month
count,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,...,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0
mean,0.011029,0.562696,0.493694,16.718568,86.587867,33.68908,1.025705,8.661499,1572.692049,5665.296605,...,0.889676,10.839303,0.222988,515.85101,0.025242,7.54494,0.576947,1.018312,0.0,3.288674
std,0.104438,0.290343,0.289125,44.04623,88.406599,12.025799,5.381835,20.236155,1005.374565,3009.380665,...,0.313293,12.116875,0.416251,487.559902,0.156859,8.033106,0.494044,0.180761,0.0,2.209994
min,0.0,0.1,1e-06,-1.0,-1.0,10.0,4.03686e-09,-15.530555,1.0,-170.603072,...,0.0,-1.0,0.0,190.0,0.0,-1.0,0.0,-1.0,0.0,0.0
25%,0.0,0.3,0.225216,-1.0,19.0,20.0,0.007193246,-1.181488,894.0,3436.365848,...,1.0,-1.0,0.0,200.0,0.0,3.103053,0.0,1.0,0.0,1.0
50%,0.0,0.6,0.492153,-1.0,52.0,30.0,0.01517574,-0.830507,1263.0,5319.769349,...,1.0,5.0,0.0,200.0,0.0,5.114321,1.0,1.0,0.0,3.0
75%,0.0,0.8,0.755567,12.0,130.0,40.0,0.02633069,4.984176,1944.0,7680.717827,...,1.0,25.0,0.0,500.0,0.0,8.866131,1.0,1.0,0.0,5.0
max,1.0,0.9,0.999999,383.0,428.0,90.0,78.4569,112.956928,6700.0,16715.565404,...,1.0,32.0,1.0,2100.0,1.0,85.899143,1.0,2.0,0.0,7.0


In [5]:
# Number of rows per value of the fraud label
baf_base['fraud_bool'].value_counts()

fraud_bool
0    988971
1     11029
Name: count, dtype: int64

In [9]:
# Rows per fraud label, expressed as a share of all rows
baf_base['fraud_bool'].value_counts().div(len(baf_base))

fraud_bool
0    0.988971
1    0.011029
Name: count, dtype: float64

### Make separate dataframes for fraudulent and valid accounts, and show summary statistics of each variable for each dataframe

In [28]:
# Partition the rows by fraud label: 1 -> fraudulent, 0 -> valid
baf_fraud = baf_base[baf_base['fraud_bool'].eq(1)]
baf_valid = baf_base[baf_base['fraud_bool'].eq(0)]

In [29]:
# Summary statistics restricted to the rows labelled as fraud
baf_fraud.describe()

Unnamed: 0,fraud_bool,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,zip_count_4w,velocity_6h,...,phone_mobile_valid,bank_months_count,has_other_cards,proposed_credit_limit,foreign_request,session_length_in_minutes,keep_alive_session,device_distinct_emails_8w,device_fraud_count,month
count,11029.0,11029.0,11029.0,11029.0,11029.0,11029.0,11029.0,11029.0,11029.0,11029.0,...,11029.0,11029.0,11029.0,11029.0,11029.0,11029.0,11029.0,11029.0,11029.0,11029.0
mean,1.0,0.686635,0.393161,5.861365,114.801161,40.858645,1.054615,3.962009,1622.311542,5183.913444,...,0.850576,10.46958,0.084414,833.986762,0.050322,8.22952,0.341645,1.079427,0.0,3.565962
std,0.0,0.265579,0.295607,32.816956,85.324351,13.086334,5.707977,16.622067,1005.687071,2902.298679,...,0.356522,12.875992,0.27802,643.287556,0.218618,9.681103,0.474283,0.320447,0.0,2.312055
min,1.0,0.1,0.000132,-1.0,-1.0,10.0,1e-06,-8.249792,18.0,64.422571,...,0.0,-1.0,0.0,190.0,0.0,-1.0,0.0,-1.0,0.0,0.0
25%,1.0,0.6,0.134257,-1.0,53.0,30.0,0.005911,-1.195618,909.0,2894.349366,...,1.0,-1.0,0.0,200.0,0.0,3.249171,0.0,1.0,0.0,2.0
50%,1.0,0.8,0.29242,-1.0,94.0,40.0,0.013094,-0.900578,1328.0,4916.464728,...,1.0,2.0,0.0,500.0,0.0,5.12182,0.0,1.0,0.0,4.0
75%,1.0,0.9,0.676543,-1.0,156.0,50.0,0.023265,-0.562268,2050.0,7128.603281,...,1.0,25.0,0.0,1500.0,0.0,8.62816,1.0,1.0,0.0,6.0
max,1.0,0.9,0.999953,357.0,392.0,90.0,75.495921,111.321272,6368.0,16084.61717,...,1.0,31.0,1.0,2100.0,1.0,77.730242,1.0,2.0,0.0,7.0


In [30]:
# Summary statistics restricted to the rows labelled as valid (non-fraud)
baf_valid.describe()

Unnamed: 0,fraud_bool,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,zip_count_4w,velocity_6h,...,phone_mobile_valid,bank_months_count,has_other_cards,proposed_credit_limit,foreign_request,session_length_in_minutes,keep_alive_session,device_distinct_emails_8w,device_fraud_count,month
count,988971.0,988971.0,988971.0,988971.0,988971.0,988971.0,988971.0,988971.0,988971.0,988971.0,...,988971.0,988971.0,988971.0,988971.0,988971.0,988971.0,988971.0,988971.0,988971.0,988971.0
mean,0.0,0.561313,0.494815,16.839647,86.273232,33.609125,1.025383,8.713907,1572.138693,5670.664988,...,0.890112,10.843426,0.224533,512.303162,0.024962,7.537306,0.579571,1.01763,0.0,3.285582
std,0.0,0.290309,0.288855,44.140319,88.389648,11.989302,5.378088,20.26669,1005.35778,3010.120768,...,0.31275,12.108084,0.417275,484.365435,0.15601,8.012493,0.493628,0.178471,0.0,2.208634
min,0.0,0.1,1e-06,-1.0,-1.0,10.0,4.03686e-09,-15.530555,1.0,-170.603072,...,0.0,-1.0,0.0,190.0,0.0,-1.0,0.0,-1.0,0.0,0.0
25%,0.0,0.3,0.226963,-1.0,19.0,20.0,0.007208821,-1.181296,894.0,3441.756464,...,1.0,-1.0,0.0,200.0,0.0,3.101249,0.0,1.0,0.0,1.0
50%,0.0,0.6,0.493571,-1.0,52.0,30.0,0.01520335,-0.829552,1262.0,5324.540439,...,1.0,5.0,0.0,200.0,0.0,5.114289,1.0,1.0,0.0,3.0
75%,0.0,0.8,0.756211,13.0,129.0,40.0,0.02636146,5.789258,1943.0,7686.405675,...,1.0,25.0,0.0,500.0,0.0,8.868895,1.0,1.0,0.0,5.0
max,0.0,0.9,0.999999,383.0,428.0,90.0,78.4569,112.956928,6700.0,16715.565404,...,1.0,32.0,1.0,2100.0,1.0,85.899143,1.0,2.0,0.0,7.0


session_length_in_minutes
-1.000000     18
 3.505442      2
 4.036394      1
 13.034989     1
 1.953310      1
              ..
 5.417830      1
 2.594126      1
 2.904633      1
 34.437990     1
 9.943046      1
Name: count, Length: 11011, dtype: int64


## Convert to numeric only data frame
Preview the columns and variables we will work with

In [None]:
# Keep only the numeric columns; text-valued columns such as payment_type,
# source and device_os are dropped by this selection.
baf_num = baf_base.select_dtypes(include=np.number)

# Preview the first rows instead of rendering the whole frame
baf_num.head()

## Predictor and target variables

Using a train/test split, with the target being the variable that tells whether the account is fraudulent.

In [None]:
# Target is the fraud label; predictors are every other numeric column
y = baf_num['fraud_bool']
X = baf_num.drop(columns=['fraud_bool'])

# Only about 1.1% of rows are fraud, so stratify on the label to keep the same
# fraud rate in the training and test splits. random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=0
)

Show counts of train and test sets.

In [None]:
# Label counts for the training split, a blank gap, then the test split
print(y_train.value_counts(), '\n', y_test.value_counts(), sep='\n')

## Make a basic logistic regression model

Will plot the ROC curve and print AUC after


In [None]:
# Baseline model: logistic regression with no intercept, liblinear solver
logr = LogisticRegression(fit_intercept=False, solver='liblinear')
logr.fit(X_train, y_train)

# Decision-function scores for the test set (confidence scores, not probabilities)
y_score = logr.decision_function(X_test)

# False positive rate and true positive rate at each score threshold
fpr, tpr, thresholds = roc_curve(y_test, y_score)

# Seaborn styling applied before the figure is created
sns.set_style('darkgrid', {'axes.facecolor': '0.9'})

# Area under the ROC curve
print('AUC: {}'.format(auc(fpr, tpr)))

# ROC curve plotted against the chance diagonal
lw = 2
ticks = [i/20.0 for i in range(21)]
plt.figure(figsize=(10, 8))
plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve')
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.yticks(ticks)
plt.xticks(ticks)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='lower right')
plt.show()

Make pred the prediction and create a confusion matrix after.

In [None]:
# Hard class labels predicted by the fitted model for the test set
pred = logr.predict(X_test)

# Evaluate each metric on the same predictions, in reporting order
accuracy, precision, recall, f1 = [
    metric(y_test, pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
]

print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')
print('Confusion Matrix:')

In [None]:
# Confusion matrix for the test-set predictions, using the model's class order
class_labels = logr.classes_
cm = confusion_matrix(y_test, pred, labels=class_labels)

# Render the matrix; the seaborn grid is switched off so it does not cut through the cells
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot()
plt.grid(False)
plt.show()

## Tuning model

Test different values of the regularization parameter `C`. Each fit may take a few minutes, so to save time when re-running, switch to the commented-out single-value `C_param_range` instead of the full list.

In [None]:
# Compare ROC/AUC for several values of C (inverse regularization strength).
# To shorten a re-run, swap in the single-value list below.
#C_param_range = [0.001]
C_param_range = [0.001, 0.01, 0.1, 1, 10, 100]
colors = sns.color_palette('Set2', n_colors=len(C_param_range))
lw = 2  # line width shared by every ROC curve and the chance diagonal

plt.figure(figsize=(10, 8))

for n, c in enumerate(C_param_range):
    # Fit once per C; fitting is the slow step, so the model is reused for scoring
    logr = LogisticRegression(fit_intercept=False, C=c, solver='liblinear')
    logr.fit(X_train, y_train)
    y_score = logr.decision_function(X_test)

    fpr, tpr, thresholds = roc_curve(y_test, y_score)
    print('AUC for {}: {}'.format(c, auc(fpr, tpr)))
    plt.plot(fpr, tpr, color=colors[n],
             lw=lw, label='ROC curve Regularization Weight: {}'.format(c))

# Chance diagonal for reference
plt.plot([0, 1], [0, 1], color='blue', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])

plt.yticks([i/20.0 for i in range(21)])
plt.xticks([i/20.0 for i in range(21)])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

### SMOTE

SMOTE to compare different performance and check for improvements.

In [None]:
# Disabled install command kept for reference; it pins threadpoolctl to 3.1.0.
# NOTE(review): presumably a workaround needed by the SMOTE cells below - confirm before enabling.
#pip install threadpoolctl==3.1.0

In [None]:
# Class distribution of the original training labels
print(y_train.value_counts())

# Oversample the minority class on the training data only. A fixed
# random_state makes the synthetic samples reproducible across runs.
X_train_resampled, y_train_resampled = SMOTE(random_state=0).fit_resample(X_train, y_train)

# Class distribution after resampling
print('\n')
print(pd.Series(y_train_resampled).value_counts())

# Correlation heatmap of the resampled predictors, drawn on the axes created here
fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(X_train_resampled.corr(), ax=ax)
ax.set_title('Feature correlations after SMOTE')
plt.show()

Same as before: to save time when re-running, switch to the commented-out single-value `C_param_range` instead of the full list.

In [None]:
# Compare ROC/AUC for several values of C (inverse regularization strength),
# training on the SMOTE-resampled training set and scoring on the original test set.
# To shorten a re-run, swap in the single-value list below.
#C_param_range = [0.005]
C_param_range = [0.005, 0.1, 0.2, 0.5, 0.8, 1, 1.25, 1.5, 2]
colors = sns.color_palette('Set2', n_colors=len(C_param_range))
lw = 2  # line width shared by every ROC curve and the chance diagonal

plt.figure(figsize=(10, 8))

for n, c in enumerate(C_param_range):
    # Fit once per C; fitting is the slow step, so the model is reused for scoring
    logr = LogisticRegression(fit_intercept=False, C=c, solver='liblinear')
    logr.fit(X_train_resampled, y_train_resampled)
    y_score = logr.decision_function(X_test)

    fpr, tpr, thresholds = roc_curve(y_test, y_score)
    print('AUC for {}: {}'.format(c, auc(fpr, tpr)))
    plt.plot(fpr, tpr, color=colors[n],
             lw=lw, label='ROC curve Regularization Weight: {}'.format(c))

# Chance diagonal for reference
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.yticks([i/20.0 for i in range(21)])
plt.xticks([i/20.0 for i in range(21)])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='lower right')
plt.show()

The values are very similar, so some adjustments are needed. We will also have to consider the few non-numerical variables in the original data set and their possible effect.

In [None]:
# Class distribution of the full label vector before any resampling
print(y.value_counts())

# Split first, then oversample ONLY the training portion. Running SMOTE on the
# whole dataset before splitting lets synthetic points built from rows that end
# up in the test set leak into training, and turns the test set into an
# artificially balanced one, so the scores no longer reflect the real fraud rate.
# X_train / y_train stay bound to the raw training split so that earlier cells
# behave the same when re-run; the resampled data lives in X_resampled / y_resampled.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)
X_resampled, y_resampled = SMOTE(random_state=0).fit_resample(X_train, y_train)

# Class distribution of the resampled training labels
print('---------------------------------')
print(pd.Series(y_resampled).value_counts())

# Compare ROC/AUC for several values of C (inverse regularization strength).
# To shorten a re-run, swap in the single-value list below.
#C_param_range = [0.005]
C_param_range = [0.005, 0.1, 0.2, 0.3, 0.5, 0.6, 0.7, 0.8]
colors = sns.color_palette('Set2', n_colors=len(C_param_range))
lw = 2  # line width shared by every ROC curve and the chance diagonal

plt.figure(figsize=(10, 8))

for n, c in enumerate(C_param_range):
    # Fit once per C on the resampled training data
    logreg = LogisticRegression(fit_intercept=False, C=c, solver='liblinear')
    logreg.fit(X_resampled, y_resampled)

    # Hard labels and decision scores on the untouched test split
    y_hat_test = logreg.predict(X_test)
    y_score = logreg.decision_function(X_test)

    fpr, tpr, thresholds = roc_curve(y_test, y_score)
    print('----------------------------------------------')
    print('AUC for {}: {}'.format(c, auc(fpr, tpr)))
    plt.plot(fpr, tpr, color=colors[n],
             lw=lw, label='ROC curve Regularization Weight: {}'.format(c))

# Chance diagonal for reference
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.yticks([i/20.0 for i in range(21)])
plt.xticks([i/20.0 for i in range(21)])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

# Threshold metrics for the LAST model fitted in the loop (C = C_param_range[-1]),
# since y_hat_test is overwritten on every iteration
accuracy = accuracy_score(y_test, y_hat_test)
precision = precision_score(y_test, y_hat_test)
recall = recall_score(y_test, y_hat_test)
f1 = f1_score(y_test, y_hat_test)

print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')
print('Confusion Matrix:')