This Notebook is for the Models of our Project.

Models include:
- Logistic Regression
- Random Forest
- KNN
- Neural Network

Below you can find the Pre-processing, Training, and Testing for Each model

At the end we will conclude with a comparison between each model and discuss results!

In [164]:
# imports
import pandas as pd
import numpy as np
import seaborn as sns
import math
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.decomposition import PCA



In [165]:
# Data collection
data = pd.read_csv('credit_card_fraud.csv', parse_dates=['trans_date_trans_time',])

X = data.drop(['is_fraud'], axis=1)
Y = data['is_fraud']

In [161]:
# Method to calculate the distance between to longitude and latitude points.
# Since we have both the customer (lat, long) at time of purchase and the merchant's (lat,long) we can compute the distance between the two
# this could lead us seeing if the is any correlation between how far a purchase is and if it is fraud or not
def distance(lat1, lon1, lat2, lon2):
    # radius of the Earth in km
    R = 6371.0

    # convert degrees to radians
    lat1 = math.radians(lat1)
    lon1 = math.radians(lon1)
    lat2 = math.radians(lat2)
    lon2 = math.radians(lon2)

    # compute the differences between the two points
    dlon = lon2 - lon1
    dlat = lat2 - lat1

    # compute the Haversine formula
    a = math.sin(dlat / 2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2)**2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    dist = R * c

    return dist

In [166]:
# Pre-processing --------------------------------------------------------

# changing data types
X['dob'] = pd.to_datetime(X['dob'])

# creating columns out of our original Dataset --------------------------

X['hour_of_transaction'] = X.trans_date_trans_time.dt.hour # hour of transaction
X['month_of_transaction'] = X.trans_date_trans_time.dt.month # month of transaction
X['dow_of_transaction'] = X.trans_date_trans_time.dt.day_name() # day of week of transaction
X['cust_age'] = (X['trans_date_trans_time'] - X['dob']).astype('timedelta64[Y]') # age of person during transaction
X['distance_of_transaction'] = X.apply(lambda row: distance(row['lat'], row['long'], row['merch_lat'], row['merch_long']), axis=1) # distance of transaction

# encoding: 0 = normal time, 1 = odd time
X['Normal_transaction_time'] = 0
X.loc[X.hour_of_transaction < 5,'Normal_transaction_time'] = 1
X.loc[X.hour_of_transaction > 21,'Normal_transaction_time'] = 1

# one-hot encoding the categorical features
encoder = OneHotEncoder()
dow_encoded = encoder.fit_transform(X[['dow_of_transaction']])
dow_encoded_df = pd.DataFrame(dow_encoded.toarray(), columns=encoder.categories_[0])
X = pd.concat([X, dow_encoded_df], axis=1)

state_encoded = encoder.fit_transform(X[['state']])
state_encoded_df = pd.DataFrame(state_encoded.toarray(), columns=encoder.categories_[0])
X = pd.concat([X,state_encoded_df], axis=1)

merch_encoded = encoder.fit_transform(X[['merchant']])
merch_encoded_df = pd.DataFrame(merch_encoded.toarray(), columns=encoder.categories_[0])
X = pd.concat([X, merch_encoded_df], axis=1)

cat_encoded = encoder.fit_transform(X[['category']])
cat_encoded_df = pd.DataFrame(cat_encoded.toarray(), columns=encoder.categories_[0])
X = pd.concat([X, cat_encoded_df], axis=1)

city_encoded = encoder.fit_transform(X[['city']])
city_encoded_df = pd.DataFrame(city_encoded.toarray(), columns=encoder.categories_[0])
X = pd.concat([X, city_encoded_df], axis=1)


# Normalizing the features with varying ranges of numbers ------------------------------------------------------------

# min-max normalization since no real outliers for these features
X['cust_age'] = (X['cust_age'] - X['cust_age'].min()) / (X['cust_age'].max() - X['cust_age'].min())

# z-score normalization for values that are wide-spread such as amt and city population
X['amt'] = (X['amt'] - X['amt'].mean()) / X['amt'].std() 
X['city_pop'] = (X['city_pop'] - X['city_pop'].mean()) / X['city_pop'].std() 

# getting rid of unnecessary columns
X.drop(['trans_num', 'job','trans_date_trans_time', 'state', 'city', 'merchant', 'category', 'dow_of_transaction', 'dob'], axis=1, inplace=True)

In [167]:
X.head()

Unnamed: 0,amt,lat,long,city_pop,merch_lat,merch_long,hour_of_transaction,month_of_transaction,cust_age,distance_of_transaction,...,Wales,Wappapello,Weeping Water,Wendel,Westerville,Westfir,Wheaton,Williamsburg,Woods Cross,Yellowstone National Park
0,0.226701,48.8878,-118.2105,-0.365123,49.159047,-118.186462,0,1,0.302632,30.212176,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.924891,42.1808,-112.262,-0.351455,43.150704,-112.154481,0,1,0.513158,108.206083,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.159035,41.6125,-122.5258,-0.363621,41.65752,-122.230347,0,1,0.736842,25.059079,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.388482,32.9396,-105.8189,-0.362563,32.863258,-106.520205,0,1,0.447368,66.021685,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.394173,43.0172,-111.0292,-0.364024,43.753735,-111.454923,0,1,0.447368,88.830984,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [153]:
pca = PCA(n_components=20)
pca.fit(X)
X_pca = pca.transform(X)
X_pca = pd.DataFrame(X_pca)
X_pca.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,11.762639,13.488977,11.531076,6.148449,0.196133,-0.043365,-0.098173,-0.496094,-0.391983,-0.204738,-0.26778,0.220493,-0.686378,-0.175062,-0.324896,-0.028146,-0.378488,-0.572667,-0.153772,-0.015224
1,2.663536,5.169252,12.366131,6.167659,0.821876,0.30659,-0.096498,-0.475124,-0.161804,-0.580113,-0.558632,0.3294,-0.664159,0.043374,-0.310994,-0.040888,0.049311,-0.045441,-0.059226,-0.054507
2,16.908968,2.655734,12.472429,6.17348,0.07312,0.35879,-0.082031,-0.519137,-0.404864,-0.4312,-0.132352,-0.563,-0.750026,0.057217,-0.338387,-0.033824,-0.283833,-0.463733,-0.426065,0.066462
3,-6.85154,-7.925427,13.650886,6.198347,-0.537756,0.618512,-0.111379,-0.480808,-0.425854,0.343206,-0.490844,0.614755,-0.655615,0.127032,-0.309509,-0.056618,-0.08141,-0.203623,0.230281,-0.300932
4,1.372367,6.28137,12.278311,6.164148,-0.480372,0.107973,-0.103271,-0.486222,-0.20954,-0.189112,-0.710503,0.351519,-0.668539,0.10871,-0.30257,-0.042627,-0.283745,-0.4174,0.506267,0.194498


Since this data set is heavely skewed in Non-Fraudulent transactions favor, we have done some research in how to address this.
We concluded that we can take the approach of doing under-sampling, over-sampling, and combining both.

Under-sampling: The number of samples taken from majority class (Not Fraud) will be equal to total number of samples of minority class (Fraud)
Over-sampling: Selecting random samples from the minority class (Fraud) and adding to the training data copies of the sample


Logistic Regression Model - Under Sampling

In [168]:
under_sample = RandomUnderSampler()
X_under, Y_under = under_sample.fit_resample(X,Y) # data set used for all under sampled models

X_train_u, X_test_u, Y_train_u, Y_test_u = train_test_split(X_under, Y_under, test_size = 0.2, random_state=42)

print('Training Data Shape   : ', X_train_u.shape)
print('Training Labels Shape : ', Y_train_u.shape)
print('Testing Data Shape    : ', X_test_u.shape)
print('Testing Labels Shape  : ', Y_test_u.shape)
print()

from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression()
lr_model.fit(X_train_u,Y_train_u)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

pred_train_lr = lr_model.predict(X_train_u)
pred_test_lr  = lr_model.predict(X_test_u)

print('Logistic Regression Results with Under-Sampling:')
print()
print('Training Accuracy : ', accuracy_score(Y_train_u, pred_train_lr))
print('Testing  Accuracy : ', accuracy_score(Y_test_u, pred_test_lr))

# Checking f1 score, precision and recall
print('Training Set f1 score : ', f1_score(Y_train_u, pred_train_lr))
print('Testing  Set f1 score : ', f1_score(Y_test_u, pred_test_lr))
print()
print('Test set precision : ', precision_score(Y_train_u, pred_train_lr))
print('Test set recall    : ', recall_score(Y_test_u, pred_test_lr))




Training Data Shape   :  (2851, 914)
Training Labels Shape :  (2851,)
Testing Data Shape    :  (713, 914)
Testing Labels Shape  :  (713,)

Logistic Regression Results with Under-Sampling:

Training Accuracy :  0.890915468256752
Testing  Accuracy :  0.9004207573632539
Training Set f1 score :  0.8936752136752136
Testing  Set f1 score :  0.8957415565345079

Test set precision :  0.8843031123139378
Test set recall    :  0.9104477611940298


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Random Forest Model - Under Sampling

Hyperparameters include:
n_estimators: Determines the number of decision tress that are "grown" in random forest
max_depth: the maximum depth for each decision tree
random_state: helps randomize the data to generate diverse decision trees and will help in comparing later since each model has same random_state

Hyperparameters tested:

n_estimators=100, max_depth=10, random_state=42




In [169]:
from sklearn.ensemble import RandomForestClassifier


rf_classifier = RandomForestClassifier(n_estimators=200, max_depth=200, random_state=42)
rf_classifier.fit(X_train_u, Y_train_u)

pred_train_rf = rf_classifier.predict(X_train_u)
pred_test_rf = rf_classifier.predict(X_test_u)

print('Random Forest Classifier Results with Under-Sampling:')
print()

print('Training Set Accuracy : ', accuracy_score(Y_train_u, pred_train_rf))
print('Testing Set Accuracy  : ', accuracy_score(Y_test_u, pred_test_rf))





Random Forest Classifier Results with Under-Sampling:

Training Set Accuracy :  1.0
Testing Set Accuracy  :  0.9565217391304348


In [171]:
from sklearn.neural_network import MLPClassifier

nn_classifier = MLPClassifier(hidden_layer_sizes=(914,500,250,100,50,1), activation='relu', random_state=42)
nn_classifier.fit(X_train_u, Y_train_u)

pred_train_nn = nn_classifier.predict(X_train_u)
pred_test_nn = nn_classifier.predict(X_test_u)

print('Neural Network (MLP) Classifier Results with Under-Sampling:')
print()

print('Training Set Accuracy : ', accuracy_score(Y_train_u, pred_train_nn))
print('Testing Set Accuracy  : ', accuracy_score(Y_test_u, pred_test_nn))






Neural Network (MLP) Classifier Results with Under-Sampling:

Training Set Accuracy :  0.9600140301648544
Testing Set Accuracy  :  0.85273492286115


In [148]:
over_sample = RandomOverSampler()
X_over, Y_over = over_sample.fit_resample(X,Y) # data set used for all over sampled models


X_train_o, X_test_o, Y_train_o, Y_test_o = train_test_split(X_over, Y_over, test_size = 0.2, random_state=42)

print('Training Data Shape   : ', X_train_o.shape)
print('Training Labels Shape : ', Y_train_o.shape)
print('Testing Data Shape    : ', X_test_o.shape)
print('Testing Labels Shape  : ', Y_test_o.shape)
print()

from sklearn.linear_model import LogisticRegression

lr_model_over = LogisticRegression()
lr_model_over.fit(X_train_o,Y_train_o)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

pred_train_lr2 = lr_model_over.predict(X_train_o)
pred_test_lr2  = lr_model_over.predict(X_test_o)

print('Logistic Regression Results with Under-Sampling:')
print()
print('Training Accuracy : ', accuracy_score(Y_train_o, pred_train_lr2))
print('Testing  Accuracy : ', accuracy_score(Y_test_o, pred_test_lr2))

# Checking f1 score, precision and recall
print('Training Set f1 score : ', f1_score(Y_train_o, pred_train_lr2))
print('Testing  Set f1 score : ', f1_score(Y_test_o, pred_test_lr2))
print()
print('Test set precision : ', precision_score(Y_train_o, pred_train_lr2))
print('Test set recall    : ', recall_score(Y_test_o, pred_test_lr2))


Training Data Shape   :  (540520, 913)
Training Labels Shape :  (540520,)
Testing Data Shape    :  (135130, 913)
Testing Labels Shape  :  (135130,)



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Results with Under-Sampling:

Training Accuracy :  0.8968770813290905
Testing  Accuracy :  0.8968992821727225
Training Set f1 score :  0.899475556004415
Testing  Set f1 score :  0.8997467042772438

Test set precision :  0.8770777206446122
Test set recall    :  0.9240160215196795


In [149]:
rf_classifier_o = RandomForestClassifier(n_estimators=200, max_depth=200, random_state=42)
rf_classifier_o.fit(X_train_o, Y_train_o)

pred_train_rf2 = rf_classifier.predict(X_train_o)
pred_test_rf2 = rf_classifier.predict(X_test_o)

print('Random Forest Classifier Results with Under-Sampling:')
print()

print('Training Set Accuracy : ', accuracy_score(Y_train_o, pred_train_rf2))
print('Testing Set Accuracy  : ', accuracy_score(Y_test_o, pred_test_rf2))

Random Forest Classifier Results with Under-Sampling:

Training Set Accuracy :  0.968252793606157
Testing Set Accuracy  :  0.9680011840449937
