In [29]:
# Import Modules
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Display all of the columns
pd.set_option('display.max_columns', None)

In [30]:
## function to genearte Confusion Matrix and Classification Report
def gen_cm_cr(modelName, y_test, y_pred):
    cm = confusion_matrix(y_test, y_pred)
    cm_df = pd.DataFrame(
        cm, index=["Legitimate 0", "Fraudulent 1"], columns=["Predicted Legitimate 0", "Predicted Fraudulent 1"]
    )
    
    # Calculating the accuracy score
    acc_score = accuracy_score(y_test, y_pred)
    print("Confusion Matrix: " + modelName)
    display(cm_df)
    print(f"Accuracy Score : {acc_score}")
    print("Classification Report")
    print(classification_report(y_test, y_pred))

In [31]:
# Read the CSV file into a Pandas DataFrame
fraud_df = pd.read_csv(
    Path('/Users/guranslimbu/Desktop/Project-4 Team 6/Resources/fraud_test_extended.csv')
)

In [32]:
fraud_df.head()

Unnamed: 0,ID,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,state,zip,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud,date_dob,dt_trans_date_time,age_years,log_amount,distance_km,region
0,0,21/06/2020 12:14,2291160000000000.0,fraud_Kirlin and Sons,personal_care,2.86,Jeff,Elliott,M,351 Darlene Green,Columbia,SC,29209,33.9659,-80.9355,333497,Mechanical engineer,19/03/1968,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0,1968-03-19,2020-06-21 12:14:00,56.130169,1.051171,1.551148,Southeast
1,1,21/06/2020 12:14,3573030000000000.0,fraud_Sporer-Keebler,personal_care,29.84,Joanne,Williams,F,3638 Marsh Union,Altonah,UT,84002,40.3207,-110.436,302,"Sales professional, IT",17/01/1990,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0,1990-01-17,2020-06-21 12:14:00,34.283593,3.395883,1.666594,Rocky Mountain
2,2,21/06/2020 12:14,3598220000000000.0,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,Ashley,Lopez,F,9333 Valentine Point,Bellmore,NY,11710,40.6729,-73.5365,34496,"Librarian, public",21/10/1970,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0,1970-10-21,2020-06-21 12:14:00,53.538388,3.720402,1.387472,Mideast
3,3,21/06/2020 12:15,3591920000000000.0,fraud_Haley Group,misc_pos,60.05,Brian,Williams,M,32941 Krystal Mill Apt. 552,Titusville,FL,32780,28.5697,-80.8191,54767,Set designer,25/07/1987,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0,1987-07-25,2020-06-21 12:15:00,36.768525,4.095194,1.599663,Southeast
4,4,21/06/2020 12:15,3526830000000000.0,fraud_Johnston-Casper,travel,3.19,Nathan,Massey,M,5783 Evan Roads Apt. 465,Falmouth,MI,49632,44.2529,-85.017,1126,Furniture designer,06/07/1955,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0,1955-07-06,2020-06-21 12:15:00,68.842497,1.160334,1.40847,Great Lakes


In [33]:
fraud_df.dtypes.head()

ID                         int64
trans_date_trans_time     object
cc_num                   float64
merchant                  object
category                  object
dtype: object

# Splitting Training and Testing data

In [34]:
# Selecting features (y) and target variable (X)
y = fraud_df['is_fraud']
X = fraud_df[['amt', 'age_years', 'lat', 'long', 'merch_lat', 'merch_long', 
              'log_amount', 'distance_km', 'city_pop']]

In [35]:
X[:5]

Unnamed: 0,amt,age_years,lat,long,merch_lat,merch_long,log_amount,distance_km,city_pop
0,2.86,56.130169,33.9659,-80.9355,33.986391,-81.200714,1.051171,1.551148,333497
1,29.84,34.283593,40.3207,-110.436,39.450498,-109.960431,3.395883,1.666594,302
2,41.28,53.538388,40.6729,-73.5365,40.49581,-74.196111,3.720402,1.387472,34496
3,60.05,36.768525,28.5697,-80.8191,28.812398,-80.883061,4.095194,1.599663,54767
4,3.19,68.842497,44.2529,-85.017,44.959148,-85.884734,1.160334,1.40847,1126


In [36]:
y[:5]

0    0
1    0
2    0
3    0
4    0
Name: is_fraud, dtype: int64

In [37]:
# Splitting the dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [38]:
# Display the shape of training and testing sets
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (444575, 9)
Shape of X_test: (111144, 9)
Shape of y_train: (444575,)
Shape of y_test: (111144,)


In [39]:
# Create the StandardScaler instance
scaler = StandardScaler()
# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)
# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [40]:
# Initializing model with k = 5 neighbours # Setting the number of neighbors to adjust value
model = KNeighborsClassifier(n_neighbors=5)  

In [41]:
# Train the model
model.fit(X_train_scaled, y_train)

In [42]:
# Create predictions
y_pred = model.predict(X_test_scaled)

# Review the predictions
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [43]:
# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.99669797739869


In [44]:
# Print confusion matrix
confusion_matrix(y_pred,y_test)

array([[110632,    281],
       [    86,    145]])

In [47]:
# Display classification report
gen_cm_cr('Classification Report', y_test, y_pred)

Confusion Matrix: Classification Report


Unnamed: 0,Predicted Legitimate 0,Predicted Fraudulent 1
Legitimate 0,110632,86
Fraudulent 1,281,145


Accuracy Score : 0.99669797739869
Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    110718
           1       0.63      0.34      0.44       426

    accuracy                           1.00    111144
   macro avg       0.81      0.67      0.72    111144
weighted avg       1.00      1.00      1.00    111144

