In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

! kaggle datasets download kartik2112/fraud-detection

! unzip fraud-detection.zip

In [2]:
df = pd.read_csv("fraudTrain.csv")
df.head()

Unnamed: 0,row_num,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [3]:
df.columns

Index(['row_num', 'trans_date_trans_time', 'cc_num', 'merchant', 'category',
       'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip',
       'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time',
       'merch_lat', 'merch_long', 'is_fraud'],
      dtype='object')

# Feature Engineering

## Customer Specific Features

- Age (at time of transaction)
- Gender
- Job (target encoding)
- State (target encoding)

## Transaction Specific Features

- Hour of day
- Day of the week
- Havensine distance between customer and merchant
- Category (target encoding)

In [11]:
# Age (at time of transaction)
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
df['trans_date'] = df['trans_date_trans_time'].dt.date
df['trans_date'] = pd.to_datetime(df["trans_date"])
df['trans_time'] = df['trans_date_trans_time'].dt.time
df['trans_time'] = pd.to_datetime(df["trans_time"])

In [None]:
df[['trans_date_trans_time', 'trans_date', 'trans_time']].head()

In [None]:
df['dob'] = pd.to_datetime(df['dob'])

df['age_at_transaction'] = (df['trans_date'].dt.year - df['dob'].dt.year) - \
    ((df['trans_date'].dt.month < df['dob'].dt.month) | \
     ((df['trans_date'].dt.month == df['dob'].dt.month) & \
      (df['trans_date'].dt.day < df['dob'].dt.day)))

df[['trans_date_trans_time', 'dob', 'age_at_transaction']].head()

In [5]:
# Age (at time of transaction)
df["trans_date_trans_time"] = pd.to_datetime(df["trans_date_trans_time"])
df["trans_date"] = df["trans_date_trans_time"].dt.date
df["trans_date"] = pd.to_datetime(df["trans_date"])

df["dob"] = pd.to_datetime(df["dob"])

df["age_at_transaction"] = (df["trans_date"].dt.year - df["dob"].dt.year) - \
    ((df["trans_date"].dt.month < df["dob"].dt.month) | \
     ((df["trans_date"].dt.month == df["dob"].dt.month) & \
      (df["trans_date"].dt.day < df["dob"].dt.day)))

df[["trans_date", "dob", "age_at_transaction"]].head()

Unnamed: 0,trans_date,dob,age_at_transaction
0,2019-01-01,1988-03-09,30
1,2019-01-01,1978-06-21,40
2,2019-01-01,1962-01-19,56
3,2019-01-01,1967-01-12,51
4,2019-01-01,1986-03-28,32


In [6]:
# Job (target encoding)
job_encoding = df.groupby("job")["is_fraud"].mean()
df["job_encoding"] = df["job"].map(job_encoding)

df[["job", "is_fraud", "job_encoding"]].head()

Unnamed: 0,job,is_fraud,job_encoding
0,"Psychologist, counselling",0,0.001693
1,Special educational needs teacher,0,0.002157
2,Nature conservation officer,0,0.015656
3,Patent attorney,0,0.007905
4,Dance movement psychotherapist,0,0.0


In [7]:
# State (target encoding)
state_encoding = df.groupby("state")["is_fraud"].mean()
df["state_encoding"] = df["state"].map(state_encoding)

df[["state", "is_fraud", "state_encoding"]].head()


Unnamed: 0,state,is_fraud,state_encoding
0,NC,0,0.004923
1,WA,0,0.005073
2,ID,0,0.001984
3,MT,0,0.002722
4,VA,0,0.006769


In [8]:
# Hour of day
df["hour_of_day"] = df["trans_date_trans_time"].dt.hour
df[["trans_date_trans_time", "hour_of_day"]].head()

Unnamed: 0,trans_date_trans_time,hour_of_day
0,2019-01-01 00:00:18,0
1,2019-01-01 00:00:44,0
2,2019-01-01 00:00:51,0
3,2019-01-01 00:01:16,0
4,2019-01-01 00:03:06,0


In [10]:
# Day of the week
df["day_of_week"] = df["trans_date_trans_time"].dt.dayofweek
df[["trans_date_trans_time", "day_of_week"]].head()

Unnamed: 0,trans_date_trans_time,day_of_week
0,2019-01-01 00:00:18,1
1,2019-01-01 00:00:44,1
2,2019-01-01 00:00:51,1
3,2019-01-01 00:01:16,1
4,2019-01-01 00:03:06,1


In [11]:
# Havensine distance between customer and merchant
def haversine(lat1, lon1, lat2, lon2):
    R = 6371  # Earth's radius in kilometers
    
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a))
    return R * c

df["havensine_distance"] = haversine(df["lat"], df["long"], df["merch_lat"], df["merch_long"])
df[["lat", "long", "merch_lat", "merch_long", "havensine_distance"]].head()

Unnamed: 0,lat,long,merch_lat,merch_long,havensine_distance
0,36.0788,-81.1781,36.011293,-82.048315,78.597568
1,48.8878,-118.2105,49.159047,-118.186462,30.212176
2,42.1808,-112.262,43.150704,-112.154481,108.206083
3,46.2306,-112.1138,47.034331,-112.561071,95.673231
4,38.4207,-79.4629,38.674999,-78.632459,77.556744


In [12]:
df["category"].unique()

array(['misc_net', 'grocery_pos', 'entertainment', 'gas_transport',
       'misc_pos', 'grocery_net', 'shopping_net', 'shopping_pos',
       'food_dining', 'personal_care', 'health_fitness', 'travel',
       'kids_pets', 'home'], dtype=object)

In [13]:
# Category (target encoding)
category_encoding = df.groupby("category")["is_fraud"].mean()
df["category_encoding"] = df["category"].map(category_encoding)

df[["category", "is_fraud", "category_encoding"]].head()

Unnamed: 0,category,is_fraud,category_encoding
0,misc_net,0,0.014458
1,grocery_pos,0,0.014098
2,entertainment,0,0.002478
3,gas_transport,0,0.004694
4,misc_pos,0,0.003139


In [28]:
from sklearn.model_selection import train_test_split

def haversine(lat1, lon1, lat2, lon2):
    R = 6371  # Earth's radius in kilometers
    
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a))
    return R * c

def parse_df(train_csv_path, test_csv_path):

    train_df = pd.read_csv(train_csv_path)
    test_df = pd.read_csv(test_csv_path)

    # Concatenate train and test data
    df = pd.concat([train_df, test_df])

    # Age (at time of transaction)
    df["trans_date_trans_time"] = pd.to_datetime(df["trans_date_trans_time"])
    df["trans_date"] = df["trans_date_trans_time"].dt.date
    df["trans_date"] = pd.to_datetime(df["trans_date"])

    # Gender
    df = pd.get_dummies(df, columns=["gender"])

    df["dob"] = pd.to_datetime(df["dob"])

    df["age_at_transaction"] = (df["trans_date"].dt.year - df["dob"].dt.year) - \
        ((df["trans_date"].dt.month < df["dob"].dt.month) | \
        ((df["trans_date"].dt.month == df["dob"].dt.month) & \
        (df["trans_date"].dt.day < df["dob"].dt.day)))
    
    # Job (target encoding)
    job_encoding = df.groupby("job")["is_fraud"].mean()
    df["job_encoding"] = df["job"].map(job_encoding)

    # State (target encoding)
    state_encoding = df.groupby("state")["is_fraud"].mean()
    df["state_encoding"] = df["state"].map(state_encoding)

    # Hour of day
    df["hour_of_day"] = df["trans_date_trans_time"].dt.hour
    
    # Day of the week
    df["day_of_week"] = df["trans_date_trans_time"].dt.dayofweek

    # Havensine distance between customer and merchant
    df["havensine_distance"] = haversine(df["lat"], df["long"], df["merch_lat"], df["merch_long"])

    # Category (target encoding)
    category_encoding = df.groupby("category")["is_fraud"].mean()
    df["category_encoding"] = df["category"].map(category_encoding)

    X, y = df.drop(columns=["row_num", "trans_date_trans_time", "cc_num", "merchant", "first", "last", "street", "city", "state", "zip", "category", "lat", "long", "merch_lat", "merch_long", "job", "dob", "trans_num", "unix_time", "trans_date", "is_fraud"]), df["is_fraud"]
    
    # Split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    return X_train, X_test, y_train, y_test

In [29]:
X_train, X_test, y_train, y_test = parse_df("fraudTrain.csv", "fraudTest.csv")

In [30]:
X_train.head()

Unnamed: 0,amt,city_pop,gender_F,gender_M,age_at_transaction,job_encoding,state_encoding,hour_of_day,day_of_week,havensine_distance,category_encoding
1273644,166.8,450,False,True,73,0.00183,0.005833,7,5,118.568453,0.012645
601398,28.86,34882,True,False,48,0.000686,0.004994,19,3,132.208394,0.015927
999645,37.93,302,True,False,30,0.004719,0.003972,5,4,31.778845,0.004106
1180310,18.7,5791,False,True,60,0.002466,0.005488,0,4,133.995217,0.002819
213847,33.54,2526,True,False,49,0.005696,0.004376,2,5,57.125396,0.013039


In [31]:
X_test.head()

Unnamed: 0,amt,city_pop,gender_F,gender_M,age_at_transaction,job_encoding,state_encoding,hour_of_day,day_of_week,havensine_distance,category_encoding
244469,59.91,18182,False,True,45,0.0,0.004106,7,4,68.679224,0.004106
434906,3.96,76383,False,True,35,0.003731,0.004954,17,5,74.983242,0.006344
354659,51.17,16305,True,False,92,0.007153,0.004542,11,5,57.099604,0.004106
197113,2.06,5161,True,False,27,0.003013,0.004371,22,5,52.78145,0.006344
468148,6.58,1263321,False,True,76,0.016173,0.004376,15,3,37.942009,0.002692


In [32]:
from sklearn.metrics import accuracy_score, classification_report
import pickle

def evaluate_and_save_model(model, X_train, X_test, y_train, y_test, filename):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model.__class__.__name__} Accuracy: {accuracy:.4f}")
    print(f"\nClassification Report:\n{classification_report(y_test, y_pred)}")
    print("-----------")

    with open(f"models/{filename}", "wb") as file:
        pickle.dump(model, file)
    
    print(f"Model saved as {filename}")

In [33]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
import xgboost as xgb

In [34]:
xgb_model = xgb.XGBClassifier(random_state=42)
evaluate_and_save_model(xgb_model, X_train, X_test, y_train, y_test, "xgb_model.pkl")

XGBClassifier Accuracy: 0.9989

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    368526
           1       0.95      0.83      0.89      1953

    accuracy                           1.00    370479
   macro avg       0.98      0.91      0.94    370479
weighted avg       1.00      1.00      1.00    370479

-----------
Model saved as xgb_model.pkl


In [35]:
dt_model = DecisionTreeClassifier(random_state=42)
evaluate_and_save_model(dt_model, X_train, X_test, y_train, y_test, "dt_model.pkl")

DecisionTreeClassifier Accuracy: 0.9980

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    368526
           1       0.80      0.83      0.81      1953

    accuracy                           1.00    370479
   macro avg       0.90      0.91      0.91    370479
weighted avg       1.00      1.00      1.00    370479

-----------
Model saved as dt_model.pkl


In [36]:
rf_model = RandomForestClassifier(random_state=42)
evaluate_and_save_model(rf_model, X_train, X_test, y_train, y_test, "rf_model.pkl")


RandomForestClassifier Accuracy: 0.9987

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    368526
           1       0.97      0.78      0.86      1953

    accuracy                           1.00    370479
   macro avg       0.99      0.89      0.93    370479
weighted avg       1.00      1.00      1.00    370479

-----------
Model saved as rf_model.pkl


In [37]:

knn_model = KNeighborsClassifier()
evaluate_and_save_model(knn_model, X_train, X_test, y_train, y_test, "knn_model.pkl")

KNeighborsClassifier Accuracy: 0.9952

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    368526
           1       0.60      0.29      0.39      1953

    accuracy                           1.00    370479
   macro avg       0.80      0.64      0.69    370479
weighted avg       0.99      1.00      0.99    370479

-----------
Model saved as knn_model.pkl


In [38]:
nb_model = GaussianNB()
evaluate_and_save_model(nb_model, X_train, X_test, y_train, y_test, "nb_model.pkl")

GaussianNB Accuracy: 0.9919

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      1.00    368526
           1       0.32      0.48      0.38      1953

    accuracy                           0.99    370479
   macro avg       0.66      0.74      0.69    370479
weighted avg       0.99      0.99      0.99    370479

-----------
Model saved as nb_model.pkl


In [None]:
svm_model = SVC(random_state=42)
evaluate_and_save_model(svm_model, X_train, X_test, y_train, y_test, "svm_model.pkl")

In [None]:
from sklearn.ensemble import VotingClassifier

In [None]:
soft_voting = VotingClassifier(
    estimators=[
        ('xgb', xgb_model),
        ('dt', dt_model), 
        ('rf', rf_model)
    ],
    voting='soft'
)
evaluate_and_save_model(soft_voting, X_train, X_test, y_train, y_test, "soft_voting_model.pkl")

In [None]:
hard_voting = VotingClassifier(
    estimators=[
        ('xgb', xgb_model),
        ('dt', dt_model),
        ('rf', rf_model)
    ],
    voting='hard'
)
evaluate_and_save_model(hard_voting, X_train, X_test, y_train, y_test, "hard_voting_model.pkl")