In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
from matplotlib import pyplot as plt
from sklearn.model_selection import cross_val_score
import os
from sklearn import metrics
from collections import Counter
import kagglehub
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
import datetime as dt
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
import xgboost as xgb
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
# Download the latest version of the Kaggle "kartik2112/fraud-detection" dataset.
# `path` is the value returned by kagglehub; later cells join file names onto it.
path = kagglehub.dataset_download("kartik2112/fraud-detection")
print("Path to dataset files:", path)
# List the directory contents so the available CSV file names are visible.
print(os.listdir(path))

Path to dataset files: /kaggle/input/fraud-detection
['fraudTest.csv', 'fraudTrain.csv']


In [None]:
# Locate the two CSV files inside the downloaded dataset directory.
train_path, test_path = (
    os.path.join(path, file_name)
    for file_name in ('fraudTrain.csv', 'fraudTest.csv')
)

# Read each split into its own DataFrame.
train_data, test_data = pd.read_csv(train_path), pd.read_csv(test_path)

# Report the size of the training split and preview its first rows.
print(train_data.shape)
train_data.head()


(1296675, 23)


Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [None]:
# Stack the train and test splits row-wise into a single frame for the
# exploratory analysis; ignore_index=True renumbers the rows from 0.
combined_df = pd.concat((train_data, test_data), axis=0, ignore_index=True)

print(combined_df.shape)
combined_df.head()

(1852394, 23)


Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


**The following bar chart presents the overall distribution of fraudulent versus genuine transactions in the dataset. It visualizes the extreme imbalance between the two classes, which is typical in fraud detection scenarios:**

In [None]:
# Class balance: number of genuine vs fraudulent transactions.
labels = ["Genuine", "Fraud"]
# sort_index() orders the counts by label value so they line up with `labels`
# (assumes is_fraud only takes the values 0 and 1).
counts = combined_df["is_fraud"].value_counts().sort_index()
percentages = (counts / counts.sum() * 100).round(2)

# Text shown on each bar: the raw count followed by its share of all rows.
bar_text = [f"{v:,} ({p}%)" for v, p in zip(counts, percentages)]

fig = go.Figure()
fig.add_trace(
    go.Bar(
        x=labels,
        y=counts,
        text=bar_text,
        textposition='auto',
        marker_color=["pink", "black"],
    )
)

# Both axes share the same title/tick font sizes.
axis_fonts = dict(title_font=dict(size=18), tickfont=dict(size=14))
fig.update_layout(
    title=dict(
        text="Distribution of Fraud vs Genuine Transactions",
        font=dict(size=22),
    ),
    xaxis_title="Transaction Type",
    yaxis_title="Count",
    xaxis=dict(axis_fonts),
    yaxis=dict(axis_fonts),
    font=dict(size=16),
    width=700,
    height=500,
)
fig.show()


As shown, fraudulent transactions represent only 0.52% of all transactions, while genuine transactions account for the remaining 99.48%. This highlights a significant class imbalance, emphasizing the need for appropriate techniques such as resampling, anomaly detection, or cost-sensitive modeling to effectively train fraud detection algorithms.

**The following pie chart shows the distribution of fraudulent transactions across different purchase categories, allowing us to identify which categories are most frequently targeted by fraudsters:**

In [None]:
# Subset of transactions flagged as fraud (reused by later cells).
fraud_only_df = combined_df.loc[combined_df["is_fraud"] == 1]

# Fraud cases per purchase category, as a two-column frame
# (category, fraud_count) ordered from most to least frequent.
category_counts = (
    fraud_only_df["category"]
    .value_counts()
    .rename_axis("category")
    .reset_index(name="fraud_count")
)

# Each slice is one category, sized by its number of fraud cases.
fig = px.pie(
    category_counts,
    names="category",
    values="fraud_count",
    title="Fraud Transactions Distribution by Category",
    color_discrete_sequence=px.colors.qualitative.Pastel,
)
fig.update_layout(width=700, height=700)
fig.show()

The pie chart reveals that fraudulent transactions are most commonly associated with the grocery_pos and shopping_net categories, each accounting for approximately 23% of all fraud cases. This suggests that fraudsters may frequently target everyday purchases and online shopping platforms, possibly due to their high transaction volume and relatively low verification requirements. Other notable categories include misc_net and shopping_pos, highlighting the need for enhanced fraud detection measures across both physical and online retail environments.

**This chart illustrates the distribution of fraudulent transactions by hour, highlighting the time periods with the highest fraud occurrence throughout the day:**

In [None]:
# Parse the transaction timestamp so the .dt accessor can be used on it.
combined_df['trans_date_trans_time'] = pd.to_datetime(combined_df['trans_date_trans_time'])

# Hour of day at which each transaction took place.
combined_df['hour'] = combined_df['trans_date_trans_time'].dt.hour

# Number of fraudulent transactions recorded in each hour.
fraud_hourly = (
    combined_df
    .query("is_fraud == 1")
    .groupby("hour")
    .size()
    .reset_index(name="fraud_count")
)

fig = px.line(
    fraud_hourly,
    x="hour",
    y="fraud_count",
    title="Fraud Transactions by Hour",
    labels={"hour": "Hour of Day", "fraud_count": "Number of Frauds"},
)
fig.show()

The chart shows a significant spike in fraudulent transactions occurring between 10 PM and midnight. During most hours of the day, the number of fraud cases remains relatively low and stable, with a noticeable drop in activity during early morning hours (around 3 AM to 7 AM). This pattern suggests that fraudsters may be more active during late evening hours, possibly taking advantage of reduced monitoring or increased user fatigue.


**The following chart shows whether there are specific days with a higher number of fraud cases:**


In [None]:
# Day of week of each transaction as an integer, Monday=0 ... Sunday=6.
transaction_time = pd.to_datetime(combined_df['trans_date_trans_time'])
combined_df['dayofweek'] = transaction_time.dt.dayofweek

# Mean of is_fraud per weekday, i.e. the fraud rate. The chart below plots
# raw counts and does not read this frame.
fraud_by_day = combined_df.groupby("dayofweek", as_index=False)["is_fraud"].mean()

# Short display names keyed by the integer weekday.
days_map = dict(enumerate(["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]))

# Number of fraudulent transactions per weekday; groupby keeps Mon..Sun order.
fraud_day = (
    combined_df
    .query("is_fraud == 1")
    .groupby("dayofweek")
    .size()
    .reset_index(name="fraud_count")
)
fraud_day["day_name"] = fraud_day["dayofweek"].map(days_map)

fig = px.line(
    fraud_day,
    x="day_name",
    y="fraud_count",
    title="Fraud Transactions by Day of Week",
    labels={"day_name": "Day of Week", "fraud_count": "Number of Frauds"},
)
fig.show()



The chart illustrates the distribution of fraudulent transactions across the days of the week. The number of fraud cases is lowest on Wednesday and gradually increases toward the weekend, peaking on Sunday. This trend may indicate that fraudsters are more active during weekends, possibly due to reduced oversight or increased online activity by users. In contrast, midweek (especially Wednesday) shows a noticeable dip in fraud occurrences.


**The following chart shows the number of fraudulent transactions in each user age group, giving a first look at the relationship between a user's age and fraud. Note that it plots raw counts, not the fraud rate within each group:**

In [None]:
# Make sure both date columns are datetimes before doing date arithmetic.
combined_df['dob'] = pd.to_datetime(combined_df['dob'])
transaction_time = pd.to_datetime(combined_df['trans_date_trans_time'])

# Age in completed years at the moment of each transaction.
# Measuring against the transaction timestamp (rather than today's date)
# keeps the result identical on every run and describes the cardholder as
# they were when the transaction happened.
birthday_not_reached = (
    (transaction_time.dt.month < combined_df['dob'].dt.month)
    | (
        (transaction_time.dt.month == combined_df['dob'].dt.month)
        & (transaction_time.dt.day < combined_df['dob'].dt.day)
    )
)
combined_df['age'] = (
    transaction_time.dt.year
    - combined_df['dob'].dt.year
    - birthday_not_reached.astype(int)
)

# Bucket ages into groups. The last bin is open-ended so that cardholders
# older than 90 fall into "65+" instead of silently becoming NaN.
combined_df["age_group"] = pd.cut(combined_df["age"],
                                  bins=[0, 25, 35, 50, 65, np.inf],
                                  labels=["<25", "25-35", "35-50", "50-65", "65+"])

# Count fraud cases in each age group (raw counts, not a rate).
fraud_by_age = combined_df[combined_df["is_fraud"] == 1].groupby("age_group", observed=True).size().reset_index(name="fraud_count")

fig = px.bar(fraud_by_age, x="age_group", y="fraud_count",
             labels={"age_group": "Age Group", "fraud_count": "Number of Frauds"},
             title="Fraud Transactions by Age Group")
fig.show()



The chart shows that fraudulent transactions are most prevalent among users aged 50–65, followed closely by the 35–50 age group. In contrast, individuals under 25 account for the fewest fraud cases. This trend suggests that middle-aged and older adults may be more frequently targeted or more vulnerable to fraud, possibly due to higher transaction volumes or reduced digital vigilance compared to younger users.


**The following chart compares the number of fraudulent transactions between male and female users, as a first check on whether gender is associated with fraudulent transactions (it shows raw counts, not fraud rates):**

In [None]:
# Number of fraudulent transactions for each value of the gender column.
fraud_by_gender = (
    combined_df
    .query("is_fraud == 1")
    .groupby("gender")
    .size()
    .reset_index(name="fraud_count")
)

fig = px.bar(
    fraud_by_gender,
    x="gender",
    y="fraud_count",
    labels={"gender": "Gender", "fraud_count": "Number of Frauds"},
    title="Fraud Transactions by Gender",
)
fig.show()


The chart shows that the number of fraudulent transactions is slightly higher among female users compared to male users. While the difference is not dramatic, it may indicate varying exposure levels, transaction behaviors, or targeting patterns between genders. Further analysis would be needed to determine whether this trend reflects actual vulnerability or simply higher transaction volume among female users.


In [None]:
# Count duplicate rows
num_duplicates = combined_df.duplicated().sum()
print(f"Number of duplicate rows before removal: {num_duplicates}")

Number of duplicate rows before removal: 0


In [None]:
# Per-column count of missing entries (isna is the same check as isnull).
missing_values = combined_df.isna().sum()
missing_values

Unnamed: 0,0
Unnamed: 0,0
trans_date_trans_time,0
cc_num,0
merchant,0
category,0
amt,0
first,0
last,0
gender,0
street,0


In [None]:
# plotly.express is already imported as `px` in the notebook's import cell,
# so it is not re-imported here.

# The 30 job titles with the most transactions overall. The ranking below is
# restricted to these, so rare job titles never appear in the chart.
top_jobs = combined_df["job"].value_counts().nlargest(30).index

# Fraudulent transactions belonging to those job titles
# (fraud_only_df is built in the fraud-by-category cell).
fraud_jobs = fraud_only_df[fraud_only_df["job"].isin(top_jobs)]

# Number of fraud cases per job title.
job_summary = fraud_jobs.groupby("job").size().reset_index(name="fraud_count")

# Ten job titles with the most fraud cases.
top_10_fraud_jobs = job_summary.sort_values("fraud_count", ascending=False).head(10)

# Tile size AND tile colour both encode fraud_count: larger, darker tiles
# mean more fraudulent transactions. No fraud rate is shown here.
fig = px.treemap(top_10_fraud_jobs,
                 path=["job"],
                 values="fraud_count",
                 color="fraud_count",
                 color_continuous_scale="Reds",
                 title="Top 10 Jobs by Number of Fraudulent Transactions")
fig.show()

The treemap highlights the 10 job titles with the highest number of fraudulent transactions, chosen from the 30 job titles with the most transactions overall. Both the size and the color of each block represent the number of fraudulent transactions — larger, darker blocks reflect more fraud cases. Professions such as Naval architect, Audiological scientist, and Materials engineer stand out with the highest fraud counts. While fraud is rare overall, certain occupations show a stronger association with fraudulent activity; because the chart shows counts rather than fraud rates, part of this may simply reflect higher transaction volume for those occupations.