In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import statsmodels.api as sm
import statsmodels.formula.api as smf

dataset = pd.read_csv('financial_fraud_detection_dataset.csv')

In [3]:
# Tidying dataset 
dataset = dataset.drop(['transaction_id', 'timestamp', 'sender_account', 'receiver_account', 'fraud_type', 'time_since_last_transaction', 'ip_address', 'device_hash'], axis=1)

# Display all the columns of the dataset 
dataset.head(5)

Unnamed: 0,amount,transaction_type,merchant_category,location,device_used,is_fraud,spending_deviation_score,velocity_score,geo_anomaly_score,payment_channel
0,343.78,withdrawal,utilities,Tokyo,mobile,False,-0.21,3,0.22,card
1,419.65,withdrawal,online,Toronto,atm,False,-0.14,7,0.96,ACH
2,2773.86,deposit,other,London,pos,False,-1.78,20,0.89,card
3,1666.22,deposit,online,Sydney,pos,False,-0.6,6,0.37,wire_transfer
4,24.43,transfer,utilities,Toronto,mobile,False,0.79,13,0.27,ACH


## Introduction 

Financial fraud  happens when someone takes money or other assets from you illegally, and there are many different types types such as identity, payment, account takeover, etc. Especially due to the growing banking system and the financial technologies, we would like to research and learn more into what could be used to determine whether a transaction is fraud or not, and which category(ies) would be more important than others for detection.

Our goal for this project is to detect financial fraud using machine learning given synthetical data. We would like to be able to detect whether a transaction is fraud or not, and what type of fraud it would be.

## Variable Description

There were a total of 18 variables in the original dataset which we had removed for 10. 

The following variables are the ones we will focus on: 

- amount (USD)
- transaction_type: type of transaction [deposit,  withdrawal, transfer, payment]
- merchant_category: business category involved in transaction [retail, travel, online, utilities, entertainment, etc...]
- location: location from where transaction was initiated
- device_used: device type used [mobile, web, atm, pos]
- is_fraud: boolean flag indicating whether the transaction was fraudulent [true, false]
- spending_deviation_score: deviation from normal spending pattern
- velocity_score: number of transactions over a recent period
- geo_anomaly_score: measure of unusual geographic transaction behaviour [0, 1]
- payment_channel: card, ACH, wire_transfer, UPI


In [4]:
# We want to find the correlation between the features and the target variable 'is_fraud'
dataset = pd.get_dummies(dataset, columns=['merchant_category', 'location', 'payment_channel', 'device_used', 'transaction_type'])

To improve model clarity, we can first remove a few irrelevant variables first. The variables transaction_id, timestamp, sender_account, receiver_account, ip_address and device_hash provide arbitriary technical details, making it unlikely for these variables to be as helpful for detecting a fraud transaction.

Linear regression relies on numerial data as it uses mathematical operations, like addition, subtraction, etc., to fit a linear equation to observed data. However, this dataset has multiple categorical variables, so we first need to make these into indicator variables. This is essential, as it converts categories into a binary variable, making linear regression possible with these variables. We will use the `get-dummies` function from the Pandas library to do this.

In [5]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(dataset.drop('is_fraud', axis=1), dataset['is_fraud'], test_size=0.3, random_state=42)

# Fit the Logistic Regression model 
clf = LogisticRegression(max_iter=10000)
clf.fit(X_train, y_train) 

# Feature selection using RFE 
rfe = RFE(clf) 
rfe.fit(X_train, y_train)

# Display the ranking of features
ranking_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Ranking': rfe.ranking_,
    'Selected': rfe.support_
}).sort_values(by='Ranking')

#print(ranking_df)
all_features_ordered = ranking_df.sort_values(by='Ranking')['Feature'].tolist()
print(all_features_ordered)


['location_Dubai', 'location_London', 'transaction_type_withdrawal', 'transaction_type_transfer', 'device_used_web', 'device_used_pos', 'transaction_type_payment', 'transaction_type_deposit', 'device_used_atm', 'device_used_mobile', 'payment_channel_wire_transfer', 'payment_channel_card', 'payment_channel_ACH', 'payment_channel_UPI', 'location_Tokyo', 'location_Singapore', 'location_Berlin', 'location_Sydney', 'location_New York', 'location_Toronto', 'merchant_category_utilities', 'merchant_category_restaurant', 'merchant_category_retail', 'merchant_category_other', 'merchant_category_entertainment', 'merchant_category_online', 'merchant_category_travel', 'merchant_category_grocery', 'geo_anomaly_score', 'spending_deviation_score', 'velocity_score', 'amount']


In [6]:
train_data = X_train.copy()
train_data['is_fraud'] = y_train

train_data_sample = train_data.sample(frac=0.3, random_state=100)

null_model = smf.glm(
    formula= 'is_fraud ~ 1',
    data=train_data_sample,
    family=sm.families.Binomial()
).fit()

final_model = smf.glm(
    formula='is_fraud ~ location_Dubai + location_London + transaction_type_withdrawal + transaction_type_transfer + device_used_web + device_used_pos + transaction_type_payment + transaction_type_deposit + device_used_atm + device_used_mobile + payment_channel_wire_transfer + payment_channel_card + payment_channel_ACH + payment_channel_UPI + location_Tokyo + location_Singapore + location_Berlin + location_Sydney + Q("location_New York") + location_Toronto + merchant_category_utilities + merchant_category_restaurant + merchant_category_retail + merchant_category_other + merchant_category_entertainment + merchant_category_online + merchant_category_travel + merchant_category_grocery + geo_anomaly_score + spending_deviation_score + velocity_score + amount',
    data=train_data_sample,
    family=sm.families.Binomial()
).fit()

print(final_model.summary())


                           Generalized Linear Model Regression Results                           
Dep. Variable:     ['is_fraud[False]', 'is_fraud[True]']   No. Observations:              1050000
Model:                                               GLM   Df Residuals:                  1049972
Model Family:                                   Binomial   Df Model:                           27
Link Function:                                     Logit   Scale:                          1.0000
Method:                                             IRLS   Log-Likelihood:            -1.6261e+05
Date:                                   Tue, 15 Jul 2025   Deviance:                   3.2522e+05
Time:                                           21:44:35   Pearson chi2:                 1.05e+06
No. Iterations:                                       19   Pseudo R-squ. (CS):          2.941e-05
Covariance Type:                               nonrobust                                         
                    

In [8]:
y_true = train_data_sample['is_fraud']
y_pred_prob = final_model.predict(train_data_sample)
y_pred = (y_pred_prob > 0.5).astype(int)
conf_matrix = metrics.confusion_matrix(y_true, y_pred)
print(conf_matrix)

[[      0 1012243]
 [      0   37757]]
