import libraries

In [2]:
import pandas as pd

read csv

In [3]:
df = pd.read_csv('financial_fraud_detection_dataset.csv')
df.head()

Unnamed: 0,transaction_id,timestamp,sender_account,receiver_account,amount,transaction_type,merchant_category,location,device_used,is_fraud,fraud_type,time_since_last_transaction,spending_deviation_score,velocity_score,geo_anomaly_score,payment_channel,ip_address,device_hash
0,T100000,2023-08-22T09:22:43.516168,ACC877572,ACC388389,343.78,withdrawal,utilities,Tokyo,mobile,False,,,-0.21,3,0.22,card,13.101.214.112,D8536477
1,T100001,2023-08-04T01:58:02.606711,ACC895667,ACC944962,419.65,withdrawal,online,Toronto,atm,False,,,-0.14,7,0.96,ACH,172.52.47.194,D2622631
2,T100002,2023-05-12T11:39:33.742963,ACC733052,ACC377370,2773.86,deposit,other,London,pos,False,,,-1.78,20,0.89,card,185.98.35.23,D4823498
3,T100003,2023-10-10T06:04:43.195112,ACC996865,ACC344098,1666.22,deposit,online,Sydney,pos,False,,,-0.6,6,0.37,wire_transfer,107.136.36.87,D9961380
4,T100004,2023-09-24T08:09:02.700162,ACC584714,ACC497887,24.43,transfer,utilities,Toronto,mobile,False,,,0.79,13,0.27,ACH,108.161.108.255,D7637601


true fraud

In [4]:
fraud_df = df.loc[df['is_fraud'] == True]
fraud_types = df['fraud_type'].dropna().unique()
print(fraud_types)

['card_not_present']


remove several columns

In [5]:
df = df.drop(columns=[
    'transaction_id',
    'sender_account',
    'receiver_account',
    'ip_address',
    'device_hash',
    'fraud_type'
])
df.head()

Unnamed: 0,timestamp,amount,transaction_type,merchant_category,location,device_used,is_fraud,time_since_last_transaction,spending_deviation_score,velocity_score,geo_anomaly_score,payment_channel
0,2023-08-22T09:22:43.516168,343.78,withdrawal,utilities,Tokyo,mobile,False,,-0.21,3,0.22,card
1,2023-08-04T01:58:02.606711,419.65,withdrawal,online,Toronto,atm,False,,-0.14,7,0.96,ACH
2,2023-05-12T11:39:33.742963,2773.86,deposit,other,London,pos,False,,-1.78,20,0.89,card
3,2023-10-10T06:04:43.195112,1666.22,deposit,online,Sydney,pos,False,,-0.6,6,0.37,wire_transfer
4,2023-09-24T08:09:02.700162,24.43,transfer,utilities,Toronto,mobile,False,,0.79,13,0.27,ACH


split the timestamp

In [6]:
df['timestamp'] = pd.to_datetime(df['timestamp'], format='ISO8601', errors='coerce')

df['hour'] = df['timestamp'].dt.hour
df['day_of_week'] = df['timestamp'].dt.dayofweek
df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)

df = df.drop(columns=['timestamp'])

df.head()

Unnamed: 0,amount,transaction_type,merchant_category,location,device_used,is_fraud,time_since_last_transaction,spending_deviation_score,velocity_score,geo_anomaly_score,payment_channel,hour,day_of_week,is_weekend
0,343.78,withdrawal,utilities,Tokyo,mobile,False,,-0.21,3,0.22,card,9,1,0
1,419.65,withdrawal,online,Toronto,atm,False,,-0.14,7,0.96,ACH,1,4,0
2,2773.86,deposit,other,London,pos,False,,-1.78,20,0.89,card,11,4,0
3,1666.22,deposit,online,Sydney,pos,False,,-0.6,6,0.37,wire_transfer,6,1,0
4,24.43,transfer,utilities,Toronto,mobile,False,,0.79,13,0.27,ACH,8,6,1


check correlation

In [7]:
df.corr(numeric_only=True)

Unnamed: 0,amount,is_fraud,time_since_last_transaction,spending_deviation_score,velocity_score,geo_anomaly_score,hour,day_of_week,is_weekend
amount,1.0,-0.000167,0.000316,0.0007986702,-0.000811,0.000176,-0.0006209222,-0.000394,-1.5e-05
is_fraud,-0.000167,1.0,1.4e-05,0.0002005477,0.00037,0.000311,-0.0007818012,0.000668,0.000734
time_since_last_transaction,0.000316,1.4e-05,1.0,0.0002017022,8.8e-05,-3.1e-05,-0.0001883589,0.00757,0.006026
spending_deviation_score,0.000799,0.000201,0.000202,1.0,-0.000155,0.000472,-3.377701e-07,-0.00078,-0.000709
velocity_score,-0.000811,0.00037,8.8e-05,-0.0001552447,1.0,0.00034,0.0004035007,0.000941,0.000668
geo_anomaly_score,0.000176,0.000311,-3.1e-05,0.0004723374,0.00034,1.0,-0.000538175,-0.000171,-0.00035
hour,-0.000621,-0.000782,-0.000188,-3.377701e-07,0.000404,-0.000538,1.0,0.002046,0.001956
day_of_week,-0.000394,0.000668,0.00757,-0.0007801599,0.000941,-0.000171,0.002046489,1.0,0.79117
is_weekend,-1.5e-05,0.000734,0.006026,-0.0007090146,0.000668,-0.00035,0.001956188,0.79117,1.0


encode

In [8]:
df = pd.get_dummies(df, columns=['transaction_type', 'merchant_category', 'location', 'device_used', 'payment_channel'])

split target feature

In [9]:
X = df.drop(columns=['is_fraud'])
y = df['is_fraud']                


In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=42, 
                                                    stratify=y)


In [11]:
from sklearn.preprocessing import StandardScaler

numeric_cols = ['amount', 'time_since_last_transaction', 'spending_deviation_score', 
                'velocity_score', 'geo_anomaly_score', 'hour', 'day_of_week']

scaler = StandardScaler()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])


train the model

In [15]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(class_weight='balanced' ,random_state=42)
model.fit(X_train, y_train)


0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [16]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


              precision    recall  f1-score   support

       False       0.96      1.00      0.98    964089
        True       0.00      0.00      0.00     35911

    accuracy                           0.96   1000000
   macro avg       0.48      0.50      0.49   1000000
weighted avg       0.93      0.96      0.95   1000000

Confusion Matrix:
 [[964089      0]
 [ 35911      0]]
ROC AUC Score: 0.5884240962385835


In [14]:
print(y.value_counts(normalize=True))


is_fraud
False    0.964089
True     0.035911
Name: proportion, dtype: float64
