In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense



In [2]:
df = pd.read_csv("../data/transactions.csv")
df.head()



Unnamed: 0,transaction_id,customer_id,card_number,timestamp,amount,merchant_id,merchant_category,merchant_lat,merchant_long,is_fraud,fraud_type,hour,day_of_week,month,distance_from_home
0,TXN_00000000,CUST_02906,CARD_37782,2025-05-28T21:43:00,3925.33,MERCHANT_1354,gas,33.0093,78.4679,0,none,10,2,5,5.21
1,TXN_00000001,CUST_00125,CARD_47138,2025-01-27T01:34:00,3490.53,MERCHANT_6926,electronics,35.0089,79.3276,0,none,8,0,1,12.39
2,TXN_00000002,CUST_03419,CARD_66165,2025-06-13T21:30:00,656.3,MERCHANT_4968,gas,20.977,88.2948,0,none,21,4,6,12.91
3,TXN_00000003,CUST_04810,CARD_57686,2025-02-09T17:19:00,4654.73,MERCHANT_5636,grocery,12.1694,71.1686,0,none,13,6,2,13.02
4,TXN_00000004,CUST_00093,CARD_50881,2025-02-10T20:28:00,3627.84,MERCHANT_5994,luxury_goods,20.7116,76.8014,0,none,21,0,2,3.98


In [4]:
df.info()
df.is_fraud.value_counts()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 15 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   transaction_id      100000 non-null  object 
 1   customer_id         100000 non-null  object 
 2   card_number         100000 non-null  object 
 3   timestamp           100000 non-null  object 
 4   amount              100000 non-null  float64
 5   merchant_id         100000 non-null  object 
 6   merchant_category   100000 non-null  object 
 7   merchant_lat        100000 non-null  float64
 8   merchant_long       100000 non-null  float64
 9   is_fraud            100000 non-null  int64  
 10  fraud_type          100000 non-null  object 
 11  hour                100000 non-null  int64  
 12  day_of_week         100000 non-null  int64  
 13  month               100000 non-null  int64  
 14  distance_from_home  100000 non-null  float64
dtypes: float64(4), int64(4), object(7)


is_fraud
0    98081
1     1919
Name: count, dtype: int64

In [3]:
df.info()
df["is_fraud"].value_counts(normalize=True)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 15 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   transaction_id      100000 non-null  object 
 1   customer_id         100000 non-null  object 
 2   card_number         100000 non-null  object 
 3   timestamp           100000 non-null  object 
 4   amount              100000 non-null  float64
 5   merchant_id         100000 non-null  object 
 6   merchant_category   100000 non-null  object 
 7   merchant_lat        100000 non-null  float64
 8   merchant_long       100000 non-null  float64
 9   is_fraud            100000 non-null  int64  
 10  fraud_type          100000 non-null  object 
 11  hour                100000 non-null  int64  
 12  day_of_week         100000 non-null  int64  
 13  month               100000 non-null  int64  
 14  distance_from_home  100000 non-null  float64
dtypes: float64(4), int64(4), object(7)


is_fraud
0    0.98021
1    0.01979
Name: proportion, dtype: float64

In [4]:
FEATURES = [
    "amount",
    "hour",
    "day_of_week",
    "distance_from_home"
]

X = df[FEATURES]
y = df["is_fraud"]



In [5]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)



isolation forest

In [6]:
iso_model = IsolationForest(
    n_estimators=100,
    contamination=0.02,
    random_state=42
)

iso_model.fit(X_scaled)


In [7]:
iso_pred_raw = iso_model.predict(X_scaled)
iso_pred = np.where(iso_pred_raw == -1, 1, 0)


In [8]:
iso_precision = precision_score(y, iso_pred)
iso_recall = recall_score(y, iso_pred)
iso_f1 = f1_score(y, iso_pred)
iso_auc = roc_auc_score(y, iso_pred)

iso_precision, iso_recall, iso_f1, iso_auc


(0.9895, 1.0, 0.9947222920331742, 0.9998928800971221)

svm


In [9]:
svm_model = OneClassSVM(
    kernel="rbf",
    nu=0.02,
    gamma="scale"
)

svm_model.fit(X_scaled)


In [10]:
svm_pred_raw = svm_model.predict(X_scaled)
svm_pred = np.where(svm_pred_raw == -1, 1, 0)

svm_precision = precision_score(y, svm_pred)
svm_recall = recall_score(y, svm_pred)
svm_f1 = f1_score(y, svm_pred)
svm_auc = roc_auc_score(y, svm_pred)

svm_precision, svm_recall, svm_f1, svm_auc


(0.8285, 0.8372915613946438, 0.8328725810505152, 0.9168961556169819)

auto encoder

In [11]:
X_normal = X_scaled[y == 0]


In [12]:
input_dim = X_normal.shape[1]

input_layer = Input(shape=(input_dim,))
encoded = Dense(8, activation="relu")(input_layer)
decoded = Dense(input_dim, activation="linear")(encoded)

autoencoder = Model(input_layer, decoded)
autoencoder.compile(optimizer="adam", loss="mse")


In [13]:
autoencoder.fit(
    X_normal,
    X_normal,
    epochs=10,
    batch_size=256,
    validation_split=0.1,
    verbose=1
)


AttributeError: module 'ml_dtypes' has no attribute 'float4_e2m1fn'
Epoch 1/10
[1m345/345[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - loss: 0.6646 - val_loss: 0.1757
Epoch 2/10
[1m345/345[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.1249 - val_loss: 0.0438
Epoch 3/10
[1m345/345[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.0351 - val_loss: 0.0202
Epoch 4/10
[1m345/345[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.0175 - val_loss: 0.0120
Epoch 5/10
[1m345/345[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.0108 - val_loss: 0.0077
Epoch 6/10
[1m345/345[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.0069 - val_loss: 0.0051
Epoch 7/10
[1m345/345[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.0046 - val_loss: 0.0036
Epoch 8/10
[1m345/345[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss

<keras.src.callbacks.history.History at 0x2445119fd70>

In [14]:
reconstructions = autoencoder.predict(X_scaled)
errors = np.mean(np.square(X_scaled - reconstructions), axis=1)

threshold = np.percentile(errors, 95)
ae_pred = np.where(errors > threshold, 1, 0)


[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step


In [15]:
ae_precision = precision_score(y, ae_pred)
ae_recall = recall_score(y, ae_pred)
ae_f1 = f1_score(y, ae_pred)
ae_auc = roc_auc_score(y, ae_pred)

ae_precision, ae_recall, ae_f1, ae_auc


(0.3958, 1.0, 0.5671299613125089, 0.9845900368288427)

In [16]:
results = pd.DataFrame({
    "Model": ["Isolation Forest", "One-Class SVM", "Autoencoder"],
    "Precision": [iso_precision, svm_precision, ae_precision],
    "Recall": [iso_recall, svm_recall, ae_recall],
    "F1-Score": [iso_f1, svm_f1, ae_f1],
    "AUC-ROC": [iso_auc, svm_auc, ae_auc]
})

results


Unnamed: 0,Model,Precision,Recall,F1-Score,AUC-ROC
0,Isolation Forest,0.9895,1.0,0.994722,0.999893
1,One-Class SVM,0.8285,0.837292,0.832873,0.916896
2,Autoencoder,0.3958,1.0,0.56713,0.98459
