In [3]:
import pandas as pd

file_path = "dummy_historic_data.csv"
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,user_id,transaction_id,timestamp,amount,device_type,location,is_vpn,fraud
0,U47935,T97D7AE2A,2022-01-01T00:10:27,74.47,tablet,"California, USA",True,False
1,U45203,T1B97D873,2022-01-01T00:12:18,420.23,mobile,"London, UK",False,False
2,U26878,T9D67FE01,2022-01-01T00:15:59,288.41,tablet,"California, USA",False,False
3,U98609,T073EBAF8,2022-01-01T00:23:34,981.99,tablet,"California, USA",True,False
4,U90290,T2E2A2FAF,2022-01-01T00:29:52,372.75,mobile,"New York, USA",False,False


## Use only last 3 months of data

In [5]:
# Convert timestamp to datetime format
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Determine the last available date in the dataset
last_date = df['timestamp'].max()

# Filter data for the last 3 months
three_months_ago = last_date - pd.DateOffset(months=3)
df_filtered = df[df['timestamp'] >= three_months_ago]

# Display summary of filtered dataset
df_filtered.info()


<class 'pandas.core.frame.DataFrame'>
Index: 327604 entries, 1111897 to 1439500
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   user_id         327604 non-null  object        
 1   transaction_id  327604 non-null  object        
 2   timestamp       327604 non-null  datetime64[ns]
 3   amount          327604 non-null  float64       
 4   device_type     327604 non-null  object        
 5   location        327604 non-null  object        
 6   is_vpn          327604 non-null  bool          
 7   fraud           327604 non-null  bool          
dtypes: bool(2), datetime64[ns](1), float64(1), object(4)
memory usage: 18.1+ MB


In [7]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
import numpy as np

# Sort dataset by timestamp to ensure proper time-based splitting
df_filtered = df_filtered.sort_values(by="timestamp")

# Extract week number for splitting
df_filtered["week"] = df_filtered["timestamp"].dt.strftime('%Y-%W').astype(str)

# Label encode categorical features
categorical_features = ["device_type", "location"]
label_encoders = {col: LabelEncoder() for col in categorical_features}

for col, encoder in label_encoders.items():
    df_filtered[col] = encoder.fit_transform(df_filtered[col])

# Scale the 'amount' feature
scaler = StandardScaler()
df_filtered["amount"] = scaler.fit_transform(df_filtered[["amount"]])

# Define features and target variable
features = ["amount", "device_type", "location", "is_vpn"]
X = df_filtered[features]
y = df_filtered["fraud"]

# Get unique weeks for temporal splitting
unique_weeks = df_filtered["week"].unique()

# Temporal Splitting: 2 weeks training, 1 week validation, 1 week testing
results = []
n_splits = len(unique_weeks) - 3

for i in range(n_splits):
    train_weeks = unique_weeks[i:i+2]
    val_week = unique_weeks[i+2]
    test_week = unique_weeks[i+3]

    # Filter data for each fold
    X_train, y_train = X[df_filtered["week"].isin(train_weeks)], y[df_filtered["week"].isin(train_weeks)]
    X_val, y_val = X[df_filtered["week"] == val_week], y[df_filtered["week"] == val_week]
    X_test, y_test = X[df_filtered["week"] == test_week], y[df_filtered["week"] == test_week]

    # Train model
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Validate model
    y_val_pred = model.predict(X_val)
    y_test_pred = model.predict(X_test)

    # Calculate evaluation metrics
    metrics = {
        "train_weeks": train_weeks,
        "val_week": val_week,
        "test_week": test_week,
        "val_precision": precision_score(y_val, y_val_pred),
        "val_recall": recall_score(y_val, y_val_pred),
        "val_f1": f1_score(y_val, y_val_pred),
        "val_auc": roc_auc_score(y_val, y_val_pred),
        "test_precision": precision_score(y_test, y_test_pred),
        "test_recall": recall_score(y_test, y_test_pred),
        "test_f1": f1_score(y_test, y_test_pred),
        "test_auc": roc_auc_score(y_test, y_test_pred),
    }
    
    results.append(metrics)

# Convert results to DataFrame for aggregation
results_df = pd.DataFrame(results)

results_df


Unnamed: 0,train_weeks,val_week,test_week,val_precision,val_recall,val_f1,val_auc,test_precision,test_recall,test_f1,test_auc
0,"[2023-39, 2023-40]",2023-41,2023-42,0.028736,0.020243,0.023753,0.50297,0.006494,0.004184,0.005089,0.495678
1,"[2023-40, 2023-41]",2023-42,2023-43,0.014493,0.012552,0.013453,0.497724,0.027569,0.021696,0.024283,0.502707
2,"[2023-41, 2023-42]",2023-43,2023-44,0.023256,0.017751,0.020134,0.500945,0.021186,0.020534,0.020855,0.500779
3,"[2023-42, 2023-43]",2023-44,2023-45,0.026667,0.024641,0.025614,0.503325,0.028446,0.024436,0.026289,0.503187
4,"[2023-43, 2023-44]",2023-45,2023-46,0.028103,0.022556,0.025026,0.502837,0.023364,0.021186,0.022222,0.50219
5,"[2023-44, 2023-45]",2023-46,2023-47,0.012474,0.012712,0.012592,0.496807,0.015453,0.013462,0.014388,0.497768
6,"[2023-45, 2023-46]",2023-47,2023-48,0.011468,0.009615,0.01046,0.496147,0.019481,0.018219,0.018828,0.500156
7,"[2023-46, 2023-47]",2023-48,2023-49,0.022523,0.020243,0.021322,0.501543,0.0282,0.02554,0.026804,0.503959
8,"[2023-47, 2023-48]",2023-49,2023-50,0.017937,0.015717,0.016754,0.499244,0.039735,0.035156,0.037306,0.509178
9,"[2023-48, 2023-49]",2023-50,2023-51,0.021786,0.019531,0.020597,0.501096,0.019481,0.017013,0.018163,0.499722


In [8]:
results_df.describe()

Unnamed: 0,val_precision,val_recall,val_f1,val_auc,test_precision,test_recall,test_f1,test_auc
count,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0
mean,0.021217,0.017851,0.019345,0.500457,0.02203,0.019295,0.020546,0.501113
std,0.006242,0.00467,0.005264,0.002581,0.008981,0.008176,0.008557,0.003775
min,0.011468,0.009615,0.01046,0.496147,0.006494,0.004184,0.005089,0.495678
25%,0.016215,0.014214,0.015103,0.498484,0.017467,0.015237,0.016276,0.498745
50%,0.022523,0.019531,0.020597,0.501096,0.021186,0.020534,0.020855,0.500779
75%,0.026305,0.020518,0.023419,0.502613,0.027884,0.023066,0.025286,0.502947
max,0.028736,0.024641,0.025614,0.503325,0.039735,0.035156,0.037306,0.509178


### The model is bad but is consistently bad given the standard deviation is so low 

In [10]:
import joblib

joblib.dump(model, "fraud_detection_model.pkl")

['fraud_detection_model.pkl']