Load Clean Fraud Data

In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Read the fraud transactions and parse both timestamp columns in one chain,
# so `fraud` is never left in a half-converted state if the cell is re-run.
# NOTE(review): the section heading says "Clean" but this reads from
# data/raw — confirm this is the intended input file.
fraud = pd.read_csv("../data/raw/Fraud_Data.csv").assign(
    signup_time=lambda frame: pd.to_datetime(frame["signup_time"]),
    purchase_time=lambda frame: pd.to_datetime(frame["purchase_time"]),
)


Time-Based Features

In [4]:
# Derive time-based features from the purchase/signup timestamps.
# `purchase_time` must already be datetime64 here (the `.dt` accessor below
# requires it), so no further `pd.to_datetime` conversion is needed.

# Hour of the purchase, 0-23.
fraud["hour_of_day"] = fraud["purchase_time"].dt.hour
# Day of the week of the purchase, Monday=0 ... Sunday=6.
fraud["day_of_week"] = fraud["purchase_time"].dt.dayofweek
# Seconds elapsed between account signup and the purchase.
fraud["time_since_signup"] = (
    fraud["purchase_time"] - fraud["signup_time"]
).dt.total_seconds()



Transaction Velocity

In [8]:
def _count_txns_in_window(times, window=pd.Timedelta(hours=24)):
    """Count, for each timestamp, how many timestamps fall in [t - window, t].

    Parameters
    ----------
    times : pd.Series
        Datetime values for a single user (any order; NaT allowed).
    window : pd.Timedelta, default 24 hours
        Length of the look-back window. Both edges are inclusive.

    Returns
    -------
    pd.Series
        int64 counts aligned to ``times.index``. Each valid timestamp counts
        itself and any other row with the same timestamp; NaT rows get 0.
    """
    stamps = times.to_numpy(dtype="datetime64[ns]")
    has_time = ~np.isnat(stamps)
    valid = stamps[has_time]

    # Sort once, then locate both window edges with binary search instead of
    # comparing every timestamp against every other one (O(n log n) vs O(n^2)).
    ordered = np.sort(valid)
    upper = np.searchsorted(ordered, valid, side="right")  # rows with time <= t
    lower = np.searchsorted(
        ordered, valid - window.to_timedelta64(), side="left"
    )  # rows with time < t - window

    counts = np.zeros(len(stamps), dtype="int64")
    counts[has_time] = upper - lower
    return pd.Series(counts, index=times.index)


fraud = fraud.sort_values(["user_id", "purchase_time"])

# Compute transaction count in past 24 hours per user
fraud["txn_count_24h"] = (
    fraud.groupby("user_id")["purchase_time"].transform(_count_txns_in_window)
)


Encoding & Scaling

In [12]:
from sklearn.preprocessing import StandardScaler

# One-hot encode the categorical columns that are actually present in `fraud`;
# the first level of each is dropped (drop_first=True).
# NOTE(review): the heading mentions scaling and StandardScaler is imported,
# but no scaling is applied in this cell — confirm whether that is intended.
candidate_cat_cols = ("browser", "source", "sex")
cat_cols = [name for name in candidate_cat_cols if name in fraud.columns]

fraud = pd.get_dummies(data=fraud, columns=cat_cols, drop_first=True)


Save Processed Data

In [13]:
# Persist the engineered feature table. Create the output directory first so
# a fresh checkout (no data/processed/ yet) does not fail at the final step.
processed_path = Path("../data/processed/fraud_processed.csv")
processed_path.parent.mkdir(parents=True, exist_ok=True)
fraud.to_csv(processed_path, index=False)
