Time-based features

In [2]:
from pathlib import Path

import numpy as np
import pandas as pd


In [3]:
# Load the raw inputs: the transaction log (`fraud`) and the
# IP-range -> country lookup table (`ip_country`).
fraud, ip_country = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/raw/Fraud_Data.csv",
        "../data/raw/IpAddress_to_Country.csv",
    )
)


Fix datetime

In [4]:
# Parse both timestamp columns into datetime64 so the `.dt` accessor and
# timestamp arithmetic work in the feature cells below.
for time_col in ('signup_time', 'purchase_time'):
    fraud[time_col] = pd.to_datetime(fraud[time_col])


Convert IPs to integer

In [5]:
# Cast the transaction IP and both range bounds to int64 so the as-of merge
# keys share a single integer dtype.
# NOTE(review): astype raises on NaN/inf and truncates any fractional part of
# float input — confirm the raw columns are clean before relying on this.
fraud['ip_address'] = fraud['ip_address'].astype(np.int64)

for bound_col in ('lower_bound_ip_address', 'upper_bound_ip_address'):
    ip_country[bound_col] = ip_country[bound_col].astype(np.int64)


Create fraud_geo

In [6]:
# merge_asof requires both frames to be sorted on their join keys.
ip_country_sorted = ip_country.sort_values('lower_bound_ip_address')
fraud_sorted = fraud.sort_values('ip_address')

# For each transaction, attach the IP range whose lower bound is the largest
# one that is still <= the transaction's IP (direction='backward').
fraud_geo = pd.merge_asof(
    fraud_sorted,
    ip_country_sorted,
    left_on='ip_address',
    right_on='lower_bound_ip_address',
    direction='backward'
)

# A backward match only guarantees lower_bound <= ip, so also require
# ip <= upper_bound. Transactions with no matching range (NaN upper bound)
# compare False and are dropped here as well.
# `.copy()` materialises an independent frame: later cells add columns to
# `fraud_geo`, and assigning into a filtered slice raises
# SettingWithCopyWarning on pandas versions without copy-on-write.
ip_in_range = fraud_geo['ip_address'] <= fraud_geo['upper_bound_ip_address']
fraud_geo = fraud_geo[ip_in_range].copy()


Time-of-day and day-of-week features

In [7]:
# Calendar features derived from the purchase timestamp:
# hour of day (0-23) and day of week (Monday=0 ... Sunday=6).
purchase_ts = fraud_geo['purchase_time']
fraud_geo['hour_of_day'] = purchase_ts.dt.hour
fraud_geo['day_of_week'] = purchase_ts.dt.dayofweek


Time since signup

In [8]:
# Elapsed time between account signup and the purchase, expressed in hours.
signup_to_purchase = fraud_geo['purchase_time'] - fraud_geo['signup_time']
fraud_geo['time_since_signup'] = signup_to_purchase.dt.total_seconds() / 3600


Transaction velocity

In [9]:
# Number of rows (with a non-null purchase_time) recorded for each user_id,
# broadcast back onto every row of that user.
# NOTE(review): this is a total count per user, not a rate over a time
# window — confirm it is the intended notion of "velocity".
fraud_geo['transactions_per_user'] = (
    fraud_geo
    .groupby('user_id')['purchase_time']
    .transform('count')
)


# Encoding, Scaling & Class Imbalance

Encode categorical features

In [10]:
# One-hot encode the categorical columns into a new frame (`fraud_geo` itself
# is left untouched). drop_first=True drops one level per feature so the
# resulting dummy columns are not perfectly collinear.
categorical_cols = [
    'source',
    'browser',
    'sex',
    'country',
]
fraud_encoded = pd.get_dummies(
    fraud_geo,
    columns=categorical_cols,
    drop_first=True,
)


Scale numerical features

In [11]:
from sklearn.preprocessing import StandardScaler

# Standardise the numeric features to zero mean / unit variance, overwriting
# the original columns of `fraud_encoded`.
# NOTE(review): the scaler is fit on the whole frame and no train/test split
# is visible before this cell — if a split happens later, fit on the training
# portion only to avoid leaking test-set statistics.
num_cols = [
    'purchase_value',
    'age',
    'time_since_signup',
    'transactions_per_user',
]
scaler = StandardScaler()

scaled_values = scaler.fit_transform(fraud_encoded[num_cols])
fraud_encoded[num_cols] = scaled_values


Check class balance (resampling will be applied later, to the training data ONLY)

In [12]:
# Row count per target class (the `class` column), to gauge how imbalanced
# the data is.
fraud_encoded.loc[:, 'class'].value_counts()


class
0    116878
1     12268
Name: count, dtype: int64

Save processed fraud data

In [13]:
# ==============================
# Save processed fraud dataset
# ==============================

# Sanity check: report the shape of the frame about to be written.
# (A bare `fraud_geo.head()` in the middle of a cell is never displayed, so
# it was removed; put it on the last line of its own cell to preview rows.)
print(fraud_geo.shape)

# Persist the feature-engineered frame `fraud_geo` — i.e. before one-hot
# encoding and scaling, which live in `fraud_encoded`.
output_path = Path("../data/processed/fraud_cleaned_featured.csv")

# to_csv does not create missing folders; make sure the target exists so the
# notebook also runs on a fresh checkout.
output_path.parent.mkdir(parents=True, exist_ok=True)

fraud_geo.to_csv(output_path, index=False)

print("✅ Feature-engineered fraud data saved successfully!")

(129146, 18)
✅ Feature-engineered fraud data saved successfully!
