# Import Statements

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from collections import Counter

# Load Dataset

In [None]:
# Colab-only upload helper. It is needed solely for the optional manual upload
# below, so a missing `google.colab` package must not stop the notebook when it
# is executed outside Colab.
try:
    from google.colab import files
except ImportError:
    files = None  # not on Colab: "Base.csv" must already be in the working directory
# uploaded = files.upload()

# Read the raw dataset from the current working directory.
df = pd.read_csv("Base.csv")

# Data Cleaning

## Check for missing values

In [None]:
# Count the missing (NaN/None) entries in each column of the raw frame.
print(df.isnull().sum())

fraud_bool                          0
income                              0
name_email_similarity               0
prev_address_months_count           0
current_address_months_count        0
customer_age                        0
days_since_request                  0
intended_balcon_amount              0
payment_type                        0
zip_count_4w                        0
velocity_6h                         0
velocity_24h                        0
velocity_4w                         0
bank_branch_count_8w                0
date_of_birth_distinct_emails_4w    0
employment_status                   0
credit_risk_score                   0
email_is_free                       0
housing_status                      0
phone_home_valid                    0
phone_mobile_valid                  0
bank_months_count                   0
has_other_cards                     0
proposed_credit_limit               0
foreign_request                     0
source                              0
session_leng

## Drop columns

In [None]:
# Remove `device_fraud_count` from the working frame.
# `errors='ignore'` keeps this cell idempotent: because `df` is rebound to the
# result, a second execution would otherwise raise KeyError on the
# already-removed column.
df = df.drop(columns=['device_fraud_count'], errors='ignore')

# Feature matrix (everything except the label) and the label column itself.
X = df.drop(columns=['fraud_bool'])
y = df['fraud_bool']

## Train-val-test split

In [None]:
# Training: months 1–6 (values 0-5)
X_train = df[df['month'].isin([0,1,2,3,4,5])].drop(columns=['fraud_bool'])
y_train = df[df['month'].isin([0,1,2,3,4,5])]['fraud_bool']

# Validation: month 7 (value 6)
X_val = df[df['month'] == 6].drop(columns=['fraud_bool'])
y_val = df[df['month'] == 6]['fraud_bool']

# Test: month 8 (value 7)
X_test = df[df['month'] == 7].drop(columns=['fraud_bool'])
y_test = df[df['month'] == 7]['fraud_bool']

In [None]:
# Report (rows, columns) for each split, one line per split.
print(
    f"Train Shape: {X_train.shape!s}",
    f"Validation Shape: {X_val.shape!s}",
    f"Test Shape: {X_test.shape!s}",
    sep="\n",
)

Train Shape: (794989, 30)
Validation Shape: (108168, 30)
Test Shape: (96843, 30)


# Feature Engineering

### One-hot encoding for categorical variables

In [None]:
# Expand `customer_age` into one `age_*` indicator column per distinct value,
# independently on every split.
X_train, X_val, X_test = (
    pd.get_dummies(split, columns=['customer_age'], prefix='age', drop_first=False)
    for split in (X_train, X_val, X_test)
)

# Force validation and test onto the training column layout: columns absent from
# a split are created and filled with 0, columns unknown to training are dropped.
X_val, X_test = (
    split.reindex(columns=X_train.columns, fill_value=0) for split in (X_val, X_test)
)

# Check dataset shapes
print("Train Shape:", X_train.shape)
print("Validation Shape:", X_val.shape)
print("Test Shape:", X_test.shape)

Train Shape: (794989, 38)
Validation Shape: (108168, 38)
Test Shape: (96843, 38)


In [None]:
# One-hot encode
# Expand `payment_type` into `payment_*` indicator columns on every split.
X_train, X_val, X_test = (
    pd.get_dummies(split, columns=['payment_type'], prefix='payment', drop_first=False)
    for split in (X_train, X_val, X_test)
)

# Validation and test adopt exactly the training columns (missing ones filled with 0).
X_val, X_test = (
    split.reindex(columns=X_train.columns, fill_value=0) for split in (X_val, X_test)
)

# Check dataset shapes
print("Train Shape:", X_train.shape)
print("Validation Shape:", X_val.shape)
print("Test Shape:", X_test.shape)

Train Shape: (794989, 42)
Validation Shape: (108168, 42)
Test Shape: (96843, 42)


In [None]:
# One-hot encode
# Expand `employment_status` into `employment_*` indicator columns on every split.
X_train, X_val, X_test = (
    pd.get_dummies(split, columns=['employment_status'], prefix='employment', drop_first=False)
    for split in (X_train, X_val, X_test)
)

# Validation and test adopt exactly the training columns (missing ones filled with 0).
X_val, X_test = (
    split.reindex(columns=X_train.columns, fill_value=0) for split in (X_val, X_test)
)

# Check dataset shapes
print("Train Shape:", X_train.shape)
print("Validation Shape:", X_val.shape)
print("Test Shape:", X_test.shape)

Train Shape: (794989, 48)
Validation Shape: (108168, 48)
Test Shape: (96843, 48)


In [None]:
# Expand `housing_status` into `housing_*` indicator columns on every split.
X_train, X_val, X_test = (
    pd.get_dummies(split, columns=['housing_status'], prefix='housing', drop_first=False)
    for split in (X_train, X_val, X_test)
)

# Validation and test adopt exactly the training columns (missing ones filled with 0).
X_val, X_test = (
    split.reindex(columns=X_train.columns, fill_value=0) for split in (X_val, X_test)
)

# Check dataset shapes
print("Train Shape:", X_train.shape)
print("Validation Shape:", X_val.shape)
print("Test Shape:", X_test.shape)

Train Shape: (794989, 54)
Validation Shape: (108168, 54)
Test Shape: (96843, 54)


In [None]:
# Expand `source` into `source_*` indicator columns on every split.
X_train, X_val, X_test = (
    pd.get_dummies(split, columns=['source'], prefix='source', drop_first=False)
    for split in (X_train, X_val, X_test)
)

# Validation and test adopt exactly the training columns (missing ones filled with 0).
X_val, X_test = (
    split.reindex(columns=X_train.columns, fill_value=0) for split in (X_val, X_test)
)

# Check dataset shapes
print("Train Shape:", X_train.shape)
print("Validation Shape:", X_val.shape)
print("Test Shape:", X_test.shape)

Train Shape: (794989, 55)
Validation Shape: (108168, 55)
Test Shape: (96843, 55)


In [None]:
# Expand `device_os` into `device_*` indicator columns on every split.
X_train, X_val, X_test = (
    pd.get_dummies(split, columns=['device_os'], prefix='device', drop_first=False)
    for split in (X_train, X_val, X_test)
)

# Validation and test adopt exactly the training columns (missing ones filled with 0).
X_val, X_test = (
    split.reindex(columns=X_train.columns, fill_value=0) for split in (X_val, X_test)
)

# Check dataset shapes
print("Train Shape:", X_train.shape)
print("Validation Shape:", X_val.shape)
print("Test Shape:", X_test.shape)

Train Shape: (794989, 59)
Validation Shape: (108168, 59)
Test Shape: (96843, 59)


### Log transformation for skewed features

In [None]:
# Add log(1 + x) of `days_since_request` as a new column on every split.
for split_frame in (X_train, X_val, X_test):
    split_frame['days_since_request_log'] = np.log1p(split_frame['days_since_request'])

In [None]:
def _clean_intended_balcon_amount(frame):
    """Replace `intended_balcon_amount` with a cleaned and a log-scaled column.

    Parameters
    ----------
    frame : pandas.DataFrame
        A split that still contains the raw `intended_balcon_amount` column.

    Returns
    -------
    pandas.DataFrame
        A new frame without `intended_balcon_amount` and with two columns
        appended, in this order:
        `intended_balcon_amount_clean` (values >= 0 kept, everything else -1) and
        `intended_balcon_amount_log` (log1p of positive cleaned values, else 0).
    """
    raw = frame['intended_balcon_amount']
    # Anything failing `>= 0` collapses to the -1 sentinel; done column-wise
    # rather than with a per-row Python lambda.
    clean = raw.where(raw >= 0, -1)
    return frame.drop(columns='intended_balcon_amount').assign(
        intended_balcon_amount_clean=clean,
        # Clipping at 0 maps the -1 sentinel (and exact zeros) to log1p(0) == 0,
        # so only strictly positive amounts are actually log-transformed.
        intended_balcon_amount_log=np.log1p(clean.clip(lower=0)),
    )


X_train = _clean_intended_balcon_amount(X_train)
X_val = _clean_intended_balcon_amount(X_val)
X_test = _clean_intended_balcon_amount(X_test)

In [None]:
# Add log(1 + x) of `zip_count_4w` as a new column on every split.
for split_frame in (X_train, X_val, X_test):
    split_frame['zip_count_4w_log'] = np.log1p(split_frame['zip_count_4w'])

In [None]:
# Add log(1 + x) versions of the 24h and 4w velocity features on every split.
# The loop variable is deliberately NOT called `df`: that name holds the raw
# dataset loaded from "Base.csv", and rebinding it here would leave it pointing
# at X_test, silently corrupting any re-run of the split cell.
for split_frame in (X_train, X_val, X_test):
    split_frame['velocity_24h_log'] = np.log1p(split_frame['velocity_24h'])
    split_frame['velocity_4w_log'] = np.log1p(split_frame['velocity_4w'])

In [None]:
# Add log(1 + x) of `date_of_birth_distinct_emails_4w` as a new column on every split.
for split_frame in (X_train, X_val, X_test):
    split_frame['date_of_birth_distinct_emails_4w_log'] = np.log1p(
        split_frame['date_of_birth_distinct_emails_4w']
    )

In [None]:
def _clean_session_length(frame):
    """Replace `session_length_in_minutes` with a cleaned and a log-scaled column.

    Parameters
    ----------
    frame : pandas.DataFrame
        A split that still contains the raw `session_length_in_minutes` column.

    Returns
    -------
    pandas.DataFrame
        A new frame without `session_length_in_minutes` and with two columns
        appended, in this order:
        `session_length_in_minutes_cleaned` (values >= 0 kept, everything else -1)
        and `session_length_in_minutes_log` (log1p of positive cleaned values,
        else 0).
    """
    raw = frame['session_length_in_minutes']
    # Anything failing `>= 0` collapses to the -1 sentinel; done column-wise
    # rather than with a per-row Python lambda.
    cleaned = raw.where(raw >= 0, -1)
    return frame.drop(columns='session_length_in_minutes').assign(
        session_length_in_minutes_cleaned=cleaned,
        # Clipping at 0 maps the -1 sentinel (and exact zeros) to log1p(0) == 0,
        # so only strictly positive durations are actually log-transformed.
        session_length_in_minutes_log=np.log1p(cleaned.clip(lower=0)),
    )


X_train = _clean_session_length(X_train)
X_val = _clean_session_length(X_val)
X_test = _clean_session_length(X_test)

In [None]:
# Check shapes
print("Train Shape: " + str(X_train.shape))
print("Validation Shape: " + str(X_val.shape))
print("Test Shape: " + str(X_test.shape))

Train Shape: (794989, 66)
Validation Shape: (108168, 66)
Test Shape: (96843, 66)


In [None]:
# Check columns
print("Train Columns: " + str(X_train.columns))
print("Validation Columns: " + str(X_val.columns))
print("Test Columns: " + str(X_test.columns))

Train Columns: Index(['income', 'name_email_similarity', 'prev_address_months_count',
       'current_address_months_count', 'days_since_request', 'zip_count_4w',
       'velocity_6h', 'velocity_24h', 'velocity_4w', 'bank_branch_count_8w',
       'date_of_birth_distinct_emails_4w', 'credit_risk_score',
       'email_is_free', 'phone_home_valid', 'phone_mobile_valid',
       'bank_months_count', 'has_other_cards', 'proposed_credit_limit',
       'foreign_request', 'keep_alive_session', 'device_distinct_emails_8w',
       'month', 'age_10', 'age_20', 'age_30', 'age_40', 'age_50', 'age_60',
       'age_70', 'age_80', 'age_90', 'payment_AA', 'payment_AB', 'payment_AC',
       'payment_AD', 'payment_AE', 'employment_CA', 'employment_CB',
       'employment_CC', 'employment_CD', 'employment_CE', 'employment_CF',
       'employment_CG', 'housing_BA', 'housing_BB', 'housing_BC', 'housing_BD',
       'housing_BE', 'housing_BF', 'housing_BG', 'source_INTERNET',
       'source_TELEAPP', 'device_l

### New features

In [None]:
# total_residence_months
X_train['total_residence_months'] = X_train['prev_address_months_count'] + X_train['current_address_months_count']
X_val['total_residence_months'] = X_val['prev_address_months_count'] + X_val['current_address_months_count']
X_test['total_residence_months'] = X_test['prev_address_months_count'] + X_test['current_address_months_count']

In [None]:
# velocity_ratio_24h_4w
for df in [X_train, X_val, X_test]:
    # Add a small epsilon to denominator to avoid division by zero
    df['velocity_ratio_24h_4w'] = df['velocity_24h'] / (df['velocity_4w'] + 1e-6)

In [None]:
# Check shapes
print("Train Shape: " + str(X_train.shape))
print("Validation Shape: " + str(X_val.shape))
print("Test Shape: " + str(X_test.shape))

Train Shape: (794989, 68)
Validation Shape: (108168, 68)
Test Shape: (96843, 68)


# SMOTE on Train Set

In [None]:
# Initialise SMOTE
smote = SMOTE(random_state=42, sampling_strategy='auto')

# Apply SMOTE
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print("Before SMOTE:", Counter(y_train))
print("After SMOTE:", Counter(y_train_resampled))

Before SMOTE: Counter({0: 786838, 1: 8151})
After SMOTE: Counter({0: 786838, 1: 786838})


# Save to CSV

In [None]:
# Training set (resampled)
X_train_resampled.to_csv("X_train_resampled.csv", index=False)
y_train_resampled.to_csv("y_train_resampled.csv", index=False)

# Training set (original)
X_train.to_csv("X_train.csv", index=False)
y_train.to_csv("y_train.csv", index=False)

# Validation set
X_val.to_csv("X_val.csv", index=False)
y_val.to_csv("y_val.csv", index=False)

# Test set
X_test.to_csv("X_test.csv", index=False)
y_test.to_csv("y_test.csv", index=False)