In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

plt.style.use("default")
sns.set_theme()


In [None]:
credit = pd.read_csv("../data/processed/creditcard_cleaned.csv")
credit.head()
credit.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129146 entries, 0 to 129145
Data columns (total 13 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   user_id         129146 non-null  int64  
 1   signup_time     129146 non-null  object 
 2   purchase_time   129146 non-null  object 
 3   purchase_value  129146 non-null  int64  
 4   device_id       129146 non-null  object 
 5   source          129146 non-null  object 
 6   browser         129146 non-null  object 
 7   sex             129146 non-null  object 
 8   age             129146 non-null  int64  
 9   ip_address      129146 non-null  float64
 10  class           129146 non-null  int64  
 11  ip_int          129146 non-null  int64  
 12  country         129146 non-null  object 
dtypes: float64(1), int64(5), object(7)
memory usage: 12.8+ MB


In [27]:
credit['hour_of_day'] = (credit['Time'] // 3600) % 24
credit['day_of_week'] = (credit['Time'] // (3600*24)) % 7  # approximate weekday


In [29]:
scaler = StandardScaler()
credit['Amount_scaled'] = scaler.fit_transform(credit[['Amount']])


In [30]:
credit['amount_hour_interaction'] = credit['Amount_scaled'] * credit['hour_of_day']

# Transaction velocity proxy
credit = credit.sort_values('Time')
credit['cum_amount'] = credit['Amount_scaled'].cumsum()

# Approx rolling transactions in last 24 hours (rough)
credit['txn_last_24h'] = credit['Time'].rolling(window=24*60*60, min_periods=1).count()


In [31]:
feature_cols = ['V1','V2','V3','V4','V5','V6','V7','V8','V9','V10','V11','V12','V13','V14',
                'V15','V16','V17','V18','V19','V20','V21','V22','V23','V24','V25','V26','V27','V28',
                'Amount_scaled','hour_of_day','day_of_week','amount_hour_interaction','cum_amount','txn_last_24h']

X_credit = credit[feature_cols]
y_credit = credit['Class']


In [32]:
X_train, X_test, y_train, y_test = train_test_split(
    X_credit, y_credit, test_size=0.2, stratify=y_credit, random_state=42
)

print("Original train distribution:\n", y_train.value_counts())


Original train distribution:
 Class
0    226602
1       378
Name: count, dtype: int64


In [33]:
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

print("Resampled train distribution:\n", y_train_res.value_counts())


Resampled train distribution:
 Class
0    226602
1    226602
Name: count, dtype: int64


In [35]:
num_cols = ['Amount_scaled','hour_of_day','day_of_week','amount_hour_interaction','cum_amount','txn_last_24h']
scaler_final = StandardScaler()
X_train_res[num_cols] = scaler_final.fit_transform(X_train_res[num_cols])
X_test[num_cols] = scaler_final.transform(X_test[num_cols])

print("X_train:", X_train_res.shape, "y_train:", y_train_res.shape)
print("X_test:", X_test.shape, "y_test:", y_test.shape)


X_train: (453204, 34) y_train: (453204,)
X_test: (56746, 34) y_test: (56746,)


In [36]:
X_train_res.to_csv("../data/processed/credit_X_train.csv", index=False)
X_test.to_csv("../data/processed/credit_X_test.csv", index=False)
y_train_res.to_csv("../data/processed/credit_y_train.csv", index=False)
y_test.to_csv("../data/processed/credit_y_test.csv", index=False)
