In [28]:
import pandas as pd
import numpy as np

In [29]:
data_path = 'fraud_detection/data/raw/card_transdata.csv'
df = pd.read_csv(data_path)

In [30]:
missing = df.isnull().sum()
print("Missing values:\n", missing)

Missing values:
 distance_from_home                0
distance_from_last_transaction    0
ratio_to_median_purchase_price    0
repeat_retailer                   0
used_chip                         0
used_pin_number                   0
online_order                      0
fraud                             0
dtype: int64


In [31]:
df = df.dropna(thresh=df.shape[1] - 3)

In [32]:
numeric_cols = ['distance_from_home', 
                'distance_from_last_transaction', 
                'ratio_to_median_purchase_price']

for col in numeric_cols:
    median_val = df[col].median()
    df[col] = df[col].fillna(median_val)

In [33]:
df.dropna(inplace=True)
print("After cleaning data shape:", df.shape)

After cleaning data shape: (1000000, 8)


In [34]:
from sklearn.model_selection import train_test_split

In [35]:
X = df.drop("fraud", axis=1)
y = df['fraud']

In [36]:
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y,
    test_size=0.10, #10% for test
    random_state=42,
    stratify=y
)

In [37]:
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp,
    test_size=0.2222,
    random_state=42,
    stratify=y_temp
)
print("Train:", X_train.shape, "Val:", X_val.shape, "Test:", X_test.shape)

Train: (700020, 7) Val: (199980, 7) Test: (100000, 7)


In [38]:
#smote

'''
note for me//to be deleted
Synthetic Minority Oversampling Technique (SMOTE) 
is a statistical technique for increasing the number
of cases in your dataset in a balanced way. 
The component works by generating new instances 
from existing minority cases that you supply as input.
'''
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)
print("Before SMOTE:", X_train.shape, "fraud=1", sum(y_train==1))
print("After SMOTE:", X_train_bal.shape, "fraud=1", sum(y_train_bal==1))

Before SMOTE: (700020, 7) fraud=1 61184
After SMOTE: (1277672, 7) fraud=1 638836


In [39]:
from sklearn.preprocessing import StandardScaler
import joblib

scaler = StandardScaler()
X_train_bal_scaled = scaler.fit_transform(X_train_bal)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

In [40]:
import joblib
joblib.dump(scaler, "fraud_detection/models/scaler.pkl")
joblib.dump((X_train_bal_scaled, y_train_bal), 'fraud_detection/data/processed/train.pkl')
joblib.dump((X_val_scaled, y_val), 'fraud_detection/data/processed/val.pkl')
joblib.dump((X_test_scaled, y_test), 'fraud_detection/data/processed/test.pkl')

['C:/Users/kamil/Documents/pythonProject1/fraud_detection/data/processed/test.pkl']