# Credit Card Fraud Detection - Data Preprocessing

This notebook handles data preprocessing including:
- Data cleaning and preparation
- Feature engineering
- Handling class imbalance with SMOTE
- Train-test split preparation

In [7]:
# Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
import warnings
import pickle
from pathlib import Path

# Notebook-wide settings
np.random.seed(42)                 # seed NumPy's global RNG so random draws repeat across runs
plt.style.use('default')           # start from matplotlib's stock style
warnings.filterwarnings('ignore')  # NOTE: silences every warning for the rest of the session

print("Libraries imported successfully")

Libraries imported successfully


## Load Dataset

In [8]:
# Load the dataset
#
# Reads the raw transaction log from `data_path`. When the file is missing, a
# small synthetic frame with the same column names is generated instead so the
# remaining cells can still be exercised end to end.
data_path = "../data/raw/PS_20174392719_1491204439457_log.csv"

# Optional row cap for memory-constrained machines (None = read the whole file).
# NOTE: a cap keeps the first N_ROWS lines of the file, not a random sample.
N_ROWS = None

# A previous run of this cell died with a MemoryError on the full file, so the
# integer columns are read with narrow dtypes to shrink the frame. The monetary
# columns deliberately stay at the default float64 so amounts and balances keep
# their full precision for the features derived from them.
READ_DTYPES = {'step': 'int32', 'isFraud': 'int8'}

try:
    df = pd.read_csv(data_path, dtype=READ_DTYPES, nrows=N_ROWS)
    print(f"Dataset loaded successfully from {data_path}")
    print(f"Shape: {df.shape}")
except FileNotFoundError:
    print("Dataset file not found!")
    print("Creating sample data for demonstration...")

    # Create sample data (re-seeded here so the fallback frame is reproducible
    # regardless of how much randomness earlier cells consumed)
    np.random.seed(42)
    n_samples = 10000

    sample_data = {
        'step': np.random.randint(1, 744, n_samples),
        'type': np.random.choice(['CASH_OUT', 'PAYMENT', 'CASH_IN', 'TRANSFER', 'DEBIT'], n_samples),
        'amount': np.random.exponential(100, n_samples),
        'nameOrig': [f'C{i}' for i in range(n_samples)],
        'oldbalanceOrg': np.random.exponential(1000, n_samples),
        'newbalanceOrig': np.random.exponential(1000, n_samples),
        'nameDest': [f'M{i}' for i in range(n_samples)],
        'oldbalanceDest': np.random.exponential(1000, n_samples),
        'newbalanceDest': np.random.exponential(1000, n_samples),
        # ~0.2% positives to mimic a heavily imbalanced fraud label
        'isFraud': np.random.choice([0, 1], n_samples, p=[0.998, 0.002])
    }

    df = pd.DataFrame(sample_data)
    print(f"Sample data created with shape: {df.shape}")

print("\nDataset info:")
# DataFrame.info() writes its report itself and returns None; wrapping it in
# print() would append a stray "None" line to the output.
df.info()

MemoryError: Unable to allocate 243. MiB for an array with shape (5, 6362620) and data type float64

## Data Cleaning

In [None]:
# Data cleaning: report missing values, drop exact duplicate rows, and print a
# few sanity checks on dtypes and transaction amounts.

# Check for missing values
print("Missing values:")
print(df.isnull().sum())

# Check for duplicates.
# duplicated() is a full pass over the frame, so the mask is computed once and
# reused for both the report and the removal instead of being recomputed.
duplicate_mask = df.duplicated()
n_duplicates = int(duplicate_mask.sum())
print(f"\nDuplicate rows: {n_duplicates}")

# Remove duplicates if any (keeps the first occurrence, like drop_duplicates())
if n_duplicates > 0:
    df = df.loc[~duplicate_mask]
    print(f"Duplicates removed. New shape: {df.shape}")

# Basic data cleaning
print("\nData types:")
print(df.dtypes)

# Check for any anomalies in the transaction amount
print(f"\nNegative amounts: {(df['amount'] < 0).sum()}")
print(f"Zero amounts: {(df['amount'] == 0).sum()}")

## Feature Engineering

In [None]:
# Feature engineering is done on a working copy so the cleaned frame `df`
# stays available for before/after comparisons.
df_processed = df.copy()

# Short handles on the raw columns that several features are built from
amount = df_processed['amount']
old_orig = df_processed['oldbalanceOrg']
new_orig = df_processed['newbalanceOrig']
old_dest = df_processed['oldbalanceDest']
new_dest = df_processed['newbalanceDest']

# 1. Balance difference features (sender: old - new, receiver: new - old)
df_processed['balance_diff_orig'] = old_orig - new_orig
df_processed['balance_diff_dest'] = new_dest - old_dest

# The +1 keeps the ratio denominators away from zero when the old balance is 0
orig_denominator = old_orig + 1
dest_denominator = old_dest + 1

# 2. Balance ratio features
df_processed['orig_balance_ratio'] = new_orig / orig_denominator
df_processed['dest_balance_ratio'] = new_dest / dest_denominator

# 3. Amount vs balance ratios
df_processed['amount_to_orig_ratio'] = amount / orig_denominator
df_processed['amount_to_dest_ratio'] = amount / dest_denominator

# 4. Log transformed features: log(1 + x) of the amount and the four balances
for source_col in ['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']:
    df_processed[f'log_{source_col}'] = np.log1p(df_processed[source_col])

# 5. Transaction type encoding (integer code per transaction type)
le_type = LabelEncoder()
df_processed['type_encoded'] = le_type.fit_transform(df_processed['type'])

# 6. Categorical features - one indicator column per transaction type
type_dummies = pd.get_dummies(df_processed['type'], prefix='type')
df_processed = pd.concat([df_processed, type_dummies], axis=1)

# 7. Binary flags
df_processed['is_orig_zero_balance'] = df_processed['oldbalanceOrg'].eq(0).astype(int)
df_processed['is_dest_zero_balance'] = df_processed['oldbalanceDest'].eq(0).astype(int)
df_processed['is_amount_round'] = (df_processed['amount'] % 1).eq(0).astype(int)

print("Feature engineering completed.")
print(f"New shape: {df_processed.shape}")
print(f"New features created: {len(df_processed.columns) - len(df.columns)}")

# Display new features (every column that the input frame did not have)
original_columns = set(df.columns)
new_features = [col for col in df_processed.columns if col not in original_columns]
print(f"\nNew features: {new_features}")

## Feature Selection and Preparation

In [None]:
# Build the model inputs: every engineered/raw column except the identifiers,
# the raw categorical column, and the target itself.
exclude_cols = ['nameOrig', 'nameDest', 'type']
non_feature_cols = set(exclude_cols) | {'isFraud'}
feature_cols = [col for col in df_processed.columns if col not in non_feature_cols]

y = df_processed['isFraud']
X = df_processed[feature_cols]

print(f"Features for modeling: {len(feature_cols)}")
print(f"Feature columns: {feature_cols}")

# Report how many cells are infinite or missing before repairing them
infinite_count = np.isinf(X).sum().sum()
nan_count = X.isnull().sum().sum()
print(f"\nInfinite values: {infinite_count}")
print(f"NaN values: {nan_count}")

# Turn +/-inf into NaN, then impute every NaN with its column median.
# NOTE(review): the medians are taken over all rows, i.e. before the
# train/test split performed later in the notebook.
X = X.replace([np.inf, -np.inf], np.nan)
column_medians = X.median()
X = X.fillna(column_medians)

print(f"\nFinal X shape: {X.shape}")
print(f"Final y shape: {y.shape}")
print(f"Class distribution: {y.value_counts()}")

## Train-Test Split

In [None]:
def _fraud_rate_pct(labels):
    """Return the share of positive labels in `labels` as a percentage."""
    return labels.sum() / len(labels) * 100


# Hold out 20% of the rows for testing; stratifying on y gives both parts the
# same class proportions, and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(f"Train set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

train_class_counts = y_train.value_counts()
test_class_counts = y_test.value_counts()
print("\nTrain class distribution:")
print(train_class_counts)
print("\nTest class distribution:")
print(test_class_counts)

# Fraud share of each part, in percent
train_fraud_rate = _fraud_rate_pct(y_train)
test_fraud_rate = _fraud_rate_pct(y_test)

print(f"\nTrain fraud rate: {train_fraud_rate:.3f}%")
print(f"Test fraud rate: {test_fraud_rate:.3f}%")

## Feature Scaling

In [None]:
# Standardize the features. The scaler is fitted on the training part only and
# then applied unchanged to the test part.
scaler = StandardScaler()

# Wrap the scaled arrays back into DataFrames, keeping the original column
# names and row index for easier handling later on.
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

# Per-column statistics of the scaled training data
train_col_means = X_train_scaled.mean()
train_col_stds = X_train_scaled.std()

print("Features scaled successfully")
print(f"Scaled train mean: {train_col_means.mean():.6f}")
print(f"Scaled train std: {train_col_stds.mean():.6f}")

# Show scaling statistics
print("\nScaling verification:")
print(f"Train set - Mean range: [{train_col_means.min():.6f}, {train_col_means.max():.6f}]")
print(f"Train set - Std range: [{train_col_stds.min():.6f}, {train_col_stds.max():.6f}]")

## Handle Class Imbalance with SMOTE

In [None]:
# Apply SMOTE to handle class imbalance.
# Only the (scaled) training data is resampled; the test set is left untouched.
print("Applying SMOTE to handle class imbalance...")

# Before SMOTE
print("Before SMOTE:")
print(f"Class distribution: {y_train.value_counts()}")
print(f"Class ratios: {y_train.value_counts(normalize=True)}")

# Apply SMOTE ('auto' resamples every class except the majority one)
smote = SMOTE(random_state=42, sampling_strategy='auto')
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

# Convert back to DataFrame / Series
X_train_smote = pd.DataFrame(X_train_smote, columns=X_train_scaled.columns)
y_train_smote = pd.Series(y_train_smote)

# After SMOTE
print("\nAfter SMOTE:")
print(f"Class distribution: {y_train_smote.value_counts()}")
print(f"Class ratios: {y_train_smote.value_counts(normalize=True)}")
print(f"\nTraining set size increased from {len(y_train)} to {len(y_train_smote)}")

# Visualization: class counts side by side, before and after resampling
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

panels = [
    (ax1, y_train, 'Class Distribution - Before SMOTE'),
    (ax2, y_train_smote, 'Class Distribution - After SMOTE'),
]
for ax, labels, title in panels:
    # value_counts() orders by frequency, which is not a dependable order once
    # the classes are balanced. Pinning the index to [0, 1] guarantees the bars
    # line up with the fixed tick labels and colors below.
    class_counts = labels.value_counts().reindex([0, 1], fill_value=0)
    class_counts.plot(kind='bar', ax=ax, color=['blue', 'red'])
    ax.set_title(title)
    ax.set_xlabel('Class')
    ax.set_ylabel('Count')
    ax.set_xticklabels(['Legitimate', 'Fraud'], rotation=0)

plt.tight_layout()
plt.show()

## Save Processed Data

In [None]:
def _dump_pickle(obj, filepath):
    """Serialize `obj` to `filepath` with pickle, closing the file afterwards."""
    with open(filepath, 'wb') as f:
        pickle.dump(obj, f)


# Create processed data directory.
# parents=True also creates missing intermediate directories (e.g. ../data);
# without it mkdir raises FileNotFoundError when the parent does not exist yet.
processed_dir = Path('../data/processed')
processed_dir.mkdir(parents=True, exist_ok=True)

# Save datasets: scaled train/test features, the SMOTE-resampled training set,
# and the matching label vectors. Each entry becomes "<name>.pkl".
datasets = {
    'X_train_original': X_train_scaled,
    'X_train_smote': X_train_smote,
    'X_test': X_test_scaled,
    'y_train_original': y_train,
    'y_train_smote': y_train_smote,
    'y_test': y_test
}

for name, data in datasets.items():
    filepath = processed_dir / f"{name}.pkl"
    _dump_pickle(data, filepath)
    print(f"Saved {name} to {filepath}")

# Save the fitted scaler and label encoder so the same transforms can be
# re-applied to new data later
_dump_pickle(scaler, processed_dir / 'scaler.pkl')
_dump_pickle(le_type, processed_dir / 'label_encoder_type.pkl')

# Save feature names (model inputs, all processed columns, original columns)
feature_info = {
    'feature_columns': feature_cols,
    'all_columns': list(df_processed.columns),
    'original_columns': list(df.columns)
}
_dump_pickle(feature_info, processed_dir / 'feature_info.pkl')

print("\nAll preprocessing artifacts saved successfully!")

## Preprocessing Summary

In [None]:
# Final preprocessing summary: a plain-text recap of shapes, engineered
# features, splits and class balance produced by the cells above.
print("=" * 60)
print("DATA PREPROCESSING SUMMARY")
print("=" * 60)

print("\n1. DATASET OVERVIEW:")
print(f"   - Original shape: {df.shape}")
print(f"   - Final shape: {df_processed.shape}")
print(f"   - Features created: {df_processed.shape[1] - df.shape[1]}")

# 'type_encoded' is the label-encoded column, not an indicator column. It shares
# the 'type_' prefix, so it has to be excluded explicitly or the one-hot count
# is over-reported by one.
one_hot_cols = [
    col for col in df_processed.columns
    if col.startswith('type_') and col != 'type_encoded'
]

print("\n2. FEATURE ENGINEERING:")
print("   - Balance difference features: 2")
print("   - Balance ratio features: 2")
print("   - Amount ratio features: 2")
print("   - Log transformed features: 5")
print(f"   - One-hot encoded features: {len(one_hot_cols)}")
print("   - Binary flag features: 3")

print("\n3. DATA SPLITS:")
print(f"   - Training set: {X_train.shape}")
print(f"   - Test set: {X_test.shape}")
print(f"   - Training set (after SMOTE): {X_train_smote.shape}")

print("\n4. CLASS DISTRIBUTION:")
print(f"   - Original train fraud rate: {train_fraud_rate:.3f}%")
print(f"   - SMOTE train fraud rate: {(y_train_smote.sum() / len(y_train_smote)) * 100:.1f}%")
print(f"   - Test fraud rate: {test_fraud_rate:.3f}%")

print("\n5. PREPROCESSING STEPS COMPLETED:")
print("   ✓ Data cleaning and validation")
print("   ✓ Feature engineering")
print("   ✓ Feature scaling (StandardScaler)")
print("   ✓ Train-test split (stratified)")
print("   ✓ Class imbalance handling (SMOTE)")
print("   ✓ Data persistence")

print("\n6. READY FOR MODELING:")
print(f"   - Features available: {len(feature_cols)}")
print("   - Balanced dataset for training")
print("   - Preserved test set for evaluation")
print("   - All preprocessing artifacts saved")

print("\n" + "="*60)
print("PREPROCESSING COMPLETED - Ready for model training")
print("="*60)