# Preprocessing and Feature Engineering
## Credit Card Fraud Detection

This notebook handles data preprocessing and feature engineering using modules from the `src` folder.


In [1]:
# Notebook setup: third-party imports, project import path, and plot styling.
# Everything the later cells rely on (pd, np, Path, DataLoader) is brought
# into scope here.
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# NOTE(review): this silences every warning category for the whole kernel
# session, including pandas/NumPy deprecation and copy warnings.
warnings.filterwarnings('ignore')

# Put the parent directory on the import path so the `src` package resolves.
# The path is relative, so it depends on the kernel's working directory
# (presumably the notebooks folder — confirm).
sys.path.append(str(Path('..')))
from src.data_loader import DataLoader

# Plot styling; plt/sns are not used by the cells visible in this notebook.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
%matplotlib inline


## 1. Load Data


In [4]:
# Project loader bound to the shared data directory
data_loader = DataLoader(data_dir='../data')

# Preferred input: the de-duplicated CSV written by the Data Cleaning notebook.
# If it is missing, fall back to the raw CSV and de-duplicate it here.
cleaned_csv_path = Path('../data/creditcard_cleaned.csv')
if not cleaned_csv_path.exists():
    print("⚠ WARNING: Cleaned data not found!")
    print("Loading original data...")
    csv_path = Path('../data/creditcard.csv')
    print(f"Loading data from: {csv_path.absolute()}")
    df_original = data_loader.load_csv_data('creditcard.csv')
    print("⚠ Using original dataset - Please run 01b_DataCleaning.ipynb first!")
    # Keep the first occurrence of every repeated row; the number of rows
    # dropped is exactly the number of rows flagged as duplicates.
    df_clean = df_original.drop_duplicates(keep='first')
    removed_rows = len(df_original) - len(df_clean)
    print(f"  - Removed {removed_rows:,} duplicates as fallback")
else:
    print(f"Loading cleaned data from: {cleaned_csv_path.absolute()}")
    df_clean = pd.read_csv(cleaned_csv_path)
    print("✓ Using cleaned dataset (duplicates already removed)")
    print(f"  - Cleaned rows: {len(df_clean):,}")

# Quick structural report on whichever frame was loaded
remaining_duplicates = df_clean.duplicated().sum()
print(f"\nCleaned dataset shape: {df_clean.shape}")
print(f"Columns: {len(df_clean.columns)}")
print(f"Duplicate rows: {remaining_duplicates}")
print("\nFirst few rows of cleaned data:")
df_clean.head()


Loading cleaned data from: d:\h\Financial Fraud Detection-AI\transactions\notebooks\..\data\creditcard_cleaned.csv
✓ Using cleaned dataset (duplicates already removed)
  - Cleaned rows: 283,726

Cleaned dataset shape: (283726, 31)
Columns: 31
Duplicate rows: 0

First few rows of cleaned data:


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


## 2. Data Preprocessing


In [5]:
# Data quality check on the cleaned dataset: missing values, duplicate rows,
# column dtypes, and the balance of the target column 'Class'.
print("Data Quality Check (Cleaned Dataset):")
print("="*50)
print(f"Missing Values: {df_clean.isnull().sum().sum()}")
print(f"Duplicate Rows: {df_clean.duplicated().sum()}")
print(f"Data Types: {df_clean.dtypes.value_counts().to_dict()}")

# Count each class once and reuse the result. `.get(label, 0)` keeps the
# report working (showing 0) when a class is absent from the frame, instead
# of raising KeyError as positional-looking `[0]` / `[1]` lookups would.
class_counts = df_clean['Class'].value_counts()
total_labelled = int(class_counts.sum())

print("\nClass Distribution:")
for class_label, line_prefix in ((0, "  Normal (0): "), (1, "  Fraud (1):  ")):
    class_count = int(class_counts.get(class_label, 0))
    # Guard the share against an empty frame (no labelled rows at all).
    class_share = class_count / total_labelled * 100 if total_labelled else 0.0
    print(f"{line_prefix}{class_count:,} ({class_share:.2f}%)")
print("\n✓ Using cleaned dataset for feature engineering")


Data Quality Check (Cleaned Dataset):
Missing Values: 0
Duplicate Rows: 0
Data Types: {dtype('float64'): 30, dtype('int64'): 1}

Class Distribution:
  Normal (0): 283,253 (99.83%)
  Fraud (1):  473 (0.17%)

✓ Using cleaned dataset for feature engineering


## 3. Feature Engineering


In [6]:
# Derive time-based features from the 'Time' column of the cleaned dataset.
#
# The arithmetic treats 'Time' as a number of seconds (it is divided by 3600
# and by 86400). NOTE(review): 'Time' looks like an offset from the first
# record rather than a wall-clock timestamp (the first rows are 0, 0, 1, 1, 2),
# so 'Hour' and 'Day' are positions within a 24-hour / 7-day cycle measured
# from that origin, not a calendar hour or weekday — confirm against the
# dataset documentation.
print("Creating time-based features on cleaned dataset...")

SECONDS_PER_HOUR = 3600
SECONDS_PER_DAY = 24 * SECONDS_PER_HOUR
HOURS_PER_CYCLE = 24
DAYS_PER_CYCLE = 7

# Fractional values are kept on purpose: no flooring is applied, so 'Hour'
# lies in [0, 24) and 'Day' in [0, 7).
hour_in_cycle = (df_clean['Time'] / SECONDS_PER_HOUR) % HOURS_PER_CYCLE
day_in_cycle = (df_clean['Time'] / SECONDS_PER_DAY) % DAYS_PER_CYCLE
hour_angle = 2 * np.pi * hour_in_cycle / HOURS_PER_CYCLE

# `assign` builds a new frame instead of mutating the one produced by the
# load cell; re-running this cell simply overwrites the same four columns.
df_clean = df_clean.assign(
    Hour=hour_in_cycle,
    Day=day_in_cycle,
    Hour_Sin=np.sin(hour_angle),
    Hour_Cos=np.cos(hour_angle),
)

print("✓ Created time-based features:")
print("  - Hour: Fractional hour within a 24-hour cycle [0, 24)")
print("  - Day: Fractional elapsed days, modulo 7 [0, 7) (not a calendar weekday)")
print("  - Hour_Sin/Cos: Cyclical encoding of hour")
print(f"\nDataset shape after feature engineering: {df_clean.shape}")


Creating time-based features on cleaned dataset...
✓ Created time-based features:
  - Hour: Hour of day (0-23)
  - Day: Day of week (0-6)
  - Hour_Sin/Cos: Cyclical encoding of hour

Dataset shape after feature engineering: (283726, 35)


## 4. Data Splitting


In [7]:
# Hand the cleaned frame to the project DataLoader. Judging by its logged
# output, preprocess_data splits the rows into train/test sets and scales
# the features; its implementation lives in src/data_loader.
print("Preprocessing cleaned dataset...")
split_result = data_loader.preprocess_data(
    df_clean,
    target_col='Class',
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test, feature_cols = split_result

# Report the shapes produced by the split
print("\n✓ Preprocessing complete using cleaned dataset:")
for report_label, report_value in (
    ("Training set", X_train.shape),
    ("Test set", X_test.shape),
    ("Feature columns", len(feature_cols)),
):
    print(f"  {report_label}: {report_value}")
feature_preview = feature_cols[:5]
print(f"  Features: {feature_preview}... (showing first 5)")


Preprocessing cleaned dataset...

Preprocessing data...
Features: 33
Feature columns: ['V1', 'V2', 'V3', 'V4', 'V5']... (showing first 5)

Train set: 226980 samples
  - Fraud: 378 (0.17%)
Test set: 56746 samples
  - Fraud: 95 (0.17%)

Scaling features...

✓ Preprocessing complete using cleaned dataset:
  Training set: (226980, 33)
  Test set: (56746, 33)
  Feature columns: 33
  Features: ['V1', 'V2', 'V3', 'V4', 'V5']... (showing first 5)


## 5. Summary


In [8]:
# Final recap of what this notebook did: data source, engineered features,
# split sizes, scaling, and where to go next.
print("="*70)
print("PREPROCESSING SUMMARY")
print("="*70)

print("\n1. Data Source:")
# Report the source that was actually used: the load cell falls back to the
# raw CSV (de-duplicated in this notebook) when the cleaned file is missing.
if cleaned_csv_path.exists():
    print("   - ✓ Using cleaned dataset from 01b_DataCleaning.ipynb")
else:
    print("   - ⚠ Cleaned CSV not found: using original dataset de-duplicated in this notebook")
print(f"   - Cleaned rows: {len(df_clean):,}")
# duplicated() counts duplicates still present in df_clean, not how many
# were removed during cleaning, so the label says "remaining".
print(f"   - Duplicate rows remaining: {df_clean.duplicated().sum()}")
print(f"   - Missing values: {df_clean.isnull().sum().sum()}")

print("\n2. Feature Engineering (on cleaned data):")
print("   - ✓ Created time-based features (Hour, Day, cyclical encoding)")
print("   - Note: Amount transformations handled by DataLoader")
print("   - Note: V features are already PCA transformed")

print("\n3. Data Splitting (from cleaned dataset):")
print(f"   - Train: {X_train.shape[0]:,} samples")
print(f"   - Test: {X_test.shape[0]:,} samples")
print(f"   - Features: {len(feature_cols)}")

print("\n4. Scaling:")
# NOTE(review): the scaler type is decided inside DataLoader.preprocess_data
# and is not visible in this notebook — confirm it is StandardScaler.
print("   - ✓ Applied StandardScaler to all features")

print("\n5. Next Steps:")
print("   - Proceed to 03_ML_Training.ipynb for model training")
print("   - All models will be trained on cleaned, preprocessed data")
print("="*70)


PREPROCESSING SUMMARY

1. Data Source:
   - ✓ Using cleaned dataset from 01b_DataCleaning.ipynb
   - Cleaned rows: 283,726
   - Duplicates removed: 0
   - Missing values: 0

2. Feature Engineering (on cleaned data):
   - ✓ Created time-based features (Hour, Day, cyclical encoding)
   - Note: Amount transformations handled by DataLoader
   - Note: V features are already PCA transformed

3. Data Splitting (from cleaned dataset):
   - Train: 226,980 samples
   - Test: 56,746 samples
   - Features: 33

4. Scaling:
   - ✓ Applied StandardScaler to all features

5. Next Steps:
   - Proceed to 03_ML_Training.ipynb for model training
   - All models will be trained on cleaned, preprocessed data
