# Feature Engineering and Data Preprocessing

This notebook performs feature engineering and data transformation, and handles class imbalance, for both datasets (`Fraud_Data.csv` and `creditcard.csv`) as part of Task 1.


In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices from the imported libraries. Narrow the
# filter if those messages matter.
warnings.filterwarnings('ignore')

# Add src to path.
# The location is derived from the kernel's working directory, so this assumes
# the notebook is run from a folder that is a sibling of src/.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
src_dir = os.path.join(os.path.dirname(os.getcwd()), 'src')
if src_dir not in sys.path:
    sys.path.append(src_dir)

# Project-local helpers; these imports only resolve after the path tweak above.
from data_cleaning import clean_fraud_data, clean_creditcard_data, ip_to_integer, merge_ip_to_country
from feature_engineering import create_time_features, create_transaction_frequency_features, create_aggregated_features
from data_transformation import prepare_data_for_modeling, encode_categorical_features, scale_numerical_features, handle_class_imbalance

# Set style: seaborn grid theme and a default figure size for all plots
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

print("All libraries imported successfully!")


## Part 1: Fraud_Data.csv Feature Engineering


In [None]:
# Read the raw Fraud_Data export and pass it through the project cleaning step
print("\n".join(["=" * 60, "Processing Fraud_Data.csv", "=" * 60]))

fraud_df = pd.read_csv('../data/raw/Fraud_Data.csv')
print("Original shape:", fraud_df.shape)

# clean_fraud_data is imported from the project's data_cleaning module
fraud_df_clean = clean_fraud_data(fraud_df)
print("Shape after cleaning:", fraud_df_clean.shape)


In [None]:
# Geolocation Integration: attach a country to each transaction via its IP
# address, then summarise fraud by country.
print("\n" + "=" * 60)
print("Geolocation Integration")
print("=" * 60)

# Load IP to Country mapping
ip_country_df = pd.read_csv('../data/raw/IpAddress_to_Country.csv')
print(f"IP Country mapping shape: {ip_country_df.shape}")

# Convert IP address columns to integer
ip_country_df['lower_bound_ip_address'] = ip_country_df['lower_bound_ip_address'].apply(ip_to_integer)
ip_country_df['upper_bound_ip_address'] = ip_country_df['upper_bound_ip_address'].apply(ip_to_integer)

# Merge fraud data with country mapping
fraud_df_with_country = merge_ip_to_country(fraud_df_clean, ip_country_df)
print(f"Shape after country merge: {fraud_df_with_country.shape}")

# Count a row as matched only when it actually received a country.
# merge_ip_to_country lives in src/ and is not visible here: if it keeps
# unmatched rows (with a missing country), comparing row counts alone would
# always report 100% and hide failed lookups. When the merge drops unmatched
# rows instead, this yields the same numbers as a plain row-count comparison.
has_country = 'country' in fraud_df_with_country.columns
total_rows = len(fraud_df_clean)
if has_country:
    matched_rows = int(fraud_df_with_country['country'].notna().sum())
else:
    matched_rows = len(fraud_df_with_country)
# Guard against an empty cleaned frame so the report does not raise ZeroDivisionError
match_pct = (matched_rows / total_rows * 100) if total_rows else 0.0
print(f"Rows matched: {matched_rows} / {total_rows} ({match_pct:.2f}%)")

# Analyze fraud patterns by country
if has_country:
    # Named aggregation yields flat, self-describing column names directly,
    # instead of building MultiIndex columns and overwriting them afterwards.
    country_fraud = (
        fraud_df_with_country
        .groupby('country')['class']
        .agg(total_transactions='count', fraud_count='sum', fraud_rate='mean')
        .reset_index()
    )
    # NOTE(review): countries with very few transactions can top this ranking
    # on a handful of rows; read fraud_rate together with total_transactions.
    print("\nTop 5 countries by fraud rate:")
    print(country_fraud.sort_values('fraud_rate', ascending=False).head())


In [None]:
# Derive time-based features from the country-enriched fraud data
print("\n".join(["", "=" * 60, "Creating Time-based Features", "=" * 60]))

fraud_df_feat = create_time_features(fraud_df_with_country)

# Report which of the expected time features are present in the result
print("Time features created:")
time_features = ['hour_of_day', 'day_of_week', 'time_since_signup']
for feat in time_features:
    if feat not in fraud_df_feat.columns:
        continue
    print("  -", feat)

# Summary statistics for time_since_signup, when that column was produced
if 'time_since_signup' in fraud_df_feat.columns:
    print("\ntime_since_signup statistics:")
    print(fraud_df_feat['time_since_signup'].describe())


In [None]:
# Add transaction frequency / velocity features to the feature frame
print("\n".join(["", "=" * 60, "Creating Transaction Frequency Features", "=" * 60]))

# time_windows units are defined by the helper (presumably hours - TODO confirm)
fraud_df_feat = create_transaction_frequency_features(
    fraud_df_feat,
    time_windows=[1, 6, 24],
)

# List every column whose name marks it as a frequency or velocity feature
print("Frequency features created:")
freq_features = [
    name for name in fraud_df_feat.columns
    if any(tag in name for tag in ('transactions_in', 'transaction_velocity'))
]
for feat in freq_features:
    print("  -", feat)

# Summary statistics for transaction_velocity, when that column was produced
if 'transaction_velocity' in fraud_df_feat.columns:
    print("\ntransaction_velocity statistics:")
    print(fraud_df_feat['transaction_velocity'].describe())


In [None]:
# Add aggregated features to the feature frame
print("\n".join(["", "=" * 60, "Creating Aggregated Features", "=" * 60]))

fraud_df_feat = create_aggregated_features(fraud_df_feat)

# Columns prefixed with "user_" are reported as aggregated features; the
# user_id identifier itself is excluded from that list.
print("Aggregated features created:")
agg_features = [
    name for name in fraud_df_feat.columns
    if name != 'user_id' and name.startswith('user_')
]
for feat in agg_features:
    print("  -", feat)

print("\nFinal feature count:", fraud_df_feat.shape[1])
print("Final shape:", fraud_df_feat.shape)


In [None]:
# Split the engineered columns into categorical and numerical groups
print("\n".join(["", "=" * 60, "Preparing Data for Transformation", "=" * 60]))

# Categorical candidates, kept only when present in the feature frame
categorical_cols = [
    name for name in ['source', 'browser', 'sex', 'country']
    if name in fraud_df_feat.columns
]

# Every numeric column except the target, identifiers and raw IP columns
numerical_cols = [
    name
    for name in fraud_df_feat.select_dtypes(include=[np.number]).columns.tolist()
    if name not in (
        'class', 'user_id', 'device_id', 'ip_integer',
        'lower_bound_ip_address', 'upper_bound_ip_address',
    )
]

print("Categorical columns ({}): {}".format(len(categorical_cols), categorical_cols))
print("\nNumerical columns ({}): {}... (showing first 10)".format(len(numerical_cols), numerical_cols[:10]))


In [None]:
# Data Transformation and Class Imbalance Handling for the fraud dataset:
# split/transform through the project helper, then persist every artefact.
print("\n" + "=" * 60)
print("Data Transformation and Class Imbalance Handling")
print("=" * 60)

# Prepare data for modeling
X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud, transformers_fraud = prepare_data_for_modeling(
    fraud_df_feat,
    target_col='class',
    categorical_cols=categorical_cols,
    numerical_cols=numerical_cols,
    test_size=0.2,
    random_state=42,
    handle_imbalance=True,
    imbalance_method='smote'  # Using SMOTE as recommended
)

print(f"\nTraining set shape: {X_train_fraud.shape}")
print(f"Test set shape: {X_test_fraud.shape}")
print(f"\nTraining target distribution:")
print(y_train_fraud.value_counts())
print(f"\nTest target distribution:")
print(y_test_fraud.value_counts())

# Save processed data.
# Create the output folder first: to_csv does not create missing parent
# directories, so on a fresh checkout the writes would fail with OSError.
processed_dir = '../data/processed'
os.makedirs(processed_dir, exist_ok=True)

fraud_outputs = {
    'fraud_data_processed.csv': fraud_df_feat,
    'fraud_X_train.csv': X_train_fraud,
    'fraud_X_test.csv': X_test_fraud,
    'fraud_y_train.csv': y_train_fraud,
    'fraud_y_test.csv': y_test_fraud,
}
for filename, frame in fraud_outputs.items():
    frame.to_csv(f"{processed_dir}/{filename}", index=False)

print("\nProcessed data saved to data/processed/")


## Part 2: creditcard.csv Feature Engineering


In [None]:
# Read the raw creditcard export and pass it through the project cleaning step
print("\n".join(["", "=" * 60, "Processing creditcard.csv", "=" * 60]))

cc_df = pd.read_csv('../data/raw/creditcard.csv')
print("Original shape:", cc_df.shape)

# clean_creditcard_data is imported from the project's data_cleaning module
cc_df_clean = clean_creditcard_data(cc_df)
print("Shape after cleaning:", cc_df_clean.shape)

# No feature engineering for this dataset: its features are already
# PCA-transformed, so only scaling and imbalance handling remain.
print("\nNote: Credit card data features (V1-V28) are already PCA-transformed.")


In [None]:
# Prepare creditcard data for modeling: split/transform through the project
# helper, then persist every artefact.
print("\n" + "=" * 60)
print("Data Transformation and Class Imbalance Handling")
print("=" * 60)

# Identify numerical columns (all V1-V28, Time, Amount), keeping only those
# that are still present after cleaning
numerical_cols_cc = [f'V{i}' for i in range(1, 29)] + ['Time', 'Amount']
numerical_cols_cc = [col for col in numerical_cols_cc if col in cc_df_clean.columns]

# No categorical columns for creditcard data
categorical_cols_cc = []

# Prepare data for modeling
X_train_cc, X_test_cc, y_train_cc, y_test_cc, transformers_cc = prepare_data_for_modeling(
    cc_df_clean,
    target_col='Class',
    categorical_cols=categorical_cols_cc,
    numerical_cols=numerical_cols_cc,
    test_size=0.2,
    random_state=42,
    handle_imbalance=True,
    imbalance_method='smote'  # Using SMOTE as recommended
)

print(f"\nTraining set shape: {X_train_cc.shape}")
print(f"Test set shape: {X_test_cc.shape}")
print(f"\nTraining target distribution:")
print(y_train_cc.value_counts())
print(f"\nTest target distribution:")
print(y_test_cc.value_counts())

# Save processed data.
# Create the output folder first: to_csv does not create missing parent
# directories, so on a fresh checkout the writes would fail with OSError.
processed_dir = '../data/processed'
os.makedirs(processed_dir, exist_ok=True)

cc_outputs = {
    'creditcard_processed.csv': cc_df_clean,
    'creditcard_X_train.csv': X_train_cc,
    'creditcard_X_test.csv': X_test_cc,
    'creditcard_y_train.csv': y_train_cc,
    'creditcard_y_test.csv': y_test_cc,
}
for filename, frame in cc_outputs.items():
    frame.to_csv(f"{processed_dir}/{filename}", index=False)

print("\nProcessed data saved to data/processed/")


## Summary

### Task 1 Completion Checklist:

✅ **Data Cleaning**
- Handled missing values (imputed with median/mode)
- Removed duplicates
- Corrected data types (timestamps, numerical)

✅ **Exploratory Data Analysis**
- Univariate analysis: distributions of key variables
- Bivariate analysis: relationships between features and target
- Class distribution analysis: quantified the imbalance

✅ **Geolocation Integration**
- Converted IP addresses to integer format
- Merged Fraud_Data.csv with IpAddress_to_Country.csv using range-based lookup
- Analyzed fraud patterns by country

✅ **Feature Engineering (for Fraud_Data.csv)**
- Transaction frequency and velocity features
- Time-based features: hour_of_day, day_of_week, time_since_signup
- Aggregated user-level features

✅ **Data Transformation**
- Normalized/scaled numerical features (StandardScaler)
- Encoded categorical features (One-Hot Encoding)

✅ **Class Imbalance Handling**
- Applied SMOTE to training data only
- Documented class distribution before and after resampling
- Justified choice: SMOTE creates synthetic samples of the minority class, balancing the classes while preserving the information in the original data

### Key Decisions:

1. **SMOTE vs Undersampling**: Chose SMOTE because it keeps every original data point and adds synthetic minority-class samples. Undersampling would instead discard most legitimate transactions, which is a poor trade-off for highly imbalanced datasets where we have limited fraud cases.

2. **Feature Engineering**: Created time-based and frequency features to capture behavioral patterns that are strong indicators of fraud.

3. **Geolocation**: Integrated country information to identify high-risk regions for fraud.

### Next Steps:
- Proceed to Task 2: Model Building and Training
- Use the processed datasets saved in `data/processed/`
