# NIBSS Fraud Detection: Data Preprocessing and Pipeline Setup

This notebook covers the initial data preprocessing and pipeline setup for the NIBSS fraud detection system.

## Setup and Dependencies

In [None]:
# Install required packages if not available
!pip install -q \
    scikit-learn>=1.4.0 \
    imbalanced-learn>=0.12.0 \
    category-encoders>=2.7.0 \
    shap==0.45.0 \
    xgboost==2.0.3

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Data processing
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Imbalanced learning
from imblearn.over_sampling import SMOTE, SMOTENC
from imblearn.pipeline import Pipeline as ImbPipeline

# Metrics
from sklearn.metrics import (
    roc_auc_score, average_precision_score, f1_score,
    confusion_matrix, classification_report, precision_recall_curve,
    roc_curve, auc, make_scorer
)

# Utilities
import joblib
import gc

# Single seed shared by the notebook; also seeds NumPy's global RNG so that
# anything drawing from np.random is repeatable across runs.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Startup banner: confirms the import cell ran and records when.
print(
    "All packages imported successfully!",
    f"Current time: {datetime.now()}",
    sep="\n",
)

## Load Generated Dataset

Load the synthetic NIBSS fraud dataset generated using our dataset generator.

In [None]:
# Load the dataset
data_path = '../data/processed/nibss_fraud_dataset.csv'
df = pd.read_csv(data_path)

print(f"Dataset shape: {df.shape}")
print(f"Fraud rate: {df['is_fraud'].mean():.4f}")
# deep=True makes pandas inspect the contents of object (string) columns;
# the shallow default under-reports the footprint of such columns.
print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# Basic validation: dtype distribution and total count of missing cells
print("\nColumn types:")
print(df.dtypes.value_counts())
print("\nMissing values:")
print(df.isnull().sum().sum())

## Feature Engineering and Data Preparation

In [None]:
# Feature groups. The order of the names matters: it fixes the column order
# of X and therefore of everything derived from it.
numerical_features = [
    'amount',
    'tx_count_24h', 'amount_sum_24h',
    'amount_mean_7d', 'amount_std_7d',
    'tx_count_total', 'amount_mean_total', 'amount_std_total',
    'channel_diversity', 'location_diversity',
    'amount_vs_mean_ratio', 'online_channel_ratio',
    'hour_sin', 'hour_cos', 'day_sin', 'day_cos', 'month_sin', 'month_cos',
    'amount_log', 'amount_rounded',
    'velocity_score', 'merchant_risk_score', 'composite_risk',
]

categorical_low_cardinality = ['channel', 'age_group', 'is_weekend', 'is_peak_hour']

categorical_high_cardinality = ['merchant_category', 'bank', 'location']

# Additional time-based features (expected to already be in the dataset)
time_features = ['hour', 'day_of_week', 'month']

# Model inputs: numerical columns first, then the three non-numerical groups;
# the target is the `is_fraud` column.
feature_columns = [
    *numerical_features,
    *categorical_low_cardinality,
    *categorical_high_cardinality,
    *time_features,
]
X = df[feature_columns].copy()
y = df['is_fraud'].copy()

# Everything that is not in numerical_features is reported as categorical.
n_categorical = len(categorical_low_cardinality) + len(categorical_high_cardinality) + len(time_features)
print(f"Feature columns: {len(feature_columns)}")
print(f"Numerical features: {len(numerical_features)}")
print(f"Categorical features: {n_categorical}")

## Train-Validation-Test Split (70/15/15)

In [None]:
# Target share of the FULL dataset for each held-out set (train gets the rest).
TEST_FRACTION = 0.15
VAL_FRACTION = 0.15

# Stratified split to maintain fraud rate
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=TEST_FRACTION, random_state=RANDOM_SEED, stratify=y
)

# The second split only sees the remaining (1 - TEST_FRACTION) of the rows, so
# the validation share is rescaled to keep it at VAL_FRACTION of the full data
# (0.15 / 0.85 instead of the hand-rounded 0.176).
val_fraction_of_temp = VAL_FRACTION / (1 - TEST_FRACTION)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=val_fraction_of_temp, random_state=RANDOM_SEED, stratify=y_temp
)

# The three splits must partition the data: nothing lost, nothing duplicated.
assert len(X_train) + len(X_val) + len(X_test) == len(X)

print("Dataset splits:")
print(f"Train: {X_train.shape[0]} samples, fraud rate: {y_train.mean():.4f}")
print(f"Val: {X_val.shape[0]} samples, fraud rate: {y_val.mean():.4f}")
print(f"Test: {X_test.shape[0]} samples, fraud rate: {y_test.mean():.4f}")

# Save indices for reproducibility
split_indices = {
    'train': X_train.index.tolist(),
    'val': X_val.index.tolist(),
    'test': X_test.index.tolist()
}
joblib.dump(split_indices, '../data/processed/split_indices.pkl')

## Create Preprocessing Pipeline

In [None]:
from category_encoders import TargetEncoder

# Positions of the non-numerical columns within the raw feature order, printed
# for reference for a SMOTENC setup. They index the columns BEFORE the
# ColumnTransformer runs.
all_features = numerical_features + categorical_low_cardinality + categorical_high_cardinality + time_features
non_numeric_columns = (
    set(categorical_low_cardinality)
    | set(categorical_high_cardinality)
    | set(time_features)
)
categorical_indices = [
    position
    for position, column in enumerate(all_features)
    if column in non_numeric_columns
]

print(f"Categorical indices for SMOTENC: {categorical_indices}")

# One (name, transformer, columns) entry per feature group:
#   num      -> standardised
#   cat_low  -> one-hot encoded (first level dropped, unknown levels ignored)
#   cat_high -> target encoded with smoothing=10
#   time     -> passed through unchanged
column_transformers = [
    ('num', StandardScaler(), numerical_features),
    ('cat_low', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_low_cardinality),
    ('cat_high', TargetEncoder(smoothing=10), categorical_high_cardinality),
    ('time', 'passthrough', time_features),
]
preprocessor = ColumnTransformer(
    transformers=column_transformers,
    remainder='passthrough',
)

print("Preprocessing pipeline created successfully")

## Define Model Pipelines with SMOTE

In [None]:
# Imported here so this cell stays self-contained.
from sklearn.base import clone

# Calculate SMOTE strategy for 1:5 ratio (fraud : legitimate after resampling).
# For SMOTE a float sampling_strategy is the desired minority/majority ratio.
n_fraud_train = y_train.sum()
n_legit_train = len(y_train) - n_fraud_train
desired_fraud_samples = n_legit_train // 5
sampling_strategy = desired_fraud_samples / n_legit_train

print(f"Original fraud samples in train: {n_fraud_train}")
print(f"Original legitimate samples in train: {n_legit_train}")
print(f"SMOTE sampling strategy: {sampling_strategy:.4f}")

# Define models
models = {
    'logistic_regression': LogisticRegression(
        max_iter=500,  # Reduced from 1000
        random_state=RANDOM_SEED,
        solver='liblinear',  # Faster solver
        class_weight='balanced'
    ),
    'random_forest': RandomForestClassifier(
        random_state=RANDOM_SEED,
        n_jobs=-1,
        class_weight='balanced'
    ),
    # `use_label_encoder` is intentionally not passed: XGBoost deprecated it in
    # 1.6 (the label encoder itself was removed), so it no longer has any effect.
    'xgboost': XGBClassifier(
        random_state=RANDOM_SEED,
        eval_metric='logloss',
        tree_method='hist',
        n_jobs=-1
    )
}

# Create pipelines with SMOTE
pipelines = {}
for name, model in models.items():
    # Note: We use regular SMOTE here, but in production you might want SMOTENC
    # for mixed data types. However, after preprocessing, all features are numeric
    pipelines[name] = ImbPipeline([
        # Each pipeline gets its own unfitted copy of the preprocessor. A
        # Pipeline does not clone its steps, so sharing one instance would let
        # fitting one pipeline overwrite the fitted state used by the others.
        ('preprocessor', clone(preprocessor)),
        ('smote', SMOTE(sampling_strategy=sampling_strategy, random_state=RANDOM_SEED)),
        ('classifier', model)
    ])

print("Model pipelines created successfully")

## Define Hyperparameter Grids

In [None]:
# Search spaces keyed by pipeline name; the `classifier__` prefix routes each
# parameter to the pipeline's final 'classifier' step.
param_grids = {
    'logistic_regression': {
        'classifier__C': [0.1, 1.0, 10.0],  # Reduced from 4 to 3
        'classifier__penalty': ['l2']  # L2 only, to keep the grid small (the classifier uses liblinear)
    },
    'random_forest': {
        'classifier__n_estimators': [100, 200],  # Reduced from 3 to 2
        'classifier__max_depth': [20, None],     # Reduced from 4 to 2
        'classifier__min_samples_split': [10],   # Fixed at reasonable value
        'classifier__max_features': ['sqrt']     # Fixed at most common choice
    },
    'xgboost': {
        'classifier__learning_rate': [0.1, 0.3],      # Reduced from 3 to 2
        'classifier__max_depth': [6],                   # Fixed at good default
        'classifier__n_estimators': [100, 200],        # Reduced from 3 to 2
        'classifier__subsample': [1.0],                # Fixed
        'classifier__colsample_bytree': [1.0]          # Fixed
    }
}

# Custom scoring function for AUC-PR (more suitable for imbalanced data).
# Average precision must be computed from continuous scores. Without
# `response_method`, make_scorer feeds it the hard 0/1 labels from `predict`,
# which collapses the precision-recall curve to a single operating point.
# Prefer decision_function, falling back to predict_proba for models without it.
average_precision_scorer = make_scorer(
    average_precision_score,
    response_method=("decision_function", "predict_proba"),
)

print("Hyperparameter grids defined")

## Save Processed Data and Pipeline Components

In [None]:
# Imported here so this cell stays self-contained.
from pathlib import Path

# Save preprocessed data info
data_info = {
    # X was built as a row-for-row copy of df's feature columns, so len(X) is
    # the dataset size; using it keeps this cell re-runnable after the cleanup
    # cell has deleted df.
    'n_samples': len(X),
    'n_features': len(feature_columns),
    'fraud_rate': y.mean(),
    'numerical_features': numerical_features,
    'categorical_low': categorical_low_cardinality,
    'categorical_high': categorical_high_cardinality,
    'time_features': time_features,
    'train_size': len(X_train),
    'val_size': len(X_val),
    'test_size': len(X_test),
    'smote_ratio': sampling_strategy
}

# Save data splits
data_splits = {
    'X_train': X_train,
    'X_val': X_val,
    'X_test': X_test,
    'y_train': y_train,
    'y_val': y_val,
    'y_test': y_test
}

# Destination -> object, written in the same order as before.
artifacts = {
    '../data/processed/data_info.pkl': data_info,
    '../models/base_pipelines.pkl': pipelines,
    '../config/param_grids.pkl': param_grids,
    '../data/processed/data_splits.pkl': data_splits,
}
for artifact_path, payload in artifacts.items():
    # joblib.dump does not create missing directories; make sure the target
    # folder exists so a fresh checkout does not fail with FileNotFoundError.
    Path(artifact_path).parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(payload, artifact_path)

print("All data and pipeline components saved successfully!")
print("Ready for hyperparameter optimization in next notebook")

## Memory Cleanup

In [None]:
# Release the raw DataFrame; the feature/target copies (X, y and the splits)
# are separate objects and stay available.
try:
    del df
except NameError:
    # df was already removed by an earlier run of this cell; nothing to do.
    pass
gc.collect()
print("Memory cleaned up")