# Data Preprocessing for XGBoost Model

This notebook handles data preprocessing steps including:
- Loading the raw data
- Handling missing values
- Feature scaling
- Feature engineering
- Saving processed data for model training

In [None]:
import json
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Pin NumPy's global random generator so reruns of the notebook are repeatable
RANDOM_STATE = 42
np.random.seed(seed=RANDOM_STATE)

## Load Raw Data

In [None]:
# Read the three raw splits from disk
train_data, val_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/raw/train.csv',
        '../data/raw/validation.csv',
        '../data/raw/test.csv',
    )
)

# Split every frame into its feature matrix (all columns except 'target')
# and its label column
X_train, y_train = train_data.drop(columns='target'), train_data['target']
X_val, y_val = val_data.drop(columns='target'), val_data['target']
X_test, y_test = test_data.drop(columns='target'), test_data['target']

# Report the feature-matrix dimensions as a quick sanity check
print("Data loaded successfully!")
print(f"Training set shape: {X_train.shape}")
print(f"Validation set shape: {X_val.shape}")
print(f"Test set shape: {X_test.shape}")

## Handle Missing Values

We'll use median imputation for missing values since it's more robust to outliers than mean imputation.

In [None]:
# Median imputer; its statistics are learned from the training split alone
imputer = SimpleImputer(strategy='median')

# Fit on the training features and transform them, then re-wrap the result
# in a DataFrame carrying the original column labels and row index
X_train_imputed = pd.DataFrame(
    imputer.fit_transform(X_train), columns=X_train.columns, index=X_train.index
)

# Apply the already-fitted imputer to the held-out splits (transform only)
X_val_imputed, X_test_imputed = (
    pd.DataFrame(imputer.transform(split), columns=split.columns, index=split.index)
    for split in (X_val, X_test)
)

print("Missing values handled successfully!")

## Feature Scaling

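`StandardScaler` standardizes each feature to zero mean and unit variance. It is fit on the training set only, and the same transformation is then applied to the validation and test sets.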

In [None]:
# Standard scaler; its statistics are learned from the training split alone
scaler = StandardScaler()

# Fit on the imputed training features and transform them, then re-wrap the
# result in a DataFrame carrying the original column labels and row index
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train_imputed),
    columns=X_train_imputed.columns,
    index=X_train_imputed.index,
)

# Apply the already-fitted scaler to the held-out splits (transform only)
X_val_scaled, X_test_scaled = (
    pd.DataFrame(scaler.transform(split), columns=split.columns, index=split.index)
    for split in (X_val_imputed, X_test_imputed)
)

print("Feature scaling completed!")

## Feature Engineering

Create pairwise interaction (product) features between the first five columns of the feature matrix, which are assumed here to be the most important features, to capture non-linear relationships.

In [None]:
def create_interaction_features(X, n_base=5):
    """Return a copy of ``X`` with pairwise product features appended.

    The first ``n_base`` columns of ``X`` are used as base features. For every
    unordered pair ``(a, b)`` of them, a column named ``"<a>_x_<b>"`` holding
    the element-wise product of the two columns is added.

    Args:
        X: DataFrame whose leading columns support element-wise
            multiplication.
        n_base: Number of leading columns to combine (default 5). If ``X`` has
            fewer columns, all of them are used.

    Returns:
        A new DataFrame with the original columns followed by the interaction
        columns. The input ``X`` is left unmodified.

    Raises:
        ValueError: If ``n_base`` is negative.
    """
    if n_base < 0:
        raise ValueError(f"n_base must be non-negative, got {n_base}")

    # Work on a copy so the caller's frame is never mutated, even when the
    # caller forgets to pass a copy.
    result = X.copy()

    # Fix the base features up front so columns added below are never
    # themselves used as interaction inputs.
    base_features = list(X.columns[:n_base])

    for i, feat1 in enumerate(base_features):
        for feat2 in base_features[i + 1:]:
            result[f"{feat1}_x_{feat2}"] = X[feat1] * X[feat2]

    return result

# Build the final feature matrices for every split; each scaled frame is
# copied before the call so the scaled frames themselves are never modified
X_train_final, X_val_final, X_test_final = (
    create_interaction_features(scaled_split.copy())
    for scaled_split in (X_train_scaled, X_val_scaled, X_test_scaled)
)

print("Feature engineering completed!")
print(f"Final number of features: {X_train_final.shape[1]}")

## Save Processed Data

In [None]:
# Create processed data directory
os.makedirs('../data/processed', exist_ok=True)

# Map each output file to the frame written there. Targets are wrapped in a
# DataFrame so each is written as a single column with a header row.
processed_outputs = {
    '../data/processed/X_train.csv': X_train_final,
    '../data/processed/y_train.csv': pd.DataFrame(y_train),
    '../data/processed/X_val.csv': X_val_final,
    '../data/processed/y_val.csv': pd.DataFrame(y_val),
    '../data/processed/X_test.csv': X_test_final,
    '../data/processed/y_test.csv': pd.DataFrame(y_test),
}
for output_path, frame in processed_outputs.items():
    # index=False: the row index is not written, so the X and y files line up
    # by row position only
    frame.to_csv(output_path, index=False)

# Persist the fitted imputer and scaler so the same transformations can be
# re-applied at inference time
os.makedirs('../models/preprocessing', exist_ok=True)
joblib.dump(imputer, '../models/preprocessing/imputer.joblib')
joblib.dump(scaler, '../models/preprocessing/scaler.joblib')

print("Processed data and preprocessing objects saved successfully!")

## Save Feature Information

In [None]:
# Save feature names, split into raw inputs and engineered interactions
original_features = list(X_train.columns)
original_feature_set = set(original_features)

feature_info = {
    'original_features': original_features,
    # Any column absent from the raw inputs was added by feature engineering.
    # Matching on the '_x_' substring instead would misclassify raw columns
    # whose names happen to contain it.
    'interaction_features': [
        col for col in X_train_final.columns if col not in original_feature_set
    ],
    'total_features': list(X_train_final.columns)
}

with open('../data/processed/feature_info.json', 'w') as f:
    json.dump(feature_info, f, indent=2)

print("Feature information saved successfully!")