# XGBoost Model Development

This notebook implements the XGBoost model for our classification task, including:
- Loading processed data
- Training the model
- Model evaluation
- Model export for Kubeflow pipeline

In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import json
import os

# Fix the seed so reruns are reproducible. RANDOM_STATE seeds NumPy's legacy
# global RNG here and is also handed to XGBoost via `params['random_state']`.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

## Load Processed Data

In [None]:
# Directory holding the processed CSV splits.
PROCESSED_DIR = '../data/processed'


def load_split(split, data_dir=PROCESSED_DIR):
    """Read one processed split and return ``(features, target)``.

    Parameters
    ----------
    split : str
        Split suffix used in the file names, e.g. ``'train'`` reads
        ``X_train.csv`` and ``y_train.csv``.
    data_dir : str
        Directory containing the CSV files.

    Returns
    -------
    tuple
        ``(features, target)`` where ``features`` is the DataFrame read from
        ``X_<split>.csv`` and ``target`` is the ``target`` column of
        ``y_<split>.csv``.

    Raises
    ------
    ValueError
        If the feature and target files do not have the same number of rows.
    """
    features = pd.read_csv(f'{data_dir}/X_{split}.csv')
    target = pd.read_csv(f'{data_dir}/y_{split}.csv')['target']
    # Fail here with a clear message instead of later inside XGBoost.
    if len(features) != len(target):
        raise ValueError(
            f"Row count mismatch in '{split}' split: "
            f"{len(features)} feature rows vs {len(target)} target rows"
        )
    return features, target


# Load processed datasets
X_train, y_train = load_split('train')
X_val, y_val = load_split('val')
X_test, y_test = load_split('test')

# Every split must expose the same feature columns, in the same order, as the
# training split; otherwise the model would be evaluated on different inputs.
for split_name, split_features in (('validation', X_val), ('test', X_test)):
    if list(split_features.columns) != list(X_train.columns):
        raise ValueError(
            f"Feature columns of the {split_name} split differ from the training split"
        )

print("Data loaded successfully!")
print(f"Training set shape: {X_train.shape}")
print(f"Validation set shape: {X_val.shape}")
print(f"Test set shape: {X_test.shape}")

## Create DMatrix Objects

XGBoost's DMatrix is an optimized data structure for training.

In [None]:
# Wrap each (features, labels) pair in XGBoost's DMatrix container.
dtrain, dval, dtest = (
    xgb.DMatrix(features, label=labels)
    for features, labels in (
        (X_train, y_train),
        (X_val, y_val),
        (X_test, y_test),
    )
)

# Evaluation sets reported during training. Order matters: xgb.train's early
# stopping monitors the last entry, i.e. the validation set.
watchlist = [
    (dtrain, 'train'),
    (dval, 'validation'),
]

## Define Model Parameters

These parameters will later be optimized using Katib.

In [None]:
# Baseline hyperparameters for the booster.
# - `eval_metric` order matters: with early stopping, XGBoost tracks the last
#   metric listed (here 'logloss').
# - `n_estimators` is read by the training cell as the number of boosting rounds.
params = dict(
    objective='binary:logistic',
    eval_metric=['error', 'auc', 'logloss'],
    max_depth=6,
    learning_rate=0.1,
    n_estimators=500,
    min_child_weight=1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=RANDOM_STATE,
)

## Train Model with Early Stopping

In [None]:
# `n_estimators` is a scikit-learn-wrapper name. The native `xgb.train` API
# takes the round count as `num_boost_round`, so it is pulled out of the
# booster parameters (passing it through makes XGBoost warn that the parameter
# is not used).
num_rounds = params['n_estimators']
booster_params = {key: value for key, value in params.items() if key != 'n_estimators'}

model = xgb.train(
    booster_params,
    dtrain,
    num_boost_round=num_rounds,
    evals=watchlist,
    early_stopping_rounds=50,  # stop after 50 rounds without improvement
    verbose_eval=100           # log the metrics every 100 rounds
)

# Early stopping monitors the LAST metric in `eval_metric` on the LAST entry of
# `evals`, so `best_score` is that metric on the validation set -- label it
# accordingly instead of assuming it is the classification error.
eval_metrics = params.get('eval_metric') or 'score'
monitored_metric = eval_metrics if isinstance(eval_metrics, str) else eval_metrics[-1]

print(f"Best iteration: {model.best_iteration}")
print(f"Best validation {monitored_metric}: {model.best_score}")

## Model Evaluation

In [None]:
def evaluate_model(model, X, y, dataset_name, threshold=0.5):
    """Score a trained booster on one dataset and plot its confusion matrix.

    Parameters
    ----------
    model
        Trained booster; its ``predict`` is called on an ``xgb.DMatrix``
        built from ``X`` and is expected to return one score per row.
    X
        Feature data accepted by ``xgb.DMatrix``.
    y
        True labels, compared against the thresholded predictions.
    dataset_name : str
        Label used in the printed header and the plot title.
    threshold : float, default 0.5
        Scores strictly greater than this value are classified as 1,
        everything else as 0.

    Returns
    -------
    dict
        ``'accuracy'``, ``'confusion_matrix'`` (nested list, via ``tolist()``)
        and ``'classification_report'`` (the text report).
    """
    # NOTE(review): predict() is called without `iteration_range`, so the trees
    # used are whatever the installed XGBoost version defaults to -- confirm
    # whether that is the full model or only up to `best_iteration`.
    dmatrix = xgb.DMatrix(X)
    y_pred = model.predict(dmatrix)
    y_pred_binary = (y_pred > threshold).astype(int)

    accuracy = accuracy_score(y, y_pred_binary)
    report = classification_report(y, y_pred_binary)
    conf_matrix = confusion_matrix(y, y_pred_binary)

    print(f"\n{dataset_name} Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(report)

    # Explicit figure/axes instead of the implicit pyplot state machine.
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(f'Confusion Matrix - {dataset_name}')
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')
    plt.show()

    return {
        'accuracy': accuracy,
        # tolist() turns the ndarray into nested lists so it can be JSON-dumped.
        'confusion_matrix': conf_matrix.tolist(),
        'classification_report': report
    }

# Run the same evaluation on every split, in train -> validation -> test order.
train_metrics, val_metrics, test_metrics = [
    evaluate_model(model, features, labels, title)
    for features, labels, title in (
        (X_train, y_train, "Training Set"),
        (X_val, y_val, "Validation Set"),
        (X_test, y_test, "Test Set"),
    )
]

## Feature Importance Analysis

In [None]:
# Gain-based importance as reported by the booster: {feature name: gain}.
feature_importance = model.get_score(importance_type='gain')

# One row per feature, largest gain first.
importance_df = (
    pd.DataFrame(
        list(feature_importance.items()),
        columns=['feature', 'importance'],
    )
    .sort_values('importance', ascending=False)
)

# Horizontal bar chart of the 20 highest-gain features.
top_features = importance_df.head(20)
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=top_features, x='importance', y='feature', ax=ax)
ax.set_title('Top 20 Feature Importance (Gain)')
plt.show()

## Save Model and Metrics

In [None]:
def _dump_json(payload, path):
    """Write ``payload`` to ``path`` as JSON indented by two spaces."""
    with open(path, 'w') as f:
        json.dump(payload, f, indent=2)


# Make sure every output directory exists before anything is written.
for out_dir in ('../models/xgboost', '../models/metrics', '../models/serving'):
    os.makedirs(out_dir, exist_ok=True)

# Booster itself, serialized by XGBoost (format follows the .json extension).
model.save_model('../models/serving/model.json')

feature_names = list(X_train.columns)

# Training configuration plus the early-stopping outcome.
model_config = {
    'feature_names': feature_names,
    'parameters': params,
    'best_iteration': model.best_iteration,
    'best_score': float(model.best_score),
}
_dump_json(model_config, '../models/serving/model_config.json')

# Feature names and count, stored next to the model.
feature_info = {
    'feature_names': list(feature_names),
    'n_features': len(feature_names),
}
_dump_json(feature_info, '../models/serving/feature_info.json')

# Per-split evaluation results and the feature-importance table.
metrics = {
    'train': train_metrics,
    'validation': val_metrics,
    'test': test_metrics,
    'feature_importance': importance_df.to_dict(orient='records'),
}
_dump_json(metrics, '../models/metrics/evaluation_metrics.json')

print("Model, configuration, and metrics saved successfully!")