# XGBoost Training Component
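This notebook trains an XGBoost binary classifier on the preprocessed training data, evaluates it on the validation set, and saves the trained model together with its metrics.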

In [None]:
# Elyra Pipeline Parameters
# NOTE(review): presumably these are overridden by Elyra at pipeline run time and the
# values below only act as defaults for interactive runs — confirm against the pipeline.
# Directory the notebook reads 'X_train.csv' and 'X_val.csv' from.
input_dir = '../data/processed'
# Directory that receives 'model.json' and 'metrics.json' (created if it does not exist).
output_dir = '../models/xgboost'

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import accuracy_score, roc_auc_score
import json
import os

In [None]:
# Read the preprocessed training and validation tables from input_dir
train_csv = os.path.join(input_dir, 'X_train.csv')
val_csv = os.path.join(input_dir, 'X_val.csv')
train_data = pd.read_csv(train_csv)
val_data = pd.read_csv(val_csv)

# Each table carries its label in the 'target' column; everything else is a feature
X_train = train_data.drop(columns=['target'])
y_train = train_data['target']
X_val = val_data.drop(columns=['target'])
y_val = val_data['target']

In [None]:
# Hyperparameters for the booster.
# 'n_estimators' is read by the training cell as the number of boosting rounds;
# the whole dict is also written to metrics.json, so key order is kept stable.
params = dict(
    objective='binary:logistic',
    eval_metric=['error', 'auc', 'logloss'],
    learning_rate=0.1,
    max_depth=6,
    n_estimators=500,
    min_child_weight=1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)

In [None]:
# Create DMatrix objects
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

# Set up watchlist.
# Early stopping monitors the LAST entry (validation) using the LAST metric listed
# in params['eval_metric'].
watchlist = [(dtrain, 'train'), (dval, 'validation')]

# 'n_estimators' is the scikit-learn wrapper's name for the round count. The native
# xgb.train API takes it as num_boost_round and warns about it as an unused booster
# parameter, so it is split out of the dict handed to the booster.
num_boost_round = params['n_estimators']
booster_params = {
    name: value for name, value in params.items() if name != 'n_estimators'
}

# Train model.
# `evals` is keyword-only in current xgb.train; passing it positionally is deprecated.
model = xgb.train(
    booster_params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=watchlist,
    early_stopping_rounds=50,  # stop after 50 rounds without validation improvement
    verbose_eval=100,          # log evaluation results every 100 rounds
)

In [None]:
# Make predictions.
# With the native API, Booster.predict uses every trained tree by default, including
# the rounds trained after the best validation score. Restrict prediction to the best
# early-stopped iteration so the metrics below describe the same model state as
# 'best_iteration' and 'best_score'.
best_rounds = (0, model.best_iteration + 1)  # iteration_range end is exclusive
y_train_pred = model.predict(dtrain, iteration_range=best_rounds)
y_val_pred = model.predict(dval, iteration_range=best_rounds)

# Calculate metrics.
# Predictions are probabilities (binary:logistic); accuracy thresholds them at 0.5.
# Values are cast to plain Python numbers so the dict is JSON-serializable.
metrics = {
    'train_accuracy': float(accuracy_score(y_train, y_train_pred > 0.5)),
    'train_auc': float(roc_auc_score(y_train, y_train_pred)),
    'val_accuracy': float(accuracy_score(y_val, y_val_pred > 0.5)),
    'val_auc': float(roc_auc_score(y_val, y_val_pred)),
    'best_iteration': int(model.best_iteration),
    'best_score': float(model.best_score)
}

print("Training metrics:")
for metric, value in metrics.items():
    print(f"{metric}: {value}")

In [None]:
# Make sure the destination folder exists before writing anything
os.makedirs(output_dir, exist_ok=True)

# Resolve both artifact locations up front
model_path = os.path.join(output_dir, 'model.json')
metrics_path = os.path.join(output_dir, 'metrics.json')

# Persist the booster in XGBoost's JSON format
model.save_model(model_path)

# Persist the evaluation metrics alongside the hyperparameters and feature names
report = {
    'metrics': metrics,
    'parameters': params,
    'feature_names': list(X_train.columns),
}
with open(metrics_path, 'w') as metrics_file:
    json.dump(report, metrics_file, indent=2)

print(f'Saved model to: {model_path}')
print(f'Saved metrics to: {metrics_path}')