# Diabetes Progression Prediction with XGBoost

This example demonstrates training an XGBoost regressor to predict diabetes disease progression.

## Dataset

- **Name**: Diabetes Dataset
- **Samples**: 442
- **Features**: 10 baseline variables
- **Target**: Quantitative measure of disease progression one year after baseline
- **Type**: Regression

## Model

- **Framework**: XGBoost
- **Type**: Gradient Boosting Regressor
- **Objective**: Squared error regression

## Features

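The dataset includes 10 baseline variables. Note that in the scikit-learn copy loaded by `load_diabetes()`, every feature has been mean-centered and scaled, so the values seen at runtime are standardized numbers rather than the raw units described below: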
- `age`: Age in years
- `sex`: Sex
- `bmi`: Body mass index
- `bp`: Average blood pressure
- `s1`: Total serum cholesterol
- `s2`: Low-density lipoproteins
- `s3`: High-density lipoproteins
- `s4`: Total cholesterol / HDL
- `s5`: Log of serum triglycerides level
- `s6`: Blood sugar level

In [None]:
%pip install xgboost pandas numpy scikit-learn mlflow

In [None]:
import os
import argparse
import json
import tempfile
import numpy as np
import pandas as pd
from datetime import datetime

# XGBoost imports
import xgboost as xgb

# MLflow imports
import mlflow
import mlflow.xgboost
from mlflow import set_tracking_uri, set_experiment
from mlflow.client import MlflowClient
from mlflow.models import infer_signature

# Scikit-learn imports
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
def setup_mlflow(mlflow_uri: str, username: str, password: str) -> MlflowClient:
    """Point MLflow at a tracking server and build a client for it.

    The credentials are exported into the process environment as
    ``MLFLOW_TRACKING_USERNAME`` and ``MLFLOW_TRACKING_PASSWORD`` before the
    tracking URI is set.

    Args:
        mlflow_uri: Address of the MLflow tracking server.
        username: Value stored in ``MLFLOW_TRACKING_USERNAME``.
        password: Value stored in ``MLFLOW_TRACKING_PASSWORD``.

    Returns:
        An ``MlflowClient`` constructed with ``mlflow_uri``.
    """
    os.environ.update(
        MLFLOW_TRACKING_USERNAME=username,
        MLFLOW_TRACKING_PASSWORD=password,
    )

    set_tracking_uri(mlflow_uri)
    tracking_client = MlflowClient(mlflow_uri)

    print(f"MLflow tracking URI: {mlflow_uri}")
    return tracking_client


def load_and_prepare_data():
    """Fetch the scikit-learn Diabetes data and split it 80/20.

    A short summary (shape, feature names, target statistics, split sizes)
    is printed along the way.

    Returns:
        A 5-tuple ``(X_train, X_test, y_train, y_test, feature_names)`` where
        ``feature_names`` is the list of column names of the feature frame.
    """
    divider = "=" * 80
    print("\n" + divider)
    print("LOADING DATASET")
    print(divider)

    # as_frame=True yields pandas objects, so column names travel with the data.
    dataset = load_diabetes(as_frame=True)
    features, target = dataset.data, dataset.target

    print("Dataset: Diabetes")
    print(f"Samples: {features.shape[0]:,}")
    print(f"Features: {features.shape[1]}")
    print("\nFeature names:")
    for position, column in enumerate(features.columns, start=1):
        print(f"  {position}. {column}")

    print("\nTarget statistics:")
    for label, method_name in (
        ("Mean", "mean"),
        ("Std", "std"),
        ("Min", "min"),
        ("Max", "max"),
    ):
        print(f"  {label}: {getattr(target, method_name)():.2f}")

    # Fixed seed keeps the split reproducible between runs.
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42, shuffle=True
    )

    print(f"\nTrain samples: {X_train.shape[0]:,}")
    print(f"Test samples: {X_test.shape[0]:,}")

    return X_train, X_test, y_train, y_test, list(features.columns)


def train_model(X_train, y_train, X_test, y_test, hyperparams: dict):
    """Train an XGBoost booster and return it with train/test predictions.

    Args:
        X_train: Training features.
        y_train: Training targets.
        X_test: Held-out features; also used as the second evaluation set.
        y_test: Held-out targets.
        hyperparams: Booster parameters. The optional ``num_boost_round`` key
            (default 100) sets the number of boosting rounds. The dict is
            not modified.

    Returns:
        A tuple ``(model, y_train_pred, y_test_pred)``.

    NOTE(review): the test split is the last entry in ``evals``, so it is the
    one early stopping monitors; test metrics may therefore be optimistic.
    Consider a separate validation split.
    """
    print("\n" + "=" * 80)
    print("TRAINING MODEL")
    print("=" * 80)

    print("Hyperparameters:")
    for key, value in hyperparams.items():
        print(f"  {key}: {value}")

    # Work on a copy so the caller's dict is left untouched.
    booster_params = dict(hyperparams)
    num_boost_round = booster_params.pop("num_boost_round", 100)
    # `n_estimators` belongs to the scikit-learn wrapper; the native
    # `xgb.train` API takes the round count via `num_boost_round` and would
    # only warn about the unused key.
    booster_params.pop("n_estimators", None)

    # Create DMatrix for XGBoost
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)

    # Train model
    model = xgb.train(
        params=booster_params,
        dtrain=dtrain,
        num_boost_round=num_boost_round,
        evals=[(dtrain, "train"), (dtest, "test")],
        early_stopping_rounds=20,
        verbose_eval=10,
    )

    print("Training completed!")

    # Predictions
    y_train_pred = model.predict(dtrain)
    y_test_pred = model.predict(dtest)

    return model, y_train_pred, y_test_pred


def calculate_metrics(y_true, y_pred, dataset_name="Test"):
    """Compute RMSE, MAE and R2 for a set of predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values.
        dataset_name: Label whose lower-cased form prefixes every key.

    Returns:
        A dict with keys ``<name>_rmse``, ``<name>_mae`` and ``<name>_r2``.
    """
    prefix = dataset_name.lower()
    scores = (
        ("rmse", np.sqrt(mean_squared_error(y_true, y_pred))),
        ("mae", mean_absolute_error(y_true, y_pred)),
        ("r2", r2_score(y_true, y_pred)),
    )
    return {f"{prefix}_{suffix}": score for suffix, score in scores}


def log_to_mlflow(model, X_train, y_train, X_test, y_test,
                  y_train_pred, y_test_pred, hyperparams, feature_names):
    """Log parameters, metrics and the model to the active MLflow run.

    Args:
        model: Trained XGBoost booster.
        X_train: Training features; used to infer the model's input schema.
        y_train: Training targets.
        X_test: Test features; the first row becomes the input example.
        y_test: Test targets.
        y_train_pred: Predictions for ``X_train``.
        y_test_pred: Predictions for ``X_test``.
        hyperparams: Mapping logged as run parameters.
        feature_names: Accepted for interface compatibility; not used here.

    Returns:
        A dict holding the combined train and test metrics.
    """
    print("\n" + "=" * 80)
    print("LOGGING TO MLFLOW")
    print("=" * 80)

    # One batched call instead of one tracking request per parameter.
    mlflow.log_params(hyperparams)

    # Calculate and log metrics
    train_metrics = calculate_metrics(y_train, y_train_pred, "Train")
    test_metrics = calculate_metrics(y_test, y_test_pred, "Test")
    all_metrics = {**train_metrics, **test_metrics}

    # Batched for the same reason as the parameters above.
    mlflow.log_metrics(all_metrics)

    print("\nModel Performance:")
    print(f"  Training RMSE: {train_metrics['train_rmse']:.4f}")
    print(f"  Training R2: {train_metrics['train_r2']:.4f}")
    print(f"  Test RMSE: {test_metrics['test_rmse']:.4f}")
    print(f"  Test MAE: {test_metrics['test_mae']:.4f}")
    print(f"  Test R2: {test_metrics['test_r2']:.4f}")

    # A one-row prediction is enough to give the output schema an explicit
    # column name.
    y_pred_df = pd.DataFrame(
        model.predict(xgb.DMatrix(X_train[:1])),
        columns=["predicted_progression"],
    )

    # Create model signature
    signature = infer_signature(X_train, y_pred_df)
    input_example = X_test.head(1)

    # Save the model locally first, then upload the directory under the
    # run's "model" artifact path.
    with tempfile.TemporaryDirectory() as tmpdir:
        local_model_path = os.path.join(tmpdir, "model")

        mlflow.xgboost.save_model(
            model,
            local_model_path,
            signature=signature,
            input_example=input_example,
        )

        mlflow.log_artifacts(local_model_path, artifact_path="model")
        print("Model artifacts logged successfully!")

    return all_metrics


def create_sample_payload(X_test, y_test, model, feature_names):
    """Build an example request/response pair from the first test row.

    Args:
        X_test: Test features as a DataFrame.
        y_test: Test targets, positionally aligned with ``X_test``.
        model: Trained XGBoost booster.
        feature_names: Not used by this function.

    Returns:
        A dict with the row's feature values (``features``), the true target
        (``actual_progression``) and the model output
        (``predicted_progression``).
    """
    row = 0
    # Slice rather than index so the row stays a DataFrame and the DMatrix
    # still carries the column names.
    single_row = X_test.iloc[row:row + 1]
    truth = y_test.iloc[row]

    prediction = model.predict(xgb.DMatrix(single_row))[0]

    payload = {}
    payload["features"] = single_row.iloc[0].to_dict()
    payload["actual_progression"] = float(truth)
    payload["predicted_progression"] = float(prediction)
    return payload


def _ensure_registered_model(client, model_name):
    """Best-effort: create the registry entry when looking it up fails."""
    try:
        client.get_registered_model(model_name)
        print(f"Model '{model_name}' already exists in registry")
    except Exception:
        # Any lookup failure is treated as "missing"; creation errors are
        # reported but not raised.
        try:
            client.create_registered_model(model_name)
            print(f"Created registered model: {model_name}")
        except Exception as create_error:
            print(f"Could not create registered model: {create_error}")


def register_model(client: MlflowClient, model_name: str, run_id: str, experiment_id: str):
    """Register the run's model as a new version in the Model Registry.

    Failures are printed rather than raised, so a registry problem does not
    abort the calling script.

    Args:
        client: MLflow client used for all registry calls.
        model_name: Name of the registered model.
        run_id: Run whose ``model`` artifact is registered.
        experiment_id: Only used to print a fallback artifact URI on failure.

    Returns:
        The new model version, or ``None`` when version creation fails.
    """
    divider = "=" * 80
    print("\n" + divider)
    print("REGISTERING MODEL")
    print(divider)

    source_uri = f"runs:/{run_id}/model"

    _ensure_registered_model(client, model_name)

    try:
        model_version = client.create_model_version(
            name=model_name,
            source=source_uri,
            run_id=run_id,
        )
        print("Model version registered successfully!")
        print(f"   Model Name: {model_name}")
        print(f"   Version: {model_version.version}")
        print(f"   Run ID: {run_id}")
        return model_version.version
    except Exception as registration_error:
        print(f"Model registration failed (model still usable via run URI): {registration_error}")
        print(f"   You can deploy using: mlflow-artifacts:/{experiment_id}/{run_id}/artifacts/model")
        return None


def print_deployment_info(run_id: str, experiment_id: str, sample_payload: dict):
    """Print the run summary, a deploy payload and a sample prediction request.

    Args:
        run_id: MLflow run identifier.
        experiment_id: MLflow experiment identifier.
        sample_payload: Dict with ``features`` (mapping containing the ten
            dataset columns), ``actual_progression`` and
            ``predicted_progression``.
    """
    divider = "=" * 80

    def section(title):
        # Each section opens with a blank line and a title framed by dividers.
        print("\n" + divider)
        print(title)
        print(divider)

    artifact_uri = f"mlflow-artifacts:/{experiment_id}/{run_id}/artifacts/model"

    section("TRAINING COMPLETE!")
    print("\nRun Information:")
    print(f"  Run ID: {run_id}")
    print(f"  Experiment ID: {experiment_id}")
    print(f"  Model URI: {artifact_uri}")

    section("DEPLOYMENT PAYLOAD (deploy-model API)")
    deployment_request = {
        "serve_name": "diabetes-xgboost-regressor",
        "model_uri": artifact_uri,
        "env": "local",
        "cores": 2,
        "memory": 4,
        "node_capacity": "spot",
        "min_replicas": 1,
        "max_replicas": 3,
    }
    print(json.dumps(deployment_request, indent=2))

    section("SAMPLE PREDICTION PAYLOAD")
    prediction_request = {"features": sample_payload["features"]}
    print(json.dumps(prediction_request, indent=2))

    print("\nExpected Output:")
    print(f"  Actual Progression: {sample_payload['actual_progression']:.2f}")
    print(f"  Model Prediction: {sample_payload['predicted_progression']:.2f}")

    section("FEATURE DESCRIPTIONS")
    feature_values = sample_payload["features"]
    rows = []
    for column in ("age", "sex", "bmi", "bp", "s1", "s2", "s3", "s4", "s5", "s6"):
        label = f"{column}:"
        # Labels are padded to eight characters so the values line up.
        rows.append(f"  {label:<8}{feature_values[column]:.4f}")
    print("\n" + "\n".join(rows) + "\n    ")

In [None]:
def main():
    """Run the full pipeline: train, log to MLflow, register, print deploy info.

    Command-line options (all optional):
        --mlflow-uri: MLflow tracking URI.
        --username / --password: Tracking-server credentials. When omitted
            they fall back to ``MLFLOW_TRACKING_USERNAME`` /
            ``MLFLOW_TRACKING_PASSWORD`` from the environment, and only then
            to the placeholder values, so real credentials already exported
            are not overwritten by placeholders.
        --experiment-name: MLflow experiment name.
        --model-name: Registered model name.

    Unrecognised arguments are ignored (``parse_known_args``), presumably so
    the function also runs inside a notebook kernel that passes its own argv.
    """
    parser = argparse.ArgumentParser(description="Train XGBoost Diabetes Regression Model")
    parser.add_argument(
        "--mlflow-uri",
        default="http://darwin-mlflow-lib.darwin.svc.cluster.local:8080",
        help="MLflow tracking URI"
    )
    parser.add_argument(
        "--username",
        default=os.environ.get("MLFLOW_TRACKING_USERNAME", "abc@gmail.com"),
        help="MLflow username"
    )
    parser.add_argument(
        "--password",
        default=os.environ.get("MLFLOW_TRACKING_PASSWORD", "password"),
        help="MLflow password"
    )
    parser.add_argument(
        "--experiment-name",
        default="diabetes_xgboost_regression",
        help="MLflow experiment name"
    )
    parser.add_argument(
        "--model-name",
        default="DiabetesXGBoostRegressor",
        help="Registered model name"
    )

    args, _ = parser.parse_known_args()

    print("\n" + "=" * 80)
    print("DIABETES PROGRESSION PREDICTION WITH XGBOOST")
    print("=" * 80)
    print(f"Started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    # Setup MLflow
    client = setup_mlflow(args.mlflow_uri, args.username, args.password)
    set_experiment(experiment_name=args.experiment_name)
    print(f"Experiment: {args.experiment_name}")

    # Load data
    X_train, X_test, y_train, y_test, feature_names = load_and_prepare_data()

    # Define hyperparameters
    hyperparams = {
        "objective": "reg:squarederror",
        "max_depth": 5,
        "learning_rate": 0.1,
        "n_estimators": 100,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "num_boost_round": 100,
        "random_state": 42
    }

    # Start MLflow run
    run_name = f"xgboost_diabetes_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
    with mlflow.start_run(run_name=run_name) as run:
        # Train on a copy so the dict logged below is exactly the one defined
        # above, whatever the trainer does with its argument.
        model, y_train_pred, y_test_pred = train_model(
            X_train, y_train, X_test, y_test, hyperparams.copy()
        )

        # Log to MLflow
        log_to_mlflow(
            model, X_train, y_train, X_test, y_test,
            y_train_pred, y_test_pred, hyperparams, feature_names
        )

        # Get run information
        run_id = run.info.run_id
        experiment_id = run.info.experiment_id

        # Create sample payload
        sample_payload = create_sample_payload(X_test, y_test, model, feature_names)

    # Register model (outside of run context)
    register_model(client, args.model_name, run_id, experiment_id)

    # Print deployment information
    print_deployment_info(run_id, experiment_id, sample_payload)

    print("\nScript completed successfully!")
    print("=" * 80 + "\n")


if __name__ == "__main__":
    main()