In [0]:
%pip install xgboost
%pip install mlflow
dbutils.library.restartPython()

[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m
[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


In [0]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.utils.class_weight import compute_sample_weight
import xgboost as xgb
import mlflow
import mlflow.xgboost
from mlflow.models.signature import infer_signature
import matplotlib.pyplot as plt
import seaborn as sns

In [0]:
# Databricks - MLflow is auto-configured, no credentials needed
# (Colab version used userdata for DATABRICKS_HOST/TOKEN)

# End runs
while mlflow.active_run():
    mlflow.end_run(status='FINISHED')
mlflow.end_run()

mlflow.set_experiment("/Users/leo.lwakabamba@gmail.com/F1_Tire_Deg_DemoII")

<Experiment: artifact_location='dbfs:/databricks/mlflow-tracking/940444045771050', creation_time=1764878552143, experiment_id='940444045771050', last_update_time=1764879265776, lifecycle_stage='active', name='/Users/leo.lwakabamba@gmail.com/F1_Tire_Deg_DemoII', tags={'mlflow.experiment.sourceName': '/Users/leo.lwakabamba@gmail.com/F1_Tire_Deg_DemoII',
 'mlflow.experimentType': 'MLFLOW_EXPERIMENT',
 'mlflow.ownerEmail': 'leo.lwakabamba@gmail.com',
 'mlflow.ownerId': '212346414010265'}>

In [0]:
# Unity Catalog Volume path (replaces Google Drive path)
CATALOG = "workspace"
SCHEMA = "default"
VOLUME = "f1"
DATA_PATH = f"/Volumes/{CATALOG}/{SCHEMA}/{VOLUME}/simulated_dataset.csv"

df = pd.read_csv(DATA_PATH, engine='pyarrow')
# df = df.iloc[:100000]  # Limit if needed

In [0]:
df = df[df['Track'] != 'NÃ¼rburgring Nordschleife']
print(f"Rows after filter: {len(df)}")
print("Tracks:\n", df['Track'].value_counts(normalize=True))

Rows after filter: 1961279
Tracks:
 Monza            0.333345
Monaco           0.333344
Red Bull Ring    0.333311
Name: Track, dtype: float64


In [0]:
numerical_features = ['Throttle', 'Brake', 'Speed', 'Surface_Roughness',
                      'Ambient_Temperature', 'Lateral_G_Force', 'Longitudinal_G_Force',
                      'Tire_Friction_Coefficient', 'Tire_Tread_Depth',
                      'force_on_tire', 'front_surface_temp', 'rear_surface_temp',
                      'front_inner_temp', 'rear_inner_temp']
categorical_features = ['Tire_Compound', 'Driving_Style', 'Track']
target_reg = 'cumilative_Tire_Wear'  # Fix typo
target_class = 'degradation_risk'

In [0]:
# Optional: Track mult for deg (uncomment to apply; adjusts wear, model learns)
track_mult = {'Monza': 1.2, 'Monaco': 0.9, 'Red Bull Ring': 1.1}
df[target_reg] *= df['Track'].map(track_mult).fillna(1.0)
print(df[target_reg].describe())

count    1.961279e+06
mean     2.106965e-01
std      1.496784e-01
min      0.000000e+00
25%      7.609112e-02
50%      1.793738e-01
75%      3.293559e-01
max      8.142081e-01
Name: cumilative_Tire_Wear, dtype: float64


In [0]:
# Derive class (qcut for balance; or fixed bins [0,0.3,0.6,max] for realism)
if target_class not in df.columns:
    df[target_class] = pd.qcut(df[target_reg], q=3, labels=['safe', 'medium', 'critical'], duplicates='drop')
    print("Risk Dist:\n", df[target_class].value_counts(normalize=True))

df = df.dropna(subset=[target_class])
print(f"Rows clean: {len(df)}")

Risk Dist:
 safe        0.333334
critical    0.333334
medium      0.333333
Name: degradation_risk, dtype: float64
Rows clean: 1961279


In [0]:
X_num = df[numerical_features]
X_cat = df[categorical_features]
y_class = df[target_class]

encoder = OneHotEncoder(sparse=False, drop='first')
X_cat_encoded = encoder.fit_transform(X_cat)
X = np.hstack((X_num, X_cat_encoded))

le = LabelEncoder()
y_class_encoded = le.fit_transform(y_class)

In [0]:
X_train, X_test, y_class_train, y_class_test = train_test_split(X, y_class_encoded, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

sample_weights = compute_sample_weight(class_weight='balanced', y=y_class_train)

In [0]:
def generate_confusion_matrix(y_true, y_pred, class_labels, run_id=None):
    cm = confusion_matrix(y_true, y_pred)
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')
    if run_id:
        mlflow.log_figure(fig, "confusion_matrix.png")
    plt.close(fig)
    return cm

In [0]:
params = {
    'objective': 'multi:softprob',
    'num_class': 3,
    'eval_metric': 'mlogloss',
    'learning_rate': 0.1,
    'max_depth': 6,
    'min_child_weight': 1,
    'subsample': 0.5,
    'colsample_bytree': 0.5,
    'seed': 42,
    'n_jobs': -1,
    'early_stopping_rounds': 10,
    'tree_method': 'hist'  # 'gpu_hist' on GPU
}

In [0]:
with mlflow.start_run(run_name="XGBoost_Classifier_POC") as xgb_run:
    xgb_model = xgb.XGBClassifier(**params)
    xgb_model.fit(X_train_scaled, y_class_train, sample_weight=sample_weights,
                  eval_set=[(X_test_scaled, y_class_test)], verbose=False)
    y_class_pred = xgb_model.predict(X_test_scaled)
    y_class_probs = xgb_model.predict_proba(X_test_scaled)
    class_report = classification_report(y_class_test, y_class_pred, output_dict=True, zero_division=0)
    auc_roc = roc_auc_score(y_class_test, y_class_probs, multi_class='ovr', average='weighted')
    metrics = {
        'accuracy': class_report['accuracy'],
        'precision_safe': class_report.get('0', {'precision': 0.0})['precision'],
        'recall_safe': class_report.get('0', {'recall': 0.0})['recall'],
        'precision_medium': class_report.get('1', {'precision': 0.0})['precision'],
        'recall_medium': class_report.get('1', {'recall': 0.0})['recall'],
        'precision_critical': class_report.get('2', {'precision': 0.0})['precision'],
        'recall_critical': class_report.get('2', {'recall': 0.0})['recall'],
        'f1_critical': class_report.get('2', {'f1-score': 0.0})['f1-score'],
        'auc_roc_weighted': auc_roc
    }
    mlflow.log_metrics(metrics)
    mlflow.log_params(params)
    
    # Log model with signature for Unity Catalog
    signature = infer_signature(X_train_scaled, y_class_pred)
    mlflow.xgboost.log_model(xgb_model, "xgb_model", signature=signature)
    
    unique_classes = np.unique(np.concatenate((y_class_test, y_class_pred)))
    class_labels = le.inverse_transform(unique_classes)
    generate_confusion_matrix(y_class_test, y_class_pred, class_labels, run_id=xgb_run.info.run_id)
    print("Report:\n", classification_report(y_class_test, y_class_pred, zero_division=0))
    print(f"AUC-ROC: {auc_roc:.4f}")

🔗 View Logged Model at: https://3866123326870389.9.gcp.databricks.com/ml/experiments/940444045771050/models/m-f4b1d497ae494754a8fef31e95e3f339?o=3866123326870389


Report:
               precision    recall  f1-score   support

           0       0.78      0.91      0.84    130621
           1       0.83      0.77      0.80    130700
           2       0.93      0.85      0.89    130935

    accuracy                           0.84    392256
   macro avg       0.85      0.84      0.84    392256
weighted avg       0.85      0.84      0.84    392256

AUC-ROC: 0.9612


In [0]:
with mlflow.start_run(run_id=xgb_run.info.run_id):
    feature_names = numerical_features + list(encoder.get_feature_names_out(categorical_features))
    importances = xgb_model.feature_importances_
    sorted_idx = importances.argsort()[::-1]
    importance_dict = {f'importance_{feature_names[i]}': importances[i] for i in sorted_idx[:5]}
    mlflow.log_metrics(importance_dict)
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.barh([feature_names[i] for i in sorted_idx[:10]], importances[sorted_idx[:10]])
    plt.xlabel('Importance')
    plt.title('Top Importances')
    mlflow.log_figure(fig, "feature_importance.png")
    plt.close(fig)