In [0]:
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.utils.class_weight import compute_sample_weight
import xgboost as xgb
import mlflow
import mlflow.xgboost
from mlflow.models.signature import infer_signature
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
# from src.data_utils import load_data, get_features, get_target, numerical_features, categorical_features
# from src.utils import generate_confusion_matrix
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler

# --- Placeholder implementations ---
import pandas as pd

def load_data(n_rows=100, seed=None):
    """Build a synthetic tire-telemetry DataFrame (placeholder loader).

    Replace with the real data loading logic when it is available.

    Args:
        n_rows: Number of rows to generate. Defaults to 100, the size the
            placeholder originally produced.
        seed: Optional integer seed. When given, a private ``RandomState`` is
            used so the frame is reproducible and the global NumPy random
            state is left untouched. When ``None`` the values are drawn from
            the global ``np.random`` state.

    Returns:
        pandas.DataFrame with three columns:
        ``force_on_tire`` (uniform floats in [0, 1)),
        ``degradation_risk`` (one of 'safe', 'medium', 'critical') and
        ``Track`` (one of 'Monza', 'Silverstone', 'Spa').
    """
    # Both the np.random module and RandomState expose rand()/choice(), so the
    # same code path serves the seeded and unseeded cases.
    rng = np.random if seed is None else np.random.RandomState(seed)
    data = {
        'force_on_tire': rng.rand(n_rows),
        'degradation_risk': rng.choice(['safe', 'medium', 'critical'], n_rows),
        'Track': rng.choice(['Monza', 'Silverstone', 'Spa'], n_rows),
    }
    return pd.DataFrame(data)

def get_features(df):
    """Split ``df`` into numeric and categorical feature frames.

    Columns are classified by dtype family rather than by exact dtype, so
    int32/float32 columns count as numeric and pandas ``string`` / ``category``
    columns count as categorical alongside plain ``object`` columns. Boolean
    columns are placed in neither group. The target column
    ``degradation_risk`` is never returned as a feature, whatever its dtype.

    Args:
        df: DataFrame holding the feature columns (and optionally the target).

    Returns:
        Tuple ``(numeric_df, categorical_df)``; each is a column subset of
        ``df`` in the original column order.
    """
    target = 'degradation_risk'
    types = pd.api.types

    def _is_numeric(dtype):
        # is_numeric_dtype also accepts bool, which is not treated as numeric here.
        return types.is_numeric_dtype(dtype) and not types.is_bool_dtype(dtype)

    def _is_categorical(dtype):
        return (
            types.is_object_dtype(dtype)
            or types.is_string_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        )

    candidates = [col for col in df.columns if col != target]
    num_cols = [col for col in candidates if _is_numeric(df[col].dtype)]
    cat_cols = [col for col in candidates if _is_categorical(df[col].dtype)]
    return df[num_cols], df[cat_cols]

def get_target(df):
    """Return the ``degradation_risk`` label column of ``df``."""
    target_column = 'degradation_risk'
    labels = df[target_column]
    return labels

# Feature-name registries (placeholders for the lists that the commented-out
# src.data_utils import would provide). They are plain mutable lists because
# the script further down extends numerical_features with engineered columns.
numerical_features = ["force_on_tire"]
categorical_features = ["Track"]

def generate_confusion_matrix(y_true, y_pred, labels=None):
    """Compute, plot and return the confusion matrix for a set of predictions.

    Args:
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels, aligned with ``y_true``.
        labels: Optional explicit class order for the matrix rows/columns.
            When omitted, ``confusion_matrix`` picks the order itself and the
            axis ticks are labelled with the sorted set of values seen in
            ``y_true`` and ``y_pred``.

    Returns:
        The confusion matrix as returned by ``sklearn.metrics.confusion_matrix``
        (rows are actual classes, columns are predicted classes).
    """
    cm = confusion_matrix(y_true, y_pred, labels=labels)

    # Label the axes with class names instead of positional indices.
    if labels is None:
        tick_labels = np.unique(
            np.concatenate([np.asarray(y_true).ravel(), np.asarray(y_pred).ravel()])
        )
    else:
        tick_labels = np.asarray(labels)
    if len(tick_labels) != cm.shape[0]:
        # Could not derive one name per class; fall back to seaborn's default ticks.
        tick_labels = 'auto'

    plt.figure(figsize=(5, 4))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        xticklabels=tick_labels,
        yticklabels=tick_labels,
    )
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix')
    plt.show()
    return cm
# --- End placeholders ---

# MLflow setup. The experiment defaults to the original Databricks workspace
# path, but can be redirected by setting the MLFLOW_EXPERIMENT_NAME environment
# variable so the script also runs outside that user's workspace.
mlflow.set_experiment(
    os.environ.get(
        "MLFLOW_EXPERIMENT_NAME",
        "/Users/jrcarra1@asu.edu/F1_Tire_Deg_DemoII",
    )
)

df = load_data()

# Add lap_count if missing (simulates stint: F1 races ~50-70 laps, deg ramps exponentially)
# The flag is captured BEFORE the column is added; testing for the column
# afterwards would always report False, even when the values were simulated.
lap_count_simulated = 'lap_count' not in df.columns
if lap_count_simulated:
    df['lap_count'] = np.random.randint(1, 71, len(df))  # Uniform; real telemetry sequential per stint
mlflow.log_param("lap_count_simulated", lap_count_simulated)

# Engineered interaction term: lap count multiplied by tire force, used as a
# proxy for cumulative wear under repeated loading.
df['lap_force'] = df['lap_count'].mul(df['force_on_tire'])
# Register both columns as numeric features (in-place extension of the list).
# NOTE(review): an earlier comment here said this brings the total to 16
# numeric features; confirm against the feature list actually in use.
numerical_features.extend(['lap_count', 'lap_force'])

# Hold out 20% of the rows for testing. Rows are stratified on the
# concatenated (risk label + track) string so each label/track combination
# keeps roughly the same share in both partitions.
stratify_col = df['degradation_risk'].astype(str).str.cat(df['Track'])
train_idx, test_idx = train_test_split(
    df.index,
    test_size=0.2,
    stratify=stratify_col,
    random_state=42,
)
train_df, test_df = df.loc[train_idx], df.loc[test_idx]

# Feature/target extraction: each partition is divided into its numeric and
# categorical feature frames plus its label column.
(X_train_num, X_train_cat), (X_test_num, X_test_cat) = (
    get_features(train_df),
    get_features(test_df),
)
y_train, y_test = get_target(train_df), get_target(test_df)

encoder = OneHotEncoder(sparse_output=False, drop='first')
X_train_cat