# F1 Production Model Training - v1.1.0

**Purpose:** Train production-ready F1 prediction models with integrated data validation.

**Key Features:**
- ✅ Integrated data validation (anti-leakage)
- ✅ Cross-validation for robust metrics
- ✅ Feature importance analysis
- ✅ Model versioning (semantic versions)
- ✅ Comprehensive logging and metrics

**Target Metrics:**
- Position RMSE < 2.5 positions (current: 4.3)
- Winner ROC-AUC > 0.95 (current: 0.97 ✅)
- Points RMSE < 4.0 points (current: 5.1)

**Created:** 2025-12-30  
**Author:** Adrian Infantes

## 1️⃣ Setup & Imports

In [None]:
# Notebook setup: silence warnings, import dependencies, and log the start time.
import warnings
# Installed before the remaining imports so warnings raised while importing
# them are suppressed too. No category is given, so every warning is ignored.
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from datetime import datetime
import json
# ML libraries
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, 
    roc_auc_score, mean_squared_error, mean_absolute_error, r2_score
)
import xgboost as xgb
# Our modules (project-local; require the repository root on the import path)
from src.ml.data_collection import extract_race_results, extract_qualifying_results
from src.ml.features import calculate_historical_stats, add_feature_columns
from src.ml.validation import validate_ml_data, validate_no_leakage
# Confirm the environment is usable and record when this run began.
print("✅ Imports successful")
print(f"📅 Training started: {datetime.now()}")

## 2️⃣ Load Historical Data

In [None]:
# Read the processed historical dataset (from notebooks/explore_dataset.ipynb
# results) into `df` and print a short summary of what was loaded.
print("Loading historical data...")

# TODO: Specify your data path
DATA_PATH = Path("data/processed/f1_historical_data.parquet")

# Stop early with instructions when the dataset has not been generated yet.
if not DATA_PATH.exists():
    missing_data_message = (
        f"Data not found: {DATA_PATH}\n"
        "Please run data collection script first:\n"
        "  python src/ml/collect_historical_data.py"
    )
    raise FileNotFoundError(missing_data_message)

df = pd.read_parquet(DATA_PATH)

row_count = len(df)
column_count = len(df.columns)
seasons = sorted(df['year'].unique())
distinct_rounds = df['round_number'].nunique()

print(f"✅ Loaded {row_count} rows, {column_count} columns")
print(f"   Years: {seasons}")
print(f"   Rounds: {distinct_rounds} unique rounds")

## 3️⃣ Feature Engineering

In [None]:
# Run the feature pipeline over `df`: historical statistics first, then the
# derived feature columns. Each step receives the previous step's output.
feature_steps = (
    ("Calculating historical statistics...", calculate_historical_stats),
    ("Adding derived features...", add_feature_columns),
)
for progress_message, transform in feature_steps:
    print(progress_message)
    df = transform(df)

print(f"✅ Feature engineering complete")
print(f"   Total features: {len(df.columns)}")

## 4️⃣ 🔒 DATA VALIDATION (CRITICAL)

In [None]:
# Split `df` into the prediction targets and the candidate feature matrix `X`.
print("Separating features from targets...")

# Copy each target column so the labels are independent of `df`.
targets = {
    target_name: df[target_name].copy()
    for target_name in ('winner', 'race_position', 'points')
}

# Targets (POST-race info - data leakage if included)
post_race_columns = [
    'winner', 'race_position', 'points', 'dnf', 'status',
    'fastest_lap_time', 'fastest_lap_rank', 'race_time',
]
# Identifiers (not predictive features)
identifier_columns = [
    'driver_code', 'constructor', 'circuit_name', 'country', 'event_name',
]
feature_cols_to_drop = post_race_columns + identifier_columns

# errors='ignore' drops only the listed columns that are actually present.
# 'year' and 'round_number' are not listed, so they stay in X for the
# temporal validation and split that follow.
X = df.drop(columns=feature_cols_to_drop, errors='ignore')

sample_count, feature_count = X.shape
print(f"✅ Feature matrix created: {X.shape}")
print(f"   Features: {feature_count}")
print(f"   Samples: {sample_count}")

In [None]:
# 🔒 CRITICAL VALIDATION: run the project's anti-leakage check on the feature
# matrix and abort the notebook if it reports a problem.
banner = "=" * 80
print("\n" + banner)
print("🔍 VALIDATING DATA (ANTI-LEAKAGE CHECK)")
print(banner)

validation_options = {
    'current_year': 2025,   # Training for future predictions
    'current_round': 1,
    'strict': True,         # Fail hard if leakage detected
}

try:
    # This will FAIL if forbidden features are present
    validate_ml_data(X, **validation_options)
    print("\n✅ VALIDATION PASSED - No data leakage detected")
    print("   Safe to proceed with training")
except Exception as validation_error:
    print(f"\n❌ VALIDATION FAILED: {validation_error}")
    print("\n🚨 DO NOT PROCEED - Fix data leakage first")
    # Re-raise so the cells below cannot run on leaky data.
    raise

## 5️⃣ Train/Test Split (Temporal)

In [None]:
# Temporal split: train on one season, evaluate on the following one.
#
# Rows of each split are put in chronological order (season, then round)
# BEFORE 'round_number' is dropped. The training cell cross-validates with
# TimeSeriesSplit, which treats row order as time order; without this sort a
# fold could be validated on races that precede the ones it was trained on.
print("\nCreating temporal train/test split...")

TRAIN_YEAR = 2023
TEST_YEAR = 2024

# Boolean masks on X's original row order (kept for any later cell that
# selects by season). NOTE: they are NOT in the chronological order used for
# X_train / X_test below, so do not pair mask-selected targets with X_train.
train_mask = X['year'] == TRAIN_YEAR
test_mask = X['year'] == TEST_YEAR

# Positional row numbers of X sorted by (year, round_number). Positions are
# used instead of index labels so a non-unique index cannot duplicate rows.
chronological_rows = (
    X[['year', 'round_number']]
    .reset_index(drop=True)
    .sort_values(['year', 'round_number'])
    .index.to_numpy()
)
train_rows = chronological_rows[train_mask.to_numpy()[chronological_rows]]
test_rows = chronological_rows[test_mask.to_numpy()[chronological_rows]]

# Fail fast with a clear message instead of handing empty frames to sklearn.
if len(train_rows) == 0 or len(test_rows) == 0:
    raise ValueError(
        f"Temporal split is empty: {len(train_rows)} training rows for "
        f"{TRAIN_YEAR}, {len(test_rows)} test rows for {TEST_YEAR}. "
        f"Seasons available: {sorted(X['year'].unique())}"
    )

# Remove year and round_number from features (used only for validation)
X_numeric = X.drop(columns=['year', 'round_number'])

X_train = X_numeric.iloc[train_rows].copy()
X_test = X_numeric.iloc[test_rows].copy()

# Targets are taken at the same positions so they stay aligned with X.
# Assumes `targets` is row-aligned with X, as the mask-based selection
# this replaces also did.
y_winner_train = targets['winner'].iloc[train_rows]
y_winner_test = targets['winner'].iloc[test_rows]

y_position_train = targets['race_position'].iloc[train_rows]
y_position_test = targets['race_position'].iloc[test_rows]

y_points_train = targets['points'].iloc[train_rows]
y_points_test = targets['points'].iloc[test_rows]

print(f"✅ Split complete:")
print(f"   Train: {len(X_train)} samples ({TRAIN_YEAR})")
print(f"   Test: {len(X_test)} samples ({TEST_YEAR})")
print(f"   Features: {X_train.shape[1]}")

## 6️⃣ Train Models with Cross-Validation

In [None]:
# Cross-validate and then fit the race-winner classifier.
from sklearn.model_selection import TimeSeriesSplit

# Use TimeSeriesSplit for temporal data
cv = TimeSeriesSplit(n_splits=5)

print("Training Winner Classifier...")
forest_settings = {
    'n_estimators': 200,
    'max_depth': 15,
    'min_samples_split': 10,
    'class_weight': 'balanced',
    'random_state': 42,
    'n_jobs': -1,
}
clf = RandomForestClassifier(**forest_settings)

# Score the estimator on every fold for both metrics.
cv_scores = cross_validate(
    clf,
    X_train,
    y_winner_train,
    cv=cv,
    scoring=['roc_auc', 'f1'],
    return_train_score=True,
    n_jobs=-1,
)

print(f"✅ Cross-validation complete:")
for metric_label, score_key in (("ROC-AUC", "test_roc_auc"), ("F1-Score", "test_f1")):
    fold_scores = cv_scores[score_key]
    print(f"   {metric_label}: {fold_scores.mean():.4f} ± {fold_scores.std():.4f}")

# Final training on full training set
clf.fit(X_train, y_winner_train)
print("✅ Final model trained")

## 7️⃣ Feature Importance Analysis

In [None]:
# Rank the winner classifier's features by importance, plot the top 20, and
# save the full ranking to CSV.
from pathlib import Path

# Feature importance from Random Forest, most important first.
importances = pd.DataFrame({
    'feature': X_train.columns,
    'importance': clf.feature_importances_
}).sort_values('importance', ascending=False)

print("\n📊 Top 20 Most Important Features:")
print(importances.head(20).to_string(index=False))

# Visualization
plt.figure(figsize=(12, 8))
top_20 = importances.head(20)
plt.barh(range(len(top_20)), top_20['importance'])
plt.yticks(range(len(top_20)), top_20['feature'])
plt.xlabel('Importance')
plt.title('Top 20 Feature Importances - Winner Classifier')
plt.gca().invert_yaxis()  # put the most important feature at the top
plt.tight_layout()
plt.show()

# Save importances. This cell runs before the model-saving cell creates
# anything under models/, so make sure the directory exists first; otherwise
# to_csv fails on a fresh checkout.
importances_path = Path('models/feature_importances_v1.1.0.csv')
importances_path.parent.mkdir(parents=True, exist_ok=True)
importances.to_csv(importances_path, index=False)
print("\n✅ Feature importances saved")

## 8️⃣ Evaluate & Save Models

In [None]:
# Score the fitted winner classifier on the held-out test season.
y_pred = clf.predict(X_test)
# Column 1 of predict_proba is used as the score for the ROC-AUC metric.
y_proba = clf.predict_proba(X_test)[:, 1]

# ROC-AUC is computed from the scores; the other metrics from hard predictions.
label_based_scorers = {
    'f1': f1_score,
    'precision': precision_score,
    'recall': recall_score,
}
metrics = {
    'roc_auc': roc_auc_score(y_winner_test, y_proba),
    **{
        metric_name: scorer(y_winner_test, y_pred)
        for metric_name, scorer in label_based_scorers.items()
    },
}

print("\n📊 Test Set Metrics:")
for metric, value in metrics.items():
    print(f"   {metric}: {value:.4f}")

In [None]:
# Write the trained classifier, its feature list, and its test metrics into a
# directory named after the model's semantic version.
import pickle
from pathlib import Path

VERSION = "v1.1.0"
models_dir = Path(f"models/{VERSION}")
models_dir.mkdir(exist_ok=True, parents=True)

# Save classifier
model_path = models_dir / "classifier_winner.pkl"
model_path.write_bytes(pickle.dumps(clf))
print(f"\n✅ Model saved: {model_path}")

# Save features list (column order the classifier was trained on)
features_path = models_dir / "features.json"
feature_names = X_train.columns.tolist()
features_path.write_text(json.dumps(feature_names, indent=2))
print(f"✅ Features saved: {features_path}")

# Save metrics together with enough context to identify this run
metrics_path = models_dir / "metrics.json"
run_summary = {
    'version': VERSION,
    'timestamp': datetime.now().isoformat(),
    'train_samples': len(X_train),
    'test_samples': len(X_test),
    'features_count': len(X_train.columns),
    'classifier_winner': metrics,
}
metrics_path.write_text(json.dumps(run_summary, indent=2))
print(f"✅ Metrics saved: {metrics_path}")

## 9️⃣ Next Steps

✅ **Completed:**
- Data loaded and validated
- No data leakage detected
- Winner classifier trained with cross-validation
- Feature importance analyzed
- Winner classifier saved with semantic versioning

📋 **TODO:**
1. Train position regressor (target RMSE < 2.5)
2. Train points regressor (target RMSE < 4.0)
3. Compare v1.1.0 vs v1.0.0 metrics
4. Create symlink: `models/latest -> models/v1.1.0`
5. Update production code to use v1.1.0

**Target Improvements:**
- Position RMSE: 4.3 → < 2.5 ⚡
- Add SHAP analysis for interpretability
- Deploy to production after validation