# Experiment: Rising Qualification Prediction Research

Objective:
- Compare candidate models for predicting the Q3 delta (qualifying mode) and the race position (race mode) from free-practice (FP) data.
- Success criteria: lower MAE on the target, higher Spearman rank correlation, and a better top-10 hit rate.


In [None]:
# Setup: imports and reproducibility
from __future__ import annotations

import pathlib
import random
import sys

import numpy as np
import pandas as pd

# Seed both the NumPy and the stdlib global RNGs so shuffles repeat across runs.
SEED = 7
np.random.seed(SEED)
random.seed(SEED)

# The project root is taken to be the parent of the current working directory.
# Its `Python` folder is put at the front of the import path (only once) so the
# project imports that follow can be resolved from it.
PROJECT_ROOT = pathlib.Path('..').resolve()
PYTHON_DIR = PROJECT_ROOT.joinpath('Python')
if sys.path.count(str(PYTHON_DIR)) == 0:
    sys.path.insert(0, str(PYTHON_DIR))

from rqp.data import build_training_data
from rqp.prediction import predict_with_model
from rqp.providers import FastF1Provider, OpenF1Provider
from rqp.training import train_model


## Plan

- Hypothesis: FP deltas and ranks alone are sufficient features for useful predictions early in the season.
- Variables to sweep: model type, feature set, and training seasons.
- Metrics to record: MAE, Spearman rank correlation, top-10 hit rate.


In [None]:
# Experiment configuration
# Single place to set the data source, prediction mode, target event and
# training seasons; the cells below read everything from this dict.
CONFIG = {
    'source': 'fastf1',  # or 'openf1'
    'mode': 'qualifying',  # or 'race'
    'target_year': 2025,
    'target_round': 1,
    'train_seasons': [2023, 2024],
    'include_standings': False,
    'cache_dir': str(PROJECT_ROOT / '.cache' / 'fastf1'),
    'meeting_name': None,  # openf1 only
    'country_name': None,  # openf1 only
}

# Instantiate the provider named by CONFIG['source']. Unknown values are
# rejected explicitly so that a typo cannot silently select the OpenF1 provider.
if CONFIG['source'] == 'fastf1':
    provider = FastF1Provider(CONFIG['cache_dir'])
elif CONFIG['source'] == 'openf1':
    provider = OpenF1Provider(
        cache_dir=CONFIG['cache_dir'],
        target_round=CONFIG['target_round'],
        meeting_name=CONFIG['meeting_name'],
        country_name=CONFIG['country_name'],
    )
else:
    raise ValueError(
        f"Unknown source {CONFIG['source']!r}; expected 'fastf1' or 'openf1'."
    )


In [None]:
# Build training dataset
# Every argument comes from CONFIG or from the provider chosen in the
# configuration cell. The call returns a pair that is unpacked into the training
# frame and `notes`.
# NOTE(review): `notes` presumably lists data-quality remarks (missing sessions
# etc.) -- confirm against rqp.data.build_training_data.
train_df, notes = build_training_data(
    provider=provider,
    mode=CONFIG['mode'],
    train_seasons=CONFIG['train_seasons'],
    target_year=CONFIG['target_year'],
    target_round=CONFIG['target_round'],
    include_standings=CONFIG['include_standings'],
)

# Last expression of the cell: shows the notes next to the first rows of the
# training frame as the cell output.
notes, train_df.head()


In [None]:
# Define features + metrics
# Free-practice columns used in every mode: the delta columns first, then the
# matching rank columns.
feature_cols = [
    'fp1_delta', 'fp2_delta', 'fp3_delta', 'fp_mean_delta',
    'fp1_rank', 'fp2_rank', 'fp3_rank', 'fp_mean_rank',
]
if CONFIG['mode'] != 'qualifying':
    # Any non-qualifying mode also feeds in the qualifying position and,
    # when standings are enabled, the `position_start` column.
    feature_cols.append('qualy_position')
    if CONFIG['include_standings']:
        feature_cols.append('position_start')
    fallback_cols = ['qualy_position']
else:
    # Qualifying mode: only the delta columns are handed to
    # predict_with_model as its fallback columns.
    fallback_cols = [name for name in feature_cols if name.endswith('_delta')]

def mae(y_true, y_pred):
    """Return the mean absolute error between `y_true` and `y_pred` as a float.

    Both inputs are converted to NumPy arrays first, so values are paired by
    position rather than by any index labels they may carry.
    """
    residuals = np.asarray(y_true) - np.asarray(y_pred)
    return float(np.abs(residuals).mean())

def spearman(y_true, y_pred):
    """Return the Spearman rank correlation between `y_true` and `y_pred`.

    The values are ranked (ties receive the average rank) and the Pearson
    correlation of the two rank vectors is returned as a float.

    Inputs are paired by position, the same way `mae` pairs them. Any pandas
    index on either input is dropped before ranking; otherwise `Series.corr`
    would align the two sides on index labels and a list combined with a
    Series whose index does not start at 0 would yield NaN or a correlation
    over the wrong pairs.
    """
    true_ranks = pd.Series(np.asarray(y_true)).rank()
    pred_ranks = pd.Series(np.asarray(y_pred)).rank()
    return float(true_ranks.corr(pred_ranks))

def top10_hit_rate(df, k=10):
    """Return the share of the true top-`k` rows that are also predicted top-`k`.

    Rows are ranked in ascending order on `df['pred']` and `df['target']`
    (smallest value gets rank 1); ties are broken by row order.

    Args:
        df: DataFrame with a `pred` and a `target` column.
        k: Size of the top group. Defaults to 10.

    Returns:
        A float in [0, 1]. Returns 0.0 when no row has a target rank within
        the top `k` (for example, an empty frame).
    """
    in_pred_top = df['pred'].rank(method='first') <= k
    in_true_top = df['target'].rank(method='first') <= k
    true_count = int(in_true_top.sum())
    if true_count == 0:
        return 0.0
    # Normalise by the actual size of the true top group instead of a fixed
    # 10, so a frame with fewer than `k` rankable rows can still reach 1.0.
    # Hits are counted row by row, so duplicate index labels are not merged.
    hits = int((in_pred_top & in_true_top).sum())
    return hits / true_count


In [None]:
# Baseline model: Ridge (via train_model)
if train_df.empty:
    raise ValueError('Training data is empty. Adjust seasons or source.')

# Simple random split for fast iteration.
# The shuffle goes into a separate frame instead of rebinding `train_df`:
# re-running this cell then always starts from the same row order, so the
# seeded 80/20 split (and the metrics below) stay identical across re-runs.
shuffled_df = train_df.sample(frac=1.0, random_state=SEED).reset_index(drop=True)
split = int(len(shuffled_df) * 0.8)
train_part = shuffled_df.iloc[:split]
# Copy the hold-out rows so that adding the `pred` column writes to an
# independent frame rather than to a slice of the shuffled data.
test_part = shuffled_df.iloc[split:].copy()

model = train_model(train_part, feature_cols)
test_part['pred'] = predict_with_model(model, test_part, feature_cols, fallback_cols)

# Score the hold-out rows with the three metrics named in the plan.
metrics = {
    'mae': mae(test_part['target'], test_part['pred']),
    'spearman': spearman(test_part['target'], test_part['pred']),
    'top10_hit_rate': top10_hit_rate(test_part),
}
metrics


## Results and Notes

- Record the metrics from each run here.
- Capture any surprising failures (missing data, session gaps, etc.).


## Next Steps

- Try alternative models (RandomForest, XGBoost) and compare metrics.
- Add features: sector times, long-run pace, or weather signals.
- Evaluate a time-based split (earlier rounds train, later rounds test).
