# IMPORT DATASET

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup
import sqlite3
import pickle
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
import numpy as np
import re
from itertools import product

# XAI
import shap
import dice_ml
from dice_ml import Data, Model, Dice
from dice_ml.explainer_interfaces import dice_tensorflow2
from lime.lime_tabular import LimeTabularExplainer

In [None]:
# Open (or create) the local SQLite database file; every table written or
# queried in the cells below goes through this single connection.
conn = sqlite3.connect('formula_1.sql')

In [None]:
# Practice sessions to scrape, keyed by season (as a string).
# Each entry holds:
#   race_id   - "<numeric id>/<location slug>" path fragment used to build the results URL
#   sessions  - the practice session numbers to fetch for that event
#   race_name - the race name, later matched against races.name in the SQL join
# NOTE(review): the 2023 "1209/italy" entry is disabled (and has no race_name);
# the reason is not recorded here.
races_by_year = {
    '2022': [
        {"race_id": "1124/bahrain", "sessions": ['1', '2', '3'], 'race_name': 'Bahrain Grand Prix'},
        {"race_id": "1125/saudi-arabia", "sessions": ['1', '2', '3'], 'race_name': 'Saudi Arabian Grand Prix'},
        {"race_id": "1108/australia", "sessions": ['1', '2', '3'], 'race_name': 'Australian Grand Prix'},
        {"race_id": "1109/italy", "sessions": ['1', '2'], 'race_name': 'Emilia Romagna Grand Prix'},
        {"race_id": "1110/miami", "sessions": ['1', '2', '3'], 'race_name': 'Miami Grand Prix'},
        {"race_id": "1111/spain", "sessions": ['1', '2', '3'], 'race_name': 'Spanish Grand Prix'},
        {"race_id": "1112/monaco", "sessions": ['1', '2', '3'], 'race_name': 'Monaco Grand Prix'},
        {"race_id": "1126/azerbaijan", "sessions": ['1', '2', '3'], 'race_name': 'Azerbaijan Grand Prix'},
        {"race_id": "1113/canada", "sessions": ['1', '2', '3'], 'race_name': 'Canadian Grand Prix'},
        {"race_id": "1114/great-britain", "sessions": ['1', '2', '3'], 'race_name': 'British Grand Prix'},
        {"race_id": "1115/austria", "sessions": ['1', '2'], 'race_name': 'Austrian Grand Prix'},
        {"race_id": "1116/france", "sessions": ['1', '2', '3'], 'race_name': 'French Grand Prix'},
        {"race_id": "1117/hungary", "sessions": ['1', '2', '3'], 'race_name': 'Hungarian Grand Prix'},
        {"race_id": "1118/belgium", "sessions": ['1', '2', '3'], 'race_name': 'Belgian Grand Prix'},
        {"race_id": "1119/netherlands", "sessions": ['1', '2', '3'], 'race_name': 'Dutch Grand Prix'},
        {"race_id": "1120/italy", "sessions": ['1', '2', '3'], 'race_name': 'Italian Grand Prix'},
        {"race_id": "1133/singapore", "sessions": ['1', '2', '3'], 'race_name': 'Singapore Grand Prix'},
        {"race_id": "1134/japan", "sessions": ['1', '2', '3'], 'race_name': 'Japanese Grand Prix'},
        {"race_id": "1135/united-states", "sessions": ['1', '2', '3'], 'race_name': 'United States Grand Prix'},
        {"race_id": "1136/mexico", "sessions": ['1', '2', '3'], 'race_name': 'Mexico City Grand Prix'},
        {"race_id": "1137/brazil", "sessions": ['1', '2'], 'race_name': 'Brazilian Grand Prix'},
        {"race_id": "1138/abu-dhabi", "sessions": ['1', '2', '3'], 'race_name': 'Abu Dhabi Grand Prix'}
    ],
    '2023': [
        {"race_id": "1141/bahrain", "sessions": ['1', '2', '3'], 'race_name': 'Bahrain Grand Prix'},
        {"race_id": "1142/saudi-arabia", "sessions": ['1', '2', '3'], 'race_name': 'Saudi Arabian Grand Prix'},
        {"race_id": "1143/australia", "sessions": ['1', '2', '3'], 'race_name': 'Australian Grand Prix'},
        {"race_id": "1207/azerbaijan", "sessions": ['1'], 'race_name': 'Azerbaijan Grand Prix'},
        {"race_id": "1208/miami", "sessions": ['1', '2', '3'], 'race_name': 'Miami Grand Prix'},
        #{"race_id": "1209/italy", "sessions": ['1', '2', '3']},
        {"race_id": "1210/monaco", "sessions": ['1', '2', '3'], 'race_name': 'Monaco Grand Prix'},
        {"race_id": "1211/spain", "sessions": ['1', '2', '3'], 'race_name': 'Spanish Grand Prix'},
        {"race_id": "1212/canada", "sessions": ['1', '2', '3'], 'race_name': 'Canadian Grand Prix'},
        {"race_id": "1213/austria", "sessions": ['1'], 'race_name': 'Austrian Grand Prix'},
        {"race_id": "1214/great-britain", "sessions": ['1', '2', '3'], 'race_name': 'British Grand Prix'},
        {"race_id": "1215/hungary", "sessions": ['1', '2', '3'], 'race_name': 'Hungarian Grand Prix'},
        {"race_id": "1216/belgium", "sessions": ['1'], 'race_name': 'Belgian Grand Prix'},
        {"race_id": "1217/netherlands", "sessions": ['1', '2', '3'], 'race_name': 'Dutch Grand Prix'},
        {"race_id": "1218/italy", "sessions": ['1', '2', '3'], 'race_name': 'Italian Grand Prix'},
        {"race_id": "1219/singapore", "sessions": ['1', '2', '3'], 'race_name': 'Singapore Grand Prix'},
        {"race_id": "1220/japan", "sessions": ['1', '2', '3'], 'race_name': 'Japanese Grand Prix'},
        {"race_id": "1221/qatar", "sessions": ['1'], 'race_name': 'Qatar Grand Prix'},
        {"race_id": "1222/united-states", "sessions": ['1'], 'race_name': 'United States Grand Prix'},
        {"race_id": "1223/mexico", "sessions": ['1', '2', '3'], 'race_name': 'Mexico City Grand Prix'},
        {"race_id": "1224/brazil", "sessions": ['1'], 'race_name': 'Brazilian Grand Prix'},
        {"race_id": "1225/las-vegas", "sessions": ['1', '2', '3'], 'race_name': 'Las Vegas Grand Prix'},
        {"race_id": "1226/abu-dhabi", "sessions": ['1', '2', '3'], 'race_name': 'Abu Dhabi Grand Prix'}
    ]
}

def scrape_f1_results(url, timeout=30):
    """
    Scrape F1 practice-session results for every race listed in ``races_by_year``.

    One page is requested per (year, race, practice session). Pages that fail
    to download, or that contain no results table, are reported with ``print``
    and skipped so a single bad page does not abort the whole scrape.

    Parameters
    ----------
    url: The base URL to fetch data from.
    timeout: Seconds to wait for each HTTP response before that page is
        treated as failed (without it a stalled server blocks forever).

    Returns
    -------
    - A DataFrame containing the practice session results for all specified races.
      When nothing could be scraped it still carries the result columns, with no rows.
    """
    columns = ['Year', 'Location', 'Session', 'Pos', 'No', 'Driver', 'Car', 'Time', 'Gap', 'Laps', 'Racename']
    # Collect one frame per page and concatenate once at the end; growing a
    # DataFrame with pd.concat inside the loop re-copies all earlier rows each time.
    session_frames = []

    for year, races in races_by_year.items():
        for race in races:
            race_id_split, location = race["race_id"].split('/')
            for practice_session in race["sessions"]:
                full_url = f"{url}/{year}/races/{race_id_split}/{location}/practice-{practice_session}.html"

                # Only the network call can raise RequestException, so keep the try minimal.
                try:
                    response = requests.get(full_url, timeout=timeout)
                    response.raise_for_status()
                except requests.exceptions.RequestException as e:
                    print(f"Request failed: {e}")
                    continue

                soup = BeautifulSoup(response.text, 'html.parser')
                table = soup.find('table', class_='resultsarchive-table')
                if table is None:
                    print(f"Table not found for {year}, {location}, Practice {practice_session}")
                    continue

                data = []
                # The first <tr> is the header row.
                for row in table.find_all('tr')[1:]:
                    cols = row.find_all('td')
                    if not cols:
                        # A row without data cells cannot fill the expected columns.
                        continue
                    # Drop the first and last cell of each row, as the original layout requires.
                    row_data = [col.get_text(strip=True) for col in cols][1:-1]
                    data.append([year, location, practice_session, *row_data, race['race_name']])

                session_frames.append(pd.DataFrame(data, columns=columns))

    if not session_frames:
        return pd.DataFrame(columns=columns)
    return pd.concat(session_frames, ignore_index=True)


# Scrape every configured practice session, then normalise the columns that
# the SQL join and the numeric comparisons need.
df_all_practice = scrape_f1_results('https://www.formula1.com/en/results.html')

# The scraped "Driver" text is split into its last three characters (stored as
# the driver code) and everything before them (stored as the driver name).
driver_text = df_all_practice['Driver'].str
df_all_practice['driverCode'] = driver_text[-3:]
df_all_practice['driverName'] = driver_text[:-3]

# Scraped cells are text; these three are needed as numbers downstream.
for numeric_col in ('Pos', 'Session', 'Year'):
    df_all_practice[numeric_col] = pd.to_numeric(df_all_practice[numeric_col])

df_all_practice.to_sql('freePracticeRanking', conn, if_exists='replace', index=False)

In [None]:
# (CSV path, SQLite table name) pairs, loaded in this order. Every table is
# replaced on each run. Note that driver_standings.csv is stored under the
# singular table name 'driver_standing', which the extraction query uses.
csv_to_table = [
    ('formula_one/circuits.csv', 'circuits'),
    ('formula_one/constructor_results.csv', 'constructor_results'),
    ('formula_one/constructor_standings.csv', 'constructor_standings'),
    ('formula_one/constructors.csv', 'constructors'),
    ('formula_one/driver_standings.csv', 'driver_standing'),
    ('formula_one/drivers.csv', 'drivers'),
    ('formula_one/lap_times.csv', 'lap_times'),
    ('formula_one/pit_stops.csv', 'pit_stops'),
    ('formula_one/qualifying.csv', 'qualifying'),
    ('formula_one/races.csv', 'races'),
    ('formula_one/results.csv', 'results'),
    ('formula_one/seasons.csv', 'seasons'),
    ('formula_one/sprint_results.csv', 'sprint_results'),
    ('formula_one/status.csv', 'status'),
]
for csv_path, table_name in csv_to_table:
    pd.read_csv(csv_path).to_sql(table_name, conn, index=False, if_exists='replace')

In [None]:
def get_distances(timeout=30):
    """
    Return the circuits table with a ``track_km_distance`` column added.

    For every circuit in ``formula_one/circuits.csv`` the page behind its
    ``url`` is fetched and the cell following the infobox "Length" header is
    read. The first five characters of that text are parsed as a float.

    Circuits whose page could not be read (or whose cell text is
    ``'Surface\\n'``) take the next value, in order of appearance, from a short
    hard-coded fallback list.

    Parameters
    ----------
    timeout: Seconds to wait for each HTTP response before that circuit is
        treated as failed.

    Returns
    -------
    - The circuits DataFrame with the extra ``track_km_distance`` float column.

    Raises
    ------
    IndexError: more circuits need a fallback than fallback values exist.
    """
    df_track = pd.read_csv('formula_one/circuits.csv')

    track_distances = []
    for url in df_track['url']:
        try:
            response = requests.get(url, timeout=timeout).text
            # NOTE(review): no parser is named here, so bs4 picks whichever is
            # installed; pin one once the fallback list below has been re-checked.
            soup = BeautifulSoup(response)
            text = (soup
                    .find_all('table', class_='infobox vcard')[0]
                    .find_all('th', string=lambda text: 'Length' in text if text else False)[0]
                    .find_next_sibling()
                    ).text
        except Exception as exc:
            # Best effort: any failure (network, missing infobox, missing cell)
            # falls back below. Unlike a bare except, Ctrl-C still interrupts.
            print(f"Could not read track length from {url}: {exc}")
            text = None
        track_distances.append(text)

    # Fallbacks are consumed positionally, one per failed circuit, in file order.
    fallback_distances = iter(['6.201', '6.201', '3.186'])
    distance_values = []
    for distance in track_distances:
        if distance is None or distance == 'Surface\n':
            try:
                distance = next(fallback_distances)
            except StopIteration:
                raise IndexError(
                    'more circuits are missing a scraped length than there are fallback distances'
                ) from None
        distance_values.append(float(distance[:5]))

    df_track['track_km_distance'] = distance_values
    return df_track

# Build the circuits table with track lengths and store it as 'circuitsNew',
# the table the extraction query joins for track_km_distance.
distances = get_distances()
distances.to_sql('circuitsNew', conn, if_exists='replace', index=False)

# EXTRACT FROM SQL

In [None]:
# One row per race result from 2022 onwards, enriched with driver and
# constructor standings, qualifying, circuit length, the latest scraped
# practice time, and per-race lap aggregates.
#
# The lap aggregate divides by 60000.0 (not 60000): SQLite performs integer
# division when both operands are integers, which truncated the total race
# time to whole minutes and then the average lap time to a whole number of
# minutes, corrupting aveLapSpeed (or making it NULL for sub-minute laps).
df_sql = pd.read_sql(
    """
    SELECT
        re.resultId,
        re.raceId,
        ra.year,
        ra.date,
        ra.circuitId,
        ci.circuitRef,
        ci.name as circuitName,
        ra.name as raceName,
        re.driverId,
        d.forename,
        d.surname,
        d.driverRef,
        d.code,
        re.constructorId,
        c.name as constructorName,
        re.positionOrder,
        q.position as qualifyingPosition,
        re.rank as fastestLapRank,
        ds.points as dsPoints,
        ds.position as dsPosition,
        ds.wins as driverWins,
        cs.points as constructorPoints,
        cs.position as constructorPosition,
        cs.wins as constructorWins,
        lap.average_position as aveLapPosition,
        ci.track_km_distance / (lap.total_lap_time / lap.total_laps) as aveLapSpeed,
        ci.track_km_distance,
        q.q1,
        q.q2,
        q.q3,
        fp.Time as fpTime,
        s.statusId

    FROM
        results re
    LEFT JOIN
        races ra USING(raceId)
    LEFT JOIN
        drivers d USING(driverId)
    LEFT JOIN
        driver_standing ds USING(raceId, driverId)
    LEFT JOIN
        constructors c USING(constructorId)
    LEFT JOIN
        constructor_standings cs USING(constructorId, raceId)
    LEFT JOIN
        qualifying q USING(raceId, driverId)
    LEFT JOIN
        circuitsNew ci USING(circuitId)
    LEFT JOIN
        status s USING(statusId)
    LEFT JOIN
        (
            SELECT
                driverCode,
                Racename,
                Year,
                Pos,
                Time,
                MAX(Session)
            FROM
                freePracticeRanking
            GROUP BY
                1,2,3
        ) fp ON (fp.driverCode = d.code AND fp.Racename = ra.name AND fp.Year = ra.year)
    LEFT JOIN
        (
            SELECT
                l.raceId,
                l.driverId,
                SUM(l.milliseconds)/60000.0 as total_lap_time,
                AVG(l.position) as average_position,
                COUNT(l.lap) as total_laps
            FROM
                lap_times l
            LEFT JOIN
                races r USING(raceId)
            WHERE
                year > 2021
            GROUP BY
                1,2
        ) lap USING(raceId, driverId)
    WHERE ra.year > 2021
    AND d.driverRef not in ('vettel', 'mick_schumacher', 'latifi', 'lawson', 'de_vries')
    AND s.statusId in (1, 11, 12, 16)
    ORDER BY
        3, 2, 7
    """,
    conn
)

In [None]:
def convert_to_timedelta(time_str):
    """
    Convert a lap time written as 'mm:ss.xxx' into a duration in minutes.

    Parameters
    ----------
    time_str: The lap time text. Anything that is not a string, or that does
        not match the 'mm:ss.xxx' pattern, counts as missing.

    Returns
    -------
    - The duration in minutes as a float, or float NaN when the value is
      missing or cannot be parsed. NaN (rather than ``pd.NaT``) keeps a column
      built with ``Series.apply`` as float64, so numeric reductions such as
      ``mean`` work on it directly.
    """
    missing = float('nan')

    if not isinstance(time_str, str):
        return missing

    # Only 'mm:ss.xxx' is accepted; the fractional part is mandatory.
    if re.match(r'^(\d+):(\d+\.\d+)$', time_str) is None:
        return missing

    try:
        # Prepend an hours field so pandas reads the value as 'hh:mm:ss.xxx'.
        return pd.to_timedelta('00:' + time_str).total_seconds() / 60
    except ValueError:
        return missing

In [None]:
# Convert the three qualifying segment times and the practice time to minutes.
timing_sources = (('q1', 'q1_time'), ('q2', 'q2_time'), ('q3', 'q3_time'), ('fpTime', 'fp_time'))
for source_col, minutes_col in timing_sources:
    df_sql[minutes_col] = df_sql[source_col].apply(convert_to_timedelta)

# Row-wise mean of whichever of those times exist, and the speed derived from
# it (track length divided by that time).
minute_cols = ['q1_time', 'q2_time', 'q3_time', 'fp_time']
df_sql['impute_time'] = df_sql[minute_cols].mean(axis=1)
df_sql['impute_speed'] = np.divide(df_sql['track_km_distance'], df_sql['impute_time'])

# The raw and intermediate timing columns are no longer needed.
df_sql = df_sql.drop(columns=['q1', 'q2', 'q3', 'q1_time', 'q2_time', 'q3_time', 'fp_time', 'fpTime'])

In [None]:
# Notebook display of the assembled result frame.
df_sql

# DATA PREPROCESSING

In [None]:
# One frame per driver, keyed by driverRef, indexed by raceId and sorted on it.
driver_dict = {}
for driver in df_sql.groupby('driverId'):
    driver_rows = driver[1]  # groupby yields (driverId, rows) pairs
    driver_ref = driver_rows.driverRef.unique()[0]
    driver_dict[driver_ref] = driver_rows.set_index('raceId').sort_index().copy()

In [None]:
def create_features(df):
    """
    Build the lagged modelling features for one driver's race history.

    Columns are addressed by name, so the result does not depend on the column
    order of ``df``, and the caller's frame is left untouched.

    Parameters
    ----------
    df: One driver's results, indexed by raceId. It must contain
        positionOrder, qualifyingPosition, fastestLapRank, the driver and
        constructor standing columns, aveLapPosition, aveLapSpeed and
        impute_speed.

    Returns
    -------
    - ``df_race``: the input sorted by index, with gaps filled and every lag
      column added.
    - The modelling frame: the rows of ``df_race`` that have no missing value
      in ANY column, restricted to the target (positionOrder) and the feature
      columns.
    """
    # sort_index returns a new frame, so the edits below never reach ``df``.
    df_race = df.sort_index()
    df_race['fastestLapRank'] = pd.to_numeric(df_race['fastestLapRank'])

    # A missing average lap position falls back to the qualifying position,
    # and a missing average lap speed to the speed imputed from session times.
    df_race['aveLapPosition'] = df_race['aveLapPosition'].fillna(df_race['qualifyingPosition'])
    df_race['aveLapSpeed'] = df_race['aveLapSpeed'].fillna(df_race['impute_speed'])

    # Finishing and qualifying positions of the previous one to five races.
    for lag in range(1, 6):
        df_race[f'{lag}LagPosition'] = df_race['positionOrder'].shift(lag)
        df_race[f'{lag}LagQPos'] = df_race['qualifyingPosition'].shift(lag)

    # Values that are only known after a race enter as their previous-race value.
    previous_race_sources = [
        'dsPoints',
        'dsPosition',
        'driverWins',
        'constructorPoints',
        'constructorPosition',
        'constructorWins',
        'aveLapPosition',
        'aveLapSpeed',
        'fastestLapRank',
    ]
    for source in previous_race_sources:
        df_race[f'1Lag_{source}'] = df_race[source].shift(1)

    model_columns = (
        ['positionOrder']
        + [f'1Lag_{source}' for source in previous_race_sources]
        + ['qualifyingPosition']
        + [f'{lag}LagPosition' for lag in range(1, 6)]
        + [f'{lag}LagQPos' for lag in range(1, 6)]
    )
    df_res = df_race.dropna()
    return df_race, df_res[model_columns]

In [None]:
# Per driver: the full feature frame (complete_data) and the NaN-free
# modelling frame (dataset), both keyed by driverRef.
complete_data, dataset = {}, {}
for driver, df in driver_dict.items():
    full_frame, model_frame = create_features(df.copy())
    complete_data[driver] = full_frame
    dataset[driver] = model_frame

In [None]:
# Driver master data for the drivers that have a modelling frame, indexed by
# driverRef. The names are passed as bound parameters: formatting a Python
# tuple into the SQL text produced invalid SQL for one driver ("('x',)") or
# none ("()"), and offered no quoting safety.
driver_refs = list(dataset)
driver_placeholders = ', '.join('?' for _ in driver_refs)
df_drivers = pd.read_sql(
    f"""
    SELECT
        *
    FROM
        drivers
    WHERE
        driverRef in ({driver_placeholders})
    """,
    conn,
    params=driver_refs
).set_index('driverRef')

# BASELINE

In [None]:
def average_precision_at_k(true_ranking, predicted_ranking, k_at=10):
    """
    Average precision of a predicted ordering against the true top finishers.

    ``k`` is the number of entries in ``true_ranking`` whose value is at most
    ``k_at``. The first ``k`` labels of ``predicted_ranking`` are compared with
    the first ``k`` labels of ``true_ranking``, so both are expected to arrive
    already sorted best-first. At every predicted position that holds one of
    those true labels, the precision so far (hits / position) is accumulated.

    Returns the accumulated precision divided by ``k``, or 0 when there was no
    hit at all (which also covers ``k == 0``).
    """
    k = np.sum(true_ranking <= k_at)
    relevant_labels = true_ranking.index[:k]

    hits = 0
    sum_precision = 0
    for position, candidate in enumerate(predicted_ranking.index[:k], start=1):
        if candidate not in relevant_labels:
            continue
        hits += 1
        sum_precision += hits / position

    if hits == 0:
        return 0
    return sum_precision / k

## Using Qualifying Position

In [None]:
# Baseline 1: predict each driver's finishing position with their qualifying
# position. total[race] is a frame with one row per driver and the columns
# 'True' and 'Pred'.
baseline, total = {}, {}
for race in range(1106, 1121):
    race_entries = {}
    baseline[race] = race_entries
    for driver, df in dataset.items():
        if race not in df.index:
            continue  # this driver has no modelling row for the race
        y_true = df.at[race, 'positionOrder']
        y_pred = df.at[race, 'qualifyingPosition']
        race_entries[driver] = {'True': y_true, 'Pred': y_pred}
    results_df = pd.DataFrame(race_entries).T
    total[race] = results_df

In [None]:
# Average precision at 10 for every evaluated race.
baseline_apak = {}
for n in range(1106, 1121):
    race_results = total[n]
    true = race_results['True'].sort_values()
    pred = race_results['Pred'].sort_values()
    # Driver ids in true / predicted order (looked up, not used by the score).
    driver_true = df_drivers.loc[true.index]['driverId']
    driver_pred = df_drivers.loc[pred.index]['driverId']
    prec = average_precision_at_k(true, pred, k_at=10)
    baseline_apak[n] = prec

# For each race n: the mean score over race n and every later race.
baseline_mapak = {
    start: np.mean([score for race_id, score in baseline_apak.items() if race_id >= start])
    for start in range(1106, 1121)
}
np.mean(list(baseline_mapak.values()))

## Using Walk Forward

In [None]:
# Baseline 2 (walk forward): predict each race with the driver's finishing
# position in the previous row of their modelling frame.
baseline, total, wf_dataset = {}, {}, {}
for driver, df in dataset.items():
    df_copy = df.assign(Pred=df['positionOrder'].shift(1))
    # The first row has no previous result, so it drops out here.
    wf_dataset[driver] = df_copy.dropna()

for race in range(1106, 1121):
    race_entries = {}
    baseline[race] = race_entries
    for driver, df in wf_dataset.items():
        if race not in df.index:
            continue
        y_true = df.at[race, 'positionOrder']
        y_pred = df.at[race, 'Pred']
        race_entries[driver] = {'True': y_true, 'Pred': y_pred}
    results_df = pd.DataFrame(race_entries).T
    total[race] = results_df

In [None]:
# Average precision at 10 of the walk-forward baseline for every evaluated race.
baseline_apak = {}
for n in range(1106, 1121):
    race_results = total[n]
    true = race_results['True'].sort_values()
    pred = race_results['Pred'].sort_values()
    # Driver ids in true / predicted order (looked up, not used by the score).
    driver_true = df_drivers.loc[true.index]['driverId']
    driver_pred = df_drivers.loc[pred.index]['driverId']
    prec = average_precision_at_k(true, pred, k_at=10)
    baseline_apak[n] = prec

# For each race n: the mean score over race n and every later race.
baseline_mapak = {
    start: np.mean([score for race_id, score in baseline_apak.items() if race_id >= start])
    for start in range(1106, 1121)
}
np.mean(list(baseline_mapak.values()))

# MODEL TRAINING & PREDICTIONS

### LINEAR REGRESSION

In [None]:
# Linear regression, one model per (race, driver): fit on that driver's rows
# before the race, then predict the race itself.
lr_dict, total_pred = {}, {}
for race in range(1106, 1121):
    race_models = {}
    lr_dict[race] = race_models
    for driver, df in dataset.items():
        if race not in df.index:
            continue
        df_features = df.drop(columns=['positionOrder'])
        df_target = df['positionOrder']
        # Rows up to race - 1 train the model; rows from the race onwards are held out.
        X_train, y_train = df_features.loc[:race-1], df_target.loc[:race-1]
        X_holdout, y_holdout = df_features.loc[race:], df_target.loc[race:]
        test_race = X_holdout.loc[race:race]
        lr = LinearRegression(n_jobs=-1)
        lr.fit(X_train, y_train)
        lr_res = lr.predict(test_race)
        race_models[driver] = {
            'X_train': X_train,
            'X_test': X_holdout.loc[race].to_numpy().reshape(1,-1),
            'model': lr,
            'Pred': lr_res[0],
            'True': y_holdout[race],
        }
    # Keep only the prediction and the truth, one row per driver.
    results_df = pd.DataFrame(race_models, index=['Pred', 'True']).T
    total_pred[race] = results_df

In [None]:
# Average precision at 10 of the linear regression for every evaluated race.
lr_apak = {}
for n in range(1106, 1121):
    race_results = total_pred[n]
    true = race_results['True'].sort_values()
    pred = race_results['Pred'].sort_values()
    # Driver ids in true / predicted order (looked up, not used by the score).
    driver_true = df_drivers.loc[true.index]['driverId']
    driver_pred = df_drivers.loc[pred.index]['driverId']
    prec = average_precision_at_k(true, pred, k_at=10)
    lr_apak[n] = prec

# For each race n: the mean score over race n and every later race.
lr_mapak = {
    start: np.mean([score for race_id, score in lr_apak.items() if race_id >= start])
    for start in range(1106, 1121)
}
np.mean(list(lr_mapak.values()))

### GRID SEARCH

In [None]:
def gridsearch_gb(model, parameters):
    """
    Walk-forward grid search for a gradient-boosting style regressor.

    Parameters
    ----------
    model: Regressor class accepting random_state, n_estimators, max_depth
        and learning_rate.
    parameters: Iterable of (max_depth, learning_rate) pairs to evaluate.

    Returns
    -------
    - A dict keyed by (max_depth, learning_rate). Each value maps every race in
      1106..1120 to its per-driver {'Pred', 'True'} entries, plus 'apak' (score
      per race), 'mapak' (mean score from each race onwards) and 'model_score'
      (mean of the 'mapak' values).
    """
    model_dict = {}
    for depth, rate in parameters:
        combo_results = {}
        model_dict[(depth, rate)] = combo_results
        model_apak = {}

        for race in range(1106, 1121):
            race_preds = {}
            combo_results[race] = race_preds

            for driver, df in dataset.items():
                if race not in df.index:
                    continue
                df_features = df.drop(columns=['positionOrder'])
                df_target = df['positionOrder']
                # Train on rows before the race; the race row is the single test case.
                X_train, y_train = df_features.loc[:race-1], df_target.loc[:race-1]
                X_holdout, y_holdout = df_features.loc[race:], df_target.loc[race:]
                mod = model(random_state=0, n_estimators=300, max_depth=depth, learning_rate=rate)
                mod.fit(X_train, y_train)
                model_res = mod.predict(X_holdout.loc[race:race])
                race_preds[driver] = {
                    'Pred': model_res[0],
                    'True': y_holdout[race],
                }

            results_df = pd.DataFrame(race_preds, index=['Pred', 'True']).T
            model_apak[race] = average_precision_at_k(
                results_df['True'].sort_values(),
                results_df['Pred'].sort_values(),
                k_at=10,
            )

        # For each race n: the mean score over race n and every later race.
        model_mapak = {
            start: np.mean([score for race_id, score in model_apak.items() if race_id >= start])
            for start in range(1106, 1121)
        }
        combo_results['apak'] = model_apak
        combo_results['mapak'] = model_mapak
        combo_results['model_score'] = np.mean(list(model_mapak.values()))
    return model_dict

In [None]:
# Grid for sklearn's GradientBoostingRegressor: every (max_depth, learning_rate) pair.
max_depth_range = [8, 10, 12]
learning_rate_range = [0.01, 0.05, 0.1]
param = product(max_depth_range, learning_rate_range)
GBM_results = gridsearch_gb(GradientBoostingRegressor, param)
for param in GBM_results:
    print(f"GBM - {param} - {GBM_results[param]['model_score']}")

In [None]:
# Same grid for XGBRegressor.
max_depth_range = [8, 10, 12]
learning_rate_range = [0.01, 0.05, 0.1]
param = product(max_depth_range, learning_rate_range)
XGB_results = gridsearch_gb(XGBRegressor, param)
for param in XGB_results:
    print(f"XGB - {param} - {XGB_results[param]['model_score']}")

In [None]:
def gridsearch_rf(model, parameters):
    """
    Walk-forward grid search for a random-forest style regressor.

    Parameters
    ----------
    model: Regressor class accepting random_state, n_estimators, max_depth
        and max_features.
    parameters: Iterable of (max_depth, max_features) pairs to evaluate.

    Returns
    -------
    - A dict keyed by (max_depth, max_features). Each value maps every race in
      1106..1120 to its per-driver {'Pred', 'True'} entries, plus 'apak' (score
      per race), 'mapak' (mean score from each race onwards) and 'model_score'
      (mean of the 'mapak' values).
    """
    model_dict = {}
    for depth, features in parameters:
        combo_results = {}
        model_dict[(depth, features)] = combo_results
        model_apak = {}

        for race in range(1106, 1121):
            race_preds = {}
            combo_results[race] = race_preds

            for driver, df in dataset.items():
                if race not in df.index:
                    continue
                df_features = df.drop(columns=['positionOrder'])
                df_target = df['positionOrder']
                # Train on rows before the race; the race row is the single test case.
                X_train, y_train = df_features.loc[:race-1], df_target.loc[:race-1]
                X_holdout, y_holdout = df_features.loc[race:], df_target.loc[race:]
                mod = model(random_state=0, n_estimators=300, max_depth=depth, max_features=features)
                mod.fit(X_train, y_train)
                model_res = mod.predict(X_holdout.loc[race:race])
                race_preds[driver] = {
                    'Pred': model_res[0],
                    'True': y_holdout[race],
                }

            results_df = pd.DataFrame(race_preds, index=['Pred', 'True']).T
            model_apak[race] = average_precision_at_k(
                results_df['True'].sort_values(),
                results_df['Pred'].sort_values(),
                k_at=10,
            )

        # For each race n: the mean score over race n and every later race.
        model_mapak = {
            start: np.mean([score for race_id, score in model_apak.items() if race_id >= start])
            for start in range(1106, 1121)
        }
        combo_results['apak'] = model_apak
        combo_results['mapak'] = model_mapak
        combo_results['model_score'] = np.mean(list(model_mapak.values()))
    return model_dict

In [None]:
# First random-forest grid: depths 4 to 8, larger feature fractions (None = all features).
max_depth_range = [4, 5, 6, 7, 8]
max_features_range = [0.5, 0.6, 0.7, 0.8, 0.9, None]
param = product(max_depth_range, max_features_range)
RF_results = gridsearch_rf(RandomForestRegressor, param)
for param in RF_results:
    print(f"RF - {param} - {RF_results[param]['model_score']}")

In [None]:
# Second random-forest grid: shallower trees and smaller feature fractions.
max_depth_range = [2, 3, 4]
max_features_range = [0.3, 0.35, 0.4, 0.45, 0.5, 0.55]
param = product(max_depth_range, max_features_range)
RF_results = gridsearch_rf(RandomForestRegressor, param)
for param in RF_results:
    print(f"RF - {param} - {RF_results[param]['model_score']}")

In [None]:
# Third random-forest grid: depths 1 and 2 with feature fractions 0.2 to 0.4.
max_depth_range = [1, 2]
max_features_range = [0.2, 0.25, 0.3, 0.35, 0.4]
param = product(max_depth_range, max_features_range)
RF_results = gridsearch_rf(RandomForestRegressor, param)
for param in RF_results:
    print(f"RF - {param} - {RF_results[param]['model_score']}")

In [None]:
# Random forest with max_depth=1 and max_features=0.3, one model per
# (race, driver). The training data and fitted model are kept for the
# explainability cells.
rf_dict, total_pred = {}, {}
for race in range(1106, 1121):
    race_models = {}
    rf_dict[race] = race_models
    for driver, df in dataset.items():
        if race not in df.index:
            continue
        df_features = df.drop(columns=['positionOrder'])
        df_target = df['positionOrder']
        # Rows up to race - 1 train the model; rows from the race onwards are held out.
        X_train, y_train = df_features.loc[:race-1], df_target.loc[:race-1]
        X_holdout, y_holdout = df_features.loc[race:], df_target.loc[race:]
        test_race = X_holdout.loc[race:race]
        rf = RandomForestRegressor(random_state=0, n_estimators=300, max_depth=1, max_features=0.3)
        rf.fit(X_train, y_train)
        rf_res = rf.predict(test_race)
        race_models[driver] = {
            'X_train': X_train,
            'X_test': X_holdout.loc[race].to_numpy().reshape(1,-1),
            'y_train': y_train,
            'model': rf,
            'Pred': rf_res[0],
            'True': y_holdout[race],
        }
    # Keep only the prediction and the truth, one row per driver.
    results_df = pd.DataFrame(race_models, index=['Pred', 'True']).T
    total_pred[race] = results_df

In [None]:
# Average precision at 10 of the random forest for every evaluated race.
rf_apak = {}
for n in range(1106, 1121):
    race_results = total_pred[n]
    true = race_results['True'].sort_values()
    pred = race_results['Pred'].sort_values()
    # Driver ids in true / predicted order (looked up, not used by the score).
    driver_true = df_drivers.loc[true.index]['driverId']
    driver_pred = df_drivers.loc[pred.index]['driverId']
    prec = average_precision_at_k(true, pred, k_at=10)
    rf_apak[n] = prec

# For each race n: the mean score over race n and every later race.
rf_mapak = {
    start: np.mean([score for race_id, score in rf_apak.items() if race_id >= start])
    for start in range(1106, 1121)
}
np.mean(list(rf_mapak.values()))

In [None]:
# Notebook display of the per-race average precision of the random forest.
rf_apak

In [None]:
# Show, race by race, the predicted and true positions ordered by prediction.
for r in range(1106, 1121):
    print(f'Race {r}:')
    display(total_pred[r].sort_values('Pred'))

# EXPLAINABILITY

In [None]:
# SHAP explanation of every (race, driver) random-forest prediction.
shap_dict = {}
model_dict = rf_dict
for n in model_dict:
    race_explanations = {}
    shap_dict[n] = race_explanations
    for driver, df in dataset.items():
        if n not in df.index:
            continue
        fitted = model_dict[n][driver]
        train = fitted['X_train']
        test = fitted['X_test']
        model = fitted['model']
        feature_names = train.columns.tolist()

        # The training frame is the background data for the explainer.
        shap_explainer = shap.Explainer(model.predict, train, feature_names=feature_names, seed=0)
        shap_values = shap_explainer(test)
        # The test set is a single row, so take element 0 of each array.
        shap_explanation = shap.Explanation(
            shap_values.values[0],
            shap_values.base_values[0],
            shap_values.data[0],
            feature_names=feature_names
        )
        race_explanations[driver] = {
            'explainer': shap_explainer,
            'shap_values': shap_values,
            'shap_explanation': shap_explanation,
        }

In [None]:
# DiCE counterfactual explainer for every (race, driver) random-forest model.
dice_dict = {}
model_dict = rf_dict
for n in model_dict:
    race_explainers = {}
    dice_dict[n] = race_explainers
    for driver, df in dataset.items():
        if n not in df.index:
            continue
        fitted = model_dict[n][driver]
        model = fitted['model']
        X_train = fitted['X_train']
        y_train = fitted['y_train']
        test = fitted['X_test']

        # Every feature is declared continuous; the outcome is the finishing position.
        dice_data = Data(
            dataframe=X_train.join(y_train),
            continuous_features=X_train.columns.tolist(),
            outcome_name='positionOrder')
        dice_model = Model(model=model, backend='sklearn', model_type='regressor')
        dice_exp = Dice(dice_data, dice_model, method='genetic')

        race_explainers[driver] = {
            'explainer': dice_exp,
            'test': test,
        }

In [None]:
# LIME explanation of every (race, driver) prediction in model_dict.
lime_dict = {}

for n in model_dict:
    race_explanations = {}
    lime_dict[n] = race_explanations
    for driver, df in dataset.items():
        if n not in df.index:
            continue
        fitted = model_dict[n][driver]
        train = fitted['X_train']
        test = fitted['X_test']
        model = fitted['model']
        lime_explainer = LimeTabularExplainer(
            train.values,
            feature_names=train.columns.tolist(),
            verbose=False,
            mode='regression',
            random_state=0,
        )
        # X_test is stored as a (1, n_features) array; flatten it to one row.
        lime_explanation = lime_explainer.explain_instance(test.reshape(-1), model.predict)
        race_explanations[driver] = {
            'lime_explainer': lime_explainer,
            'lime_explanation': lime_explanation,
        }

In [None]:
# Counterfactuals for one driver and race: five alternatives whose predicted
# finishing position falls in [9, 10], changing only the listed features.
# Commented-out entries are features currently held fixed.
driver = 'gasly'
n = 1120
X = model_dict[n][driver]['X_train']
cfes1 = dice_dict[n][driver]['explainer'].generate_counterfactuals(
    # The stored test row is a bare array; rebuild a one-row frame with the training columns.
    pd.DataFrame(dice_dict[n][driver]['test'].reshape(1,-1), columns=X.columns.tolist()),
    total_CFs=5,
    desired_range=[9, 10],
    features_to_vary=[
        # '1Lag_aveLapSpeed',
        'qualifyingPosition',
         '1LagPosition',
         '2LagPosition',
         '3LagPosition',
         '4LagPosition',
         '5LagPosition',
         # '1LagQPos',
         # '2LagQPos',
         # '3LagQPos',
         # '4LagQPos',
         # '5LagQPos'
                     ]
)

In [None]:
# Show the true result, the SHAP waterfall and the LIME explanation for one
# driver and race.
driver = 'gasly'
n = 1120

print(driver, model_dict[n][driver]['True'])
shap.plots.waterfall(shap_dict[n][driver]['shap_explanation'])

# Alternative rendering as a matplotlib figure:
# lime_dict[n][driver]['lime_explanation'].as_pyplot_figure();
lime_dict[n][driver]['lime_explanation'].show_in_notebook();

In [None]:
# Render the counterfactuals as a table, showing only the changed values.
cfes1.visualize_as_dataframe(show_only_changes=True)

In [None]:
# Render the same counterfactuals as a list, showing only the changed values.
cfes1.visualize_as_list(show_only_changes=True)

# PREDICTING BAHRAIN 2024

In [None]:
# Bahrain 2024 qualifying positions, keyed by driver reference.
_BAHRAIN_2024_QUALIFYING = {
    'hamilton': 9,
    'alonso': 6,
    'gasly': 20,
    'hulkenberg': 10,
    'perez': 5,
    'ricciardo': 14,
    'bottas': 16,
    'kevin_magnussen': 15,
    'max_verstappen': 1,
    'sainz': 4,
    'ocon': 19,
    'stroll': 12,
    'leclerc': 2,
    'norris': 7,
    'russell': 3,
    'albon': 13,
    'tsunoda': 11,
    'zhou': 17,
    'piastri': 8,
    'sargeant': 18,
}

# Per-race columns that are carried forward one race as ``1Lag_<column>``.
_PREVIOUS_RACE_COLUMNS = [
    'dsPoints',
    'dsPosition',
    'driverWins',
    'constructorPoints',
    'constructorPosition',
    'constructorWins',
    'aveLapPosition',
    'aveLapSpeed',
    'fastestLapRank',
]


def _fill_missing_by_position(df, column, target_pos, source_pos):
    """Where ``column`` is NaN, copy the value at ``source_pos`` into ``target_pos``.

    Both positions are integer column positions (``iloc`` style); the frame
    is modified in place.
    """
    missing_rows = np.flatnonzero(np.isnan(df[column]).to_numpy())
    if missing_rows.size:
        df.iloc[missing_rows, target_pos] = (
            df.iloc[missing_rows, source_pos].to_numpy()
        )


def create_bahrain_test(df, driver=None):
    """Build the feature row used to predict one driver's Bahrain 2024 result.

    A placeholder row labelled ``'bahrain'`` is appended after the driver's
    races (sorted by index), lag features are derived from the preceding
    rows, and the driver's Bahrain 2024 qualifying position is filled in.

    Parameters
    ----------
    df : pandas.DataFrame
        Per-race history of a single driver. It must contain
        ``positionOrder``, ``qualifyingPosition`` and the columns listed in
        ``_PREVIOUS_RACE_COLUMNS``. The frame is modified in place
        (``fastestLapRank`` is converted to numeric and missing lap
        statistics are filled), so pass a copy to keep the original intact.
    driver : str, optional
        Key into the Bahrain 2024 qualifying table. When omitted, the
        notebook-level variable ``driver`` is used, which keeps existing
        ``create_bahrain_test(df)`` calls working.

    Returns
    -------
    pandas.Series
        The 20 model features of the ``'bahrain'`` row, in this order: the
        nine ``1Lag_*`` columns, ``qualifyingPosition``, ``1..5LagPosition``
        and ``1..5LagQPos``.

    Raises
    ------
    KeyError
        If no qualifying position is known for ``driver``.
    """
    if driver is None:
        # Backward compatibility: callers that pass only ``df`` rely on a
        # notebook-level ``driver`` (e.g. the loop variable of the calling cell).
        driver = globals().get('driver')
    if driver not in _BAHRAIN_2024_QUALIFYING:
        raise KeyError(
            f'no Bahrain 2024 qualifying position for driver {driver!r}'
        )

    df['fastestLapRank'] = pd.to_numeric(df['fastestLapRank'])

    # NOTE(review): the positional indices below are kept from the original
    # code. Positions 23 and 24 are presumably 'aveLapPosition' and
    # 'aveLapSpeed', and 15 / -1 their fallback sources -- confirm against the
    # column layout of the driver frames.
    _fill_missing_by_position(df, 'aveLapPosition', target_pos=23, source_pos=15)
    _fill_missing_by_position(df, 'aveLapSpeed', target_pos=24, source_pos=-1)

    df_race = df.sort_index()
    # Placeholder row for the race to predict; it goes last so that shift(k)
    # pulls the k-th most recent race into it.
    df_race.loc['bahrain'] = np.nan

    lags = range(1, 6)
    for lag in lags:
        df_race[f'{lag}LagPosition'] = df_race['positionOrder'].shift(lag)
        df_race[f'{lag}LagQPos'] = df_race['qualifyingPosition'].shift(lag)
    for column in _PREVIOUS_RACE_COLUMNS:
        df_race['1Lag_' + column] = df_race[column].shift(1)

    df_race.at['bahrain', 'qualifyingPosition'] = _BAHRAIN_2024_QUALIFYING[driver]

    # 'fpPosition' is intentionally not part of the feature set.
    feature_columns = (
        ['1Lag_' + column for column in _PREVIOUS_RACE_COLUMNS]
        + ['qualifyingPosition']
        + [f'{lag}LagPosition' for lag in lags]
        + [f'{lag}LagQPos' for lag in lags]
    )
    return df_race.loc['bahrain', feature_columns]

In [None]:
# Build the Bahrain 2024 feature row for every driver.
# NOTE: create_bahrain_test looks up the qualifying position through the
# notebook-level name `driver`, so this must remain a plain `for` loop that
# binds `driver` in the notebook namespace (a dict comprehension would not).
bahrain_test = {}
for driver, df in driver_dict.items():
    # A copy is passed because create_bahrain_test modifies its argument.
    bahrain_test[driver] = create_bahrain_test(df.copy())

In [None]:
# Fit one random forest per driver on that driver's full history and predict
# the finishing position for the Bahrain 2024 feature row.
bahrain_pred, bahrain_models = {}, {}

for driver, df in dataset.items():
    # Target is the finishing position; every other column is a feature.
    df_target = df['positionOrder']
    df_features = df.drop(columns=['positionOrder'])
    X_train, y_train = df_features, df_target

    # The stored feature Series becomes a single-row 2-D array for predict().
    test_race = bahrain_test[driver].to_numpy().reshape(1, -1)

    rf = RandomForestRegressor(
        n_estimators=300,
        max_depth=1,
        max_features=0.3,
        random_state=0,
    )
    rf.fit(X_train, y_train)
    bahrain_models[driver] = rf

    rf_res = rf.predict(test_race)
    bahrain_pred[driver] = rf_res[0]

# One row per driver, ordered from best to worst predicted position.
predictions_row = pd.DataFrame(bahrain_pred, index=['Bahrain 2024'])
results_df = predictions_row.T.sort_values('Bahrain 2024')

In [None]:
# results_df is sorted by predicted position, so the ranking is simply 1..N.
# N is taken from the frame itself instead of being hard-coded to 20, so the
# assignment no longer fails with a length mismatch when the number of
# predicted drivers differs.
results_df['Ranking'] = range(1, len(results_df) + 1)
results_df