In [None]:
# Update setuptools and wheel
!pip install --upgrade setuptools wheel

# Handle specific version requirements for dash and tensorflow dependencies
!pip install dash-core-components==2.0.0 dash-html-components==2.0.0 dash-table==5.0.0
!pip install typing-extensions==4.5.0

# Uninstall and reinstall pydantic to fix the ImportError
!pip uninstall -y pydantic openai
!pip install pydantic==1.10.4 openai==0.27.0

# Install specific dependencies
!pip install python-dotenv


import pandas as pd
import os
from dotenv import load_dotenv
from typing import Any
import openai
import numpy as np

## Training

### Feedback Loop

In [31]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# --- Data Loading

def get_clean_df(race_stats_csv) -> pd.DataFrame:
    '''
    Read the race-stats CSV and discard the columns that are not used downstream.

    race_stats_csv is passed straight to pd.read_csv. The returned frame is the
    file's contents without the 'car_num', 'status' and 'winnings' columns.
    '''
    # 'status' might be meaningful; it is dropped for now because it is unclear how to use it.
    unused_columns = ['car_num', 'status', 'winnings']
    raw_stats = pd.read_csv(race_stats_csv)
    return raw_stats.drop(columns=unused_columns)


# --- Feature Engineering

def embed_categorical_columns(df: pd.DataFrame, categorical: list[str]) -> pd.DataFrame:
    '''
    Return a copy of df with OpenAI text embeddings appended for each categorical column.

    For every column name in `categorical`, each value is embedded and the
    embedding vector is spread over new columns named '<column>_emb_0',
    '<column>_emb_1', ... The original columns are kept and the input frame is
    not modified. A frame with no rows gets no embedding columns.

    Each distinct value is sent to the API only once per column; repeated
    values reuse the first response.

    The API key is read (after load_dotenv()) from the 'OPEN_API_KEY'
    environment variable, falling back to 'OPENAI_API_KEY'. If neither is set,
    openai.api_key is left as it was.
    '''

    def set_openai_secret() -> None:
        load_dotenv()
        api_key = os.environ.get('OPEN_API_KEY') or os.environ.get('OPENAI_API_KEY')
        # Never overwrite an already configured key with None.
        if api_key:
            openai.api_key = api_key

    def get_embedding(text: str, model="text-embedding-3-small") -> list[float]:
        return openai.Embedding.create(input=text, model=model).data[0].embedding

    def embed_column(values: pd.Series, prefix: str) -> pd.DataFrame:
        # Request each distinct value once; repeated values reuse the cached vector.
        cache = {}
        vectors = []
        for text in values:
            if text not in cache:
                cache[text] = get_embedding(text)
            vectors.append(cache[text])

        # Take the width from the first vector by position: the frame's index
        # is not guaranteed to contain the label 0, and may be empty.
        width = len(vectors[0]) if vectors else 0

        # Explode the embedding lists into their own columns, aligned on the input index.
        return pd.DataFrame(
            vectors,
            index=values.index,
            columns=[f"{prefix}_emb_{i}" for i in range(width)],
        )

    set_openai_secret()

    embedded_frames = [embed_column(df[column], column) for column in categorical]

    # A single concat appends every embedding block and yields a new frame.
    return pd.concat([df, *embedded_frames], axis=1)


# --- Model Training

def evaluate_model(model, X_test, y_test) -> dict:
    '''
    Score `model` on the given test split.

    Returns a dict holding the mean absolute error ('MAE') and the mean squared
    error ('MSE') of model.predict(X_test) measured against y_test.
    '''
    predictions = model.predict(X_test)
    metric_functions = {
        'MAE': mean_absolute_error,
        'MSE': mean_squared_error,
    }
    return {name: metric(y_test, predictions) for name, metric in metric_functions.items()}
    

def train_and_evaluate_model(model, X, y, test_size=0.2, random_state=3):
    '''
    Split the data, fit `model` on the training part and score it on the held-out part.

    test_size and random_state are forwarded to train_test_split; their defaults
    (0.2 and 3) reproduce the split this notebook has always used.

    Returns the metrics dict produced by evaluate_model for the test split.
    `model` is fitted in place.
    '''
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    model.fit(X_train, y_train)
    return evaluate_model(model, X_test, y_test)

### Using the Feedback Loop

In [5]:
categorical = ['driver', 'car_name', 'model']
numerical = ['start', 'qual_speed', 'laps', 'led', 'year']

embedded_df_path = 'embedded_df.csv'

# Embedding calls the OpenAI API, so the embedded (unscaled) frame is cached on disk.
if os.path.exists(embedded_df_path):
    df = pd.read_csv(embedded_df_path, index_col=0)

else:
    df = (
        get_clean_df('race_stats_2000-2024.csv')
        .pipe(embed_categorical_columns, categorical)
    )

    # Write the index too: the cache is read back with index_col=0, and without
    # an index column the first data column ('finish') would be consumed as the index.
    df.to_csv(embedded_df_path)

# Scale after either branch so a fresh run and a cached run produce the same frame.
scaler = StandardScaler()
df[numerical] = scaler.fit_transform(df[numerical])

df

Unnamed: 0,finish,start,qual_speed,driver,car_name,model,laps,led,year,driver_emb_0,...,model_emb_1526,model_emb_1527,model_emb_1528,model_emb_1529,model_emb_1530,model_emb_1531,model_emb_1532,model_emb_1533,model_emb_1534,model_emb_1535
0,1,-1.574952,-0.107742,Juan Pablo Montoya,Target,G Force/Oldsmobile,0.595827,9.274995,-1.661325,0.009248,...,0.000077,0.016734,-0.009105,-0.006143,-0.010990,0.002766,0.007808,0.023753,-0.010200,0.000434
1,2,-0.104873,-0.307888,Buddy Lazier,Delta Faucet/Coors Light/Tae-Bo/Hemelgarn,Dallara/Oldsmobile,0.595827,-0.345199,-1.661325,-0.042749,...,0.011757,0.021775,-0.014351,0.007914,0.010260,-0.007183,-0.007410,0.015428,-0.000102,-0.013089
2,3,-1.469947,-0.117507,Eliseo Salazar,Rio A.J. Foyt Racing,G Force/Oldsmobile,0.595827,-0.345199,-1.661325,-0.007628,...,0.000077,0.016734,-0.009105,-0.006143,-0.010990,0.002766,0.007808,0.023753,-0.010200,0.000434
3,4,-1.154930,-0.158506,Jeff Ward,Harrah's A.J. Foyt Racing,G Force/Oldsmobile,0.595827,-0.345199,-1.661325,-0.021473,...,0.000077,0.016734,-0.009105,-0.006143,-0.010990,0.002766,0.007808,0.023753,-0.010200,0.000434
4,5,-0.734907,-0.253316,Eddie Cheever Jr.,#51 Excite@Home Indy Race Car,Dallara/Nissan Infiniti,0.595827,-0.345199,-1.661325,0.033247,...,0.018570,0.023989,-0.021232,0.012616,0.012400,0.001821,-0.031544,-0.001290,-0.024219,0.010521
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
787,29,0.630167,0.473788,David Malukas,HMD Trucking,Dallara/Honda,-0.161416,-0.345199,1.661325,0.000495,...,-0.003441,0.024446,-0.037460,0.006889,0.006606,-0.005055,-0.012809,-0.001069,0.002429,-0.002764
788,30,0.210144,0.489578,Romain Grosjean,DHL,Dallara/Honda,-0.369658,-0.345199,1.661325,-0.022040,...,-0.003441,0.024446,-0.037460,0.006889,0.006606,-0.005055,-0.012809,-0.001069,0.002429,-0.002764
789,31,1.470212,0.320043,Sting Ray Robb,biohaven,Dallara/Honda,-1.486592,-0.345199,1.661325,0.040616,...,-0.003441,0.024446,-0.037460,0.006889,0.006606,-0.005055,-0.012809,-0.001069,0.002429,-0.002764
790,32,1.155195,0.429465,RC Enerson,Abel Motorsports,Dallara/Chevrolet,-1.770558,-0.345199,1.661325,0.016893,...,0.017924,0.010203,-0.022248,0.026176,0.001087,-0.014457,-0.026035,0.014188,0.002944,0.003069


In [6]:
# Target is the 'finish' column; features are every other column except the raw
# categorical text columns.
y = df['finish']
X = df.drop(columns=['finish', *categorical])

In [7]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Baseline regressor with a fixed seed.
model = RandomForestRegressor(random_state=3)

# The returned metrics dict is the cell's displayed output.
train_and_evaluate_model(model=model, X=X, y=y)

{'MAE': 3.1976100628930815, 'MSE': 17.843066666666665}

### Model Scoreboard

#### Baseline

`model = RandomForestRegressor(random_state=3)`

- MAE = 3.18
- MSE = 18.07

dataframe.shape => 792, 4617 (no dimensionality reduction)

## 2024 Predictions

In [19]:
old_df = pd.read_csv('race_stats_2000-2023.csv')

pos = 1

# Median of the 'led' column over the rows whose 'start' equals `pos`.
old_df.loc[old_df['start'] == pos, 'led'].median()

29.5

In [24]:
def return_imputed_2024_df(
    old_stats_csv='race_stats_2000-2023.csv',
    stats_2024_csv='race_stats_2024.csv',
) -> pd.DataFrame:
    '''
    Load the 2024 stats and impute (laps, led) with the median values from the
    historical stats for the corresponding start position.

    old_stats_csv and stats_2024_csv are passed to pd.read_csv; the defaults are
    the files this notebook has always used.

    A start position that does not appear in the historical data gets NaN for
    both imputed columns.
    '''
    old_df = pd.read_csv(old_stats_csv)

    # One median per start position, computed once instead of re-filtering the
    # historical frame for every 2024 row.
    medians_by_start = old_df.groupby('start')[['laps', 'led']].median()

    df_2024 = pd.read_csv(stats_2024_csv)

    return df_2024.assign(
        laps=lambda df_: df_['start'].map(medians_by_start['laps']),
        led=lambda df_: df_['start'].map(medians_by_start['led']),
    )

In [33]:
categorical = ['driver', 'car_name', 'model']
numerical = ['start', 'qual_speed', 'laps', 'led', 'year']

embedded_df_2024_path = 'embedded_df_2024.csv'

# Embedding calls the OpenAI API, so the embedded and scaled 2024 frame is cached on disk.
if os.path.exists(embedded_df_2024_path):
    df_2024 = pd.read_csv(embedded_df_2024_path, index_col=0)

else:
    df_2024 = (
        return_imputed_2024_df()
        .pipe(embed_categorical_columns, categorical)
    )

    # NOTE(review): this fits a new scaler on the 2024 rows alone and rebinds
    # `scaler`. The result is not on the same scale as the training features
    # (the 'year' column comes out as 0.0 for every row). Consider transforming
    # with the scaler fitted on the training data instead.
    scaler = StandardScaler()
    df_2024[numerical] = scaler.fit_transform(df_2024[numerical])

    # Write the index too: the cache is read back with index_col=0, and without
    # an index column the first data column ('start') would be consumed as the index.
    df_2024.to_csv(embedded_df_2024_path)

df_2024

Unnamed: 0,start,qual_speed,driver,car_name,model,year,laps,led,driver_emb_0,driver_emb_1,...,model_emb_1526,model_emb_1527,model_emb_1528,model_emb_1529,model_emb_1530,model_emb_1531,model_emb_1532,model_emb_1533,model_emb_1534,model_emb_1535
0,-1.680336,2.289126,Scott McLaughlin,Team Penske,Dallara/Chevy,0.0,0.699211,4.071234,0.001385,-0.025217,...,0.012967,0.010191,-0.019037,0.029792,0.007145,-0.01979,-0.026915,0.003929,0.00732,0.000131
1,-1.575315,1.983903,Will Power,Team Penske,Dallara/Chevy,0.0,0.699211,1.747087,0.009116,0.02268,...,0.012967,0.010191,-0.019037,0.029792,0.007145,-0.01979,-0.026915,0.003929,0.00732,0.000131
2,-1.470294,1.874103,Josef Newgarden,Team Penske,Dallara/Chevy,0.0,0.699211,3.021619,-0.019921,0.020281,...,0.012967,0.010191,-0.019037,0.029792,0.007145,-0.01979,-0.026915,0.003929,0.00732,0.000131
3,-1.365273,1.150836,Alexander Rossi,Arrow McLaren,Dallara/Chevy,0.0,0.699211,0.922389,-0.033744,-0.052679,...,0.012967,0.010191,-0.019037,0.029792,0.007145,-0.01979,-0.026915,0.003929,0.00732,0.000131
4,-1.260252,0.905046,Kyle Larson,Arrow McLaren,Dallara/Chevy,0.0,0.585546,-0.052254,-0.023612,-0.017968,...,0.012967,0.010191,-0.019037,0.029792,0.007145,-0.01979,-0.026915,0.003929,0.00732,0.000131
5,-1.155231,0.749916,Santino Ferrucci,A.J. Foyt Enterprises,Dallara/Chevy,0.0,0.699211,-0.202199,-0.01033,-0.042434,...,0.012967,0.010191,-0.019037,0.029792,0.007145,-0.01979,-0.026915,0.003929,0.00732,0.000131
6,-1.05021,0.667315,Rinus VeeKay,Ed Carpenter Racing,Dallara/Chevy,0.0,0.471881,-0.352144,-0.00104,0.026501,...,0.012974,0.010218,-0.019038,0.029766,0.007132,-0.01979,-0.026943,0.003919,0.007327,0.000141
7,-0.945189,0.641124,Pato O'Ward,Arrow McLaren,Dallara/Chevy,0.0,0.699211,-0.352144,-0.004104,-0.03454,...,0.012967,0.010191,-0.019037,0.029792,0.007145,-0.01979,-0.026915,0.003929,0.00732,0.000131
8,-0.840168,0.360077,Felix Rosenqvist,Meyer Shank Racing,Dallara/Honda,0.0,0.585546,-0.352144,-0.046678,0.003106,...,-0.003441,0.024446,-0.03746,0.006889,0.006606,-0.005055,-0.012809,-0.001069,0.002429,-0.002764
9,-0.735147,0.225094,Takuma Sato,Rahal Letterman Lanigan Racing,Dallara/Honda,0.0,0.471881,-0.352144,0.012568,-0.009748,...,-0.003441,0.024446,-0.03746,0.006889,0.006606,-0.005055,-0.012809,-0.001069,0.002429,-0.002764
