In [4]:
# Update setuptools and wheel
!pip install --upgrade setuptools wheel

# Handle specific version requirements for dash and tensorflow dependencies
!pip install dash-core-components==2.0.0 dash-html-components==2.0.0 dash-table==5.0.0
!pip install typing-extensions==4.5.0

# Uninstall and reinstall pydantic to fix the ImportError
!pip uninstall -y pydantic openai
!pip install pydantic==1.10.4 openai==0.27.0

# Install specific dependencies
!pip install python-dotenv


import pandas as pd
import os
from dotenv import load_dotenv
from typing import Any
import openai
import numpy as np

Found existing installation: pydantic 1.10.4
Uninstalling pydantic-1.10.4:
  Successfully uninstalled pydantic-1.10.4
Found existing installation: openai 0.27.0
Uninstalling openai-0.27.0:
  Successfully uninstalled openai-0.27.0
Collecting pydantic==1.10.4
  Using cached pydantic-1.10.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (142 kB)
Collecting openai==0.27.0
  Using cached openai-0.27.0-py3-none-any.whl.metadata (13 kB)
Using cached pydantic-1.10.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
Using cached openai-0.27.0-py3-none-any.whl (70 kB)
Installing collected packages: pydantic, openai
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
amazon-sagemaker-jupyter-scheduler 3.0.7 requires jupyter-scheduler==2.5, but you have jupyter-scheduler 2.5.1 which is incompatible.[0m[31m
[0mSuccessfully installe

## Training

### Feedback Loop

In [3]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# --- Data Loading

def get_clean_df(race_stats_csv) -> pd.DataFrame:
    df = (
        pd.read_csv(race_stats_csv)
        .drop(columns=['car_num', 'status', 'winnings']) # maybe come back to status. i don't know what to do with it, but it might be meaningful
    )

    return df


# --- Feature Engineering

def embed_categorical_columns(df: pd.DataFrame, categorical: list[str]) -> pd.DataFrame:
    '''
    A module of functionality for creating embeddings wrapped inside one function call.
    '''

    def set_openai_secret() -> None:
        load_dotenv()
        api_key = os.environ.get('OPEN_API_KEY')
        openai.api_key = api_key


    def get_embedding(text: str, model="text-embedding-3-small") -> list[float]:
        return openai.Embedding.create(input=text, model=model).data[0].embedding


    def add_pretrained_embeddings(data, columns):
        for column in columns:
            data[f'{column}_emb'] = data[column].apply(lambda x: generate_embeddings(x))
        return data


    def expand_embeddings(data, columns):
        for column in columns:
            emb_df = pd.DataFrame(data[f'{column}_emb'].tolist(), index=data.index, columns=[f"{column}_emb_{i}" for i in range(len(data[f'{column}_emb'][0]))])
            data = pd.concat([data, emb_df], axis=1)
            data.drop(columns=[f'{column}_emb'], inplace=True)
        return data

    df = df.copy()

    set_openai_secret()

    for column in categorical:
        df[f'{column}_emb'] = df[column].apply(lambda text: get_embedding(text))

        # explode the embedding list's into their own columns and add them to the df

        emb_df = pd.DataFrame(
                    df[f'{column}_emb'].tolist(), 
                    index=df.index, 
                    columns=[f"{column}_emb_{i}" for i in range(len(df[f'{column}_emb'][0]))])

        df = (
            pd.concat([df, emb_df], axis=1)
            .drop(columns=[f'{column}_emb'])
        )

    return df


# --- Model Training

def evaluate_model(model, X_test, y_test) -> dict:
    y_pred = model.predict(X_test)
    return {
        'MAE': mean_absolute_error(y_test, y_pred),
        'MSE': mean_squared_error(y_test, y_pred),
    }
    

def train_and_evaluate_model(model, X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)
    model.fit(X_train, y_train)
    return evaluate_model(model, X_test, y_test)

NameError: name 'pd' is not defined

### Using the Feedback Loop

In [7]:
categorical = ['driver', 'car_name', 'model']
numerical = ['start', 'qual_speed', 'laps', 'led', 'year']

embedded_df_path = 'embedded_df.csv'

if os.path.exists(embedded_df_path):
    df = pd.read_csv(embedded_df_path, index_col=0)
    scaler = StandardScaler()
    df[numerical] = scaler.fit_transform(df[numerical])

else:
    df = (
        get_clean_df('race_stats_2000-2024.csv')
        .pipe(embed_categorical_columns, ['driver', 'car_name', 'model'])
    )

    df.to_csv(embedded_df_path, index=False)

df.head(5)

Unnamed: 0,finish,start,qual_speed,driver,car_name,model,laps,led,year,driver_emb_0,...,model_emb_1526,model_emb_1527,model_emb_1528,model_emb_1529,model_emb_1530,model_emb_1531,model_emb_1532,model_emb_1533,model_emb_1534,model_emb_1535
0,1,-1.574952,-0.107742,Juan Pablo Montoya,Target,G Force/Oldsmobile,0.595827,9.274995,-1.661325,0.009248,...,7.7e-05,0.016734,-0.009105,-0.006143,-0.01099,0.002766,0.007808,0.023753,-0.0102,0.000434
1,2,-0.104873,-0.307888,Buddy Lazier,Delta Faucet/Coors Light/Tae-Bo/Hemelgarn,Dallara/Oldsmobile,0.595827,-0.345199,-1.661325,-0.042749,...,0.011757,0.021775,-0.014351,0.007914,0.01026,-0.007183,-0.00741,0.015428,-0.000102,-0.013089
2,3,-1.469947,-0.117507,Eliseo Salazar,Rio A.J. Foyt Racing,G Force/Oldsmobile,0.595827,-0.345199,-1.661325,-0.007628,...,7.7e-05,0.016734,-0.009105,-0.006143,-0.01099,0.002766,0.007808,0.023753,-0.0102,0.000434
3,4,-1.15493,-0.158506,Jeff Ward,Harrah's A.J. Foyt Racing,G Force/Oldsmobile,0.595827,-0.345199,-1.661325,-0.021473,...,7.7e-05,0.016734,-0.009105,-0.006143,-0.01099,0.002766,0.007808,0.023753,-0.0102,0.000434
4,5,-0.734907,-0.253316,Eddie Cheever Jr.,#51 Excite@Home Indy Race Car,Dallara/Nissan Infiniti,0.595827,-0.345199,-1.661325,0.033247,...,0.01857,0.023989,-0.021232,0.012616,0.0124,0.001821,-0.031544,-0.00129,-0.024219,0.010521


In [8]:
y = df.finish
X = df.drop(columns=['finish'] + categorical)

In [9]:
model = RandomForestRegressor(random_state=3)

train_and_evaluate_model(model, X, y)

{'MAE': 3.1976100628930815, 'MSE': 17.843066666666665}

In [10]:
from pickle import dump

with open("RandomForestRegressor.pkl", "wb") as f:
    dump(model, f, protocol=5)

### Model Scoreboard

#### Baseline

`model = RandomForestRegressor(random_state=3)`

- MAE = 3.18
- MSE = 18.07

dataframe.shape => 792, 4617 (no dimensionality reduction)

## 2024 Predictions

In [3]:
def return_imputed_2024_df() -> pd.DataFrame:
    '''
    impute (laps, led) with the median values from the old_df for the corresponding start position
    '''

    def _median_laps_for_pos(old_df: pd.DataFrame, pos: int) -> float:
        return (
            old_df
            .query(f'start == {pos}')
            .laps
            .median()
        )

    def _median_led_for_pos(old_df: pd.DataFrame, pos: int) -> float:
        return (
            old_df
            .query(f'start == {pos}')
            .led
            .median()
        )

    old_df = pd.read_csv('race_stats_2000-2023.csv')
    df_2024 = (
        pd.read_csv('race_stats_2024.csv')
        .assign(laps=lambda df_: df_.start.apply(lambda pos: _median_laps_for_pos(old_df, pos)))
        .assign(led=lambda df_: df_.start.apply(lambda pos: _median_led_for_pos(old_df, pos)))
    )

    return df_2024

In [5]:
categorical = ['driver', 'car_name', 'model']
numerical = ['start', 'qual_speed', 'laps', 'led', 'year']

embedded_df_2024_path = 'embedded_df_2024.csv'

if os.path.exists(embedded_df_2024_path):
    df_2024 = pd.read_csv(embedded_df_2024_path)

else:
    df_2024 = (
        return_imputed_2024_df()
        .pipe(embed_categorical_columns, ['driver', 'car_name', 'model'])
    )

    scaler = StandardScaler()
    df_2024[numerical] = scaler.fit_transform(df_2024[numerical])

    df_2024.to_csv(embedded_df_2024_path, index=False)

df_2024.head(5)

Unnamed: 0,start,qual_speed,driver,car_name,model,year,laps,led,driver_emb_0,driver_emb_1,...,model_emb_1526,model_emb_1527,model_emb_1528,model_emb_1529,model_emb_1530,model_emb_1531,model_emb_1532,model_emb_1533,model_emb_1534,model_emb_1535
0,-1.680336,2.289126,Scott McLaughlin,Team Penske,Dallara/Chevy,0.0,0.699211,4.071234,0.001405,-0.025236,...,0.012953,0.010197,-0.01905,0.029764,0.007118,-0.019789,-0.026954,0.003939,0.00732,0.000111
1,-1.575315,1.983903,Will Power,Team Penske,Dallara/Chevy,0.0,0.699211,1.747087,0.009101,0.022619,...,0.012968,0.010212,-0.019038,0.029768,0.007133,-0.019765,-0.026944,0.003909,0.007314,0.00012
2,-1.470294,1.874103,Josef Newgarden,Team Penske,Dallara/Chevy,0.0,0.699211,3.021619,-0.019298,0.020488,...,0.012968,0.010212,-0.019038,0.029768,0.007133,-0.019765,-0.026944,0.003909,0.007314,0.00012
3,-1.365273,1.150836,Alexander Rossi,Arrow McLaren,Dallara/Chevy,0.0,0.699211,0.922389,-0.033744,-0.052679,...,0.012968,0.010212,-0.019038,0.029768,0.007133,-0.019765,-0.026944,0.003909,0.007314,0.00012
4,-1.260252,0.905046,Kyle Larson,Arrow McLaren,Dallara/Chevy,0.0,0.585546,-0.052254,-0.023643,-0.017999,...,0.012968,0.010212,-0.019038,0.029768,0.007133,-0.019765,-0.026944,0.003909,0.007314,0.00012


In [6]:
from pickle import load

with open("RandomForestRegressor.pkl", "rb") as f:
    model = load(f)

In [7]:
X_2024 = df_2024.drop(columns=categorical)

reorder_columns = ['start', 'qual_speed', 'laps', 'led', 'year']

# Columns to keep unchanged (all columns except the ones to reorder)
remaining_columns = [col for col in X_2024.columns if col not in reorder_columns]
new_column_order = reorder_columns + remaining_columns

X_2024 = X_2024[new_column_order]

X_2024.head()

Unnamed: 0,start,qual_speed,laps,led,year,driver_emb_0,driver_emb_1,driver_emb_2,driver_emb_3,driver_emb_4,...,model_emb_1526,model_emb_1527,model_emb_1528,model_emb_1529,model_emb_1530,model_emb_1531,model_emb_1532,model_emb_1533,model_emb_1534,model_emb_1535
0,-1.680336,2.289126,0.699211,4.071234,0.0,0.001405,-0.025236,-0.033293,-0.032422,-0.009255,...,0.012953,0.010197,-0.01905,0.029764,0.007118,-0.019789,-0.026954,0.003939,0.00732,0.000111
1,-1.575315,1.983903,0.699211,1.747087,0.0,0.009101,0.022619,-0.012523,0.013416,-0.012398,...,0.012968,0.010212,-0.019038,0.029768,0.007133,-0.019765,-0.026944,0.003909,0.007314,0.00012
2,-1.470294,1.874103,0.699211,3.021619,0.0,-0.019298,0.020488,-0.023918,0.004821,0.005615,...,0.012968,0.010212,-0.019038,0.029768,0.007133,-0.019765,-0.026944,0.003909,0.007314,0.00012
3,-1.365273,1.150836,0.699211,0.922389,0.0,-0.033744,-0.052679,-0.025343,-0.04061,0.013779,...,0.012968,0.010212,-0.019038,0.029768,0.007133,-0.019765,-0.026944,0.003909,0.007314,0.00012
4,-1.260252,0.905046,0.585546,-0.052254,0.0,-0.023643,-0.017999,-0.060526,-0.018074,0.004057,...,0.012968,0.010212,-0.019038,0.029768,0.007133,-0.019765,-0.026944,0.003909,0.007314,0.00012


In [27]:
preds_2024 = model.predict(X_2024)
preds_2024

predicted_df = (
    pd.DataFrame(
        data = {
            "driver": df_2024.driver,
            "starting_place": pd.read_csv('race_stats_2024.csv').start,
            "predicted_finish_value": preds_2024,
        }
    )
    .sort_values(by='predicted_finish_value')
    .assign(predicted_finish_place=list(range(33)))
    .assign(predicted_gap=lambda df_: df_.starting_place - df_.predicted_finish_place)
    .reset_index(drop=True)
)

predicted_df


Unnamed: 0,driver,starting_place,predicted_finish_value,predicted_finish_place,predicted_gap
0,Josef Newgarden,3,5.24,0,3
1,Scott McLaughlin,1,6.71,1,0
2,Alexander Rossi,4,6.78,2,2
3,Santino Ferrucci,6,7.86,3,3
4,Will Power,2,8.45,4,-2
5,Alex Palou,14,8.98,5,9
6,Kyffin Simpson,18,9.06,6,12
7,Pato O'Ward,8,11.74,7,1
8,Ed Carpenter,17,19.18,8,9
9,Christian Rasmussen,24,19.29,9,15
