In [8]:
import pandas as pd
from catboost import CatBoostRegressor, Pool
import ray
from ray import tune
from ray.tune.search.hyperopt import HyperOptSearch
from ray.air import session
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from fastf1 import get_event, get_session

from src.model_data.driver import RunAllMethods

In [None]:
# Seasons 2018 through 2023 inclusive (range() excludes its stop value).
seasons = list(range(2018, 2024))
# Passed straight to RunAllMethods; presumably the data cutoff date -- confirm.
end_date = '2023-07-13'

driver_class = RunAllMethods(seasons, end_date)

# Materialise every frame the loader yields, then concatenate once.
# Calling pd.concat inside the loop re-copies all previously accumulated rows
# on each iteration and seeds the result with an empty frame.
season_frames = list(driver_class)

# pd.concat raises on an empty list, so fall back to an empty frame (which is
# what the incremental version produced when nothing was yielded).
all_seasons = pd.concat(season_frames) if season_frames else pd.DataFrame()

In [None]:
# Persist the combined frame as CSV, leaving the row index out of the file.
all_seasons.to_csv(path_or_buf='data/full_run_7_13.csv', index=False)

## Things to do/fix

1) Case when a driver has multiple control messages in a session
2) Driver class
3) Add relative driver and team points at time of pre-race (or wait till all data is pulled)
4) Add docstrings:
    - driver

## Execution

In [10]:
# Load the modelling dataset from disk.
# NOTE(review): this reads 'data/data.csv', not the 'data/full_run_7_13.csv'
# file written by the export cell -- confirm which file is intended.
season_df = pd.read_csv(filepath_or_buffer='data/data.csv')

## Testing Catboost

In [11]:
# Start from the loaded frame without the two Time_* columns and keep only
# rows that have a Points value to learn from.
model_data = (
    season_df
    .drop(columns=['Time_min', 'Time_max'])
    .copy()
    .dropna(subset=['Points'])
)

categorical_features = [
    'Driver', 'DriverNumber', 'Category', 'TeamId', 'CountryCode',
    'Country', 'Location', 'EventName', 'SessionType', 'SeasonYear',
]

integer_features = ['IsPersonalBest_pr_lap', 'LocalOrder', 'SeasonYear']

# Every column not named in either list above is treated as a float feature.
explicitly_typed = set(categorical_features) | set(integer_features)
float_features = list(set(model_data.columns) - explicitly_typed)

# Apply the casts group by group, in this order. 'SeasonYear' is listed as
# both categorical and integer, so the later int cast is the one that sticks.
for feature_group, target_dtype in (
    (categorical_features, object),
    (integer_features, int),
    (float_features, float),
):
    for feature in feature_group:
        model_data[feature] = model_data[feature].astype(target_dtype)


In [12]:
# Replace missing values with a sentinel: the string '-1' in object columns,
# the number -1 everywhere else.
object_columns = model_data.dtypes == object
model_data.loc[:, object_columns] = (
    model_data.loc[:, object_columns].fillna('-1')
)

# Recomputed after the assignment above, mirroring the original evaluation order.
non_object_columns = model_data.dtypes != object
model_data.loc[:, non_object_columns] = (
    model_data.loc[:, non_object_columns].fillna(-1)
)

# These two columns end up as object dtype, which is what the later
# dtype-based categorical column detection keys on.
object_cast_columns = ['DriverNumber', 'LocalOrder']
model_data[object_cast_columns] = model_data[object_cast_columns].astype(object)

In [13]:
# Training rows: every season before 2023, plus any row whose LocalOrder is
# below 5 (which adds the 2023 rows with LocalOrder < 5).
train_data = model_data.query('SeasonYear < 2023 or LocalOrder < 5')

# Held-out rows: 2023 rows with LocalOrder of 5 or higher.
test_data = model_data.query('LocalOrder >= 5 & SeasonYear == 2023')

In [14]:
# Fixed seed so the train/validation partition, and every metric computed from
# it, is identical after Restart Kernel -> Run All.
SPLIT_SEED = 42

X = train_data.drop(['Points'], axis=1)
y = train_data.Points

# 'Position' is still present in X at this point because it is used to
# stratify the split; it is removed from the feature matrices below.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=X['Position'],
    random_state=SPLIT_SEED,
)

# Held-out features/target, with the same columns removed as for training.
X_test = test_data.drop(['Points', 'Position'], axis=1)
y_test = test_data.Points

# Position served only as the stratification key; it is not a model input.
X_train = X_train.drop(['Position'], axis=1)
X_val = X_val.drop(['Position'], axis=1)

In [15]:
# Positional indices of the object-dtype columns in X_train; these are handed
# to CatBoost as its categorical features.
categorical_columns_indices = []
for column_position, column_dtype in enumerate(X_train.dtypes):
    if column_dtype == object:
        categorical_columns_indices.append(column_position)

In [16]:
# Wrap the training and validation splits as CatBoost Pools, marking the same
# categorical column positions in both.
train_pool = Pool(
    data=X_train,
    label=y_train,
    cat_features=categorical_columns_indices,
)
val_pool = Pool(
    data=X_val,
    label=y_val,
    cat_features=categorical_columns_indices,
)

In [17]:
# Shut down any Ray instance already attached to this kernel so the init call
# below starts from a clean state when the cell is re-run.
ray.shutdown()
# Start a local Ray instance and declare one GPU as available to it.
ray.init(num_gpus=1)

2023-07-13 23:05:57,864	INFO worker.py:1627 -- Started a local Ray instance. View the dashboard at http://127.0.0.1:8265


0,1
Python version:,3.11.3
Ray version:,2.5.1
Dashboard:,http://127.0.0.1:8265


In [43]:
# Search space sampled by Ray Tune. The best sampled configuration is later
# unpacked directly into CatBoostRegressor(**params).
space = {
    "learning_rate": tune.loguniform(1e-2, 1),
    "depth": tune.randint(4, 12),
    "l2_leaf_reg": tune.randint(1, 40),
    "min_data_in_leaf": tune.randint(1, 40),
    "iterations": tune.randint(10, 300)
}

# Payload handed to every trial through tune.with_parameters(..., data=data).
# NOTE(review): the features passed are X_train / X_val, but the second label
# entry is the held-out y_test under the key 'y_test'; y_val is not passed at
# all. Confirm against what `objective` actually reads.
data = {'X_train': X_train,
        'X_val': X_val,
        'y_train': y_train, 
        'y_test': y_test, 
        'fit_params': {
            'early_stopping_rounds': 2,
            'verbose': False
        },
        'cat_features': categorical_columns_indices,
        'metric': 'RMSE'
        }

# HyperOpt-driven search that minimises the metric reported as "rmse".
hyperopt_search = HyperOptSearch(metric="rmse", mode="min")
# Each trial requests 4 CPUs and a 0.2 share of a GPU.
# NOTE(review): `objective` is not defined in any cell visible in this
# notebook, so this line depends on leftover kernel state -- restore its
# definition above this cell.
trainable_with_cpu_gpu = tune.with_resources(objective, {"cpu" : 4, "gpu": 0.2})

In [44]:
# Attach the shared data payload to the resource-wrapped trainable.
bound_trainable = tune.with_parameters(trainable_with_cpu_gpu, data=data)

# Five sampled configurations, at most five running at the same time.
search_config = tune.TuneConfig(
    search_alg=hyperopt_search,
    max_concurrent_trials=5,
    num_samples=5,
)

tuner = tune.Tuner(
    bound_trainable,
    tune_config=search_config,
    param_space=space,
)

# Run every trial and keep what the tuner returns for later inspection.
results = tuner.fit()

0,1
Current time:,2023-07-13 23:22:24
Running for:,00:00:35.14
Memory:,19.4/46.8 GiB

Trial name,status,loc,depth,iterations,l2_leaf_reg,learning_rate,min_data_in_leaf,iter,total time (s),rmse
objective_727fbc80,TERMINATED,192.168.0.162:28103,11,146,14,0.230999,24,1,20.6355,2.20025
objective_bd43d9c1,TERMINATED,192.168.0.162:28177,10,268,26,0.0947052,23,1,28.348,3.203
objective_c9c61ab1,TERMINATED,192.168.0.162:28275,5,177,30,0.265457,2,1,4.7307,4.09766
objective_7e739e1b,TERMINATED,192.168.0.162:28275,5,207,1,0.133531,14,1,5.47983,3.33861
objective_51c2423c,TERMINATED,192.168.0.162:28103,10,166,36,0.470366,29,1,13.0838,2.32845


Trial name,date,done,hostname,iterations_since_restore,node_ip,pid,rmse,time_since_restore,time_this_iter_s,time_total_s,timestamp,training_iteration,trial_id
objective_51c2423c,2023-07-13_23-22-23,True,jbalda-B660-GAMING-X-AX-DDR4,1,192.168.0.162,28103,2.32845,13.0838,13.0838,13.0838,1689304943,1,51c2423c
objective_727fbc80,2023-07-13_23-22-10,True,jbalda-B660-GAMING-X-AX-DDR4,1,192.168.0.162,28103,2.20025,20.6355,20.6355,20.6355,1689304930,1,727fbc80
objective_7e739e1b,2023-07-13_23-22-15,True,jbalda-B660-GAMING-X-AX-DDR4,1,192.168.0.162,28275,3.33861,5.47983,5.47983,5.47983,1689304935,1,7e739e1b
objective_c9c61ab1,2023-07-13_23-22-09,True,jbalda-B660-GAMING-X-AX-DDR4,1,192.168.0.162,28275,4.09766,4.7307,4.7307,4.7307,1689304929,1,c9c61ab1


2023-07-13 23:22:24,598	INFO tune.py:1111 -- Total run time: 35.14 seconds (35.13 seconds for the tuning loop).


In [None]:
# Select the trial with the lowest reported rmse and show its hyperparameters.
best_result = results.get_best_result(mode='min', metric='rmse')
best_result.config

In [None]:
params = best_result.config

# Refit on the training pool with the tuned hyperparameters, monitoring the
# validation pool and stopping early after 5 rounds without improvement.
model = CatBoostRegressor(eval_metric='RMSE', task_type='GPU', **params)
model.fit(
    train_pool,
    eval_set=val_pool,
    early_stopping_rounds=5,
    plot=True,
)

In [None]:
# Pair each feature name with the importance reported by the fitted model.
feature_importance = pd.DataFrame(
    {
        'Feature': model.feature_names_,
        'Importance': model.feature_importances_,
    }
)

# Display the features from most to least important.
(
    feature_importance
    .sort_values(by='Importance', ascending=False)
    .reset_index(drop=True)
)

## Catboost for combination of predictions

In [None]:
# The second-stage model consumes the first-stage predictions of one
# driver/event side by side, one column per session row.
SESSIONS_PER_EVENT = 4
# Fixed seed so the second-stage train/validation split is reproducible.
STACK_SPLIT_SEED = 42

event_keys = ['Driver', 'EventName', 'SeasonYear']
sessions_by_event = train_data.groupby(event_keys)['SessionType']

# A driver/event group is usable only if it has all four distinct session
# types AND exactly four rows. The every-4th-row slicing and reshape(-1, 4)
# below silently misalign as soon as a group carries a duplicated session row,
# so the row count is checked in addition to the distinct count.
complete_events_mask = (
    (sessions_by_event.nunique() == SESSIONS_PER_EVENT)
    & (sessions_by_event.size() == SESSIONS_PER_EVENT)
).rename('IsCompleteEvent')

# Attach the per-group flag to every row, keep the complete groups, and order
# the rows so that each group's rows are consecutive with SessionType in
# descending order.
# NOTE(review): contiguity relies on LocalOrder identifying the event within a
# season (the sort uses LocalOrder, the grouping uses EventName) -- confirm.
filtered_df = train_data.merge(
    complete_events_mask, left_on=event_keys, right_index=True
)
filtered_df = (
    filtered_df[filtered_df['IsCompleteEvent']]
    .drop(columns='IsCompleteEvent')
    .sort_values(['SeasonYear', 'LocalOrder', 'Driver', 'SessionType'],
                 ascending=False)
    .reset_index(drop=True)
)

filtered_df_X = filtered_df[X_train.columns].copy()
# One target per group, taken from the group's first row.
# Assumes Points is identical across a group's rows -- TODO confirm.
points = filtered_df.Points[::SESSIONS_PER_EVENT]

# NOTE(review): these are predictions on rows drawn from train_data, which
# includes rows the first-stage model was fitted on, so the second-stage
# inputs are partly in-sample.
y_train_pred = model.predict(filtered_df_X)

# One row per driver/event, one column per session prediction.
reg_features = y_train_pred.reshape(-1, SESSIONS_PER_EVENT)

X_train_reg, X_val_reg, y_train_reg, y_val_reg = train_test_split(
    reg_features, points, random_state=STACK_SPLIT_SEED
)

In [None]:
# Same hyperparameter search space as the first tuning run.
space = {
    "learning_rate": tune.loguniform(1e-2, 1),
    "depth": tune.randint(4, 12),
    "l2_leaf_reg": tune.randint(1, 40),
    "min_data_in_leaf": tune.randint(1, 40),
    "iterations": tune.randint(10, 300),
}

# Second-stage splits handed to every trial.
# NOTE(review): here `data` is a 5-tuple, whereas the first tuning run passed a
# dict -- confirm that `objective` accepts this form.
data = (X_train_reg, y_train_reg, X_val_reg, y_val_reg, None)

hyperopt_search = HyperOptSearch(metric="rmse", mode="min")
trainable_with_cpu_gpu = tune.with_resources(objective, {"cpu" : 4, "gpu": 0.2})

# 200 sampled configurations, at most five running at the same time.
reg_search_config = tune.TuneConfig(
    search_alg=hyperopt_search,
    max_concurrent_trials=5,
    num_samples=200,
)

tuner = tune.Tuner(
    tune.with_parameters(trainable_with_cpu_gpu, data=data),
    tune_config=reg_search_config,
    param_space=space,
)

# Run the search; this rebinds `results` to the second-stage tuning output.
results = tuner.fit()

In [None]:
# Fit the second-stage regressor with the best configuration from the search
# above, evaluating on the second-stage validation split.
best_reg_config = results.get_best_result(metric='rmse', mode='min').config
cb = CatBoostRegressor(**best_reg_config)
cb.fit(
    X_train_reg,
    y_train_reg,
    eval_set=(X_val_reg, y_val_reg),
)

In [None]:
# Display the second-stage model's importance score for each of its four
# input columns (the reshaped first-stage predictions).
cb.feature_importances_

In [None]:
# Score the second-stage model on its validation split.
reg_pred = cb.predict(X_val_reg)

# RMSE is the square root of the mean squared error.
validation_mse = mean_squared_error(y_val_reg, reg_pred)
rmse = validation_mse ** 0.5
rmse

## Testing

In [None]:
# The second-stage model expects four first-stage predictions per driver.
SESSIONS_PER_EVENT = 4

# Score only the held-out rows with LocalOrder == 5.
event_rows = test_data.query('LocalOrder == 5')

# Keep drivers that have exactly one row for each of four distinct session
# types, mirroring the completeness filter used to build the second-stage
# training set. Without it the every-4th-row slicing and reshape(-1, 4) below
# fail or pair predictions with the wrong driver.
sessions_by_driver = event_rows.groupby('DriverNumber')['SessionType']
has_all_sessions = (
    (sessions_by_driver.transform('size') == SESSIONS_PER_EVENT)
    & (sessions_by_driver.transform('nunique') == SESSIONS_PER_EVENT)
)

# Sort descending so that, within each driver, SessionType appears in the same
# (descending) order that was used to build the second-stage training
# features. An ascending sort would feed the four columns in reverse order.
test_1 = (
    event_rows[has_all_sessions]
    .sort_values(['DriverNumber', 'LocalOrder', 'SessionType'],
                 ascending=False)
    .reset_index(drop=True)
)

results = test_1[['Driver', 'EventName', 'SeasonYear', 'Position', 'Points']]

# First stage: one prediction per session row.
predictions_1 = model.predict(test_1.drop(['Position', 'Points'], axis=1))
predictions_1_reg = predictions_1.reshape(-1, SESSIONS_PER_EVENT)

# Second stage: combine each driver's four session predictions. `cb` is the
# combiner fitted in this notebook; the previously referenced `lm` is not
# defined in any cell.
predictions_1 = cb.predict(predictions_1_reg)

# One summary row per driver (the first row of each group of four).
results_select = results.iloc[::SESSIONS_PER_EVENT].reset_index(drop=True)
results_select['PredictedPoints'] = predictions_1

results_select.sort_values('PredictedPoints',
                           ascending=False).reset_index(drop=True)

## Procedure

1) Create Season from **F1Season()** class and update dataframe from **update_season_dataframe()** method
2) For each session, pass lap data into **prepare_lap_data()** function
3) For each row in this lap data, pass into **weather_for_racer()** function
4) Join control message data to session data
5) Join driver data to session data