In [1]:
import pandas as pd
from catboost import CatBoostRegressor, Pool
from ray.tune.search.hyperopt import HyperOptSearch
from ray import tune
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

from src.model_data.driver import RunAllMethods
from src.ray_tuning.ray_tune import RayTune

In [2]:
# Disabled data-collection step. When uncommented it iterates a RunAllMethods
# driver built for seasons 2018-2023 (with end_date as the cut-off argument)
# and stacks every frame it yields into `all_seasons`.
# NOTE(review): the notebook later loads a CSV from disk, presumably the saved
# output of this step - confirm before deleting.
# seasons = list(range(2018, 2024))
# end_date = '2023-07-13'

# driver_class = RunAllMethods(seasons, end_date)
# all_seasons = pd.DataFrame()

# for season in driver_class:
#     all_seasons = pd.concat([all_seasons, season])

In [3]:
# Disabled: would write `all_seasons` (without its index) to the same CSV path
# that the Execution section reads back with pd.read_csv.
# all_seasons.to_csv('data/full_run_7_15.csv', index=False)

## Things to do/fix

1) Handle the case when a driver has multiple control messages in a session
2) Add docstrings:
    - driver
    - ray tune

## Execution

In [4]:
# Load the per-driver, per-session feature table that was saved to CSV.
season_df = pd.read_csv(filepath_or_buffer='data/full_run_7_15.csv')

In [5]:
# Preview the first five rows of the loaded table.
season_df.head(n=5)

Unnamed: 0,Driver,DriverNumber,Time_min,Time_max,LapTimeSeconds_min,LapTimeSeconds_max,LapTimeSeconds_mean,LapTimeSeconds_std,LapTimeSeconds_count,Sector1TimeSeconds_min,...,TeamId,CountryCode,SessionType,SeasonYear,EventName,Position,Points,Country,Location,RoundNumber
0,ALO,14,0 days 00:30:47.908000,0 days 01:39:12.964000,85.896,115.334,93.453545,9.393776,11,28.485,...,mclaren,,Practice 1,2018,Australian Grand Prix,5.0,10.0,Australia,Melbourne,1
1,ALO,14,0 days 00:12:20.531000,0 days 01:38:57.172000,85.2,148.222,98.449,14.8022,23,28.165,...,mclaren,,Practice 2,2018,Australian Grand Prix,5.0,10.0,Australia,Melbourne,1
2,ALO,14,0 days 00:23:58.601000,0 days 01:05:05.890000,94.298,122.32,105.187667,10.557891,9,30.927,...,mclaren,,Practice 3,2018,Australian Grand Prix,5.0,10.0,Australia,Melbourne,1
3,BOT,77,0 days 00:25:57.761000,0 days 01:39:48.848000,84.577,122.941,96.972792,13.029348,24,28.037,...,mercedes,,Practice 1,2018,Australian Grand Prix,8.0,4.0,Australia,Melbourne,1
4,BOT,77,0 days 00:12:42.794000,0 days 01:39:37.880000,84.159,130.373,96.584207,12.853744,29,27.725,...,mercedes,,Practice 2,2018,Australian Grand Prix,8.0,4.0,Australia,Melbourne,1


## Testing Catboost

In [6]:
# Build the modelling frame: drop the raw session timestamps and keep only
# rows that have a target value (Points). Both calls return new frames, so
# `season_df` itself is left untouched.
model_data = (
    season_df
    .drop(columns=['Time_min', 'Time_max'])
    .dropna(subset=['Points'])
)

# Columns stored as object dtype; object dtype is what later marks a column as
# categorical when the CatBoost pools are built.
# 'SeasonYear' used to be listed here as well, but its object cast was
# immediately overwritten by the integer cast, so it now appears only under
# integer_features to make the resulting dtype explicit.
categorical_features = ['Driver', 'DriverNumber', 'Category', 'TeamId',
                        'CountryCode', 'Country', 'Location', 'EventName',
                        'SessionType']

integer_features = ['IsPersonalBest_pr_lap', 'RoundNumber', 'SeasonYear']

# Every remaining column (this includes the Points target and Position) is
# treated as float. Built in column order so the list is deterministic.
float_features = [column for column in model_data.columns
                  if column not in categorical_features
                  and column not in integer_features]

# One cast for the whole frame instead of three column-by-column loops.
model_data = model_data.astype({
    **dict.fromkeys(categorical_features, object),
    **dict.fromkeys(integer_features, int),
    **dict.fromkeys(float_features, float),
})


In [7]:
# Replace missing values with a sentinel: the string '-1' in object columns
# and the number -1 in every other column.
object_columns = model_data.dtypes == object
model_data.loc[:, object_columns] = (
    model_data.loc[:, object_columns].fillna('-1')
)
model_data.loc[:, ~object_columns] = (
    model_data.loc[:, ~object_columns].fillna(-1)
)

# Store these two identifier columns as object dtype.
for identifier in ('DriverNumber', 'RoundNumber'):
    model_data[identifier] = model_data[identifier].astype(object)

In [8]:
# Hold out the 2023 rounds from round 5 onwards; everything else is training data.
train_data = model_data.query('(SeasonYear < 2023) or (RoundNumber < 5)')

test_data = model_data.query('(SeasonYear == 2023) and (RoundNumber >= 5)')

In [9]:
# Features and target for the training portion of the data.
X = train_data.drop(columns=['Points'])
y = train_data['Points']

# Stratify on Position so the train and validation parts get a similar mix of
# finishing positions. The fixed random_state makes the split, and therefore
# every tuning result that depends on it, repeatable across kernel restarts.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=X['Position'],
    random_state=42,
)

# Position was kept only for stratification; it is removed from every feature
# matrix before modelling.
X_train = X_train.drop(columns=['Position'])
X_val = X_val.drop(columns=['Position'])

X_test = test_data.drop(columns=['Points', 'Position'])
y_test = test_data['Points']

In [10]:
# Positional indices of the object-dtype columns in X_train; these are the
# columns passed to CatBoost as categorical features.
is_object_column = (X_train.dtypes == object).tolist()
categorical_columns_indices = [
    position for position, flag in enumerate(is_object_column) if flag
]

In [11]:
# Wrap the train and validation splits in CatBoost pools, flagging the
# categorical columns by position.
train_pool = Pool(
    data=X_train,
    label=y_train,
    cat_features=categorical_columns_indices,
)
val_pool = Pool(
    data=X_val,
    label=y_val,
    cat_features=categorical_columns_indices,
)

In [15]:
# Hyper-parameter ranges sampled by Ray Tune for the CatBoost model.
space = dict(
    learning_rate=tune.loguniform(1e-2, 1),
    depth=tune.randint(4, 12),
    l2_leaf_reg=tune.randint(1, 40),
    min_data_in_leaf=tune.randint(1, 40),
    iterations=tune.randint(10, 300),
)

# Everything the RayTune helper is given besides the search space: the two
# splits, the fit options, the categorical column positions and the metric name.
data = dict(
    X_train=X_train,
    X_val=X_val,
    y_train=y_train,
    y_val=y_val,
    fit_params=dict(early_stopping_rounds=2, verbose=False),
    cat_features=categorical_columns_indices,
    metric='RMSE',
)

# HyperOpt-driven search that minimises the reported "rmse" value.
hyperopt_search = HyperOptSearch(mode="min", metric="rmse")

In [16]:
# Resource settings for the tuning run.
tuning_config = {'num_gpus': 1}
cpu_per_trial, gpu_per_trial = 4, 0.2
max_concurrent_trials = num_samples = 5

# Run the search through the project's RayTune helper; arguments are passed
# positionally in the order the helper is called with elsewhere in this file.
tuning_class = RayTune(hyperopt_search, space, data)
results = tuning_class.tuner(
    tuning_config,
    cpu_per_trial,
    gpu_per_trial,
    max_concurrent_trials,
    num_samples,
)

2023-07-15 13:51:46,073	INFO worker.py:1627 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


0,1
Current time:,2023-07-15 13:51:49
Running for:,00:00:03.42
Memory:,7.9/46.8 GiB

Trial name,status,loc,depth,iterations,l2_leaf_reg,learning_rate,min_data_in_leaf,iter,total time (s),rmse
objective_3482ba54,TERMINATED,192.168.0.162:30032,7,144,6,0.0129058,8,1,0.631055,5.05677
objective_288a7063,TERMINATED,192.168.0.162:30032,4,173,7,0.0332567,32,1,0.182198,4.80266
objective_ab226055,TERMINATED,192.168.0.162:30032,8,107,31,0.376847,11,1,0.248373,4.61404
objective_2ffd479d,TERMINATED,192.168.0.162:30032,7,288,28,0.015768,15,1,1.63139,4.08377
objective_990a3a4c,TERMINATED,192.168.0.162:30411,5,168,7,0.0463223,1,1,0.407164,4.76005


Trial name,date,done,hostname,iterations_since_restore,node_ip,pid,rmse,time_since_restore,time_this_iter_s,time_total_s,timestamp,training_iteration,trial_id
objective_288a7063,2023-07-15_13-51-48,True,jbalda-B660-GAMING-X-AX-DDR4,1,192.168.0.162,30032,4.80266,0.182198,0.182198,0.182198,1689443508,1,288a7063
objective_3482ba54,2023-07-15_13-51-47,True,jbalda-B660-GAMING-X-AX-DDR4,1,192.168.0.162,30032,5.05677,0.631055,0.631055,0.631055,1689443507,1,3482ba54
objective_990a3a4c,2023-07-15_13-51-49,True,jbalda-B660-GAMING-X-AX-DDR4,1,192.168.0.162,30411,4.76005,0.407164,0.407164,0.407164,1689443509,1,990a3a4c
objective_ab226055,2023-07-15_13-51-48,True,jbalda-B660-GAMING-X-AX-DDR4,1,192.168.0.162,30032,4.61404,0.248373,0.248373,0.248373,1689443508,1,ab226055


2023-07-15 13:51:49,966	INFO tune.py:1111 -- Total run time: 3.42 seconds (3.42 seconds for the tuning loop).


In [None]:
# Select the trial with the lowest reported rmse and show its hyper-parameters.
best_result = results.get_best_result(mode='min', metric='rmse')
best_result.config

In [None]:
# Refit CatBoost on the GPU using the best configuration from the search,
# evaluating on the validation pool with early stopping.
params = best_result.config

model = CatBoostRegressor(task_type='GPU', eval_metric='RMSE', **params)
model.fit(
    X=train_pool,
    eval_set=val_pool,
    early_stopping_rounds=5,
    plot=True,
)

In [None]:
# Pair each feature name with its importance and list the most important first.
feature_importance = pd.DataFrame(
    {
        'Feature': model.feature_names_,
        'Importance': model.feature_importances_,
    }
)

feature_importance.sort_values(
    by='Importance', ascending=False, ignore_index=True
)

## Catboost for combination of predictions

In [None]:
# Stage two: turn the four per-session predictions of each driver/event into
# one feature row for a second model.
group_keys = ['Driver', 'EventName', 'SeasonYear']
sessions_by_group = train_data.groupby(group_keys)['SessionType']

# Row-aligned mask of "complete" driver/event groups: exactly four rows, one
# per distinct session type. The row-count check matters because the
# reshape(-1, 4) and [::4] steps below silently misalign drivers if any group
# has a different number of rows.
complete_events_mask = (
    (sessions_by_group.transform('nunique') == 4)
    & (sessions_by_group.transform('size') == 4)
)

# Descending sort keeps each group's four sessions consecutive and fixes the
# column order of the reshaped predictions (SessionType descending).
filtered_df = (
    train_data[complete_events_mask]
    .sort_values(['SeasonYear', 'RoundNumber', 'Driver', 'SessionType'],
                 ascending=False)
    .reset_index(drop=True)
)

filtered_df_X = filtered_df[X_train.columns].copy()

# One target per block of four consecutive rows.
points = filtered_df['Points'].iloc[::4]

y_train_pred = model.predict(filtered_df_X)

# One row per driver/event, one column per session.
reg_features = y_train_pred.reshape(-1, 4)

# Fixed random_state so the stage-two split is repeatable.
X_train_reg, X_val_reg, y_train_reg, y_val_reg = train_test_split(
    reg_features, points, random_state=42
)

In [None]:
# Search space for the second-stage CatBoost model; the ranges are the same as
# in the first tuning run earlier in this notebook.
space = {
    "learning_rate": tune.loguniform(1e-2, 1),
    "depth": tune.randint(4, 12),
    "l2_leaf_reg": tune.randint(1, 40),
    "min_data_in_leaf": tune.randint(1, 40),
    "iterations": tune.randint(10, 300)
}

hyperopt_search = HyperOptSearch(metric="rmse", mode="min")
# NOTE(review): `objective` is not defined or imported anywhere in the visible
# notebook, so this line raises NameError on a fresh kernel. The first tuning
# run goes through the RayTune helper instead - confirm which trainable is
# intended here.
trainable_with_cpu_gpu = tune.with_resources(objective, {"cpu" : 4, "gpu": 0.2})

# Positional bundle handed to the trainable as `data=`. The trailing None is
# presumably the categorical-feature list - TODO confirm against `objective`.
data = (X_train_reg, y_train_reg, X_val_reg, y_val_reg, None)

# Create Tuner object
tuner = tune.Tuner(
    tune.with_parameters(trainable_with_cpu_gpu, data=data),
    tune_config=tune.TuneConfig(
        search_alg=hyperopt_search,
        max_concurrent_trials=5, 
        num_samples=200
    ),
    param_space=space,
)

# Fit Tuner
# Overwrites the `results` name that held the first tuning run's output.
results = tuner.fit()

In [None]:
# Fit the second-stage CatBoost model with the best configuration from the
# search, evaluating on the stage-two validation split.
best_reg_config = results.get_best_result(mode='min', metric='rmse').config
cb = CatBoostRegressor(**best_reg_config)
cb.fit(X_train_reg, y_train_reg, eval_set=(X_val_reg, y_val_reg))

In [None]:
# Importance of each of the four columns of `reg_features` (the per-session
# first-stage predictions) in the second-stage model.
cb.feature_importances_

In [None]:
# Score the second-stage model on its validation split.
reg_pred = cb.predict(X_val_reg)

# RMSE is the square root of the mean squared error.
rmse = mean_squared_error(y_val_reg, reg_pred) ** 0.5
rmse

## Testing

In [None]:
# Hold-out check on round 5. Rows are ordered so each driver's sessions are
# consecutive, with SessionType DESCENDING: the second-stage model was trained
# on features whose four columns follow a descending SessionType sort, so the
# same order is required here for the columns to line up.
test_1 = (
    test_data
    .query('RoundNumber == 5')
    .sort_values(['DriverNumber', 'RoundNumber', 'SessionType'],
                 ascending=[True, True, False])
    .reset_index(drop=True)
)

# NOTE: this reuses the name `results`, replacing the tuning results object.
results = test_1[['Driver', 'EventName', 'SeasonYear', 'Position', 'Points']]

# First stage: one prediction per driver/session row, then one row per driver
# with four session columns (assumes every driver has exactly four rows).
predictions_1 = model.predict(test_1.drop(columns=['Position', 'Points']))
predictions_1_reg = predictions_1.reshape(-1, 4)

# Second stage: `lm` was never defined in this notebook; the fitted model that
# consumes the four-column features is `cb`.
predictions_1 = cb.predict(predictions_1_reg)

# One result row per driver (every fourth row), paired with its prediction.
results_select = results.iloc[::4].reset_index(drop=True)
results_select['PredictedPoints'] = predictions_1

results_select.sort_values('PredictedPoints',
                           ascending=False).reset_index(drop=True)

## Procedure

1) Create Season from **F1Season()** class and update dataframe from **update_season_dataframe()** method
2) For each session, pass lap data into **prepare_lap_data()** function
3) For each row in this lap data, pass into **weather_for_racer()** function
4) Join control message data to session data
5) Join driver data to session data