In [165]:
import pandas as pd
from catboost import CatBoostRegressor, Pool
import ray
from ray import tune
from ray.tune.search.hyperopt import HyperOptSearch
from ray.air import session
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

## Execution

In [2]:
season_df = pd.read_csv('data.csv')

## Testing Catboost

In [3]:
model_data = season_df.drop(['Time_min', 'Time_max'], axis=1).copy()
model_data = model_data.dropna(subset=['Points'])

categorical_features = ['Driver', 'DriverNumber', 'Category', 'TeamId',        'CountryCode', 'Country', 'Location', 'EventName', 'SessionType', 'SeasonYear']

integer_features = ['IsPersonalBest_pr_lap', 'LocalOrder', 'SeasonYear']

float_features = list(set(model_data.columns) - 
                      set(categorical_features) - 
                      set(integer_features))

for feature in categorical_features:
    model_data[feature] = model_data[feature].astype(object)

for feature in integer_features:
    model_data[feature] = model_data[feature].astype(int)

for feature in float_features:
    model_data[feature] = model_data[feature].astype(float)


In [4]:
model_data.loc[:,model_data.dtypes == object] = (
    model_data.loc[:,model_data.dtypes == object].fillna('-1')
)
model_data.loc[:,model_data.dtypes != object] = (
    model_data.loc[:,model_data.dtypes != object].fillna(-1)
)
model_data[['DriverNumber', 'LocalOrder']] = (
    model_data[['DriverNumber', 'LocalOrder']].astype(object)
)

In [287]:
train_data = model_data.query('SeasonYear < 2023 or LocalOrder < 5')

test_data = model_data.query('LocalOrder >= 5 & SeasonYear == 2023')

In [33]:
X = train_data.drop(['Points'], axis=1)
y = train_data.Points


X_train, X_val, y_train, y_val = train_test_split(X,
                                                  y, 
                                                  test_size=0.2,
                                                  stratify=X['Position'])

X_test = test_data.drop(['Points', 'Position'], axis=1)
y_test = test_data.Points

X_train = X_train.drop(['Position'], axis=1)
X_val = X_val.drop(['Position'], axis=1)

In [36]:
categorical_columns_indices = [i for i, dtype in enumerate(X_train.dtypes) 
                               if dtype == object]

In [37]:
train_pool = Pool(X_train, y_train, cat_features=categorical_columns_indices)
val_pool = Pool(X_val, y_val, cat_features=categorical_columns_indices)

In [38]:
ray.shutdown()
ray.init(num_gpus=1)

2023-07-11 21:58:55,060	INFO worker.py:1627 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


0,1
Python version:,3.11.3
Ray version:,2.5.1
Dashboard:,http://127.0.0.1:8265


In [39]:
def objective(config, data):

    # Set the CatBoostRegressor parameters based on the config
    model = CatBoostRegressor(
        learning_rate=config["learning_rate"],
        depth=int(config["depth"]),
        l2_leaf_reg=int(config["l2_leaf_reg"]),
        min_data_in_leaf=int(config["min_data_in_leaf"]),
        iterations=int(config["iterations"]),
        thread_count=1,
        gpu_ram_part=0.3,
        task_type='GPU',
        eval_metric='RMSE'
    )

    # Train the model
    model.fit(data[0], 
              data[1], 
              eval_set=(data[2], data[3]), 
              verbose=False, 
              early_stopping_rounds=2, 
              cat_features=categorical_columns_indices)

    # Evaluate the model
    score = model.get_evals_result()

    rmse = score['validation']['RMSE'][-1]
    session.report({"rmse": rmse, "done": True})

space = {
    "learning_rate": tune.loguniform(1e-3, 1),
    "depth": tune.randint(4, 12),
    "l2_leaf_reg": tune.randint(1, 10),
    "min_data_in_leaf": tune.randint(1, 40),
    "iterations": tune.randint(10, 300)
}

hyperopt_search = HyperOptSearch(metric="rmse", mode="min")
trainable_with_cpu_gpu = tune.with_resources(objective, {"cpu" : 1, "gpu": 0.3})

In [40]:
data = (X_train, y_train, X_val, y_val)

# Create Tuner object
tuner = tune.Tuner(
    tune.with_parameters(trainable_with_cpu_gpu, data=data),
    tune_config=tune.TuneConfig(
        search_alg=hyperopt_search,
        max_concurrent_trials=3, 
        num_samples=75
    ),
    param_space=space,
)

# Fit Tuner
results = tuner.fit()

0,1
Current time:,2023-07-11 21:59:53
Running for:,00:00:55.79
Memory:,9.2/46.8 GiB

Trial name,status,loc,depth,iterations,l2_leaf_reg,learning_rate,min_data_in_leaf,iter,total time (s),rmse
objective_5619144a,RUNNING,192.168.0.162:96467,10,298,3,0.0514659,13,,,
objective_7240c81b,RUNNING,192.168.0.162:96148,8,221,4,0.0642675,11,,,
objective_e7c5a4b0,RUNNING,192.168.0.162:96226,9,268,8,0.00509245,38,,,
objective_075144ca,TERMINATED,192.168.0.162:96148,4,14,7,0.0218885,18,1.0,0.612859,6.4517
objective_12f29717,TERMINATED,192.168.0.162:96148,7,53,6,0.148668,37,1.0,1.60263,4.48925
objective_130d9651,TERMINATED,192.168.0.162:96467,11,296,3,0.370185,8,1.0,1.8927,4.08144
objective_177f15ce,TERMINATED,192.168.0.162:96148,9,116,7,0.122433,3,1.0,2.23563,4.2656
objective_1db7a1c6,TERMINATED,192.168.0.162:96467,9,48,7,0.0866278,14,1.0,1.78566,4.44818
objective_254e6968,TERMINATED,192.168.0.162:96226,6,201,8,0.00193316,19,1.0,9.33052,6.20479
objective_3a86f7c2,TERMINATED,192.168.0.162:96148,11,95,4,0.186389,3,1.0,3.45378,4.21095




Trial name,date,done,hostname,iterations_since_restore,node_ip,pid,rmse,time_since_restore,time_this_iter_s,time_total_s,timestamp,training_iteration,trial_id
objective_06fe1fbd,2023-07-11_22-00-12,True,jbalda-B660-GAMING-X-AX-DDR4,1,192.168.0.162,96467,3.86651,15.6254,15.6254,15.6254,1689127212,1,06fe1fbd
objective_075144ca,2023-07-11_21-59-04,True,jbalda-B660-GAMING-X-AX-DDR4,1,192.168.0.162,96148,6.4517,0.612859,0.612859,0.612859,1689127144,1,075144ca
objective_12f29717,2023-07-11_21-59-27,True,jbalda-B660-GAMING-X-AX-DDR4,1,192.168.0.162,96148,4.48925,1.60263,1.60263,1.60263,1689127167,1,12f29717
objective_130d9651,2023-07-11_21-59-42,True,jbalda-B660-GAMING-X-AX-DDR4,1,192.168.0.162,96467,4.08144,1.8927,1.8927,1.8927,1689127182,1,130d9651
objective_177f15ce,2023-07-11_21-59-36,True,jbalda-B660-GAMING-X-AX-DDR4,1,192.168.0.162,96148,4.2656,2.23563,2.23563,2.23563,1689127176,1,177f15ce
objective_187fa699,2023-07-11_22-00-00,True,jbalda-B660-GAMING-X-AX-DDR4,1,192.168.0.162,96226,4.20249,3.38056,3.38056,3.38056,1689127200,1,187fa699
objective_1ad4a8f7,2023-07-11_22-01-22,True,jbalda-B660-GAMING-X-AX-DDR4,1,192.168.0.162,96148,4.22472,4.44961,4.44961,4.44961,1689127282,1,1ad4a8f7
objective_1db7a1c6,2023-07-11_21-59-34,True,jbalda-B660-GAMING-X-AX-DDR4,1,192.168.0.162,96467,4.44818,1.78566,1.78566,1.78566,1689127174,1,1db7a1c6
objective_2064f404,2023-07-11_22-01-17,True,jbalda-B660-GAMING-X-AX-DDR4,1,192.168.0.162,96148,3.98804,9.46839,9.46839,9.46839,1689127277,1,2064f404
objective_230d8204,2023-07-11_22-00-29,True,jbalda-B660-GAMING-X-AX-DDR4,1,192.168.0.162,96467,3.8844,17.3419,17.3419,17.3419,1689127229,1,230d8204


2023-07-11 22:02:44,620	INFO tune.py:1111 -- Total run time: 227.40 seconds (227.38 seconds for the tuning loop).


In [41]:
best_result = results.get_best_result(metric='rmse', mode='min')
best_result.config

{'learning_rate': 0.07048777741238169,
 'depth': 9,
 'l2_leaf_reg': 2,
 'min_data_in_leaf': 33,
 'iterations': 263}

In [45]:
params = best_result.config

model = CatBoostRegressor(**params, eval_metric='RMSE', task_type='GPU')
model.fit(train_pool, eval_set=val_pool, plot=True, early_stopping_rounds=5)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 7.0575065	test: 7.0465280	best: 7.0465280 (0)	total: 20.5ms	remaining: 5.37s
1:	learn: 6.8091114	test: 6.7923730	best: 6.7923730 (1)	total: 39.3ms	remaining: 5.13s
2:	learn: 6.5879220	test: 6.5552525	best: 6.5552525 (2)	total: 55.5ms	remaining: 4.81s
3:	learn: 6.3896806	test: 6.3483949	best: 6.3483949 (3)	total: 71.9ms	remaining: 4.65s
4:	learn: 6.2118009	test: 6.1682581	best: 6.1682581 (4)	total: 92ms	remaining: 4.75s
5:	learn: 6.0511191	test: 6.0046062	best: 6.0046062 (5)	total: 112ms	remaining: 4.8s
6:	learn: 5.9021124	test: 5.8477211	best: 5.8477211 (6)	total: 129ms	remaining: 4.72s
7:	learn: 5.7597780	test: 5.7078546	best: 5.7078546 (7)	total: 149ms	remaining: 4.74s
8:	learn: 5.6368609	test: 5.5914046	best: 5.5914046 (8)	total: 167ms	remaining: 4.72s
9:	learn: 5.5200643	test: 5.4797122	best: 5.4797122 (9)	total: 186ms	remaining: 4.71s
10:	learn: 5.4069546	test: 5.3671309	best: 5.3671309 (10)	total: 207ms	remaining: 4.75s
11:	learn: 5.3070428	test: 5.2664675	best: 5.26646

<catboost.core.CatBoostRegressor at 0x7f1fad7aa190>

In [46]:
feature_importance = pd.DataFrame({'Feature': model.feature_names_, 
                                   'Importance': model.feature_importances_})

feature_importance.sort_values('Importance',
                               ascending=False).reset_index(drop=True)

## Linear Regression for combination of predictions

In [330]:
complete_events_mask = (
    train_data.groupby(['Driver', 'EventName', 'SeasonYear'])
    ['SessionType'].nunique() == 4
)

filtered_df = train_data.merge(complete_events_mask,
                      left_on=['Driver', 'EventName', 'SeasonYear'], right_index=True)
filtered_df = filtered_df.query('SessionType_y == True')
filtered_df = (
    filtered_df.sort_values(['SeasonYear',
                             'LocalOrder',
                             'Driver',
                             'SessionType_x'], ascending=False)
                             .reset_index(drop=True)
                             .drop('SessionType_y', axis=1)
)
filtered_df = filtered_df.rename(columns={'SessionType_x': 'SessionType'})

filtered_df_X = filtered_df[X_train.columns].copy()
points = filtered_df.Points[::4]

y_train_pred = model.predict(filtered_df_X)

reg_features = y_train_pred.reshape(-1, 4)

X_train_reg, X_test_reg, y_train_reg, y_test_reg = (
    train_test_split(reg_features, points)
)

In [347]:
lm = LinearRegression(n_jobs=-1, fit_intercept=False)
lm.fit(X_train_reg, y_train_reg)

In [348]:
lm.coef_

array([ 1.03496521,  0.03460515,  0.0699136 , -0.09880934])

In [349]:
reg_pred = lm.predict(X_test_reg)

rmse = mean_squared_error(y_test_reg, reg_pred)**(1/2)
rmse

3.0382368367601953

## Testing

In [350]:
test_1 = (
    test_data.sort_values(['DriverNumber',
                        'LocalOrder',
                        'SessionType'])
                        .reset_index(drop=True)
                        .query('LocalOrder == 5')
)

results = test_1[['Driver', 'EventName', 'SeasonYear', 'Position', 'Points']]

predictions_1 = model.predict(test_1.drop(['Position', 'Points'], axis=1))
predictions_1_reg = predictions_1.reshape(-1, 4)

predictions_1 = lm.predict(predictions_1_reg)

results_select = results.iloc[::4].reset_index(drop=True)
results_select['PredictedPoints'] = predictions_1

results_select.sort_values('PredictedPoints',
                           ascending=False).reset_index(drop=True)

Unnamed: 0,Driver,EventName,SeasonYear,Position,Points,PredictedPoints
0,PER,Monaco Grand Prix,2023,16.0,0.0,18.33214
1,VER,Monaco Grand Prix,2023,1.0,25.0,16.900543
2,SAI,Monaco Grand Prix,2023,8.0,4.0,12.772655
3,LEC,Monaco Grand Prix,2023,6.0,8.0,11.023501
4,RUS,Monaco Grand Prix,2023,5.0,10.0,10.427455
5,HAM,Monaco Grand Prix,2023,4.0,13.0,7.940976
6,ALO,Monaco Grand Prix,2023,2.0,18.0,5.9543
7,NOR,Monaco Grand Prix,2023,9.0,2.0,5.75386
8,OCO,Monaco Grand Prix,2023,3.0,15.0,3.529634
9,GAS,Monaco Grand Prix,2023,7.0,6.0,3.5047


## Procedure

1) Create Season from **F1Season()** class and update dataframe from **update_season_dataframe()** method
2) For each session, pass lap data into **prepare_lap_data()** function
3) For each row in this lap data, pass into **weather_for_racer()** function
4) Join control message data to session data
5) Join driver data to session data

## Things to do/fix

1) Case when a driver has multiple control messsages in a session
3) Add relative driver and team points at time of pre-race (or wait till all data is pulled)