In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model
import joblib
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn import tree

In [2]:
# Load the dataset from a pickle file in the current working directory.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
data = pd.read_pickle('train_data.pkl')

In [3]:
# Inspect which columns the loaded frame provides.
data.columns

Index(['DriverNumber', 'Team', 'Driver', 'LapTime_1', 'Sector1Time_1',
       'Sector2Time_1', 'Sector3Time_1', 'TyreLife_1', 'SpeedI1_1',
       'SpeedI2_1', 'SpeedFL_1', 'LapTimePercent_1', 'Sector1TimePercent_1',
       'Sector2TimePercent_1', 'Sector3TimePercent_1', 'TyreLifePercent_1',
       'SpeedI1Percent_1', 'SpeedI2Percent_1', 'SpeedFLPercent_1',
       'LapTime_1_1', 'Sector1Time_1_1', 'Sector2Time_1_1', 'Sector3Time_1_1',
       'TyreLife_1_1', 'SpeedI1_1_1', 'SpeedI2_1_1', 'SpeedFL_1_1',
       'LapTimePercent_1_1', 'Sector1TimePercent_1_1',
       'Sector2TimePercent_1_1', 'Sector3TimePercent_1_1',
       'TyreLifePercent_1_1', 'SpeedI1Percent_1_1', 'SpeedI2Percent_1_1',
       'SpeedFLPercent_1_1', 'LapTime_2', 'Sector1Time_2', 'Sector2Time_2',
       'Sector3Time_2', 'TyreLife_2', 'SpeedI1_2', 'SpeedI2_2', 'SpeedFL_2',
       'LapTimePercent_2', 'Sector1TimePercent_2', 'Sector2TimePercent_2',
       'Sector3TimePercent_2', 'TyreLifePercent_2', 'SpeedI1Percent_2',
      

In [4]:
# Chronological hold-out: events dated on or before the cutoff go to the
# training set, later events go to the test set. The cutoff is named once so
# both comparisons cannot drift apart.
# NOTE(review): assuming EventDate is a datetime column, rows with a missing
# date satisfy neither comparison and end up in neither subset.
split_date = np.datetime64('2022-06-30')
# .copy() makes each subset an independent frame, so the in-place assignments
# performed on them later never act on (or warn about) a selection of `data`.
train_data = data[data['EventDate'] <= split_date].copy()
test_data = data[data['EventDate'] > split_date].copy()

In [5]:
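# Disabled NaN-handling alternatives (plain dropna / global-mean fill).
# Missing values are handled by fill_nan_with_group_mean(...).dropna() further down.
# train_data = train_data.dropna()
# train_data = train_data.fillna(train_data.mean())
# test_data = test_data.fillna(test_data.mean())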

In [6]:
# Inspect the columns of the training subset.
train_data.columns

Index(['DriverNumber', 'Team', 'Driver', 'LapTime_1', 'Sector1Time_1',
       'Sector2Time_1', 'Sector3Time_1', 'TyreLife_1', 'SpeedI1_1',
       'SpeedI2_1', 'SpeedFL_1', 'LapTimePercent_1', 'Sector1TimePercent_1',
       'Sector2TimePercent_1', 'Sector3TimePercent_1', 'TyreLifePercent_1',
       'SpeedI1Percent_1', 'SpeedI2Percent_1', 'SpeedFLPercent_1',
       'LapTime_1_1', 'Sector1Time_1_1', 'Sector2Time_1_1', 'Sector3Time_1_1',
       'TyreLife_1_1', 'SpeedI1_1_1', 'SpeedI2_1_1', 'SpeedFL_1_1',
       'LapTimePercent_1_1', 'Sector1TimePercent_1_1',
       'Sector2TimePercent_1_1', 'Sector3TimePercent_1_1',
       'TyreLifePercent_1_1', 'SpeedI1Percent_1_1', 'SpeedI2Percent_1_1',
       'SpeedFLPercent_1_1', 'LapTime_2', 'Sector1Time_2', 'Sector2Time_2',
       'Sector3Time_2', 'TyreLife_2', 'SpeedI1_2', 'SpeedI2_2', 'SpeedFL_2',
       'LapTimePercent_2', 'Sector1TimePercent_2', 'Sector2TimePercent_2',
       'Sector3TimePercent_2', 'TyreLifePercent_2', 'SpeedI1Percent_2',
      

In [7]:
# Names of the columns used as model inputs (the feature matrix is built by
# selecting exactly these columns, in this order).
X_cols = [
    'LapTimePercent_1',
    'LapTimePercent_2',
    'PrevLapTimePercent1',
    'PrevLapTimePercent2',
    'PrevLapTimePercent3',
    'LapTimePrevPercentTeamAverage',
]

In [8]:
def fill_nan_with_group_mean(df, group_col):
    """Fill NaNs in numeric columns with the mean of each row's group.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    group_col : label or list of labels
        Column(s) handed to ``DataFrame.groupby`` to define the groups.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which each NaN in a numeric column is replaced by
        the mean of that column over the rows of the same group. A value
        stays NaN when its whole group is NaN for that column or when the
        row's group key is itself missing. Non-numeric columns and existing
        (non-NaN) values are returned unchanged.
    """
    numeric_columns = df.select_dtypes(include=np.number).columns
    df_filled = df.copy()
    # The grouping does not depend on the column being filled: build it once.
    grouped = df.groupby(group_col)

    for col in numeric_columns:
        # Per-row group mean, aligned to df's index.
        group_means = grouped[col].transform('mean')
        # Fill only the gaps. Assigning the transform output directly would
        # blank out existing values on rows whose group key is NaN, because
        # groupby leaves those rows out of every group.
        df_filled[col] = df[col].fillna(group_means)

    return df_filled

# Impute numeric NaNs with the mean of the same event ('OfficialEventName'),
# then drop every row that still contains a NaN in any column.
# Each frame is imputed from its own rows only (train from train, test from test).
train_data = fill_nan_with_group_mean(train_data, 'OfficialEventName').dropna()
test_data = fill_nan_with_group_mean(test_data, 'OfficialEventName').dropna()

In [9]:
# Cap every feature and the target at the 95th percentile of the TRAINING
# data. The same training-derived cap is applied to the test frame, so no
# statistic is computed from the test set.
for col in X_cols + ['LapTimePercent']:
    # Only the upper bound is needed; the lower (0) quantile was never used.
    upper_cap = train_data[col].quantile(0.95)
    # clip(upper=...) replaces values above the cap with the cap itself.
    train_data[col] = train_data[col].clip(upper=upper_cap)
    test_data[col] = test_data[col].clip(upper=upper_cap)

In [10]:
# Feature matrix and target vector used for fitting.
train_X, train_y = train_data[X_cols], train_data['LapTimePercent']

# The same columns taken from the test frame, used for evaluation.
test_X, test_y = test_data[X_cols], test_data['LapTimePercent']

In [11]:
# Summary statistics of the training features (after imputation and capping).
train_X.describe()

Unnamed: 0,LapTimePercent_1,LapTimePercent_2,PrevLapTimePercent1,PrevLapTimePercent2,PrevLapTimePercent3,LapTimePrevPercentTeamAverage
count,1555.0,1555.0,1555.0,1555.0,1555.0,1555.0
mean,0.018956,0.016683,0.01917,0.01909,0.019013,0.097428
std,0.011751,0.010252,0.012041,0.011931,0.011812,0.053115
min,0.0,0.0,0.0,0.0,0.0,0.004088
25%,0.009952,0.008894,0.010011,0.010063,0.010076,0.056417
50%,0.018491,0.016146,0.018439,0.018359,0.018338,0.098589
75%,0.026475,0.023545,0.027626,0.02743,0.02724,0.135253
max,0.04319,0.037561,0.042465,0.041973,0.041579,0.196529


In [12]:
# Summary statistics of the training target.
train_y.describe()

count    1555.000000
mean        0.019745
std         0.012633
min         0.000000
25%         0.010110
50%         0.018789
75%         0.028237
max         0.045483
Name: LapTimePercent, dtype: float64

In [13]:
# Save the training feature matrix to 'train_X.pkl' in the working directory.
train_X.to_pickle('train_X.pkl')

In [34]:
# Lasso regression whose regularisation strength is picked by grid search,
# using GridSearchCV's default cross-validation and scoring settings.
base_model = linear_model.Lasso()
params = {
    'alpha': [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1],
    'max_iter': [100000],
}

# Alternative configurations, left disabled:
# model = linear_model.LinearRegression()
# base_model = tree.DecisionTreeRegressor()
# params = {'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10]}

model = GridSearchCV(estimator=base_model, param_grid=params)
model.fit(train_X, train_y)

GridSearchCV(estimator=Lasso(),
             param_grid={'alpha': [1e-08, 1e-07, 1e-06, 1e-05, 0.0001, 0.001,
                                   0.01, 0.1, 1],
                         'max_iter': [100000]})

In [35]:
# Hyper-parameter combination selected by the grid search.
model.best_params_

{'alpha': 1e-07, 'max_iter': 100000}

In [36]:
# Pair each feature name with its coefficient from the selected estimator.
# Explicit column names avoid ending up with two columns both labelled 0,
# and coef_ is already 1-D here, so no transpose is needed.
pd.DataFrame({
    'Feature': train_X.columns,
    'Coefficient': model.best_estimator_.coef_,
})


Unnamed: 0,0,0.1
0,LapTimePercent_1,0.132349
1,LapTimePercent_2,0.352418
2,PrevLapTimePercent1,0.129102
3,PrevLapTimePercent2,0.047613
4,PrevLapTimePercent3,0.097694
5,LapTimePrevPercentTeamAverage,0.057109


In [17]:
# NOTE(review): repeats the test_X / test_y selection already made together
# with train_X / train_y; it yields the same values unless test_data or
# X_cols changed in between.
test_X = test_data[X_cols]
test_y = test_data['LapTimePercent']

In [18]:
# Predict the target for the test features with the fitted grid-search model.
pred = model.predict(test_X)

In [19]:
# Collect test-set predictions next to the actual target and the columns
# that identify each row (position, race, driver number).
results = pd.DataFrame(
    {
        'Pred': pred,
        'Actual': test_y,
        'Actual_Position': test_data['Position'],
        'Race': test_data['OfficialEventName'],
        'Driver': test_data['DriverNumber'],
    }
)

In [20]:
# Show the test-set results table.
results

Unnamed: 0,Pred,Actual,Actual_Position,Race,Driver
1675,0.009902,0.000000,2.0,FORMULA 1 LENOVO BRITISH GRAND PRIX 2022,1
1676,0.015253,0.014042,4.0,FORMULA 1 LENOVO BRITISH GRAND PRIX 2022,11
1677,0.028759,0.025734,11.0,FORMULA 1 LENOVO BRITISH GRAND PRIX 2022,10
1678,0.026665,0.027883,13.0,FORMULA 1 LENOVO BRITISH GRAND PRIX 2022,22
1679,0.017670,0.024907,7.0,FORMULA 1 LENOVO BRITISH GRAND PRIX 2022,14
...,...,...,...,...,...
2069,0.021963,0.005548,7.0,FORMULA 1 QATAR AIRWAYS HUNGARIAN GRAND PRIX 2023,77
2070,0.007134,0.001110,3.0,FORMULA 1 QATAR AIRWAYS HUNGARIAN GRAND PRIX 2023,4
2071,0.012820,0.003864,4.0,FORMULA 1 QATAR AIRWAYS HUNGARIAN GRAND PRIX 2023,81
2072,0.016655,0.000000,1.0,FORMULA 1 QATAR AIRWAYS HUNGARIAN GRAND PRIX 2023,44


In [21]:
# Turn predicted values into an ordering per race: within each 'Race' group
# the smallest 'Pred' gets rank 1, and ties are broken by row order
# (method='first'), so every rank within a race is distinct.
results['Pred_Position'] = results.groupby('Race')['Pred'].rank(method = 'first')

In [22]:
# Compare the distributions of predicted and actual positions on the test set.
results[['Pred_Position', 'Actual_Position']].describe()

Unnamed: 0,Pred_Position,Actual_Position
count,399.0,399.0
mean,10.47619,10.493734
std,5.761055,5.779389
min,1.0,1.0
25%,5.5,5.5
50%,10.0,10.0
75%,15.0,15.5
max,20.0,20.0


In [23]:
# Row-by-row view of predicted versus actual position on the test set.
results[['Pred_Position', 'Actual_Position']]

Unnamed: 0,Pred_Position,Actual_Position
1675,3.0,2.0
1676,5.0,4.0
1677,16.0,11.0
1678,15.0,13.0
1679,7.0,7.0
...,...,...
2069,14.0,7.0
2070,1.0,3.0
2071,5.0,4.0
2072,10.0,1.0


In [24]:
# Test-set mean squared error between actual and predicted positions.
# Arguments follow scikit-learn's (y_true, y_pred) order; the value is the
# same either way because squared error is symmetric.
mean_squared_error(results['Actual_Position'], results['Pred_Position'])

18.463659147869674

In [25]:
# Test-set mean absolute error between actual and predicted positions.
# Arguments follow scikit-learn's (y_true, y_pred) order; the value is the
# same either way because absolute error is symmetric.
mean_absolute_error(results['Actual_Position'], results['Pred_Position'])

3.1604010025062657

In [26]:
# In-sample predictions on the training features, for comparison with the test-set results.
pred2 = model.predict(train_X)

In [27]:
# Same layout as the test-set results table, built from the training rows.
results2 = pd.DataFrame(
    {
        'Pred': pred2,
        'Actual': train_y,
        'Actual_Position': train_data['Position'],
        'Race': train_data['OfficialEventName'],
        'Driver': train_data['DriverNumber'],
    }
)

In [28]:
# Rank the in-sample predictions within each race: smallest 'Pred' gets
# rank 1, ties are broken by row order (method='first').
results2['Pred_Position'] = results2.groupby('Race')['Pred'].rank(method = 'first')

In [29]:
# Compare the distributions of predicted and actual positions on the training set.
results2[['Pred_Position', 'Actual_Position']].describe()

Unnamed: 0,Pred_Position,Actual_Position
count,1555.0,1555.0
mean,10.473312,10.47074
std,5.757746,5.751443
min,1.0,1.0
25%,5.0,5.5
50%,10.0,10.0
75%,15.0,15.0
max,20.0,20.0


In [30]:
# Row-by-row view of predicted versus actual position on the training set.
results2[['Pred_Position', 'Actual_Position']]

Unnamed: 0,Pred_Position,Actual_Position
100,16.0,10.0
101,15.0,16.0
102,10.0,9.0
103,12.0,6.0
104,11.0,7.0
...,...,...
1670,14.0,11.0
1671,9.0,9.0
1672,8.0,14.0
1673,7.0,4.0


In [31]:
# Training-set mean squared error between actual and predicted positions.
# Arguments follow scikit-learn's (y_true, y_pred) order; the value is the
# same either way because squared error is symmetric.
mean_squared_error(results2['Actual_Position'], results2['Pred_Position'])

13.24662379421222

In [32]:
# Training-set mean absolute error between actual and predicted positions.
# Arguments follow scikit-learn's (y_true, y_pred) order; the value is the
# same either way because absolute error is symmetric.
mean_absolute_error(results2['Actual_Position'], results2['Pred_Position'])

2.5491961414791

In [33]:
# Persist the fitted GridSearchCV object (the whole search, not only
# best_estimator_) to 'model.pkl' in the working directory.
joblib.dump(model, 'model.pkl')

['model.pkl']