In [1]:
import warnings                as warn
from   pprint                  import pprint

import bs4
import numpy    as np
import pandas   as pd
import requests
import sklearn.svm             as     svm
from   bs4                     import BeautifulSoup
from   dateutil.relativedelta  import *
from   selenium                import webdriver
from   selenium.webdriver.common.by import By
from   sklearn.ensemble        import RandomForestClassifier
from   sklearn.ensemble        import RandomForestRegressor
from   sklearn.linear_model    import LinearRegression
from   sklearn.linear_model    import LogisticRegression
from   sklearn.metrics         import precision_score
from   sklearn.neural_network  import MLPClassifier
from   sklearn.neural_network  import MLPRegressor
from   sklearn.preprocessing   import StandardScaler
# Silence warnings for the whole notebook.
# NOTE(review): the second filter ignores every Warning subclass, which makes the
# DeprecationWarning filter above it redundant and also hides pandas
# FutureWarning/SettingWithCopyWarning messages - confirm that this is intended.
warn.filterwarnings("ignore", category=DeprecationWarning)
warn.filterwarnings("ignore", category=Warning)

In [2]:
# Races DF
# One row per race of the 2015-2022 seasons, read from the Ergast season endpoint.
# NOTE(review): confirm that the Ergast endpoint is still being served.
# Column name -> (key path inside one race record, converter or None for "as is").
race_fields = {'season'    : (('season',)                      , int),
               'round'     : (('round',)                       , int),
               'circuit_id': (('Circuit', 'circuitId')         , None),
               'lat'       : (('Circuit', 'Location', 'lat')    , float),
               'long'      : (('Circuit', 'Location', 'long')   , float),
               'country'   : (('Circuit', 'Location', 'country'), None),
               'date'      : (('date',)                        , None),
               'url'       : (('url',)                         , None)}
races = {column: [] for column in race_fields}
for year in range(2015, 2023):
    url = 'https://ergast.com/api/f1/{}.json'
    r   = requests.get(url.format(year), timeout=30)
    # fail loudly on an HTTP error instead of trying to parse an error page
    r.raise_for_status()
    season_data = r.json()
    for item in season_data['MRData']['RaceTable']['Races']:
        for column, (path, convert) in race_fields.items():
            # a missing or malformed field becomes None so that all columns keep the same length
            try:
                value = item
                for key in path:
                    value = value[key]
                if convert is not None:
                    value = convert(value)
            except (KeyError, TypeError, ValueError):
                value = None
            races[column].append(value)
races = pd.DataFrame(races)

In [3]:
races.head(50)

Unnamed: 0,season,round,circuit_id,lat,long,country,date,url
0,2015,1,albert_park,-37.8497,144.968,Australia,2015-03-15,http://en.wikipedia.org/wiki/2015_Australian_G...
1,2015,2,sepang,2.76083,101.738,Malaysia,2015-03-29,http://en.wikipedia.org/wiki/2015_Malaysian_Gr...
2,2015,3,shanghai,31.3389,121.22,China,2015-04-12,http://en.wikipedia.org/wiki/2015_Chinese_Gran...
3,2015,4,bahrain,26.0325,50.5106,Bahrain,2015-04-19,http://en.wikipedia.org/wiki/2015_Bahrain_Gran...
4,2015,5,catalunya,41.57,2.26111,Spain,2015-05-10,http://en.wikipedia.org/wiki/2015_Spanish_Gran...
5,2015,6,monaco,43.7347,7.42056,Monaco,2015-05-24,http://en.wikipedia.org/wiki/2015_Monaco_Grand...
6,2015,7,villeneuve,45.5,-73.5228,Canada,2015-06-07,http://en.wikipedia.org/wiki/2015_Canadian_Gra...
7,2015,8,red_bull_ring,47.2197,14.7647,Austria,2015-06-21,http://en.wikipedia.org/wiki/2015_Austrian_Gra...
8,2015,9,silverstone,52.0786,-1.01694,UK,2015-07-05,http://en.wikipedia.org/wiki/2015_British_Gran...
9,2015,10,hungaroring,47.5789,19.2486,Hungary,2015-07-26,http://en.wikipedia.org/wiki/2015_Hungarian_Gr...


In [4]:
# Rounds DF
# rounds is a list of [season, [round numbers of that season]] pairs, one per season in races.
rounds = []
for year in races.season.unique():
    season_rounds = races.loc[races.season == year, 'round'].tolist()
    rounds.append([year, season_rounds])
    # season, its round numbers, and how many there are
    print(year, season_rounds, len(season_rounds))

2015 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19] 19
2016 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21] 21
2017 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20] 20
2018 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21] 21
2019 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21] 21
2020 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17] 17
2021 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22] 22
2022 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22] 22


In [5]:
# Results DF
# One row per driver per race, read from the Ergast results endpoint.
# Column name -> (record the value lives in, key path, converter or None for "as is").
result_fields = {'season'       : ('race'  , ('season',)                     , int),
                 'round'        : ('race'  , ('round',)                      , int),
                 'circuit_id'   : ('race'  , ('Circuit', 'circuitId')        , None),
                 'driver'       : ('result', ('Driver', 'driverId')          , None),
                 'date_of_birth': ('result', ('Driver', 'dateOfBirth')       , None),
                 'nationality'  : ('result', ('Driver', 'nationality')       , None),
                 'constructor'  : ('result', ('Constructor', 'constructorId'), None),
                 'grid'         : ('result', ('grid',)                       , int),
                 'time'         : ('result', ('Time', 'millis')              , int),
                 'status'       : ('result', ('status',)                     , None),
                 # float, not int: int("12.5") raises, which used to turn fractional
                 # points into None
                 'points'       : ('result', ('points',)                     , float),
                 'podium'       : ('result', ('position',)                   , int)}
results = {column: [] for column in result_fields}
for season, season_rounds in rounds:
    for round_number in season_rounds:
        url = 'https://ergast.com/api/f1/{}/{}/results.json'
        r = requests.get(url.format(season, round_number), timeout=30)
        r.raise_for_status()
        race_list = r.json()['MRData']['RaceTable']['Races']
        if not race_list:
            # no result published for this round: skip it instead of raising IndexError
            continue
        race = race_list[0]
        for item in race['Results']:
            sources = {'race': race, 'result': item}
            for column, (source, path, convert) in result_fields.items():
                # a missing or malformed field (e.g. no 'Time' entry) becomes None
                try:
                    value = sources[source]
                    for key in path:
                        value = value[key]
                    if convert is not None:
                        value = convert(value)
                except (KeyError, TypeError, ValueError):
                    value = None
                results[column].append(value)
results = pd.DataFrame(results)

In [6]:
results

Unnamed: 0,season,round,circuit_id,driver,date_of_birth,nationality,constructor,grid,time,status,points,podium
0,2015,1,albert_park,hamilton,1985-01-07,British,mercedes,1,5514067.0,Finished,25.0,1
1,2015,1,albert_park,rosberg,1985-06-27,German,mercedes,2,5515427.0,Finished,18.0,2
2,2015,1,albert_park,vettel,1987-07-03,German,ferrari,4,5548590.0,Finished,15.0,3
3,2015,1,albert_park,massa,1981-04-25,Brazilian,williams,3,5552263.0,Finished,12.0,4
4,2015,1,albert_park,nasr,1992-08-21,Brazilian,sauber,10,5609216.0,Finished,10.0,5
...,...,...,...,...,...,...,...,...,...,...,...,...
3295,2022,22,yas_marina,mick_schumacher,1999-03-22,German,haas,12,,+1 Lap,0.0,16
3296,2022,22,yas_marina,kevin_magnussen,1992-10-05,Danish,haas,16,,+1 Lap,0.0,17
3297,2022,22,yas_marina,hamilton,1985-01-07,British,mercedes,5,,Hydraulics,0.0,18
3298,2022,22,yas_marina,latifi,1995-06-29,Canadian,williams,20,,Collision damage,0.0,19


In [None]:
# Driver Standings DF
# One row per driver per round: the championship standings after that round.
# Column name -> (record the value lives in, key path, converter or None for "as is").
driver_fields = {'season'              : ('standings', ('season',)            , int),
                 'round'               : ('standings', ('round',)             , int),
                 'driver'              : ('entry'    , ('Driver', 'driverId') , None),
                 # float, not int: int("12.5") raises, so fractional totals used to
                 # become None and were later filled with 0
                 'driver_points'       : ('entry'    , ('points',)            , float),
                 'driver_wins'         : ('entry'    , ('wins',)              , int),
                 'driver_standings_pos': ('entry'    , ('position',)          , int)}
driver_standings = {column: [] for column in driver_fields}
# query API
for season, season_rounds in rounds:
    for round_number in season_rounds:    # iterate through rounds of each year
        url = 'https://ergast.com/api/f1/{}/{}/driverStandings.json'
        r = requests.get(url.format(season, round_number), timeout=30)
        r.raise_for_status()
        standings_lists = r.json()['MRData']['StandingsTable']['StandingsLists']
        if not standings_lists:
            # no standings published for this round: skip it instead of raising IndexError
            continue
        standings = standings_lists[0]
        for item in standings['DriverStandings']:
            sources = {'standings': standings, 'entry': item}
            for column, (source, path, convert) in driver_fields.items():
                # a missing or malformed field becomes None so that all columns keep the same length
                try:
                    value = sources[source]
                    for key in path:
                        value = value[key]
                    if convert is not None:
                        value = convert(value)
                except (KeyError, TypeError, ValueError):
                    value = None
                driver_standings[column].append(value)
driver_standings = pd.DataFrame(driver_standings)
# define lookup function to shift points and number of wins from previous rounds
def lookup (df, team, points):
    """Replace a post-race standings column with its value from the previous round.

    Parameters
    ----------
    df : DataFrame with 'season' and 'round' columns plus the `team` and `points` columns.
        It is not modified.
    team : name of the column identifying the competitor (e.g. 'driver').
    points : name of the standings column to shift (points, wins or position).

    Returns
    -------
    A new DataFrame with the rows and columns of `df`, where the original `points`
    column is renamed `<points>_after_race` and a new `points` column, appended
    last, holds the same competitor's value from round - 1 of the same season
    (0 when there is no previous round).
    """
    keys = ['season', team, 'round']
    # Re-label every row with the round that follows it, so that a join on the real
    # round picks up the previous round's value. Joining on the three key columns
    # avoids the collisions a separator-less concatenated string key can produce
    # (e.g. 'x1' + '2' and 'x' + '12').
    previous = df[keys + [points]].copy()
    previous['round'] = previous['round'] + 1
    new_df = df.merge(previous, how = 'left', on = keys)
    new_df = new_df.rename(columns = {points+'_x': points+'_after_race', points+'_y': points})
    # plain assignment: a chained fillna(inplace=True) does not update the frame
    # under pandas Copy-on-Write
    new_df[points] = new_df[points].fillna(0)
    return new_df
# Shift each standings metric to its pre-race value, then discard the post-race copies.
for metric in ('driver_points', 'driver_wins', 'driver_standings_pos'):
    driver_standings = lookup(driver_standings, 'driver', metric)
driver_standings = driver_standings.drop(columns = ['driver_points_after_race',
                                                    'driver_wins_after_race',
                                                    'driver_standings_pos_after_race'])

In [None]:
driver_standings.head(2500)

In [None]:
print(rounds[8:])

In [None]:
# Constructor Standings DF
# One row per constructor per round: the championship standings after that round,
# then shifted by lookup() to the values held before the race.
constructor_rounds = rounds
# Column name -> (record the value lives in, key path, converter or None for "as is").
constructor_fields = {'season'                   : ('standings', ('season',)                     , int),
                      'round'                    : ('standings', ('round',)                      , int),
                      'constructor'              : ('entry'    , ('Constructor', 'constructorId'), None),
                      # float, not int: int("12.5") raises, so fractional totals used
                      # to become None and were later filled with 0
                      'constructor_points'       : ('entry'    , ('points',)                     , float),
                      'constructor_wins'         : ('entry'    , ('wins',)                       , int),
                      'constructor_standings_pos': ('entry'    , ('position',)                   , int)}
constructor_standings = {column: [] for column in constructor_fields}
# query API
for season, season_rounds in constructor_rounds:
    for round_number in season_rounds:
        url  = 'https://ergast.com/api/f1/{}/{}/constructorStandings.json'
        r    = requests.get(url.format(season, round_number), timeout=30)
        r.raise_for_status()
        standings_lists = r.json()['MRData']['StandingsTable']['StandingsLists']
        if not standings_lists:
            # no standings published for this round: skip it instead of raising IndexError
            continue
        standings = standings_lists[0]
        for item in standings['ConstructorStandings']:
            sources = {'standings': standings, 'entry': item}
            for column, (source, path, convert) in constructor_fields.items():
                # a missing or malformed field becomes None so that all columns keep the same length
                try:
                    value = sources[source]
                    for key in path:
                        value = value[key]
                    if convert is not None:
                        value = convert(value)
                except (KeyError, TypeError, ValueError):
                    value = None
                constructor_standings[column].append(value)
constructor_standings = pd.DataFrame(constructor_standings)
constructor_standings = lookup(constructor_standings, 'constructor', 'constructor_points')
constructor_standings = lookup(constructor_standings, 'constructor', 'constructor_wins')
constructor_standings = lookup(constructor_standings, 'constructor', 'constructor_standings_pos')
constructor_standings = constructor_standings.drop(columns = ['constructor_points_after_race',
                                                              'constructor_wins_after_race',
                                                              'constructor_standings_pos_after_race'])

In [None]:
constructor_standings.head(20)

In [None]:
# Qualifying DF
# Scrape the starting-grid table of every race listed on formula1.com for 2015-2022.
grid_tables = []    # one table per race; concatenated once at the end
new_url = 'https://www.formula1.com{}'
for year in range(2015, 2023):
    url = 'https://www.formula1.com/en/results.html/{}/races.html'
    r = requests.get(url.format(year), timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')
    # find links to all circuits for a certain year
    year_links = []
    for page in soup.find_all('a', attrs = {'class':"resultsarchive-filter-item-link FilterTrigger"}):
        link = page.get('href')
        # an anchor without href returns None, which cannot be searched with `in`
        if link and f'/en/results.html/{year}/races/' in link:
            year_links.append(link)
    # for each circuit, switch to the starting grid page and read table
    # NOTE(review): the round number is the position of the link on the page - this
    # assumes the links are listed in calendar order; confirm.
    for n, link in enumerate(year_links):
        link = link.replace('race-result.html', 'starting-grid.html')
        grid_table = pd.read_html(new_url.format(link))[0]
        # str(col) keeps the check safe when a column label is not a string
        grid_table = grid_table.drop(columns = [col for col in grid_table.columns
                                                if 'Unnamed' in str(col)])
        grid_table['season'] = year
        grid_table['round'] = n+1
        grid_tables.append(grid_table)
# concatenate all tables from all years in one pass (row order and index unchanged)
qualifying_results = pd.concat(grid_tables) if grid_tables else pd.DataFrame()
# rename columns
qualifying_results = qualifying_results.rename(columns = {'Pos': 'grid', 'Driver': 'driver_name',
                                                          'Car': 'car', 'Time': 'qualifying_time'})
# drop driver number column
qualifying_results = qualifying_results.drop(columns = 'No')

In [None]:
qualifying_results.head()

In [None]:
# Key columns of races (season, round, circuit_id). `iloc[[0,1,2]]` selected the
# first three rows rather than these three columns.
weather = races.iloc[:, [0,1,2]].copy()

In [None]:
# Weather DF
# .copy() so that adding the 'weather' column below does not write into a slice of races
weather = races.iloc[:,[0,1,2]].copy()
info    = []
# read wikipedia tables
for link in races.url:
    try:
        # download and parse the page once instead of once per inspected table
        tables = pd.read_html(link)
        found  = None
        # look for a 'Weather' row in the first column of the first four tables
        for table in tables[:4]:
            first_column = list(table.iloc[:,0])
            if 'Weather' in first_column:
                found = table.iloc[first_column.index('Weather'), 1]
                break
        if found is None:
            # fallback: open the Italian version of the article and read the cell
            # at a fixed position of its first table
            browser = webdriver.Chrome()
            try:
                browser.get(link)
                # find_element(By..., ...) replaces the find_element_by_* helpers,
                # which were removed in Selenium 4
                browser.find_element(By.LINK_TEXT, 'Italiano').click()
                found = browser.find_element(
                    By.XPATH, '//*[@id="mw-content-text"]/div/table[1]/tbody/tr[9]/td').text
            finally:
                # always close the browser, even when the lookup fails
                browser.quit()
        info.append(found)
    except Exception as e:
        # deliberate best effort: any network, parsing or browser failure is
        # recorded as 'not found' so that info stays aligned with races
        info.append('not found')
# append column with weather information to dataframe  
weather['weather'] = info
# set up a dictionary to convert weather information into keywords
weather_dict = {'weather_warm' : ['soleggiato', 'clear'    , 
                                  'warm'      , 'hot'      , 
                                  'sunny'     , 'fine'     , 
                                  'mild', 'sereno'        ]    ,
               'weather_cold'  : ['cold'      , 'fresh'        , 
                                  'chilly'    , 'cool'    ]    ,
               'weather_dry'   : ['dry'       , 'asciutto']    ,
               'weather_wet'   : ['showers'   , 'wet'          ,  
                                  'rain'      , 'pioggia'      , 
                                  'damp'      , 'thunderstorms', 
                                  'rainy'],
               'weather_cloudy': ['overcast'  , 'nuvoloso'     , 
                                  'clouds'    , 'cloudy'       , 
                                  'grey'      , 'coperto']}
# map new df according to weather dictionary
# Lower-case words of each description with surrounding punctuation removed, so
# that 'Cloudy,' or 'dry;' still match; str() keeps non-string cells from crashing.
weather_words = weather['weather'].map(
    lambda text: {word.strip('.,;:()') for word in str(text).lower().split()})
weather_df = pd.DataFrame({col: weather_words.map(lambda words: int(not words.isdisjoint(keywords)))
                           for col, keywords in weather_dict.items()})
weather_info = pd.concat([weather, weather_df], axis = 1)

In [None]:
weather_info.head()

In [None]:
# Merge races -> results -> driver standings.
# NOTE(review): the merge uses `weather`, whose only extra column ('weather') is
# dropped straight away, so the keyword flags held in `weather_info` never reach
# the model table - confirm whether `weather_info` was intended here.
df1 = races.merge(weather, how='inner', on=['season', 'round', 'circuit_id'])
df1 = df1.drop(columns=['lat', 'long', 'country', 'weather'])
print(races.columns)
print(weather.columns)
print(results.columns)
df2 = df1.merge(results, how='inner', on=['season', 'round', 'circuit_id'])
df2 = df2.drop(columns=['points', 'status', 'time'])
df3 = df2.merge(driver_standings, how='left', on=['season', 'round', 'driver'])
print(df1.columns)
print(df2.columns)
print(df3.columns)

In [None]:
# Attach constructor standings (left join), then keep only the rows that have a
# matching starting-grid entry (inner join on season, round and grid position).
df4 = df3.merge(constructor_standings, how='left', on=['season', 'round', 'constructor'])
final_df = df4.merge(qualifying_results, how='inner', on=['season', 'round', 'grid'])
final_df = final_df.drop(columns=['driver_name', 'car'])
print(df4.columns)                    
print(final_df.columns)

In [None]:
final_df.head(20)

In [None]:
# Driver age in whole years on race day; the two date columns are removed afterwards.
final_df['date'] = pd.to_datetime(final_df['date'])
final_df['date_of_birth'] = pd.to_datetime(final_df['date_of_birth'])
final_df['driver_age'] = [relativedelta(race_day, birthday).years
                          for race_day, birthday in zip(final_df['date'], final_df['date_of_birth'])]
final_df = final_df.drop(columns = ['date', 'date_of_birth'])

In [None]:
print(final_df.columns)
final_df.head(20)

In [None]:
# fill/drop nulls
# Missing standings values become 0 and the columns are stored as integers
# (fractional values are truncated, as int() did before).
standings_cols = ['driver_points', 'driver_wins', 'driver_standings_pos', 'constructor_points',
                  'constructor_wins' , 'constructor_standings_pos']
# assign the result: a chained fillna(inplace=True) on a column does not update
# the frame under pandas Copy-on-Write
final_df[standings_cols] = final_df[standings_cols].fillna(0).astype(int)
# drop every row that still has a missing value in any column
final_df = final_df.dropna()

In [None]:
final_df.head(20)

In [None]:
# convert to boolean to save space
#for col in ['weather_warm', 'weather_cold','weather_dry', 'weather_wet', 'weather_cloudy']:
#    final_df[col] = final_df[col].map(lambda x: bool(x))

In [None]:
# Sanity check before converting qualifying_time: list the values that cannot be
# read as 'M:SS.mmm' or as plain seconds.
# (The previous `final_df.strftime(...)` call raised AttributeError - a DataFrame
# has no strftime - so this cell could never run to completion.)
unparsed_times = []
for row_label, raw_time in final_df.qualifying_time.items():
    text = str(raw_time)
    try:
        if ':' in text:
            # minutes are scaled to seconds; the part after ':' already is seconds
            minutes, seconds = text.split(':', 1)
            60 * float(minutes) + float(seconds)
        else:
            float(text)
    except ValueError:
        unparsed_times.append((row_label, raw_time))
print(len(unparsed_times), 'of', len(final_df), 'qualifying times could not be parsed')
unparsed_times[:20]

In [None]:
# calculate difference in qualifying times
def _qualifying_seconds(value):
    """Convert an 'M:SS.mmm' (or plain seconds) time to seconds; return 0 when unusable.

    0 is the marker for "no time": rows carrying it are filtered out below. A value
    without ':' used to raise IndexError in the previous lambda.
    """
    text = str(value).strip()
    try:
        if ':' in text:
            minutes, seconds = text.split(':', 1)
            total = 60 * float(minutes) + float(seconds)
        else:
            total = float(text)
    except ValueError:
        return 0
    return total if np.isfinite(total) else 0

final_df['qualifying_time'] = final_df['qualifying_time'].map(_qualifying_seconds)
# keep only rows with a usable time; sort so that the first row of every race is
# the lowest grid position; .copy() so the assignment below is not made on a slice
final_df = final_df[final_df['qualifying_time'] != 0].sort_values(['season', 'round', 'grid']).copy()
# gap to the first row of the same race (0 for that row); this is what the earlier
# diff() followed by cumsum() computed
first_time = final_df.groupby(['season', 'round'])['qualifying_time'].transform('first')
final_df['qualifying_time'] = final_df['qualifying_time'] - first_time

In [None]:
# get dummies
# Encoded column -> minimum number of rows a category needs for its dummy to be kept.
dummy_min_rows = {'circuit_id': 70, 'nationality': 140, 'constructor': 140}
df_dum = pd.get_dummies(final_df, columns = list(dummy_min_rows))
# Only columns created by get_dummies are candidates for removal. The previous
# substring test ('constructor' in col) also matched numeric features such as
# constructor_wins and could drop them.
rare_dummies = [col for col in df_dum.columns
                if col not in final_df.columns
                for prefix, min_rows in dummy_min_rows.items()
                if col.startswith(prefix + '_') and df_dum[col].sum() < min_rows]
df_dum = df_dum.drop(columns = rare_dummies)

In [None]:
# scoring function for regression
def score_regression(model, data = None, fitted_scaler = None):
    """Share of races in which the model's best-placed driver actually won.

    For every (season, round) race the features are scaled, the finishing position
    is predicted for each driver, and the driver with the lowest prediction is the
    model's pick. With exactly one pick per race, the precision of that pick is 1
    when the picked driver has podium == 1 and 0 otherwise, so the score is the
    mean of those values over all races.

    Races are scored one (season, round) at a time and averaged over the number of
    races. Previously all seasons sharing a round number were pooled into one
    "race" and the sum was divided by the highest round number.

    Parameters
    ----------
    model : fitted estimator exposing predict(X).
    data : DataFrame with 'season', 'round', 'driver', 'podium' and the feature
        columns; defaults to the notebook-level `df`.
    fitted_scaler : fitted transformer exposing transform(X); defaults to the
        notebook-level `scaler`.

    Returns
    -------
    float in [0, 1]; 0.0 when there are no races to score. The score is also printed.
    """
    data = df if data is None else data
    fitted_scaler = scaler if fitted_scaler is None else fitted_scaler
    hits = 0
    races_scored = 0
    for _, race in data.groupby(['season', 'round'], sort = False):
        X_test = race.drop(['driver', 'podium'], axis = 1)
        #scaling
        X_test = pd.DataFrame(fitted_scaler.transform(X_test), columns = X_test.columns)
        # make predictions: the lowest predicted position is the predicted winner
        predicted_position = np.asarray(model.predict(X_test), dtype = float)
        predicted_winner = int(np.argmin(predicted_position))
        hits += int(race['podium'].iloc[predicted_winner] == 1)
        races_scored += 1
    model_score = hits / races_scored if races_scored else 0.0
    print('Model score: ', model_score)
    return model_score

In [None]:
# scoring function for classification
def score_classification(model, data = None, fitted_scaler = None):
    """Share of races in which the driver with the highest win probability actually won.

    For every (season, round) race the features are scaled, class probabilities are
    predicted for each driver, and the driver with the highest probability in the
    second column of predict_proba is the model's pick. With exactly one pick per
    race, the precision of that pick is 1 when the picked driver has podium == 1 and
    0 otherwise, so the score is the mean of those values over all races.

    Races are scored one (season, round) at a time and averaged over the number of
    races. Previously all seasons sharing a round number were pooled into one
    "race" and the sum was divided by the highest round number.

    Parameters
    ----------
    model : fitted classifier exposing predict_proba(X) with two columns.
    data : DataFrame with 'season', 'round', 'driver', a 0/1 'podium' column and the
        feature columns; defaults to the notebook-level `df`.
    fitted_scaler : fitted transformer exposing transform(X); defaults to the
        notebook-level `scaler`.

    Returns
    -------
    float in [0, 1]; 0.0 when there are no races to score. The score is also printed.
    """
    data = df if data is None else data
    fitted_scaler = scaler if fitted_scaler is None else fitted_scaler
    hits = 0
    races_scored = 0
    for _, race in data.groupby(['season', 'round'], sort = False):
        X_test = race.drop(['driver', 'podium'], axis = 1)
        #scaling
        X_test = pd.DataFrame(fitted_scaler.transform(X_test), columns = X_test.columns)
        # make predictions: second probability column = probability of class 1 (win)
        win_probability = np.asarray(model.predict_proba(X_test), dtype = float)[:, 1]
        predicted_winner = int(np.argmax(win_probability))
        hits += int(race['podium'].iloc[predicted_winner] == 1)
        races_scored += 1
    model_score = hits / races_scored if races_scored else 0.0
    print('Model score: ', model_score)
    return model_score

In [None]:
df = final_df.copy()

In [None]:
df.head()

In [None]:
#train split
print(df.columns)
# Drop the identifier/text columns that are not model features.
# errors='ignore': 'date' and 'date_of_birth' are already removed from final_df by
# the driver-age cell, so dropping them unconditionally raised KeyError; it also
# lets this cell be re-run.
df = df.drop(columns = ['circuit_id', 'url', 'nationality', 'constructor',
                        'date_of_birth', 'date'], errors = 'ignore')
print(df.columns)
# NOTE(review): every season is below 2023, so this keeps all rows, and the scoring
# functions evaluate on the same `df` - confirm that no hold-out season is intended.
train = df[df.season <2023]
X_train = train.drop(['driver', 'podium'], axis = 1)
y_train = train.podium

In [None]:
df.head()

In [None]:
# Standardise the training features; `scaler` is reused by the scoring functions.
print(X_train.columns)
scaler = StandardScaler()
scaled_values = scaler.fit_transform(X_train)
X_train = pd.DataFrame(scaled_values, columns = X_train.columns)

In [None]:
# Linear Regression
# comparison_dict collects one (model name, parameters, score) entry per fitted model.
comparison_dict = {'model': [], 'params': [], 'score': []}
params={'fit_intercept': [True, False]}
for fit_intercept in params['fit_intercept']:
    print(fit_intercept)
    # trailing comma: (fit_intercept) is just the bool, not a tuple like the
    # 'params' entries of the other models
    model_params = (fit_intercept,)
    model = LinearRegression(fit_intercept = fit_intercept)
    model.fit(X_train, y_train)   
    model_score = score_regression(model)
    comparison_dict['model'].append('linear_regression')
    comparison_dict['params'].append(model_params)
    comparison_dict['score'].append(model_score)
# Random Forest Regressor
# (this grid is read by the random-forest cell further down)
params={'criterion': ['friedman_mse'],
        'max_features': [0.8, 1, None],
        'max_depth': [None]}
pd.DataFrame(comparison_dict).groupby('model')['score'].max()  

In [None]:
pprint(comparison_dict)

In [None]:
# Random Forest Regressor
# The grid is defined in this cell so that it does not depend on whichever cell
# assigned `params` last.
params={'criterion': ['friedman_mse'],
        'max_features': [0.8, 1, None],
        'max_depth': [None]}
for criterion in params['criterion']:
    for max_features in params['max_features']:
        print(max_features)
        for max_depth in params['max_depth']:
            print(max_depth)
            model_params = (criterion, max_features, max_depth)
            model = RandomForestRegressor(criterion    = criterion,
                                          max_features = max_features, 
                                          max_depth    = max_depth, 
                                          random_state = 1)
            model.fit(X_train, y_train)
            model_score = score_regression(model)
            comparison_dict['model'].append('random_forest_regressor')
            comparison_dict['params'].append(model_params)
            comparison_dict['score'].append(model_score)
# best score per model so far
pd.DataFrame(comparison_dict).groupby('model')['score'].max()  

In [None]:
pprint(model_score)
pprint(comparison_dict)

In [None]:
# Support Vector Machines
# Exhaustive search over gamma x C x kernel; every fit is scored and recorded.
params = dict(gamma  = np.logspace(-4, -1, 10),
              C      = np.logspace(-2, 1, 10),
              kernel = ['linear', 'poly', 'rbf', 'sigmoid'])
for gamma in params['gamma']:
    print(gamma)
    for c in params['C']:
        print(c)
        for kernel in params['kernel']:
            print(kernel)
            model = svm.SVR(kernel = kernel, C = c, gamma = gamma)
            model.fit(X_train, y_train)
            model_params = (gamma, c, kernel)
            model_score  = score_regression(model)
            comparison_dict['score'].append(model_score)
            comparison_dict['params'].append(model_params)
            comparison_dict['model'].append('svm_regressor')
# Neural network
# Exhaustive search over layer layout x activation x solver x alpha.
params = dict(hidden_layer_sizes = [(80,20,40,5), (75,30,50,10,3)],
              activation         = ['relu', 'tanh'],
              solver             = ['adam'],
              alpha              = np.logspace(-4,1,20))
for hidden_layer_sizes in params['hidden_layer_sizes']:
    print(hidden_layer_sizes)
    for activation in params['activation']:
        print(activation)
        for solver in params['solver']:
            print(solver)
            for alpha in params['alpha']:
                print(alpha)
                model = MLPRegressor(random_state       = 1,
                                     alpha              = alpha,
                                     solver             = solver,
                                     activation         = activation,
                                     hidden_layer_sizes = hidden_layer_sizes)
                model.fit(X_train, y_train)
                model_params = (hidden_layer_sizes, activation, solver, alpha )
                model_score  = score_regression(model)
                print(model_score)
                comparison_dict['score'].append(model_score)
                comparison_dict['params'].append(model_params)
                comparison_dict['model'].append('nn_regressor')
                print(comparison_dict['score'])
#print best models  
pd.DataFrame(comparison_dict).groupby('model')['score'].max()    

In [None]:
# Classification data: same features as the regression set, with podium turned
# into a binary "won the race" label.
# errors='ignore': 'date' and 'date_of_birth' are already removed from final_df by
# the driver-age cell, so dropping them unconditionally raised KeyError.
df        = final_df.drop(columns = ['circuit_id', 'url', 'nationality', 'constructor',
                                     'date', 'date_of_birth'], errors = 'ignore')
df.podium = df.podium.map(lambda x: 1 if x == 1 else 0)
print(df.columns)
#split train
# NOTE(review): every season is below 2023, so this keeps all rows - confirm that no
# hold-out season is intended.
train   = df[df.season <2023]
X_train = train.drop(['driver', 'podium'], axis = 1)
y_train = train.podium
scaler  = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)

In [None]:
# gridsearch dictionary
# Restarted for the classifiers: one (model name, parameters, score) entry per fit.
comparison_dict = dict(model = [], params = [], score = [])
# Logistic Regression
params = dict(penalty = ['l1', 'l2'],
              solver  = ['saga', 'liblinear'],
              C       = np.logspace(-3,1,20))
for penalty in params['penalty']:
    for solver in params['solver']:
        for c in params['C']:
            model = LogisticRegression(C = c, solver = solver, penalty = penalty, max_iter = 10000)
            model.fit(X_train, y_train)
            model_params = (penalty, solver, c)
            model_score  = score_classification(model)
            comparison_dict['score'].append(model_score)
            comparison_dict['params'].append(model_params)
            comparison_dict['model'].append('logistic_regression')
#print best models  
pd.DataFrame(comparison_dict).groupby('model')['score'].max()

In [None]:
pprint(model_score)
pprint(comparison_dict)

In [None]:
# Random Forest Classifier

params={'criterion': ['gini', 'entropy'],
        'max_features': [None],
        'max_depth': [None]}
for criterion in params['criterion']:
    for max_features in params['max_features']:
        # progress output: max_features is the variable of this loop (max_depth was
        # printed here before its own loop had bound it)
        print(max_features)
        for max_depth in params['max_depth']:
            model_params = (criterion, max_features, max_depth)
            # random_state fixed, as for the other models, so that scores are reproducible
            model = RandomForestClassifier(criterion    = criterion,
                                           max_features = max_features,
                                           max_depth    = max_depth,
                                           random_state = 1)
            model.fit(X_train, y_train)
            model_score = score_classification(model)
            comparison_dict['model'].append('random_forest_classifier')
            comparison_dict['params'].append(model_params)
            comparison_dict['score'].append(model_score)
#print best models  
pd.DataFrame(comparison_dict).groupby('model')['score'].max()

In [None]:
# Support Vector Machines
# Exhaustive search over gamma x C x kernel; probability=True enables the
# predict_proba call made by score_classification.
params = dict(gamma  = np.logspace(-4, -1, 20),
              C      = np.logspace(-2, 1, 20),
              kernel = ['linear', 'poly', 'rbf', 'sigmoid'])
for gamma in params['gamma']:
    for c in params['C']:
        for kernel in params['kernel']:
            model = svm.SVC(kernel = kernel, C = c, gamma = gamma, probability = True)
            model.fit(X_train, y_train)
            model_params = (gamma, c, kernel)
            model_score  = score_classification(model)
            comparison_dict['score'].append(model_score)
            comparison_dict['params'].append(model_params)
            comparison_dict['model'].append('svm_classifier')
#print best models  
pd.DataFrame(comparison_dict).groupby('model')['score'].max()

In [None]:
pprint(model_score)
pprint(comparison_dict)

In [None]:
# Neural network
# Exhaustive search over layer layout x activation x alpha; the solver is always
# 'adam' (params['solver'] is recorded in the grid but not iterated).
params = dict(hidden_layer_sizes = [(80,20,40,5), (75,25,50,10)],
              activation         = ['tanh', 'relu'],
              solver             = 'adam',
              alpha              = np.logspace(-4,2,20))
for hidden_layer_sizes in params['hidden_layer_sizes']:
    print(hidden_layer_sizes)
    for activation in params['activation']:
        print(activation)
        for alpha in params['alpha']:
            print(alpha)
            model = MLPClassifier(random_state       = 1,
                                  alpha              = alpha,
                                  solver             = 'adam',
                                  activation         = activation,
                                  hidden_layer_sizes = hidden_layer_sizes)
            model.fit(X_train, y_train)
            model_params = (hidden_layer_sizes, activation, 'adam', alpha )
            model_score  = score_classification(model)
            comparison_dict['score'].append(model_score)
            comparison_dict['params'].append(model_params)
            comparison_dict['model'].append('neural_network_classifier')
#print best models  
pd.DataFrame(comparison_dict).groupby('model')['score'].max()

In [None]:
pprint(model_score)
pprint(comparison_dict)