In [1]:
import pandas   as pd
import numpy    as np
import requests
import bs4
import warnings                as warn
from   bs4                     import BeautifulSoup
from   dateutil.relativedelta  import *
from   sklearn.preprocessing   import StandardScaler
from   selenium                import webdriver
from   sklearn.linear_model    import LinearRegression
from   sklearn.linear_model    import LogisticRegression
from   sklearn.metrics         import precision_score
from   sklearn.ensemble        import RandomForestClassifier
from   sklearn.ensemble        import RandomForestRegressor
from   sklearn.neural_network  import MLPClassifier
from   sklearn.neural_network  import MLPRegressor
from   pprint                  import pprint
from   datetime                import datetime
from   sklearn.metrics         import confusion_matrix
from   sklearn.metrics         import classification_report
warn.filterwarnings("ignore", category=DeprecationWarning)
warn.filterwarnings("ignore", category=Warning)

In [2]:
# Races DF
races = {'season'   : [], 'round'  : [],
        'circuit_id': [], 'lat'    : [],
        'long'      : [], 'country': [],
        'date'      : [], 'url'    : []}
for year in list(range(2022,2023)):
    url  = 'https://ergast.com/api/f1/{}.json'
    r    = requests.get(url.format(year))
    json = r.json()
    for item in json['MRData']['RaceTable']['Races']:
        try:
            races['season'].append(int(item['season']))
        except Exception as e:
            races['season'].append(None)
        try:
            races['round'].append(int(item['round']))
        except Exception as e:
            races['round'].append(None)
        try:
            races['circuit_id'].append(item['Circuit']['circuitId'])
        except Exception as e:
            races['circuit_id'].append(None)
        try:
            races['lat'].append(float(item['Circuit']['Location']['lat']))
        except Exception as e:
            races['lat'].append(None)
        try:
            races['long'].append(float(item['Circuit']['Location']['long']))
        except Exception as e:
            races['long'].append(None)
        try:
            races['country'].append(item['Circuit']['Location']['country'])
        except Exception as e:
            races['country'].append(None)
        try:
            races['date'].append(item['date'])
        except Exception as e:
            races['date'].append(None)
        try:
            races['url'].append(item['url'])
        except Exception as e:
            races['url'].append(None)
races = pd.DataFrame(races)

In [3]:
races.head()

Unnamed: 0,season,round,circuit_id,lat,long,country,date,url
0,2022,1,bahrain,26.0325,50.5106,Bahrain,2022-03-20,http://en.wikipedia.org/wiki/2022_Bahrain_Gran...
1,2022,2,jeddah,21.6319,39.1044,Saudi Arabia,2022-03-27,http://en.wikipedia.org/wiki/2022_Saudi_Arabia...
2,2022,3,albert_park,-37.8497,144.968,Australia,2022-04-10,http://en.wikipedia.org/wiki/2022_Australian_G...
3,2022,4,imola,44.3439,11.7167,Italy,2022-04-24,http://en.wikipedia.org/wiki/2022_Emilia_Romag...
4,2022,5,miami,25.9581,-80.2389,USA,2022-05-08,http://en.wikipedia.org/wiki/2022_Miami_Grand_...


In [4]:
# Rounds DF
rounds = []
i = 0
for year in np.array(races.season.unique()):
    rounds.append([year, list(races[races.season == year]['round'])])
    print(rounds[i][0],rounds[i][1],len(rounds[i][1]))
    i = i + 1

2022 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22] 22


In [5]:
# Results DF
results = {'season'       : [], 'round'      : [],
           'circuit_id'   : [], 'driver'     : [],
           'date_of_birth': [], 'nationality': [],
           'constructor'  : [], 'grid'       : [],
           'time'         : [], 'status'     : [],
           'points'       : [], 'podium'     : []}
for n in list(range(len(rounds))):
    for i in rounds[n][1]:
        url = 'http://ergast.com/api/f1/{}/{}/results.json'
        r = requests.get(url.format(rounds[n][0], i))
        json = r.json()
        for item in json['MRData']['RaceTable']['Races'][0]['Results']:
            try:
                results['season'].append(int(json['MRData']['RaceTable']['Races'][0]['season']))
            except Exception as e:
                results['season'].append(None)
            try:
                results['round'].append(int(json['MRData']['RaceTable']['Races'][0]['round']))
            except Exception as e:
                results['round'].append(None)
            try:
                results['circuit_id'].append(json['MRData']['RaceTable']['Races'][0]['Circuit']['circuitId'])
            except Exception as e:
                results['circuit_id'].append(None)
            try:
                results['driver'].append(item['Driver']['driverId'])
            except Exception as e:
                results['driver'].append(None)
            try:
                results['date_of_birth'].append(item['Driver']['dateOfBirth'])
            except Exception as e:
                results['date_of_birth'].append(None)
            try:
                results['nationality'].append(item['Driver']['nationality'])
            except Exception as e:
                results['nationality'].append(None)
            try:
                results['constructor'].append(item['Constructor']['constructorId'])
            except Exception as e:
                results['constructor'].append(None)
            try:
                results['grid'].append(int(item['grid']))
            except Exception as e:
                results['grid'].append(None)
            try:
                results['time'].append(int(item['Time']['millis']))
            except Exception as e:
                results['time'].append(None)
            try:
                results['status'].append(item['status'])
            except Exception as e:
                results['status'].append(None)
            try:
                results['points'].append(int(item['points']))
            except Exception as e:
                results['points'].append(None)
            try:
                results['podium'].append(int(item['position']))
            except Exception as e:
                results['podium'].append(None)
results = pd.DataFrame(results)

In [6]:
results.head()

Unnamed: 0,season,round,circuit_id,driver,date_of_birth,nationality,constructor,grid,time,status,points,podium
0,2022,1,bahrain,leclerc,1997-10-16,Monegasque,ferrari,1,5853584.0,Finished,26,1
1,2022,1,bahrain,sainz,1994-09-01,Spanish,ferrari,3,5859182.0,Finished,18,2
2,2022,1,bahrain,hamilton,1985-01-07,British,mercedes,5,5863259.0,Finished,15,3
3,2022,1,bahrain,russell,1998-02-15,British,mercedes,9,5864795.0,Finished,12,4
4,2022,1,bahrain,kevin_magnussen,1992-10-05,Danish,haas,7,5868338.0,Finished,10,5


In [7]:
# Driver Standings DF
driver_standings = {'season'     : [], 'round'               : [],
                    'driver'     : [], 'driver_points'       : [],
                    'driver_wins': [], 'driver_standings_pos': []}
# query API
for n in list(range(len(rounds))):
    for i in rounds[n][1]:    # iterate through rounds of each year
        url = 'https://ergast.com/api/f1/{}/{}/driverStandings.json'
        r = requests.get(url.format(rounds[n][0], i))
        json = r.json()
        for item in json['MRData']['StandingsTable']['StandingsLists'][0]['DriverStandings']:
            try:
                driver_standings['season'].append(int(json['MRData']['StandingsTable']['StandingsLists'][0]['season']))
            except Exception as e:
                driver_standings['season'].append(None)
            try:
                driver_standings['round'].append(int(json['MRData']['StandingsTable']['StandingsLists'][0]['round']))
            except Exception as e:
                driver_standings['round'].append(None)
            try:
                driver_standings['driver'].append(item['Driver']['driverId'])
            except Exception as e:
                driver_standings['driver'].append(None)
            try:
                driver_standings['driver_points'].append(int(item['points']))
            except Exception as e:
                driver_standings['driver_points'].append(None)
            try:
                driver_standings['driver_wins'].append(int(item['wins']))
            except Exception as e:
                driver_standings['driver_wins'].append(None)
            try:
                driver_standings['driver_standings_pos'].append(int(item['position']))
            except Exception as e:
                driver_standings['driver_standings_pos'].append(None)
driver_standings = pd.DataFrame(driver_standings)
# define lookup function to shift points and number of wins from previous rounds
def lookup (df, team, points):
    df['lookup1'] = df.season.astype(str) + df[team] + df['round'].astype(str)
    df['lookup2'] = df.season.astype(str) + df[team] + (df['round']-1).astype(str)
    new_df = df.merge(df[['lookup1', points]], how = 'left', left_on='lookup2',right_on='lookup1')
    new_df.drop(['lookup1_x', 'lookup2', 'lookup1_y'], axis = 1, inplace = True)
    new_df.rename(columns = {points+'_x': points+'_after_race', points+'_y': points}, inplace = True)
    new_df[points].fillna(0, inplace = True)
    return new_df
driver_standings = lookup(driver_standings, 'driver', 'driver_points')
driver_standings = lookup(driver_standings, 'driver', 'driver_wins')
driver_standings = lookup(driver_standings, 'driver', 'driver_standings_pos')
driver_standings.drop(['driver_points_after_race', 
                       'driver_wins_after_race', 
                       'driver_standings_pos_after_race'], 
                        axis    = 1, 
                        inplace = True)

In [8]:
driver_standings[driver_standings.driver_points > 0].head(30)

Unnamed: 0,season,round,driver,driver_points,driver_wins,driver_standings_pos
20,2022,2,leclerc,26.0,1.0,1.0
21,2022,2,sainz,18.0,0.0,2.0
23,2022,2,russell,12.0,0.0,4.0
24,2022,2,hamilton,15.0,0.0,3.0
25,2022,2,ocon,6.0,0.0,7.0
27,2022,2,kevin_magnussen,10.0,0.0,5.0
28,2022,2,bottas,8.0,0.0,6.0
30,2022,2,tsunoda,4.0,0.0,8.0
32,2022,2,alonso,2.0,0.0,9.0
33,2022,2,zhou,1.0,0.0,10.0


In [9]:
# Constructor Standings DF
constructor_rounds = rounds
constructor_standings = {'season'          : [], 'round'                    : [],
                         'constructor'     : [], 'constructor_points'       : [],
                         'constructor_wins': [], 'constructor_standings_pos': []}
for n in list(range(len(constructor_rounds))):
    for i in constructor_rounds[n][1]:
        url  = 'https://ergast.com/api/f1/{}/{}/constructorStandings.json'
        r    = requests.get(url.format(constructor_rounds[n][0], i))
        json = r.json()
        for item in json['MRData']['StandingsTable']['StandingsLists'][0]['ConstructorStandings']:
            try:
                constructor_standings['season'].append(int(json['MRData']['StandingsTable']['StandingsLists'][0]['season']))
            except Exception as e:
                constructor_standings['season'].append(None)
            try:
                constructor_standings['round'].append(int(json['MRData']['StandingsTable']['StandingsLists'][0]['round']))
            except Exception as e:
                constructor_standings['round'].append(None)
            try:
                constructor_standings['constructor'].append(item['Constructor']['constructorId'])
            except Exception as e:
                constructor_standings['constructor'].append(None)
            try:
                constructor_standings['constructor_points'].append(int(item['points']))
            except Exception as e:
                constructor_standings['constructor_points'].append(None)
            try:
                constructor_standings['constructor_wins'].append(int(item['wins']))
            except Exception as e:
                constructor_standings['constructor_wins'].append(None)
            try:
                constructor_standings['constructor_standings_pos'].append(int(item['position']))
            except Exception as e:
                constructor_standings['constructor_standings_pos'].append(None)
constructor_standings = pd.DataFrame(constructor_standings)
constructor_standings = lookup(constructor_standings, 'constructor', 'constructor_points')
constructor_standings = lookup(constructor_standings, 'constructor', 'constructor_wins')
constructor_standings = lookup(constructor_standings, 'constructor', 'constructor_standings_pos')
constructor_standings.drop(['constructor_points_after_race', 
                            'constructor_wins_after_race',
                            'constructor_standings_pos_after_race' ],
                            axis = 1, 
                            inplace = True)

In [10]:
constructor_standings.head()

Unnamed: 0,season,round,constructor,constructor_points,constructor_wins,constructor_standings_pos
0,2022,1,ferrari,0.0,0.0,0.0
1,2022,1,mercedes,0.0,0.0,0.0
2,2022,1,haas,0.0,0.0,0.0
3,2022,1,alfa,0.0,0.0,0.0
4,2022,1,alpine,0.0,0.0,0.0


In [11]:
# Qualifying DF
qualifying_results = pd.DataFrame()
# Qualifying times are only available from 1983
for year in list(range(2022,2023)):
    url = 'https://www.formula1.com/en/results.html/{}/races.html'
    r = requests.get(url.format(year))
    soup = BeautifulSoup(r.text, 'html.parser')
    # find links to all circuits for a certain year
    year_links = []
    for page in soup.find_all('a', attrs = {'class':"resultsarchive-filter-item-link FilterTrigger"}):
        link = page.get('href')
        if f'/en/results.html/{year}/races/' in link: 
            year_links.append(link)
    # for each circuit, switch to the starting grid page and read table
    year_df = pd.DataFrame()
    new_url = 'https://www.formula1.com{}'
    for n, link in list(enumerate(year_links)):
        link = link.replace('race-result.html', 'starting-grid.html')
        df = pd.read_html(new_url.format(link))
        df = df[0]
        df['season'] = year
        df['round'] = n+1
        for col in df:
            if 'Unnamed' in col:
                df.drop(col, axis = 1, inplace = True)
        year_df = pd.concat([year_df, df])
    # concatenate all tables from all years  
    qualifying_results = pd.concat([qualifying_results, year_df])
# rename columns
qualifying_results
qualifying_results.rename(columns = {'Pos': 'grid', 'Driver': 'driver_name', 'Car': 'car',
                                     'Time': 'qualifying_time'}, inplace = True)
# drop driver number column
qualifying_results.drop('No', axis = 1, inplace = True)
qualifying_results.qualifying_time = qualifying_results.grid


In [12]:
qualifying_results.head()

Unnamed: 0,grid,driver_name,car,qualifying_time,season,round
0,1,Charles Leclerc LEC,Ferrari,1,2022,1
1,2,Max Verstappen VER,Red Bull Racing RBPT,2,2022,1
2,3,Carlos Sainz SAI,Ferrari,3,2022,1
3,4,Sergio Perez PER,Red Bull Racing RBPT,4,2022,1
4,5,Lewis Hamilton HAM,Mercedes,5,2022,1


In [13]:
# Weather DF    
weather = races.iloc[:,[0,1,2]]
info    = []
# read wikipedia tables
for link in races.url:
    try:
        df = pd.read_html(link)[0]
        if 'Weather' in list(df.iloc[:,0]):
            n = list(df.iloc[:,0]).index('Weather')
            info.append(df.iloc[n,1])
        else:
            df = pd.read_html(link)[1]
            if 'Weather' in list(df.iloc[:,0]):
                n = list(df.iloc[:,0]).index('Weather')
                info.append(df.iloc[n,1])
            else:
                df = pd.read_html(link)[2]
                if 'Weather' in list(df.iloc[:,0]):
                    n = list(df.iloc[:,0]).index('Weather')
                    info.append(df.iloc[n,1])
                else:
                    df = pd.read_html(link)[3]
                    if 'Weather' in list(df.iloc[:,0]):
                        n = list(df.iloc[:,0]).index('Weather')
                        info.append(df.iloc[n,1])
                    else:
                        driver = webdriver.Chrome()
                        driver.get(link)
                        # click language button
                        button = driver.find_element_by_link_text('Italiano')
                        button.click()
                        # find weather in italian with selenium
                        clima = driver.find_element_by_xpath('//*[@id="mw-content-text"]/div/table[1]/tbody/tr[9]/td').text
                        info.append(clima)  
    except Exception as e:
        info.append('not found')
# append column with weather information to dataframe  
weather['weather'] = info
# set up a dictionary to convert weather information into keywords
weather_dict = {'weather_warm' : ['soleggiato', 'clear'    , 
                                  'warm'      , 'hot'      , 
                                  'sunny'     , 'fine'     , 
                                  'mild', 'sereno'        ]    ,
               'weather_cold'  : ['cold'      , 'fresh'        , 
                                  'chilly'    , 'cool'    ]    ,
               'weather_dry'   : ['dry'       , 'asciutto']    ,
               'weather_wet'   : ['showers'   , 'wet'          ,  
                                  'rain'      , 'pioggia'      , 
                                  'damp'      , 'thunderstorms', 
                                  'rainy'],
               'weather_cloudy': ['overcast'  , 'nuvoloso'     , 
                                  'clouds'    , 'cloudy'       , 
                                  'grey'      , 'coperto']}
# map new df according to weather dictionary
weather_df = pd.DataFrame(columns = weather_dict.keys())
for col in weather_df:
    weather_df[col] = weather['weather'].map(lambda x: 1 if any(i in weather_dict[col] for i in x.lower().split()) else 0)
weather_info = pd.concat([weather, weather_df], axis = 1)

In [14]:
weather_info.head()

Unnamed: 0,season,round,circuit_id,weather,weather_warm,weather_cold,weather_dry,weather_wet,weather_cloudy
0,2022,1,bahrain,Clear,1,0,0,0,0
1,2022,2,jeddah,Clear,1,0,0,0,0
2,2022,3,albert_park,Partly cloudy,0,0,0,0,1
3,2022,4,imola,Cloudy in a wet and drying track[2],0,0,0,1,1
4,2022,5,miami,"Warm, partly cloudy",0,0,0,0,1


In [15]:
#merge df
df1 = pd.merge(races, weather, how='inner', 
               on=['season', 'round', 'circuit_id']).drop(['lat', 'long','country','weather'],axis = 1)
df2 = pd.merge(df1, results, how='inner',on=['season', 'round', 'circuit_id']) \
        .drop(['points', 'status', 'time'],axis = 1)
df3 = pd.merge(df2, driver_standings, how='left',on=['season', 'round', 'driver'])
df4 = pd.merge(df3, constructor_standings, how='left', 
               on=['season', 'round', 'constructor'])
final_df = pd.merge(df4, qualifying_results, how='inner', 
                    on=['season', 'round', 'grid']).drop(['driver_name', 'car'],axis = 1)

In [16]:
final_df.head()

Unnamed: 0,season,round,circuit_id,date,url,driver,date_of_birth,nationality,constructor,grid,podium,driver_points,driver_wins,driver_standings_pos,constructor_points,constructor_wins,constructor_standings_pos,qualifying_time
0,2022,1,bahrain,2022-03-20,http://en.wikipedia.org/wiki/2022_Bahrain_Gran...,leclerc,1997-10-16,Monegasque,ferrari,1,1,0.0,0.0,0.0,0.0,0.0,0.0,1
1,2022,1,bahrain,2022-03-20,http://en.wikipedia.org/wiki/2022_Bahrain_Gran...,sainz,1994-09-01,Spanish,ferrari,3,2,0.0,0.0,0.0,0.0,0.0,0.0,3
2,2022,1,bahrain,2022-03-20,http://en.wikipedia.org/wiki/2022_Bahrain_Gran...,hamilton,1985-01-07,British,mercedes,5,3,0.0,0.0,0.0,0.0,0.0,0.0,5
3,2022,1,bahrain,2022-03-20,http://en.wikipedia.org/wiki/2022_Bahrain_Gran...,russell,1998-02-15,British,mercedes,9,4,0.0,0.0,0.0,0.0,0.0,0.0,9
4,2022,1,bahrain,2022-03-20,http://en.wikipedia.org/wiki/2022_Bahrain_Gran...,kevin_magnussen,1992-10-05,Danish,haas,7,5,0.0,0.0,0.0,0.0,0.0,0.0,7


In [17]:
# calculate age of drivers
final_df['date'] = pd.to_datetime(final_df.date)
final_df['date_of_birth'] = pd.to_datetime(final_df.date_of_birth)
final_df['driver_age'] = final_df.apply(lambda x: relativedelta(x['date'], x['date_of_birth']).years, axis=1)
final_df.drop(['date', 'date_of_birth'], axis = 1, inplace = True)

In [18]:
# fill/drop nulls
for col in ['driver_points', 'driver_wins', 'driver_standings_pos', 'constructor_points', 
            'constructor_wins' , 'constructor_standings_pos']:
    final_df[col].fillna(0, inplace = True)
    final_df[col] = final_df[col].map(lambda x: int(x))
final_df.dropna(inplace = True )

In [19]:
final_df.head()

Unnamed: 0,season,round,circuit_id,url,driver,nationality,constructor,grid,podium,driver_points,driver_wins,driver_standings_pos,constructor_points,constructor_wins,constructor_standings_pos,qualifying_time,driver_age
0,2022,1,bahrain,http://en.wikipedia.org/wiki/2022_Bahrain_Gran...,leclerc,Monegasque,ferrari,1,1,0,0,0,0,0,0,1,24
1,2022,1,bahrain,http://en.wikipedia.org/wiki/2022_Bahrain_Gran...,sainz,Spanish,ferrari,3,2,0,0,0,0,0,0,3,27
2,2022,1,bahrain,http://en.wikipedia.org/wiki/2022_Bahrain_Gran...,hamilton,British,mercedes,5,3,0,0,0,0,0,0,5,37
3,2022,1,bahrain,http://en.wikipedia.org/wiki/2022_Bahrain_Gran...,russell,British,mercedes,9,4,0,0,0,0,0,0,9,24
4,2022,1,bahrain,http://en.wikipedia.org/wiki/2022_Bahrain_Gran...,kevin_magnussen,Danish,haas,7,5,0,0,0,0,0,0,7,29


In [20]:
final_df = final_df[final_df['qualifying_time'] != 0]
final_df.sort_values(['season', 'round', 'grid'], inplace = True)
final_df['qualifying_time_diff'] = final_df.groupby(['season', 'round']).qualifying_time.diff()
final_df['qualifying_time'] = final_df.groupby(['season','round']).qualifying_time_diff.cumsum().fillna(0)
final_df.drop('qualifying_time_diff', axis = 1, inplace = True)

In [21]:
# scoring function for regression
def score_regression(model):
    score = 0
    for circuit in df['round'].unique():
        test = df[(df['round'] == circuit)]
        X_test = test.drop(['driver', 'podium'], axis = 1)
        y_test = test.podium
        #scaling
        X_test = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns)
        # make predictions
        prediction_df = pd.DataFrame(model.predict(X_test), columns = ['results'])
        prediction_df['podium'] = y_test.reset_index(drop = True)
        prediction_df['actual'] = prediction_df.podium.map(lambda x: 1 if x == 1 else 0)
        prediction_df.sort_values('results', ascending = True, inplace = True)
        prediction_df.reset_index(inplace = True, drop = True)
        prediction_df['predicted'] = prediction_df.index
        prediction_df['predicted'] = prediction_df.predicted.map(lambda x: 1 if x == 0 else 0)
        score += precision_score(prediction_df.actual, prediction_df.predicted)
    model_score = score / df['round'].unique().max()
    #print(prediction_df.head(20))
    return model_score, prediction_df

In [22]:
# scoring function for classification
def score_classification(model):
    score = 0
    for circuit in df['round'].unique():
        test = df[(df['round'] == circuit)]
        X_test = test.drop(['driver', 'podium'], axis = 1)
        y_test = test.podium
        #scaling
        X_test = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns)
        # make predictions
        prediction_df = pd.DataFrame(model.predict_proba(X_test), columns = ['proba_0', 'proba_1'])
        prediction_df['actual'] = y_test.reset_index(drop = True)
        prediction_df.sort_values('proba_1', ascending = False, inplace = True)
        prediction_df.reset_index(inplace = True, drop = True)
        prediction_df['predicted'] = prediction_df.index
        prediction_df['predicted'] = prediction_df.predicted.map(lambda x: 1 if x == 0 else 0)
        score += precision_score(prediction_df.actual, prediction_df.predicted)
    model_score = score / df['round'].unique().max()
    #print(prediction_df.head(20))
    return model_score, prediction_df

In [23]:
df = final_df.copy()

In [24]:
df.head()

Unnamed: 0,season,round,circuit_id,url,driver,nationality,constructor,grid,podium,driver_points,driver_wins,driver_standings_pos,constructor_points,constructor_wins,constructor_standings_pos,qualifying_time,driver_age
0,2022,1,bahrain,http://en.wikipedia.org/wiki/2022_Bahrain_Gran...,leclerc,Monegasque,ferrari,1,1,0,0,0,0,0,0,0.0,24
18,2022,1,bahrain,http://en.wikipedia.org/wiki/2022_Bahrain_Gran...,max_verstappen,Dutch,red_bull,2,19,0,0,0,0,0,0,1.0,24
1,2022,1,bahrain,http://en.wikipedia.org/wiki/2022_Bahrain_Gran...,sainz,Spanish,ferrari,3,2,0,0,0,0,0,0,2.0,27
17,2022,1,bahrain,http://en.wikipedia.org/wiki/2022_Bahrain_Gran...,perez,Mexican,red_bull,4,18,0,0,0,0,0,0,3.0,32
2,2022,1,bahrain,http://en.wikipedia.org/wiki/2022_Bahrain_Gran...,hamilton,British,mercedes,5,3,0,0,0,0,0,0,4.0,37


In [25]:
#train split
df = df.drop(['circuit_id'], axis = 1)
df = df.drop(['url'], axis = 1)
df = df.drop(['nationality'], axis = 1)
df = df.drop(['constructor'], axis = 1)
df = df.drop(['qualifying_time'], axis = 1)
train = df[df.season <2023]
X_train = train.drop(['driver'], axis = 1)
y_train = train.podium

In [26]:
df.head()

Unnamed: 0,season,round,driver,grid,podium,driver_points,driver_wins,driver_standings_pos,constructor_points,constructor_wins,constructor_standings_pos,driver_age
0,2022,1,leclerc,1,1,0,0,0,0,0,0,24
18,2022,1,max_verstappen,2,19,0,0,0,0,0,0,24
1,2022,1,sainz,3,2,0,0,0,0,0,0,27
17,2022,1,perez,4,18,0,0,0,0,0,0,32
2,2022,1,hamilton,5,3,0,0,0,0,0,0,37


In [27]:
df        = final_df.copy()
df        = df.drop(['circuit_id'] , axis = 1)
df        = df.drop(['url']        , axis = 1)
df        = df.drop(['nationality'], axis = 1)
df        = df.drop(['constructor'], axis = 1)
df.podium = df.podium.map(lambda x: 1 if x == 1 else 0)
train   = df[df.season <2023]
X_train = train.drop(['driver', 'podium'], axis = 1)
y_train = train.podium
scaler  = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)

In [28]:
# Linear Regression
comparison_dict = {'model': [], 'params': [], 'score': []}
params={'fit_intercept': [True, False]}
for fit_intercept in params['fit_intercept']:
    model_params = (fit_intercept)
    model = LinearRegression(fit_intercept = fit_intercept)
    model.fit(X_train, y_train)   
    model_score, model_prediction = score_regression(model)
    comparison_dict['model'].append('linear_regression')
    comparison_dict['params'].append(model_params)
    comparison_dict['score'].append(model_score)
print(model_prediction)
pd.DataFrame(comparison_dict).groupby('model')['score'].max()  

     results  podium  actual  predicted
0  -0.138519       0       0          1
1  -0.105799       0       0          0
2  -0.101240       0       0          0
3  -0.096330       0       0          0
4  -0.095623       0       0          0
5  -0.090008       0       0          0
6  -0.082747       0       0          0
7  -0.075705       0       0          0
8  -0.062759       0       0          0
9  -0.049487       0       0          0
10 -0.045711       0       0          0
11 -0.044659       0       0          0
12 -0.038258       0       0          0
13 -0.032537       0       0          0
14 -0.026949       0       0          0
15 -0.023037       0       0          0
16  0.011271       0       0          0
17  0.071860       0       0          0
18  0.113625       0       0          0
19  0.857133       1       1          0


model
linear_regression    0.0
Name: score, dtype: float64

In [29]:
# Random Forest Regressor
params={'criterion': ['friedman_mse'],
        'max_features': [0.8, 1, None],
        'max_depth': [None]}
for criterion in params['criterion']:
    for max_features in params['max_features']:
        for max_depth in params['max_depth']:
            model_params = (criterion, max_features, max_depth)
            model = RandomForestRegressor(criterion    = criterion,
                                          max_features = max_features, 
                                          max_depth    = max_depth, 
                                          random_state = 1)
            model.fit(X_train, y_train)
            model_score, prediction_df = score_regression(model)
            comparison_dict['model'].append('random_forest_regressor')
            comparison_dict['params'].append(model_params)
            comparison_dict['score'].append(model_score)
print(model_prediction)
pd.DataFrame(comparison_dict).groupby('model')['score'].max()  

     results  podium  actual  predicted
0  -0.138519       0       0          1
1  -0.105799       0       0          0
2  -0.101240       0       0          0
3  -0.096330       0       0          0
4  -0.095623       0       0          0
5  -0.090008       0       0          0
6  -0.082747       0       0          0
7  -0.075705       0       0          0
8  -0.062759       0       0          0
9  -0.049487       0       0          0
10 -0.045711       0       0          0
11 -0.044659       0       0          0
12 -0.038258       0       0          0
13 -0.032537       0       0          0
14 -0.026949       0       0          0
15 -0.023037       0       0          0
16  0.011271       0       0          0
17  0.071860       0       0          0
18  0.113625       0       0          0
19  0.857133       1       1          0


model
linear_regression          0.0
random_forest_regressor    0.0
Name: score, dtype: float64

In [30]:
# Logistic Regression
#comparison_dict ={'model':[], 
#                  'params': [],
#                  'score': []}
# Logistic Regression
params={'penalty': ['l1', 'l2'],
        'solver': ['saga', 'liblinear'],
        'C': np.logspace(-3,1,20)}
for penalty in params['penalty']:
    for solver in params['solver']:
        for c in params['C']:
            model_params = (penalty, solver, c)
            model = LogisticRegression(penalty = penalty, solver = solver, C = c, max_iter = 10000)
            model.fit(X_train, y_train)
            model_score, model_prediction = score_classification(model)
            comparison_dict['model'].append('logistic_regression')
            comparison_dict['params'].append(model_params)
            comparison_dict['score'].append(model_score)
print(model_prediction)
pd.DataFrame(comparison_dict).groupby('model')['score'].max()

     proba_0   proba_1  actual  predicted
0   0.067031  0.932969       1          1
1   0.789947  0.210053       0          0
2   0.931079  0.068921       0          0
3   0.965687  0.034313       0          0
4   0.976953  0.023047       0          0
5   0.997810  0.002190       0          0
6   0.999309  0.000691       0          0
7   0.999475  0.000525       0          0
8   0.999738  0.000262       0          0
9   0.999831  0.000169       0          0
10  0.999868  0.000132       0          0
11  0.999904  0.000096       0          0
12  0.999946  0.000054       0          0
13  0.999976  0.000024       0          0
14  0.999980  0.000020       0          0
15  0.999982  0.000018       0          0
16  0.999987  0.000013       0          0
17  0.999989  0.000011       0          0
18  0.999991  0.000009       0          0
19  0.999996  0.000004       0          0


model
linear_regression          0.000000
logistic_regression        0.636364
random_forest_regressor    0.000000
Name: score, dtype: float64

In [31]:
# Random Forest Classifier
params={'criterion': ['gini', 'entropy'],
        'max_features': [None],
        'max_depth': list([None])}
for criterion in params['criterion']:
    for max_features in params['max_features']:
        for max_depth in params['max_depth']:
            model_params = (criterion, max_features, max_depth)
            model = RandomForestClassifier(criterion = criterion, max_features = max_features, max_depth = max_depth)
            model.fit(X_train, y_train)
            model_score, model_prediction = score_classification(model)
            comparison_dict['model'].append('random_forest_classifier')
            comparison_dict['params'].append(model_params)
            comparison_dict['score'].append(model_score)
print(model_prediction)
pd.DataFrame(comparison_dict).groupby('model')['score'].max()

    proba_0  proba_1  actual  predicted
0      0.21     0.79       1          1
1      0.76     0.24       0          0
2      0.95     0.05       0          0
3      0.99     0.01       0          0
4      0.99     0.01       0          0
5      1.00     0.00       0          0
6      1.00     0.00       0          0
7      1.00     0.00       0          0
8      1.00     0.00       0          0
9      1.00     0.00       0          0
10     1.00     0.00       0          0
11     1.00     0.00       0          0
12     1.00     0.00       0          0
13     1.00     0.00       0          0
14     1.00     0.00       0          0
15     1.00     0.00       0          0
16     1.00     0.00       0          0
17     1.00     0.00       0          0
18     1.00     0.00       0          0
19     1.00     0.00       0          0


model
linear_regression           0.000000
logistic_regression         0.636364
random_forest_classifier    1.000000
random_forest_regressor     0.000000
Name: score, dtype: float64

In [41]:
# Neural network
params={'hidden_layer_sizes': [(60,20,40,5), (50,25,50,10)], 
        'activation'        : ['tanh', 'relu'], 
        'solver'            : 'adam', 
        'alpha'             : np.logspace(-4,2,20)} 
for hidden_layer_sizes in params['hidden_layer_sizes']:
    for activation in params['activation']:
        for alpha in params['alpha']:
            model_params = (hidden_layer_sizes, activation, 'adam', alpha )
            model = MLPClassifier(hidden_layer_sizes = hidden_layer_sizes,
                                  activation = activation,
                                  solver = 'adam', 
                                  alpha = alpha, 
                                  random_state = 1)
            model.fit(X_train, y_train)
            model_score, model_prediction = score_classification(model)
            comparison_dict['model'].append('neural_network_classifier')
            comparison_dict['params'].append(model_params)
            comparison_dict['score'].append(model_score) 
print(model_prediction.head(50))
pd.DataFrame(comparison_dict).groupby('model')['score'].max()

predictions = model.predict(X_train)
target_names = df.driver
# Print the predictions
newpred = np.zeros(len(predictions))
for i in range(len(predictions)):
  newpred[i] = predictions[i]
p  = np.nan_to_num(newpred)
j = 0
for i in p:
    i = i * 100
    i = int(i)
    p[j] = i
    j = j + 1
y  = np.nan_to_num(y_train) 
print(classification_report(y, p, target_names = target_names,labels=np.unique(p)))

     proba_0   proba_1  actual  predicted
0   0.706873  0.293127       0          1
1   0.706873  0.293127       0          0
2   0.706873  0.293127       0          0
3   0.706874  0.293126       0          0
4   0.706875  0.293125       0          0
5   0.706876  0.293124       0          0
6   0.706876  0.293124       0          0
7   0.706876  0.293124       0          0
8   0.706877  0.293123       0          0
9   0.706878  0.293122       0          0
10  0.706878  0.293122       0          0
11  0.706878  0.293122       0          0
12  0.706878  0.293122       0          0
13  0.706878  0.293122       0          0
14  0.706879  0.293121       0          0
15  0.706879  0.293121       0          0
16  0.706879  0.293121       0          0
17  0.706879  0.293121       0          0
18  0.706880  0.293120       0          0
19  0.706880  0.293120       1          0
                 precision    recall  f1-score   support

        leclerc       0.95      1.00      0.97       406

  