In [22]:
# importing needed libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Creating driver performance index (podium rate)

In [23]:
# reading driver information csv
d_stats = pd.read_csv('/media/jon/External/DS150_Project/f1db_csv/Main/driver_data.csv')

In [24]:
# initializing list of unneeded columns
c_drops1 = ['Unnamed: 0', 'Nationality', 'Seasons', 'Championships', 'Race_Entries',
       'Pole_Positions', 'Race_Wins', 'Fastest_Laps',
       'Points', 'Active', 'Championship Years', 'Decade', 'Points_Per_Entry', 'Years_Active', 'Champion']
# dropping unneeded columns
d_stats = d_stats.drop(c_drops1,axis=1)
# checking dropped columns
d_stats.head()

Unnamed: 0,Driver,Race_Starts,Podiums
0,Carlo Abate,0.0,0.0
1,George Abecassis,2.0,0.0
2,Kenny Acheson,3.0,0.0
3,Andrea de Adamich,30.0,0.0
4,Philippe Adams,2.0,0.0


In [25]:
d_stats.columns

Index(['Driver', 'Race_Starts', 'Podiums'], dtype='object')

In [26]:
# creating podium per race start rate
def div(row):
    try:
        return row['Podiums']/row['Race_Starts']
    except ZeroDivisionError:
        return np.NaN
d_stats['podium_rate'] = d_stats.apply(div,axis=1)

In [27]:
d_stats.head()

Unnamed: 0,Driver,Race_Starts,Podiums,podium_rate
0,Carlo Abate,0.0,0.0,
1,George Abecassis,2.0,0.0,0.0
2,Kenny Acheson,3.0,0.0,0.0
3,Andrea de Adamich,30.0,0.0,0.0
4,Philippe Adams,2.0,0.0,0.0


In [28]:
# function to make finding a specific driver easier
# double checking information with original data set
def driver_info(driver_name):
    return d_stats[d_stats['Driver'] == driver_name]

In [29]:
driver_info('Juan Manuel Fangio')

Unnamed: 0,Driver,Race_Starts,Podiums,podium_rate
238,Juan Manuel Fangio,51.0,35.0,0.686275


### Merging driver ids with d_stats df

In [30]:
# assigning driver id to d_stats df
# importing driver id guide data set
d_ids = pd.read_csv('/media/jon/External/DS150_Project/f1db_csv/Guide Sets/drivers.csv')

In [31]:
# initializing list of unneeded columns
c_drops3 = ['driverRef', 'number', 'code', 'dob',
       'nationality', 'url']
# dropping unneeded columns
d_ids = d_ids.drop(c_drops3,axis=1)
# checking dropped columns
d_ids.head()

Unnamed: 0,driverId,forename,surname
0,1,Lewis,Hamilton
1,2,Nick,Heidfeld
2,3,Nico,Rosberg
3,4,Fernando,Alonso
4,5,Heikki,Kovalainen


In [32]:
d_ids[d_ids['surname'] == 'Fangio']

Unnamed: 0,driverId,forename,surname
579,579,Juan,Fangio


In [33]:
# combining forename and surname into one column
d_ids['Driver'] = d_ids['forename'] +' ' + d_ids['surname']
# dropping seperate forename and surname columns
d_ids = d_ids.drop(['forename', 'surname'],axis=1)
# checking dataframe
d_ids.head()

Unnamed: 0,driverId,Driver
0,1,Lewis Hamilton
1,2,Nick Heidfeld
2,3,Nico Rosberg
3,4,Fernando Alonso
4,5,Heikki Kovalainen


In [41]:
d_ids[d_ids['Driver'] == 'Andrea de Adamich']

Unnamed: 0,driverId,Driver
332,333,Andrea de Adamich


### Some driver names in *d_stats* include regional "middle" names (e.g. Juan **Manuel** Fangio). These "middle" names are not included in the *d_ids* df. Below, I remove these extra names from the *d_stats* df.

In [42]:
# function to remove middle names from drivers
def remove_middle_name(driver_name):
    driver_name = driver_name
    # Splitting names at spaces
    name = driver_name.split()

    # Removing the "middle" names (if there is one)
    if len(name) > 2:
        name.pop(1)
    # rejoining the driver name into 1 string
    return ' '.join(name)

d_stats['Driver'] = [remove_middle_name(name) for name in list(d_stats['Driver'])]

In [43]:
d_stats

Unnamed: 0_level_0,driverId,Driver,Race_Starts,Podiums,podium_rate
Rankings,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,579,Juan Fangio,51.0,35.0,0.686275
2,1,Lewis Hamilton,311.0,191.0,0.614148
3,117,Alain Prost,199.0,106.0,0.532663
4,30,Michael Schumacher,306.0,155.0,0.506536
5,102,Ayrton Senna,161.0,80.0,0.496894
...,...,...,...,...,...
110,155,Kamui Kobayashi,75.0,1.0,0.013333
111,11,Takuma Sato,90.0,1.0,0.011111
112,813,Pastor Maldonado,95.0,1.0,0.010526
113,347,Jo Bonnier,104.0,1.0,0.009615


### Need to combine driver id guide set with d_stats df.

In [44]:
d_stats = d_ids.merge(d_stats, on='Driver', how='left')

In [45]:
d_stats = d_stats[(d_stats['podium_rate']!=0)&(d_stats['Race_Starts']>50)].sort_values('podium_rate',ascending=False)

In [46]:
d_stats['Ranking'] = range(len(d_stats['podium_rate']))
rankings = [i+1 for i in list(d_stats['Ranking'])]
d_stats['Rankings'] = rankings
d_stats = d_stats.drop('Ranking',axis=1)
d_stats = d_stats.set_index('Rankings')

In [48]:
d_stats.head()

Unnamed: 0_level_0,driverId_x,Driver,driverId_y,Race_Starts,Podiums,podium_rate
Rankings,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,579,Juan Fangio,579.0,51.0,35.0,0.686275
2,1,Lewis Hamilton,1.0,311.0,191.0,0.614148
3,117,Alain Prost,117.0,199.0,106.0,0.532663
4,30,Michael Schumacher,30.0,306.0,155.0,0.506536
5,102,Ayrton Senna,102.0,161.0,80.0,0.496894
