# F1 Race Predictions
### APC's Kaggle Project by Juan Carlos Soriano Valle (1493037)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
from wordcloud import WordCloud
import descartes
import datetime

import geopandas as gpd
from shapely.geometry import Point, Polygon
from dateutil.relativedelta import *

ModuleNotFoundError: No module named 'wordcloud'

In [None]:
# Directory that holds the dataset, followed by the extension-less file name
# of every table; cells below build "<path><name>.csv" from these.
path = "./data/"

driversPath, constructorsPath = "drivers", "constructors"
racesPath, resultsPath = "races", "results"
seasonsPath, circuitsPath = "seasons", "circuits"
qualifyingPath = "qualifying"
driverStandingsPath, constructorStandingsPath = "driver_standings", "constructor_standings"
constructorResultsPath = "constructor_results"
statusPath, lapTimesPath = "status", "lap_times"

### Status

In [None]:
# Load the status table and preview its first rows with every column visible.
pd.set_option("display.max_columns", None)
statusDF = pd.read_csv(f"{path}{statusPath}.csv")

print(statusDF.shape)
print("Total status:", len(statusDF))
display(statusDF.head(10))

### Drivers

In [None]:
# Load the drivers table and preview its first rows with every column visible.
pd.set_option("display.max_columns", None)
driversDF = pd.read_csv(f"{path}{driversPath}.csv")

print(driversDF.shape)
print("Total drivers:", len(driversDF))
driversDF.head(10)


In [None]:
# Number of NaN entries in each column of the drivers table.
print("missing values:\n",driversDF.isna().sum())

In [None]:
# Total number of "N" characters found in the string values of `number`.
# NOTE(review): presumably this counts the "\N" missing-value placeholder - confirm.
print(driversDF.number.str.count("N").sum())
# Number of non-NaN entries in the same column, for comparison.
print(driversDF.number.count())

### Constructors

In [None]:
# Load the constructors table and preview its first rows with every column visible.
pd.set_option("display.max_columns", None)
constructorsDF = pd.read_csv(f"{path}{constructorsPath}.csv")

print(constructorsDF.shape)
print("Total Constructors:", len(constructorsDF))
constructorsDF.head(10)

In [None]:
# Number of NaN entries in each column of the constructors table.
print("missing values:\n",constructorsDF.isna().sum())

### Races

In [None]:
# Load the races table and preview its first rows with every column visible.
pd.set_option("display.max_columns", None)
racesDF = pd.read_csv(f"{path}{racesPath}.csv")

print(racesDF.shape)
print("Total Races:", len(racesDF))
racesDF.head(10)

In [None]:
# Number of NaN entries in each column of the races table.
print("missing values:\n",racesDF.isna().sum())

In [None]:
# One [season, list of round numbers] entry per season, ordered by season.
rounds = sorted(
    [season, list(racesDF.loc[racesDF.year == season, "round"])]
    for season in racesDF.year.unique()
)

# Highest round number of each season, aggregated once and split into the
# two parallel lists used by the bar chart.
maxRoundPerYear = racesDF.groupby('year')['round'].max().reset_index()
countRounds = maxRoundPerYear['round'].tolist()
yearRounds = maxRoundPerYear['year'].tolist()

In [None]:
"""fig = plt.figure(figsize=(12,5))
plt.title("nRaces per Season")
ax = fig.add_axes([0,0,1,1])
ax.bar(yearRounds,countRounds)
plt.show()"""

# Bar chart of the highest round number (i.e. number of races) of each season.
fig_dims=(15,5)
fig, ax = plt.subplots(figsize=fig_dims)
# Current seaborn releases reject positional x/y data, so pass both by keyword
# and draw on the axes created above rather than the implicit current axes.
sns.barplot(x=yearRounds, y=countRounds, ax=ax)
ax.set_xlabel("Years")
ax.set_ylabel("nRaces")
ax.set_title("nRaces per Season", fontsize=20)
# Two ticks of headroom above the longest season (23 ticks for a 21-race maximum).
ax.set_yticks(range(max(countRounds) + 2))

# Hide every second tick on both axes so neighbouring labels do not overlap.
for axis in (ax.xaxis, ax.yaxis):
    for tick in axis.get_major_ticks()[1::2]:
        tick.set_visible(False)

ax.set_xticklabels(yearRounds,rotation=45);


### Results

In [None]:
# Load the results table and enrich every row with the season, round and
# circuit of its race plus the fastest lap expressed in milliseconds.
resultsDF = pd.read_csv(path+resultsPath+".csv")

print(resultsDF.shape)
print("Total Races:", resultsDF.shape[0])
pd.set_option("display.max_columns", None)


def _lapTimeToMillis(lapTime, default=300000):
    """Convert a "M:SS.mmm" lap time to milliseconds.

    Anything that is not of the form "<minutes>:<seconds>" (the "\\N"
    placeholder, NaN, empty strings, ...) yields `default`.
    """
    try:
        minutes, seconds = str(lapTime).split(":")
        return float(int(minutes) * 60000 + round(float(seconds) * 1000))
    except ValueError:
        return default


# One indexed lookup per column instead of scanning racesDF once per result row.
# A raceId missing from racesDF now yields NaN instead of raising IndexError.
raceInfo = racesDF.drop_duplicates("raceId").set_index("raceId")
years = resultsDF["raceId"].map(raceInfo["year"]).tolist()
rounds = resultsDF["raceId"].map(raceInfo["round"]).tolist()
circuits = resultsDF["raceId"].map(raceInfo["circuitId"]).tolist()
times = [_lapTimeToMillis(lapTime) for lapTime in resultsDF.fastestLapTime]

resultsDF["year"] = years
resultsDF["round"] = rounds
resultsDF["circuitId"] = circuits
resultsDF["fastestLapMill"] = times
display(resultsDF.head(22))
display(resultsDF.tail())

In [None]:
# Number of NaN entries in each column of the results table.
print("missing values:\n",resultsDF.isna().sum())

In [None]:
# How many result rows carry each statusId, most frequent first.
status = resultsDF['statusId'].value_counts()

In [None]:
# Pie chart of result statuses: statuses seen fewer than 200 times are pooled
# into a single "other" slice.
statusFrame = status.to_frame()

# statusId -> status text, restricted to the ids that actually occur.
statusNames = statusDF.drop_duplicates("statusId").set_index("statusId")["status"]
indexnames = statusNames[statusNames.index.isin(statusFrame.index)].to_dict()
statusDef = statusFrame.rename(index=indexnames)

# The single column produced by value_counts().to_frame() is called
# "statusId" on pandas < 2 and "count" on pandas >= 2, so select it by position.
s = statusDef.iloc[:, 0]
otherS = s.groupby(np.where(s>=200,s.index,'other')).sum()
otherS = otherS.sort_values(ascending=False)
otherS.plot.pie(figsize=(10, 10))
display(otherS)

In [None]:
# Pie chart of race wins per driver, labelled with the driver's surname.
winResults = resultsDF[resultsDF['position'] == "1"]
winCount = winResults["driverId"].value_counts().to_frame()

# driverId -> surname for every driver that has at least one win.
indexnames = {
    driverId: driversDF.loc[driversDF["driverId"] == driverId, "surname"].iloc[0]
    for driverId in winCount.index
}
winCountDef = winCount.rename(index=indexnames)

winCountDef.plot.pie(figsize=(10, 10), subplots=True);

In [None]:
# Word cloud of race winners: each surname is sized by its number of wins.
wins = winResults['driverId'].tolist()

surnameById = driversDF.drop_duplicates("driverId").set_index("driverId")["surname"]
winnerSurnames = [surnameById[int(driverId)] for driverId in wins]

# One surname per line, kept for inspection.
str2 = "\n".join(winnerSurnames)

# Feed explicit frequencies instead of a flattened text: going through
# str(list) and character replaces left brackets in the text and let the
# tokenizer split multi-word surnames into separate words.
winsPerSurname = pd.Series(winnerSurnames).value_counts().to_dict()
wordcloud = WordCloud(width=1600, height=800, margin=0, collocations=False, colormap="hot").generate_from_frequencies(winsPerSurname)

plt.figure( figsize=(20,10), facecolor='k')
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.tight_layout(pad=0)
plt.savefig('cloudword.png')
plt.show()



### Seasons

In [None]:
# Load the seasons table and preview its first and last rows.
pd.set_option("display.max_columns", None)
seasonsDF = pd.read_csv(f"{path}{seasonsPath}.csv")

print(seasonsDF.shape)
print("Total Seasons:", len(seasonsDF))
display(seasonsDF.head(5))
display(seasonsDF.tail(5))

In [None]:
# Number of NaN entries in each column of the seasons table.
print("missing values:\n",seasonsDF.isna().sum())

### Circuits

In [None]:
# Load the circuits table and preview its first and last rows.
pd.set_option("display.max_columns", None)
circuitsDF = pd.read_csv(f"{path}{circuitsPath}.csv")

print(circuitsDF.shape)
print("Total Circuits:", len(circuitsDF))
display(circuitsDF.head(6))
display(circuitsDF.tail(5))

In [None]:
# Number of NaN entries in each column of the circuits table.
print("missing values:\n",circuitsDF.isna().sum())

### Qualifying

In [None]:
# Load the qualifying table and preview its first rows with every column visible.
pd.set_option("display.max_columns", None)
qualifyingDF = pd.read_csv(f"{path}{qualifyingPath}.csv")

print(qualifyingDF.shape)
print("Total Qualifying:", len(qualifyingDF))
display(qualifyingDF.head(22))

In [None]:
# Number of NaN entries in each column of the qualifying table.
print("missing values:\n",qualifyingDF.isna().sum())

### Driver Standings

In [None]:
# Load the driver standings table and preview its first rows.
pd.set_option("display.max_columns", None)
driverStandingsDF = pd.read_csv(f"{path}{driverStandingsPath}.csv")

print(driverStandingsDF.shape)
print("Total Driver Standings:", len(driverStandingsDF))
display(driverStandingsDF.head(8))

In [None]:
# Number of NaN entries in each column of the driver standings table.
print("missing values:\n",driverStandingsDF.isna().sum())

### Constructor Standings

In [None]:
# Load the constructor standings table and preview its first rows.
pd.set_option("display.max_columns", None)
constructorStandingsDF = pd.read_csv(f"{path}{constructorStandingsPath}.csv")

print(constructorStandingsDF.shape)
print("Total Constructor Standings:", len(constructorStandingsDF))
display(constructorStandingsDF.head(6))

In [None]:
# Number of NaN entries in each column of the constructor standings table.
print("missing values:\n",constructorStandingsDF.isna().sum())

### Constructor Results

In [None]:
# Load the constructor results table and preview its first rows.
pd.set_option("display.max_columns", None)
constructorResultsDF = pd.read_csv(f"{path}{constructorResultsPath}.csv")

print(constructorResultsDF.shape)
print("Total Constructor Results:", len(constructorResultsDF))
display(constructorResultsDF.head(6))

In [None]:
# Number of NaN entries in each column of the constructor results table.
print("missing values:\n",constructorResultsDF.isna().sum())

In [None]:
# Total number of "N" characters found in the string values of `status`.
# NOTE(review): presumably this counts the "\N" missing-value placeholder - confirm.
print(constructorResultsDF.status.str.count("N").sum())

In [None]:
# Lift the row display limit so the whole `status` column is rendered.
# This is a global pandas option: it stays in effect for every later cell.
pd.set_option("display.max_rows", None)
constructorResultsDF.loc[:,"status"]


In [None]:
# Frequency of each distinct value of the `status` column.
constructorResultsDF['status'].value_counts()

### Lap Times

In [None]:
# Load the lap times table and preview its first rows.
pd.set_option("display.max_columns", None)
lapTimesDF = pd.read_csv(f"{path}{lapTimesPath}.csv")

print(lapTimesDF.shape)
print("Total Lap Times:", len(lapTimesDF))
display(lapTimesDF.head(6))

In [None]:
# Lap-by-lap position chart of race 973: one line per driver, legend showing
# the driver codes.
shownRaceId = 973
raceLaps = lapTimesDF[lapTimesDF["raceId"]==shownRaceId]

fig = plt.figure(figsize=(16,9))
lapAx = sns.lineplot(x="lap", y="position", hue="driverId", data=raceLaps, palette="Paired", legend="full")
lapAx.set_yticks(range(19))
lapAx.set_xticks(range(67))
lapAx.yaxis.set_ticks_position("both")

# Driver codes ordered by driverId, used as the legend labels.
driversNum = np.sort(raceLaps["driverId"].unique())
names = [
    driversDF.loc[driversDF["driverId"]==num, "code"].iloc[0]
    for num in driversNum
]

# Season and name of the race, used as the title.
isShownRace = racesDF["raceId"]==shownRaceId
race = {
    "year": racesDF.loc[isShownRace, "year"].iloc[0],
    "name": racesDF.loc[isShownRace, "name"].iloc[0],
}
plt.title(f"{race['year']} {race['name']}", fontsize=20)
plt.legend(title="Drivers",loc='best',labels=names);

In [None]:
# Lap-by-lap position chart of race 867: one line per driver, legend showing
# the driver codes.
shownRaceId = 867
raceLaps = lapTimesDF[lapTimesDF["raceId"]==shownRaceId]

fig = plt.figure(figsize=(16,9))
lapAx = sns.lineplot(x="lap", y="position", hue="driverId", data=raceLaps, palette="Paired", legend="full")
lapAx.set_yticks(range(25))
lapAx.set_xticks(range(58))
lapAx.yaxis.set_ticks_position("both")

# Driver codes ordered by driverId, used as the legend labels.
driversNum = np.sort(raceLaps["driverId"].unique())
names = [
    driversDF.loc[driversDF["driverId"]==num, "code"].iloc[0]
    for num in driversNum
]

# Season and name of the race, used as the title.
isShownRace = racesDF["raceId"]==shownRaceId
race = {
    "year": racesDF.loc[isShownRace, "year"].iloc[0],
    "name": racesDF.loc[isShownRace, "name"].iloc[0],
}
plt.title(f"{race['year']} {race['name']}", fontsize=20)
plt.legend(title="Drivers",loc='best',labels=names);

---

In [None]:
# Side-by-side world maps with the race locations of the 1950 and 2019 seasons.

# Authority string instead of the deprecated {"init": "epsg:4326"} dict form.
crs = "EPSG:4326"
# NOTE(review): gpd.datasets is no longer shipped with recent geopandas
# releases; on such installs read a local Natural Earth file here - confirm
# against the installed version.
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))

# circuitId -> (lat, lng), looked up once for both seasons.
circuitCoords = circuitsDF.drop_duplicates("circuitId").set_index("circuitId")[["lat", "lng"]]


def _seasonWithCoordinates(year):
    """Return a copy of the races of `year` with the circuit `lat`/`lng` columns inserted."""
    # .copy() so the inserts below never write into a slice of racesDF.
    seasonRaces = racesDF.loc[racesDF["year"]==year].copy()
    seasonRaces.insert(8, "lat", seasonRaces["circuitId"].map(circuitCoords["lat"]).to_numpy(), True)
    seasonRaces.insert(9, "lng", seasonRaces["circuitId"].map(circuitCoords["lng"]).to_numpy(), True)
    return seasonRaces


def _toGeoFrame(seasonRaces):
    """Build a GeoDataFrame holding one (lng, lat) point per race."""
    points = [Point(xy) for xy in zip(seasonRaces["lng"], seasonRaces["lat"])]
    return gpd.GeoDataFrame(seasonRaces, geometry=points, crs=crs)


races = _seasonWithCoordinates(1950)
races2019 = _seasonWithCoordinates(2019)
geo1950 = _toGeoFrame(races)
geo2019 = _toGeoFrame(races2019)

fig,(ax1, ax2) = plt.subplots(ncols=2)

# Left panel: 1950 season.
world.plot(ax=ax1, alpha=0.4)
geo1950.plot(ax=ax1, markersize=40, marker="o",cmap='Dark2')
ax1.set_ylim([-5,75])
ax1.set_xlim([-120,50])
ax1.set_title("Race Locations on First Championship Season (1950)", fontsize=15)

# Right panel: 2019 season.
world.plot(ax=ax2, alpha=0.3)
geo2019.plot(ax=ax2, markersize=40, marker="o", cmap="tab10")
ax2.set_ylim([-50,60])
ax2.set_xlim([-140,160])
ax2.set_title("Race Locations on Last Completed Championship Season (2019)", fontsize=15)

# Stretch the right panel according to the ratio of its x and y ranges.
asp = np.diff(ax2.get_xlim())[0] / np.diff(ax2.get_ylim())[0]
ax2.set_aspect(asp*0.7)
fig.set_size_inches(20,8)


---

In [None]:
# Importance of pole position: keep the results of the drivers that started
# first on the grid and tag each one with the circuit of its race.
perCircuits = {}
# .copy() so that adding a column does not write into a slice of resultsDF.
poleResults = resultsDF[resultsDF["grid"]==1].copy()

# raceId -> circuitId, looked up once instead of scanning racesDF per row.
circuitByRace = racesDF.drop_duplicates("raceId").set_index("raceId")["circuitId"]
circuits = poleResults["raceId"].map(circuitByRace).tolist()

poleResults["circuitId"]=circuits

In [None]:
# Share of races won from pole position, per circuit, shown as a bar chart.

# True where the pole sitter also finished first.
delta = (poleResults["grid"] == poleResults["positionOrder"]).tolist()
# assign() builds a new frame, so no column is written into a possible slice.
poleResults = poleResults.assign(poleWin=delta)

# circuitId -> fraction of pole starts converted into wins; sort=False keeps
# the circuits in order of first appearance.
perCircuits = poleResults.groupby("circuitId", sort=False)["poleWin"].mean().to_dict()

# Same rates keyed by circuitRef.
names = {
    circuitsDF.loc[circuitsDF["circuitId"]==key,"circuitRef"].iloc[0]: rate
    for key, rate in perCircuits.items()
}

# Highest conversion rate first (stable for ties).
sortedNames = dict(sorted(names.items(), key=lambda item: item[1], reverse=True))

# Circuits left out of the chart.
# NOTE(review): presumably venues with too few races to be meaningful - confirm.
excludedCircuits = [
    "pedralbes", "pescara", "ain-diab", "sebring", "zeltweg", "lemans",
    "tremblant", "montjuic", "las_vegas", "dallas", "donington", "okayama",
    "riverside", "avus", "monsanto", "buddh", "mugello", "portimao",
]
for circuitRef in excludedCircuits:
    # Default of None: a circuit absent from this dataset version is skipped
    # instead of raising KeyError.
    sortedNames.pop(circuitRef, None)

plt.figure(figsize=(15,9))
plt.bar(range(len(sortedNames)), list(sortedNames.values()), align="center")
plt.xticks(range(len(sortedNames)), list(sortedNames.keys()),rotation=90)
plt.title("Correlation Pole Position/Win per Circuit", fontsize=(20))


plt.show()





---

In [None]:
# The output of this (disabled) scraping code is saved in ./data/weatherInfo.csv
"""from selenium import webdriver

weather = racesDF.iloc[:,[1,2,3]]
state = []

for link in racesDF.url:
    try:
        df = pd.read_html(link)[0]
        if "Weather" in list(df.iloc[:,0]):
            idx = list(df.iloc[:,0]).index("Weather")
            state.append(df.iloc[idx,1])
        else:
            df = pd.read_html(link)[1]
            if "Weather" in list(df.iloc[:,0]):
                idx = list(df.iloc[:,0]).index("Weather")
                state.append(df.iloc[idx,1])
            else:
                df = pd.read_html(link)[2]
                if "Weather" in list(df.iloc[:,0]):
                    idx = list(df.iloc[:,0]).index("Weather")
                    state.append(df.iloc[idx,1])
                else:
                    df = pd.read_html(link)[3]
                    if "Weather" in list(df.iloc[:,0]):
                        idx = list(df.iloc[:,0]).index("Weather")
                        state.append(df.iloc[idx,1])
                    else:
                        explorer = webdriver.Chrome()
                        explorer.get(link)
                        
                        lang = explorer.find_element_by_link_text("Italiano")
                        lang.click()
                        
                        itaWeather = explorer.find_element_by_xpath('//*[@id="mw-content-text"]/div/table[1]/tbody/tr[9]/td').text
                        state.append(itaWeather)
                    
                    
                    
    
    except:
        state.append("not found")
        
weather["weather"] = state

weatherDict = {'weather_warm': ['soleggiato', 'clear', 'warm', 'hot', 'sunny', 'fine', 'mild', 'sereno'],
               'weather_cold': ['cold', 'fresh', 'chilly', 'cool'],
               'weather_dry': ['dry', 'asciutto'],
               'weather_wet': ['showers', 'wet', 'rain', 'pioggia', 'damp', 'thunderstorms', 'rainy'],
               'weather_cloudy': ['overcast', 'nuvoloso', 'clouds', 'cloudy', 'grey', 'coperto']}

weatherDF = pd.DataFrame(columns = weatherDict.keys())
for col in weatherDF:
    weatherDF[col] = weather["weather"].map(lambda x: 1 if any(i in weatherDict[col] for i in x.lower().split()) else 0)
    
weatherInfo = pd.concat([weather, weatherDF], axis = 1)"""

# Load the cached weather table instead of re-running the disabled scraping code above.
weatherInfo = pd.read_csv("./data/weatherInfo.csv")
        

In [None]:
display(weatherInfo)
# NOTE(review): this writes the frame back to the same CSV it was loaded from - confirm the rewrite is intended.
weatherInfo.to_csv("./data/weatherInfo.csv", index=False)

---

In [None]:
# Races joined with their weather row (same year, round and circuit), without
# the free-text weather description, the start time and the URL.
racesWithWeather = racesDF.merge(weatherInfo, how="inner", on=["year", "round", "circuitId"])
df1 = racesWithWeather.drop(columns=["weather", "time", "url"])
df1.head()

In [None]:
# Every result row extended with the attributes of its driver, minus the
# columns listed below.
driverColumnsToDrop = ["code", "forename", "surname", "url", "number_y", "number_x"]
resultsWithDrivers = resultsDF.merge(driversDF, how="left", on=["driverId"])
resultsDriversDF = resultsWithDrivers.drop(columns=driverColumnsToDrop)
resultsDriversDF.head()

In [None]:
# Race/weather rows joined with the per-driver results of the same race,
# minus the result columns listed below.
resultColumnsToDrop = ["points", "statusId", "position", "positionText", "time", "rank", "fastestLapSpeed"]
racesWithResults = df1.merge(resultsDriversDF, how="inner", on=["year", "round", "circuitId", "raceId"])
df2 = racesWithResults.drop(columns=resultColumnsToDrop)
print(df2.shape)
df2.head()

In [None]:
# Rename the standings position columns (in place) so they do not clash with
# other position columns, then attach the standings row of each driver/race.
standingsRenames = {"position":"driverStandingPosition", "positionText":"driverStandingPositionText"}
driverStandingsDF.rename(columns=standingsRenames, inplace=True)

standingsColumnsToDrop = ["driverStandingsId", "driverStandingPositionText","fastestLapTime","fastestLap"]
withStandings = df2.merge(driverStandingsDF, how="left", on=["raceId","driverId"])
df3 = withStandings.drop(columns=standingsColumnsToDrop)
df3.head()

In [None]:
# Replace the numeric constructorId with the constructor's textual reference.
# A single indexed lookup replaces one full scan of constructorsDF per row;
# an id missing from constructorsDF now yields NaN instead of raising.
constructorRefById = constructorsDF.drop_duplicates("constructorId").set_index("constructorId")["constructorRef"]
constructors = df3["constructorId"].map(constructorRefById).tolist()
df3["constructor"] = constructors
df3 = df3.drop(["constructorId"], axis=1)
df3.head()

In [None]:
# Constructor standings keyed by (year, round, constructor) so they can be
# merged with the per-race table.
# Work on a copy: a plain assignment only aliases constructorStandingsDF, so
# the columns added below would also appear in the raw table.
constructorStandingsFinalDF = constructorStandingsDF.copy()

# Indexed lookups instead of scanning racesDF / constructorsDF once per row.
raceInfo = racesDF.drop_duplicates("raceId").set_index("raceId")
constructorRefById = constructorsDF.drop_duplicates("constructorId").set_index("constructorId")["constructorRef"]

years = constructorStandingsFinalDF["raceId"].map(raceInfo["year"]).tolist()
rounds = constructorStandingsFinalDF["raceId"].map(raceInfo["round"]).tolist()
constructors = constructorStandingsFinalDF["constructorId"].map(constructorRefById).tolist()

constructorStandingsFinalDF["year"] = years
constructorStandingsFinalDF["round"] = rounds
constructorStandingsFinalDF["constructor"] = constructors

constructorStandingsFinalDF = constructorStandingsFinalDF.drop(["constructorStandingsId", "raceId", "constructorId", "positionText"], axis = 1)
constructorStandingsFinalDF = constructorStandingsFinalDF.rename(columns={"points": "constructorPoints", "wins": "constructorWins", "position":"constructorPosition"})

constructorStandingsFinalDF.head()

In [None]:
# Attach the constructor standings of the same season, round and constructor.
df4 = df3.merge(constructorStandingsFinalDF, how="left", on=["year", "round", "constructor"])
print(df4.shape)

# Grid value 0 is re-coded as 99.
# NOTE(review): presumably 0 marks a start outside the regular grid - confirm.
df4["grid"] = df4["grid"].replace({0: 99})

df4.head(20)

In [None]:
# Qualifying table reduced to one time per driver and keyed by (year, round).
# Work on a copy: a plain assignment only aliases qualifyingDF, so the
# qualyTime column would also be added to the raw table (whose shape is
# printed below for comparison).
qualifyingFinalDF = qualifyingDF.copy()

# Best available session time per row: q3, else q2, else q1, else 0.
# Both the "\N" placeholder and NaN count as "no time"; previously a NaN
# in q3 was taken as the time and hid a valid q2/q1 value.
sessionTimes = qualifyingFinalDF[["q3", "q2", "q1"]].replace(r"\N", np.nan)
qualyTime = (
    sessionTimes["q3"]
    .fillna(sessionTimes["q2"])
    .fillna(sessionTimes["q1"])
    .fillna(0)
    .tolist()
)
qualifyingFinalDF["qualyTime"] = qualyTime
qualifyingFinalDF = qualifyingFinalDF.drop(["q1", "q2", "q3", "number", "qualifyId"], axis=1)
qualifyingFinalDF = qualifyingFinalDF.rename(columns={"position":"grid"})

# Season and round of each race, via one indexed lookup instead of a scan per row.
raceInfo = racesDF.drop_duplicates("raceId").set_index("raceId")
rounds = qualifyingFinalDF["raceId"].map(raceInfo["round"]).tolist()
years = qualifyingFinalDF["raceId"].map(raceInfo["year"]).tolist()

qualifyingFinalDF = qualifyingFinalDF.drop(["raceId"], axis=1)
qualifyingFinalDF["year"] = years
qualifyingFinalDF["round"] = rounds
print(qualifyingDF.shape)
print(qualifyingFinalDF.shape)
qualifyingFinalDF.head()

In [None]:
# Inspect the last 200 rows of the merged table.
df4.tail(200)

In [None]:
# Attach the qualifying time of each driver for the same season and round.
# IMPORTANT: the merge produces two grid columns. grid_y holds the raw
# Saturday qualifying positions, while grid_x is the result of qualifying
# plus penalties; only grid_x is kept, renamed back to "grid".
withQualifying = df4.merge(qualifyingFinalDF, how="left", on=["year", "round", "driverId"])
finalDF = (
    withQualifying
    .drop(columns=["constructorId", "grid_y"])
    .rename(columns={"grid_x":"grid"})
)
print(finalDF.shape)
finalDF.head(21)

In [None]:
# First and last season present in the merged table.
minValue = finalDF['year'].min()
maxValue = finalDF['year'].max()
print(minValue)
print(maxValue)

In [None]:
# Number of NaN entries in each column of the merged table.
print("missing values:\n",finalDF.isna().sum())

In [None]:
# Age of each driver, in whole years, on the day of each race; the two date
# columns are removed once the age has been computed.
raceDates = pd.to_datetime(finalDF["date"])
birthDates = pd.to_datetime(finalDF["dob"])
finalDF["driverAge"] = [
    relativedelta(raceDay, birthday).years
    for raceDay, birthday in zip(raceDates, birthDates)
]
finalDF.drop(columns=["date", "dob"], inplace=True)

In [None]:
# Fill or re-code the missing values, then drop the rows that still contain NaN.
print("Antes de la conversión:", finalDF.shape)

# Missing standings figures count as 0 and are stored as integers.
# The column is reassigned: `finalDF[col].fillna(..., inplace=True)` works on
# a temporary under pandas copy-on-write and leaves the NaNs in place, after
# which the integer conversion fails.
for col in ["points", "driverStandingPosition", "wins", "constructorPoints", "constructorPosition", "constructorWins"]:
    finalDF[col] = finalDF[col].fillna(0).astype("int64")

# Rows without a qualifying time get the sentinel lap "4:59.999".
finalDF["qualyTime"] = finalDF["qualyTime"].fillna(value=0)
finalDF.loc[(finalDF.qualyTime == 0), "qualyTime"] = "4:59.999"
# Rows whose race time is the "\N" placeholder get a large sentinel value.
finalDF.loc[(finalDF.milliseconds == r"\N"), "milliseconds"] = 99999999

finalDF.dropna(inplace = True)
print("Despues de la conversion:", finalDF.shape)

In [None]:
# Total number of NaN entries left in the whole table.
print("missing values:\n",finalDF.isna().sum().sum())

In [None]:
# Turn qualyTime into the gap, in seconds, to the first driver on the grid of
# the same race.
def _qualyTimeToSeconds(value):
    """Convert a "M:SS.mmm" (or plain "SS.mmm") time to seconds.

    Returns 0 for the value 0, for the "00.000" placeholder and for anything
    that cannot be parsed; rows with 0 are filtered out right after.
    """
    text = str(value)
    if value == 0 or text == "00.000":
        return 0
    minutes, _, seconds = text.rpartition(":")
    try:
        # No ":" at all means a lap under one minute.
        return float(seconds) + (60 * float(minutes) if minutes else 0)
    except ValueError:
        return 0


finalDF["qualyTime"] = finalDF.qualyTime.map(_qualyTimeToSeconds)
# .copy() so the in-place operations below act on an independent frame.
finalDF = finalDF[finalDF["qualyTime"] != 0].copy()
finalDF.sort_values(["year", "round", "grid"], inplace = True)
# Difference to the previous grid slot, accumulated within each race; the
# first row of each race has no predecessor and ends up with 0.
finalDF["qualyDiff"] = finalDF.groupby(["year", "round"]).qualyTime.diff()
finalDF["qualyTime"] = finalDF.groupby(["year", "round"]).qualyDiff.cumsum().fillna(0)
finalDF.drop("qualyDiff", axis=1, inplace=True)

In [None]:
# Inspect the last 20 rows after the qualifying-gap computation.
finalDF.tail(20)

In [None]:
# Number of rows per grid value.
freq = finalDF["grid"].value_counts()
print(freq)

In [None]:
# Inspect the first 50 rows of the table.
finalDF.head(50)

In [None]:
# Remove all the Turkish GP rows because their wins data is not well defined.
# NOTE(review): this assumes that race occupies exactly the last 20 rows of finalDF - confirm.
# Re-running this cell drops a further 20 rows each time.
finalDF.drop(finalDF.tail(20).index,inplace=True) # drop last n rows
finalDF.tail(20)

In [None]:
# Remove the identifier columns, in place.
finalDF.drop(columns=['circuitId', 'raceId', 'resultId'], inplace=True)
finalDF.tail(20)

In [None]:
# Convert to dummies: each string column becomes a set of 0/1 indicator columns.
# name -> circuit name
dummies = pd.get_dummies(finalDF, columns = ["name", "nationality", "constructor"])

# Keep the seasons from 1989 onwards.
indexNames = dummies[dummies["year"] < 1989].index
dummies.drop(indexNames, inplace=True)

# Drop indicator columns that are set in too few rows. Columns are matched on
# the "<column>_" prefix generated by get_dummies: a plain substring test
# such as `"constructor" in col` also matches constructorPoints,
# constructorWins and constructorPosition.
minRowsByPrefix = {"name_": 60, "nationality_": 60, "constructor_": 80}
rareColumns = [
    col
    for col in dummies.columns
    for prefix, minRows in minRowsByPrefix.items()
    if col.startswith(prefix) and dummies[col].sum() < minRows
]
dummies.drop(columns=rareColumns, inplace=True)

In [None]:
# Drop the rows of seasons before 1989 (the same filter is also applied in the previous cell).
indexNames = dummies[dummies["year"] < 1989].index
dummies.drop(indexNames, inplace=True)

In [None]:
# Store the race time as int64.
# NOTE(review): presumably the column still holds strings read from the CSV next to the numeric sentinel - confirm.
dummies['milliseconds']=dummies.milliseconds.astype('int64')

In [None]:
# Persist the model-ready table, without the index column.
dummies.to_csv("./data/finalDF.csv", index=False)

---

In [None]:
from sklearn.metrics import confusion_matrix, precision_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import svm
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.neural_network import MLPClassifier, MLPRegressor

In [None]:
# Load the model-ready table from the CSV written earlier in the notebook.
f1DF = pd.read_csv("./data/finalDF.csv")

In [None]:
# Regression set-up: every season except `predictingYear` is used for training.
predictingYear = 1993

regressionDF = f1DF.copy()
scaler = StandardScaler()

isTrainingRow = regressionDF.year != int(predictingYear)
trainSet = regressionDF[isTrainingRow]

# Target: finishing position. Features: everything except the target and the driver reference.
yTrain = trainSet.positionOrder
xTrain = trainSet.drop(columns=["driverRef", "positionOrder"])

# Fit the scaler on the training features; the scoring function reuses it.
xTrain = pd.DataFrame(scaler.fit_transform(xTrain), columns = xTrain.columns)

In [None]:
# Accumulates one entry per evaluated model: its name, its parameters and its score.
comparison_dict = {"model":[], "params":[], "score":[]}

In [None]:
#scoring functions
def scoreRegression(model, predictYear, verbose=True):
    """Score a fitted regressor on how often it predicts the race winner.

    For every round of `predictYear` in the global `f1DF`, the features are
    scaled with the global, already fitted `scaler`, the finishing position of
    every driver is predicted, and the driver with the lowest prediction is
    taken as the predicted winner. The score of a round is the precision of
    that single prediction against the real winner.

    Parameters
    ----------
    model : fitted estimator exposing `predict`.
    predictYear : season to evaluate; converted with `int()`.
    verbose : when True (default) the prediction table of every round is displayed.

    Returns
    -------
    Mean per-round precision over the rounds present in `f1DF`.

    Raises
    ------
    ValueError
        If `f1DF` holds no rows for `predictYear`.
    """
    seasonDF = f1DF[f1DF.year == int(predictYear)]
    seasonRounds = seasonDF["round"].unique()
    if len(seasonRounds) == 0:
        raise ValueError("no rows found for season %s" % predictYear)

    score = 0
    for race in seasonRounds:
        #train/test
        testDF = seasonDF[seasonDF["round"] == race]
        driversList = testDF["driverRef"].tolist()
        xTest = testDF.drop(["driverRef", "positionOrder"], axis=1)
        yTest = testDF.positionOrder

        #scaler
        xTest = pd.DataFrame(scaler.transform(xTest), columns = xTest.columns)

        #predictions
        predictionDF = pd.DataFrame(model.predict(xTest), columns = ["results"])
        predictionDF["driver"] = driversList
        predictionDF["positionOrder"] = yTest.reset_index(drop=True)
        # 1 for the real winner, 0 for everybody else.
        predictionDF["real"] = (predictionDF.positionOrder == 1).astype(int)
        # Lowest predicted position first: row 0 is the predicted winner.
        predictionDF.sort_values("results", ascending = True, inplace = True)
        predictionDF.reset_index(inplace = True, drop = True)
        predictionDF["predicted"] = (predictionDF.index == 0).astype(int)

        score += precision_score(predictionDF.real, predictionDF.predicted)

        if verbose:
            display(predictionDF)

    # Average over the rounds actually scored; dividing by the highest round
    # number understates the score when a round is missing from the data.
    totalScore = score / len(seasonRounds)

    return totalScore
        
    
    

In [None]:
#linear regression
# Evaluate LinearRegression with and without an intercept and record both scores.

# Real booleans: the strings "True" and "False" are both truthy, and current
# scikit-learn rejects them outright during parameter validation.
params={"fit_intercept" : [True, False]}

for fit_intercept in params["fit_intercept"]:
    model_params = (fit_intercept)
    model = LinearRegression(fit_intercept = fit_intercept)
    model.fit(xTrain, yTrain)

    modelScore = scoreRegression(model, predictingYear)

    comparison_dict["model"].append("linear_regression")
    comparison_dict["params"].append(model_params)
    comparison_dict["score"].append(modelScore)

comparison_dict

In [None]:
#random forest regressor
#n_estimators, max_features, max_depth
# Grid search over the three parameters; every combination is scored and recorded.
params={"max_features": ["auto", 0.5, 0.9],
        "n_estimators": [100, 500, 1000],
        "max_depth": [5, 10, 20]}

for feature in params["max_features"]:
    # For regressors "auto" meant "use every feature"; the option no longer
    # exists in current scikit-learn, and None is the equivalent spelling
    # accepted by every version. The recorded parameter keeps the label.
    maxFeatures = None if feature == "auto" else feature
    for estimator in params["n_estimators"]:
        for depth in params["max_depth"]:
            model_params = (feature, estimator, depth)
            # The criterion is left at its default (mean squared error); the
            # explicit "mse" spelling was removed from scikit-learn.
            model = RandomForestRegressor(max_features = maxFeatures, n_estimators=estimator, max_depth=depth, random_state=42)
            model.fit(xTrain, yTrain)

            # The scoring function is named scoreRegression; score_regression
            # does not exist and raised NameError.
            modelScore = scoreRegression(model, predictingYear)

            comparison_dict["model"].append("random_forest_regressor")
            comparison_dict["params"].append(model_params)
            comparison_dict["score"].append(modelScore)

comparison_dict
