# Project question 2 - Which driver beat their teammates the hardest over their career in Formula One?

## 1. Importing data

In [3]:
import pandas as pd
import numpy as np

results = pd.read_csv('results.csv')
races = pd.read_csv('races.csv')
drivers = pd.read_csv('drivers.csv')
qualifying = pd.read_csv('qualifying.csv')
qualifying

Unnamed: 0,qualifyId,raceId,driverId,constructorId,number,position,q1,q2,q3
0,1,18,1,1,22,1,1:26.572,1:25.187,1:26.714
1,2,18,9,2,4,2,1:26.103,1:25.315,1:26.869
2,3,18,5,1,23,3,1:25.664,1:25.452,1:27.079
3,4,18,13,6,2,4,1:25.994,1:25.691,1:27.178
4,5,18,2,2,3,5,1:25.960,1:25.518,1:27.236
...,...,...,...,...,...,...,...,...,...
9810,9868,1110,848,3,23,16,2:00.314,\N,\N
9811,9869,1110,855,51,24,17,2:00.832,\N,\N
9812,9870,1110,858,3,2,18,2:01.535,\N,\N
9813,9871,1110,817,213,3,19,2:02.159,\N,\N


## 2. Preparing data

Preparing training data as well as adding useful features such as a ratio of points to teammate.

In [140]:
def get_results(year1, year2=None):
    if year2 is None:
        races_data = races.loc[races['year'] == year1]
    else:
        races_data = races.loc[(races['year'] >= year1) & (races['year'] <= year2)]

    races_year_circuit_data = pd.concat([races_data['raceId'], races_data['circuitId'], races_data['time']], axis=1)
    results_data = results.loc[(results['raceId'].isin(races_data['raceId']))]
    results_data = pd.merge(results_data, races_year_circuit_data, on='raceId')
    results_data = results_data.drop(columns=['time_x', 'positionText', 'resultId', 'fastestLap'])
    results_data = results_data.replace('\\N', 0)
    results_data['time_y'] = results_data['time_y'].apply(find_time)
    results_data['fastestLapTime'] = results_data['fastestLapTime'].apply(find_milliseconds)
    #Convert everything to numeric
    results_data = results_data.apply(pd.to_numeric, errors='coerce')
    return results_data


def get_driver_results(driverId, year1 = 1950, year2 = 2023):
    results_data = get_results(year1, year2)
    driver_results = results_data.loc[results_data['driverId'] == driverId].copy()

    teammate_results = results_data.loc[results_data['constructorId'] == driver_results['constructorId'].iloc[0]].copy()
    
    teammate_results = teammate_results.loc[teammate_results['driverId'] != driverId]
    teammate_results = teammate_results.drop(columns=['driverId', 'constructorId', 'time_y', 'circuitId'])

    for column in teammate_results.columns:
        if column != 'driverId' and column != 'constructorId' and column != 'raceId':
            teammate_results.rename(columns={column: 'teammate_' + column}, inplace=True)

    driver_results = pd.merge(driver_results, teammate_results, on='raceId', how='left')
    # Add a column of points difference between driver and teammate
    driver_results['points_difference'] = driver_results['points'] - driver_results['teammate_points']
    return driver_results

def find_time(time):
    if (time == 0):
        return 0
    hours, minutes, seconds = str(time).split(':')
    return (int(hours) * 100) + int(minutes)

get_driver_results(1)

Unnamed: 0,raceId,driverId,constructorId,number,grid,position,positionOrder,points,laps,milliseconds,...,teammate_position,teammate_positionOrder,teammate_points,teammate_laps,teammate_milliseconds,teammate_rank,teammate_fastestLapTime,teammate_fastestLapSpeed,teammate_statusId,points_difference
0,18,1,1,22,1,1,1,10.0,58,5690616,...,5,5,4.0,58,5708630,1,87418,218.385,1,6.0
1,19,1,1,22,9,5,5,4.0,56,5525103,...,3,3,6.0,56,5517005,7,95922,208.031,1,-2.0
2,20,1,1,22,3,13,13,0.0,56,0,...,5,5,4.0,57,5493759,1,93193,209.062,1,-4.0
3,21,1,1,22,5,3,3,6.0,66,5903238,...,0,17,0.0,21,0,6,82453,203.243,3,6.0
4,22,1,1,22,3,2,2,8.0,58,5213230,...,12,12,0.0,57,0,8,87640,219.269,11,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
529,1108,1,131,44,7,3,3,15.0,52,5123721,...,4,4,12.0,52,5124714,4,90850,233.435,1,3.0
530,1109,1,131,44,1,4,4,12.0,70,5927768,...,2,2,18.0,70,5922365,4,82178,191.919,1,-6.0
531,1109,1,131,44,1,4,4,12.0,70,5927768,...,5,5,10.0,70,5951206,8,82736,190.625,1,2.0
532,1110,1,131,44,3,4,4,13.0,44,5000121,...,7,7,6.0,44,5024169,18,111682,225.769,1,7.0


## 3. Making the model

In [145]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

data = get_driver_results(1)
X = data.drop(columns=['points_difference'])
y = data['points_difference']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the random forest regressor model
model = RandomForestRegressor()

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = model.score(X_test, y_test)
print("Mean Squared Error:", mse, "R2 Score:", r2)


Mean Squared Error: 10.680648598130842 R2 Score: 0.9337113822178325




ValueError: Expected 2D array, got scalar array instead:
array=1.0.
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.