### Formula 1 data collection, merging, cleaning

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import matplotlib.pyplot as plt
import numpy as np

circuits = pd.read_csv('circuits.csv')
constructor_results = pd.read_csv('constructor_results.csv')
constructor_standings = pd.read_csv('constructor_standings.csv')
constructors = pd.read_csv('constructors.csv')
driver_standings = pd.read_csv('driver_standings.csv')
drivers = pd.read_csv('drivers.csv')
lap_times = pd.read_csv('lap_times.csv')
pit_stops = pd.read_csv('pit_stops.csv')
qualifying = pd.read_csv('qualifying.csv')
races = pd.read_csv('races.csv')
results = pd.read_csv('results.csv')
seasons = pd.read_csv('seasons.csv')
sprint_results = pd.read_csv('sprint_results.csv')
status = pd.read_csv('status.csv')
weather_new = pd.read_csv('weather_f1.csv')

In [2]:
results.head(5)

Unnamed: 0,resultId,raceId,driverId,constructorId,number,grid,position,positionText,positionOrder,points,laps,time,milliseconds,fastestLap,rank,fastestLapTime,fastestLapSpeed,statusId
0,1,18,1,1,22,1,1,1,1,10.0,58,1:34:50.616,5690616,39,2,1:27.452,218.3,1
1,2,18,2,2,3,5,2,2,2,8.0,58,+5.478,5696094,41,3,1:27.739,217.586,1
2,3,18,3,3,7,7,3,3,3,6.0,58,+8.163,5698779,41,5,1:28.090,216.719,1
3,4,18,4,4,5,11,4,4,4,5.0,58,+17.181,5707797,58,7,1:28.603,215.464,1
4,5,18,5,1,23,3,5,5,5,4.0,58,+18.014,5708630,43,1,1:27.418,218.385,1


In [4]:
#merging tables
merged_data = pd.merge(results, races, on='raceId')
merged_data = pd.merge(merged_data, drivers, on='driverId')
merged_data = pd.merge(merged_data, constructors, on='constructorId')
merged_data = pd.merge(merged_data, weather_new, on='raceId', how='left')
merged_data.head(5)

Unnamed: 0.1,resultId,raceId,driverId,constructorId,number_x,grid,position,positionText,positionOrder,points,...,Unnamed: 0,year_y,round_y,weather,weather_keyword_not_found,weather_keyword_weather_cloudy,weather_keyword_weather_cold,weather_keyword_weather_dry,weather_keyword_weather_warm,weather_keyword_weather_wet
0,1,18,1,1,22,1,1,1,1,10.0,...,17,2008,1,"Clear, dry, air temperature of 39 °C (102 °F).",0,0,0,0,1,0
1,27,19,1,1,22,9,5,5,5,4.0,...,18,2008,2,"Sunny, partly cloudy",0,0,0,0,1,0
2,57,20,1,1,22,3,13,13,13,0.0,...,19,2008,3,Dry,0,0,0,1,0,0
3,69,21,1,1,22,5,3,3,3,6.0,...,20,2008,4,Sunny,0,0,0,0,1,0
4,90,22,1,1,22,3,2,2,2,8.0,...,21,2008,5,Sunny with temperatures reaching up to 17 °C (...,0,0,0,0,1,0


In [6]:
merged_data.to_csv("merged_10-4-2023.csv")

In [5]:
# Convert 'grid' column to dummy variables
grid_dummies = pd.get_dummies(merged_data['grid'], prefix='grid', drop_first=True)

# Concatenate dummy columns to the original merged_data DataFrame
merged_data = pd.concat([merged_data, grid_dummies], axis=1)

# Update the list of features to include the grid dummy columns
weather_columns = [
    'weather_keyword_weather_cloudy',
    'weather_keyword_weather_cold',
    'weather_keyword_weather_dry',
    'weather_keyword_weather_warm',
    'weather_keyword_weather_wet'
]
grid_dummy_columns = [col for col in merged_data.columns if 'grid_' in col]
features_columns = ['raceId', 'driverId', 'constructorId', 'laps', 'circuitId', 'statusId'] + weather_columns + grid_dummy_columns

# Extract features and target
features = merged_data[features_columns]
merged_data['podium'] = merged_data['positionOrder'].apply(lambda x: 1 if x in [1, 2, 3] else 0)

# Train-test split, model creation, and evaluation
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

X_train, X_test, y_train, y_test = train_test_split(features, merged_data['podium'], test_size=0.2)

logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.9285990712074303
              precision    recall  f1-score   support

           0       0.95      0.97      0.96      4553
           1       0.73      0.64      0.68       615

    accuracy                           0.93      5168
   macro avg       0.84      0.80      0.82      5168
weighted avg       0.93      0.93      0.93      5168



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [6]:
print("circuits columns:", circuits.columns)
print("constructor_results columns:", constructor_results.columns)
print("constructor_standings columns:", constructor_standings.columns)
print("constructors columns:", constructors.columns)
print("driver_standings columns:", driver_standings.columns)
print("drivers columns:", drivers.columns)
print("lap_times columns:", lap_times.columns)
print("pit_stops columns:", pit_stops.columns)
print("qualifying columns:", qualifying.columns)
print("races columns:", races.columns)
print("results columns:", results.columns)
print("seasons columns:", seasons.columns)
print("sprint_results columns:", sprint_results.columns)
print("status columns:", status.columns)
print("weather_new columns:", weather_new.columns)


circuits columns: Index(['circuitId', 'circuitRef', 'name', 'location', 'country', 'lat', 'lng',
       'alt', 'url'],
      dtype='object')
constructor_results columns: Index(['constructorResultsId', 'raceId', 'constructorId', 'points', 'status'], dtype='object')
constructor_standings columns: Index(['constructorStandingsId', 'raceId', 'constructorId', 'points',
       'position', 'positionText', 'wins'],
      dtype='object')
constructors columns: Index(['constructorId', 'constructorRef', 'name', 'nationality', 'url'], dtype='object')
driver_standings columns: Index(['driverStandingsId', 'raceId', 'driverId', 'points', 'position',
       'positionText', 'wins'],
      dtype='object')
drivers columns: Index(['driverId', 'driverRef', 'number', 'code', 'forename', 'surname', 'dob',
       'nationality', 'url'],
      dtype='object')
lap_times columns: Index(['raceId', 'driverId', 'lap', 'position', 'time', 'milliseconds'], dtype='object')
pit_stops columns: Index(['raceId', 'driverId', 

In [18]:
merged4.columns

Index(['raceId', 'year', 'round', 'circuitId', 'name_x', 'date', 'time',
       'url_x', 'fp1_date', 'fp1_time', 'fp2_date', 'fp2_time', 'fp3_date',
       'fp3_time', 'quali_date', 'quali_time', 'sprint_date', 'sprint_time',
       'circuitRef', 'name_y', 'location', 'country', 'lat', 'lng', 'alt',
       'url_y', 'constructorResultsId', 'constructorId', 'points_x', 'status',
       'constructorStandingsId', 'points_y', 'constructor_position',
       'positionText', 'wins', 'constructorRef', 'name', 'nationality', 'url'],
      dtype='object')

In [20]:
results.head(5)

Unnamed: 0,resultId,raceId,driverId,constructorId,result_number,result_grid,result_position,positionText,positionOrder,result_points,laps,result_time,milliseconds,fastestLap,rank,fastestLapTime,fastestLapSpeed,statusId
0,1,18,1,1,22,1,1,1,1,10.0,58,1:34:50.616,5690616,39,2,1:27.452,218.3,1
1,2,18,2,2,3,5,2,2,2,8.0,58,+5.478,5696094,41,3,1:27.739,217.586,1
2,3,18,3,3,7,7,3,3,3,6.0,58,+8.163,5698779,41,5,1:28.090,216.719,1
3,4,18,4,4,5,11,4,4,4,5.0,58,+17.181,5707797,58,7,1:28.603,215.464,1
4,5,18,5,1,23,3,5,5,5,4.0,58,+18.014,5708630,43,1,1:27.418,218.385,1


In [25]:
# Prefixing circuits columns
prefix = 'circuits_'
circuits = circuits.rename(columns=lambda x: f"{prefix}{x}" if x not in ['circuitId'] else x)

# Prefixing weather_new columns
prefix = 'weather_'
weather_new = weather_new.rename(columns=lambda x: f"{prefix}{x}" if x not in ['raceId'] else x)

# Prefixing results columns
prefix = 'results_'
results = results.rename(columns=lambda x: f"{prefix}{x}" if x not in ['raceId', 'driverId', 'constructorId'] else x)

# Prefixing drivers columns
prefix = 'drivers_'
drivers = drivers.rename(columns=lambda x: f"{prefix}{x}" if x not in ['driverId'] else x)

# Prefixing constructors columns
prefix = 'constructors_'
constructors = constructors.rename(columns=lambda x: f"{prefix}{x}" if x not in ['constructorId'] else x)

# Prefixing constructor_standings columns
prefix = 'constr_standings_'
constructor_standings = constructor_standings.rename(columns=lambda x: f"{prefix}{x}" if x not in ['raceId', 'constructorId'] else x)

# Prefixing driver_standings columns
prefix = 'driver_standings_'
driver_standings = driver_standings.rename(columns=lambda x: f"{prefix}{x}" if x not in ['raceId', 'driverId'] else x)

# Prefixing qualifying columns
prefix = 'qualifying_'
qualifying = qualifying.rename(columns=lambda x: f"{prefix}{x}" if x not in ['raceId', 'driverId', 'constructorId'] else x)

# Merging with the renamed columns
merged = races
merged = pd.merge(merged, circuits, on='circuitId', how='left')
merged = pd.merge(merged, weather_new, on='raceId', how='left')
merged = pd.merge(merged, results, on='raceId', how='left')
merged = pd.merge(merged, drivers, on='driverId', how='left')
merged = pd.merge(merged, constructors, on='constructorId', how='left')
merged = pd.merge(merged, constructor_standings, on=['raceId', 'constructorId'], how='left')
merged = pd.merge(merged, driver_standings, on=['raceId', 'driverId'], how='left')
merged = pd.merge(merged, qualifying, on=['raceId', 'driverId', 'constructorId'], how='left')


In [26]:
merged.head(5)

Unnamed: 0,raceId,year,round,circuitId,name,date,time,url,fp1_date,fp1_time,...,driver_standings_points,driver_standings_position,driver_standings_positionText,driver_standings_wins,qualifying_qualifyId,qualifying_qualifying_number,qualifying_qualifying_position,qualifying_q1,qualifying_q2,qualifying_q3
0,1,2009,1,1,Australian Grand Prix,2009-03-29,06:00:00,http://en.wikipedia.org/wiki/2009_Australian_G...,\N,\N,...,10.0,1.0,1,1.0,2986.0,22.0,1.0,1:25.211,1:24.855,1:26.202
1,1,2009,1,1,Australian Grand Prix,2009-03-29,06:00:00,http://en.wikipedia.org/wiki/2009_Australian_G...,\N,\N,...,8.0,2.0,2,0.0,2987.0,23.0,2.0,1:25.006,1:24.783,1:26.505
2,1,2009,1,1,Australian Grand Prix,2009-03-29,06:00:00,http://en.wikipedia.org/wiki/2009_Australian_G...,\N,\N,...,6.0,3.0,3,0.0,2993.0,9.0,8.0,1:26.194,1:25.265,1:27.127
3,1,2009,1,1,Australian Grand Prix,2009-03-29,06:00:00,http://en.wikipedia.org/wiki/2009_Australian_G...,\N,\N,...,5.0,4.0,4,0.0,2991.0,10.0,6.0,1:25.499,1:25.281,1:26.975
4,1,2009,1,1,Australian Grand Prix,2009-03-29,06:00:00,http://en.wikipedia.org/wiki/2009_Australian_G...,\N,\N,...,4.0,5.0,5,0.0,2997.0,7.0,12.0,1:26.026,1:25.605,\N
