In [122]:
# pip install pandas
# pip install matplotlib
# pip install sqlite3
# pip install kaggle
# move kaggle.json file into ~/.kaggle (mac) or C:\Users<Windows-username>.kaggle\kaggle.json (windows)

import kaggle
import pandas as pd
import numpy as np
import matplotlib as plt
import sqlite3 as db

dataset = "rohanrao/formula-1-world-championship-1950-2020"

kaggle.api.dataset_download_files(dataset, path='./', unzip=True)

print(f"Dataset {dataset} downloaded successfully!")

Dataset URL: https://www.kaggle.com/datasets/rohanrao/formula-1-world-championship-1950-2020
Dataset rohanrao/formula-1-world-championship-1950-2020 downloaded successfully!


In [123]:
results_df = pd.read_csv('results.csv')
qualifying_df = pd.read_csv('qualifying.csv')
lap_times_df = pd.read_csv('lap_times.csv')
pit_stops_df = pd.read_csv('pit_stops.csv')
driver_standings_df = pd.read_csv('driver_standings.csv')
races_df = pd.read_csv('races.csv')
constructors_df = pd.read_csv('constructors.csv')
status_df = pd.read_csv('status.csv')

In [124]:

results_df.drop(['number', 'positionText', 'time', 'rank', 'fastestLap', 'fastestLapTime', 'fastestLapSpeed', 'laps', 'points', 'positionOrder', 'grid'], axis=1, inplace=True)

results_df.rename(columns={'position': 'final_position'}, inplace=True)

results_df['final_position'] = results_df['final_position'].replace('\\N', np.nan)
results_df.final_position = pd.to_numeric(results_df.final_position, errors='coerce')
results_df.milliseconds = pd.to_numeric(results_df.milliseconds, errors='coerce')
print(results_df.dtypes)
results_df.head()

resultId            int64
raceId              int64
driverId            int64
constructorId       int64
final_position    float64
milliseconds      float64
statusId            int64
dtype: object


Unnamed: 0,resultId,raceId,driverId,constructorId,final_position,milliseconds,statusId
0,1,18,1,1,1.0,5690616.0,1
1,2,18,2,2,2.0,5696094.0,1
2,3,18,3,3,3.0,5698779.0,1
3,4,18,4,4,4.0,5707797.0,1
4,5,18,5,1,5.0,5708630.0,1


In [125]:
qualifying_df.drop(['number', 'q1', 'q2', 'q3', 'qualifyId'], axis=1, inplace=True)
qualifying_df.rename(columns={'position': 'qualifying_position'}, inplace=True)

print(qualifying_df.dtypes)
qualifying_df.head()

raceId                 int64
driverId               int64
constructorId          int64
qualifying_position    int64
dtype: object


Unnamed: 0,raceId,driverId,constructorId,qualifying_position
0,18,1,1,1
1,18,9,2,2
2,18,5,1,3
3,18,13,6,4
4,18,2,2,5


In [126]:
lap_times_df.drop(['time', 'lap', 'position'], axis=1, inplace=True)
lap_times_df.rename(columns={'milliseconds': 'lap_time_milliseconds'}, inplace=True)

print(lap_times_df.dtypes)
lap_times_df.head()

raceId                   int64
driverId                 int64
lap_time_milliseconds    int64
dtype: object


Unnamed: 0,raceId,driverId,lap_time_milliseconds
0,841,20,98109
1,841,20,93006
2,841,20,92713
3,841,20,92803
4,841,20,92342


In [127]:
pit_stops_df.drop(['milliseconds', 'time', 'lap'], axis=1, inplace=True)
pit_stops_df.rename(columns={'duration': 'stop_duration', 'stop':'pit_stop'}, inplace=True)

pit_stops_df.stop_duration = pd.to_numeric(pit_stops_df.stop_duration, errors='coerce')
print(pit_stops_df.dtypes)
pit_stops_df.head()

raceId             int64
driverId           int64
pit_stop           int64
stop_duration    float64
dtype: object


Unnamed: 0,raceId,driverId,pit_stop,stop_duration
0,841,153,1,26.898
1,841,30,1,25.021
2,841,17,1,23.426
3,841,4,1,23.251
4,841,13,1,23.842


In [128]:
driver_standings_df.drop(['position','positionText', 'wins', 'driverStandingsId'], axis=1, inplace=True)
driver_standings_df.rename(columns={'points': 'driver_points'}, inplace=True)

print(driver_standings_df.dtypes)
driver_standings_df.head()

raceId             int64
driverId           int64
driver_points    float64
dtype: object


Unnamed: 0,raceId,driverId,driver_points
0,18,1,10.0
1,18,2,8.0
2,18,3,6.0
3,18,4,5.0
4,18,5,4.0


In [129]:
races_df.drop(['date','time', 'round','url', 'fp1_date', 'fp1_time', 'fp2_date', 'fp2_time', 'fp3_date', 'fp3_time', 'quali_date', 'quali_time', 'sprint_date', 'sprint_time'], axis=1, inplace=True)
races_df.rename(columns={'name': 'circuit_name'}, inplace=True)

print(races_df.dtypes)
filtered_races_df = races_df[races_df['year'] >= 2020]
filtered_races_df.head()

raceId           int64
year             int64
circuitId        int64
circuit_name    object
dtype: object


Unnamed: 0,raceId,year,circuitId,circuit_name
1018,1031,2020,70,Austrian Grand Prix
1019,1032,2020,70,Styrian Grand Prix
1020,1033,2020,11,Hungarian Grand Prix
1021,1034,2020,9,British Grand Prix
1022,1035,2020,9,70th Anniversary Grand Prix


In [130]:
constructors_df.drop(['url', 'nationality', 'constructorRef'], axis=1, inplace=True)
constructors_df.rename(columns={'name': 'constructor_name'}, inplace=True)

print(constructors_df.dtypes)
constructors_df.head()

constructorId        int64
constructor_name    object
dtype: object


Unnamed: 0,constructorId,constructor_name
0,1,McLaren
1,2,BMW Sauber
2,3,Williams
3,4,Renault
4,5,Toro Rosso


In [131]:
print(status_df.dtypes)
filtered_status_df = status_df[status_df['status'] == 'Finished']
filtered_status_df.head()

statusId     int64
status      object
dtype: object


Unnamed: 0,statusId,status
0,1,Finished


In [132]:
# merge datasets together

merge1_df = pd.merge(filtered_races_df, results_df, on='raceId')
merge2_df = pd.merge(merge1_df, constructors_df, on='constructorId')
merge3_df = pd.merge(merge2_df, driver_standings_df, on=['driverId', 'raceId'])
merge4_df = pd.merge(merge3_df, lap_times_df, on=['raceId','driverId'])
merge5_df = pd.merge(merge4_df, pit_stops_df, on=['raceId','driverId'])
merge6_df = pd.merge(merge5_df, qualifying_df, on='constructorId')
combined_df = pd.merge(merge6_df, filtered_status_df, on=['statusId'])



#filtered_df = combined_df[combined_df['status'] == 'Finished']
#ombined_df.head()

#combined_df.to_csv('filename.csv', index=False)

names = combined_df['year'].unique()
print(names)

combined_df.head()

[2020 2021 2022 2023 2024]


Unnamed: 0,raceId_x,year,circuitId,circuit_name,resultId,driverId_x,constructorId,final_position,milliseconds,statusId,constructor_name,driver_points,lap_time_milliseconds,pit_stop,stop_duration,raceId_y,driverId_y,qualifying_position,status
0,1031,2020,70,Austrian Grand Prix,24626,822,131,1.0,5455739.0,1,Mercedes,25.0,71454,1,21.937,337,3,5,Finished
1,1031,2020,70,Austrian Grand Prix,24626,822,131,1.0,5455739.0,1,Mercedes,25.0,71454,1,21.937,337,30,7,Finished
2,1031,2020,70,Austrian Grand Prix,24626,822,131,1.0,5455739.0,1,Mercedes,25.0,71454,1,21.937,338,3,6,Finished
3,1031,2020,70,Austrian Grand Prix,24626,822,131,1.0,5455739.0,1,Mercedes,25.0,71454,1,21.937,338,30,7,Finished
4,1031,2020,70,Austrian Grand Prix,24626,822,131,1.0,5455739.0,1,Mercedes,25.0,71454,1,21.937,339,3,2,Finished
