# Pitstops and their Impact on Race Outcome
We will be exploring pitstop data from F1 seasons 2018-2023 and looking at how they determine the outcome of the races

## STEP 1 - Loading the Data & Libraries

In [4]:
# Let us start by importing the necessary libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import sklearn.model_selection as model_selection

# Load the datasets
races = pd.read_csv('data/races.csv')
results = pd.read_csv('data/results.csv')
pit_stops = pd.read_csv('data/pit_stops.csv')
drivers = pd.read_csv('data/drivers.csv')

# Display the first few rows of the dataset
races.head(), results.head(), pit_stops.head(), drivers.head(), 

(   raceId  year  round  circuitId                   name        date  \
 0       1  2009      1          1  Australian Grand Prix  2009-03-29   
 1       2  2009      2          2   Malaysian Grand Prix  2009-04-05   
 2       3  2009      3         17     Chinese Grand Prix  2009-04-19   
 3       4  2009      4          3     Bahrain Grand Prix  2009-04-26   
 4       5  2009      5          4     Spanish Grand Prix  2009-05-10   
 
        time                                                url fp1_date  \
 0  06:00:00  http://en.wikipedia.org/wiki/2009_Australian_G...       \N   
 1  09:00:00  http://en.wikipedia.org/wiki/2009_Malaysian_Gr...       \N   
 2  07:00:00  http://en.wikipedia.org/wiki/2009_Chinese_Gran...       \N   
 3  12:00:00  http://en.wikipedia.org/wiki/2009_Bahrain_Gran...       \N   
 4  12:00:00  http://en.wikipedia.org/wiki/2009_Spanish_Gran...       \N   
 
   fp1_time fp2_date fp2_time fp3_date fp3_time quali_date quali_time  \
 0       \N       \N       \N

## STEP 2 - Filtering the Data
We are only interested in data from seasons 2018-2023 so let us try to filter the dataset

In [8]:
# Filtering the dataset for years 2018 to 2023
races_2018_2023 = races[(races['year'] >= 2018) & (races['year'] <= 2023)]

# Getting the raceId for the years 2018 to 2023
raceIds_2018_2023 = races_2018_2023['raceId'].unique()

# Filtering pit_stops & results dataset with the raceIds from 2018 to 2023
pit_stops_2018_2023 = pit_stops[pit_stops['raceId'].isin(raceIds_2018_2023)]
results_2018_2023 = results[results['raceId'].isin(raceIds_2018_2023)]

# Display the shape of the filtered datasets
races_2018_2023.shape, pit_stops_2018_2023.shape, results_2018_2023.shape

((125, 18), (3838, 7), (2300, 18))

## STEP 3 - Cleaning the Datasets
Now that we have the filtered data we can carry on with cleaning the data by handling missing values and outliers

In [9]:
# Check for missing values in the filtered datasets
missing_values_races = races_2018_2023.isnull().sum()
missing_values_results = results_2018_2023.isnull().sum()
missing_values_pit_stops = pit_stops_2018_2023.isnull().sum()

missing_values_races, missing_values_results, missing_values_pit_stops

(raceId         0
 year           0
 round          0
 circuitId      0
 name           0
 date           0
 time           0
 url            0
 fp1_date       0
 fp1_time       0
 fp2_date       0
 fp2_time       0
 fp3_date       0
 fp3_time       0
 quali_date     0
 quali_time     0
 sprint_date    0
 sprint_time    0
 dtype: int64,
 resultId           0
 raceId             0
 driverId           0
 constructorId      0
 number             0
 grid               0
 position           0
 positionText       0
 positionOrder      0
 points             0
 laps               0
 time               0
 milliseconds       0
 fastestLap         0
 rank               0
 fastestLapTime     0
 fastestLapSpeed    0
 statusId           0
 dtype: int64,
 raceId          0
 driverId        0
 stop            0
 lap             0
 time            0
 duration        0
 milliseconds    0
 dtype: int64)

There are no missing values. Let us look for outliers by checking the descriptive statistics in each dataset

In [10]:
# Descriptive statistics of the filtered datasets
desc_stats_races = races_2018_2023.describe()
desc_stats_results = results_2018_2023.describe()
desc_stats_pit_stops = pit_stops_2018_2023.describe()

desc_stats_races, desc_stats_results, desc_stats_pit_stops

(            raceId         year       round   circuitId
 count   125.000000   125.000000  125.000000  125.000000
 mean   1053.512000  2020.552000   10.992000   29.960000
 std      38.712884     1.738817    6.149346   27.278256
 min     989.000000  2018.000000    1.000000    1.000000
 25%    1020.000000  2019.000000    6.000000    9.000000
 50%    1054.000000  2021.000000   11.000000   18.000000
 75%    1086.000000  2022.000000   16.000000   69.000000
 max    1120.000000  2023.000000   22.000000   80.000000,
            resultId       raceId     driverId  constructorId         grid  \
 count   2300.000000  2300.000000  2300.000000    2300.000000  2300.000000   
 mean   24934.737391  1048.121739   677.193478      77.624348    10.119130   
 std      665.141029    35.392326   322.250443      87.089409     5.825181   
 min    23782.000000   989.000000     1.000000       1.000000     0.000000   
 25%    24360.750000  1017.000000   815.000000       4.000000     5.000000   
 50%    24935.5000