In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [29]:
def _load_table(csv_name):
    """Read one CSV file, given by relative path, into a DataFrame."""
    return pd.read_csv(csv_name)


# One DataFrame per source CSV; paths are relative to the working directory.
circuits = _load_table("circuits.csv")
con_res = _load_table("constructor_results.csv")
constructors = _load_table("constructors.csv")
drivers = _load_table("drivers.csv")
lap_times = _load_table("lap_times.csv")
pit_stops = _load_table("pit_stops.csv")
qualifying = _load_table("qualifying.csv")
races = _load_table("races.csv")
# Note the variable name differs from the file name: status.csv -> incidents.
incidents = _load_table("status.csv")
results = _load_table("results.csv")

In [30]:
# Drop, in place, every row that contains at least one NaN in the tables used
# for modelling.
# NOTE(review): the preview printed further down still shows "\N" placeholders
# (e.g. fp1_date), so those entries appear to be plain strings that dropna does
# not treat as missing - confirm whether they should be parsed as NaN on load.
for _frame in (races, drivers, constructors, qualifying, lap_times, pit_stops, results):
    _frame.dropna(inplace=True)


In [31]:
# Parse the race date column into pandas datetimes.
races['date'] = pd.to_datetime(races['date'])

# Force the columns below to a numeric dtype; anything unparseable becomes NaN.
# NOTE(review): if q1 stores lap times as text such as "m:ss.fff", every entry
# would be coerced to NaN here - confirm the column's format.
for _frame, _column in ((qualifying, 'q1'), (lap_times, 'milliseconds')):
    _frame[_column] = pd.to_numeric(_frame[_column], errors='coerce')



In [32]:
# Re-parse the race date with an explicit ISO format. The previous cell already
# converts this column with pd.to_datetime, so this pass only pins the format.
_RACE_DATE_FORMAT = '%Y-%m-%d'
races['date'] = pd.to_datetime(races['date'], format=_RACE_DATE_FORMAT)



In [33]:
# Build the modelling table step by step: races -> results -> drivers ->
# constructors (inner joins), then attach qualifying rows with a left join so
# results without a qualifying record are kept.
_race_driver_key = ['raceId', 'driverId']

race_results = races.merge(results, on='raceId', how='inner')
race_results_drivers = race_results.merge(drivers, on='driverId', how='inner')
race_results_drivers_constructors = race_results_drivers.merge(
    constructors, on='constructorId', how='inner'
)
full_data = race_results_drivers_constructors.merge(
    qualifying, on=_race_driver_key, how='left'
)



In [34]:
# Give the per-lap and per-pit-stop `time` / `position` columns table-specific
# names before joining them onto full_data.
lap_times.rename(columns={
    'time': 'lap_time',
    'position': 'lap_position'
}, inplace=True)

pit_stops.rename(columns={
    'time': 'pit_stop_time',
    'position': 'pit_stop_position'
}, inplace=True)

# Left-join lap and pit-stop rows onto the race results.
# NOTE(review): pandas still appends _x/_y suffixes to any other column names
# the tables share (the preview printed by this cell shows `lap_x`). Joining
# per-lap and per-stop rows on (raceId, driverId) presumably repeats each result
# row many times - confirm the resulting row count is what the model should see.
full_data = pd.merge(full_data, lap_times, on=['raceId', 'driverId'], how='left')
full_data = pd.merge(full_data, pit_stops, on=['raceId', 'driverId'], how='left')

# Inspect the final DataFrame
print(full_data.head())
# info() writes its report to stdout itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
full_data.info()

   raceId  year  round  circuitId                 name_x       date    time_x  \
0       1  2009      1          1  Australian Grand Prix 2009-03-29  06:00:00   
1       1  2009      1          1  Australian Grand Prix 2009-03-29  06:00:00   
2       1  2009      1          1  Australian Grand Prix 2009-03-29  06:00:00   
3       1  2009      1          1  Australian Grand Prix 2009-03-29  06:00:00   
4       1  2009      1          1  Australian Grand Prix 2009-03-29  06:00:00   

                                               url_x fp1_date fp1_time  ...  \
0  http://en.wikipedia.org/wiki/2009_Australian_G...       \N       \N  ...   
1  http://en.wikipedia.org/wiki/2009_Australian_G...       \N       \N  ...   
2  http://en.wikipedia.org/wiki/2009_Australian_G...       \N       \N  ...   
3  http://en.wikipedia.org/wiki/2009_Australian_G...       \N       \N  ...   
4  http://en.wikipedia.org/wiki/2009_Australian_G...       \N       \N  ...   

         q3 lap_x lap_position  lap_ti

In [35]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [36]:
def _count_wins(finishing_orders):
    """Number of entries in the group with finishing order 1."""
    return (finishing_orders == 1).sum()


def _count_podiums(finishing_orders):
    """Number of entries in the group with finishing order 3 or better."""
    return (finishing_orders <= 3).sum()


# Per-driver / per-constructor aggregates, broadcast back onto every row.
# NOTE(review): the totals are computed over every row of full_data, i.e. they
# include the very finishing positions the target below is derived from -
# confirm this is intended.
full_data['driver_wins'] = full_data.groupby('driverId')['positionOrder'].transform(_count_wins)
full_data['driver_podiums'] = full_data.groupby('driverId')['positionOrder'].transform(_count_podiums)
# Mean starting grid slot per driver (stored under the "qualifying" name).
full_data['average_qualifying_position'] = full_data.groupby('driverId')['grid'].transform('mean')
full_data['constructor_wins'] = full_data.groupby('constructorId_x')['positionOrder'].transform(_count_wins)

# Columns that are not used for prediction.
_unused_columns = ['url', 'date', 'lap_time', 'pit_stop_time', 'lap_position']
full_data = full_data.drop(columns=_unused_columns)

# Remaining missing values are replaced with 0.
full_data.fillna(0, inplace=True)


# Feature matrix and binary target: True when the driver finished in the top 3.
features = ['grid', 'driver_wins', 'driver_podiums', 'average_qualifying_position', 'constructor_wins']
X = full_data[features]
y = full_data['positionOrder'].le(3)  # Example target: whether a driver finishes on the podium

# Split the raw features before any scaling: X_scaled does not exist yet at this
# point on a fresh kernel, and fitting the scaler on all rows would leak
# test-set statistics into training.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then apply that same transform to
# the held-out rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Scaled copy of the full feature matrix (using the training-set statistics),
# kept so any code that refers to X_scaled still finds it.
X_scaled = scaler.transform(X)

In [37]:
# Disabled experiment: the gradient-boosting baseline is wrapped in a string
# literal, so nothing is trained here and `gradient_boosting_accuracy` is never
# assigned; the cell only echoes the string as its output.
'''gradient_boosting_model = GradientBoostingClassifier()
gradient_boosting_model.fit(X_train, y_train)
gradient_boosting_accuracy = gradient_boosting_model.score(X_test, y_test)'''

'gradient_boosting_model = GradientBoostingClassifier()\ngradient_boosting_model.fit(X_train, y_train)\ngradient_boosting_accuracy = gradient_boosting_model.score(X_test, y_test)'

In [38]:
# Disabled experiment: the logistic-regression baseline is wrapped in a string
# literal, so nothing is trained here and `logistic_regression_accuracy` is
# never assigned; the cell only echoes the string as its output.
'''logistic_regression_model = LogisticRegression()
logistic_regression_model.fit(X_train, y_train)
logistic_regression_accuracy = logistic_regression_model.score(X_test, y_test)'''

'logistic_regression_model = LogisticRegression()\nlogistic_regression_model.fit(X_train, y_train)\nlogistic_regression_accuracy = logistic_regression_model.score(X_test, y_test)'

In [39]:
# Ended up choosing this model as it had the highest accuracy score out of the
# three candidates (the other two are kept as disabled string literals).
# random_state pins the forest's internal randomness so the reported accuracy
# is reproducible from one run to the next.
random_forest_model = RandomForestClassifier(random_state=42)
random_forest_model.fit(X_train, y_train)
# Mean accuracy on the held-out split.
random_forest_accuracy = random_forest_model.score(X_test, y_test)

In [40]:
# Report the hold-out accuracy of the selected model. The reports for the other
# two candidates stay commented out: their accuracy variables are not assigned
# while those models are disabled.
#print(f'Logistic Regression Accuracy: {logistic_regression_accuracy}')
_rf_report = f'Random Forest Accuracy: {random_forest_accuracy}'
print(_rf_report)
#print(f'Gradient Boosting Accuracy: {gradient_boosting_accuracy}')

Random Forest Accuracy: 0.9123835393182127


In [None]:
# Tune the random forest first: best_rf_model is produced by the grid search,
# so it has to exist before the evaluation below can run on a fresh kernel.
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}
# 5-fold cross-validated search over the grid, selecting on accuracy.
grid_search_rf = GridSearchCV(random_forest_model, param_grid_rf, cv=5, scoring='accuracy')
grid_search_rf.fit(X_train_scaled, y_train)
best_rf_model = grid_search_rf.best_estimator_

# Evaluate the tuned model on the held-out rows, using the same scaled feature
# space it was fitted on.
y_pred_rf = best_rf_model.predict(X_test_scaled)
print("Random Forest:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf)}")
print(f"Precision: {precision_score(y_test, y_pred_rf)}")
print(f"Recall: {recall_score(y_test, y_pred_rf)}")
print(f"F1 Score: {f1_score(y_test, y_pred_rf)}")