In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error



: 

In [None]:
# Read the drivers dataset; the path is resolved relative to the working directory.
df = pd.read_csv(filepath_or_buffer="F1DriversDataset 2.csv")
# Bare last expression so the notebook renders the frame.
df

In [None]:
# Preview the first five rows.
df.head(n=5)

In [None]:
# List the column labels of the loaded frame.
df.columns

# What is the distribution of drivers' nationalities in the dataset?

In [None]:
# Number of drivers per nationality, largest first, limited to the top 20.
nationality_drivers = (
    df['Nationality']
    .value_counts()
    .sort_values(ascending=False)
    .head(20)
)
nationality_drivers

In [None]:

def generate_chart(names, values, title=None):
    """Build a pie chart showing each category's share of the total.

    Parameters
    ----------
    names : array-like
        Label for each pie sector.
    values : array-like
        Size of each sector, aligned element-wise with ``names``.
    title : str, optional
        Figure title. No title is set when ``None`` (the default).

    Returns
    -------
    plotly.graph_objects.Figure
        Pie chart coloured with the sequential ``RdBu`` palette.
    """
    # No data_frame is passed: Plotly Express assembles one internally from
    # the array-like arguments, so no helper dataset has to be loaded.
    return px.pie(
        names=names,
        values=values,
        title=title,
        color_discrete_sequence=px.colors.sequential.RdBu,
    )

# Pie chart of the nationality counts: labels from the index, sizes from the values.
generate_chart(
    names=nationality_drivers.index,
    values=nationality_drivers.values,
)




There are more drivers from the UK than the US.

# What is the correlation between the number of seasons a driver participates in and their number of race wins?

In [None]:
# Keep only the two columns compared in this section.
df_winsvsyears = df.loc[:, ['Years_Active', 'Race_Wins']]
df_winsvsyears.head(n=5)

In [None]:
# Missing-value count per column (isna is the same check as isnull).
df_winsvsyears.isna().sum()

In [115]:
# Feature matrix: one column, shape (n_rows, 1), as scikit-learn expects.
X = df_winsvsyears['Years_Active'].to_numpy()[:, np.newaxis]
# Target vector: flat 1-D array of race wins.
y = df_winsvsyears['Race_Wins'].to_numpy()

In [116]:

# Hold out 20% of the rows for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Seed the forest as well: bootstrap sampling is random, so without
# random_state the reported error changes on every re-run of the notebook.
rf_regressor = RandomForestRegressor(n_estimators=200, max_depth=None, random_state=42)

rf_regressor.fit(X_train, y_train)

# Predictions on the held-out rows, scored in the next cell.
y_pred = rf_regressor.predict(X_test)



In [None]:
# Error of the random-forest predictions on the held-out rows.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
# Correlation between years active and race wins over the full dataset.
corr_coef = df['Years_Active'].corr(df['Race_Wins'])

print("Mean Squared Error: ", mse)
print("corr coef", corr_coef)

There may be other factors beyond the number of years of experience that contribute to a driver's success in Formula 1.

# Pole positions vs Race wins of champions

In [None]:
# Keep only drivers with at least one championship.
is_champion = df['Championships'] > 0
df_champions = df[is_champion]
df_champions.head(n=5)

In [None]:
# Missing-value count per column for the champions subset.
df_champions.isna().sum()

In [120]:
from sklearn.metrics import mean_squared_error
def linear_regression(df, predictor, target, title=None):
    """Fit a one-variable linear regression and plot it over the data.

    The model is fitted and evaluated on the same rows, so every metric
    returned here is an in-sample figure, not a held-out estimate.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both columns.
    predictor : str
        Name of the column used as the single input feature.
    target : str
        Name of the column to predict.
    title : str, optional
        Plot title. Defaults to "<target> vs <predictor>" with underscores
        replaced by spaces.

    Returns
    -------
    corr_coef : float
        Correlation between the two columns, as computed by ``Series.corr``.
    r_squared : float
        R-squared of the fitted model on the rows it was fitted on.
    y_pred : numpy.ndarray
        Fitted values, shape ``(n_rows, 1)``.
    prediction_score : float
        Mean squared error between the target and the fitted values.
    fig : seaborn.FacetGrid
        Grid returned by ``sns.lmplot``, with the fitted line added in red.
    """
    # scikit-learn expects 2-D inputs; y is kept 2-D so y_pred is (n_rows, 1).
    X = df[predictor].values.reshape(-1, 1)
    y = df[target].values.reshape(-1, 1)

    regressor = LinearRegression()
    regressor.fit(X, y)

    # In-sample goodness of fit.
    r_squared = regressor.score(X, y)

    corr_coef = df[predictor].corr(df[target])

    # Fitted values for the same rows the model was trained on.
    y_pred = regressor.predict(X)

    # Despite the name, this is a mean squared error (lower is better).
    prediction_score = mean_squared_error(y, y_pred)

    # Scatter plot with seaborn's own regression fit.
    fig = sns.lmplot(x=predictor, y=target, data=df)

    # Overlay the fitted line, reusing y_pred instead of predicting again from
    # a DataFrame: the model was fitted on a bare array, so a DataFrame input
    # would trigger scikit-learn's feature-name mismatch warning.
    plt.plot(df[predictor], y_pred.ravel(), color='red')

    if title is None:
        # Derive the title from the column names so it matches what is plotted.
        title = f"{target} vs {predictor}".replace("_", " ")
    plt.title(title)
    return corr_coef, r_squared, y_pred, prediction_score, fig


In [None]:
# Regress race wins on pole positions for champions only and report the fit.
corr_coef, r_squared, y_pred, prediction_score, fig = linear_regression(
    df_champions,
    predictor='Pole_Positions',
    target='Race_Wins',
)

print('prediction_score', prediction_score)
print('correlation coefficient', corr_coef)
print('R-squared:', r_squared)



Pole positions are a very good predictor of race wins for champions: a higher number of pole positions is generally associated with a higher number of race wins.

#### What does it take to become a Champion?

In [None]:
# Correlate only numeric/boolean columns. From pandas 2.0, DataFrame.corr()
# no longer drops non-numeric columns silently and raises on them instead
# (this frame presumably has some, e.g. Nationality).
# select_dtypes works on older pandas versions as well.
corr_matrix = df.select_dtypes(include=["number", "bool"]).corr()
print(corr_matrix)
plt.figure(figsize=(12,12))
# Reuse the matrix computed above instead of computing it a second time.
dataplot = sns.heatmap(corr_matrix, cmap="YlGnBu", annot=True)

In [None]:
columns = ['Pole_Rate', 'Start_Rate', 'Win_Rate', 'Podium_Rate', 'FastLap_Rate', 'Years_Active']
plt.figure(figsize=(20, 15))
# One panel per feature in a 2x3 grid, each with a first-order (linear) fit
# of Championships against that feature.
for i, col in enumerate(columns, start=1):
    plt.subplot(2, 3, i)
    sns.regplot(x=col, y='Championships', data=df, order=1)
plt.show()

In [None]:
columns = ['Pole_Rate', 'Start_Rate', 'Win_Rate', 'Podium_Rate', 'FastLap_Rate', 'Years_Active']
plt.figure(figsize=(20, 15))
# Same 2x3 grid of features, this time with a second-order (quadratic) fit
# of Championships against each feature.
for i, col in enumerate(columns, start=1):
    plt.subplot(2, 3, i)
    sns.regplot(x=col, y='Championships', data=df, order=2)
plt.show()

Binary classification is done using an SGD classifier.

In [131]:
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn import metrics
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np

In [132]:
# Label: whether the driver is a champion. Features: career totals.
target = df['Champion']
features = df[['Race_Entries',
       'Race_Starts', 'Pole_Positions', 'Race_Wins', 'Podiums', 'Fastest_Laps',
       'Points']]
X = features.values
Y = target.values
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Always scale the input: SGD is sensitive to feature scale. Wrapping the
# scaler and the classifier in one pipeline fits the scaler on the training
# rows only and applies the same scaling automatically at predict time.
clf = make_pipeline(StandardScaler(), SGDClassifier(random_state=42))

# Pipeline.fit returns the fitted pipeline, so `model` and `clf` are the same object.
model = clf.fit(X_train, y_train)


In [None]:
# Predict the held-out rows, then score the predictions.
y_pred = clf.predict(X_test)

accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
# Macro average: precision is computed per class and averaged unweighted.
precision = metrics.precision_score(y_true=y_test, y_pred=y_pred, average="macro")

print("Accuracy:", accuracy)
print("Precision:", precision)

In [None]:
# Raw confusion-matrix counts (rows: true labels, columns: predicted labels),
# printed as text and drawn as an annotated heatmap.
matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(matrix)
sns.heatmap(data=matrix, cmap='Reds', annot=True)

In [None]:
# Confusion matrix normalised over the true labels, so each row sums to 1.
ConfusionMatrixDisplay.from_estimator(
    estimator=model,
    X=X_test,
    y=y_test,
    normalize='true',
    cmap=plt.cm.Blues,
)

Based on the heatmap of the confusion matrix and the normalized confusion matrix, the SGDClassifier returns a good prediction of whether a person can become a Champion or not using the features Race_Entries, Race_Starts, Pole_Positions, Race_Wins, Podiums, Fastest_Laps, and Points. The model has a high accuracy and a low false negative rate, indicating that it correctly identifies most of the champions while only misclassifying a few non-champions. However, given the unpredictability of F1 races, we cannot conclusively prove that this is the best mode of classification.