In [47]:
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, accuracy_score, precision_score, make_scorer
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, RobustScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from tensorflow.keras.utils import to_categorical
from tensorflow.keras import models, layers, regularizers
from tensorflow.keras.callbacks import EarlyStopping

from imblearn.combine import SMOTETomek
from scipy import stats


In [2]:
laps = pd.read_csv("../raw_data/new_clean_data.csv")

  laps = pd.read_csv("../raw_data/new_clean_data.csv")


In [6]:
laps.drop(columns= ["Unnamed: 0.1",
                   "Time",
                   "DriverNumber",
                   "LapTime",
                   "Stint", 
                   "PitOutTime", 
                   "PitInTime", 
                   "Sector1Time",
                   "Sector2Time", 
                   "Sector3Time", 
                   "Sector1SessionTime",
                   "Sector2SessionTime", 
                   "Sector3SessionTime", 
                   "SpeedI1", 
                   "SpeedI2",
                   "SpeedFL",
                   "SpeedST",
                   "LapStartTime",
                   "LapStartDate",
                   "TrackStatus",
                   "Deleted", 
                   "DeletedReason", 
                   "FastF1Generated", 
                   "IsAccurate",
                   "status_list",
                   "TotalLaps",
                   "Time_min", 
                   "Unnamed: 0", 
                   "Time_w", 
                   "AirTemp", 
                   "Humidity", 
                   "Pressure", 
                   "Rainfall",
                   "WindDirection",
                   "WindSpeed", 
                   "Final_Position", 
                   "LocationYear",
                   "next_compound",
                   "FreshTyre",
                   "pitting_this_lap"], inplace = True)

In [7]:
laps["pitting_next_lap"] = laps["pitting_next_lap"].apply(lambda x: 1 if x == True else 0)

In [9]:
laps_for_model = laps.copy()
train_df = laps_for_model[laps_for_model["Year"] < 2022]
val_df = laps_for_model[laps_for_model["Year"] == 2022]
test_df = laps_for_model[laps_for_model["Year"] == 2023]
train_df_shuffled = train_df.sample(frac=1)
val_df_shuffled = val_df.sample(frac=1)
test_df_shuffled = test_df.sample(frac=1)

y_train = train_df_shuffled["pitting_next_lap"]
y_val = val_df_shuffled["pitting_next_lap"]
y_test = test_df_shuffled["pitting_next_lap"]

X_train = train_df_shuffled.drop(columns="pitting_next_lap")
X_val = val_df_shuffled.drop(columns="pitting_next_lap")
X_test = test_df_shuffled.drop(columns="pitting_next_lap")

cat_features = ["Driver", "Compound", "Team", "Location", "second_compound", "TyreStressLevel", "status", "close_ahead", "close_behind", "is_pitting_ahead", "is_pitting_behind", "IsPersonalBest", "Position", "Year"]
cat_features_preproc = make_pipeline(OneHotEncoder(sparse=False, handle_unknown="ignore"))
num_features = ["LapNumber", "TyreLife", "LastTeamRanking", "TrackTemp"]
num_features_preproc = make_pipeline(RobustScaler())

preproc_baseline = make_column_transformer((cat_features_preproc, cat_features),
                                           (num_features_preproc, num_features),
                                           remainder="passthrough")

preproc_baseline.fit(X_train)
X_train_preproc = preproc_baseline.transform(X_train)
X_test_preproc = preproc_baseline.transform(X_test)
X_val_preproc = preproc_baseline.transform(X_val)

smt = SMOTETomek(sampling_strategy=0.5)
X_train_preproc_resamp, y_train_resamp = smt.fit_resample(X_train_preproc, y_train)

def custom_metric(y_test, y_pred_rf):
    predictions = list(y_pred_rf)
    reality = y_test.to_list()
    tracker = 0
    ones_count = reality.count(1)

    for i in range(len(reality)):
        if reality[i] == 1:
            if predictions[i] == 1:
                tracker += 1
            elif i > 0 and predictions[i - 1] == 1:
                tracker += 1
            elif i < len(predictions) - 1 and predictions[i + 1] == 1:
                tracker += 1

    return tracker / ones_count



# KNN 11  neighbors

In [65]:
model_knn = KNeighborsClassifier(n_neighbors=11)

In [66]:
model_knn.fit(X_train_preproc_resamp,y_train_resamp)

In [67]:
y_pred_knn = model_knn.predict(X_test_preproc)

In [68]:
custom_metric(y_test,y_pred_knn)

0.6971830985915493

In [69]:
X_test_eval_df = X_test.copy()
y_test_eval_df = y_test.copy()
X_test_eval_df["RealPitting"] = y_test_eval_df
X_test_eval_df["ModelPitting"] = y_pred_rf
X_test_eval_df.sort_values(by = ["Driver", "Location", "LapNumber"], inplace=True)

In [81]:
X_test_eval_df[(X_test_eval_df["Driver"] == "SAI") & (X_test_eval_df["Location"] == "Sakhir")]

Unnamed: 0,Driver,LapNumber,IsPersonalBest,Compound,TyreLife,Team,Position,Location,Year,LastTeamRanking,...,status,RaceProgress,TyreStressLevel,TrackTemp,close_ahead,close_behind,is_pitting_ahead,is_pitting_behind,RealPitting,ModelPitting
62992,SAI,6.0,False,SOFT,9.0,Ferrari,4.0,Sakhir,2023,2.0,...,1,0.105263,3,30.9,True,True,False,False,0,0
62993,SAI,7.0,False,SOFT,10.0,Ferrari,4.0,Sakhir,2023,2.0,...,1,0.122807,3,30.9,True,True,False,False,0,0
62994,SAI,8.0,False,SOFT,11.0,Ferrari,4.0,Sakhir,2023,2.0,...,1,0.140351,3,30.9,True,True,False,False,0,0
62995,SAI,9.0,False,SOFT,12.0,Ferrari,4.0,Sakhir,2023,2.0,...,1,0.157895,3,30.8,True,True,False,False,0,1
62996,SAI,10.0,False,SOFT,13.0,Ferrari,4.0,Sakhir,2023,2.0,...,1,0.175439,3,30.6,True,True,False,False,0,1
62997,SAI,11.0,False,SOFT,14.0,Ferrari,4.0,Sakhir,2023,2.0,...,1,0.192982,3,30.6,True,True,False,False,0,1
62998,SAI,12.0,False,SOFT,15.0,Ferrari,4.0,Sakhir,2023,2.0,...,1,0.210526,3,30.5,True,True,False,False,1,1
62999,SAI,13.0,False,SOFT,16.0,Ferrari,4.0,Sakhir,2023,2.0,...,1,0.22807,3,30.5,False,True,False,False,0,1
63000,SAI,14.0,False,HARD,1.0,Ferrari,6.0,Sakhir,2023,2.0,...,1,0.245614,3,30.5,False,False,False,False,0,0
63001,SAI,15.0,True,HARD,2.0,Ferrari,5.0,Sakhir,2023,2.0,...,1,0.263158,3,30.5,True,True,True,False,0,0


LECLERC
Barcelona: Nul, aucun pit
Baku: très bon (bonne pred mais 1 tour de retard)
Jeddah:bon sur le vrai pit mais donne 2 tours plus haut
Melbourne:
Sakhir:


SAINZ
Barcelona: Bonne fenêtre
Baku: très bon (bonne pred mais 1 tour de retard)
Jeddah: predis 1 en dessous 1 au dessus mais pas le bon
Melbourne: Très mauvais, pas de pit
Sakhir: sur 2 pit, le 1er il predit la bonne fenetre mais 5 fois de suite. Ne predit pas le 2eme


# SVM

# Random Forest