In [556]:
import os
import pandas as pd
import seaborn as sn
import pytz
import datetime
from meteostat import Point, Hourly
from datetime import datetime, timedelta
from timezonefinder import TimezoneFinder
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [286]:
## Data source https://github.com/f1db/f1db

## alexander-albon -23
## carlos-sainz-jr -55

## miami
## imola
### Test prediction on Year 2025 Round 


In [287]:
sourcepath='F1\\Data\\csv\\'

In [571]:
drivers = pd.read_csv(sourcepath+'f1db-drivers.csv')                      ### Driver details and stats
circuits=pd.read_csv(sourcepath+'f1db-circuits.csv')                      ### race ciricuit details 
raceresults=pd.read_csv(sourcepath+'f1db-races-race-results.csv')         ### race results
races =pd.read_csv(sourcepath+'f1db-races.csv')                           ### race cicruit and stats
engines =pd.read_csv(sourcepath+'f1db-engines.csv')                       ### engine details



In [None]:
## Average F1 Driver Age is around 28years and oldest F1 Driver Alonso  is 43years
## so filrer out all drivers 50 years age or above
## further we will filter drivers to restrict to who are active in 2025

In [573]:
drivers_filtered  = drivers.loc[(drivers['dateOfBirth']>='1975-01-01') & (drivers['dateOfDeath'].isnull()),['id','name','gender','dateOfBirth','bestChampionshipPosition',
       'bestStartingGridPosition', 'bestRaceResult', 'totalChampionshipWins',
       'totalRaceEntries', 'totalRaceStarts', 'totalRaceWins', 'totalRaceLaps',
       'totalPodiums', 'totalPoints', 'totalChampionshipPoints',
       'totalPolePositions', 'totalFastestLaps', 'totalDriverOfTheDay',
       'totalGrandSlams']]

In [574]:
drivers_filtered['dateOfBirth'] = pd.to_datetime(drivers_filtered['dateOfBirth'])
drivers_filtered['Age']=((pd.Timestamp('now')-drivers_filtered['dateOfBirth']).dt.days/365).astype(int)
drivers_filtered=drivers_filtered.drop(columns=['dateOfBirth'])

In [575]:
drivers_filtered=drivers_filtered.rename(columns={'id':'driverId'})

In [576]:
# longitude and latitude are required to get Weather data

In [577]:
circuits_filtered = circuits.loc[:,['id', 'name', 'type', 'direction','latitude', 'longitude', 'length', 'turns']]

In [578]:
circuits= circuits.rename(columns = {'id':'circuitId'})

In [579]:
races['racestarttimelocal'] = pd.to_datetime(races['racestarttime'],format='%m/%d/%Y %H:%M')

In [580]:
racecircuit = pd.merge(races,circuits,how="left",on="circuitId")

In [581]:
racecircuit_filtered = racecircuit.loc[(racecircuit['year']>2021),['id','circuitId' ,'year', 'round', 'date', 'time','racestarttimelocal', 'qualifyingFormat', 'sprintQualifyingFormat',  'circuitType', 'direction_x', 'courseLength', 'turns_x', 'laps',  'driversChampionshipDecider', 'constructorsChampionshipDecider',
        'latitude', 'longitude']]

In [582]:
racecircuit_filtered=racecircuit_filtered.rename(columns={'id':'raceId'})

In [583]:
def normalize(value, mini, maxi):
    return min(abs(value - mini) / abs(maxi - mini), 1.0)

In [584]:
def convert_to_local_time(datetime_input, lat, lng, datetime_format="%Y-%m-%d %H:%M:%S"):
    """
    Converts a datetime input (string or Timestamp) to local time based on latitude and longitude.
    """
    try:
        # If input is string, parse it
        if isinstance(datetime_input, str):
            utc_dt = datetime.strptime(datetime_input, datetime_format)
        elif isinstance(datetime_input, datetime):
            utc_dt = datetime_input
        else:
            return None

        # Find the timezone
        tf = TimezoneFinder(in_memory=True)
        timezone_str = tf.timezone_at(lng=lng, lat=lat)
        if timezone_str is None:
            return None

        # Localize to UTC and convert
        utc_dt = utc_dt.replace(tzinfo=pytz.utc)
        local_tz = pytz.timezone(timezone_str)
        local_dt = utc_dt.astimezone(local_tz)

        local_dt_naive = local_dt.replace(tzinfo=None)
        
        return local_dt_naive

    except Exception as e:
        print(f"Error: {e}")
        return None

In [585]:
def get_weather(latitude , longitude, startdatetime, enddatetime):
    """ Gives Weather information based on location(latitude , longitude) ,start date time and end date time.

    Args:
        latitude: Latitude of the location.
        longitude: Longitude of the location.
        startdatetime : Start date time from which weather info is required
        enddatetime : End date time upto which weather info is required

    Returns:
        A weather object having Avg temp, Avg Humidity, Avg Windspeed and bool if it will rain
    """
    location = Point(latitude,longitude)
    data = Hourly(location,startdatetime,enddatetime)
    wdata = data.fetch()
    
    temp = wdata.loc[:, 'temp'].mean()
    rhum = wdata.loc[:, 'rhum'].mean()
    wspd= wdata.loc[:, 'wspd'].mean()
    rain= wdata.loc[:, 'prcp'].sum()

    # Normalize each component (0 = best, 1 = worst)
    temp_score = normalize(temp, 20, 40)  # Ideal ~20°C
    humidity_score = normalize(rhum, 30, 90)
    wind_score = normalize(wspd, 0, 30)
    precip_score = min(rain / 10, 1.0)  # 10mm = very wet

    score = (
        0.4 * precip_score +   # Rain is most disruptive
        0.25 * temp_score +
        0.2 * wind_score +
        0.15 * humidity_score
    )
    

    
    
    
    #data = {'temp': temp, 'rhum': rhum, 'wspd': wspd, 'rain': rain}

    
    #wdata_df = pd.DataFrame(data,index=[0])
    
    return score
    

In [586]:

racecircuit_filtered['racestartlocal2'] = racecircuit_filtered.apply(
    lambda row: convert_to_local_time(row['racestarttimelocal'], row['latitude'], row['longitude']),
    axis=1
)

In [590]:
racecircuit_filtered['raceendlocal2'] = pd.to_datetime(racecircuit_filtered['racestartlocal2']+ timedelta(hours=3),format='%m/%d/%Y %H:%M')

In [591]:
racecircuit_filtered['weatherscore']  = racecircuit_filtered.apply(
    lambda row: get_weather(row['latitude'], row['longitude'],row['racestartlocal2'], row['raceendlocal2']),
    axis=1
)



In [595]:
racecircuit_filtered[['weatherscore','year','round']].query('year==2025 and round==7')

Unnamed: 0,weatherscore,year,round
400,0.101688,2025,7


In [599]:
merge_1 = pd.merge(racecircuit_filtered,raceresults,how="inner",on="raceId")

In [600]:
merge_2 = pd.merge(merge_1,drivers_filtered,how="inner",on="driverId")

In [601]:
### All column list

##'raceId', 'year_x', 'round_x', 'qualifyingFormat', 'sprintQualifyingFormat','driversChampionshipDecider', 'constructorsChampionshipDecider', 'racestartlocal2', 'raceendlocal2','weatherscore', 'positionNumber','sharedCar','timePenalty','timePenaltyMillis', 'interval','intervalMillis', 'reasonRetired', 'gridPositionNumber', 'pitStops',	'circuitId','circuitType', 'direction_x', 'courseLength', 'turns_x', 'laps_x',
##'constructorId', 'engineManufacturerId', 'tyreManufacturerId', 'driverNumber', 'driverId','Age','fastestLap', 'driverOfTheDay', 'grandSlam', 'name', 'gender','bestChampionshipPosition', 'bestStartingGridPosition','bestRaceResult', 'totalChampionshipWins', 'totalRaceEntries', 'totalRaceWins', 'totalRaceLaps', 'totalPodiums', 'totalPoints', 'totalChampionshipPoints', 'totalPolePositions', 'totalFastestLaps', 'totalDriverOfTheDay', 'points','polePosition','timeMillis'

In [602]:
merge_3 = merge_2[['year_x', 'round_x', 'qualifyingFormat', 'sprintQualifyingFormat','driversChampionshipDecider', 'constructorsChampionshipDecider', 'weatherscore', 'timePenaltyMillis', 'intervalMillis', 'reasonRetired', 'gridPositionNumber', 'pitStops',	'circuitId','circuitType', 'direction_x', 'courseLength', 'turns_x', 'laps_x',
'constructorId', 'engineManufacturerId', 'driverNumber','Age','fastestLap', 'driverOfTheDay', 'grandSlam', 'bestChampionshipPosition', 'bestStartingGridPosition','bestRaceResult', 'totalChampionshipWins', 'totalRaceEntries', 'totalRaceWins', 'totalRaceLaps', 'totalPodiums', 'totalPoints', 'totalChampionshipPoints', 'totalPolePositions', 'totalFastestLaps', 'totalDriverOfTheDay', 'points']]

In [603]:
merge_3['points']=merge_3['points'].fillna(value=0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [604]:
#test = merge_3.loc[(merge_3['year_x']==2025) & (merge_3['round_x']==6)]
imola = merge_3.loc[(merge_3['year_x']==2025) & (merge_3['round_x']==7)]

In [537]:
y_test=test['points']

In [538]:
test =test.drop('points', axis=1)

In [605]:
train= merge_3.query('year_x!=2025 or round_x!=7')
## train= merge_3.query('year_x!=2025 or round_x!=6')


In [606]:
y_train=train['points']

In [607]:
train =train.drop('points', axis=1)

In [608]:
train.describe(include=['O'])

Unnamed: 0,qualifyingFormat,sprintQualifyingFormat,reasonRetired,circuitId,circuitType,direction_x,constructorId,engineManufacturerId,fastestLap,driverOfTheDay
count,1479,280,206,1479,1479,1479,1479,1479,1479,1479
unique,2,1,37,25,2,2,13,5,2,2
top,KNOCKOUT,SPRINT_SHOOTOUT,Collision,bahrain,RACE,CLOCKWISE,ferrari,mercedes,False,False
freq,1419,280,38,80,960,939,148,591,1405,1405


In [609]:
# Preprocessing
numeric_features = ['weatherscore',
'timePenaltyMillis',
##'intervalMillis',
'gridPositionNumber',
'pitStops',
'courseLength',
'turns_x',
'laps_x',
'driverNumber',
'Age',
'bestChampionshipPosition',
'bestStartingGridPosition',
'bestRaceResult',
'totalChampionshipWins',
'totalRaceEntries',
'totalRaceWins',
'totalRaceLaps',
'totalPodiums',
'totalPoints',
'totalChampionshipPoints',
'totalPolePositions',
'totalFastestLaps',
'totalDriverOfTheDay',
]
categorical_features = ['qualifyingFormat',
'sprintQualifyingFormat',
##'reasonRetired',
'circuitId',
'circuitType',
'direction_x',
'constructorId',
'engineManufacturerId',
'fastestLap',
'driverOfTheDay',
]

In [610]:
# Pipelines
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Full preprocessor
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

In [625]:
# Full pipeline with model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
    ##('regressor', XGBRegressor(n_estimators=100, max_depth=5, learning_rate=0.1, objective='reg:squarederror'))
])

In [626]:
# Fit the model
model.fit(train, y_train)

In [628]:
# Extract feature importances
# Need to get feature names after preprocessing
onehot_columns = model.named_steps['preprocessor'].transformers_[1][1].named_steps['encoder'].get_feature_names_out(categorical_features)
feature_names = numeric_features + list(onehot_columns)
importances = model.named_steps['regressor'].feature_importances_

# Create a DataFrame for feature importances
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

feature_importance_df.head(10)

Unnamed: 0,Feature,Importance
2,gridPositionNumber,0.539375
21,totalDriverOfTheDay,0.054175
0,weatherscore,0.045047
3,pitStops,0.041522
7,driverNumber,0.028511
75,driverOfTheDay_False,0.023628
76,driverOfTheDay_True,0.023207
5,turns_x,0.018735
4,courseLength,0.014526
63,constructorId_mercedes,0.012868


In [629]:
y_train_pred = model.predict(train)
#y_test_pred = model.predict(testset)

In [561]:
# Evaluation metrics
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
train_rmse = mean_squared_error(y_train, y_train_pred, squared=False)
test_rmse = mean_squared_error(y_test, y_test_pred, squared=False)

{
    "Train R² Score": train_r2,
    "Test R² Score": test_r2,
    "Train MAE": train_mae,
    "Test MAE": test_mae,
    "Train RMSE": train_rmse,
    "Test RMSE": test_rmse
}



{'Train R² Score': 0.8780178162083881,
 'Test R² Score': 0.8019641031742607,
 'Train MAE': 1.61355350863966,
 'Test MAE': 1.929049271903932,
 'Train RMSE': 2.5279583683843994,
 'Test RMSE': 3.191935431982017}

In [630]:
y_test_pred

array([23.603416  , 17.091923  , 10.094009  , 21.266022  ,  3.021527  ,
        5.919259  ,  5.8905706 ,  3.2563052 ,  7.269876  ,  1.1986543 ,
        0.33187544,  2.4716017 ,  1.1664449 , -0.02430281,  0.9726908 ,
        0.9921576 ,  0.13490224, -0.19399275,  0.21043438, -0.22504136],
      dtype=float32)

In [569]:
testset['points']=y_test_pred

In [570]:
testset[['driverNumber','points']].sort_values('points',ascending=False)

Unnamed: 0,driverNumber,points
1459,81,23.603416
1462,1,21.266022
1460,4,17.091923
1461,63,10.094009
1467,55,7.269876
1464,12,5.919259
1465,16,5.890571
1466,44,3.256305
1463,23,3.021527
1470,31,2.471602


In [631]:
imola_pred = model.predict(imola)

In [632]:
imola_pred

array([20.58, 14.81, 11.36, 24.  ,  3.89,  4.91,  7.77,  5.92,  5.54,
        2.36,  2.75,  3.55,  5.31,  1.26,  3.12,  6.52,  2.91,  2.93,
        3.18,  3.74])

In [633]:
imola['points']=imola_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [634]:
# Sort by 'col1' and add row ID
df = imola[['driverNumber','points']].sort_values('points',ascending=False).reset_index(drop=True)
df['ID'] = df.index + 1

print(df)

    driverNumber  points  ID
0              1   24.00   1
1             81   20.58   2
2              4   14.81   3
3             63   11.36   4
4             16    7.77   5
5             18    6.52   6
6             44    5.92   7
7             55    5.54   8
8             10    5.31   9
9             12    4.91  10
10            23    3.89  11
11             7    3.74  12
12            31    3.55  13
13            87    3.18  14
14            14    3.12  15
15             5    2.93  16
16            30    2.91  17
17             6    2.75  18
18            22    2.36  19
19            27    1.26  20
