In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from api import JolpicaF1API

In [2]:
api = JolpicaF1API()

In [3]:
SEED = 69

In [4]:
def get_race_data(season: int) -> pd.DataFrame:
    """Get race data for a season including round numbers"""
    data = api.get_race_info(season=str(season))
    return pd.DataFrame(data["MRData"]["RaceTable"]["Races"])


def get_qualifying_data(season: int, round_num: int) -> pd.DataFrame:
    """Get qualifying results for a specific race"""
    data = api.get_qualifying_results(season=str(season), round=int(round_num))
    return pd.DataFrame(data["MRData"]["RaceTable"]["Races"][0]["QualifyingResults"])


def get_race_results(season: int, round_num: int) -> pd.DataFrame:
    """Get race results for a specific race"""
    data = api.get_race_results(season=str(season), round=int(round_num))
    return pd.DataFrame(data["MRData"]["RaceTable"]["Races"][0]["Results"])

In [5]:
recent_years = list(range(2012, 2025))
british_gp_data = []

In [6]:
for year in recent_years:
    season_races = get_race_data(year)
    british_gp = season_races[
        season_races["raceName"].str.contains("British Grand Prix")
    ].dropna()

    if not british_gp.empty:
        round_num = british_gp.iloc[0]["round"]

        # Get qualifying and results
        qualifying = get_qualifying_data(year, round_num)
        results = get_race_results(year, round_num)

        # Add metadata
        qualifying["season"] = year
        qualifying["round"] = round_num
        results["season"] = year
        results["round"] = round_num

        british_gp_data.append(
            {
                "year": year,
                "round": round_num,
                "qualifying": qualifying,
                "results": results,
            }
        )

In [7]:
qualifying_dfs = []
results_dfs = []

for race in british_gp_data:
    q = race["qualifying"].copy()
    q["driverId"] = q["Driver"].apply(lambda x: x["driverId"])
    q["position"] = q["position"].astype(int)
    qualifying_dfs.append(q[["season", "round", "driverId", "position"]])

    r = race["results"].copy()
    r["driverId"] = r["Driver"].apply(lambda x: x["driverId"])
    r["positionOrder"] = r["position"].fillna(0).astype(int)
    r["positionOrder"] = r["positionOrder"].replace(0, r["positionOrder"].max() + 1)
    results_dfs.append(r[["season", "round", "driverId", "positionOrder"]])

qualifying_historical = pd.concat(qualifying_dfs, ignore_index=True)
results_historical = pd.concat(results_dfs, ignore_index=True)

In [8]:
# Calculate average positions
avg_qualifying = (
    qualifying_historical.groupby("driverId")["position"].mean().reset_index()
)
avg_race = results_historical.groupby("driverId")["positionOrder"].mean().reset_index()

# Create target variable (winners)
winners = results_historical[results_historical["positionOrder"] == 1][
    "driverId"
].unique()

# Merge features
features = avg_qualifying.merge(avg_race, on="driverId")
features["winner"] = features["driverId"].isin(winners).astype(int)
features = features.rename(
    columns={"position": "qualifying_position", "positionOrder": "race_position"}
)

In [9]:
# Model training
X = features[["qualifying_position", "race_position"]]
y = features["winner"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

model = RandomForestClassifier(
    random_state=SEED, min_samples_leaf=1, max_features="sqrt", class_weight="balanced"
)
model.fit(X_train, y_train)

# Evaluation
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        11

    accuracy                           1.00        11
   macro avg       1.00      1.00      1.00        11
weighted avg       1.00      1.00      1.00        11



In [10]:
# Get 2025 data for prediction
current_races = get_race_data(2025)

# Find the British GP round
british_gp_2025 = current_races[
    current_races["raceName"].str.contains("British Grand Prix")
]
british_gp_round = None

if not british_gp_2025.empty:
    british_gp_round = british_gp_2025.iloc[0]["round"]

In [11]:
# Determine who's actually participating
try:
    british_gp_qualifying = get_qualifying_data(2025, british_gp_round)
    british_gp_drivers = set(
        british_gp_qualifying["Driver"].apply(lambda x: x["driverId"])
    )
    print(f"Found {len(british_gp_drivers)} drivers participating in 2025 British GP")
except:
    # Fallback if the GP hasn't occurred yet
    print("GP qualifying data not available yet, using all active drivers")
    british_gp_drivers = None

Found 20 drivers participating in 2025 British GP


In [12]:
# Get current season data for all races so far
qualifying_2025 = []
results_2025 = []

for _, race in current_races.iterrows():
    # Skip races at or after the British GP
    if int(race["round"]) >= int(british_gp_round):
        continue

    try:
        q = get_qualifying_data(2025, race["round"])
        q["driverId"] = q["Driver"].apply(lambda x: x["driverId"])
        qualifying_2025.append(q[["driverId", "position"]])

        r = get_race_results(2025, race["round"])
        r["driverId"] = r["Driver"].apply(lambda x: x["driverId"])
        results_2025.append(r[["driverId", "position"]])

    except Exception as e:
        print(f"Could not get data for round {race['round']}: {e}")

In [13]:
current_qual = (
    pd.concat(qualifying_2025)
    .assign(position=lambda x: x["position"].fillna(0).astype(int))
    .groupby("driverId")["position"]
    .mean()
    .reset_index()
)

current_results = (
    pd.concat(results_2025)
    .assign(position=lambda x: x["position"].fillna(0).astype(int))
    .groupby("driverId")["position"]
    .mean()
    .reset_index()
)

In [14]:
# Handle any remaining invalid values before merging
max_position = (
    max(current_qual["position"].max(), current_results["position"].max()) or 20
)
current_qual["position"] = current_qual["position"].replace(0, max_position + 1)
current_results["position"] = current_results["position"].replace(0, max_position + 1)

In [15]:
current_features = current_qual.merge(
    current_results, on="driverId", suffixes=("_qual", "_race")
).rename(
    columns={"position_qual": "qualifying_position", "position_race": "race_position"}
)

In [16]:
# Filter only drivers participating in the British GP
if british_gp_drivers:
    current_features = current_features[
        current_features["driverId"].isin(british_gp_drivers)
    ]
    print(f"Filtered to {len(current_features)} participating drivers")

current_features = current_features.fillna(max_position + 1)
current_features = current_features[
    (current_features["qualifying_position"] > 0)
    & (current_features["race_position"] > 0)
]

# Make predictions for the filtered list of drivers
current_features["predicted_winner"] = model.predict(
    current_features[["qualifying_position", "race_position"]].astype(float)
)

print("2025 British GP Predictions:")
print(
    current_features[
        ["driverId", "qualifying_position", "race_position", "predicted_winner"]
    ]
)

Filtered to 20 participating drivers
2025 British GP Predictions:
          driverId  qualifying_position  race_position  predicted_winner
0            albon             9.818182      10.636364                 0
1           alonso            10.909091      12.636364                 0
2        antonelli             8.090909      10.545455                 0
3          bearman            16.545455      12.909091                 0
4        bortoleto            15.272727      15.363636                 0
5        colapinto            16.000000      14.400000                 0
7            gasly            12.181818      13.909091                 0
8           hadjar             9.818182      11.181818                 0
9         hamilton             7.181818       7.363636                 0
10      hulkenberg            15.818182      12.454545                 0
11          lawson            14.454545      13.363636                 0
12         leclerc             5.636364       5.727273    