In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import fastf1
from fastf1.core import *

In [3]:
from data_extraction import *

In [4]:
monaco_2023=get_session_data(2023, 'Monaco', 'R')

core           INFO 	Loading data for Monaco Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '14', '31', '44', '63', '16', '10', '55', '4', '81', '77', '21', '24', '23', '22', '11', '27', '2', '20', '18']


In [5]:
race_res = preprocess_race_results(monaco_2023)

In [6]:
race_res.info()

<class 'fastf1.core.SessionResults'>
Index: 20 entries, 1 to 18
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   DriverNumber        20 non-null     object 
 1   Abbreviation        20 non-null     object 
 2   DriverId            20 non-null     object 
 3   TeamName            20 non-null     object 
 4   TeamColor           20 non-null     object 
 5   FullName            20 non-null     object 
 6   CountryCode         20 non-null     object 
 7   Position            20 non-null     float64
 8   ClassifiedPosition  20 non-null     object 
 9   GridPosition        20 non-null     float64
 10  Status              20 non-null     object 
 11  Points              20 non-null     float64
 12  RaceTime            20 non-null     object 
dtypes: float64(3), object(10)
memory usage: 2.2+ KB


In [7]:
lap_res=generate_laps_summary(monaco_2023)

In [8]:
race_res['Status'].value_counts()

Status
Lapped      10
Finished     9
Retired      1
Name: count, dtype: int64

In [9]:
lap_res.head()

Unnamed: 0,Driver,Compound,LapsOnCompound,FastestLapOnCompound,FastestSector1,FastestSector2,FastestSector3,FastestLapOverall,PitStops
0,VER,INTERMEDIATE,22,01:25.201,00:22.004,00:40.652,00:21.731,01:16.604,1
1,VER,MEDIUM,53,01:16.604,00:19.913,00:36.053,00:20.320,01:16.604,1
2,GAS,HARD,45,01:16.839,00:19.978,00:36.326,00:20.518,01:15.831,2
3,GAS,INTERMEDIATE,22,01:25.607,00:22.318,00:41.293,00:21.658,01:15.831,2
4,GAS,MEDIUM,4,01:15.831,00:19.843,00:35.628,00:20.282,01:15.831,2


In [21]:
abbr_to_name = dict(zip(race_res["Abbreviation"], race_res["FullName"]))

lap_res["FullName"] = lap_res["Driver"].map(abbr_to_name)

In [10]:
GP_NAME = "Monaco Grand Prix"

In [22]:
race_summary_tmpl = (
    "In the {GP_NAME}, {FullName} (#{DriverNumber}, {CountryCode}) drove for "
    "{TeamName}, started P{GridPosition}, and finished P{Position} ({ClassifiedPosition}). "
    "They scored {Points} points with a total time of {RaceTime}. "
    "{n_finished} finished the race, {n_lapped} driver(s) got lapped, {n_retired} driver(s) retired."
)
driver_tmpl = (
    "{FullName} began on P{GridPosition}, classified as {ClassifiedPosition}, "
    "earning {Points} point(s). Status: {Status}."
)
stint_tmpl = (
    "{FullName} used the {Compound} tyre for {LapsOnCompound} lap(s), "
    "with best lap on that tyre in {FastestLapOnCompound}."
)
lap_overall_tmpl = (
    "{FullName}’s overall fastest lap was {FastestLapOverall}; "
    "sectors: S1={FastestSector1}, S2={FastestSector2}, S3={FastestSector3}."
)
pit_tmpl = "{FullName} made {PitStops} pit stop(s)."

In [19]:
row0     = race_res.iloc[0]
n_lapped = (race_res.Status.str.lower() == "lapped").sum()
n_finished = (race_res.Status.str.lower() == "finished").sum()
n_retired = (race_res.Status.str.lower() == "retired").sum()

In [31]:
summary = race_summary_tmpl.format(**row0.to_dict(), GP_NAME=GP_NAME, n_finished=n_finished, n_lapped=n_lapped, n_retired=n_retired)
driver_lines = [ driver_tmpl.format(**r.to_dict()) for _,r in race_res.iterrows() ]
stint_texts = [
    stint_tmpl.format(**r.to_dict())
    for _, r in lap_res.iterrows()
]

# For each driver, pick the row with the *fastest* lap overall
idx_best = lap_res.groupby("FullName")["FastestLapOverall"].idxmin()
best_lap_rows = lap_res.loc[idx_best]

lap_texts = [
    lap_overall_tmpl.format(**r.to_dict())
    for _, r in best_lap_rows.iterrows()
]

# For each driver, sum (or max) their pit stops
#    If PitStops is cumulative per row, use max(); 
#    if it’s per‐stint, use sum().
pit_counts = lap_res.groupby("FullName")["PitStops"].unique()
pit_texts = [
    pit_tmpl.format(FullName=drv, PitStops=count)
    for drv, count in pit_counts.items()
]

# Combine into your full lap‐context
context = "\n".join([summary] + driver_lines+stint_texts + lap_texts + pit_texts)
print(context)

In the Monaco Grand Prix, Max Verstappen (#1, NED) drove for Red Bull Racing, started P1.0, and finished P1.0 (1). They scored 25.0 points with a total time of 108:51.980. 9 finished the race, 10 driver(s) got lapped, 1 driver(s) retired.
Max Verstappen began on P1.0, classified as 1, earning 25.0 point(s). Status: Finished.
Fernando Alonso began on P2.0, classified as 2, earning 18.0 point(s). Status: Finished.
Esteban Ocon began on P3.0, classified as 3, earning 15.0 point(s). Status: Finished.
Lewis Hamilton began on P5.0, classified as 4, earning 13.0 point(s). Status: Finished.
George Russell began on P8.0, classified as 5, earning 10.0 point(s). Status: Finished.
Charles Leclerc began on P6.0, classified as 6, earning 8.0 point(s). Status: Finished.
Pierre Gasly began on P7.0, classified as 7, earning 6.0 point(s). Status: Finished.
Carlos Sainz began on P4.0, classified as 8, earning 4.0 point(s). Status: Finished.
Lando Norris began on P10.0, classified as 9, earning 2.0 point(

### QA SPECS

In [32]:
qa_specs = [
    # ----- Race Results questions -----
    (
      "Who won the {GPName}?",
      lambda rr, lr: rr.loc[rr.Position == 1, "FullName"].iloc[0]
    ),
    (
      "Which driver finished in position {pos}?",
      lambda rr, lr, pos: rr.loc[rr.Position == int(pos), "FullName"].iloc[0]
    ),
    (
      "What was {driver}'s starting grid position?",
      lambda rr, lr, driver: int(rr.loc[rr.FullName == driver, "GridPosition"].iloc[0])
    ),
    (
      "What was {driver}'s classified position?",
      lambda rr, lr, driver: rr.loc[rr.FullName == driver, "ClassifiedPosition"].iloc[0]
    ),
    (
      "How many points did {driver} score?",
      lambda rr, lr, driver: str(rr.loc[rr.FullName == driver, "Points"].iloc[0])
    ),
    (
      "What was {driver}'s total race time?",
      lambda rr, lr, driver: rr.loc[rr.FullName == driver, "RaceTime"].iloc[0]
    ),
    (
      "Which drivers retired in the {GPName}?",
      lambda rr, lr: ", ".join(rr.loc[rr.Status.str.lower()=="retired", "FullName"].tolist())
    ),
    (
      "How many drivers retired in the {GPName}?",
      lambda rr, lr: str((rr.Status.str.lower()=="retired").sum())
    ),
    (
      "Which team scored the most points?",
      lambda rr, lr: rr.groupby("TeamName").Points.sum().idxmax()
    ),
    (
      "What is the country code of {driver}?",
      lambda rr, lr, driver: rr.loc[rr.FullName == driver, "CountryCode"].iloc[0]
    ),

    # ----- Race Laps questions -----
    (
      "Which driver had the overall fastest lap in the {GPName}?",
      lambda rr, lr: lr.loc[lr.FastestLapOverall == lr.FastestLapOverall.min(), "FullName"].iloc[0]
    ),
    (
      "What was the overall fastest lap time?",
      lambda rr, lr: lr.FastestLapOverall.min()
    ),
    (
      "Who set the fastest sector 1 time?",
      lambda rr, lr: lr.loc[lr.FastestSector1 == lr.FastestSector1.min(), "FullName"].iloc[0]
    ),
    (
      "What was the fastest sector 1 time?",
      lambda rr, lr: lr.FastestSector1.min()
    ),
    (
      "Who set the fastest sector 2 time?",
      lambda rr, lr: lr.loc[lr.FastestSector2 == lr.FastestSector2.min(), "FullName"].iloc[0]
    ),
    (
      "What was the fastest sector 2 time?",
      lambda rr, lr: lr.FastestSector2.min()
    ),
    (
      "Who set the fastest sector 3 time?",
      lambda rr, lr: lr.loc[lr.FastestSector3 == lr.FastestSector3.min(), "FullName"].iloc[0]
    ),
    (
      "What was the fastest sector 3 time?",
      lambda rr, lr: lr.FastestSector3.min()
    ),
    (
      "Which driver used the {Compound} tyre for the most laps?",
      lambda rr, lr, Compound: lr.groupby("Driver").LapsOnCompound.sum().idxmax()
    ),
    (
      "How many laps did {driver} do on {Compound} tyres?",
      lambda rr, lr, driver, Compound: str(
         lr.loc[(lr.FullName==driver)&(lr.Compound==Compound), "LapsOnCompound"].sum()
      )
    ),
    (
      "What was {driver}'s fastest lap time on {Compound} tyres?",
      lambda rr, lr, driver, Compound: lr.loc[
         (lr.FullName==driver)&(lr.Compound==Compound),
         "FastestLapOnCompound"
      ].min()
    ),
    (
      "Which driver made the most pit stops?",
      lambda rr, lr: lr.groupby("FullName").PitStops.sum().idxmax()
    ),
    (
      "How many pit stops did {driver} make?",
      lambda rr, lr, driver: str(lr.loc[lr.FullName==driver, "PitStops"].sum())
    ),
]

In [38]:
import string
from itertools import product

GP_NAME = "Monaco Grand Prix"
qas = []

# Precompute value domains
pos_vals      = sorted(race_res.Position.astype(int).unique())
driver_vals   = race_res.FullName.unique().tolist()
compound_vals = lap_res.Compound.unique().tolist()

formatter = string.Formatter()

for i, (tmpl, fn) in enumerate(qa_specs):
    # 1) Figure out which fields appear in this template
    fields = [fname for _, fname, _, _ in formatter.parse(tmpl) if fname]
    # e.g. ['GPName','driver'] or ['GPName','Compound'] or just ['GPName']

    # 2) Separate out GPName
    extra_fields = [f for f in fields if f != "GPName"]

    # 3) Build the cartesian product of values for those extra fields
    if extra_fields:
        # Map each field name to its list of possible values
        domain = []
        for f in extra_fields:
            if f == "pos":
                domain.append((f, pos_vals))
            elif f == "driver":
                domain.append((f, driver_vals))
            elif f == "Compound":
                domain.append((f, compound_vals))
            else:
                raise ValueError(f"Unknown placeholder {f}")

        # Iterate every combination
        for combo in product(*[vals for _, vals in domain]):
            # combo is a tuple of selected values in the same order as domain
            template_kwargs = {"GPName": GP_NAME}
            fn_kwargs       = {}
            for (field_name, _), value in zip(domain, combo):
                template_kwargs[field_name] = value
                fn_kwargs[field_name]       = value

            # 4) Format question and compute answer
            question = tmpl.format(**template_kwargs)
            answer   = fn(race_res, lap_res, **fn_kwargs)
            answer   = str(answer)
            start    = context.find(answer)

            qas.append({
                "id":       f"q{i}_" + "_".join(f"{name}{val}" for (name,_), val in zip(domain, combo)),
                "question": question,
                "answers":  [{"text": answer, "answer_start": start}],
                "is_impossible": False
            })

    else:
        # No extra fields → single question
        question = tmpl.format(GPName=GP_NAME)
        answer   = fn(race_res, lap_res)
        answer   = str(answer)
        start    = context.find(answer)

        qas.append({
            "id": f"q{i}",
            "question": question,
            "answers": [{"text": answer, "answer_start": start}],
            "is_impossible": False
        })

In [39]:
import json

squad = {
    "data": [{
        "title": GP_NAME,
        "paragraphs": [{
            "context": context,
            "qas": qas
        }]
    }]
}

# with open("f1_gp_qa.json", "w") as f:
    # json.dump(squad, f, indent=2)
# print("Saved f1_gp_qa.json")

Saved f1_gp_qa.json


In [40]:
from sklearn.model_selection import train_test_split

orig = squad["data"][0]["paragraphs"][0]
train_qas, val_qas = train_test_split(orig["qas"], test_size=0.2, random_state=42)

train_squad = {"data":[{"title":GP_NAME,"paragraphs":[{"context":context,"qas":train_qas}]}]}
val_squad   = {"data":[{"title":GP_NAME,"paragraphs":[{"context":context,"qas":val_qas}]}]}

with open("train.json","w") as f: json.dump(train_squad, f, indent=2)
with open("val.json",  "w") as f: json.dump(val_squad,   f, indent=2)
print("Wrote train.json & val.json")

Wrote train.json & val.json
