In [1]:
from typing import Annotated
from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import time
import htmltabletomd
import re
from bs4 import BeautifulSoup
from openai import OpenAI
import json
import pandas as pd
import htmlmin
from io import StringIO
import sys
import pandas as pd
from IPython.display import display, Markdown

def extract_text_from_dynamic_site(url, wait_time=10):
    """Load `url` in headless Chrome and return the page parsed by BeautifulSoup.

    Args:
        url: Address of the page to open.
        wait_time: Seconds to sleep after navigation before the page source
            is read.

    Returns:
        The BeautifulSoup document built from the browser's page source, or
        None when loading/parsing raised (the error is printed, not re-raised).
    """
    browser_options = Options()
    for flag in (
        "--headless",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-blink-features=AutomationControlled",
    ):
        browser_options.add_argument(flag)
    browser_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    browser_options.add_experimental_option('useAutomationExtension', False)

    browser = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=browser_options,
    )

    try:
        browser.get(url)
        print(f"Loaded: {url}")
        time.sleep(wait_time)
        return BeautifulSoup(browser.page_source, 'html.parser')
    except Exception as e:
        print(f"Error: {e}")
        return None
    finally:
        # Always release the browser process, even when the load failed.
        browser.quit()

def get_f1_driver_standings(
    standings_type: Annotated[str, "Select 'drivers' or 'team' championship standings."]
) -> str:
    """Get the current F1 drivers or teams standings. 
    Use it to determine the points for each driver in the driver's championship or the points for each team in the teams' championship."""
    # Returns the first <table> of the standings page converted to a markdown
    # table. Raises RuntimeError when the page could not be loaded or contains
    # no table, instead of failing later with an opaque AttributeError/IndexError.

    # Any value other than "team" falls back to the drivers table.
    if standings_type == "team":
        url = "https://www.formula1.com/en/results/2025/team"
    else:
        url = "https://www.formula1.com/en/results/2025/drivers"

    soup_og = extract_text_from_dynamic_site(url, wait_time=3)
    if soup_og is None:
        # extract_text_from_dynamic_site prints the error and returns None on failure.
        raise RuntimeError(f"Could not load standings page: {url}")
    html_table = soup_og.find('table')
    if html_table is None:
        raise RuntimeError(f"No standings table found at: {url}")

    html_text = str(html_table.prettify())
    # Applied in this order. The last pattern is a backslash followed by a
    # literal newline, which the regex engine reads as an escaped newline: it
    # removes each line break together with the spaces that follow it.
    for pattern in (
        " class=\"[^\"]*\"",
        "<img[^>]*>",
        "<a[^>]*>",
        "<span[^>]*>",
        "</span[^>]*>",
        "<br>",
        "<p>",
        "\\\n *",
    ):
        html_text = re.sub(pattern, "", html_text)

    drivers_standings_table = htmltabletomd.convert_table(html_text)
    return drivers_standings_table

# Retrieve F1 schedule (with time zones)
def get_f1_schedule() -> str:
    """Load the 2025 F1 race calendar page from the F1 website.

    Note: the fetched document is neither parsed nor returned here, so a call
    currently evaluates to None despite the `str` annotation.
    """
    schedule_url = "https://www.formula1.com/en/racing/2025.html"
    schedule_soup = extract_text_from_dynamic_site(schedule_url, wait_time=3)

In [2]:
# Fetch the 2025 race calendar page; the tiles are parsed further down in this cell.
url = "https://www.formula1.com/en/racing/2025.html"
soup_og = extract_text_from_dynamic_site(url=url, wait_time=3)
def get_info(x):
    """Return the text fragments found between tags of an element.

    `x` must expose `prettify()` returning its markup as a string. Selected
    tags and attributes are stripped or normalised, the remaining lines are
    trimmed and concatenated, and every run of text that sits directly between
    a `>` and a `<` is collected.

    Returns:
        List of text fragments in document order (empty when none are found).
    """
    # (pattern, replacement) pairs, applied strictly in this order.
    substitutions = (
        (" class=\"[^\"]*\"", ""),
        ("<img[^>]*>", ""),
        ("<svg[^>]*>", ""),
        ("</svg[^>]*>", ""),
        ("<path[^>]*>", ""),
        ("</path[^>]*>", ""),
        ("<a[^>]*>", ""),
        ("<span[^>]*>", "<span>"),
        ("</span[^>]*>", "</span>"),
        ("<defs[^>]*>", ""),
        ("</defs[^>]*>", ""),
        ("<g[^>]*>", ""),
        ("</g[^>]*>", ""),
        ("<clippath[^>]*>", ""),
        ("</clippath[^>]*>", ""),
        ("<br>", ""),
        ("<p>", ""),
    )
    markup = x.prettify()
    for pattern, replacement in substitutions:
        markup = re.sub(pattern, replacement, markup)

    # Drop blank lines and indentation, then glue everything into one string.
    trimmed_lines = (line.strip() for line in markup.splitlines())
    event_text = ''.join(piece for piece in trimmed_lines if piece != "")
    return re.findall(r"(?<=>)[^<>]+(?=<)", event_text)
# Candidate tiles are the elements tagged as event-tile click targets; only
# those that contain at least one <title> element are treated as race cards.
possible_races = soup_og.find_all(attrs={"data-f1rd-a7s-click": "event_tile_click"})
race_cards = [race for race in possible_races if len(race.find_all('title')) > 0]
event_info_list = [get_info(race) for race in race_cards]

# The positions of the country and the dates inside a card's fragment list
# depend on how many fragments the card produced. Fragment 0 is used as the
# key (named round_label to avoid shadowing the builtin `round`).
event_dict = {}
for event in event_info_list:
    fragment_count = len(event)
    if fragment_count == 5:
        country_idx, dates_idx = 2, 4
    elif fragment_count == 6:
        country_idx, dates_idx = 3, 5
    elif fragment_count > 6:
        country_idx, dates_idx = 4, 2
    else:
        # Fewer than 5 fragments: no known layout. Skip the card rather than
        # reuse the previous card's values (or fail on the first card).
        continue
    round_label = event[0]
    event_dict[round_label] = {"Country": event[country_idx], "Dates": event[dates_idx]}
schedule_text = pd.DataFrame(event_dict).T.to_markdown()

Loaded: https://www.formula1.com/en/racing/2025.html


In [3]:
# Write schedule_text to a text file.
# The encoding is pinned so the cache does not depend on the platform default.
with open("data-cache/schedule_text.txt", "w", encoding="utf-8") as f:
    f.write(schedule_text)

In [4]:
# Drivers: fetch the standings page and derive the raw markdown, DataFrame and HTML variants
url = "https://www.formula1.com/en/results/2025/drivers"
soup_og = extract_text_from_dynamic_site(url, wait_time=3)
html_table = soup_og.find_all('table')[0]
html_text = str(html_table.prettify())
# Remove these patterns from the table markup, in this order.
for _pattern in (
    " class=\"[^\"]*\"",
    "<img[^>]*>",
    "<a[^>]*>",
    "<span[^>]*>",
    "</span[^>]*>",
    "<br>",
    "<p>",
    "\\\n *",
):
    html_text = re.sub(_pattern, "", html_text)
dr_raw_markdown_table = htmltabletomd.convert_table(html_text)
drivers = pd.read_html(StringIO(html_text))[0]
dr_html_table = drivers.set_index('POS.')[['DRIVER', 'TEAM', 'PTS.']].to_html(border='')
# Minify, then drop the pandas-specific class/style attributes.
dr_minified = (
    htmlmin.minify(dr_html_table, remove_empty_space=True)
    .replace(" class=dataframe", "")
    .replace(' style="text-align: right;"', "")
)

Loaded: https://www.formula1.com/en/results/2025/drivers


In [5]:
# Write dr_minified to an HTML file.
# The encoding is pinned so non-ASCII names are written the same way on every platform.
with open("data-cache/drivers_standings.html", "w", encoding="utf-8") as f:
    f.write(dr_minified)

In [7]:
# Standings indexed by position, shared by the HTML and markdown renderings
dr_indexed = drivers.set_index('POS.')[['DRIVER', 'TEAM', 'PTS.']]
# HTML table whitespace
dr_html_table = dr_indexed.to_html(border='')
# Minify HTML (class/style attributes are kept in this variant)
dr_minified = htmlmin.minify(dr_html_table, remove_empty_space=True)
# Markdown table
dr_markdown_table = dr_indexed.to_markdown()

# Rows as (position, driver, team, points) for the natural-language variants
dr_rows = drivers[['POS.', 'DRIVER', 'TEAM', 'PTS.']].values
# Natural language - New line
dr_nl_1 = '\n'.join(
    f"{pos}. {driver} has {pts} points and drives for {team}."
    for pos, driver, team, pts in dr_rows
)
# Natural language wrapped in <position> tags; joined inline and per line below
dr_tagged = [
    f"<{pos}>{driver} has {pts} points and drives for {team}</{pos}>"
    for pos, driver, team, pts in dr_rows
]
# Natural language - <> tag inline
dr_nl_2 = ''.join(dr_tagged)
# Natural language - <> tag new line
dr_nl_3 = '\n'.join(dr_tagged)

In [8]:
# Every rendering of the drivers standings, keyed by format name
dr_table_formats = dict(
    html_table=dr_html_table,
    html_mini=dr_minified,
    markdown_table=dr_markdown_table,
    raw_markdown_table=dr_raw_markdown_table,
    natural_language_new_line=dr_nl_1,
    natural_language_inline_tags=dr_nl_2,
    natural_language_new_line_tags=dr_nl_3,
)

In [9]:
# Teams: fetch the standings page and derive the raw markdown, DataFrame and HTML variants
url = "https://www.formula1.com/en/results/2025/team"
soup_og = extract_text_from_dynamic_site(url, wait_time=3)
html_table = soup_og.find_all('table')[0]
html_text = str(html_table.prettify())
# Remove these patterns from the table markup, in this order.
for _pattern in (
    " class=\"[^\"]*\"",
    "<img[^>]*>",
    "<a[^>]*>",
    "<span[^>]*>",
    "</span[^>]*>",
    "<br>",
    "<p>",
    "\\\n *",
):
    html_text = re.sub(_pattern, "", html_text)
tm_raw_markdown_table = htmltabletomd.convert_table(html_text)
teams = pd.read_html(StringIO(html_text))[0]
# HTML table whitespace
tm_html_table = teams.set_index('POS.')[['TEAM', 'PTS.']].to_html(border='')
# Minify HTML, then drop the pandas-specific class/style attributes
tm_minified = (
    htmlmin.minify(tm_html_table, remove_empty_space=True)
    .replace(" class=dataframe", "")
    .replace(' style="text-align: right;"', "")
)

Loaded: https://www.formula1.com/en/results/2025/team


In [10]:
# Write tm_minified to an HTML file.
# The encoding is pinned so non-ASCII names are written the same way on every platform.
with open("data-cache/team_standings.html", "w", encoding="utf-8") as f:
    f.write(tm_minified)

In [11]:
# Standings indexed by position, shared by the HTML and markdown renderings
tm_indexed = teams.set_index('POS.')[['TEAM', 'PTS.']]
# HTML table whitespace
tm_html_table = tm_indexed.to_html(border='')
# Minify HTML (class/style attributes are kept in this variant)
tm_minified = htmlmin.minify(tm_html_table, remove_empty_space=True)
# Markdown table
tm_markdown_table = tm_indexed.to_markdown()

# Rows as (position, team, points) for the natural-language variants
tm_rows = teams[['POS.', 'TEAM', 'PTS.']].values
# Natural language - New line
tm_nl_1 = '\n'.join(f"{pos}. {team} has {pts} points." for pos, team, pts in tm_rows)
# Natural language wrapped in <position> tags; joined inline and per line below
tm_tagged = [f"<{pos}>{team} has {pts} points</{pos}>" for pos, team, pts in tm_rows]
# Natural language - <> tag inline
tm_nl_2 = ''.join(tm_tagged)
# Natural language - <> tag new line
tm_nl_3 = '\n'.join(tm_tagged)

In [12]:
# Every rendering of the team standings, keyed by format name
tm_table_formats = dict(
    html_table=tm_html_table,
    html_mini=tm_minified,
    markdown_table=tm_markdown_table,
    raw_markdown_table=tm_raw_markdown_table,
    natural_language_new_line=tm_nl_1,
    natural_language_inline_tags=tm_nl_2,
    natural_language_new_line_tags=tm_nl_3,
)

In [None]:
# Evaluation inputs: prompt|answer pairs, and the date injected into the system prompt
qas = pd.read_csv('test_prompts.txt', sep='|', header=None, names=['Prompt', 'Answer']).values
curr_date = pd.Timestamp.now().strftime('%b %d, %Y')

# Formats to evaluate, in a fixed order. This was previously preceded by a
# set intersection of the two format dicts that was overwritten immediately
# and had a nondeterministic order; the check below keeps its intent.
table_formats = ['html_table',
 'html_mini',
 'raw_markdown_table',
 'markdown_table']
missing_formats = [
    name for name in table_formats
    if name not in dr_table_formats or name not in tm_table_formats
]
assert not missing_formats, f"Formats missing from the standings dicts: {missing_formats}"

### Results lookup
- Select race number, race location, driver, positions
- Query: Who was first in imola?
- API: positions=[1], races=['imola'], drivers=None, race_numbers=None
- Query: What are the last 3 results from Oscar Piastri?
- API: positions=None, races=[-3:], drivers=['Oscar Piastri'], race_numbers=None

#### Steps:
1. Automate extraction from web to local DB
2. Create sql-like syntax for extracting to local DB
3. Maintain local DB cache

### Testing

In [65]:
# For every (prompt, table format) pair: ask the model with tool access, feed
# back the requested standings, then let the same model grade the answer
# against the reference answer.

# Point to the local server
client = OpenAI(base_url="http://localhost:1234/v1", api_key="lm-studio")
model = "qwen/qwen3-4b-2507"

# Both tools take no arguments. `additionalProperties` is part of the
# JSON-schema object under `parameters`, not of the function entry itself.
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_f1_driver_standings",
            "description": "Get the current F1 drivers standings. Use it to determine the points for each driver in the driver's championship.",
            "parameters": {
                "type": "object",
                "properties": {},
                "additionalProperties": False,
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "get_f1_team_standings",
            "description": "Get the current F1 team standings. Use it to determine the points for each team in the teams' championship.",
            "parameters": {
                "type": "object",
                "properties": {},
                "additionalProperties": False,
            },
        },
    },
]

# The system prompt does not depend on the prompt or table format, so build it once.
system_message = {
    "role": "system",
    "content": f"""You are an expert in Formula 1 that will assist the user with questions regarding the sport.
There are 24 races in the current season and the schedule is as follows:
{schedule_text}
The current date is {curr_date}. Consider this date when asked about the next race (closest date after the current date) or the previous race (closest date before the current date).
There have been a few changes in terms of driver line-ups.
If you need to know which driver belongs to a certain team or which team a driver belongs to, use `get_f1_driver_standings`.
Use `get_f1_driver_standings` and `get_f1_team_standings` to assist the user.""",
}

# Only the first three prompts are evaluated in this run.
qas_subset = qas[:3]

results = []
for i, (prompt, answer) in enumerate(qas_subset):
    for curr_table_name in table_formats:
        # Tool name -> text handed back to the model for this table format.
        # An explicit table replaces the lookup of arbitrary module attributes
        # by model-supplied name, and avoids redefining (and shadowing) the
        # scraping function `get_f1_driver_standings` on every iteration.
        tool_outputs = {
            "get_f1_driver_standings": dr_table_formats[curr_table_name],
            "get_f1_team_standings": tm_table_formats[curr_table_name],
        }

        messages = [
            system_message,
            {
                "role": "user",
                "content": prompt,
            },
        ]

        # LM Studio
        response = client.chat.completions.create(
            model=model,
            messages=messages,
            tools=tools,
        )
        first_response = response

        # If the model asked for tools, answer each call and query it again.
        tool_calls = getattr(response.choices[0].message, "tool_calls", None)
        if tool_calls:
            assistant_tool_call_request_message = {
                "role": "assistant",
                "tool_calls": [
                    {
                        "id": tool_call.id,
                        "type": tool_call.type,
                        "function": tool_call.function,
                    }
                    for tool_call in tool_calls
                ],
            }

            # One tool message per call, each tied to its own tool_call_id
            # (previously a single message carried only the last call's id).
            tool_result_messages = []
            for tool_call in tool_calls:
                function_name = tool_call.function.name
                result = tool_outputs.get(function_name, f"Unknown tool: {function_name}")
                tool_result_messages.append(
                    {
                        "role": "tool",
                        "content": json.dumps({function_name: result}),
                        "tool_call_id": tool_call.id,
                    }
                )

            # Send the tool call results back to the model
            response = client.chat.completions.create(
                model=model,
                messages=[*messages, assistant_tool_call_request_message, *tool_result_messages],
            )

        # `content` can be None; fall back to an empty answer so len() below works.
        final_response = response.choices[0].message.content or ""
        grader_messages = [
            {
                "role":"system",
                "content":"You a teacher evaluating student answers to questions about Formula 1. You have a <question> a <correct_answer> and a <student_answer>. Only respond with 'Correct' or 'Incorrect'.",
            },
            {
                "role": "user",
                "content": f"<question>{prompt}</question><correct_answer>{answer}</correct_answer><student_answer>{final_response}</student_answer>",
            },
        ]

        # LM Studio
        response = client.chat.completions.create(
            model=model,
            messages=grader_messages,
        )
        # Strip surrounding whitespace so the exact-match mapping below still applies.
        evaluation = (response.choices[0].message.content or "").strip()
        results.append([model, curr_table_name, prompt, answer, final_response, len(final_response), evaluation])
        # Pad so a shorter label fully overwrites the previous one after '\r'.
        print(f"{i+1}/{len(qas_subset)} - {curr_table_name}".ljust(40), end='\r')
results_df = pd.DataFrame(results, columns=['model', 'table_format', 'prompt', 'correct_answer', 'final_answer', 'final_answer_length', 'evaluation'])
# Anything other than an exact 'Correct'/'Incorrect' verdict maps to NaN.
results_df['good'] = results_df['evaluation'].map({'Correct': 1, 'Incorrect': 0})

3/13 - markdown_tableable

In [66]:
# Mean correctness and answer length per table format
format_summary = results_df.groupby('table_format')[['good', 'final_answer_length']].mean()
format_summary.reset_index()

Unnamed: 0,table_format,good,final_answer_length
0,html_mini,0.666667,122.0
1,html_table,0.666667,168.333333
2,markdown_table,0.666667,122.666667
3,raw_markdown_table,0.666667,158.333333


In [67]:
# Per-prompt verdicts for the minified HTML format
results_df.loc[results_df['table_format'] == 'html_mini', ['prompt', 'good']]

Unnamed: 0,prompt,good
1,What is the next race?,1
5,When is the Mexican grand prix?,1
9,What is the last race on the calendar?,0


In [68]:
# Mean correctness and answer length per prompt, across all table formats
metric_columns = ['good', 'final_answer_length']
results_df.groupby('prompt')[metric_columns].mean()

Unnamed: 0_level_0,good,final_answer_length
prompt,Unnamed: 1_level_1,Unnamed: 2_level_1
What is the last race on the calendar?,0.0,107.0
What is the next race?,1.0,268.5
When is the Mexican grand prix?,1.0,53.0


In [71]:
# Pick one table format (tf) and one prompt (pmpt) and render the model's answer
tf = 0
pmpt = 2
selected_format = table_formats[tf]
selected_qa = qas[pmpt]
print(selected_format)
print(selected_qa)
is_selected = (results_df['table_format'] == selected_format) & (results_df['prompt'] == selected_qa[0])
display(Markdown(results_df[is_selected]['final_answer'].iloc[0]))

html_table
['What is the last race on the calendar?'
 'Abu Dhabi Grand Prix on 5 to 7 December']


The last race on the calendar is the Abu Dhabi Grand Prix, which takes place from 5th to 7th December 2025.

In [None]:
# Per-prompt means restricted to the table format selected by `tf`
selected_format_results = results_df[results_df['table_format'] == table_formats[tf]]
selected_format_results.groupby('prompt')[['good', 'final_answer_length']].mean()

# TODO
- Improve caching by using race schedule
- Test different schedule formats
- Add result lookup functions
- Fix last race and current race eval