# Overview

Cross-validate our parsing results against Ergast data for the 2023 season.

## Lap times

In [1]:
import os
import re
import sys
import zipfile

import pandas as pd
import requests
from tqdm.auto import tqdm

sys.path.append('..')

from parse_race_history_chart import parse_race_history_chart


# Download Ergast data, only when the extracted lap times are not already on disk.
# NOTE(review): the Ergast service has reportedly been retired; confirm this URL still serves the dump.
if not os.path.exists('../data/lap_times.csv'):
    try:
        # Stream the archive to disk. Fail fast on an HTTP error or a stalled connection
        # rather than saving an error page that only blows up later inside `zipfile`.
        with requests.get('https://ergast.com/downloads/f1db_csv.zip', stream=True, timeout=60) as resp:
            resp.raise_for_status()
            with open('f1db_csv.zip', 'wb') as f:
                for chunk in resp.iter_content(chunk_size=4096):
                    f.write(chunk)
        os.makedirs('../data', exist_ok=True)
        with zipfile.ZipFile('f1db_csv.zip', 'r') as zip_ref:
            zip_ref.extractall('../data')
    finally:
        # Always remove the temporary archive, even when the download or extraction failed
        if os.path.exists('f1db_csv.zip'):
            os.remove('f1db_csv.zip')

# Ergast lap times for the 2023 season, reduced to the columns `lap`, `time`, `round`, `number`
ergast_laps = pd.read_csv('../data/lap_times.csv', usecols=['raceId', 'driverId', 'lap', 'time'])
season_rounds = pd.read_csv('../data/races.csv', usecols=['raceId', 'year', 'round'])
season_rounds = season_rounds.loc[season_rounds['year'].eq(2023)]
car_numbers = pd.read_csv('../data/drivers.csv', usecols=['driverId', 'number'])
lap_times = (
    ergast_laps
    .merge(season_rounds, on='raceId', how='inner')   # keeps 2023 laps only and adds `round`
    .merge(car_numbers, on='driverId', how='inner')   # adds the driver's car `number`
    .drop(columns=['raceId', 'driverId', 'year'])     # the join keys are no longer needed
)
# Free the intermediate frames; only `lap_times` is used afterwards
del ergast_laps, season_rounds, car_numbers

# Our parsing results: one Race History Chart PDF per round, rounds 1-22 of 2023.
# The loop variable is named `round_no` (not `round`) so the builtin `round()` is not
# shadowed for the rest of the kernel session; the comprehension keeps it out of the
# notebook namespace entirely.
lap_times_parsing = pd.concat(
    [
        # Tag every parsed lap with its round so it can be matched against the Ergast data
        parse_race_history_chart(f'../data/history2023{round_no:02d}.pdf').assign(round=round_no)
        for round_no in tqdm(range(1, 23), desc='Parsing Race History Chart PDFs in 2023')
    ],
    ignore_index=True,
).rename(columns={'time': 'time_parsing'})  # avoid clashing with the Ergast `time` column

# Shift lap No. for those lapped cars, e.g. car with `gap == "1 LAP"` should have `lap -= 1`.
# Vectorised: extract the number in front of "LAP" from `gap`. Rows whose gap does not
# match (time gaps, "PIT", missing values) extract to NaN and are treated as 0 laps behind.
laps_behind = (
    pd.to_numeric(
        lap_times_parsing['gap'].str.extract(r'(\d+) LAP', expand=False),
        errors='coerce',
    )
    .fillna(0)
    .astype(int)
)
lap_times_parsing['lap'] = lap_times_parsing['lap'] - laps_behind

Parsing Race History Chart PDFs in 2023:   0%|          | 0/22 [00:00<?, ?it/s]

In [2]:
# Ergast lists Verstappen under car number 33, which is renumbered to 1 here
# so that it lines up with `driver_no` from the parsed PDFs (Ver: 33 --> 1)
is_car_33 = lap_times['number'].eq('33')
lap_times.loc[is_car_33, 'number'] = '1'

# Outer join so laps present on only one side are kept; `_merge` records which side
matched = pd.merge(
    lap_times,
    lap_times_parsing,
    how='outer',
    left_on=['round', 'lap', 'number'],
    right_on=['round', 'lap', 'driver_no'],
    indicator=True,
)

In [36]:
# Keep only the laps found in both the Ergast data and our parsing results
temp = matched.loc[matched['_merge'].eq('both')]

In [13]:
# Reproducible sample of ten laps where the Ergast time differs from the parsed time
temp.loc[temp['time'].ne(temp['time_parsing'])].sample(random_state=1234, n=10)

Unnamed: 0,lap,time,round,number,driver_no,gap,time_parsing,_merge
10103,44,1:27.442,9,2,2,PIT,1:15.763,both
21653,25,1:34.160,20,81,81,PIT,1:21.760,both
2941,54,6:35.214,3,77,77,PIT,1:50.643,both
21281,3,1:49.154,20,81,81,PIT,28:04.821,both
6118,51,1:27.706,6,4,4,PIT,1:35.974,both
10076,42,1:28.454,9,81,81,PIT,1:14.404,both
2950,55,7:47.991,3,21,21,PIT,9:18.552,both
7431,40,1:43.035,7,77,77,PIT,1:27.553,both
1028,55,2:04.847,1,4,4,PIT,1:39.789,both
22130,56,1:34.215,20,3,3,PIT,1:19.237,both


In [11]:
# Laps whose times disagree and whose parsed `gap` is not "PIT"
temp.loc[temp['time'].ne(temp['time_parsing']) & temp['gap'].ne('PIT')]

Unnamed: 0,lap,time,round,number,driver_no,gap,time_parsing,_merge


In [81]:
# jolpica/jolpica-f1#13
# (driver_no, lap) pairs to look up in round 9.
# NOTE(review): presumably the laps reported as wrong in that issue; confirm against it.
errors = [
    (14, 48),
    (77, 46),
    (21, 46),
    (10, 57),
    (44, 49),
    (20, 45),
    (4, 50),
    (31, 46),
    (81, 49),
    (63, 48),
    (55, 54),
    (18, 48),
    (22, 37),
    (24, 50),
]
# Filter once and cast with `astype` on the filtered result. Assigning a column onto
# a filtered slice instead can trigger pandas' SettingWithCopyWarning.
temp = (
    matched[(matched['_merge'] == 'both') & (matched['round'] == 9)]
    .astype({'driver_no': int})              # integer car numbers so they match the tuples above
    .sort_values(by=['driver_no', 'lap'])
    .set_index(['driver_no', 'lap'])
)
temp.loc[errors]

Unnamed: 0_level_0,Unnamed: 1_level_0,time,round,number,gap,time_parsing,_merge
driver_no,lap,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
14,48,1:09.634,9,14,52.323,1:09.634,both
77,46,1:10.074,9,77,1 LAP,1:10.074,both
21,46,1:09.852,9,21,1 LAP,1:09.852,both
10,57,1:09.507,9,10,59.717,1:09.507,both
44,49,1:09.735,9,44,51.505,1:09.735,both
20,45,1:10.703,9,20,1 LAP,1:10.703,both
4,50,1:08.972,9,4,22.314,1:08.972,both
31,46,1:10.451,9,31,1 LAP,1:10.451,both
81,49,1:09.896,9,81,1 LAP,1:09.896,both
63,48,1:09.734,9,63,65.145,1:09.734,both
