In [None]:
!pip install duckdb -q

import pandas as pd
import duckdb

def get_race_stats_df(start_year: int, end_year: int) -> pd.DataFrame:
    """Scrape Indianapolis 500 race results for a range of years into one frame.

    For every year in ``range(start_year, end_year)`` the first HTML table
    found at ``<base_url>/<year>`` is read, cast to fixed dtypes, renamed to
    snake_case columns, tagged with a ``year`` column, and has ``$`` and ``,``
    stripped from ``winnings`` (which remains a string column).

    Args:
        start_year: First season to fetch (inclusive).
        end_year: Season to stop before (exclusive, as with ``range``).

    Returns:
        The per-year tables stacked in year order with a fresh 0..n-1 index.
        If the year range is empty, an empty frame with the same columns and
        dtypes is returned and no request is made.
    """
    base_url = 'https://www.indianapolismotorspeedway.com/events/indy500/history/historical-stats/race-stats/race-results'

    og_column_names = ['Finish','Start', 'Qual. Speed', 'Car Num.', 'Driver', 'Car Name/Entrant', 'Make/Model', 'Status', 'Laps', 'Led', 'Winnings']
    column_dtypes = ['int8', 'int8', 'float32', 'int8', 'string', 'string', 'string', 'string', 'int32', 'int32', 'string']
    new_column_names = ['finish', 'start', 'qual_speed', 'car_num', 'driver', 'car_name', 'model', 'status', 'laps', 'led', 'winnings']

    dtype_mapper = dict(zip(og_column_names, column_dtypes))
    column_mapper = dict(zip(og_column_names, new_column_names))

    years = range(start_year, end_year)
    if not years:
        # pd.concat raises ValueError on an empty list, so build the empty
        # result explicitly with the same schema the non-empty path produces.
        return pd.DataFrame(
            {
                **{
                    name: pd.Series(dtype=dtype)
                    for name, dtype in zip(new_column_names, column_dtypes)
                },
                'year': pd.Series(dtype='int64'),
            }
        )

    frames = [
        (
            pd.read_html(f'{base_url}/{year}')[0]
            .astype(dtype=dtype_mapper)
            .rename(columns=column_mapper)
            .assign(year=year)
            # Raw string: '\$' in a plain literal is an invalid escape sequence.
            .assign(winnings=lambda df_: df_.winnings.str.replace(r'[\$,]', '', regex=True))  # $1,000,000 -> 1000000
        )
        for year in years
    ]

    return pd.concat(frames).reset_index(drop=True)


def main(start_year: int, end_year: int=2024):
    """Fetch race stats for the given years and save them as a CSV file.

    The output is written to ``race_stats_<start_year>-<end_year>.csv`` in the
    current working directory, without the index column. The years are passed
    straight to ``get_race_stats_df``, which iterates
    ``range(start_year, end_year)``; the file name therefore shows
    ``end_year`` even though that season itself is not fetched.
    """
    output_path = f'race_stats_{start_year}-{end_year}.csv'
    race_stats = get_race_stats_df(start_year, end_year)
    race_stats.to_csv(output_path, index=False)


# Run the scrape only when executed directly (script or notebook cell), so
# importing this module does not trigger network requests or write a file.
if __name__ == '__main__':
    main(start_year=2000)