In [162]:
# import libraries
from io import StringIO
from urllib.request import urlopen

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

In [360]:
# Get url functions

def _collect_result_urls(year, listing, marker):
    """Return the absolute URLs linked from one season listing page.

    Args:
        year: Season to look up. It is rendered with ``str`` both to build
            the listing URL and to filter the links found on the page.
        listing: Listing page name without extension ('races', 'drivers'
            or 'team').
        marker: Substring an href must contain to be kept.

    Returns:
        list[str]: Absolute URLs in page order, each appearing once.
    """
    f1_url = 'https://www.formula1.com'
    year_text = str(year)
    listing_url = f1_url + '/en/results.html/' + year_text + '/' + listing + '.html'

    # Use the `urlopen` name imported at the top of the notebook; the
    # `urllib` package itself is never bound there. The `with` block closes
    # the HTTP response once the page has been read.
    with urlopen(listing_url) as response:
        source = response.read()
    soup = BeautifulSoup(source, 'lxml')

    urls = []
    seen = set()  # constant-time duplicate check; `urls` keeps page order
    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        if href is None:
            # Anchors without an href can never match the filters below.
            continue
        if year_text not in href or marker not in href:
            continue
        full_url = f1_url + href
        if full_url not in seen:
            seen.add(full_url)
            urls.append(full_url)
    return urls


def get_race_urls(year):
    """Return the race-result page URLs listed for the given season.

    Args:
        year: Season to look up (anything ``str`` can render, e.g. 2020).

    Returns:
        list[str]: Unique absolute URLs whose href contains the year and
        'race-result', in the order they appear on the races listing page.
    """
    return _collect_result_urls(year, 'races', 'race-result')


def get_drivers_urls(year):
    """Return the per-driver result page URLs listed for the given season.

    Args:
        year: Season to look up (anything ``str`` can render, e.g. 2020).

    Returns:
        list[str]: Unique absolute URLs whose href contains the year and
        '/drivers/', in the order they appear on the drivers listing page.
    """
    return _collect_result_urls(year, 'drivers', '/drivers/')


def get_team_urls(year):
    """Return the per-team result page URLs listed for the given season.

    Args:
        year: Season to look up (anything ``str`` can render, e.g. 2020).

    Returns:
        list[str]: Unique absolute URLs whose href contains the year and
        '/team/', in the order they appear on the team listing page.
    """
    return _collect_result_urls(year, 'team', '/team/')

In [428]:
# Year parameters
# Seasons to scrape, both ends inclusive. range() excludes its stop value,
# hence the "+ 1" on the last season.
FIRST_YEAR = 2020
LAST_YEAR = 2021
years = range(FIRST_YEAR, LAST_YEAR + 1)

In [440]:
# Generate df for teams
# Scrape every team's season results page and stack them into one frame with
# the columns: year, race_date, race_grand_prix_name, team_name, points.
teams_df_list = []

for year in years:
    urls = get_team_urls(year)
    for url in urls:
        # Team name = last URL segment without its extension, underscores
        # turned into spaces, title-cased.
        team_name = url.split('/')[-1].split('.')[0].replace('_', ' ').title()
        # Close the HTTP response as soon as the page has been read.
        with urlopen(url) as response:
            source = response.read()
        soup = BeautifulSoup(source, 'lxml')
        table = soup.find_all('table')[0]
        # read_html expects a path or file-like object; passing literal HTML
        # as a plain string is deprecated in recent pandas, so wrap it.
        df = pd.read_html(StringIO(str(table)), flavor='bs4', header=[0])[0]
        # Keep columns 1-3 (grand prix, date, points) and tag each row.
        # .assign builds a new frame, so the iloc slice is never written to.
        df = df.iloc[:, 1:4].assign(team_name=team_name, year=year)
        teams_df_list.append(df)

teams_df = pd.concat(teams_df_list, ignore_index=True)
# Positional rename: three scraped columns followed by the two added above.
teams_df.columns = ['race_grand_prix_name', 'race_date', 'points', 'team_name', 'year']
teams_df = (
    teams_df[['year', 'race_date', 'race_grand_prix_name', 'team_name', 'points']]
    .assign(race_date=lambda frame: pd.to_datetime(frame['race_date']))
)
# Last expression of the cell, so the notebook actually renders the preview.
teams_df.head()

In [458]:
# Generate df for drivers
# Scrape every driver's season results page and stack them into one frame with
# the columns: year, race_date, race_grand_prix_name, team_name, driver_name,
# race_position.
drivers_df_list = []

for year in years:
    urls = get_drivers_urls(year)
    for url in urls:
        # Driver name = last URL segment without its extension, hyphens
        # turned into spaces, title-cased.
        driver_name = url.split('/')[-1].split('.')[0].replace('-', ' ').title()
        # Close the HTTP response as soon as the page has been read.
        with urlopen(url) as response:
            source = response.read()
        soup = BeautifulSoup(source, 'lxml')
        table = soup.find_all('table')[0]
        # read_html expects a path or file-like object; passing literal HTML
        # as a plain string is deprecated in recent pandas, so wrap it.
        df = pd.read_html(StringIO(str(table)), flavor='bs4', header=[0])[0]
        # Keep columns 1-4 and tag each row with the driver and season.
        # .assign builds a new frame, so the iloc slice is never written to.
        df = df.iloc[:, 1:5].assign(driver_name=driver_name, year=year)
        drivers_df_list.append(df)

drivers_df = pd.concat(drivers_df_list, ignore_index=True)
# Positional rename: four scraped columns followed by the two added above.
drivers_df.columns = ['race_grand_prix_name', 'race_date', 'team_name', 'race_position', 'driver_name', 'year']
drivers_df = (
    drivers_df[['year', 'race_date', 'race_grand_prix_name', 'team_name', 'driver_name', 'race_position']]
    .assign(race_date=lambda frame: pd.to_datetime(frame['race_date']))
)

In [459]:
# Generate df for race, fastest laps, pit stop summary, starting grid, qualifying,
# practice 1, practice 2, practice 3, sprint results, sprint grid

Unnamed: 0,year,race_date,race_grand_prix_name,team_name,driver_name,race_position
0,2020,2020-12-06,Sakhir,Williams Mercedes,Jack Aitken,16
1,2020,2020-07-05,Austria,Red Bull Racing Honda,Alexander Albon,DNF
2,2020,2020-07-12,Styria,Red Bull Racing Honda,Alexander Albon,4
3,2020,2020-07-19,Hungary,Red Bull Racing Honda,Alexander Albon,5
4,2020,2020-08-02,Great Britain,Red Bull Racing Honda,Alexander Albon,8


In [370]:
# Names of the result pages of interest for a race. 'race-result' is the final
# path segment (before ".html") of the example URL below; the other entries
# presumably follow the same URL pattern — TODO confirm.
race_result_pages = ['race-result', 'fastest-laps', 'pit-stop-summary', 'starting-grid', 'qualifying',
                     'practice-1', 'practice-2', 'practice-3', 'sprint-results', 'sprint-grid']

# NOTE(review): `validate` is not defined in any cell visible here. Presumably
# it comes from a cell that was deleted or run out of order; confirm that it
# exists after Restart Kernel -> Run All.
validate('https://www.formula1.com/en/results.html/2020/races/1046/austria/race-result.html')

Unnamed: 0.1,Unnamed: 0,Grand Prix,Date,PTS,Unnamed: 4
0,,Austria,05 Jul 2020,0,
1,,Styria,12 Jul 2020,0,
2,,Hungary,19 Jul 2020,0,
3,,Great Britain,02 Aug 2020,0,
4,,70th Anniversary,09 Aug 2020,0,
5,,Spain,16 Aug 2020,0,
6,,Belgium,30 Aug 2020,0,
7,,Italy,06 Sep 2020,0,
8,,Tuscany,13 Sep 2020,0,
9,,Russia,27 Sep 2020,0,
