In [1]:
from io import StringIO
from typing import List, Dict

import bs4
import pandas as pd
import requests


# fbref.com

Get match URLs

In [2]:
def get_team_matches(team_id, season:str)->List[Dict[str, str|bool]]:
    fbref_url = f'https://fbref.com/en/squads/{team_id}/{season}/all_comps/'
    fb_ref = requests.get(fbref_url)
    soup = bs4.BeautifulSoup(fb_ref.text)
    tables = soup.find_all("table")
    match_table = tables[5]
    match_links = [t['href'] for t in match_table.find_all("a") if '/en/matches/' in t['href']]

    matches =[]

    for idx, rw in enumerate(match_table.find_all("tr")):
        try:
            match_date = rw.find("th").text
            competition = rw.find("td",{"data-stat":"comp"}).text
            home = rw.find("td",{"data-stat":"venue"}).text =='Home'
            opponent = rw.find("td",{"data-stat":"opponent"}).text
            try:
                link = rw.find("th").find("a")['href']
            except TypeError:
                link = None
            match = {"date":match_date, "competition":competition, 
                        "is_home":home, "opponent":opponent, "link":link }
            matches.append(match)
        except AttributeError:
            print("no data for row " + str(idx))
    return matches

Get match lineups

In [3]:
def get_match_lineups(match_id:str)->tuple[List[str],List[str]]:
    """Scrape the starting and bench line-ups from an fbref match page.

    Only the first table on the page is read.
    NOTE(review): presumably that is one team's line-up table - confirm
    which team, and whether the second team's table is wanted as well.

    Args:
        match_id: the part of the match URL after '/en/matches/'.

    Returns:
        ``(starting_lineup, bench_lineup)``: the second-column values of the
        rows above and below the row whose first column reads 'Bench'.

    Raises:
        requests.HTTPError: if fbref answers with an error status.
        IndexError: if the page has no table, or the first table has no
            'Bench' row.
    """
    fbref_url = 'https://fbref.com/en/matches/' + match_id
    # A timeout keeps a stalled connection from hanging the notebook, and the
    # status check reports HTTP errors directly.
    response = requests.get(fbref_url, timeout=30)
    response.raise_for_status()
    # Name the parser explicitly so the parse does not depend on which
    # optional parser libraries happen to be installed.
    soup = bs4.BeautifulSoup(response.text, "html.parser")
    tables = soup.find_all("table")
    if not tables:
        raise IndexError(f"no tables found at {fbref_url}")

    # read_html expects a path or file-like object; passing literal HTML as a
    # string is deprecated, so wrap the markup in StringIO.
    df = pd.read_html(StringIO(str(tables[0])))[0]

    # The 'Bench' row separates starters (above) from substitutes (below).
    bench_rows = df[df.iloc[:, 0] == 'Bench'].index
    if len(bench_rows) == 0:
        raise IndexError(f"no 'Bench' row in the first table at {fbref_url}")
    bench_idx = bench_rows[0]

    # .loc slices are label-based and inclusive, hence the -1 / +1.
    starting_lineup = df.loc[:bench_idx - 1].iloc[:, 1]
    bench_lineup = df.loc[bench_idx + 1:].iloc[:, 1]
    return [*starting_lineup], [*bench_lineup]

In [4]:
def get_player_matches(player_id:str, season:str)->pd.DataFrame:
    """Scrape a player's match log for one season from fbref.com.

    Args:
        player_id: fbref player id as it appears in player URLs.
        season: season slug, in a format like '2022-2023'.

    Returns:
        DataFrame with the columns 'Date', 'Comp', 'Squad', 'Opponent',
        'Start' and 'Min'. Rows missing any of these values are dropped.
        'Min' is 0 where the log reads "On matchday squad, but did not play".

    Raises:
        requests.HTTPError: if fbref answers with an error status.
        IndexError: if the page has no table.
    """
    fbref_url = f"https://fbref.com/en/players/{player_id}/matchlogs/{season}/"
    # A timeout keeps a stalled connection from hanging the notebook, and the
    # status check reports HTTP errors directly.
    response = requests.get(fbref_url, timeout=30)
    response.raise_for_status()
    # Name the parser explicitly so the parse does not depend on which
    # optional parser libraries happen to be installed.
    soup = bs4.BeautifulSoup(response.text, "html.parser")
    tables = soup.find_all("table")
    if not tables:
        raise IndexError(f"no tables found at {fbref_url}")

    # read_html expects a path or file-like object; passing literal HTML as a
    # string is deprecated, so wrap the markup in StringIO.
    df = pd.read_html(StringIO(str(tables[0])))[0]
    # The table has two header levels; keep only the lower one.
    df = df.droplevel(0, axis=1)

    keep_cols = ['Date', 'Comp', 'Squad', 'Opponent',
        'Start', 'Min']
    not_played = "On matchday squad, but did not play"

    # Narrow to the returned columns BEFORE dropping incomplete rows, so a
    # match is not discarded just because an unrelated column is blank.
    playing_time = df[keep_cols].copy()
    playing_time['Min'] = playing_time['Min'].replace(not_played, 0)
    playing_time = playing_time.dropna(axis=0, subset=keep_cols)
    # Any other kept cell still holding the placeholder text becomes None.
    playing_time = playing_time.replace(not_played, None)
    return playing_time


In [5]:
# Season slug and Liverpool's squad id, both in the form fbref uses in its URLs.
season = '2022-2023'
liverpool_id = '822bd0ba'

# Scrape Liverpool's fixtures for that season (the URL covers all competitions).
liverpool_matches = get_team_matches(team_id=liverpool_id, season=season)

no data for row 0


In [6]:
# Konaté's fbref player id, then his match log for the season chosen above.
konate_id = "5ed9b537"
konate_playing_time_df = get_player_matches(player_id=konate_id, season=season)

In [7]:
# Outer-join Liverpool's fixtures with Konaté's match log on the match date,
# then keep only fixtures that have a link:
# games without a link haven't been played
konate_games_df = (
    pd.DataFrame(liverpool_matches)
    .merge(konate_playing_time_df, how='outer', left_on='date', right_on='Date')
    .dropna(subset=['link'])
)

In [8]:
def get_playing_time(team_id, player_id, season):
    """Combine a team's fixtures with one player's match log for a season.

    Fixtures come from ``get_team_matches`` and the player's log from
    ``get_player_matches``. The two are left-joined on the match date, so
    every fixture is kept and the player's columns are empty for fixtures
    that are absent from the log.

    Returns:
        DataFrame of the fixtures that have a match link, with the player's
        'Comp', 'Squad', 'Opponent', 'Start' and 'Min' columns appended. The
        player's duplicate 'Date' column is removed.
    """
    fixtures = pd.DataFrame(get_team_matches(team_id, season))
    appearances = get_player_matches(player_id, season)

    joined = fixtures.merge(appearances, how='left', left_on='date', right_on='Date')
    # games without a link haven't been played yet (postponed or future date)
    played = joined.dropna(subset=['link'])
    return played.drop(columns='Date')

In [9]:
# Konaté's playing time across Liverpool's fixtures for the season set above.
konate_playing_time2223 = get_playing_time(
    team_id=liverpool_id, player_id=konate_id, season=season
)

no data for row 0


In [10]:
# Same query for the previous season.
konate_playing_time2122 = get_playing_time(
    team_id=liverpool_id, player_id=konate_id, season='2021-2022'
)

no data for row 0


In [11]:
# Stack both seasons, 2021-22 first; each frame keeps its own row labels.
konate_all_liverpool_df = pd.concat(
    objs=[konate_playing_time2122, konate_playing_time2223]
)

In [12]:
# Parse the scraped date strings into datetime values.
konate_all_liverpool_df = konate_all_liverpool_df.assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)

In [13]:
# Display the combined two-season frame (the rendered output below is truncated).
konate_all_liverpool_df

Unnamed: 0,date,competition,is_home,opponent,link,Comp,Squad,Opponent,Start,Min
0,2021-08-14,Premier League,False,Norwich City,/en/matches/c52500ad/Norwich-City-Liverpool-Au...,Premier League,Liverpool,Norwich City,N,0
1,2021-08-21,Premier League,True,Burnley,/en/matches/94d9dac0/Liverpool-Burnley-August-...,Premier League,Liverpool,Burnley,N,0
2,2021-08-28,Premier League,True,Chelsea,/en/matches/78aa75e6/Liverpool-Chelsea-August-...,Premier League,Liverpool,Chelsea,N,0
3,2021-09-12,Premier League,False,Leeds United,/en/matches/e6a245be/Leeds-United-Liverpool-Se...,Premier League,Liverpool,Leeds United,N,0
4,2021-09-15,Champions Lg,True,it Milan,/en/matches/ff3e4ae2/Liverpool-Milan-September...,Champions Lg,eng Liverpool,it Milan,N,0
...,...,...,...,...,...,...,...,...,...,...
18,2022-10-26,Champions Lg,False,nl Ajax,/en/matches/d88e832b/Ajax-Liverpool-October-26...,Champions Lg,eng Liverpool,nl Ajax,N,0
19,2022-10-29,Premier League,True,Leeds United,/en/matches/b5b1e744/Liverpool-Leeds-United-Oc...,Premier League,Liverpool,Leeds United,N,0
20,2022-11-01,Champions Lg,True,it Napoli,/en/matches/91efb39a/Liverpool-Napoli-November...,Champions Lg,eng Liverpool,it Napoli,Y,90
21,2022-11-06,Premier League,False,Tottenham,/en/matches/2954504d/Tottenham-Hotspur-Liverpo...,Premier League,Liverpool,Tottenham,Y,90
