In [1]:
import requests
from pathlib import Path
import os

import shelve

from bs4 import BeautifulSoup
from bs4 import Comment
import pandas as pd

In [2]:
from diskcache import Cache

cache = Cache("players_passing_matches.shelve")

# Seconds to wait on the server before abandoning a request.
REQUEST_TIMEOUT = 30


@cache.memoize()
def get_page(link):
    """Download ``link`` and return the raw response body.

    The function is wrapped in ``cache.memoize()``, so a successful result is
    stored on disk and reused on later calls with the same ``link``. The URL
    is printed only when the function body actually runs (i.e. on a download).

    Args:
        link: URL of the page to fetch.

    Returns:
        The response body as returned by ``requests`` (``response.content``).

    Raises:
        requests.HTTPError: the server answered with a 4xx/5xx status.
        requests.Timeout: no answer within ``REQUEST_TIMEOUT`` seconds.
    """
    print(link)
    response = requests.get(link, timeout=REQUEST_TIMEOUT)
    # Fail loudly on error statuses: returning the error page here would
    # memoize it, and every later run would silently reuse the bad content.
    response.raise_for_status()
    return response.content

In [3]:
def uniquify(df_columns):
    """Yield the names in ``df_columns`` with duplicates made unique.

    A name that has not been yielded before is passed through unchanged.
    Otherwise ``_2``, ``_3``, ... is appended to the original name until the
    result is one that has not been yielded yet.
    """
    used = set()

    for name in df_columns:
        candidate = name
        suffix = 1

        # Bump the numeric suffix until the candidate no longer collides.
        while candidate in used:
            suffix += 1
            candidate = f"{name}_{suffix}"

        used.add(candidate)
        yield candidate

In [4]:
path = 'dfs/EPL/players'

# Map the part of each file name before the first underscore to the file's
# path, keeping only regular files whose name ends in 'match_links.csv'.
season_player_match_links = {
    f.split('_')[0]: os.path.join(path, f)
    for f in os.listdir(path)
    if f.endswith('match_links.csv') and os.path.isfile(os.path.join(path, f))
}

# Re-order the mapping by (path, key), descending.
season_player_match_links = dict(
    sorted(season_player_match_links.items(), key=lambda kv: kv[::-1], reverse=True)
)
season_player_match_links

{'2020-2021': 'dfs/EPL/players/2020-2021_match_links.csv',
 '2019-2020': 'dfs/EPL/players/2019-2020_match_links.csv',
 '2018-2019': 'dfs/EPL/players/2018-2019_match_links.csv',
 '2017-2018': 'dfs/EPL/players/2017-2018_match_links.csv',
 '2016-2017': 'dfs/EPL/players/2016-2017_match_links.csv',
 '2015-2016': 'dfs/EPL/players/2015-2016_match_links.csv',
 '2014-2015': 'dfs/EPL/players/2014-2015_match_links.csv'}

In [5]:
# Read one DataFrame per season file, in the order of the mapping above.
# Rows whose 'Rk' value is the literal string 'Rk' are dropped (presumably
# header rows repeated inside the scraped table - confirm against the CSVs).
dfs = []
for season in season_player_match_links:
    df = (
        pd.read_csv(season_player_match_links[season])
        .loc[lambda frame: frame['Rk'].ne('Rk')]
        .reset_index(drop=True)
    )
    dfs.append(df)

In [6]:
# Preview: the first player's match-log URL with 'summary' swapped for 'passing'.
dfs[0]['Matches'].iloc[0].replace('summary', 'passing')

'https://fbref.com/en/players/5f09991f/matchlogs/2020-2021/passing/Patrick-van-Aanholt-Match-Logs'

In [7]:
def build_column_name_prefixes(spans, prefixes):
    """Repeat each prefix as many times as its matching span.

    ``spans[k]`` is the repeat count for ``prefixes[k]``; the repeated
    prefixes are returned as one flat list, in order. Both arguments must
    have the same length (checked with ``assert``).
    """
    assert len(spans) == len(prefixes)
    return [
        prefix
        for span, prefix in zip(spans, prefixes)
        for _ in range(span)
    ]
            

In [8]:
def join_prefixes_suffixes(prefixes, suffixes):
    """Combine prefixes and suffixes pairwise into column names.

    A pair with an empty-string prefix yields the suffix alone; any other
    pair yields ``prefix + '_' + suffix``. Both arguments must have the same
    length (checked with ``assert``).
    """
    assert len(prefixes) == len(suffixes)
    return [
        suffix if prefix == '' else prefix + '_' + suffix
        for prefix, suffix in zip(prefixes, suffixes)
    ]

In [9]:
# Example call: 11 unprefixed columns, then the Total/Short/Medium/Long
# groups, then 8 unprefixed trailing columns.
join_prefixes_suffixes(
    [''] * 11 + ['Total'] * 5 + ['Short'] * 3 + ['Medium'] * 3 + ['Long'] * 3 + [''] * 8,
    [
        'Date', 'Day', 'Comp', 'Round', 'Venue', 'Result', 'Squad', 'Opponent', 'Start', 'Pos', 'Min',
        'Cmp', 'Att', 'Cmp%', 'TotDist', 'PrgDist',
        'Cmp', 'Att', 'Cmp%',
        'Cmp', 'Att', 'Cmp%',
        'Cmp', 'Att', 'Cmp%',
        'Ast', 'xA', 'KP', '1/3', 'PPA', 'CrsPA', 'Prog', 'Match Report',
    ],
)

['Date',
 'Day',
 'Comp',
 'Round',
 'Venue',
 'Result',
 'Squad',
 'Opponent',
 'Start',
 'Pos',
 'Min',
 'Total_Cmp',
 'Total_Att',
 'Total_Cmp%',
 'Total_TotDist',
 'Total_PrgDist',
 'Short_Cmp',
 'Short_Att',
 'Short_Cmp%',
 'Medium_Cmp',
 'Medium_Att',
 'Medium_Cmp%',
 'Long_Cmp',
 'Long_Att',
 'Long_Cmp%',
 'Ast',
 'xA',
 'KP',
 '1/3',
 'PPA',
 'CrsPA',
 'Prog',
 'Match Report']

In [13]:
from tqdm.auto import tqdm

# Seasons whose passing match logs are scraped; other seasons are skipped.
PASSING_SEASONS = ('2020-2021', '2019-2020', '2018-2019', '2017-2018')
# Cell text of a match row for which the player has no statistics.
DID_NOT_PLAY = 'On matchday squad, but did not play'

columns = {}        # columns[league][season]  -> column names of the last page parsed
datasets = {}       # datasets[league][season] -> table rows of the last page parsed
skipped_links = []  # pages on which the expected table was not found


def _parse_passing_columns(soup):
    """Build the flat column names from the first ``<thead>`` in ``soup``.

    The first header row supplies group labels, each repeated ``colspan``
    times; the second row supplies the per-column names. The result is
    ``['Season']`` followed by the joined ``<group>_<name>`` names.

    Returns ``None`` when there is no ``<thead>`` with at least two rows.
    """
    thead = soup.find('thead')
    if thead is None:
        return None
    header_rows = thead.find_all('tr')
    if len(header_rows) < 2:
        return None

    group_cells = header_rows[0].find_all('th')
    spans = [int(th.get('colspan', 1)) for th in group_cells]
    groups = [th.text.strip() for th in group_cells]
    prefixes = build_column_name_prefixes(spans, groups)

    names = [th.text.strip() for th in header_rows[1].find_all('th')]
    return ['Season'] + join_prefixes_suffixes(prefixes, names)


def _parse_match_rows(soup, season, n_columns):
    """Extract the body rows of the table inside ``div#div_matchlogs_all``.

    Every returned row starts with ``season`` followed by the stripped text
    of the row's ``<th>`` cells and then its ``<td>`` cells. Rows whose cells
    are all empty are dropped. Rows flagged with ``DID_NOT_PLAY`` are padded
    with ``None`` so that their last cell stays in the last column.

    Returns ``None`` when the container or its ``<tbody>`` is missing.
    """
    container = soup.find('div', id='div_matchlogs_all')
    tbody = container.find('tbody') if container is not None else None
    if tbody is None:
        return None

    records = []
    for tr in tbody.find_all('tr'):
        texts = [cell.text.strip() for cell in tr.find_all('th')]
        texts += [cell.text.strip() for cell in tr.find_all('td')]
        # Skip rows with no content at all (this also covers a <tr> that has
        # no cells, which would otherwise break the [-2] lookup below).
        if all(text == '' for text in texts):
            continue

        record = [season] + texts
        if record[-2] == DID_NOT_PLAY:
            # Such rows carry fewer cells than the header: blank out the
            # statistic columns and move the trailing cell to the last column.
            last_cell = record[-1]
            record[-1] = None
            record.extend([None] * (n_columns - len(record) - 1))
            record.append(last_cell)
        records.append(record)
    return records


for season_df in tqdm(dfs, desc='seasons'):
    if season_df.empty:
        continue
    season_label = season_df['Season'].iloc[0]
    if season_label not in PASSING_SEASONS:
        continue
    print(season_label)

    # iterrows() has no length, so pass the total explicitly for a real bar.
    for _, player in tqdm(season_df.iterrows(), total=len(season_df), desc=str(season_label)):
        link = player['Matches'].replace('summary', 'passing')
        rk = player['Rk']
        league = player['League']
        season = player['Season']

        soup = BeautifulSoup(get_page(link), 'html.parser')

        column = _parse_passing_columns(soup)
        data = None if column is None else _parse_match_rows(soup, season, len(column))
        if data is None:
            # Record and move on instead of aborting the whole run on one page.
            print(f'Skipping {link}: passing match-log table not found')
            skipped_links.append(link)
            continue

        columns.setdefault(league, {})[season] = column
        datasets.setdefault(league, {})[season] = data

        out_dir = Path(f'dfs/{league}/players_passing_matches/{season}')
        out_dir.mkdir(parents=True, exist_ok=True)

        player_matches = pd.DataFrame(data, columns=column)
        player_matches.to_csv(out_dir / f'player_rk_{rk}.csv', index=False)
            

0it [00:00, ?it/s]

2020-2021


0it [00:00, ?it/s]

https://fbref.com/en/players/c81d773d/matchlogs/2020-2021/passing/Tosin-Adarabioyo-Match-Logs
https://fbref.com/en/players/f76e6b4e/matchlogs/2020-2021/passing/Adrian-Match-Logs
https://fbref.com/en/players/4d034881/matchlogs/2020-2021/passing/Sergio-Aguero-Match-Logs
https://fbref.com/en/players/246d153b/matchlogs/2020-2021/passing/Ola-Aina-Match-Logs
https://fbref.com/en/players/9b398aea/matchlogs/2020-2021/passing/Rayan-Ait-Nouri-Match-Logs
https://fbref.com/en/players/d6192210/matchlogs/2020-2021/passing/Semi-Ajayi-Match-Logs
https://fbref.com/en/players/eaeca114/matchlogs/2020-2021/passing/Nathan-Ake-Match-Logs
https://fbref.com/en/players/b827d5b3/matchlogs/2020-2021/passing/Marc-Albrighton-Match-Logs
https://fbref.com/en/players/77e84962/matchlogs/2020-2021/passing/Thiago-Alcantara-Match-Logs
https://fbref.com/en/players/f7d50789/matchlogs/2020-2021/passing/Toby-Alderweireld-Match-Logs
https://fbref.com/en/players/8fa8f8d2/matchlogs/2020-2021/passing/Runar-Alex-Runarsson-Match-L

https://fbref.com/en/players/2b91cf8e/matchlogs/2020-2021/passing/Ciaran-Clark-Match-Logs
https://fbref.com/en/players/0442183b/matchlogs/2020-2021/passing/Nathaniel-Clyne-Match-Logs
https://fbref.com/en/players/2928dca2/matchlogs/2020-2021/passing/Conor-Coady-Match-Logs
https://fbref.com/en/players/0420d84f/matchlogs/2020-2021/passing/Seamus-Coleman-Match-Logs
https://fbref.com/en/players/27c01749/matchlogs/2020-2021/passing/Aaron-Connolly-Match-Logs
https://fbref.com/en/players/dc64b8b3/matchlogs/2020-2021/passing/Liam-Cooper-Match-Logs
https://fbref.com/en/players/74e12f1e/matchlogs/2020-2021/passing/Jack-Cork-Match-Logs
https://fbref.com/en/players/f13d02a6/matchlogs/2020-2021/passing/Helder-Costa-Match-Logs
https://fbref.com/en/players/fdf3cb77/matchlogs/2020-2021/passing/Vladimir-Coufal-Match-Logs
https://fbref.com/en/players/4f974391/matchlogs/2020-2021/passing/Aaron-Cresswell-Match-Logs
https://fbref.com/en/players/a4fe8cda/matchlogs/2020-2021/passing/Patrick-Cutrone-Match-Logs

https://fbref.com/en/players/bf8fad51/matchlogs/2020-2021/passing/Kortney-Hause-Match-Logs
https://fbref.com/en/players/fed7cb61/matchlogs/2020-2021/passing/Kai-Havertz-Match-Logs
https://fbref.com/en/players/c951a6df/matchlogs/2020-2021/passing/Isaac-Hayden-Match-Logs
https://fbref.com/en/players/5f1a2f31/matchlogs/2020-2021/passing/Michael-Hector-Match-Logs
https://fbref.com/en/players/5540c9dc/matchlogs/2020-2021/passing/Ahmed-Hegazi-Match-Logs
https://fbref.com/en/players/e5a76dfe/matchlogs/2020-2021/passing/Dean-Henderson-Match-Logs
https://fbref.com/en/players/935e6b8f/matchlogs/2020-2021/passing/Jordan-Henderson-Match-Logs
https://fbref.com/en/players/989d5705/matchlogs/2020-2021/passing/Jeff-Hendrick-Match-Logs
https://fbref.com/en/players/c0398aca/matchlogs/2020-2021/passing/Pablo-Hernandez-Match-Logs
https://fbref.com/en/players/92e7e919/matchlogs/2020-2021/passing/Son-Heung-min-Match-Logs
https://fbref.com/en/players/9a71e978/matchlogs/2020-2021/passing/Ki-Jana-Hoever-Match-

https://fbref.com/en/players/d8931174/matchlogs/2020-2021/passing/Harry-Maguire-Match-Logs
https://fbref.com/en/players/892d5bb1/matchlogs/2020-2021/passing/Riyad-Mahrez-Match-Logs
https://fbref.com/en/players/e3a5814e/matchlogs/2020-2021/passing/Ainsley-Maitland-Niles-Match-Logs
https://fbref.com/en/players/bc0bfc64/matchlogs/2020-2021/passing/Josh-Maja-Match-Logs
https://fbref.com/en/players/c691bfe2/matchlogs/2020-2021/passing/Sadio-Mane-Match-Logs
https://fbref.com/en/players/758dd7f0/matchlogs/2020-2021/passing/Javier-Manquillo-Match-Logs
https://fbref.com/en/players/9ae9eaea/matchlogs/2020-2021/passing/Fernando-Marcal-Match-Logs
https://fbref.com/en/players/bb5fbd2b/matchlogs/2020-2021/passing/Solly-March-Match-Logs
https://fbref.com/en/players/aa54ec6f/matchlogs/2020-2021/passing/Pablo-Mari-Match-Logs
https://fbref.com/en/players/8b788c01/matchlogs/2020-2021/passing/Anthony-Martial-Match-Logs
https://fbref.com/en/players/48a5a5d6/matchlogs/2020-2021/passing/Martinelli-Match-Logs

https://fbref.com/en/players/1bf33a9a/matchlogs/2020-2021/passing/Christian-Pulisic-Match-Logs
https://fbref.com/en/players/466fb2c5/matchlogs/2020-2021/passing/Aaron-Ramsdale-Match-Logs
https://fbref.com/en/players/1544f145/matchlogs/2020-2021/passing/Jacob-Ramsey-Match-Logs
https://fbref.com/en/players/dc31b84c/matchlogs/2020-2021/passing/Kayne-Ramsey-Match-Logs
https://fbref.com/en/players/6385ebfb/matchlogs/2020-2021/passing/Darren-Randolph-Match-Logs
https://fbref.com/en/players/a1d5bd30/matchlogs/2020-2021/passing/Marcus-Rashford-Match-Logs
https://fbref.com/en/players/28b40c9c/matchlogs/2020-2021/passing/Tim-Ream-Match-Logs
https://fbref.com/en/players/ab651565/matchlogs/2020-2021/passing/Nathan-Redmond-Match-Logs
https://fbref.com/en/players/803ae100/matchlogs/2020-2021/passing/Harrison-Reed-Match-Logs
https://fbref.com/en/players/3353737a/matchlogs/2020-2021/passing/Sergio-Reguilon-Match-Logs
https://fbref.com/en/players/0f7533cd/matchlogs/2020-2021/passing/Bobby-Reid-Match-Lo

https://fbref.com/en/players/b28bbd58/matchlogs/2020-2021/passing/Andros-Townsend-Match-Logs
https://fbref.com/en/players/4dc4f138/matchlogs/2020-2021/passing/Conor-Townsend-Match-Logs
https://fbref.com/en/players/9a28eba4/matchlogs/2020-2021/passing/Adama-Traore-Match-Logs
https://fbref.com/en/players/c47541e0/matchlogs/2020-2021/passing/Bertrand-Traore-Match-Logs
https://fbref.com/en/players/3ae14ed1/matchlogs/2020-2021/passing/Trezeguet-Match-Logs
https://fbref.com/en/players/38ceb24a/matchlogs/2020-2021/passing/Leandro-Trossard-Match-Logs
https://fbref.com/en/players/f315ca93/matchlogs/2020-2021/passing/Kostas-Tsimikas-Match-Logs
https://fbref.com/en/players/2baec6ce/matchlogs/2020-2021/passing/Axel-Tuanzebe-Match-Logs
https://fbref.com/en/players/1880614f/matchlogs/2020-2021/passing/Cengiz-Under-Match-Logs
https://fbref.com/en/players/531a4aa8/matchlogs/2020-2021/passing/Yan-Valery-Match-Logs
https://fbref.com/en/players/45963054/matchlogs/2020-2021/passing/Jamie-Vardy-Match-Logs


0it [00:00, ?it/s]

https://fbref.com/en/players/5f09991f/matchlogs/2019-2020/passing/Patrick-van-Aanholt-Match-Logs
https://fbref.com/en/players/774cf58b/matchlogs/2019-2020/passing/Max-Aarons-Match-Logs
https://fbref.com/en/players/f586779e/matchlogs/2019-2020/passing/Tammy-Abraham-Match-Logs
https://fbref.com/en/players/f2bf1b0f/matchlogs/2019-2020/passing/Che-Adams-Match-Logs
https://fbref.com/en/players/f76e6b4e/matchlogs/2019-2020/passing/Adrian-Match-Logs
https://fbref.com/en/players/4d034881/matchlogs/2019-2020/passing/Sergio-Aguero-Match-Logs
https://fbref.com/en/players/2b1c4abd/matchlogs/2019-2020/passing/Albian-Ajeti-Match-Logs
https://fbref.com/en/players/eaeca114/matchlogs/2019-2020/passing/Nathan-Ake-Match-Logs
https://fbref.com/en/players/b827d5b3/matchlogs/2019-2020/passing/Marc-Albrighton-Match-Logs
https://fbref.com/en/players/f7d50789/matchlogs/2019-2020/passing/Toby-Alderweireld-Match-Logs
https://fbref.com/en/players/cd1acf9d/matchlogs/2019-2020/passing/Trent-Alexander-Arnold-Match-L

https://fbref.com/en/players/2afc7272/matchlogs/2019-2020/passing/Lewis-Cook-Match-Logs
https://fbref.com/en/players/3e5e4e63/matchlogs/2019-2020/passing/Steve-Cook-Match-Logs
https://fbref.com/en/players/74e12f1e/matchlogs/2019-2020/passing/Jack-Cork-Match-Logs
https://fbref.com/en/players/4f974391/matchlogs/2019-2020/passing/Aaron-Cresswell-Match-Logs
https://fbref.com/en/players/a4fe8cda/matchlogs/2019-2020/passing/Patrick-Cutrone-Match-Logs
https://fbref.com/en/players/d9565625/matchlogs/2019-2020/passing/Diogo-Dalot-Match-Logs
https://fbref.com/en/players/9d107085/matchlogs/2019-2020/passing/Charlie-Daniels-Match-Logs
https://fbref.com/en/players/52105203/matchlogs/2019-2020/passing/Scott-Dann-Match-Logs
https://fbref.com/en/players/6e33125f/matchlogs/2019-2020/passing/Kevin-Danso-Match-Logs
https://fbref.com/en/players/44781702/matchlogs/2019-2020/passing/Ben-Davies-Match-Logs
https://fbref.com/en/players/9228b07c/matchlogs/2019-2020/passing/Tom-Davies-Match-Logs
https://fbref.co

https://fbref.com/en/players/ba115a40/matchlogs/2019-2020/passing/Sebastien-Haller-Match-Logs
https://fbref.com/en/players/e9254eec/matchlogs/2019-2020/passing/Grant-Hanley-Match-Logs
https://fbref.com/en/players/bf8fad51/matchlogs/2019-2020/passing/Kortney-Hause-Match-Logs
https://fbref.com/en/players/c951a6df/matchlogs/2019-2020/passing/Isaac-Hayden-Match-Logs
https://fbref.com/en/players/a6de6361/matchlogs/2019-2020/passing/Tom-Heaton-Match-Logs
https://fbref.com/en/players/e5a76dfe/matchlogs/2019-2020/passing/Dean-Henderson-Match-Logs
https://fbref.com/en/players/935e6b8f/matchlogs/2019-2020/passing/Jordan-Henderson-Match-Logs
https://fbref.com/en/players/989d5705/matchlogs/2019-2020/passing/Jeff-Hendrick-Match-Logs
https://fbref.com/en/players/76eca7e2/matchlogs/2019-2020/passing/Wayne-Hennessey-Match-Logs
https://fbref.com/en/players/189cee7b/matchlogs/2019-2020/passing/Javier-Hernandez-Match-Logs
https://fbref.com/en/players/007b39a5/matchlogs/2019-2020/passing/Onel-Hernandez-Ma

https://fbref.com/en/players/0df2b165/matchlogs/2019-2020/passing/John-Lundstram-Match-Logs
https://fbref.com/en/players/83d074ff/matchlogs/2019-2020/passing/Alexis-Mac-Allister-Match-Logs
https://fbref.com/en/players/ee38d9c5/matchlogs/2019-2020/passing/James-Maddison-Match-Logs
https://fbref.com/en/players/d8931174/matchlogs/2019-2020/passing/Harry-Maguire-Match-Logs
https://fbref.com/en/players/892d5bb1/matchlogs/2019-2020/passing/Riyad-Mahrez-Match-Logs
https://fbref.com/en/players/e3a5814e/matchlogs/2019-2020/passing/Ainsley-Maitland-Niles-Match-Logs
https://fbref.com/en/players/c691bfe2/matchlogs/2019-2020/passing/Sadio-Mane-Match-Logs
https://fbref.com/en/players/758dd7f0/matchlogs/2019-2020/passing/Javier-Manquillo-Match-Logs
https://fbref.com/en/players/bb5fbd2b/matchlogs/2019-2020/passing/Solly-March-Match-Logs
https://fbref.com/en/players/aa54ec6f/matchlogs/2019-2020/passing/Pablo-Mari-Match-Logs
https://fbref.com/en/players/8aa5f52c/matchlogs/2019-2020/passing/Adrian-Mariap

https://fbref.com/en/players/4806ec67/matchlogs/2019-2020/passing/Jordan-Pickford-Match-Logs
https://fbref.com/en/players/9a526299/matchlogs/2019-2020/passing/Brandon-Pierrick-Match-Logs
https://fbref.com/en/players/1ef37668/matchlogs/2019-2020/passing/Erik-Pieters-Match-Logs
https://fbref.com/en/players/7fcc71d8/matchlogs/2019-2020/passing/Daniel-Podence-Match-Logs
https://fbref.com/en/players/867239d3/matchlogs/2019-2020/passing/Paul-Pogba-Match-Logs
https://fbref.com/en/players/4b40d9ca/matchlogs/2019-2020/passing/Nick-Pope-Match-Logs
https://fbref.com/en/players/86695068/matchlogs/2019-2020/passing/Dennis-Praet-Match-Logs
https://fbref.com/en/players/f5e6d08d/matchlogs/2019-2020/passing/Sebastian-Prodl-Match-Logs
https://fbref.com/en/players/fdd85f94/matchlogs/2019-2020/passing/Davy-Propper-Match-Logs
https://fbref.com/en/players/0745b37d/matchlogs/2019-2020/passing/Teemu-Pukki-Match-Logs
https://fbref.com/en/players/1bf33a9a/matchlogs/2019-2020/passing/Christian-Pulisic-Match-Logs

https://fbref.com/en/players/c2f9d19f/matchlogs/2019-2020/passing/Neil-Taylor-Match-Logs
https://fbref.com/en/players/f1e94cd6/matchlogs/2019-2020/passing/Nathan-Tella-Match-Logs
https://fbref.com/en/players/75566759/matchlogs/2019-2020/passing/Alexander-Tettey-Match-Logs
https://fbref.com/en/players/b547f08c/matchlogs/2019-2020/passing/Jordan-Thomas-Match-Logs
https://fbref.com/en/players/fc027d02/matchlogs/2019-2020/passing/Luke-Thomas-Match-Logs
https://fbref.com/en/players/fa38722f/matchlogs/2019-2020/passing/Max-Thompson-Match-Logs
https://fbref.com/en/players/56f7a928/matchlogs/2019-2020/passing/Youri-Tielemans-Match-Logs
https://fbref.com/en/players/fce2302c/matchlogs/2019-2020/passing/Kieran-Tierney-Match-Logs
https://fbref.com/en/players/e7ca0439/matchlogs/2019-2020/passing/James-Tomkins-Match-Logs
https://fbref.com/en/players/7edfbb8a/matchlogs/2019-2020/passing/Fikayo-Tomori-Match-Logs
https://fbref.com/en/players/52611641/matchlogs/2019-2020/passing/Lucas-Torreira-Match-Log

KeyboardInterrupt: 