In [1]:
import requests
from pathlib import Path
import os

import shelve

from bs4 import BeautifulSoup
from bs4 import Comment
import pandas as pd

In [2]:
# Lookup table: league code -> season label -> fbref page URL that the
# scraping cell downloads for that season.
# NOTE(review): the 2020-2021 entry is the only URL without a numeric season
# id in its path; presumably it points at whatever season fbref currently
# treats as "current" -- confirm, and pin a season id if so.
players_url = {
    "EPL": {
        "2020-2021": "https://fbref.com/en/comps/9/stats/Premier-League-Stats",
        "2019-2020": "https://fbref.com/en/comps/9/3232/stats/2019-2020-Premier-League-Stats",
        "2018-2019": "https://fbref.com/en/comps/9/1889/stats/2018-2019-Premier-League-Stats",
        "2017-2018": "https://fbref.com/en/comps/9/1631/stats/2017-2018-Premier-League-Stats",
        "2016-2017": "https://fbref.com/en/comps/9/1526/stats/2016-2017-Premier-League-Stats",
        "2015-2016": "https://fbref.com/en/comps/9/1467/stats/2015-2016-Premier-League-Stats",
        "2014-2015": "https://fbref.com/en/comps/9/733/stats/2014-2015-Premier-League-Stats",
    },
}

In [3]:
# d = shelve.open('players.shelve')
# list(d.keys())

In [4]:
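# NOTE: abandoned shelve-based cache, superseded by the diskcache-backed
# get_page defined further down. As written it would not work: the shelf is
# closed (d.close()) before the wrapped function ever reads or writes it.
# def shelve_it(file_name):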
#     d = shelve.open(file_name)

#     def decorator(func):
#         def new_func(param):
#             if param not in d:
#                 print(param)
#                 d[param] = func(param)
#             else:
#                 print('Found Cached!')
#             return d[param]

#         return new_func
    
#     d.close()

#     return decorator

In [5]:
# @shelve_it('players.shelve')
# def get_page(link):
#     print(link)
#     return requests.get(link).content

In [6]:
from diskcache import Cache

# Disk-backed cache (stored under "players.shelve") used to memoize page
# downloads so that re-running the notebook does not re-fetch every URL.
cache = Cache("players.shelve")

@cache.memoize()
def get_page(link):
    """Download ``link`` and return the raw response body.

    Results are memoized in ``cache`` keyed on ``link``, so the network is
    only hit (and the link only printed) on a cache miss.

    Args:
        link: URL of the page to fetch.

    Returns:
        The response body as ``bytes``.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if the server does not respond within 30 seconds.
    """
    print(link)
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(link, timeout=30)
    # Fail loudly on error statuses. Because the exception propagates before
    # the memoizer stores anything, an error page (e.g. a rate-limit response)
    # is never cached and later served as if it were the real stats page.
    response.raise_for_status()
    return response.content

In [7]:
players_url

{'EPL': {'2020-2021': 'https://fbref.com/en/comps/9/stats/Premier-League-Stats',
  '2019-2020': 'https://fbref.com/en/comps/9/3232/stats/2019-2020-Premier-League-Stats',
  '2018-2019': 'https://fbref.com/en/comps/9/1889/stats/2018-2019-Premier-League-Stats',
  '2017-2018': 'https://fbref.com/en/comps/9/1631/stats/2017-2018-Premier-League-Stats',
  '2016-2017': 'https://fbref.com/en/comps/9/1526/stats/2016-2017-Premier-League-Stats',
  '2015-2016': 'https://fbref.com/en/comps/9/1467/stats/2015-2016-Premier-League-Stats',
  '2014-2015': 'https://fbref.com/en/comps/9/733/stats/2014-2015-Premier-League-Stats'}}

In [18]:
# Per league -> season: the header labels and the scraped rows of the
# "standard stats" table. Both are filled by the scraping loop below and then
# consumed by the export loop.
columns = {}
datasets = {}

# --- Scrape -----------------------------------------------------------------
for league in players_url:
    for season in players_url[league]:
        data = []
        print(league, season)
        link = players_url[league][season]
        page = get_page(link)
        soup = BeautifulSoup(page, 'html.parser')

        # The table markup is searched for inside HTML comments. Reset soup2
        # for every season: otherwise a page that lacks the marker would
        # silently reuse the previous season's table (or hit a NameError on
        # the very first page).
        soup2 = None
        for el in soup.find_all(string=lambda text: isinstance(text, Comment)):
            if 'div_stats_standard' in el:
                soup2 = BeautifulSoup(el, 'html.parser')
        if soup2 is None:
            raise ValueError(
                f"No 'div_stats_standard' table found for {league} {season}: {link}"
            )

        # Column labels come from the second header row of the table; the
        # first two columns are the league/season tags added to every row.
        header_cells = soup2.find('thead').find_all('tr')[1].find_all('th')
        column = ['League', 'Season'] + [cell.text.strip() for cell in header_cells]
        columns.setdefault(league, {})[season] = column

        for el in soup2.find('tbody').find_all('tr'):
            row = [league, season]
            count_empty = 0
            for el2 in el.find_all('th'):
                text = el2.text.strip()
                row.append(text)
                if text == '':
                    count_empty += 1
            for el2 in el.find_all('td'):
                text = el2.text.strip()
                row.append(text)
                if text == 'Matches':
                    # Store the absolute match-log URL instead of the link label.
                    anchor = el2.find('a')
                    if anchor is not None:
                        row[-1] = 'https://fbref.com' + anchor['href'].strip()
                if text == '':
                    count_empty += 1
            # Keep the row unless every scraped cell is empty or it is a
            # repeated header row. row[0]/row[1] are the league/season tags,
            # so the first scraped cell (the 'Rk' column) is row[2]; testing
            # row[0] could never match 'Rk' and let header rows through.
            if count_empty != (len(row) - 2) and row[2] != 'Rk':
                data.append(row)

        datasets.setdefault(league, {})[season] = data

# --- Export -----------------------------------------------------------------
# NOTE(review): the positional column slices below assume a fixed column
# layout of the scraped table (identity/playing-time columns first, the match
# link last, extra "expected" columns only in some seasons) -- confirm against
# the header labels stored in `columns` before relying on the CSV contents.
for league in datasets:
    for season in datasets[league]:
        df = pd.DataFrame(datasets[league][season], columns=columns[league][season])

        path = Path(f'dfs/{league}/players')
        path.mkdir(parents=True, exist_ok=True)

        df.iloc[:, list(range(20))].to_csv(path / f'{season}_performance.csv', index=False)
        df.iloc[:, list(range(13)) + list(range(20, 25))].to_csv(path / f'{season}_performance_per_90_min.csv', index=False)
        # First nine columns plus the last column (the match-log link).
        df.iloc[:, list(range(9)) + [len(df.columns) - 1]].to_csv(path / f'{season}_match_links.csv', index=False)

        # Only wider tables carry the additional column groups exported here.
        if len(df.columns) > 27:
            df.iloc[:, list(range(13)) + list(range(25, 29))].to_csv(path / f'{season}_expected.csv', index=False)
            df.iloc[:, list(range(13)) + list(range(29, 34))].to_csv(path / f'{season}_expected_per_90_min.csv', index=False)
        

EPL 2020-2021
EPL 2019-2020
EPL 2018-2019
EPL 2017-2018
EPL 2016-2017
EPL 2015-2016
EPL 2014-2015
