In [1]:
import pandas as pd
import numpy as np

import requests
from bs4 import BeautifulSoup

import time

import warnings
warnings.filterwarnings('ignore')

In [19]:
def _coerce_numeric(column):
    """Convert a column to a numeric dtype, or return it unchanged if any value fails to parse."""
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


def _years_only(age):
    """Return the part of an age string before the first '-'; any other value passes through."""
    if isinstance(age, str) and '-' in age:
        return age.split('-')[0]
    return age


def fetch_player_data(url, table_index=2, timeout=30):
    """Download a stats page and turn one of its tables into a DataFrame.

    The page is fetched, HTML comment delimiters are stripped so that markup
    wrapped in comments is parsed like regular elements, and the ``<tbody>``
    at position ``table_index`` is read row by row.

    Column names come from the ``data-stat`` attribute of every cell in the
    first row. Row values come from the text of each ``<td>``; when a ``<td>``
    carries a ``data-append-csv`` attribute, that value is inserted just
    before the cell's text.

    Post-processing:
      * rows without a ``player`` value (e.g. rows that contain no ``<td>``)
        are dropped;
      * thousands separators are removed from ``minutes`` (or, if that column
        is absent, ``gk_minutes``);
      * ``age`` values containing ``-`` are cut down to the part before it;
      * the ``ranker`` column is renamed to ``player_id``;
      * every column that parses cleanly as numbers is converted to a numeric
        dtype, the others are left untouched.

    Parameters
    ----------
    url : str
        Address of the page to scrape.
    table_index : int, default 2
        Position of the wanted ``<tbody>`` among all ``<tbody>`` elements.
    timeout : float, default 30
        Seconds to wait for the server before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per table row that has a ``player`` value.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    IndexError
        If the page has no ``<tbody>`` at ``table_index`` or that table has
        no rows.
    KeyError
        If the table has no ``player`` column.
    """
    resp = requests.get(url, timeout=timeout)
    # Fail here on an error/rate-limit response instead of later with an
    # unrelated-looking IndexError while parsing the error page.
    resp.raise_for_status()
    resp.encoding = 'utf-8'

    # HTML comments open with '<!--' and close with '-->'; removing both
    # exposes any table markup that the page ships inside comments.
    html = resp.text.replace('<!--', '').replace('-->', '')
    soup = BeautifulSoup(html, 'html.parser')

    table_bodies = soup.find_all('tbody')
    try:
        table_data = table_bodies[table_index]
    except IndexError:
        raise IndexError(
            f"no <tbody> at index {table_index} in {url} "
            f"(found {len(table_bodies)})"
        ) from None

    table_rows = table_data.find_all('tr')
    if not table_rows:
        raise IndexError(f"<tbody> at index {table_index} in {url} has no rows")

    # Only element cells are considered, so whitespace text nodes between
    # cells cannot break header extraction.
    headers = [
        cell.get('data-stat')
        for cell in table_rows[0].find_all(['th', 'td'], recursive=False)
    ]

    rows = []
    for row in table_rows:
        row_values = []
        for cell in row.find_all('td'):
            appended_id = cell.get('data-append-csv')
            if appended_id:
                # NOTE(review): presumably the first header cell ('ranker') is
                # a <th> that this loop skips, so the inserted id lands in
                # that column - hence the rename to 'player_id' below.
                # Confirm against the page markup.
                row_values.append(appended_id)
            row_values.append(cell.get_text())
        rows.append(row_values)

    df = pd.DataFrame(rows, columns=headers).fillna(0)
    # Rows that produced no values were filled with 0 above; drop them.
    df = df.loc[df['player'] != 0].reset_index(drop=True)

    # Strip thousands separators so the minutes column can become numeric.
    minutes_col = next(
        (name for name in ('minutes', 'gk_minutes') if name in headers), None
    )
    if minutes_col is not None:
        df[minutes_col] = df[minutes_col].str.replace(',', '', regex=False)

    if 'age' in df.columns:
        df['age'] = df['age'].map(_years_only)

    df = df.rename(columns={'ranker': 'player_id'})
    # Explicit try/except replaces the deprecated errors='ignore' option.
    df = df.apply(_coerce_numeric)

    return df

In [20]:
# Accepted aliases (lower-case, hyphenated) -> (league code, league name used in the URL).
_COMPETITIONS = {
    'england': (9, "Premier-League"),
    'premier-league': (9, "Premier-League"),
    'eng': (9, "Premier-League"),
    'spain': (12, "La-Liga"),
    'la-liga': (12, "La-Liga"),
    'esp': (12, "La-Liga"),
    'italy': (11, "Serie-A"),
    'serie-a': (11, "Serie-A"),
    'ita': (11, "Serie-A"),
    'germany': (20, "Bundesliga"),
    'bundesliga': (20, "Bundesliga"),
    'ger': (20, "Bundesliga"),
    'france': (13, "Ligue-1"),
    'ligue-1': (13, "Ligue-1"),
    'fra': (13, "Ligue-1"),
}


def check_comp(comp):
    """Resolve a competition alias to its league code and league name.

    Matching is case-insensitive, and underscores and hyphens are treated as
    the same separator, so ``"premier_league"``, ``"Premier-League"``,
    ``"serie-a"`` and ``"serie_a"`` are all accepted.

    Parameters
    ----------
    comp : str
        Country name, league name or three-letter code, e.g. ``"england"``,
        ``"la_liga"`` or ``"ger"``.

    Returns
    -------
    tuple[int, str]
        ``(league_code, league_name)``, e.g. ``(9, "Premier-League")``.

    Raises
    ------
    ValueError
        If ``comp`` is not a known alias.
    """
    key = comp.lower().replace('_', '-')
    try:
        return _COMPETITIONS[key]
    except KeyError:
        known = ", ".join(sorted(_COMPETITIONS))
        raise ValueError(
            f"Unknown competition {comp!r}; expected one of: {known}"
        ) from None

In [21]:
def get_data(competition, year, data_type):
    """Fetch one season of player statistics for a competition.

    ``competition`` is resolved through ``check_comp`` into a league code and
    league name, the season is written as ``<year>-<year + 1>``, and the
    resulting fbref.com URL is handed to ``fetch_player_data``.

    Parameters
    ----------
    competition : str
        Competition alias understood by ``check_comp``.
    year : int
        Year in which the season starts.
    data_type : str
        Statistics page to request; inserted into the URL path as-is
        (e.g. ``"keepers"``).

    Returns
    -------
    The DataFrame produced by ``fetch_player_data`` for the built URL.
    """
    league_code, league_name = check_comp(competition)
    season = "-".join((str(year), str(year + 1)))
    url = (
        f"https://fbref.com/en/comps/{league_code}/{season}/{data_type}/"
        f"{season}-{league_name}-Stats"
    )
    return fetch_player_data(url)

In [26]:
# Goalkeeper statistics for the season starting in 2022; "eng" is resolved by check_comp.
# NOTE(review): the saved output of the display cell lists clubs such as
# Hoffenheim and Wolfsburg, which does not look like the competition requested
# here - it may be stale. Restart the kernel and run all cells to confirm.
df = get_data(competition="eng", year=2022, data_type="keepers")

In [27]:
# Show a bounded preview rather than dumping the whole frame into the notebook.
df.head(10)

Unnamed: 0,player_id,player,nationality,position,team,age,birth_year,gk_games,gk_games_starts,gk_minutes,...,gk_ties,gk_losses,gk_clean_sheets,gk_clean_sheets_pct,gk_pens_att,gk_pens_allowed,gk_pens_saved,gk_pens_missed,gk_pens_save_pct,matches
0,47064058,Oliver Baumann,de GER,GK,Hoffenheim,32,1990,34,34,3060,...,6,18,5,14.7,3,2,0,1,0.0,Matches
1,663e080e,Janis Blaswich,de GER,GK,RB Leipzig,31,1991,26,26,2340,...,4,6,7,26.9,7,6,1,0,14.3,Matches
2,d28339cb,Fabian Bredlow,de GER,GK,Stuttgart,27,1995,15,15,1350,...,5,6,1,6.7,5,5,0,0,0.0,Matches
3,db401046,Koen Casteels,be BEL,GK,Wolfsburg,30,1992,34,34,3060,...,10,11,12,35.3,7,4,1,2,20.0,Matches
4,af104654,Oliver Christensen,dk DEN,GK,Hertha BSC,23,1999,33,33,2970,...,8,19,3,9.1,6,5,1,0,16.7,Matches
5,dee3013b,Finn Dahmen,de GER,GK,Mainz 05,24,1998,8,8,720,...,2,3,0,0.0,3,1,2,0,66.7,Matches
6,3f6c3ae0,Tjark Ernst,de GER,GK,Hertha BSC,19,2003,1,1,90,...,0,0,0,0.0,0,0,0,0,,Matches
7,1e43fad8,Ralf Fährmann,de GER,GK,Schalke 04,33,1988,12,12,1025,...,6,3,5,41.7,2,1,0,1,0.0,Matches
8,a92ab7be,Mark Flekken,nl NED,GK,Freiburg,29,1993,34,34,3060,...,8,9,13,38.2,6,5,0,1,0.0,Matches
9,17376543,Rafał Gikiewicz,pl POL,GK,Augsburg,34,1987,23,23,2070,...,3,12,5,21.7,4,1,2,1,66.7,Matches
