In [45]:
import time
from io import StringIO

import pandas as pd
import plotly.express as px
import requests
from bs4 import BeautifulSoup

In [51]:
# Seasons to walk through, newest first (one loop iteration per entry).
years = [2022 - offset for offset in range(4)]
print(years)

# Standings page the scrape starts from.
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

# Accumulator for the per-team DataFrames produced by the scraping cell.
all_matches = []

[2022, 2021, 2020, 2019]


In [52]:
# Scrape one standings page per entry in `years`, then every club page linked
# from it, and append one merged per-team DataFrame to `all_matches`.
#
# State read from earlier cells: `years`, `standings_url`, `all_matches`.
# State written: `standings_url` is advanced to the previous season after each
# standings page is parsed, and `all_matches` grows by one frame per team.
# NOTE: because of that, re-running this cell without re-running the setup
# cell continues from the last season reached and appends duplicates.

BASE_URL = "https://fbref.com"
REQUEST_PAUSE_SECONDS = 1   # pause after every HTTP request
REQUEST_TIMEOUT_SECONDS = 30

# Columns kept from the "Scores & Fixtures" table.
MATCH_COLUMNS = ['Date', 'Time', 'Comp', 'Round', 'Day', 'Venue', 'Result', 'GF', 'GA',
                 'Opponent', 'Attendance', 'Captain', 'Formation', 'Referee']

# Columns (after normalisation) that identify a match in every table.
MERGE_KEYS = ['date', 'time', 'comp', 'round', 'day', 'venue', 'result', 'gf', 'ga', 'opponent']

# URL category -> table caption handed to pd.read_html(match=...).
# The order here is the order in which the tables are merged.
STAT_TABLES = {
    'shooting': 'Shooting',
    'possession': 'Possession',
    'passing': 'Passing',
    'defense': 'Defensive Actions',
}


def fetch_html(url):
    """Download `url` and return the response body as text.

    Raises requests.HTTPError on a 4xx/5xx answer instead of handing an error
    page to the HTML parsers, and pauses afterwards so that the bursts of
    stat-page requests are throttled as well as the team-page requests.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    response.raise_for_status()
    time.sleep(REQUEST_PAUSE_SECONDS)
    return response.text


def blank_helper_header_groups(table):
    """Blank the top-level header labels that carry no feature name.

    Two kinds of level-0 labels are replaced by '':
      * pandas' auto-generated "Unnamed: ..." labels for empty header cells;
      * the group that contains the 'Date' column, i.e. the group holding the
        match-identifying columns (presumably labelled "For <team>" on the
        pages this notebook was written against - TODO confirm).
    Every other label (e.g. 'Standard', 'Expected', 'Touches') is kept as is.
    This replaces a positional zip against the *sorted* level values, which
    mislabels columns as soon as a header group is renamed, added or removed.
    """
    if not isinstance(table.columns, pd.MultiIndex):
        return table
    key_groups = {col[0] for col in table.columns if col[1] == 'Date'}

    def relabel(top):
        if top in key_groups or str(top).startswith('Unnamed'):
            return ''
        return top

    return table.rename(columns=relabel, level=0)


def normalise_columns(table):
    """Return a copy of `table` with flat, lower-case, underscore-separated column names."""
    out = table.copy()
    if isinstance(out.columns, pd.MultiIndex):
        # Two-level header -> "<group>_<column>".
        out.columns = ['_'.join(col).strip() for col in out.columns.values]
    out.columns = (
        out.columns
        .str.lstrip("_")                      # left over from blanked groups
        .str.replace(' ', '_', regex=False)
        .str.replace('-', '_', regex=False)
        .str.replace(':', '_', regex=False)
        .str.lower()
    )
    return out


for year in years:  # one standings page per requested season
    season_soup = BeautifulSoup(fetch_html(standings_url), "html.parser")
    standings_table = season_soup.select('table.stats_table')[0]  # first stats table = standings

    # Absolute URLs of every club page linked from the standings table.
    standings_hrefs = [a.get("href") for a in standings_table.find_all('a')]
    team_urls = [f"{BASE_URL}{href}" for href in standings_hrefs if href and '/squads/' in href]

    # Point the next iteration at the previous season, if the page links to one.
    previous_links = season_soup.select("a.prev")
    if previous_links:
        previous_season = previous_links[0].get("href")
        standings_url = f"{BASE_URL}{previous_season}"

    for team_url in team_urls:
        team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")
        print(team_name)
        team_html = fetch_html(team_url)

        # Scores & Fixtures table of the club, restricted to the columns we keep.
        matches = pd.read_html(StringIO(team_html), match="Scores & Fixtures")[0]
        matches = normalise_columns(matches[MATCH_COLUMNS])

        team_soup = BeautifulSoup(team_html, "html.parser")
        team_hrefs = [a.get("href") for a in team_soup.find_all('a')]

        # Fetch the four all-competitions stat tables. They are collected in a
        # fresh dict per team so that a missing link can never silently reuse
        # the previous team's table.
        stat_frames = {}
        for category, caption in STAT_TABLES.items():
            fragment = f"all_comps/{category}/"
            link = next((href for href in team_hrefs if href and fragment in href), None)
            if link is None:
                break
            stat_html = fetch_html(f"{BASE_URL}{link}")
            raw_table = pd.read_html(StringIO(stat_html), match=caption)[0]
            cleaned = normalise_columns(blank_helper_header_groups(raw_table))
            stat_frames[category] = cleaned.drop(columns='match_report')

        if len(stat_frames) != len(STAT_TABLES):
            missing = [name for name in STAT_TABLES if name not in stat_frames]
            print(f"  skipping {team_name}: no link found for {missing[0]} stats")
            continue

        try:
            team_data = matches.copy()
            for stat_table in stat_frames.values():
                team_data = team_data.merge(stat_table, on=MERGE_KEYS)
        except ValueError as err:
            # Best effort, as before: a team whose tables cannot be merged is
            # left out - but say so instead of dropping it silently.
            print(f"  skipping {team_name}: could not merge stat tables ({err})")
            continue

        # .copy() so the column assignment below does not write into a slice.
        team_data = team_data[team_data["comp"] == "Premier League"].copy()
        team_data["team"] = team_name
        all_matches.append(team_data)

    if not previous_links:
        # No older season to move to; stop rather than re-scrape this page.
        break

Arsenal
Manchester City
Tottenham Hotspur
Brighton and Hove Albion
Leeds United
Chelsea
Newcastle United
Manchester United
Liverpool
Brentford
Fulham
Crystal Palace
Southampton
Nottingham Forest
Aston Villa
West Ham United
Bournemouth
Everton
Wolverhampton Wanderers
Leicester City
Manchester City
Liverpool
Chelsea
Tottenham Hotspur
Arsenal
Manchester United
West Ham United
Leicester City
Brighton and Hove Albion
Wolverhampton Wanderers
Newcastle United
Crystal Palace
Brentford
Aston Villa
Southampton
Everton
Leeds United
Burnley
Watford
Norwich City
Manchester City
Manchester United
Liverpool
Chelsea
Leicester City
West Ham United
Tottenham Hotspur
Arsenal
Leeds United
Everton
Aston Villa
Newcastle United
Wolverhampton Wanderers
Crystal Palace
Southampton
Brighton and Hove Albion
Burnley
Fulham
West Bromwich Albion
Sheffield United
Liverpool
Manchester City
Manchester United
Chelsea
Leicester City
Tottenham Hotspur
Wolverhampton Wanderers
Arsenal
Sheffield United
Burnley
Southampton
Ev

In [53]:
# Stack every per-team frame into one table with a fresh 0..n-1 index,
# then order the rows from the latest date to the earliest.
match_df = (
    pd.concat(all_matches, ignore_index=True)
    .sort_values(by='date', ascending=False)
)

In [54]:
# Display the (rows, columns) size of the combined match table.
match_df.shape

(2360, 99)

In [56]:
# Display the column labels of the combined match table.
match_df.columns

Index(['date', 'time', 'comp', 'round', 'day', 'venue', 'result', 'gf', 'ga',
       'opponent', 'attendance', 'captain', 'formation', 'referee',
       'standard_gls', 'standard_sh', 'standard_sot', 'standard_sot%',
       'standard_g/sh', 'standard_g/sot', 'standard_dist', 'standard_fk',
       'standard_pk', 'standard_pkatt', 'expected_xg', 'expected_npxg',
       'expected_npxg/sh', 'expected_g_xg', 'expected_np_g_xg', 'poss',
       'touches_touches', 'touches_def_pen', 'touches_def_3rd',
       'touches_mid_3rd', 'touches_att_3rd', 'touches_att_pen', 'touches_live',
       'dribbles_succ', 'dribbles_att', 'dribbles_succ%', 'dribbles_#pl',
       'dribbles_megs', 'carries_carries', 'carries_totdist',
       'carries_prgdist', 'carries_prog', 'carries_1/3', 'carries_cpa',
       'carries_mis', 'carries_dis', 'receiving_targ', 'receiving_rec',
       'receiving_rec%', 'receiving_prog', 'total_cmp', 'total_att',
       'total_cmp%', 'total_totdist', 'total_prgdist', 'short_cmp',
    

In [67]:
# Scatter of goals scored against attendance, one point per row of `match_df`,
# coloured by the match result. `px` comes from the notebook's import cell.
# The title and labels make the figure readable on its own; the plotted data
# and hover fields are unchanged.
fig = px.scatter(
    match_df,
    x="attendance",
    y="gf",
    color='result',
    hover_data=['opponent', 'ga'],
    title="Goals scored vs. attendance, coloured by result",
    labels={
        "attendance": "Attendance",
        "gf": "Goals for (GF)",
        "ga": "Goals against (GA)",
        "result": "Result",
        "opponent": "Opponent",
    },
)
fig.show()