In [2]:
import time
from io import StringIO

import pandas as pd
import plotly.express as px
import requests
from bs4 import BeautifulSoup

In [1]:
# Scrape configuration -- run this cell before the scraping cell.
# standings_url: league-table page the scraping loop requests first.
# all_matches:   collects one DataFrame per team as the loop runs.
# years:         season labels, newest first; the loop visits one standings
#                page per entry and prints the label as it goes.
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"
all_matches = list()
years = [season for season in range(2022, 2021, -1)]
print(years)

[2022]


In [4]:
# ---------------------------------------------------------------------------
# Scrape per-team match logs (fixtures + shooting / possession / passing /
# defense tables), merge them per team and stack everything into `match_df`.
#
# Reads:   years, standings_url, all_matches  (defined in the config cell)
# Writes:  all_matches (refilled on every run), match_df
# ---------------------------------------------------------------------------

BASE_URL = "https://fbref.com"
REQUEST_DELAY_SECONDS = 1     # pause after every request to stay polite to the server
REQUEST_TIMEOUT_SECONDS = 30  # never hang forever on a stalled connection

# Columns shared by every per-match table; used as the merge key.
MATCH_KEYS = ['date', 'time', 'comp', 'round', 'day', 'venue', 'result', 'gf', 'ga', 'opponent']

# Columns kept from the "Scores & Fixtures" table.
MATCH_COLUMNS = ['Date', 'Time', 'Comp', 'Round', 'Day', 'Venue', 'Result', 'GF', 'GA',
                 'Opponent', 'Attendance', 'Captain', 'Formation', 'Referee']

# URL fragment / column prefix -> caption text used to locate the table on its page.
STAT_TABLES = {
    'shooting': 'Shooting',
    'possession': 'Possession',
    'passing': 'Passing',
    'defense': 'Defensive Actions',
}


def _fetch_html(url):
    """Download `url` and return the response body as text.

    Raises requests.HTTPError on a non-2xx status so a blocked or throttled
    request fails here with a clear message instead of surfacing later as a
    confusing parse error. Sleeps after every request, so all pages (not just
    one per team) are throttled.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    response.raise_for_status()
    time.sleep(REQUEST_DELAY_SECONDS)
    return response.text


def _blank_placeholder_header(level_name):
    """Map placeholder top-level headers to '' and keep real group names.

    NOTE(review): assumes the only placeholder headers are fbref's
    "For <team>" group and pandas' auto-generated "Unnamed: ..." labels --
    confirm against a live page. Matching by name (rather than by position in
    the sorted level list) keeps the mapping correct when the set of header
    groups differs between seasons.
    """
    name = str(level_name)
    return '' if name.startswith(('For ', 'Unnamed:')) else name


def _tidy_columns(frame, prefix):
    """Return a copy of `frame` with normalised, prefixed column names.

    Two-level headers are flattened to "<group>_<column>" (placeholder groups
    dropped); names are lower-cased with ' ', '-' and ':' replaced by '_'.
    Every column that is not one of MATCH_KEYS gets "<prefix>_" prepended, and
    the key columns come first in the result.

    Raises KeyError if any MATCH_KEYS column is missing after normalisation.
    """
    if isinstance(frame.columns, pd.MultiIndex):
        frame = frame.rename(columns=_blank_placeholder_header, level=0)
        labels = pd.Index(['_'.join(col).strip() for col in frame.columns.values])
    else:
        labels = frame.columns

    labels = (
        labels.str.lstrip('_')
        .str.replace(' ', '_', regex=False)
        .str.replace('-', '_', regex=False)
        .str.replace(':', '_', regex=False)
        .str.lower()
    )

    tidy = frame.copy()  # never rename the caller's frame in place
    tidy.columns = labels
    return tidy.set_index(MATCH_KEYS).add_prefix(f"{prefix}_").reset_index()


def _find_stat_paths(team_html):
    """Return {stat key: relative match-log URL} for the links present on a team page.

    Links are de-duplicated and visited in sorted order, so when several links
    match the same key the winner is deterministic.
    """
    team_soup = BeautifulSoup(team_html, "html.parser")
    hrefs = {anchor.get("href") for anchor in team_soup.find_all('a')}
    paths = {}
    for href in sorted(h for h in hrefs if h):
        for key in STAT_TABLES:
            if f"all_comps/{key}/" in href:
                paths[key] = href
    return paths


# Make the cell idempotent: start from a clean result list and walk back from
# the configured page through a local variable, so a re-run (or a retry after
# a failure) scrapes the same seasons again instead of drifting backwards and
# duplicating rows.
all_matches.clear()
season_url = standings_url

for year in years:
    if season_url is None:
        print(f"No earlier season is linked from the last page; stopping before {year}")
        break

    # `year` is only a progress label; the season scraped is whatever page
    # `season_url` currently points to.
    print(year)
    standings_soup = BeautifulSoup(_fetch_html(season_url), "html.parser")
    standings_table = standings_soup.select('table.stats_table')[0]  # first stats table = standings

    # Absolute URLs of every club page linked from the standings table.
    squad_paths = [anchor.get("href") for anchor in standings_table.find_all('a')]
    team_urls = [f"{BASE_URL}{path}" for path in squad_paths if path and '/squads/' in path]

    # Queue the previous season for the next iteration (None when there is none).
    prev_links = standings_soup.select("a.prev")
    season_url = f"{BASE_URL}{prev_links[0].get('href')}" if prev_links else None

    for team_url in team_urls:
        team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")
        team_html = _fetch_html(team_url)

        # Wrap the markup in StringIO: passing literal HTML to read_html is deprecated.
        fixtures = pd.read_html(StringIO(team_html), match="Scores & Fixtures")[0]
        team_data = _tidy_columns(fixtures[MATCH_COLUMNS], 'matches')

        stat_paths = _find_stat_paths(team_html)
        missing = [key for key in STAT_TABLES if key not in stat_paths]
        if missing:
            # Without every table the merged row set would be incomplete.
            print(f"Skipping {team_name}: no match-log link found for {missing}")
            continue

        stat_frames = {}
        for key, table_name in STAT_TABLES.items():
            stat_html = _fetch_html(f"{BASE_URL}{stat_paths[key]}")
            stat_table = pd.read_html(StringIO(stat_html), match=table_name)[0]
            stat_frames[key] = _tidy_columns(stat_table, key)

        try:
            # Inner joins: only matches present in every table survive.
            for key in STAT_TABLES:
                team_data = team_data.merge(stat_frames[key], on=MATCH_KEYS)
        except ValueError as err:
            # Best effort: report the team that could not be merged and move on.
            print(f"Skipping {team_name}: could not merge stats tables ({err})")
            continue

        # .assign returns a new frame, avoiding a SettingWithCopyWarning on the filtered slice.
        team_data = team_data[team_data["comp"] == "Premier League"].assign(team=team_name)
        all_matches.append(team_data)


match_df = pd.concat(all_matches).reset_index(drop=True)
match_df = match_df.sort_values('date', ascending=False)

2022


In [6]:
# Sanity check: (rows, columns) of the combined match frame.
match_df.shape

(760, 103)

In [9]:
# Preview the first rows with the default column limit, so the wide frame is
# shown truncated ("...").
# Use the fully qualified option name: on pandas >= 1.4 the bare pattern
# 'max_columns' also matches 'styler.render.max_columns', so reset_option
# would reset both options and set_option would raise OptionError.
# To inspect every column instead: pd.set_option('display.max_columns', None)
pd.reset_option('display.max_columns')
match_df.head(3)

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,defense_blocks_blocks,defense_blocks_sh,defense_blocks_shsv,defense_blocks_pass,defense_int,defense_tkl+int,defense_clr,defense_err,defense_match_report,team
759,2022-05-22,16:00,Premier League,Matchweek 38,Sun,Home,L,0,5,Tottenham,...,8.0,4.0,1.0,4.0,16.0,,7.0,1.0,Match Report,Norwich City
265,2022-05-22,16:00,Premier League,Matchweek 38,Sun,Away,L,1,3,Brighton,...,21.0,8.0,0.0,13.0,5.0,,19.0,1.0,Match Report,West Ham United
37,2022-05-22,16:00,Premier League,Matchweek 38,Sun,Home,W,3,2,Aston Villa,...,5.0,1.0,0.0,4.0,7.0,,7.0,0.0,Match Report,Manchester City


In [57]:
# Number of entries in the 'description' column of the data dictionary file.
pd.read_csv('dictionary.csv')['description'].size

103

In [63]:
# Data dictionary preview: one row per match_df column, pairing the column
# name with its dtype and the description read from dictionary.csv.
# Only the first five rows are shown; to list every row, raise the row limit
# first with pd.set_option('display.max_rows', None) and drop the .head().
pd.DataFrame(
    {
        'feature': match_df.columns.tolist(),
        'data_type': match_df.dtypes.tolist(),
        'description': pd.read_csv('dictionary.csv')['description'].tolist(),
    }
).head()

Unnamed: 0,feature,data_type,description
0,date,object,Date listed is local to the match
1,time,object,Time listed is local to the match venue (24 ho...
2,comp,object,Competition - Number next to competition state...
3,round,object,Round or Phase of Competition
4,day,object,Day of week


In [68]:
# Scatter of short vs. total completed passes, one marker per team-match,
# coloured by team and sized by goals scored.
# plotly.express (px) is imported in the notebook's import cell.
# NOTE(review): the readable labels assume fbref's abbreviations
# (Cmp = passes completed, Gls = goals, GA = goals against) -- confirm
# against dictionary.csv.
# NOTE(review): marker size is proportional to goals, so matches with zero
# goals are presumably drawn with zero-size markers -- confirm this is intended.
fig = px.scatter(
    match_df,
    x="passing_short_cmp",
    y="passing_total_cmp",
    color='team',
    size='shooting_standard_gls',
    hover_data=['team', 'opponent', 'ga'],
    title="Short vs. total passes completed per Premier League match",
    labels={
        'passing_short_cmp': 'Short passes completed',
        'passing_total_cmp': 'Total passes completed',
        'shooting_standard_gls': 'Goals scored',
        'ga': 'Goals against',
        'team': 'Team',
        'opponent': 'Opponent',
    },
)
fig.show()