### Format for web-scraping Premier League match data

In [250]:
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [251]:
# Download the Premier League overview page and isolate its first stats table.
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"
data = requests.get(standings_url, timeout=30)  # bound the wait instead of hanging on a stalled connection
data.raise_for_status()  # fail here on an HTTP error page rather than with an IndexError below
# Name the parser explicitly so the result does not depend on whichever parser
# happens to be installed (and to avoid bs4's GuessedAtParserWarning).
soup = BeautifulSoup(data.text, "html.parser")
standings_table = soup.select('table.stats_table')[0]  # first <table> carrying the "stats_table" CSS class

In [252]:
# Build absolute club-page URLs from the anchors inside the standings table.
anchor_hrefs = [anchor.get("href") for anchor in standings_table.find_all('a')]
# Anchors without an href yield None; skip them so the substring test cannot raise TypeError.
links = [href for href in anchor_hrefs if href and '/squads/' in href]  # keep only club ("/squads/") pages
team_urls = [f"https://fbref.com{href}" for href in links]  # site-relative -> absolute
team_url = team_urls[0]  # first club page link
data = requests.get(team_url, timeout=30)  # get data from club page
data.raise_for_status()  # stop on an HTTP error instead of parsing an error page later

In [253]:
# Scores and fixtures dataframe for the club.
# read_html is given a file-like object: passing a literal HTML string is
# deprecated in recent pandas releases.
matches = pd.read_html(StringIO(data.text), match="Scores & Fixtures")[0]
# Keep only the columns used downstream.
matches = matches[['Date', 'Time', 'Comp', 'Round', 'Day', 'Venue', 'Result', 'GF', 'GA', 'Opponent', 'Attendance', 'Captain', 'Formation','Referee']]

In [254]:
# Find the club's "all competitions" match-log page for each stat category and
# load the corresponding table.
#
# The links are looked up per category. De-duplicating through a set and then
# indexing positionally is unsafe because set iteration order for strings is
# not stable between interpreter sessions, so links[0] is not reliably "passing".
# `data` still holds the club-page response fetched above, so it is reused
# rather than downloaded a second time.
soup = BeautifulSoup(data.text, "html.parser")  # explicit parser, independent of what is installed
page_hrefs = [anchor.get("href") for anchor in soup.find_all('a')]  # every link target on the club page

stat_categories = ('passing', 'possession', 'shooting', 'defense')
stat_links = {}
for category in stat_categories:
    candidates = [href for href in page_hrefs if href and f'all_comps/{category}/' in href]
    if not candidates:
        raise ValueError(f"No all_comps/{category}/ link found on {team_url}")
    stat_links[category] = candidates[0]

# Kept for compatibility with the previous positional layout, now in a fixed
# order: passing, possession, shooting, defense.
links = [stat_links[category] for category in stat_categories]


def _fetch_stat_table(href, table_title):
    """Download one match-log page and parse the table matching ``table_title``.

    Parameters
    ----------
    href : str
        Site-relative link to the match-log page.
    table_title : str
        Text passed to ``pd.read_html(match=...)`` to select the table.

    Returns
    -------
    tuple
        ``(response, dataframe)``: the raw HTTP response and the first
        matching table.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If no table on the page matches ``table_title`` (raised by pandas).
    """
    response = requests.get(f"https://fbref.com{href}", timeout=30)
    response.raise_for_status()
    # StringIO wrapper: literal HTML strings are deprecated as read_html input.
    return response, pd.read_html(StringIO(response.text), match=table_title)[0]


data_passing, passing = _fetch_stat_table(stat_links['passing'], "Passing")
data_possession, possession = _fetch_stat_table(stat_links['possession'], "Possession")
data_shooting, shooting = _fetch_stat_table(stat_links['shooting'], "Shooting")
data_defense, defense = _fetch_stat_table(stat_links['defense'], "Defensive Actions")


In [255]:
# Clear out unnecessary top-level (level 0) column group names so that only the
# meaningful groups remain as prefixes once the headers are flattened.
def _blank_placeholder_groups(df):
    """Return ``df`` with placeholder level-0 column groups renamed to ''.

    A group is treated as a placeholder when its name starts with ``"For "``
    or ``"Unnamed"``; every other group name is kept unchanged. Deriving the
    mapping from the names themselves avoids a hand-maintained positional list
    that would silently mislabel columns if a group were added or removed.

    NOTE(review): assumes the per-team header group is spelled "For <team>"
    and that pandas labels header-less groups "Unnamed: ..." - confirm against
    the scraped tables.
    """
    mapping = {
        group: '' if str(group).startswith(('For ', 'Unnamed')) else group
        for group in df.columns.levels[0]
    }
    return df.rename(columns=mapping, level=0)


shooting = _blank_placeholder_groups(shooting)
possession = _blank_placeholder_groups(possession)
passing = _blank_placeholder_groups(passing)
defense = _blank_placeholder_groups(defense)

In [256]:


# Flatten two-level headers to "group_name" strings and normalise every column
# label to lower-case snake-ish form. The frames are modified in place.
dataframes = {
    'matches': matches,
    'shooting': shooting,
    'possession': possession,
    'passing': passing,
    'defense': defense
}

for key, df in dataframes.items():
    # Only join genuine two-level headers. Joining an already-flat string label
    # would insert "_" between its characters ("date" -> "d_a_t_e"), so checking
    # the index type keeps this cell safe to re-run.
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = ['_'.join(col).strip() for col in df.columns.values]
    df.columns = (
        df.columns.str.lstrip("_")             # drop the separator left by a blank group name
        .str.replace(' ', '_', regex=False)    # literal replacements, not regex patterns
        .str.replace('-', '_', regex=False)
        .str.replace(':', '_', regex=False)
        .str.lower()
    )


In [257]:
# Remove the 'match_report' column from each stat table; every name is rebound
# to a new frame without that column.
shooting = shooting.drop(labels='match_report', axis='columns')
possession = possession.drop(labels='match_report', axis='columns')
passing = passing.drop(labels='match_report', axis='columns')
defense = defense.drop(labels='match_report', axis='columns')

In [258]:
# Combine the fixtures table with every stat table, joining on the columns
# listed in merge_keys.
merge_keys = ['date', 'time', 'comp', 'round', 'day', 'venue', 'result', 'gf', 'ga', 'opponent']
dataframes = {
    'shooting': shooting,
    'possession': possession,
    'passing': passing,
    'defense': defense,
}
team_data = matches.copy()  # work on a copy so `matches` itself is left untouched
for df in dataframes.values():
    # merge() uses its default join type here (inner): rows without a
    # counterpart in `df` are dropped.
    team_data = team_data.merge(df, on=merge_keys)



In [262]:
# Show only the rows whose competition is the Premier League.
team_data.loc[team_data['comp'].eq('Premier League')]

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,pressures_mid_3rd,pressures_att_3rd,blocks_blocks,blocks_sh,blocks_shsv,blocks_pass,int,tkl+int,clr,err
0,2022-08-05,20:00,Premier League,Matchweek 1,Fri,Away,W,2.0,0.0,Crystal Palace,...,71,42,21,6,0,15,17,,20,0
1,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4.0,2.0,Leicester City,...,37,42,15,1,1,14,14,,13,0
2,2022-08-20,17:30,Premier League,Matchweek 3,Sat,Away,W,3.0,0.0,Bournemouth,...,51,31,12,0,0,12,17,,18,1
3,2022-08-27,17:30,Premier League,Matchweek 4,Sat,Home,W,2.0,1.0,Fulham,...,42,25,12,2,0,10,15,,8,1
