In [1]:

import requests

In [2]:
# Define URL to scrape
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

In [3]:
# Download page using get method, make request to server to download page HTML
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
data = requests.get(standings_url, headers=headers)

In [4]:
# Look at HTML string
data.text



In [5]:
# Import BeautifulSoup library to parse HTML
from bs4 import BeautifulSoup
import time

In [6]:
# Check the response status code
if data.status_code == 200:
    print("Request successful")
    # Initialize soup object using HTML
    soup = BeautifulSoup(data.text, 'html.parser')
    
    # Select the stats table with CSS Selector
    try:
        standings_table = soup.select('table.stats_table')[0]
        print("Found the stats table")
    except IndexError:
        print("Stats table not found")
else:
    # Print the HTTP status code if it's not 200
    print(f"Failed to retrieve the page. Status code: {data.status_code}")

# Add a delay to avoid too many requests
time.sleep(60)


Request successful
Found the stats table


In [7]:
# Check stats table HTML
standings_table

<table class="stats_table sortable min_width force_mobilize" data-cols-to-freeze=",2" id="results2024-202591_overall"> <caption>Premier League Table</caption> <colgroup><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/></colgroup> <thead> <tr> <th aria-label="Rank" class="poptip sort_default_asc center" data-stat="rank" data-tip="&lt;strong&gt;Rank&lt;/strong&gt;&lt;br&gt;Squad finish in competition&lt;br&gt;Finish within the league or competition.&lt;br&gt;For knockout competitions may show final round reached.&lt;br&gt;Colors and arrows represent promotion/relegation or qualifiation for continental cups.&lt;br&gt;Trophy indicates team won league whether by playoffs or by leading the table.&lt;br&gt;Star indicates topped table in league USING another means of naming champion." scope="col">Rk</th> <th aria-label="Squad" class="poptip sort_default_asc center" data-stat="team" scope="col">Squad</th> <th aria-label="Mat

In [8]:
# Find all a tags in our table
links = standings_table.find_all('a')

In [9]:
# Get href property of each link - goes through each a element and find value of href property
links = [l.get("href") for l in links]

In [10]:
# Filter links so we only have squad links
links = [l for l in links if '/squads/' in l]

In [11]:
links

['/en/squads/822bd0ba/Liverpool-Stats',
 '/en/squads/18bb7c10/Arsenal-Stats',
 '/en/squads/e4a775cb/Nottingham-Forest-Stats',
 '/en/squads/b8fd03ef/Manchester-City-Stats',
 '/en/squads/b2b47a98/Newcastle-United-Stats',
 '/en/squads/cff3d9bb/Chelsea-Stats',
 '/en/squads/4ba7cbea/Bournemouth-Stats',
 '/en/squads/8602292d/Aston-Villa-Stats',
 '/en/squads/d07537b9/Brighton-and-Hove-Albion-Stats',
 '/en/squads/fd962109/Fulham-Stats',
 '/en/squads/cd051869/Brentford-Stats',
 '/en/squads/19538871/Manchester-United-Stats',
 '/en/squads/47c64c55/Crystal-Palace-Stats',
 '/en/squads/7c21e445/West-Ham-United-Stats',
 '/en/squads/361ca564/Tottenham-Hotspur-Stats',
 '/en/squads/d3fd31cc/Everton-Stats',
 '/en/squads/a2d435b3/Leicester-City-Stats',
 '/en/squads/8cec06e1/Wolverhampton-Wanderers-Stats',
 '/en/squads/b74092de/Ipswich-Town-Stats',
 '/en/squads/33c895d4/Southampton-Stats']

In [12]:
# Turn links into full urls using Python format string
team_urls = [f"https://fbref.com{l}" for l in links]

In [13]:
team_urls

['https://fbref.com/en/squads/822bd0ba/Liverpool-Stats',
 'https://fbref.com/en/squads/18bb7c10/Arsenal-Stats',
 'https://fbref.com/en/squads/e4a775cb/Nottingham-Forest-Stats',
 'https://fbref.com/en/squads/b8fd03ef/Manchester-City-Stats',
 'https://fbref.com/en/squads/b2b47a98/Newcastle-United-Stats',
 'https://fbref.com/en/squads/cff3d9bb/Chelsea-Stats',
 'https://fbref.com/en/squads/4ba7cbea/Bournemouth-Stats',
 'https://fbref.com/en/squads/8602292d/Aston-Villa-Stats',
 'https://fbref.com/en/squads/d07537b9/Brighton-and-Hove-Albion-Stats',
 'https://fbref.com/en/squads/fd962109/Fulham-Stats',
 'https://fbref.com/en/squads/cd051869/Brentford-Stats',
 'https://fbref.com/en/squads/19538871/Manchester-United-Stats',
 'https://fbref.com/en/squads/47c64c55/Crystal-Palace-Stats',
 'https://fbref.com/en/squads/7c21e445/West-Ham-United-Stats',
 'https://fbref.com/en/squads/361ca564/Tottenham-Hotspur-Stats',
 'https://fbref.com/en/squads/d3fd31cc/Everton-Stats',
 'https://fbref.com/en/squads/

In [14]:
# Work with Chelsea url
team_url = team_urls[5]

In [15]:
# Use requests to get HTML from that URL
data = requests.get(team_url)
time.sleep(60)

In [16]:
import pandas as pd

In [17]:
from io import StringIO

In [18]:
html_data = StringIO(data.text)

In [19]:
# Convert Scores & Fixtures table to dataframe using string
matches = pd.read_html(html_data, match="Scores & Fixtures")

In [20]:
matches

[          Date   Time            Comp                Round  Day Venue Result  \
 0   2024-08-18  16:30  Premier League          Matchweek 1  Sun  Home      L   
 1   2024-08-22  20:00         Conf Lg       Play-off round  Thu  Home      W   
 2   2024-08-25  14:00  Premier League          Matchweek 2  Sun  Away      W   
 3   2024-08-29  20:30         Conf Lg       Play-off round  Thu  Away      L   
 4   2024-09-01  13:30  Premier League          Matchweek 3  Sun  Home      D   
 5   2024-09-14  20:00  Premier League          Matchweek 4  Sat  Away      W   
 6   2024-09-21  12:30  Premier League          Matchweek 5  Sat  Away      W   
 7   2024-09-24  19:45         EFL Cup          Third round  Tue  Home      W   
 8   2024-09-28  15:00  Premier League          Matchweek 6  Sat  Home      W   
 9   2024-10-03  20:00         Conf Lg         League phase  Thu  Home      W   
 10  2024-10-06  14:00  Premier League          Matchweek 7  Sun  Home      D   
 11  2024-10-20  16:30  Prem

In [21]:
matches[0].head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Opp Formation,Referee,Match Report,Notes
0,2024-08-18,16:30,Premier League,Matchweek 1,Sun,Home,L,0.0,2.0,Manchester City,1.0,0.8,48.0,39818.0,Enzo Fernández,4-2-3-1,3-2-4-1,Anthony Taylor,Match Report,
1,2024-08-22,20:00,Conf Lg,Play-off round,Thu,Home,W,2.0,0.0,ch Servette FC,,,,37902.0,Moisés Caicedo,4-3-3,4-2-3-1,Jérémie Pignard,Match Report,Leg 1 of 2
2,2024-08-25,14:00,Premier League,Matchweek 2,Sun,Away,W,6.0,2.0,Wolves,1.6,1.9,60.0,31235.0,Enzo Fernández,4-2-3-1,4-2-3-1,Darren England,Match Report,
3,2024-08-29,20:30,Conf Lg,Play-off round,Thu,Away,L,1.0,2.0,ch Servette FC,,,,28000.0,Enzo Fernández,4-2-3-1,4-4-1-1,Marco Di Bello,Match Report,Leg 2 of 2; Chelsea won
4,2024-09-01,13:30,Premier League,Matchweek 3,Sun,Home,D,1.0,1.0,Crystal Palace,2.4,0.5,62.0,39298.0,Enzo Fernández,4-2-3-1,3-4-3,Jarred Gillett,Match Report,


In [22]:
# Retrieve Chelsea shooting stats with same method
# Initialize soup object using our HTML 
soup = BeautifulSoup(data.text)

In [23]:
# Find all a tags in our table
links = soup.find_all('a')

In [24]:
# Get href property of each link - goes through each a element and find value of href property
links = [l.get("href") for l in links]

In [25]:
# Filter links so we only have shooting stats links
links = [l for l in links if l and 'all_comps/shooting/' in l]

In [26]:
links

['/en/squads/cff3d9bb/2024-2025/matchlogs/all_comps/shooting/Chelsea-Match-Logs-All-Competitions',
 '/en/squads/cff3d9bb/2024-2025/matchlogs/all_comps/shooting/Chelsea-Match-Logs-All-Competitions',
 '/en/squads/cff3d9bb/2024-2025/matchlogs/all_comps/shooting/Chelsea-Match-Logs-All-Competitions',
 '/en/squads/cff3d9bb/2024-2025/matchlogs/all_comps/shooting/Chelsea-Match-Logs-All-Competitions']

In [27]:
# Download HTML for specific link
data = requests.get(f"https://fbref.com{links[0]}")
time.sleep(60)

In [28]:
# Convert Shooting data to dataframe using string
shooting_data = StringIO(data.text)
shooting = pd.read_html(shooting_data, match="Shooting")[0]

In [29]:
shooting.head()

Unnamed: 0_level_0,For Chelsea,For Chelsea,For Chelsea,For Chelsea,For Chelsea,For Chelsea,For Chelsea,For Chelsea,For Chelsea,For Chelsea,...,Standard,Standard,Standard,Standard,Expected,Expected,Expected,Expected,Expected,Unnamed: 25_level_0
Unnamed: 0_level_1,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Dist,FK,PK,PKatt,xG,npxG,npxG/Sh,G-xG,np:G-xG,Match Report
0,2024-08-18,16:30,Premier League,Matchweek 1,Sun,Home,L,0.0,2.0,Manchester City,...,17.1,0.0,0,0,1.0,1.0,0.1,-1.0,-1.0,Match Report
1,2024-08-22,20:00,Conf Lg,Play-off round,Thu,Home,W,2.0,0.0,ch Servette FC,...,,,1,1,,,,,,Match Report
2,2024-08-25,14:00,Premier League,Matchweek 2,Sun,Away,W,6.0,2.0,Wolves,...,15.4,1.0,0,0,1.6,1.6,0.12,4.4,4.4,Match Report
3,2024-08-29,20:30,Conf Lg,Play-off round,Thu,Away,L,1.0,2.0,ch Servette FC,...,,,1,1,,,,,,Match Report
4,2024-09-01,13:30,Premier League,Matchweek 3,Sun,Home,D,1.0,1.0,Crystal Palace,...,15.9,1.0,0,0,2.4,2.4,0.18,-1.4,-1.4,Match Report


In [30]:
# Drop 1 index level (multi-level index)
shooting.columns = shooting.columns.droplevel()

In [31]:
shooting.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Dist,FK,PK,PKatt,xG,npxG,npxG/Sh,G-xG,np:G-xG,Match Report
0,2024-08-18,16:30,Premier League,Matchweek 1,Sun,Home,L,0.0,2.0,Manchester City,...,17.1,0.0,0,0,1.0,1.0,0.1,-1.0,-1.0,Match Report
1,2024-08-22,20:00,Conf Lg,Play-off round,Thu,Home,W,2.0,0.0,ch Servette FC,...,,,1,1,,,,,,Match Report
2,2024-08-25,14:00,Premier League,Matchweek 2,Sun,Away,W,6.0,2.0,Wolves,...,15.4,1.0,0,0,1.6,1.6,0.12,4.4,4.4,Match Report
3,2024-08-29,20:30,Conf Lg,Play-off round,Thu,Away,L,1.0,2.0,ch Servette FC,...,,,1,1,,,,,,Match Report
4,2024-09-01,13:30,Premier League,Matchweek 3,Sun,Home,D,1.0,1.0,Crystal Palace,...,15.9,1.0,0,0,2.4,2.4,0.18,-1.4,-1.4,Match Report


In [32]:
# Merge matches & shooting dataframes
team_data = matches[0].merge(shooting[["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]], on="Date")

In [33]:
team_data.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Opp Formation,Referee,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt
0,2024-08-18,16:30,Premier League,Matchweek 1,Sun,Home,L,0.0,2.0,Manchester City,...,3-2-4-1,Anthony Taylor,Match Report,,10.0,3.0,17.1,0.0,0,0
1,2024-08-22,20:00,Conf Lg,Play-off round,Thu,Home,W,2.0,0.0,ch Servette FC,...,4-2-3-1,Jérémie Pignard,Match Report,Leg 1 of 2,,,,,1,1
2,2024-08-25,14:00,Premier League,Matchweek 2,Sun,Away,W,6.0,2.0,Wolves,...,4-2-3-1,Darren England,Match Report,,14.0,8.0,15.4,1.0,0,0
3,2024-08-29,20:30,Conf Lg,Play-off round,Thu,Away,L,1.0,2.0,ch Servette FC,...,4-4-1-1,Marco Di Bello,Match Report,Leg 2 of 2; Chelsea won,,,,,1,1
4,2024-09-01,13:30,Premier League,Matchweek 3,Sun,Home,D,1.0,1.0,Crystal Palace,...,3-4-3,Jarred Gillett,Match Report,,13.0,7.0,15.9,1.0,0,0


In [34]:
shooting.shape

(35, 26)

In [35]:
matches[0].shape

(50, 20)

In [36]:
#Set up for loop to scrape multiple teams for multiple years

In [37]:

years = list(range(2024 ,2022, -1))

In [38]:
years

[2024, 2023]

In [39]:
all_matches = []

In [40]:
# Define URL to scrape
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

In [41]:
import time

In [44]:
import time
for year in years:
    data = requests.get(standings_url)
    soup = BeautifulSoup(data.text)
    standings_table = soup.select('table.stats_table')[0]
    # time.sleep(60)

    links = [l.get("href") for l in standings_table.find_all('a')]
    links = [l for l in links if '/squads/' in l]
    team_urls = [f"https://fbref.com{l}" for l in links]
    
    previous_season = soup.select("a.prev")[0].get("href")
    standings_url = f"https://fbref.com{previous_season}"
    
    for team_url in team_urls:
        team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")
        data = requests.get(team_url)
        # time.sleep(60)
        matches_data = StringIO(data.text)
        matches = pd.read_html(matches_data, match="Scores & Fixtures")[0]
        soup = BeautifulSoup(data.text)
        links = [l.get("href") for l in soup.find_all('a')]
        links = [l for l in links if l and 'all_comps/shooting/' in l]
        data = requests.get(f"https://fbref.com{links[0]}")
        # time.sleep(60)
        shooting_data = StringIO(data.text)
        shooting = pd.read_html(shooting_data, match="Shooting")[0]
        shooting.columns = shooting.columns.droplevel()
        try:
            team_data = matches.merge(shooting[["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]], on="Date")
        except ValueError:
            continue
        team_data = team_data[team_data["Comp"] == "Premier League"]
        
        team_data["Season"] = year
        team_data["Team"] = team_name
        all_matches.append(team_data)
        time.sleep(20)
    

In [45]:
len(all_matches)

43

In [46]:
match_df = pd.concat(all_matches)

In [47]:
match_df.columns = [c.lower() for c in match_df.columns]

In [48]:
match_df

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
0,2024-08-17,12:30,Premier League,Matchweek 1,Sat,Away,W,2.0,0.0,Ipswich Town,...,Match Report,,18.0,5.0,14.8,0.0,0,0,2024,Liverpool
1,2024-08-25,16:30,Premier League,Matchweek 2,Sun,Home,W,2.0,0.0,Brentford,...,Match Report,,19.0,8.0,13.6,1.0,0,0,2024,Liverpool
2,2024-09-01,16:00,Premier League,Matchweek 3,Sun,Away,W,3.0,0.0,Manchester Utd,...,Match Report,,11.0,3.0,13.4,0.0,0,0,2024,Liverpool
3,2024-09-14,15:00,Premier League,Matchweek 4,Sat,Home,L,0.0,1.0,Nott'ham Forest,...,Match Report,,14.0,5.0,14.9,0.0,0,0,2024,Liverpool
5,2024-09-21,15:00,Premier League,Matchweek 5,Sat,Home,W,3.0,0.0,Bournemouth,...,Match Report,,19.0,12.0,16.6,0.0,0,0,2024,Liverpool
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38,2022-04-30,15:00,Premier League,Matchweek 35,Sat,Away,L,0,2,Aston Villa,...,Match Report,,9.0,3.0,21.6,0.0,0,0,2023,Norwich City
39,2022-05-08,14:00,Premier League,Matchweek 36,Sun,Home,L,0,4,West Ham,...,Match Report,,8.0,2.0,22.2,1.0,0,0,2023,Norwich City
40,2022-05-11,19:45,Premier League,Matchweek 21,Wed,Away,L,0,3,Leicester City,...,Match Report,,9.0,5.0,17.0,0.0,0,0,2023,Norwich City
41,2022-05-15,14:00,Premier League,Matchweek 37,Sun,Away,D,1,1,Wolves,...,Match Report,,11.0,2.0,14.4,0.0,0,0,2023,Norwich City


In [49]:
match_df.to_csv("matches.csv")