# Webscraping Football (Soccer) Data from [fbref.com](https://fbref.com/en/comps/9/Premier-League-Stats)

In [6]:
from io import StringIO

import requests

In [7]:
# URL of the Premier League standings page on fbref; this is the page downloaded first.
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

In [8]:
# Download the standings page with the requests library.
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() reports an HTTP error (for example a throttling response)
# right here instead of as a confusing parsing failure in a later cell.
data = requests.get(standings_url, timeout=30)
data.raise_for_status()

In [9]:
from bs4 import BeautifulSoup

In [10]:
# Parse the downloaded HTML with BeautifulSoup.
# The parser is named explicitly so the same tree is built on every machine
# (otherwise bs4 picks whichever parser happens to be installed and warns about it).
soup = BeautifulSoup(data.text, "html.parser")

In [11]:
# Every <table class="stats_table"> on the page is selected; the league table is the first one.
stats_tables = soup.select('table.stats_table')
standings_table = stats_tables[0]

In [12]:
# Collect every anchor (<a>) tag inside the standings table; at this point `links` holds tag objects, not URLs.
links = standings_table.find_all('a')

In [13]:
# Get the href attribute of every anchor in the standings table.
# The anchors are read from standings_table rather than from `links` so the cell
# can be re-run: after the first run `links` contains strings, and strings have no .get().
links = [a.get("href") for a in standings_table.find_all('a')]

In [14]:
# Keep only the hrefs that point at squad pages.
# `l and` skips anchors that have no href (.get returns None for those), which
# would otherwise raise TypeError on the `in` test.
links = [l for l in links if l and '/squads/' in l]

In [15]:
links
# The hrefs are site-relative paths (no scheme or host), so they are not complete URLs yet.

['/en/squads/822bd0ba/Liverpool-Stats',
 '/en/squads/b8fd03ef/Manchester-City-Stats',
 '/en/squads/18bb7c10/Arsenal-Stats',
 '/en/squads/8602292d/Aston-Villa-Stats',
 '/en/squads/361ca564/Tottenham-Hotspur-Stats',
 '/en/squads/19538871/Manchester-United-Stats',
 '/en/squads/7c21e445/West-Ham-United-Stats',
 '/en/squads/d07537b9/Brighton-and-Hove-Albion-Stats',
 '/en/squads/b2b47a98/Newcastle-United-Stats',
 '/en/squads/8cec06e1/Wolverhampton-Wanderers-Stats',
 '/en/squads/cff3d9bb/Chelsea-Stats',
 '/en/squads/4ba7cbea/Bournemouth-Stats',
 '/en/squads/fd962109/Fulham-Stats',
 '/en/squads/47c64c55/Crystal-Palace-Stats',
 '/en/squads/cd051869/Brentford-Stats',
 '/en/squads/e4a775cb/Nottingham-Forest-Stats',
 '/en/squads/e297cd13/Luton-Town-Stats',
 '/en/squads/d3fd31cc/Everton-Stats',
 '/en/squads/943e8050/Burnley-Stats',
 '/en/squads/1df6b87e/Sheffield-United-Stats']

In [16]:
# Prefix every relative squad path with the site root to obtain absolute URLs.
team_urls = [f"https://fbref.com{path}" for path in links]
team_urls

['https://fbref.com/en/squads/822bd0ba/Liverpool-Stats',
 'https://fbref.com/en/squads/b8fd03ef/Manchester-City-Stats',
 'https://fbref.com/en/squads/18bb7c10/Arsenal-Stats',
 'https://fbref.com/en/squads/8602292d/Aston-Villa-Stats',
 'https://fbref.com/en/squads/361ca564/Tottenham-Hotspur-Stats',
 'https://fbref.com/en/squads/19538871/Manchester-United-Stats',
 'https://fbref.com/en/squads/7c21e445/West-Ham-United-Stats',
 'https://fbref.com/en/squads/d07537b9/Brighton-and-Hove-Albion-Stats',
 'https://fbref.com/en/squads/b2b47a98/Newcastle-United-Stats',
 'https://fbref.com/en/squads/8cec06e1/Wolverhampton-Wanderers-Stats',
 'https://fbref.com/en/squads/cff3d9bb/Chelsea-Stats',
 'https://fbref.com/en/squads/4ba7cbea/Bournemouth-Stats',
 'https://fbref.com/en/squads/fd962109/Fulham-Stats',
 'https://fbref.com/en/squads/47c64c55/Crystal-Palace-Stats',
 'https://fbref.com/en/squads/cd051869/Brentford-Stats',
 'https://fbref.com/en/squads/e4a775cb/Nottingham-Forest-Stats',
 'https://fbre

# Extracting Match Stats using Pandas and Requests

In [17]:
# Work with Chelsea as the example team.
# The URL is looked up by name because team_urls follows the order of the league
# table, so a fixed position (previously index 10) points at a different club
# as soon as the standings change.
team_url = next(url for url in team_urls if url.endswith("/Chelsea-Stats"))

In [18]:
# Download the selected team's page; fail fast on a stalled connection or an HTTP error.
data = requests.get(team_url, timeout=30)
data.raise_for_status()

In [19]:
# Scan all tables on the page and keep the ones whose text matches "Scores & Fixtures"
# (that is the table's caption; the table's id could be used instead).
# The HTML is wrapped in StringIO because passing a literal HTML string to
# read_html is deprecated in recent pandas releases.
import pandas as pd
matches = pd.read_html(StringIO(data.text), match="Scores & Fixtures")

In [20]:
# `matches` is a list of DataFrames; the first entry is the fixtures table extracted by pandas.
matches[0]

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Referee,Match Report,Notes
0,2023-08-13,16:30,Premier League,Matchweek 1,Sun,Home,D,1,1,Liverpool,1.4,1.3,65.0,40096.0,Reece James,3-4-3,Anthony Taylor,Match Report,
1,2023-08-20,16:30,Premier League,Matchweek 2,Sun,Away,L,1,3,West Ham,2.5,1.8,75.0,62451.0,Ben Chilwell,3-4-3,John Brooks,Match Report,
2,2023-08-25,20:00,Premier League,Matchweek 3,Fri,Home,W,3,0,Luton Town,2.2,0.4,67.0,39893.0,Ben Chilwell,3-4-3,Robert Jones,Match Report,
3,2023-08-30,19:45,EFL Cup,Second round,Wed,Home,W,2,1,AFC Wimbledon,,,84.0,37794.0,Conor Gallagher,4-2-3-1,Tony Harrington,Match Report,
4,2023-09-02,15:00,Premier League,Matchweek 4,Sat,Home,L,0,1,Nott'ham Forest,2.3,0.8,75.0,39820.0,Ben Chilwell,3-4-3,Tim Robinson,Match Report,
5,2023-09-17,14:00,Premier League,Matchweek 5,Sun,Away,D,0,0,Bournemouth,1.8,1.0,64.0,10421.0,Conor Gallagher,4-2-3-1,David Coote,Match Report,
6,2023-09-24,14:00,Premier League,Matchweek 6,Sun,Home,L,0,1,Aston Villa,1.3,1.0,53.0,39700.0,Conor Gallagher,4-2-3-1,Jarred Gillett,Match Report,
7,2023-09-27,19:45,EFL Cup,Third round,Wed,Home,W,1,0,Brighton,,,39.0,37516.0,Ben Chilwell,4-2-3-1,Thomas Bramall,Match Report,
8,2023-10-02,20:00,Premier League,Matchweek 7,Mon,Away,W,2,0,Fulham,1.7,1.1,44.0,24445.0,Conor Gallagher,4-3-3,Tim Robinson,Match Report,
9,2023-10-07,15:00,Premier League,Matchweek 8,Sat,Away,W,4,1,Burnley,1.9,0.7,62.0,21654.0,Conor Gallagher,4-3-3,Stuart Attwell,Match Report,


# Getting Match Shooting Stats with Requests and Pandas

In [21]:
# Re-parse the team page so its links can be searched.
# The parser is named explicitly so the result does not depend on which parsers are installed.
soup = BeautifulSoup(data.text, "html.parser")

In [22]:
# Gather every non-empty href on the page that points at the all-competitions shooting table.
hrefs = (anchor.get("href") for anchor in soup.find_all('a'))
links = [href for href in hrefs if href and 'all_comps/shooting/' in href]

In [23]:
# Download the full HTML of the shooting stats page behind the first matching link.
# The timeout and status check make a stalled or rejected request fail here, visibly.
data = requests.get(f"https://fbref.com{links[0]}", timeout=30)
data.raise_for_status()

In [24]:
# Extract the first table matching "Shooting"; StringIO avoids the deprecated
# literal-HTML input path of read_html.
shooting = pd.read_html(StringIO(data.text), match="Shooting")[0]

# Cleaning and Merging Scraped Data with Pandas

In [25]:
shooting.head()
# The header has two levels (a group label above each stat name); the first cleaning step is to flatten it.

Unnamed: 0_level_0,For Chelsea,For Chelsea,For Chelsea,For Chelsea,For Chelsea,For Chelsea,For Chelsea,For Chelsea,For Chelsea,For Chelsea,...,Standard,Standard,Standard,Standard,Expected,Expected,Expected,Expected,Expected,Unnamed: 25_level_0
Unnamed: 0_level_1,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Dist,FK,PK,PKatt,xG,npxG,npxG/Sh,G-xG,np:G-xG,Match Report
0,2023-08-13,16:30,Premier League,Matchweek 1,Sun,Home,D,1,1,Liverpool,...,13.2,1.0,0,0,1.4,1.4,0.14,-0.4,-0.4,Match Report
1,2023-08-20,16:30,Premier League,Matchweek 2,Sun,Away,L,1,3,West Ham,...,17.1,0.0,0,1,2.5,1.7,0.11,-1.5,-0.7,Match Report
2,2023-08-25,20:00,Premier League,Matchweek 3,Fri,Home,W,3,0,Luton Town,...,14.2,1.0,0,0,2.2,2.2,0.12,0.8,0.8,Match Report
3,2023-08-30,19:45,EFL Cup,Second round,Wed,Home,W,2,1,AFC Wimbledon,...,,,1,1,,,,,,Match Report
4,2023-09-02,15:00,Premier League,Matchweek 4,Sat,Home,L,0,1,Nott'ham Forest,...,17.7,2.0,0,0,2.3,2.3,0.11,-2.3,-2.3,Match Report


In [26]:
# Remove the unnecessary first header level (the group labels above the stat names).
# The nlevels guard keeps the cell re-runnable: once the header is a single
# level, calling droplevel() again would raise.
if shooting.columns.nlevels > 1:
    shooting.columns = shooting.columns.droplevel()

In [27]:
# Check the result: the header should now be a single row of stat names.
shooting.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Dist,FK,PK,PKatt,xG,npxG,npxG/Sh,G-xG,np:G-xG,Match Report
0,2023-08-13,16:30,Premier League,Matchweek 1,Sun,Home,D,1,1,Liverpool,...,13.2,1.0,0,0,1.4,1.4,0.14,-0.4,-0.4,Match Report
1,2023-08-20,16:30,Premier League,Matchweek 2,Sun,Away,L,1,3,West Ham,...,17.1,0.0,0,1,2.5,1.7,0.11,-1.5,-0.7,Match Report
2,2023-08-25,20:00,Premier League,Matchweek 3,Fri,Home,W,3,0,Luton Town,...,14.2,1.0,0,0,2.2,2.2,0.12,0.8,0.8,Match Report
3,2023-08-30,19:45,EFL Cup,Second round,Wed,Home,W,2,1,AFC Wimbledon,...,,,1,1,,,,,,Match Report
4,2023-09-02,15:00,Premier League,Matchweek 4,Sat,Home,L,0,1,Nott'ham Forest,...,17.7,2.0,0,0,2.3,2.3,0.11,-2.3,-2.3,Match Report


In [28]:
# The fixtures table and the shooting table describe the same matches, so they are joined on "Date".
# Only the shooting-specific columns (plus the join key) are taken from `shooting`,
# so columns present in both frames are not duplicated.
shooting_cols = ["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]
fixtures = matches[0]
team_data = fixtures.merge(shooting[shooting_cols], on="Date")
team_data.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Formation,Referee,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt
0,2023-08-13,16:30,Premier League,Matchweek 1,Sun,Home,D,1,1,Liverpool,...,3-4-3,Anthony Taylor,Match Report,,10,4,13.2,1.0,0,0
1,2023-08-20,16:30,Premier League,Matchweek 2,Sun,Away,L,1,3,West Ham,...,3-4-3,John Brooks,Match Report,,16,3,17.1,0.0,0,1
2,2023-08-25,20:00,Premier League,Matchweek 3,Fri,Home,W,3,0,Luton Town,...,3-4-3,Robert Jones,Match Report,,19,8,14.2,1.0,0,0
3,2023-08-30,19:45,EFL Cup,Second round,Wed,Home,W,2,1,AFC Wimbledon,...,4-2-3-1,Tony Harrington,Match Report,,23,9,,,1,1
4,2023-09-02,15:00,Premier League,Matchweek 4,Sat,Home,L,0,1,Nott'ham Forest,...,3-4-3,Tim Robinson,Match Report,,21,2,17.7,2.0,0,0


In [29]:
# ^^^ Scraped and combined the match (fixtures) and shooting data for Chelsea FC ^^^

# Scraping Data For Multiple Seasons and Teams with a Loop

In [30]:
# Seasons to scrape, listed newest first.
years = sorted(range(2023, 2025), reverse=True)

In [31]:
# Display the season list to confirm its order.
years

[2024, 2023]

In [32]:
# Start with an empty list; one DataFrame per team and season is appended to it later.
all_matches = list()

In [None]:
# (Re)set the starting point to the current-season standings page before the multi-season scrape.
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

In [33]:
import time

# Columns taken from the shooting table; "Date" is the join key.
shooting_columns = ["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]

# Walk back through the seasons with a local cursor instead of overwriting
# `standings_url`. Re-running this cell therefore always starts from the same
# page, so each `year` label stays paired with the season it was scraped from.
season_url = standings_url

for year in years:
    # Standings page for this season. Every request below uses a timeout and a
    # status check so a stalled or rejected request stops the scrape with a
    # clear error, and is followed by a pause to avoid hammering the server.
    # NOTE(review): lengthen the pauses if the site starts rejecting requests.
    data = requests.get(season_url, timeout=30)
    data.raise_for_status()
    time.sleep(1)
    soup = BeautifulSoup(data.text, "html.parser")
    standings_table = soup.select('table.stats_table')[0]

    # Absolute URLs of every squad page linked from the standings table
    # (`l and` skips anchors without an href).
    links = [l.get("href") for l in standings_table.find_all('a')]
    links = [l for l in links if l and '/squads/' in l]
    team_urls = [f"https://fbref.com{l}" for l in links]

    # Remember where the previous season lives for the next iteration.
    previous_season = soup.select("a.prev")[0].get("href")
    season_url = f"https://fbref.com{previous_season}"

    for team_url in team_urls:
        # ".../Manchester-City-Stats" -> "Manchester City"
        team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")

        data = requests.get(team_url, timeout=30)
        data.raise_for_status()
        time.sleep(1)
        # StringIO: passing literal HTML to read_html is deprecated in recent pandas.
        matches = pd.read_html(StringIO(data.text), match="Scores & Fixtures")[0]

        # Locate and download the team's all-competitions shooting page.
        soup = BeautifulSoup(data.text, "html.parser")
        links = [l.get("href") for l in soup.find_all('a')]
        links = [l for l in links if l and 'all_comps/shooting/' in l]
        data = requests.get(f"https://fbref.com{links[0]}", timeout=30)
        data.raise_for_status()
        time.sleep(1)
        shooting = pd.read_html(StringIO(data.text), match="Shooting")[0]
        # Flatten the two-level header down to the stat names.
        shooting.columns = shooting.columns.droplevel()

        try:
            team_data = matches.merge(shooting[shooting_columns], on="Date")
        except ValueError as err:
            # Best effort: skip a team whose tables cannot be merged, but say so.
            print(f"Skipping {team_name} ({year}): {err}")
            continue

        # .copy() gives an independent frame, so the column assignments below do
        # not trigger pandas' SettingWithCopyWarning on a filtered slice.
        team_data = team_data[team_data["Comp"] == "Premier League"].copy()
        team_data["Season"] = year
        team_data["Team"] = team_name
        all_matches.append(team_data)
    

In [None]:
#DETAILED EXPLANATION OF THE CODE ABOVE
# NOTE: this is an annotated copy of the scraping loop. It appends to the same
# `all_matches` list, so running it in addition to the loop cell stores every
# team twice.

import time

# Columns taken from the shooting table; "Date" is the key used for the merge
shooting_columns = ["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]

# Use a local cursor for the season being scraped instead of overwriting
# `standings_url`, so re-running the cell always starts from the same page and
# each `year` label stays paired with the season it was scraped from
season_url = standings_url

# Assume 'years' is a list containing the seasons you are interested in (e.g., [2024, 2023])
for year in years:
    # Retrieve the standings data for the current season. The timeout and
    # raise_for_status() make a stalled or rejected request fail with a clear error
    data = requests.get(season_url, timeout=30)
    data.raise_for_status()
    # Pause for 1 second after every request to avoid overwhelming the server
    time.sleep(1)
    # Name the parser explicitly so the result does not depend on what is installed
    soup = BeautifulSoup(data.text, "html.parser")

    # Extract the standings table from the webpage
    standings_table = soup.select('table.stats_table')[0]

    # Extract links to individual team pages
    links = [l.get("href") for l in standings_table.find_all('a')]

    # Filter links to include only those related to team squads
    # (`l and` skips anchors that have no href at all)
    links = [l for l in links if l and '/squads/' in l]

    # Create full URLs for each team's page
    team_urls = [f"https://fbref.com{l}" for l in links]

    # Find the URL for the standings of the previous season
    previous_season = soup.select("a.prev")[0].get("href")
    season_url = f"https://fbref.com{previous_season}"

    # Loop through each team's page
    for team_url in team_urls:
        # Extract the team name from the URL
        team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")

        # Retrieve data from the team's page, including matches and shooting statistics
        data = requests.get(team_url, timeout=30)
        data.raise_for_status()
        time.sleep(1)
        # StringIO: passing literal HTML to read_html is deprecated in recent pandas
        matches = pd.read_html(StringIO(data.text), match="Scores & Fixtures")[0]

        # Extract links related to shooting statistics
        soup = BeautifulSoup(data.text, "html.parser")
        links = [l.get("href") for l in soup.find_all('a')]
        links = [l for l in links if l and 'all_comps/shooting/' in l]

        # Retrieve shooting statistics data
        data = requests.get(f"https://fbref.com{links[0]}", timeout=30)
        data.raise_for_status()
        time.sleep(1)
        shooting = pd.read_html(StringIO(data.text), match="Shooting")[0]

        # Clean up shooting column names (drop the top level of the two-level header)
        shooting.columns = shooting.columns.droplevel()

        try:
            # Merge match and shooting data on the "Date" column
            team_data = matches.merge(shooting[shooting_columns], on="Date")
        except ValueError as err:
            # Report the problem, then continue to the next team
            print(f"Skipping {team_name} ({year}): {err}")
            continue

        # Filter data for matches in the Premier League; .copy() makes the result
        # an independent frame so the assignments below do not raise
        # SettingWithCopyWarning
        team_data = team_data[team_data["Comp"] == "Premier League"].copy()

        # Add columns for season and team name
        team_data["Season"] = year
        team_data["Team"] = team_name

        # Append team data to the list
        all_matches.append(team_data)


In [34]:
# Stack the per-team frames collected in all_matches into a single DataFrame.
match_df = pd.concat(objs=all_matches)

In [35]:
# Display the combined frame (pandas abbreviates long frames in the output).
match_df

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt,Season,Team
0,2023-08-13,16:30,Premier League,Matchweek 1,Sun,Away,D,1.0,1.0,Chelsea,...,Match Report,,13.0,1.0,17.8,0.0,0,0,2024,Liverpool
1,2023-08-19,15:00,Premier League,Matchweek 2,Sat,Home,W,3.0,1.0,Bournemouth,...,Match Report,,25.0,9.0,16.8,1.0,0,1,2024,Liverpool
2,2023-08-27,16:30,Premier League,Matchweek 3,Sun,Away,W,2.0,1.0,Newcastle Utd,...,Match Report,,9.0,4.0,17.2,1.0,0,0,2024,Liverpool
3,2023-09-03,14:00,Premier League,Matchweek 4,Sun,Home,W,3.0,0.0,Aston Villa,...,Match Report,,17.0,4.0,14.7,0.0,0,0,2024,Liverpool
4,2023-09-16,12:30,Premier League,Matchweek 5,Sat,Away,W,3.0,1.0,Wolves,...,Match Report,,16.0,5.0,15.8,0.0,0,0,2024,Liverpool
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42,2023-04-30,14:00,Premier League,Matchweek 34,Sun,Away,L,1,3,Newcastle Utd,...,Match Report,,4.0,3.0,17.3,0.0,0,0,2023,Southampton
43,2023-05-08,20:00,Premier League,Matchweek 35,Mon,Away,L,3,4,Nott'ham Forest,...,Match Report,,18.0,4.0,14.0,0.0,1,1,2023,Southampton
44,2023-05-13,15:00,Premier League,Matchweek 36,Sat,Home,L,0,2,Fulham,...,Match Report,,5.0,1.0,24.2,0.0,0,0,2023,Southampton
45,2023-05-21,14:00,Premier League,Matchweek 37,Sun,Away,L,1,3,Brighton,...,Match Report,,5.0,1.0,13.8,1.0,0,0,2023,Southampton


In [36]:
# Lower-case every column header using the vectorised string accessor.
match_df.columns = match_df.columns.str.lower()

In [37]:
# Save the combined data as matches.csv (relative path, so it lands in the current working directory).
# No `index` argument is passed, so pandas' default applies and the row index is written as the first column.
match_df.to_csv("matches.csv")