In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
# Start the driver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

In [3]:
# Get Data
league_url = "https://fbref.com/en/comps/10/Championship-Stats"
driver.get(league_url)

In [4]:
# Get the HTML content
data = driver.page_source

In [5]:
from bs4 import BeautifulSoup

In [6]:
soup = BeautifulSoup(data)
league_table = soup.select_one('table.stats_table') # identify correct table element using its CSS class: stats_table
links = league_table.find_all('a') # find all team links using their anchor tags
links = [l.get("href") for l in links] # get the href attribute value of each anchor element
links = [l for l in links if '/squads/' in l]

In [7]:
team_urls = [f"https://fbref.com{l}" for l in links] # change relative links to absolute links

In [8]:
from io import StringIO

In [9]:
driver.get(team_urls[0]) # get html data from league leader
data = StringIO(driver.page_source) # wrap HTML content inside a "file-like" object, which pandas can read

In [10]:
import pandas as pd

In [11]:
matches = pd.read_html(data, match="Scores & Fixtures")[0]

In [12]:
soup = BeautifulSoup(data)
links = soup.find_all('a', href=True)
links = [l.get("href") for l in links] # extract the 'href' attribute from each anchor tag
links = [l for l in links if l and 'all_comps/shooting/' in l] # Filter links to only those linked to the shooting stats page (and are not None)

In [13]:
driver.get(f"https://fbref.com{links[0]}")
data = StringIO(driver.page_source)

In [14]:
shooting = pd.read_html(data, match="Shooting", flavor='html5lib')[0]

In [15]:
shooting.head(5)

Unnamed: 0_level_0,For Stoke City,For Stoke City,For Stoke City,For Stoke City,For Stoke City,For Stoke City,For Stoke City,For Stoke City,For Stoke City,For Stoke City,...,Standard,Standard,Standard,Standard,Expected,Expected,Expected,Expected,Expected,Unnamed: 25_level_0
Unnamed: 0_level_1,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Dist,FK,PK,PKatt,xG,npxG,npxG/Sh,G-xG,np:G-xG,Match Report
0,2025-08-09,15:00 (16:00),Championship,Matchweek 1,Sat,Home,W,3,1,Derby County,...,18.5,1.0,0,0,1.1,1.1,0.09,1.9,1.9,Match Report
1,2025-08-12,19:45 (20:45),EFL Cup,First round,Tue,Home,D,0 (4),0 (3),Walsall,...,,,0,0,,,,,,Match Report
2,2025-08-16,15:00 (16:00),Championship,Matchweek 2,Sat,Away,W,3,0,Sheffield Weds,...,16.7,0.0,0,0,2.0,2.0,0.17,1.0,1.0,Match Report
3,,,,,,,--,,,,...,17.6,1.0,0,0,,,0.13,6.0,6.0,


In [16]:
shooting.columns = shooting.columns.droplevel() # remove top of double index heading

In [17]:
# Merge matches dataframe with some columns of the shooting dataframe using date
team_data = matches.merge(shooting[["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt", "xG", "npxG"]], on="Date")
# Sh = shots, SoT = shots on target, Dist = avg. shot distance, FK = freekicks, PK = penalty kicks, PKatt = penalty kicks attempted, 
# xG = expected goals, npxG = expected goals not including penalties

In [18]:
team_data.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt,xG_y,npxG
0,2025-08-09,15:00 (16:00),Championship,Matchweek 1,Sat,Home,W,3,1,Derby County,...,Match Report,,12,7,18.5,1.0,0,0,1.1,1.1
1,2025-08-12,19:45 (20:45),EFL Cup,First round,Tue,Home,D,0 (4),0 (3),Walsall,...,Match Report,Stoke City won on penalty kicks following norm...,8,1,,,0,0,,
2,2025-08-16,15:00 (16:00),Championship,Matchweek 2,Sat,Away,W,3,0,Sheffield Weds,...,Match Report,,12,5,16.7,0.0,0,0,2.0,2.0


In [19]:
# Begin process of scraping data from multiple teams for multiple years using a for loop

years = list(range(2025, 2022, -1))
all_matches = []

In [None]:
import time

league_url = "https://fbref.com/en/comps/10/Championship-Stats"

for year in years:
    driver.get(league_url)
    data = driver.page_source
    soup = BeautifulSoup(data)
    league_table = soup.select_one('table.stats_table')

    links = [l.get("href") for l in league_table.find_all('a')]
    links = [l for l in links if '/squads/' in l]
    team_urls = [f"https://fbref.com{l}" for l in links]

    previous_season = soup.select_one("a.prev").get("href")
    league_url = f"https://fbref.com{previous_season}"
    
    for team_url in team_urls:
        team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")
        driver.get(team_url)
        data = StringIO(driver.page_source)
        matches = pd.read_html(data, match="Scores & Fixtures")[0]
        soup = BeautifulSoup(data)
        links = [l.get("href") for l in soup.find_all('a')]
        links = [l for l in links if l and 'all_comps/shooting/' in l]
        
        driver.get(f"https://fbref.com{links[0]}")
        data = StringIO(driver.page_source)
        shooting = pd.read_html(data, match="Shooting", flavor='html5lib')[0]
        shooting.columns = shooting.columns.droplevel()

        desired_cols = ["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt", "xG", "npxG"] # FK column has been dropped this season but available for previous
        available_cols = [col for col in desired_cols if col in shooting.columns]
        
        try:
            team_data = matches.merge(shooting[available_cols], on="Date")
        except ValueError as e:
            print(f"Skipping {team_name} ({year}) due to merge error: {e}")
            continue
        team_data = team_data[team_data["Comp"] == "Championship"] # filter out cup games

        team_data["Team"] = team_name
        team_data["Season"] = year
        all_matches.append(team_data) # add team name and season to the list
        time.sleep(5) # sleep for 5 seconds to avoid overwhelming the server

ValueError: No tables found

In [None]:
# Close the Selenium WebDriver when done
driver.quit()

In [None]:
len(all_matches)

In [None]:
match_df = pd.concat(all_matches)

In [None]:
match_df.columns = [c.lower() for c in match_df.columns]

In [None]:
match_df

In [None]:
from IPython.display import FileLink

match_df.to_csv("matches.csv")
FileLink("matches.csv")