In [3]:
import requests
import pandas as pd
import re
from bs4 import BeautifulSoup
import time
import random

### Scrapowanie pojedynczego meczu z linku do raportu meczowego na FBRef

In [4]:
def scrap_match(url, season):
    data = {}
    headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate, br',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1'
    }
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')

        data["season"] = season
        venuetime_span = soup.find('span', class_='venuetime')
        if venuetime_span:
            data["venue_date"] = venuetime_span.get('data-venue-date')
            data["venue_time"] = venuetime_span.get('data-venue-time')
        else:
            print("Date/Time information not found")

        matchweek_div = soup.select_one('div:-soup-contains("Matchweek")')
        if matchweek_div:
            data["round"] = int(re.search(r'Matchweek (\d+)', matchweek_div.text).group(1))
        else:
            print("Date/Time information not found")
        
        
        
        attendance_div = soup.select_one('div strong small:-soup-contains("Attendance")')
        if attendance_div:
            data["attendance_value"] = int((attendance_div.find_parent('strong').find_next_sibling().text).replace(",", ""))
        else:
            print("Attendance information not found")

        referee_span = soup.select_one('span:-soup-contains("(Referee)")')
        if referee_span:
            data["referee"] = referee_span.text.split()[0] + " " + referee_span.text.split()[1]
        else:
            print("Referee not found")

        manager_spans = soup.select('.datapoint:-soup-contains("Manager:")')
        if len(manager_spans) > 1:
            if manager_spans[0]:
                data["home_manager"] = manager_spans[0].text.replace("Manager: ", "")
            if manager_spans[1]:
                data["away_manager"] = manager_spans[1].text.replace("Manager: ", "")
            else:
                print("Managers not found")

        captain_spans = soup.select('.datapoint:-soup-contains("Captain:")')
        if len(captain_spans) > 1:
            if captain_spans[0]:
                data["home_captain"] = captain_spans[0].text.replace("Captain: ", "")
            if captain_spans[1]:
                data["away_captain"] = captain_spans[1].text.replace("Captain: ", "")
            else:
                print("Captains not found")

        if soup.select_one('#a.lineup th'):
            data["formation_home"] = re.findall(r'\((.*?)\)', soup.select_one('#a.lineup th').text)[0]
        if soup.select_one('#b.lineup th'):
            data["formation_away"] = re.findall(r'\((.*?)\)', soup.select_one('#b.lineup th').text)[0]

        data["home_possession"] = int(soup.select('#team_stats tr')[2].text.split()[0][:-1])/100
        data["away_possession"] = int(soup.select('#team_stats tr')[2].text.split()[1][:-1])/100

        home_lineup = []
        home_lineup_el = soup.select("#a.lineup tr")
        for el in home_lineup_el:
            txt = el.text
            if txt and txt[0].isdigit():
                home_lineup.append(txt.lstrip('0123456789').strip())

        away_lineup = []
        away_lineup_el = soup.select("#a.lineup tr")
        for el in away_lineup_el:
            txt = el.text
            if txt and txt[0].isdigit():
                away_lineup.append(txt.lstrip('0123456789').strip())

        data["home_lineup"] = home_lineup
        data["away_lineup"] = away_lineup

        
        teams = re.findall(r'(.+?)\s+vs\.\s+(.+?)\s+Match Report', soup.select_one('#content h1').text)
        data["home_team"], data["away_team"] = teams[0]

        # table_names = [div.text.strip() for div in soup.select('.filter.switcher')[0].select('div')]

        tables = soup.select('.table_wrapper.tabbed')
        home_tables = tables[0].select('.stats_table')
        away_tables = tables[1].select('.stats_table')
        all_players_data = {}

        # home team scraping
        for i in range(len(home_tables)):
            table = home_tables[i]

            ths = table.select_one('tr:nth-of-type(2)').select('th')
            df_columns = [th.get('data-stat') for th in ths]

            rows = table.select('tbody tr')
            for row in rows:
                columns = row.select('th, td')
                player_name = columns[0].text.strip()
                if player_name not in all_players_data:
                    all_players_data[player_name] = {}

                all_players_data[player_name]["team"] = data["home_team"]
                all_players_data[player_name]["where"] = "home"
                all_players_data[player_name]["date"] = data["venue_date"]
                all_players_data[player_name]["round"] = data["round"]
                all_players_data[player_name]["season"] = season

                for j in range(len(columns)):
                    column_name = df_columns[j]
                    player_stat = columns[j].text.replace('\xa0\xa0\xa0', '').strip()
                    all_players_data[player_name][column_name] = player_stat
                    
                if len(all_players_data[player_name]["nationality"].split())>1:
                    all_players_data[player_name]["nationality"] = all_players_data[player_name]["nationality"].split()[1]
                else:
                    all_players_data[player_name]["nationality"] = "none"
            match_stat_tds = table.select('tfoot td')
            for td in match_stat_tds:
                data["home_" + td.get('data-stat')] = td.text.strip()

        # away team scraping
        for i in range(len(away_tables)):
            table = away_tables[i]

            ths = table.select_one('tr:nth-of-type(2)').select('th')
            df_columns = [th.get('data-stat') for th in ths]

            rows = table.select('tbody tr')
            for row in rows:
                columns = row.select('th, td')
                player_name = columns[0].text.strip()
                if player_name not in all_players_data:
                    all_players_data[player_name] = {}

                all_players_data[player_name]["team"] = data["away_team"]
                all_players_data[player_name]["where"] = "away"
                all_players_data[player_name]["date"] = data["venue_date"]
                all_players_data[player_name]["round"] = data["round"]
                all_players_data[player_name]["season"] = season

                for j in range(len(columns)):
                    column_name = df_columns[j]
                    player_stat = columns[j].text.replace('\xa0\xa0\xa0', '').strip()
                    all_players_data[player_name][column_name] = player_stat

                if len(all_players_data[player_name]["nationality"].split())>1:
                    all_players_data[player_name]["nationality"] = all_players_data[player_name]["nationality"].split()[1]
                else:
                    all_players_data[player_name]["nationality"] = "none"

            match_stat_tds = table.select('tfoot td')
            for td in match_stat_tds:
                data["away_" + td.get('data-stat')] = td.text.strip()

        return data, all_players_data
    else:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return None, None

### Testowanie scrapowania pojedynczego meczu

In [5]:
scrap_match("https://fbref.com/en/matches/864b029d/Lazio-Atalanta-December-28-2024-Serie-A", "1")

Referee not found


({'season': '1',
  'venue_date': '2024-12-28',
  'venue_time': '20:45',
  'round': 18,
  'attendance_value': 48000,
  'home_possession': 0.4,
  'away_possession': 0.6,
  'home_lineup': [],
  'away_lineup': [],
  'home_team': 'Lazio',
  'away_team': 'Atalanta',
  'home_shirtnumber': '',
  'home_nationality': '',
  'home_position': '',
  'home_age': '',
  'home_minutes': '990',
  'home_goals': '1',
  'home_assists': '1',
  'home_pens_made': '0',
  'home_pens_att': '0',
  'home_shots': '11',
  'home_shots_on_target': '5',
  'home_cards_yellow': '2',
  'home_cards_red': '0',
  'home_touches': '513',
  'home_tackles': '16',
  'home_interceptions': '13',
  'home_blocks': '12',
  'home_xg': '0.6',
  'home_npxg': '0.6',
  'home_xg_assist': '0.5',
  'home_sca': '20',
  'home_gca': '2',
  'home_passes_completed': '316',
  'home_passes': '403',
  'home_passes_pct': '78.4',
  'home_progressive_passes': '30',
  'home_carries': '221',
  'home_progressive_carries': '9',
  'home_take_ons': '12',
  'ho

### Scrapowanie całego sezonu premier league

In [6]:
def scrape_season(season, new_round = "16"):
    url = 'https://fbref.com/en/comps/20/' + season + '/schedule/' + season + '-Bundesliga-Scores-and-Fixtures'
    headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate, br',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1'
    }
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        link_elements = soup.select('td[data-stat="match_report"] a')
        links = []
        for el in link_elements:
            if el.text == "Match Report":
                links.append("https://fbref.com" + el.get('href') )
            # else: 
            #     row = el.parent.parent
            #     if (row.select_one('th[data-stat="gameweek"]').text == new_round):
            #         row_date = row.select_one('td[data-stat="date"] a').text
            #         row_time = row.select_one('.venuetime').text
            #         row_home_team = row.select_one('td[data-stat="home_team"]').text
            #         row_away_team = row.select_one('td[data-stat="away_team"]').text

        # links = ["https://fbref.com" + el.get('href') for el in link_elements]

        all_teams_data = []
        all_players_data = []
        unique_links = set(links) - set(links[-2:])
        print(len(unique_links))
        for link in unique_links:
            print(link)
            time.sleep(random.uniform(3, 5))
            teams_data, players_data = scrap_match(link, season)
            if not teams_data:
                return
            all_teams_data.append(teams_data)
            all_players_data.append(players_data)

        flattened_data = []
        for player_dict in all_players_data:
            for key, player_info in player_dict.items():
                flattened_data.append(player_info)

        df_players = pd.DataFrame(flattened_data)
        df_teams = pd.DataFrame(all_teams_data)

        return df_teams, df_players
    else:
        if response.status_code == 429:
            retry_after = response.headers.get('Retry-After')
    
        if retry_after:
            print(f"Rate limited. Retry after: {retry_after} seconds.")
        else:
            print("Rate limited but no 'Retry-After' header found")
        print(f"Failed to retrieve the page. Status code: {response.status_code}")

In [15]:
# df_teams_17_18, df_players_17_18 = scrape_season('2017-2018')
df_teams_18_19, df_players_18_19 = scrape_season('2018-2019')
# df_teams_19_20, df_players_19_20 = scrape_season('2019-2020')
# df_teams_20_21, df_players_20_21 = scrape_season('2020-2021')
# df_teams_21_22, df_players_21_22 = scrape_season('2021-2022')
# df_teams_22_23, df_players_22_23 = scrape_season('2022-2023')
# df_teams_23_24, df_players_23_24 = scrape_season('2023-2024')
# df_teams_24_25, df_players_24_25 = scrape_season('2024-2025')

306
https://fbref.com/en/matches/e10ce14b/Wolfsburg-RB-Leipzig-November-24-2018-Bundesliga
https://fbref.com/en/matches/f2f574d1/Freiburg-Hoffenheim-January-26-2019-Bundesliga
https://fbref.com/en/matches/00ee37ac/Eintracht-Frankfurt-Stuttgart-March-31-2019-Bundesliga
https://fbref.com/en/matches/60f958cf/Hoffenheim-Monchengladbach-December-15-2018-Bundesliga
https://fbref.com/en/matches/4336e7ef/Hoffenheim-Freiburg-September-1-2018-Bundesliga
https://fbref.com/en/matches/245a034e/Werder-Bremen-Nurnberg-September-16-2018-Bundesliga
https://fbref.com/en/matches/375b60fd/Wolfsburg-Dortmund-November-3-2018-Bundesliga
https://fbref.com/en/matches/947b6749/Mainz-05-Hoffenheim-May-18-2019-Bundesliga
https://fbref.com/en/matches/8023d4f5/Hannover-96-Stuttgart-October-6-2018-Bundesliga
https://fbref.com/en/matches/c52ed709/Schalke-04-Wolfsburg-January-20-2019-Bundesliga
https://fbref.com/en/matches/e9911489/Augsburg-Dusseldorf-January-19-2019-Bundesliga
https://fbref.com/en/matches/36cd6979/Ei

In [24]:
# df_players_17_18.to_csv("players_bl_17-18_fbref.csv", index=False)
# df_teams_17_18.to_csv("teams_bl_17-18_fbref.csv", index=False)
# df_players_18_19.to_csv("players_bl_18-19_fbref.csv", index=False)
# df_teams_18_19.to_csv("teams_bl_18-19_fbref.csv", index=False)
# df_players_19_20.to_csv("players_bl_19-20_fbref.csv", index=False)
# df_teams_19_20.to_csv("teams_bl_19-20_fbref.csv", index=False)
# df_players_20_21.to_csv("players_bl_20-21_fbref.csv", index=False)
# df_teams_20_21.to_csv("teams_bl_20-21_fbref.csv", index=False)
# df_players_21_22.to_csv("players_bl_21-22_fbref.csv", index=False)
# df_teams_21_22.to_csv("teams_bl_21-22_fbref.csv", index=False)
# df_players_22_23.to_csv("players_bl_22-23_fbref.csv", index=False)
# df_teams_22_23.to_csv("teams_bl_22-23_fbref.csv", index=False)
# df_players_23_24.to_csv("players_bl_23-24_fbref.csv", index=False)
# df_teams_23_24.to_csv("teams_bl_23-24_fbref.csv", index=False)
# df_players_24_25.to_csv("players_bl_24-25_fbref.csv", index=False)
# df_teams_24_25.to_csv("teams_bl_24-25_fbref.csv", index=False)