In [5]:
import requests
import json
import pandas as pd
import numpy as np
import bs4
import re
import time
from bs4 import BeautifulSoup
from tqdm import tqdm

# Premier League Football Players' Data Extraction

This notebook extracts player information and statistics from the OneFootball website, which can be found here: https://onefootball.com/en/home

We are focusing on the Premier League and scraping the website using `BeautifulSoup` in order to obtain data on the matches played during the current 2023-24 season as well as the performance statistics and general information about the players of all teams involved.

The code blocks below extract all important information and statistics about the football players of each team in the Premier League. First, we collect the href links of all teams in the Premier League; then we collect the hrefs of all players in each squad. Finally, we collect each player's basic information and statistics in two separate pandas DataFrames, before merging the two into one final pandas DataFrame that will be our dataset.

In [6]:
def get_team_links(base_url, timeout=10):
    """Collect the team-page hrefs listed on a standings table page.

    Parameters
    ----------
    base_url : str
        URL of the standings page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).
        Without a timeout ``requests.get`` can block indefinitely.

    Returns
    -------
    list of str or None
        The ``href`` of every standings row that contains a team-name tag,
        in page order. ``None`` when the page could not be retrieved
        (network error or a status code other than 200).
    """
    try:
        response = requests.get(base_url, timeout=timeout)
    except requests.RequestException:
        # Report network failures the same way as a bad status code, so the
        # caller's "None means failure" check covers both cases.
        return None
    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.text, "html.parser")
    row_tags = soup.find_all('a', class_='Standing_standings__rowGrid__45OOd')

    # Keep only rows that actually name a team and carry a link; header or
    # decorative rows using the same grid class are skipped.
    return [
        row['href']
        for row in row_tags
        if row.has_attr('href')
        and row.find('p', class_='Standing_standings__teamName__psv61') is not None
    ]

# collecting the href links of all teams in the premier league
base_url = "https://onefootball.com/en/competition/premier-league-9/table"
N_TEAMS = 20       # number of clubs expected in the standings table
MAX_ATTEMPTS = 10  # stop retrying instead of looping forever when the site is unreachable

team_urls = []
for _attempt in range(MAX_ATTEMPTS):
    time.sleep(0.1)
    links = get_team_links(base_url)
    if links:
        # dict.fromkeys drops duplicates while keeping first-seen order, so a
        # partial page followed by a full one cannot list a team twice.
        team_urls = list(dict.fromkeys(team_urls + links))
    if len(team_urls) >= N_TEAMS:
        break
else:
    raise RuntimeError(
        f"Found only {len(team_urls)} of {N_TEAMS} team links after {MAX_ATTEMPTS} attempts"
    )

# add '/squad' to href link in order to collect all players
team_urls_squad = [team_url + '/squad' for team_url in team_urls]
print(team_urls_squad[:5])

['/en/team/liverpool-18/squad', '/en/team/aston-villa-199/squad', '/en/team/manchester-city-209/squad', '/en/team/arsenal-2/squad', '/en/team/tottenham-hotspur-202/squad']


In [7]:
def get_player_links(base_url, id_team, timeout=10):
    """Collect the player-page paths listed on a team's squad page.

    The squad page embeds its data as JSON in the ``__NEXT_DATA__`` script
    tag. Player links are read from four fixed entries of
    ``props.pageProps.containers``: index 3 (goalkeepers), 4 (defenders),
    6 (midfielders) and 7 (forwards).

    Parameters
    ----------
    base_url : str
        Site root that ``id_team`` is appended to.
    id_team : str
        Path of the team's squad page.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).

    Returns
    -------
    list of str or None
        The ``urlPath`` of every listed player, ordered goalkeepers,
        defenders, midfielders, forwards. ``None`` when the page could not be
        retrieved or has no ``__NEXT_DATA__`` script tag.

    Raises
    ------
    KeyError, IndexError
        If the embedded JSON does not have the expected layout. This is left
        unhandled on purpose so a site redesign is noticed rather than
        silently producing an incomplete squad.
    """
    try:
        response = requests.get(base_url + id_team, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to retrieve the team page. Error: {exc}")
        return None
    if response.status_code != 200:
        print(f"Failed to retrieve the team page. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, "html.parser")
    script_tag = soup.find('script', {'id': '__NEXT_DATA__'})
    if script_tag is None:
        return None

    containers = json.loads(script_tag.string)['props']['pageProps']['containers']

    # container indices: goalkeepers, defenders, midfielders, forwards
    position_containers = (3, 4, 6, 7)

    player_links = []
    for index in position_containers:
        links = containers[index]['type']['fullWidth']['component']['contentType']['entityNavigation']['links']
        player_links.extend(player['urlPath'] for player in links)
    return player_links

# collecting all players hrefs to collect their info
base_url = "https://onefootball.com"

player_urls = []
for team in tqdm(team_urls_squad):
  time.sleep(0.1)  # short pause between requests
  squad_links = get_player_links(base_url, str(team))
  if squad_links is None:
    continue  # team page could not be read; move on to the next team
  player_urls.extend(squad_links)

print(player_urls[:5])

# add '/stats' to href link in order to collect all stats of players
player_urls_stats = [player_url + '/stats' for player_url in player_urls]
print(player_urls_stats[:5])

100%|██████████| 20/20 [00:06<00:00,  3.10it/s]

['/en/player/adrian-30947', '/en/player/alisson-44430', '/en/player/caoimhin-kelleher-262070', '/en/player/fabian-mrozek-454565', '/en/player/marcelo-pitaluga-247112']
['/en/player/adrian-30947/stats', '/en/player/alisson-44430/stats', '/en/player/caoimhin-kelleher-262070/stats', '/en/player/fabian-mrozek-454565/stats', '/en/player/marcelo-pitaluga-247112/stats']





In [8]:
def get_player_info(base_url, id_player, timeout=10):
    """Scrape a player's profile page into a one-row DataFrame.

    Parameters
    ----------
    base_url : str
        Site root; a trailing slash is optional.
    id_player : str
        Path of the player's profile page; a leading slash is optional.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).

    Returns
    -------
    pandas.DataFrame or None
        One row with ``Name``, ``Team`` and one column per entry found under
        a "KEY STATS" header (values kept as strings, with the ``' cm'`` and
        ``' kg'`` suffixes removed). ``Team`` is ``None`` when the page's
        ld+json block is missing or has no affiliation name. Returns ``None``
        when the page could not be retrieved or has no title to read the
        player name from.
    """
    # Join with exactly one slash so "https://host/" + "/en/player/x" does not
    # become "https://host//en/player/x".
    url = base_url.rstrip('/') + '/' + id_player.lstrip('/')
    try:
        rep = requests.get(url, timeout=timeout)
    except requests.RequestException:
        return None
    if rep.status_code != 200:
        return None

    soup = BeautifulSoup(rep.text, "html.parser")

    # player name
    if soup.title is None or soup.title.string is None:
        return None  # the row cannot be identified without a name
    stats = {"Name": soup.title.string.split(' Profile | OneFootball')[0], "Team": None}

    # player team
    script_tag = soup.find('script', {'type': 'application/ld+json'})
    if script_tag is not None and script_tag.string:
        try:
            stats["Team"] = json.loads(script_tag.string)['affiliation']['name']
        except (ValueError, KeyError, TypeError):
            # Unparseable metadata or no affiliation: keep Team as None
            # rather than losing the whole player.
            pass

    # player info
    desired_headers = ["KEY STATS"]
    for div in soup.find_all('div', class_='transfer-details'):
        # BeautifulSoup passes None to the matcher when the tag has no single
        # string, so guard before the substring test.
        header = div.find(
            'p', class_='transfer-details__header',
            string=lambda text: text is not None and any(wanted in text for wanted in desired_headers))
        if header is None:
            continue
        for entry in div.find_all('li', class_='transfer-details-list__entry'):
            value_tag = entry.find('p', class_='title-3-bold')
            name_tag = entry.find('p', class_='title-7-regular')
            if value_tag is None or name_tag is None:
                continue  # incomplete entry: nothing usable to record
            value = value_tag.text.strip().replace(' cm', '').replace(' kg', '')
            stats[name_tag.text.strip()] = value

    return pd.DataFrame(stats, index=[0])

# collecting basic information
base_url = "https://onefootball.com/"

list_results_info = []
for player in tqdm(player_urls):
  time.sleep(0.1)  # short pause between requests
  player_info = get_player_info(base_url, str(player))
  if player_info is None:
    continue  # page could not be read; skip this player
  list_results_info.append(player_info)

# stack the one-row frames, then add units to the column titles
table_info = (
    pd.concat(list_results_info, ignore_index=True)
    .rename(columns={'Height': 'Height (cm)', 'Weight': 'Weight (kg)'})
)
table_info

100%|██████████| 664/664 [03:47<00:00,  2.92it/s]


Unnamed: 0,Name,Team,Age,Position,Country,Height (cm),Weight (kg),Jersey number
0,Adrian,Liverpool,37,Goalkeeper,Spain,190,80,13
1,Alisson,Liverpool,31,Goalkeeper,Brazil,193,91,1
2,Caoimhín Kelleher,Liverpool,25,Goalkeeper,Ireland,188,74,62
3,Fabian Mrozek,Liverpool,20,Goalkeeper,Poland,192,0,93
4,Marcelo Pitaluga,Liverpool,21,Goalkeeper,Brazil,192,77,45
...,...,...,...,...,...,...,...,...
659,Daniel Jebbison,Sheffield United,20,Forward,Canada,190,69,36
660,Oliver McBurnie,Sheffield United,27,Forward,Scotland,188,79,9
661,Rhian Brewster,Sheffield United,23,Forward,England,180,75,7
662,Ryan Oné,Sheffield United,17,Forward,Scotland,0,0,26


In [9]:
def get_player_stats(base_url, id_player, timeout=10):
    """Scrape a player's season-stats page into a one-row DataFrame.

    Parameters
    ----------
    base_url : str
        Site root; a trailing slash is optional.
    id_player : str
        Path of the player's stats page; a leading slash is optional.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).

    Returns
    -------
    pandas.DataFrame or None
        One row with ``Name``, ``Team`` and one column per statistic, named
        ``"<HEADER>_<stat name>"`` for every section whose header contains
        KEY STATS, DEFENCE, DISTRIBUTION, OFFENSE or DISCIPLINE (values kept
        as strings). ``Team`` is ``None`` when the page's ld+json block is
        missing or has no affiliation name. Returns ``None`` when the page
        could not be retrieved or has no title to read the player name from.
    """
    # Join with exactly one slash so "https://host/" + "/en/player/x/stats"
    # does not become "https://host//en/player/x/stats".
    url = base_url.rstrip('/') + '/' + id_player.lstrip('/')
    try:
        rep = requests.get(url, timeout=timeout)
    except requests.RequestException:
        return None
    if rep.status_code != 200:
        return None

    soup = BeautifulSoup(rep.text, "html.parser")

    # player name
    if soup.title is None or soup.title.string is None:
        return None  # the row cannot be identified without a name
    stats = {"Name": soup.title.string.split(' Season Stats |')[0], "Team": None}

    # player team
    script_tag = soup.find('script', {'type': 'application/ld+json'})
    if script_tag is not None and script_tag.string:
        try:
            stats["Team"] = json.loads(script_tag.string)['affiliation']['name']
        except (ValueError, KeyError, TypeError):
            # Unparseable metadata or no affiliation: keep Team as None
            # rather than losing the whole player.
            pass

    # player stats
    desired_headers = ["KEY STATS", "DEFENCE", "DISTRIBUTION", "OFFENSE", "DISCIPLINE"]
    for div in soup.find_all('div', class_='transfer-details'):
        # BeautifulSoup passes None to the matcher when the tag has no single
        # string, so guard before the substring test.
        header = div.find(
            'p', class_='transfer-details__header',
            string=lambda text: text is not None and any(wanted in text for wanted in desired_headers))
        if header is None:
            continue
        section = header.text.strip()
        for entry in div.find_all('li', class_='transfer-details-list__entry'):
            value_tag = entry.find('p', class_='title-3-bold')
            name_tag = entry.find('p', class_='title-7-regular')
            if value_tag is None or name_tag is None:
                continue  # incomplete entry: nothing usable to record
            # Prefix with the section header so same-named stats from
            # different sections get distinct columns.
            stats[f"{section}_{name_tag.text.strip()}"] = value_tag.text.strip()

    return pd.DataFrame(stats, index=[0])

# collecting all player statistics
base_url = "https://onefootball.com/"

list_results_stats = []
for player in tqdm(player_urls_stats):
  time.sleep(0.1)  # short pause between requests
  player_stats = get_player_stats(base_url, str(player))
  if player_stats is None:
    continue  # page could not be read; skip this player
  list_results_stats.append(player_stats)

# stack the one-row frames into a single table
table_stats = pd.concat(list_results_stats, ignore_index=True)
table_stats

100%|██████████| 664/664 [03:54<00:00,  2.84it/s]


Unnamed: 0,Name,Team,KEY STATS_Goals,KEY STATS_Assists,KEY STATS_Shot Accuracy,KEY STATS_Pass Accuracy,DEFENCE_Tackles,DEFENCE_Tackles Won,DEFENCE_Duels,DEFENCE_Duels won,...,OFFENSE_Goals - inside the box,OFFENSE_Goals - outside the box,OFFENSE_Other goals,OFFENSE_Successful dribbles,OFFENSE_Offsides,DISCIPLINE_Yellow cards,DISCIPLINE_Yellow - red cards,DISCIPLINE_Red cards,DISCIPLINE_Fouls conceded,DISCIPLINE_Fouls won
0,Adrian,Liverpool,0,0,0.00,0.0,0,0.0,0,0.00 %,...,0,0,0,0,0,0,0,0,0,0
1,Alisson,Liverpool,0,0,0.00,84.1,0,0.0,11,72.73 %,...,0,0,0,0,0,1,0,0,1,4
2,Caoimhín Kelleher,Liverpool,0,0,0.00,91.0,0,0.0,1,100.00 %,...,0,0,0,0,0,0,0,0,0,0
3,Fabian Mrozek,Liverpool,,,,,,,,,...,,,,,,,,,,
4,Marcelo Pitaluga,Liverpool,0,0,0.00,0.0,0,0.0,0,0.00 %,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
659,Daniel Jebbison,Sheffield United,,,,,,,,,...,,,,,,,,,,
660,Oliver McBurnie,Sheffield United,3,1,64.29,58.0,9,22.2,183,48.63 %,...,3,0,0,4,4,1,2,2,12,17
661,Rhian Brewster,Sheffield United,0,0,100.00,67.4,4,25.0,19,36.84 %,...,0,0,0,0,0,0,0,0,2,1
662,Ryan Oné,Sheffield United,0,0,0.00,100.0,0,0.0,0,0.00 %,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# merge both tables together
# inner join: only players present in both tables (same Name and Team) are kept

merged_stats = pd.merge(
    table_info,
    table_stats,
    how='inner',
    on=['Name', 'Team'],
)
merged_stats

Unnamed: 0,Name,Team,Age,Position,Country,Height (cm),Weight (kg),Jersey number,KEY STATS_Goals,KEY STATS_Assists,...,OFFENSE_Goals - inside the box,OFFENSE_Goals - outside the box,OFFENSE_Other goals,OFFENSE_Successful dribbles,OFFENSE_Offsides,DISCIPLINE_Yellow cards,DISCIPLINE_Yellow - red cards,DISCIPLINE_Red cards,DISCIPLINE_Fouls conceded,DISCIPLINE_Fouls won
0,Adrian,Liverpool,37,Goalkeeper,Spain,190,80,13,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Alisson,Liverpool,31,Goalkeeper,Brazil,193,91,1,0,0,...,0,0,0,0,0,1,0,0,1,4
2,Caoimhín Kelleher,Liverpool,25,Goalkeeper,Ireland,188,74,62,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Fabian Mrozek,Liverpool,20,Goalkeeper,Poland,192,0,93,,,...,,,,,,,,,,
4,Marcelo Pitaluga,Liverpool,21,Goalkeeper,Brazil,192,77,45,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
659,Daniel Jebbison,Sheffield United,20,Forward,Canada,190,69,36,,,...,,,,,,,,,,
660,Oliver McBurnie,Sheffield United,27,Forward,Scotland,188,79,9,3,1,...,3,0,0,4,4,1,2,2,12,17
661,Rhian Brewster,Sheffield United,23,Forward,England,180,75,7,0,0,...,0,0,0,0,0,0,0,0,2,1
662,Ryan Oné,Sheffield United,17,Forward,Scotland,0,0,26,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# clean dataset column names to make more presentable

def clean_column_names(df):
    """Normalise the column names of ``df`` to lowercase snake_case.

    Steps, in order: lowercase; spaces to underscores; drop every character
    that is not an ASCII letter, digit or underscore; collapse runs of
    underscores into one.

    The ``regex`` flag is passed explicitly on every ``str.replace`` call.
    The default changed from regex to literal matching in pandas 2.0, so
    leaving it implicit makes the two pattern-based steps silently do nothing
    on newer pandas versions.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are renamed. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.
    """
    df.columns = (
        df.columns.str.lower()
        .str.replace(' ', '_', regex=False)
        .str.replace(r'[^a-zA-Z0-9_]', '', regex=True)  # remove special characters
        .str.replace(r'__+', '_', regex=True)           # collapse repeated underscores
    )
    return df

# clean_column_names assigns to df.columns on the frame it is given and
# returns that same frame, so final_stats and merged_stats are one object:
# merged_stats also carries the cleaned column names after this cell runs.
final_stats = clean_column_names(merged_stats)
final_stats

  df.columns = df.columns.str.replace(r'[^a-zA-Z0-9_]', '')  # remove special characters
  df.columns = df.columns.str.replace(r'__+', '_')


Unnamed: 0,name,team,age,position,country,height_cm,weight_kg,jersey_number,key_stats_goals,key_stats_assists,...,offense_goals_inside_the_box,offense_goals_outside_the_box,offense_other_goals,offense_successful_dribbles,offense_offsides,discipline_yellow_cards,discipline_yellow_red_cards,discipline_red_cards,discipline_fouls_conceded,discipline_fouls_won
0,Adrian,Liverpool,37,Goalkeeper,Spain,190,80,13,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Alisson,Liverpool,31,Goalkeeper,Brazil,193,91,1,0,0,...,0,0,0,0,0,1,0,0,1,4
2,Caoimhín Kelleher,Liverpool,25,Goalkeeper,Ireland,188,74,62,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Fabian Mrozek,Liverpool,20,Goalkeeper,Poland,192,0,93,,,...,,,,,,,,,,
4,Marcelo Pitaluga,Liverpool,21,Goalkeeper,Brazil,192,77,45,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
659,Daniel Jebbison,Sheffield United,20,Forward,Canada,190,69,36,,,...,,,,,,,,,,
660,Oliver McBurnie,Sheffield United,27,Forward,Scotland,188,79,9,3,1,...,3,0,0,4,4,1,2,2,12,17
661,Rhian Brewster,Sheffield United,23,Forward,England,180,75,7,0,0,...,0,0,0,0,0,0,0,0,2,1
662,Ryan Oné,Sheffield United,17,Forward,Scotland,0,0,26,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# save player stats dataframe to CSV file

# Relative path: the file is written to the current working directory, so the
# notebook runs on any machine instead of requiring one user's absolute
# Windows folder to exist. Change OUTPUT_CSV to write it elsewhere.
OUTPUT_CSV = "premier_league_player_stats.csv"
final_stats.to_csv(OUTPUT_CSV, index=False)