In [1]:
import requests
import json
import pandas as pd
import numpy as np
import bs4

# Premier League Football Matches' scores and Matches' schedule Data Extraction
Extracting all information and match statistics from the OneFootball website, which can be found here: https://onefootball.com/en/home
We focus on the Premier League during the 2023-2024 season. For sports betting purposes, the most relevant information is the current league ranking and the match results of the ongoing season.
The following notebook scrapes these data mainly using the BeautifulSoup package. Some attempts were made to use Selenium to interact with JavaScript elements, but they ended in failure. We adopted some less subtle and less clean methods instead, but we achieved the main objective of this notebook.

In [59]:
import re
import time
from bs4 import BeautifulSoup
import datetime

def get_teams_links(base_url, timeout=10):
    """
    Collect the team page links listed on a standings page.

    Parameters
    ----------
    base_url : str
        URL of the page to download and parse.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    list of str or None
        The ``href`` of every standings row anchor that contains a team-name
        paragraph, in page order. ``None`` when the response status is not 200.

    Raises
    ------
    requests.RequestException
        Propagated from ``requests.get`` (connection error, timeout, ...).
    """
    response = requests.get(base_url, timeout=timeout)
    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.text, "html.parser")
    team_links = []
    for tag in soup.find_all('a', class_='Standing_standings__rowGrid__45OOd'):
        # Only rows that actually show a team name are team rows.
        if tag.find('p', class_='Standing_standings__teamName__psv61') is None:
            continue
        # .get() instead of [] so an anchor without an href is skipped
        # rather than aborting the whole scrape with a KeyError.
        href = tag.get('href')
        if href:
            team_links.append(href)

    return team_links


def get_match(base_url, id_match, timeout=10):
  """
  Scrape one match page and return its data as a one-row DataFrame.

  Parameters
  ----------
  base_url : str
      Prefix of the match URL; ``id_match`` is appended to it.
  id_match : str
      Match identifier, as a string.
  timeout : float, optional
      Seconds to wait for the server before giving up.

  Returns
  -------
  pandas.DataFrame or None
      * Future match (date after now): columns ``team_home``, ``team_away``,
        ``onefootball_link``, ``datetime``, ``location``.
      * Past match: the same columns plus ``score_home``, ``score_away``,
        the prediction columns (``home_win_proba``, ``draw_proba``,
        ``away_win_proba``) and the ``home_*`` / ``away_*`` statistics that
        are present on the page.
      * ``None`` when the page cannot be used: non-200 status, not a match
        page, not a Premier League match, or an expected element (teams,
        date, location, score) is missing or malformed.

  Raises
  ------
  requests.RequestException
      Propagated from ``requests.get`` (connection error, timeout, ...).
  """
  url = base_url + id_match
  rep = requests.get(url, timeout=timeout)
  if rep.status_code != 200:
    return None

  soup = BeautifulSoup(rep.text, "html.parser")

  # The team names are read from the page title, which is expected to look
  # like "<home> vs <away> | ...". A page without "vs" is not a match page.
  title = soup.title.text if soup.title is not None else ""
  if "vs" not in title:
    return None

  league = soup.find("span", class_="title-7-medium MatchScoreCompetition_competitionName__wONrf")
  if league is None or league.text != "Premier League":
    return None

  # maxsplit=1 guarantees exactly two parts even if "vs" appears again later.
  home_part, away_part = title.split("vs", 1)
  away_match = re.match(r'^(.*?)(?=\|)', away_part)
  if not away_match:
    # Without the "|" delimiter the away team cannot be isolated; skip the
    # page instead of failing later on an unbound name.
    print("Pattern not found in the string.")
    return None

  # Strip the padding left around "vs" and "|" so names are clean join keys.
  team_home = home_part.strip()
  team_away = away_match.group(1).strip()

  res = {
    "team_home": team_home,
    "team_away": team_away,
    "onefootball_link": url,
  }

  info = soup.find_all("span", class_="title-8-regular MatchInfoEntry_subtitle__Mb7Jd")
  if len(info) < 3:
    # Date is read from entry 1 and location from entry 2.
    return None
  res["datetime"] = info[1].text
  res["location"] = info[2].text
  try:
    date = datetime.datetime.strptime(str(res["datetime"]), "%d/%m/%Y")
  except ValueError:
    print("Unexpected date format: ", res["datetime"])
    return None

  print(team_home , " vs ", team_away, " ", str(res["datetime"]))

  # Future match: nothing more to collect.
  if date > datetime.datetime.now():
    return pd.DataFrame(res, index=[0])

  score_tag = soup.find("p", class_="MatchScore_scores__Hnn5f title-2-bold")  # get the score
  if score_tag is None:
    return None
  score_parts = score_tag.text.split(":")
  if len(score_parts) != 2:
    return None
  res["score_home"], res["score_away"] = score_parts

  # Match predictions given before the match. zip() stops at the shorter
  # sequence, so extra elements on the page cannot cause an IndexError.
  preds = soup.find_all("li", class_="MatchPrediction_buttonsElement__EDqZN")
  probas = ["home_win_proba", "draw_proba", "away_win_proba"]
  for proba_name, pred_tag in zip(probas, preds):
    res[proba_name] = pred_tag.text

  # Statistics are mapped by position: the n-th "Entry_home"/"Entry_away"
  # paragraph is stored under the n-th name below.
  list_elements = ["possession", "total_shots", "shots_on_target", "duels_won"]
  for side in ("home", "away"):
    marker = "Entry_" + side
    stat_tags = soup.find_all('p', class_=lambda x: x and marker in x)
    for stat_name, stat_tag in zip(list_elements, stat_tags):
      res[side + "_" + stat_name] = stat_tag.text

  return pd.DataFrame(res, index=[0])

In [60]:
# Crawl consecutive match ids starting from the season opener and sort each
# scraped match into "already played" (has a score) or "still to come".
list_results = []
list_future_matches = []
match_id = 2390326  #first match of the 2023-2024 Premier League Season

base_url = "https://onefootball.com/en/match/"

# Stop once this many ids in a row yield no usable match.
MAX_CONSECUTIVE_MISSES = 10
cpt = 0  # number of consecutive ids that returned no match

while cpt < MAX_CONSECUTIVE_MISSES:
  time.sleep(0.1)  # small pause between requests
  df = get_match(base_url, str(match_id))
  if df is None:
    cpt += 1
  else:
    cpt = 0
    if "score_home" in df.columns:
      list_results.append(df)
    else:
      list_future_matches.append(df)

  match_id += 1

# Always (re)define both tables, falling back to an empty frame, so later
# cells never hit a NameError or silently reuse a table left over from a
# previous run of this cell.
table_past_matches = pd.concat(list_results, ignore_index=True) if list_results else pd.DataFrame()
table_future_matches = pd.concat(list_future_matches, ignore_index=True) if list_future_matches else pd.DataFrame()

Burnley   vs   Manchester City    11/08/2023
Arsenal   vs   Nottingham Forest    12/08/2023
AFC Bournemouth   vs   West Ham United    12/08/2023
Brighton & Hove Albion   vs   Luton Town    12/08/2023
Everton   vs   Fulham    12/08/2023
Newcastle United   vs   Aston Villa    12/08/2023
Sheffield United   vs   Crystal Palace    12/08/2023
Brentford   vs   Tottenham Hotspur    13/08/2023
Chelsea   vs   Liverpool    13/08/2023
Manchester United   vs   Wolverhampton Wanderers    14/08/2023
Aston Villa   vs   Everton    20/08/2023
Crystal Palace   vs   Arsenal    21/08/2023
Fulham   vs   Brentford    19/08/2023
Liverpool   vs   AFC Bournemouth    19/08/2023
Luton Town   vs   Burnley    03/10/2023
Manchester City   vs   Newcastle United    19/08/2023
Nottingham Forest   vs   Sheffield United    18/08/2023
Tottenham Hotspur   vs   Manchester United    19/08/2023
West Ham United   vs   Chelsea    20/08/2023
Wolverhampton Wanderers   vs   Brighton & Hove Albion    19/08/2023
Arsenal   vs   Fulha

In [62]:
# Display the scraped results of the matches that already have a score (one row per match).
table_past_matches

Unnamed: 0,team_home,team_away,onefootball_link,datetime,location,score_home,score_away,home_win_proba,draw_proba,away_win_proba,home_possession,home_total_shots,home_shots_on_target,home_duels_won,away_possession,away_total_shots,away_shots_on_target,away_duels_won
0,Burnley,Manchester City,https://onefootball.com/en/match/2390326,11/08/2023,Turf Moor,0,3,0%,0%,0%,34%,6,1,45%,66%,17,8,55%
1,Arsenal,Nottingham Forest,https://onefootball.com/en/match/2390327,12/08/2023,Emirates Stadium,2,1,0%,0%,0%,79%,15,7,46%,21%,6,2,54%
2,AFC Bournemouth,West Ham United,https://onefootball.com/en/match/2390328,12/08/2023,Vitality Stadium,1,1,0%,0%,0%,63%,14,5,41%,37%,16,3,59%
3,Brighton & Hove Albion,Luton Town,https://onefootball.com/en/match/2390329,12/08/2023,Amex Stadium,4,1,0%,0%,0%,71%,27,12,47%,29%,9,3,53%
4,Everton,Fulham,https://onefootball.com/en/match/2390330,12/08/2023,Goodison Park,0,1,0%,0%,0%,40%,19,9,46%,60%,9,2,55%
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193,Manchester City,Sheffield United,https://onefootball.com/en/match/2390521,30/12/2023,Etihad Stadium,2,0,90%,2%,8%,81%,18,4,49%,19%,4,2,51%
194,Nottingham Forest,Manchester United,https://onefootball.com/en/match/2390522,30/12/2023,The City Ground,2,1,21%,3%,76%,45%,8,2,55%,55%,10,5,45%
195,Tottenham Hotspur,AFC Bournemouth,https://onefootball.com/en/match/2390523,31/12/2023,Tottenham Hotspur Stadium,3,1,82%,3%,15%,51%,12,6,50%,49%,24,4,50%
196,West Ham United,Brighton & Hove Albion,https://onefootball.com/en/match/2390524,02/01/2024,London Stadium,0,0,61%,7%,32%,31%,6,2,55%,69%,22,8,45%


In [63]:
# Compute the date stamp once so both exports always share the same suffix,
# even if the cell happens to run across midnight.
export_date = datetime.datetime.now().strftime("%d_%m_%Y")
table_past_matches.to_csv('premier_league_past_matches_' + export_date + ".csv")
table_future_matches.to_csv('premier_league_future_matches_' + export_date + ".csv")

## Get the result table

In [64]:
def get_result_table(timeout=10):
  """
  Scrape the Premier League standings table from OneFootball.

  Parameters
  ----------
  timeout : float, optional
      Seconds to wait for the server before giving up.

  Returns
  -------
  pandas.DataFrame
      Indexed by ``Rank`` with columns ``Team``, ``Played``, ``Wins``,
      ``Draw``, ``Losses``, ``Goal_diff``, ``Points``. All values are the
      raw page text (strings). The table is empty when the response status
      is not 200 or no standings rows are found.

  Raises
  ------
  ValueError
      If a standings row does not yield exactly one value per column.
  requests.RequestException
      Propagated from ``requests.get`` (connection error, timeout, ...).
  """
  link = "https://onefootball.com/en/competition/premier-league-9/table"
  columns = ["Rank", "Team", "Played", "Wins", "Draw", "Losses", "Goal_diff", "Points"]
  rows = []

  rep = requests.get(link, timeout=timeout)
  if rep.status_code == 200:
    page = BeautifulSoup(rep.text, "html.parser")
    rankings = page.find_all("li", class_="Standing_standings__row__5sdZG Standing_standings__rowLink__Skr86")
    for results in rankings:
      # Each non-empty <div> of a row holds one column value, in column order.
      cells = [t.text for t in results.find_all("div") if t.text != '']
      if len(cells) != len(columns):
        raise ValueError(
          "Standings row has %d values, expected %d: %r" % (len(cells), len(columns), cells)
        )
      rows.append(cells)

  # Build the frame once from the collected rows instead of growing it
  # row by row.
  return pd.DataFrame(rows, columns=columns).set_index("Rank")

# Fetch the Premier League standings table and display it as the cell output.
get_result_table()

Unnamed: 0_level_0,Team,Played,Wins,Draw,Losses,Goal_diff,Points
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Liverpool,20,13,6,1,25,45
2,Aston Villa,20,13,3,4,16,42
3,Manchester City,19,12,4,3,24,40
4,Arsenal,20,12,4,4,17,40
5,Tottenham Hotspur,20,12,3,5,13,39
6,West Ham United,20,10,4,6,3,34
7,Brighton & Hove Albion,20,8,7,5,5,31
8,Manchester United,20,10,1,9,-5,31
9,Newcastle United,20,9,2,9,10,29
10,Chelsea,20,8,4,8,3,28
