In [2]:
import re
import time
from bs4 import BeautifulSoup
import datetime
import requests
import json
import pandas as pd
import numpy as np
import bs4

# Premier League Football news Data Extraction
Since the Premier League (PL) is always a hot topic, dozens of articles are written about it every day (scores, transfers, gossip and so on). We propose to aggregate PL-related articles from two sources: https://onefootball.com/en/competition/premier-league-9 and https://www.bbc.com/sport/football/premier-league. We also tried to scrape other websites such as eurosport.com, but they are protected against scraping and crawling and redirect our GET requests.  


### Articles from One Football


In [21]:
from lxml import html, etree

def get_news_link_onefootball(link, timeout=10):
  """
  Extract the news links found on a OneFootball web page.

  Parameters
  ----------
  link : str
      URL of the page to download.
  timeout : float, optional
      Number of seconds passed to ``requests.get`` as ``timeout``
      (default 10).

  Returns
  -------
  set of str or None
      The distinct ``href`` values of the page's anchors that start with
      ``/en/news`` (site-relative paths), or ``None`` when the response
      status code is not 200.
  """
  # Without a timeout a stalled connection would block the crawl forever.
  rep = requests.get(link, timeout=timeout)
  if rep.status_code != 200:
    return None

  soup = BeautifulSoup(rep.text, 'html.parser')
  pattern = r"^/en/news"
  # href=True skips anchors that have no href attribute at all.
  return {
      anchor['href']
      for anchor in soup.find_all('a', href=True)
      if re.search(pattern, str(anchor['href']))
  }


# Quick check: list the news links found on the Premier League competition page.
link = "https://onefootball.com/en/competition/premier-league-9"
get_news_link_onefootball(link)

{'/en/news/20-most-expensive-premier-league-transfers-ever-based-on-inflation-adjusted-fees-38825330',
 '/en/news/40m-ace-has-been-watched-several-times-by-man-united-38825374',
 '/en/news/arsenal-learn-the-asking-price-for-23-year-old-attacking-target-38825375',
 '/en/news/liverpool-fan-favourite-has-been-offered-to-several-premier-league-clubs-38825312',
 '/en/news/man-city-complete-their-first-transfer-deal-of-the-january-window-38825126',
 '/en/news/man-united-are-keen-on-signing-the-next-moises-caicedo-38824982',
 '/en/news/man-utd-told-to-be-relieved-to-get-5m-back-for-deadweight-whose-agent-deserves-a-slap-38825358',
 '/en/news/tottenham-are-desperate-to-complete-the-signing-of-60m-striker-38825082',
 '/en/news/van-de-beek-makes-tough-man-utd-admission-as-he-highlights-one-positive-from-horror-prem-spell-38825093',
 '/en/news/why-jesse-lingard-still-doesnt-have-a-new-club-38825489'}

In [22]:
from collections import deque
def get_news_links_bfs(starting_url, website_path, link_extractor=None, visited_urls=None, max_nb=5):
  """
  Explore a website breadth-first and collect the URLs of the visited pages.

  Parameters
  ----------
  starting_url : str
      Absolute URL of the first page to visit.
  website_path : str
      Prefix prepended to every link returned by ``link_extractor`` before
      it is queued (the extractors in this notebook return site-relative
      paths).
  link_extractor : callable, optional
      Called with a page URL; returns an iterable of links, or ``None``
      when the page could not be read. Defaults to
      ``get_news_link_onefootball``, looked up at call time.
  visited_urls : iterable of str, optional
      URLs to start the visited set from. The argument is copied, never
      mutated.
  max_nb : int, optional
      The crawl stops once this many URLs are in the visited set.

  Returns
  -------
  set of str
      The visited URLs. When the crawl stops because it keeps revisiting
      known pages (stall detection below), the set still contains
      ``starting_url``; when it stops because the queue is empty or
      ``max_nb`` is reached, ``starting_url`` is removed.
  """
  if link_extractor is None:
    # Resolved lazily so the function does not capture a stale extractor
    # object when the extractor's cell is re-run.
    link_extractor = get_news_link_onefootball

  visited = set(visited_urls) if visited_urls is not None else set()
  queue = deque([starting_url])
  last_count = len(visited)     # visited size at the last iteration that made progress
  current_count = last_count    # visited size after the last successful extraction
  stalled = 0                   # consecutive iterations without progress

  while queue and len(visited) < max_nb:
    url = queue.popleft()
    visited.add(url)
    # NOTE(review): pages already in `visited` are fetched again here; the
    # stall counter below relies on those repeated visits to end the crawl.
    links = link_extractor(url)
    if links is not None:
      queue.extend(website_path + l for l in links)
      if len(visited) % 10 == 0:
        print("nb visited articles : ", len(visited))
      current_count = len(visited)

    if current_count == last_count:
      stalled += 1
    else:
      stalled = 0
      last_count = current_count
    # Stop once as many consecutive iterations as there are visited pages
    # brought nothing new.
    if stalled == last_count:
      return visited

  # discard: the starting URL is absent when the loop never ran (max_nb <= 0).
  visited.discard(starting_url)
  return visited

In [23]:
# Crawl OneFootball starting from the Premier League competition page;
# the crawl stops once 1000 URLs have been visited.
list_articles = get_news_links_bfs(
    starting_url="https://onefootball.com/en/competition/premier-league-9",
    website_path="https://onefootball.com",
    link_extractor=get_news_link_onefootball,
    max_nb=1000,
)

nb visited articles :  10
nb visited articles :  20
nb visited articles :  20
nb visited articles :  30
nb visited articles :  30
nb visited articles :  30
nb visited articles :  40
nb visited articles :  50
nb visited articles :  50
nb visited articles :  60
nb visited articles :  60
nb visited articles :  60
nb visited articles :  60
nb visited articles :  60
nb visited articles :  70
nb visited articles :  70
nb visited articles :  70
nb visited articles :  70
nb visited articles :  70
nb visited articles :  70
nb visited articles :  70
nb visited articles :  70
nb visited articles :  70
nb visited articles :  80
nb visited articles :  80
nb visited articles :  90
nb visited articles :  90
nb visited articles :  90


In [31]:
from datetime import datetime
import time
from tqdm import tqdm

def table_news_articles_one_football(list_links_articles, delay=0.1, timeout=10):
  """
  Download OneFootball article pages and tabulate their metadata.

  Parameters
  ----------
  list_links_articles : iterable of str
      Absolute URLs of the article pages.
  delay : float, optional
      Seconds to sleep before each request (default 0.1).
  timeout : float, optional
      Number of seconds passed to ``requests.get`` as ``timeout``
      (default 10).

  Returns
  -------
  pandas.DataFrame
      One row per page answered with status 200, with columns ``title``,
      ``released_date`` (``dd/mm/YYYY``), ``source``, ``url`` and
      ``mentions`` (comma-separated). A field that cannot be found on the
      page is left as ``None`` (``mentions`` as an empty string). The frame
      is empty, with the same columns, when no page could be read.
  """
  # Local alias: other cells rebind the global name `datetime` (module vs.
  # class), so the function must not depend on which cell ran last.
  from datetime import datetime as _datetime

  columns = ["title", "released_date", "source", "url", "mentions"]
  records = []
  for link in tqdm(list_links_articles):
    time.sleep(delay)
    rep = requests.get(link, timeout=timeout)
    if rep.status_code != 200:
      print("Error status code {}".format(rep.status_code))
      continue

    page = BeautifulSoup(rep.text, "html.parser")
    record = dict.fromkeys(columns)
    record["url"] = link
    if page.title is not None:
      # Keep only the part of the <title> before the first "|".
      record["title"] = page.title.text.split("|")[0]

    source = page.find("p", class_="title-8-bold")
    if source is not None:
      record["source"] = source.text
      # The element following the source is split on "·"; the second
      # piece is parsed as the publication date.
      date_tag = source.find_next_sibling()
      if date_tag is not None:
        parts = date_tag.text.split("·")
        if len(parts) > 1:
          try:
            parsed = _datetime.strptime(parts[1].strip(), "%d %B %Y")
            record["released_date"] = parsed.strftime('%d/%m/%Y')
          except ValueError:
            # Unexpected date format: keep the row, leave the date empty.
            record["released_date"] = None

    # The links are looked up in the next sibling of the heading's grandparent.
    mentions = []
    heading = page.find(string='Mentioned in this article')
    wrapper = heading.find_parent() if heading is not None else None
    wrapper = wrapper.find_parent() if wrapper is not None else None
    container = wrapper.find_next_sibling() if wrapper is not None else None
    if container is not None:
      mentions = [a.get("aria-label") for a in container.find_all("a")]
    # Anchors without an aria-label would break the join; skip them.
    record["mentions"] = ', '.join(m for m in mentions if m)

    records.append(record)

  # Build the frame once; this also works when `records` is empty.
  return pd.DataFrame(records, columns=columns)


In [29]:
# Drop the crawl's entry page (it is not an article) and turn the result into a list.
# `discard` instead of `remove`: get_news_links_bfs may already have removed the
# starting URL, and re-running this cell must not raise KeyError.
list_articles = set(list_articles)
list_articles.discard('https://onefootball.com/en/competition/premier-league-9')
list_articles = list(list_articles)
list_articles

['https://onefootball.com/en/news/forest-down-man-utd-villa-leave-it-late-city-palace-chelsea-win-38799472',
 'https://onefootball.com/en/news/which-premier-league-sides-will-suffer-most-with-afcon-and-asian-cup-call-ups-38815261',
 'https://onefootball.com/en/news/arsenal-top-this-premier-league-table-that-is-the-mark-of-champions-38798669',
 'https://onefootball.com/en/news/arsenal-are-keeping-a-keen-eye-on-this-juventus-forward-good-choice-for-arteta-38812588',
 'https://onefootball.com/en/news/every-starting-budget-at-each-premier-league-club-in-football-manager-2024-38816508',
 'https://onefootball.com/en/news/frustrated-liverpool-icon-firmino-could-make-pl-return-amid-shock-interest-from-english-club-38796143',
 'https://onefootball.com/en/news/premier-league-transfer-rumour-mill-vlahovic-a-no-go-for-arsenal-38824753',
 'https://onefootball.com/en/news/transfer-news-live-chelsea-rival-arsenal-for-toney-onana-move-dragusin-to-tottenham-update-sancho-latest-38803051',
 'https://one

In [32]:
# Scrape the metadata of every collected OneFootball article and display the resulting table.
table_articles_one_football = table_news_articles_one_football(list_articles)
table_articles_one_football

90it [00:45,  1.97it/s]


Unnamed: 0,title,released_date,source,url,mentions
0,🦁 Forest down Man Utd; Villa leave it late; Ci...,30/12/2023,OneFootball,https://onefootball.com/en/news/forest-down-ma...,"Burnley, Manchester United, Manchester City, C..."
1,Which Premier League sides will suffer most wi...,03/01/2024,The Independent,https://onefootball.com/en/news/which-premier-...,"Liverpool, Tottenham Hotspur, Manchester Unite..."
2,Arsenal top this Premier League table that is ...,30/12/2023,Football365,https://onefootball.com/en/news/arsenal-top-th...,"Arsenal, Manchester City, Brighton & Hove Albi..."
3,Arsenal Are Keeping A Keen Eye On This Juventu...,02/01/2024,The 4th Official,https://onefootball.com/en/news/arsenal-are-ke...,"Juventus, Arsenal, Dušan Vlahović"
4,Every starting budget at each Premier League c...,03/01/2024,GiveMeSport,https://onefootball.com/en/news/every-starting...,"Premier League, UEFA Champions League"
...,...,...,...,...,...
85,Why Tottenham trump Arsenal and Chelsea with J...,04/01/2024,Evening Standard,https://onefootball.com/en/news/why-tottenham-...,"Chelsea, Arsenal, Tottenham Hotspur, Premier L..."
86,Transfer news LIVE! Arsenal have Zubimendi pla...,30/12/2023,Evening Standard,https://onefootball.com/en/news/transfer-news-...,"Chelsea, Arsenal, Tottenham Hotspur, Mancheste..."
87,Man United are keen on signing ‘the next Moise...,05/01/2024,caughtoffside,https://onefootball.com/en/news/man-united-are...,"Manchester United, Brighton & Hove Albion, Pre..."
88,Eintracht Frankfurt reach agreement with Aurèl...,04/01/2024,Get German Football News,https://onefootball.com/en/news/eintracht-fran...,"Manchester United, Eintracht Frankfurt, Donny ..."


### Articles from BBC Sport

In [33]:
def get_news_link_bbc(link, timeout=10):
  """
  Extract news links from a BBC Sport web page.

  Parameters
  ----------
  link : str
      URL of the page to download.
  timeout : float, optional
      Number of seconds passed to ``requests.get`` as ``timeout``
      (default 10).

  Returns
  -------
  set of str or None
      The distinct ``/sport/football/<digits>`` fragments found in the
      ``href`` of the page's anchors, or ``None`` when the response status
      code is not 200.
  """
  # Without a timeout a stalled connection would block the crawl forever.
  rep = requests.get(link, timeout=timeout)
  if rep.status_code != 200:
    return None

  soup = BeautifulSoup(rep.text, 'html.parser')
  pattern = re.compile(r"/sport/football/\d+")
  article_paths = set()
  # Match on the href itself: testing the whole tag and then indexing
  # ['href'] fails on anchors without href or whose text merely contains
  # the pattern.
  for anchor in soup.find_all('a', href=True):
    match = pattern.search(str(anchor['href']))
    if match:
      article_paths.add(match[0])
  return article_paths

In [34]:
# Crawl BBC Sport starting from the Premier League index page;
# the crawl stops once 1000 URLs have been visited.
list_articles_bbc = get_news_links_bfs(
    starting_url="https://www.bbc.com/sport/football/premier-league",
    website_path="https://www.bbc.com",
    link_extractor=get_news_link_bbc,
    max_nb=1000,
)

nb visited articles :  10
nb visited articles :  20
nb visited articles :  30
nb visited articles :  40
nb visited articles :  40
nb visited articles :  50
nb visited articles :  50
nb visited articles :  50
nb visited articles :  50
nb visited articles :  50
nb visited articles :  60
nb visited articles :  60
nb visited articles :  70
nb visited articles :  70
nb visited articles :  70
nb visited articles :  70
nb visited articles :  70


In [35]:
import datetime

def get_news_table_bbc(list_articles_bbc, delay=0.1, timeout=10):
  """
  Download BBC Sport article pages and tabulate their metadata.

  Parameters
  ----------
  list_articles_bbc : iterable of str
      Absolute URLs of the article pages.
  delay : float, optional
      Seconds to sleep before each request (default 0.1).
  timeout : float, optional
      Number of seconds passed to ``requests.get`` as ``timeout``
      (default 10).

  Returns
  -------
  pandas.DataFrame
      One row per page answered with status 200, with columns ``title``
      (``og:title`` meta content), ``released_date`` (``dd/mm/YYYY``, from
      the first ``<time>`` tag), ``source`` (always ``"BBC Sport"``),
      ``url`` and ``mentions`` (text of the first anchor whose href starts
      with ``/sport/football/``). A field that cannot be found is ``None``.
      The frame is empty, with the same columns, when no page could be read.
  """
  # Local alias: other cells rebind the global name `datetime` (module vs.
  # class), so the function must not depend on which cell ran last.
  from datetime import datetime as _datetime

  columns = ["title", "released_date", "source", "url", "mentions"]
  records = []
  for link in tqdm(list_articles_bbc):
    time.sleep(delay)
    rep = requests.get(link, timeout=timeout)
    if rep.status_code != 200:
      print("Error status code {}".format(rep.status_code))
      continue

    soup = BeautifulSoup(rep.text, 'html.parser')
    record = dict.fromkeys(columns)
    record["source"] = "BBC Sport"
    record["url"] = link

    og_title_meta = soup.find('meta', {'property': 'og:title'})
    if og_title_meta:
      record["title"] = og_title_meta.get('content')

    # The date is read only when a <time> tag exists (the previous test was
    # inverted and dereferenced None).
    time_tag = soup.find("time")
    raw_datetime = time_tag.get("datetime") if time_tag is not None else None
    if raw_datetime:
      try:
        parsed = _datetime.strptime(raw_datetime.split("T")[0], "%Y-%m-%d")
        record["released_date"] = parsed.strftime("%d/%m/%Y")
      except ValueError:
        # Unexpected date format: keep the row, leave the date empty.
        record["released_date"] = None

    # select_one returns the first match, or None instead of raising
    # IndexError when nothing matches.
    first_football_link = soup.select_one('a[href^="/sport/football/"]')
    if first_football_link is not None:
      record["mentions"] = first_football_link.text

    records.append(record)

  # Build the frame once; this also works when `records` is empty.
  return pd.DataFrame(records, columns=columns)


In [38]:
# Drop the crawl's entry page (it is not an article), then scrape every remaining page.
# `discard` instead of `remove`: get_news_links_bfs may already have removed the
# starting URL, and re-running this cell must not raise KeyError.
list_articles_bbc = set(list_articles_bbc)
list_articles_bbc.discard("https://www.bbc.com/sport/football/premier-league")
list_articles_bbc = list(list_articles_bbc)
table_bbc = get_news_table_bbc(list_articles_bbc)
table_bbc

 41%|████▏     | 31/75 [00:13<00:19,  2.30it/s]

Error status code 404


100%|██████████| 75/75 [00:32<00:00,  2.28it/s]


Unnamed: 0,title,released_date,source,url,mentions
0,Fulham's Silva charged by FA after referee com...,04/01/2024,BBC Sport,https://www.bbc.com/sport/football/67886178,Fulham
1,Wan-Bissaka one of three to extend Man Utd deals,04/01/2024,BBC Sport,https://www.bbc.com/sport/football/67879910,Manchester United
2,Who was 'outstanding'? Garth's Team of the Week,02/01/2024,BBC Sport,https://www.bbc.com/sport/football/67865895,Premier League
3,Follow Liverpool in the BBC Sport app,05/01/2024,BBC Sport,https://www.bbc.com/sport/football/62289316,Liverpool
4,"Referee couldn't handle pressure, says angry S...",16/12/2023,BBC Sport,https://www.bbc.com/sport/football/67739716,Premier League
...,...,...,...,...,...
69,Follow your club and get news sent direct to you,05/01/2024,BBC Sport,https://www.bbc.com/sport/football/66302372,Premier League
70,Luton's Lockyer still in hospital after collap...,17/12/2023,BBC Sport,https://www.bbc.com/sport/football/67744280,Luton Town
71,Have your say on Man Utd,04/01/2024,BBC Sport,https://www.bbc.com/sport/football/59821410,Manchester United
72,Reguilon returns to Spurs as Man Utd end loan ...,02/01/2024,BBC Sport,https://www.bbc.com/sport/football/67864491,Premier League


In [42]:
# Stack the OneFootball and BBC tables into a single frame; ignore_index=True renumbers the rows.
table_articles_premier_league =  pd.concat([table_articles_one_football,table_bbc], ignore_index=True)

In [43]:
# Display the combined table.
table_articles_premier_league

Unnamed: 0,title,released_date,source,url,mentions
0,🦁 Forest down Man Utd; Villa leave it late; Ci...,30/12/2023,OneFootball,https://onefootball.com/en/news/forest-down-ma...,"Burnley, Manchester United, Manchester City, C..."
1,Which Premier League sides will suffer most wi...,03/01/2024,The Independent,https://onefootball.com/en/news/which-premier-...,"Liverpool, Tottenham Hotspur, Manchester Unite..."
2,Arsenal top this Premier League table that is ...,30/12/2023,Football365,https://onefootball.com/en/news/arsenal-top-th...,"Arsenal, Manchester City, Brighton & Hove Albi..."
3,Arsenal Are Keeping A Keen Eye On This Juventu...,02/01/2024,The 4th Official,https://onefootball.com/en/news/arsenal-are-ke...,"Juventus, Arsenal, Dušan Vlahović"
4,Every starting budget at each Premier League c...,03/01/2024,GiveMeSport,https://onefootball.com/en/news/every-starting...,"Premier League, UEFA Champions League"
...,...,...,...,...,...
159,Follow your club and get news sent direct to you,05/01/2024,BBC Sport,https://www.bbc.com/sport/football/66302372,Premier League
160,Luton's Lockyer still in hospital after collap...,17/12/2023,BBC Sport,https://www.bbc.com/sport/football/67744280,Luton Town
161,Have your say on Man Utd,04/01/2024,BBC Sport,https://www.bbc.com/sport/football/59821410,Manchester United
162,Reguilon returns to Spurs as Man Utd end loan ...,02/01/2024,BBC Sport,https://www.bbc.com/sport/football/67864491,Premier League


In [41]:
import os

# Write the combined table to <parent of the working directory>/data/articles_premier_leagues.csv.
current = os.getcwd()  # path of the current working directory
parent_directory = os.path.dirname(current)
data_directory = os.path.join(parent_directory, "data")
# to_csv does not create missing folders, so make sure the target exists.
os.makedirs(data_directory, exist_ok=True)
table_articles_premier_league.to_csv(os.path.join(data_directory, "articles_premier_leagues.csv"))