# Web Scraping of sport articles

In [1]:
# Import relevent librairies
import requests
from bs4 import BeautifulSoup
from urllib.request import urlopen
import xml.etree.ElementTree as ET

import numpy as np
import pandas as pd

import time
from datetime import datetime

import glob
import os

import re #Regular expression operations module
import warnings
warnings.filterwarnings("ignore")
from gensim.summarization import summarize
from gensim.summarization.textcleaner import clean_text_by_sentences as _clean_text_by_sentences

In [211]:
# url definition
URL= "https://news.google.com/stories/CAAqSQgKIkNDQklTTERvSmMzUnZjbmt0TXpZd1NoOGFIV1JuVWs5bUxWa3llVEpzVWtwcVRUZEVkelZFWlVwVk4xcFVZV05OS0FBUAE?q=ligue+1+psg+blessure&lr=French&hl=fr&gl=FR&ceid=FR:fr"

def currrent_date_time():
    return datetime.now().strftime("%d-%m-%Y-%H-%M-%S")

def replace_non_asci_char(string):
    # https://stackoverflow.com/questions/227459/how-to-get-the-ascii-value-of-a-character
    list_string = list(string)
    changes= [] 
    for element,i in zip(list_string,range(len(list_string))):
        if ord(element)>128 : changes.append([i,element])
            
    while changes != []:
        i,element = changes.pop()
        list_string[i] = quote(element)
    return "".join(list_string)

def m1_get_URL_articles(search_keywords,n_articles=10):
    ''' Method 1 : parse the hmtl through the google news search engine with BeautifulSoup
    Goal : Obtain list of article links from the google search page url
    input : 
    keywords = list of keywords to be converted to a url in google news search pages (string) 
    n_articles = number of articles to scrap by keywords
    ouput : 
    URL_articles = list of url of all articles referenced by class = "VDXfz" (list of strings) '''
    
    URL_search = 'https://news.google.com/search?q='+'+'.join(search_keywords)+'&hl=fr&gl=FR&ceid=FR%3Afr' 
    # Request
    r1 = requests.get(URL_search)
    assert r1.status_code==200
    
    # We'll save in coverpage the cover page content
    search_page = r1.content
    
    # Soup creation
    soup1 = BeautifulSoup(search_page, 'html5lib')
    
    # News identification
    search_page_articles = soup1.findAll("a",class_="VDXfz") # find all articles by html class
    URL_articles=['https://news.google.com'+search_page_articles[i]['href'][1:] for i in range(n_articles)] # We remove the first chararcter '.' and retrieve only the first n_articles articles. The order of articles is a Google algorithm
    
    return URL_articles  

def m1_get_contents(URL_articles):
    '''Method 1 : parse the hmtl through the google news search engine with BeautifulSoup
    Goal : Get the contents of each article from their link
    input :
        links = list of url of all articles referenced by class ="VDXfz (list of strings) 
        n = number of articles
    ouput : 
        links = list of resulting article URL after redirection (list of string)
        titles = list of associated titles (list of string)
        contents = list of associated contents (list of string)    '''
    contents = []
    links = []
    titles = []

    for link in URL_articles:

        # Reading the link (it is divided in paragraphs)
        article = requests.get(link)
        
        # Getting the link
        links.append(article.url) ## get the resulting URL after redirection

        # if cannot access the link : add "None"
        if article.status_code!=requests.codes.ok:
            titles.append("None")
            contents.append("None")
        # else parse the article 
        else :
            article_content = article.content
            soup_article = BeautifulSoup(article_content, 'html5lib')

            # Getting the title
            title = soup_article.title.get_text()
            titles.append(title)

            # Getting the content
            body = soup_article.find_all('p')   
            
            if (len(body)!=0) :
                # Unifying the paragraph
                p_tags_text=[tag.get_text().strip() for tag in body]
                # Filter out sentences that contain newline characters '\n' or don't contain periods.
                sentence_list = [sentence for sentence in p_tags_text if not '\n' in sentence]
                sentence_list = [sentence for sentence in sentence_list if '.' in sentence]
                final_article = "".join(sentence_list)  
                contents.append(final_article)
            else:
                # Did not find text in p class ex : https://www.jeunesfooteux.com/OM-AS-Monaco-Coup-dur-pour-l-ASM-avant-Marseille-Monaco-_a43907.html
                contents.append("None")
    return links, titles, contents

def m2_get_contents(search_keywords,n_articles=10):
    ''' Method 2 : extract the news with the google RSS feeds by parsing the xml files '''
    search_string_encoded = replace_non_asci_char('+'.join(search_keywords))
    # for non ascii characters like "é"  
    URL_search = 'https://news.google.com/rss/search?q='+search_string_encoded+'&hl=fr&gl=FR&ceid=FR%3Afr' 
    # Parse XML Data with Python
    response = urlopen(URL_search).read()
    xmldoc = ET.fromstring(response)
    
    links = []
    titles = []
    dates = []
    for index, item in zip(range(n_articles), xmldoc.iterfind('channel/item')) :    
        title = item.findtext('title')
        date = item.findtext('pubDate')
        dt_publication = datetime.strptime(date,'%a, %d %b %Y %H:%M:%S %Z')
        link = item.findtext('link')
        
        ##if pass: date condition
        links.append(link)
        titles.append(title)
        dates.append(dt_publication)
    # links and links2 are the same 
    # titles and titles are almost the same, titles is slightly better
    links2, titles2, contents  = m1_get_contents(links)
    
    return links, titles, contents, dates

    
def overview(links, titles, contents,n_articles=5):
    ''' Goal : displays/checks by printing the data associated '''
    for n in np.arange(0, n_articles):
        print('-------------------------------')
        print('title : {}'.format(titles[n]))
        print('link : {}'.format(links[n]))
        print('content : {}'.format(" ".join(contents[n][:300].split())[:200]+"...")) ##first 150 chars

def scrap_news_single_search(search_keywords,n_articles=10):
    ''' Goal : create data frame
    input :
         url = url of google news search pages (string) 
         team = name of the team (string)
         n = number of articles (integer)
    ouput : 
         df_features'''
    # method 1 
    '''# get links of article from google news
    URL_articles = m1_get_URL_articles(search_keywords, n_articles)
    # get content within each article
    links,titles,contents=m1_get_contents(URL_articles)'''
    # method 2
    # get content within each article
    links,titles,contents,dates = m2_get_contents(search_keywords, n_articles)
   
    # df_features
    df_news = pd.DataFrame({'article_title': titles,
                            'content': contents,
                            'keywords': " ".join(search_keywords),
                            'article_link': links,
                            'dt_publication':dates,
                            'dt_extraction': currrent_date_time()})
    
    return df_news

def scrap_news_searches(searches,n_articles=10,n_days=14):
    # initialize df
    df_news = pd.DataFrame(columns=['article_title','content','keywords','article_link','dt_publication','dt_extraction'])
    
    # compute the scrap for each search_keywords
    for search_keywords in searches : 
        df = scrap_news_single_search(search_keywords,n_articles = n_articles)
        df_news = df_news.append(df)
        print('done with '+ " ".join(search_keywords))
    # filter out if publication date is more than n_days days
    df_filtered = df_news[df_news['dt_publication'] >= datetime.now()- timedelta(days=n_days)]

    return df_filtered

### Let's extract the text from the articles:

In [118]:
# Step 1 : Obtain list of article links from the google search page (searching "ligue 1 OM blessure")
# Step 2 : Get the contents of each article from their link

search_keywords = ['ligue', '1', 'om', 'blessure']
start = time.time()
df_news = scrap_news(search_keywords,n_articles=10)
print("The time elapsed is %f seconds" %(time.time()-start))
df_news.head()

The time elapsed is 5.372430 seconds


Unnamed: 0,article_title,content,keywords,article_link,dt_publication,dt_extraction
0,OM - ASM : Morgan Sanson sorti sur blessure - ...,Opposé à l'AS Monaco dans le cadre de la 14e j...,ligue 1 om blessure,https://www.footmercato.net/a27598870876164670...,2020-12-12 08:00:00,29-12-2020-15-50-00
1,Marseille-Monaco : pourquoi l'OM est épargné p...,Hiroki Sakai a de la chance. Le latéral de l'O...,ligue 1 om blessure,https://www.leparisien.fr/sports/football/mars...,2020-12-12 08:00:00,29-12-2020-15-50-00
2,OM - André Villas-Boas revient sur la sortie s...,L'Olympique de Marseille a enchainé un sixième...,ligue 1 om blessure,http://www.madeinfoot.com/infos/article-om-and...,2020-12-12 08:00:00,29-12-2020-15-50-00
3,PSG - OL (0-1) : Grosse inquiétude pour Neymar...,Le PSG a peut-être perdu bien plus qu'un match...,ligue 1 om blessure,https://www.eurosport.fr/football/ligue-1/2020...,2020-12-13 08:00:00,29-12-2020-15-50-00
4,Angers - Pas de blessure grave pour Romain Tho...,Les dirigeants et supporters du SCO peuvent so...,ligue 1 om blessure,http://www.madeinfoot.com/infos/article-angers...,2020-12-26 08:15:00,29-12-2020-15-50-00


In [94]:
# Then we define the searches to be made, and run the scrapping for earch search 
files = os.listdir('../data/teams/')
paths=[os.path.join(path, basename) for basename in files]
latest_file = max(paths, key=os.path.getctime)
# https://stackoverflow.com/questions/39327032/how-to-get-the-latest-file-in-a-folder/39327156#39327156

teams = pd.read_csv(latest_file,sep=";",encoding="latin-1")
teams 

Unnamed: 0.1,Unnamed: 0,footballer,team,position,price,player,dt_extraction
0,0,Larsonneur,Brest,Gardien,39,les Cocos Singapouriens,28-12-2020-18-10-55
1,1,Ntumba,Dijon,Gardien,1,les Cocos Singapouriens,28-12-2020-18-10-55
2,2,Gomis,Rennes,Gardien,22,les Cocos Singapouriens,28-12-2020-18-10-55
3,3,Kamara,Marseille,Def. Cen.,16,les Cocos Singapouriens,28-12-2020-18-10-55
4,4,GonzÁLez,Marseille,Def. Cen.,15,les Cocos Singapouriens,28-12-2020-18-10-55
5,5,Balerdi,Marseille,Def. Cen.,13,les Cocos Singapouriens,28-12-2020-18-10-55
6,6,Disasi,Monaco,Def. Cen.,30,les Cocos Singapouriens,28-12-2020-18-10-55
7,7,Robson Bambu,Nice,Def. Cen.,7,les Cocos Singapouriens,28-12-2020-18-10-55
8,8,Muzinga,Dijon,Def. Lat.,5,les Cocos Singapouriens,28-12-2020-18-10-55
9,9,Amavi,Marseille,Def. Lat.,16,les Cocos Singapouriens,28-12-2020-18-10-55


In [222]:
searches_composition_probable = [["composition","probable",team] for team in teams['team'].unique()]
print ( "number of teams searched: "  + str(len(searches_composition_probable)))
composition_probable =  scrap_news_searches(searches_composition_probable,n_articles=10,n_days=7)

from IPython.display import HTML
composition_probable = composition_probable[["keywords","article_title","content","article_link","dt_publication"]]
HTML(composition_probable.to_html())

number of teams searched: 15
done with composition probable Brest
done with composition probable Dijon
done with composition probable Rennes
done with composition probable Marseille
done with composition probable Monaco
done with composition probable Nice
done with composition probable Lyon
done with composition probable Metz
done with composition probable Paris
done with composition probable Lille
done with composition probable Saint-Étienne
done with composition probable Bordeaux
done with composition probable Montpellier
done with composition probable Reims
done with composition probable Strasbourg


Unnamed: 0,keywords,article_title,content,article_link,dt_publication
4,composition probable Dijon,"Nîmes - Dijon : Les compos probables avec Sarr, Ripart et Marié - Sport.fr - Sport.fr","Nîmes : Reynet – Alakouch, Guessoum, Miguel, Paquiez – Sarr, Ahlinvi – Ferhat, Duljevic, Eliasson – Ripart (c).Dijon : Racioppi – Boey, Ecuele Manga (c), Panzo, Ngonda – Marié, Ndong – Sammaritano, Celina, Dina Ebimbe – Baldé.Journaliste Sport.fr© SPORT.FR© SPORT.FR",https://www.sport.fr/football/nimes-dijon-les-compos-probables-avec-sarr-ripart-et-marie-746482.shtm,2020-12-23 11:40:00
3,composition probable Nice,"L1 : Nice-Lorient, les compos probables - Maxifoot","Nice accueille Lorient, ce mercredi (19h, Téléfoot Stadium 4), à l’occasion de la 17e journée de Ligue 1. Pour cette rencontre, le coach azuréen, Adrian Ursea, doit faire sans Danilo, Dante, Coly, Lees-Melou, Maolida et Dolberg, à l’infirmerie. Gouiri est suspendu. Pour sa part, le technicien breton, Christophe Pélissier, est privé de Mendes, Saunier, Homawoo, Fontaine et Hamel, blessés, alors que Gravillon est suspendu. Voici la composition probable des deux équipes.Nice : Benitez - Daniliuc, Bambu, Pelmard - Atal, Thuram, Schneiderlin (c), Kamara - Rony Lopes, Ndoye, Reine-Adélaïde.Lorient : Nardi - Delaplace, Laporte, Morel, Le Goff - Le Fée, Lemoine (c), Abergel - Laurienté, Moffi, Boisgard.",https://news.maxifoot.fr/ligue-1/nice-lorient-les-compos-probables-foot-338649.htm,2020-12-23 10:28:00
0,composition probable Lyon,"L1 : Lyon-Nantes, les compos probables - Maxifoot","Ce mercredi (21h, Téléfoot Stadium 2), l’Olympique Lyonnais affronte le FC Nantes lors de la 17e journée de Ligue1. Pour cette partie, l’entraîneur rhodanien, Rudi Garcia, doit seulement faire sans Dembélé, blessé. Pour sa part, le coach intérimaire des Canaris, Patrick Collot, est privé de Perreira, Augustin et Limbombe, à l’infirmerie, tandis que Fabio et Blas sont suspendus. Voici la composition probable des deux équipes.Lyon : Lopes - Dubois, Marcelo, Denayer, Cornet - Paqueta, Mendes, Aouar - Kadewere, Depay (c), Toko Ekambi.Nantes : Lafont - Corchia, Castelletto, Pallois (c), Traoré - Abeid, Girotto, Louza - Coco, Kolo Muani, Bamba.",https://news.maxifoot.fr/ligue-1/lyon-nantes-les-compos-probables-foot-338655.htm,2020-12-23 10:33:00
0,composition probable Saint-Étienne,"L1 : Monaco-St Etienne, les compos probables - Maxifoot","Monaco affronte Saint-Étienne, ce mercredi (21h, Téléfoot Stadium 3), à l’occasion de la 17e journée de Ligue 1. Pour ce match, le coach asémiste, Niko Kovac, doit faire sans Lecomte et Fabregas, blessés, alors que Fofana est suspendu. De son côté, le technicien stéphanois est privé de Retsos, Moukoudi, Gabriel Silva, Maçon et Khazri, tous à l’infirmerie. Voici la composition probable des deux équipes.Monaco : Mannone - Aguilar, Badiashile, Maripan, Caio Henrique - Gelson Martins, Fofana, Tchouameni, Diop - Volland, Ben Yedder (c).St Etienne : Moulin – Debuchy (c), Camara, Kolodziejczak, Trauco – Nordin, Youssouf, Neyou, Aouchiche – Hamouma, Bouanga.",https://news.maxifoot.fr/ligue-1/monaco-st-etienne-les-compos-probables-foot-338656.htm,2020-12-23 10:31:00
4,composition probable Saint-Étienne,"Monaco - ASSE : Les compos probables avec Diop, Nordin et Bouanga - Sport.fr - Sport.fr","Monaco : Mannone – Aguilar, Badiashile, Maripan, Caio Henrique – Gelson Martins, Fofana, Tchouameni, Diop – Volland, Ben Yedder (c).St Etienne : Moulin – Debuchy (c), Camara, Kolodziejczak, Trauco – Nordin, Youssouf, Neyou, Aouchiche – Hamouma, Bouanga.Journaliste Sport.fr© SPORT.FR© SPORT.FR",https://www.sport.fr/football/monaco-asse-les-compos-probables-avec-diop-nordin-et-bouanga-746500.shtm,2020-12-23 13:02:00
0,composition probable Bordeaux,Bordeaux-Reims : la composition d'équipe probable des Girondins - WebGirondins,"À l'heure où nous publions cette composition d'équipe probable, le groupe de joueur n'a pas été communiqué. Nous savons que Kalu, Koscielny, Otavio et Kwateng sont absents.Des incertitudes sont présentes pour la participation de Loris Benito (dos) et celle de De Nicolas Préville (cuisse).Le groupe sera communiqué à midi, après la séance d'entraînement de mercredi matin.FCNhisto.fr",https://www.webgirondins.com/actualite-girondins-bordeaux-reims-la-composition-d-equipe-probable-des-girondins-107890,2020-12-23 00:00:00
0,composition probable Montpellier,"L1 : Montpellier-Lille, les compos probables - Maxifoot","Montpellier et Lille se retrouvent, ce mercredi (21h, Canal+ Sport), dans le cadre de la 17e journée de Ligue 1. Pour ce choc, l’entraîneur héraultais, Michel Der Zakarian, doit faire sans Savanier, blessé. De son côté, le coach nordiste, Christophe Galtier, est privé de Pied, Soumaoro, Sanches et Luiz Araujo, tous à l’infirmerie. Voici la composition probable des deux équipes.Montpellier : Omlin - Mendes, Hilton (c), Congré - Sambia, Ferri, Mollet, Chotard, Ristic - Laborde, Delort.Lille : Maignan - Celik, Fonte (c), Botman, Reinildo - Ikoné, André, Xeka, Bamba - Yazici, David.",https://news.maxifoot.fr/ligue-1/montpellier-lille-les-compos-probables-foot-338657.htm,2020-12-23 10:32:00
7,composition probable Montpellier,"Montpellier - Lille : Les choix de Galtier avec Xeka, Bamba et Fonte - Sport.fr - Sport.fr","Montpellier : Omlin – Mendes, Hilton (c), Congré – Sambia, Ferri, Mollet, Chotard, Ristic – Laborde, Delort.Lille : Maignan – Celik, Fonte (c), Botman, Reinildo – Ikoné, André, Xeka, Bamba – Yazici, David.Journaliste Sport.fr© SPORT.FR© SPORT.FR",https://www.sport.fr/football/montpellier-lille-les-choix-de-galtier-avec-xeka-bamba-et-fonte-746502.shtm,2020-12-23 13:26:00
6,composition probable Strasbourg,"L1 : Paris SG-Strasbourg, les compos probables - Maxifoot","Le Paris Saint-Germain accueille Strasbourg, ce mercredi (21h, Canal+), lors de la 17e journée de Ligue 1. Pour cette partie, le coach parisien, Thomas Tuchel, est privé de très nombreux joueurs puisque Florenzi, Diallo, Kimpembe, Kurzawa, Bernat, Danlio, Paredes, Sarabia, Neymar et Icardi sont tous à l’infirmerie. Mbappé pourrait débuter la rencontre sur le banc. De son côté, l’entraîneur alsacien, Thierry Laurey, se déplace sans Sels, Kamara, Koné, Saadi, Waris et Mothiba, tous blessés. Voici la composition probable des deux équipes.Paris SG : Navas - Dagba, Marquinhos (c), Kehrer, Bakker - Verratti, Gueye, Herrera - Di Maria, Kean, Draxler.Strasbourg : Kawashima - Lala, Simakan, Mitrovic (c), Djiku, Carole - Bellegarde, Aholou, Liénard - Diallo, Ajorque.",https://news.maxifoot.fr/ligue-1/paris-sg-strasbourg-les-compos-probables-foot-338658.htm,2020-12-23 10:35:00


In [241]:
teams.iloc[10,1] in composition_probable.iloc[0,2]
#'Ndong' in contents of "Dijon" ! 
# Teams LEFT JOIN on equipe = keywords


True

In [None]:
blessure = []

In [None]:
# list of ligue 1 teams
teams= ['Amiens SC','Angers SCO','AS Monaco','AS Saint-Étienne','Dijon FCO','FC Metz','FC Nantes',
         'Girondins de Bordeaux','Lille OSC','Montpellier HSC','OGC Nice','Olympique de Marseille',
         'Olympique lyonnais','Paris Saint-Germain','RC Strasbourg','Stade brestois 29','Stade de Reims',
         'Stade rennais FC','Toulouse FC']

##hdr = {'User-Agent': 'Mozilla/5.0'}
hdr = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
       'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
       'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
       'Accept-Encoding': 'gzip, deflate, br',
       'Accept-Language': 'fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7',
       'Connection': 'keep-alive'}

r= requests.get('https://www.ouest-france.fr/sport/football/olympique-de-marseille/ligue-1-morgan-sanson-absent-pour-le-prochain-match-de-l-om-contre-le-stade-rennais-7084042',
                headers = hdr)

r.raise_for_status()

In [None]:
def function_summarize(row):
    if len(_clean_text_by_sentences(row.Content))<=1:
        return row.Content
    else:
        #word count defines the number of words of the summary
        return summarize(row.Content,word_count=60)

df_features['Summary']=df_features.apply(function_summarize, axis = 1)
##add a date stamp to the info
df_show_info['Search Date']=pd.Timestamp.today().date()

In [None]:
#print(df_features['Summary'].to_string())
df_features.iloc[1,2]

In [None]:
#df_features.to_csv("df_features")