Créé le 9 mars 2021

**Projet Tableau de Bord** 

**Groupe n°3 - Arnaques en ligne**

**Scraping : récupération des articles**


@authors:
- VEDIS Theo 
- MANSON Marianne
- KIRED Nour Elhouda


#### Import libraries

In [None]:
########## Module import ##########
# Fichiers
import json
from tqdm.notebook import tqdm
from os import path 

# Scraping

from bs4 import BeautifulSoup
import requests 
import sys

# Format
import  time




#### Fonctions

Récupération des titres / résumés de chaque lien

In [None]:
def article_url_to_data(url, timeout: float = 30.0) -> dict:
    """Scrape a SpringerLink article page and return its metadata.

    Parameters:
        url: address of the article page to scrape.
        timeout: seconds to wait for the server before giving up, so a
            stalled connection cannot block the whole collection.

    Out:
        dict with the keys "url", "abstract", "author", "keyword", "title"
        and "date". If the HTTP request fails, every field except "url" is
        the string "ERREUR_REQUEST". Fields that cannot be found in the page
        fall back to "no_data" (abstract, date) or an empty list (author,
        keyword).

    Raises:
        AttributeError: if the page has no <title> element.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as err:
        # Only network/HTTP-layer failures are handled here; anything else
        # (including KeyboardInterrupt) is allowed to propagate.
        print("Error : ", type(err))
        return {
            "url": url,
            "abstract": "ERREUR_REQUEST",
            "author": "ERREUR_REQUEST",
            "keyword": "ERREUR_REQUEST",
            "title": "ERREUR_REQUEST",
            "date": "ERREUR_REQUEST",
        }

    soup = BeautifulSoup(response.text, "html.parser")

    abstract_tag = soup.find("div", {"id": "Abs1-section"})
    abstract = abstract_tag.text if abstract_tag is not None else "no_data"

    author = []
    keyword = []
    date = "no_data"

    # Authors, keywords and the date all live under the <article> element;
    # look it up once and keep the defaults above when it is absent.
    article = soup.find("article")
    if article is not None:
        for item in article.find_all("li", {"class": "c-author-list__item"}):
            name_tag = item.find("span", {"itemprop": "name"})
            if name_tag is not None:
                author.append(name_tag.text)

        keyword = [
            item.text
            for item in article.find_all(
                "li", {"class": "c-article-subject-list__subject"}
            )
        ]

        header = article.find("header")
        time_tag = header.find("time") if header is not None else None
        if time_tag is not None:
            date = time_tag.get("datetime", "no_data")

    title = soup.title.text.replace(" | SpringerLink", "")

    return {
        "url": url,
        "abstract": abstract,
        "author": author,
        "keyword": keyword,
        "title": title,
        "date": date,
    }


##############################################################################
def chapter_url_to_data(url, timeout: float = 30.0) -> dict:
    """Scrape a SpringerLink chapter page and return its metadata.

    Parameters:
        url: address of the chapter page to scrape.
        timeout: seconds to wait for the server before giving up, so a
            stalled connection cannot block the whole collection.

    Out:
        dict with the keys "url", "abstract", "author", "keyword", "title"
        and "date". "author" maps each author name to the list of
        affiliation texts whose identifier matches the author's reference.
        If the HTTP request fails, every field except "url" is the string
        "ERREUR_REQUEST". Fields that cannot be found in the page fall back
        to "no_data" (abstract, date), an empty list (keyword) or an empty
        dict (author).

    Raises:
        AttributeError: if the page has no <title> element.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as err:
        # Only network/HTTP-layer failures are handled here; anything else
        # (including KeyboardInterrupt) is allowed to propagate.
        print("Error : ", type(err))
        return {
            "url": url,
            "abstract": "ERREUR_REQUEST",
            "author": "ERREUR_REQUEST",
            "keyword": "ERREUR_REQUEST",
            "title": "ERREUR_REQUEST",
            "date": "ERREUR_REQUEST",
        }
    soup = BeautifulSoup(response.text, "html.parser")

    try:
        abstract = (
            soup.find("section", {"id": "Abs1"}).find("p", {"class": "Para"}).text
        )
    except AttributeError:
        abstract = "no_data"

    affiliation_block = soup.find("div", {"class": "authors-affiliations"})

    # The <ul> items alternate: author entry, then its affiliation reference.
    try:
        author_items = [
            item.text for item in affiliation_block.find("ul").find_all("li")
        ]
    except AttributeError:
        author_items = []

    # The last character of each author entry is dropped, as in the original
    # code (presumably a trailing separator - TODO confirm against a page).
    authors = [
        (name[:-1], reference)
        for name, reference in zip(author_items[::2], author_items[1::2])
    ]

    # The <ol> items read "<identifier>.<affiliation text>".
    try:
        affiliation_items = [
            item.text for item in affiliation_block.find("ol").find_all("li")
        ]
    except AttributeError:
        affiliation_items = []

    affiliations = []
    for entry in affiliation_items:
        identifier, _, text = entry.partition(".")
        affiliations.append((identifier, text))

    # Each matching affiliation is added exactly once; the previous
    # "lst += lst + [x]" form re-appended the existing entries on every match.
    author_final = {}
    for name, reference in authors:
        author_final[name] = [
            text for identifier, text in affiliations if identifier == reference
        ]

    try:
        keyword = [
            item.text
            for item in soup.find("div", {"class": "KeywordGroup"}).find_all(
                "span", {"class": "Keyword"}
            )
        ]
    except AttributeError:
        keyword = []

    title = soup.title.text.replace(" | SpringerLink", "")

    # Preferred source: the citation meta tag ("Y/M/D"), rewritten as
    # "YYYY-MM-DD" by zero-padding one-character parts.
    meta_date = soup.find("meta", {"name": "citation_publication_date"})
    raw_date = meta_date.get("content") if meta_date is not None else None

    date = ""
    if raw_date is not None:
        date = "-".join(
            part if len(part) > 1 else "0" + part for part in raw_date.split("/")
        )

    if len(date) != 10:
        # Incomplete or missing meta date: use the page's <time> element,
        # then the raw meta value, then "no_data".
        try:
            date = soup.find("div", {"class": "article-dates"}).find("time")["datetime"]
        except (AttributeError, TypeError, KeyError):
            # A missing tag yields None: calling .find on it raises
            # AttributeError, subscripting it raises TypeError; a <time>
            # without the attribute raises KeyError.
            date = raw_date if raw_date is not None else "no_data"

    return {
        "url": url,
        "abstract": abstract,
        "author": author_final,
        "keyword": keyword,
        "title": title,
        "date": date,
    }


##############################################################################
# Full scraping function for Reference Work Entry pages
def workEntry_url_to_data(url, timeout: float = 30.0) -> dict:
    """Scrape a SpringerLink reference work entry page and return its metadata.

    Parameters:
        url: address of the reference work entry page to scrape.
        timeout: seconds to wait for the server before giving up, so a
            stalled connection cannot block the whole collection.

    Out:
        dict with the keys "url", "abstract", "author", "keyword", "title"
        and "date". "author" maps each author name to the list of
        affiliation texts whose identifier matches the author's reference.
        If the HTTP request fails, every field except "url" is the string
        "ERREUR_REQUEST". Fields that cannot be found in the page fall back
        to "no_data" (abstract, date), an empty list (keyword) or an empty
        dict (author).

    Raises:
        AttributeError: if the page has no <title> element.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as err:
        # Only network/HTTP-layer failures are handled here; anything else
        # (including KeyboardInterrupt) is allowed to propagate.
        print("Error : ", type(err))
        return {
            "url": url,
            "abstract": "ERREUR_REQUEST",
            "author": "ERREUR_REQUEST",
            "keyword": "ERREUR_REQUEST",
            "title": "ERREUR_REQUEST",
            "date": "ERREUR_REQUEST",
        }

    soup = BeautifulSoup(response.text, "html.parser")

    # The abstract is taken from the page's description meta tag.
    description = soup.find("meta", {"name": "description"})
    if description is not None:
        abstract = description.get("content", "no_data")
    else:
        abstract = "no_data"

    affiliation_block = soup.find("div", {"class": "authors-affiliations"})

    # The <ul> items alternate: author entry, then its affiliation reference.
    try:
        author_items = [
            item.text for item in affiliation_block.find("ul").find_all("li")
        ]
    except AttributeError:
        author_items = []

    # The last character of each author entry is dropped, as in the original
    # code (presumably a trailing separator - TODO confirm against a page).
    authors = [
        (name[:-1], reference)
        for name, reference in zip(author_items[::2], author_items[1::2])
    ]

    # The <ol> items read "<identifier>.<affiliation text>".
    try:
        affiliation_items = [
            item.text for item in affiliation_block.find("ol").find_all("li")
        ]
    except AttributeError:
        affiliation_items = []

    affiliations = []
    for entry in affiliation_items:
        identifier, _, text = entry.partition(".")
        affiliations.append((identifier, text))

    # Each matching affiliation is added exactly once; the previous
    # "lst += lst + [x]" form re-appended the existing entries on every match.
    author_final = {}
    for name, reference in authors:
        author_final[name] = [
            text for identifier, text in affiliations if identifier == reference
        ]

    try:
        keyword = [
            item.text
            for item in soup.find(
                "div", {"class": "KeywordGroup", "lang": "en"}
            ).find_all("span", {"class": "Keyword"})
        ]
    except AttributeError:
        keyword = []

    title = soup.title.text.replace(" | SpringerLink", "")

    # Date sources, in order: the page's <time> element, the citation meta
    # tag (kept verbatim), then "no_data".
    try:
        date = soup.find("div", {"class": "article-dates"}).find("time")["datetime"]
    except (AttributeError, TypeError, KeyError):
        # A missing tag yields None: calling .find on it raises
        # AttributeError, subscripting it raises TypeError; a <time> without
        # the attribute raises KeyError.
        meta_date = soup.find("meta", {"name": "citation_publication_date"})
        if meta_date is not None:
            date = meta_date.get("content", "no_data")
        else:
            date = "no_data"

    return {
        "url": url,
        "abstract": abstract,
        "author": author_final,
        "keyword": keyword,
        "title": title,
        "date": date,
    }


#### Collecte et sauvegarde des données

In [None]:
def urls_to_type(url: str) -> str:
    """Return the content-type segment of a SpringerLink URL.

    Parameters:
        url: absolute URL of a SpringerLink page.
    Out:
        the fourth "/"-separated piece of the URL, i.e. the first path
        segment after the host ("article", "chapter", ...).
    """
    # For "https://host/<type>/...", pieces 0-2 are "https:", "" and the host.
    pieces = url.split("/")
    content_type = pieces[3]
    return content_type


def collect(urls: list) -> None:
    """Scrape every URL and save the collected records to a JSON file.

    Each URL is dispatched to the scraper matching its type (as returned by
    urls_to_type). URLs of any other type are skipped, so their index is
    absent from the output. When a scraper raises, the failure is printed
    and a record whose fields are all "ERROR" is stored instead.

    Parameters:
        urls: SpringerLink URLs to scrape; any iterable of strings is
            accepted and is materialised into a list.
    Out:
        None; writes "data.json" in the current working directory, mapping
        each URL's position in `urls` to its record.
    """
    # The progress bar needs the total, so a sized sequence is required.
    urls = list(urls)

    # URL type -> (label printed on failure, scraping function).
    scrapers = {
        "referenceworkentry": ("ERROR-WE :", workEntry_url_to_data),
        "article": ("ERROR-ART :", article_url_to_data),
        "chapter": ("ERROR-CHAP :", chapter_url_to_data),
    }

    data = {}

    for index, url in tqdm(enumerate(urls), total=len(urls)):
        entry = scrapers.get(urls_to_type(url))
        if entry is None:
            # Unsupported page type: no record is stored for this index.
            continue

        label, scrape = entry
        try:
            data[index] = scrape(url)
        except Exception as err:
            # "except Exception" (not a bare except) keeps the best-effort
            # behaviour while still letting KeyboardInterrupt stop the run.
            print(label, type(err))
            data[index] = {
                "url": url,
                "abstract": "ERROR",
                "author": "ERROR",
                "keyword": "ERROR",
                "title": "ERROR",
                "date": "ERROR",
            }

    # Built with os.path.join: identical to the old ".\data.json" on Windows,
    # and a real "./data.json" on other systems instead of a file whose name
    # contains a backslash.
    with open(path.join(".", "data.json"), "w") as f:
        json.dump(data, f)
