Created on Monday 4 January 2021

**Group 5 - Classification**  
**Extraction features syntaxe v0**

@authors : Jeremy Johann

This notebook extracts syntactic features from scraped articles.

# Import

In [None]:
import re
import nltk
import tqdm
import string
import pandas as pd

from textblob import TextBlob
from urllib.parse import urlparse
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Download the NLTK resources used in this notebook: the 'punkt' tokenizer
# models and the VADER sentiment lexicon.
nltk.download('punkt')
nltk.download('vader_lexicon')
# Shared VADER analyzer, read as a global by the sentiment helper functions below.
analyzer = SentimentIntensityAnalyzer()



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


# Mounting Google Drive in the notebook

In [None]:
# Colab-only step: mount Google Drive at /content/drive so that the project
# files referenced by the paths below can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Import data

In [None]:
# Load the articles from the JSON file stored on the mounted Drive.
data: pd.DataFrame = pd.read_json(
    "/content/drive/MyDrive/G5 Inter-Promo 2021/Données/Input/Data.json")

In [None]:
# Load the innovation lexicon as a single-column DataFrame named 'mots_cle'.
# The two-space separator is a multi-character separator, which pandas only
# supports with the Python parsing engine; requesting that engine explicitly
# avoids the ParserWarning raised when pandas has to fall back to it.
# NOTE(review): presumably the file holds one keyword (or expression) per line
# and the unusual separator is there to keep each line in one column — confirm.
df_lexique: pd.DataFrame = pd.read_csv(
    "/content/drive/MyDrive/G5 Inter-Promo 2021/Ressources/Lexique_Innovation.txt",
    sep="  ", header=None, engine="python")
df_lexique.columns = ['mots_cle']

  


# Feature extraction

In [None]:
# Counts the number of words of each text
def nb_word(text: list) -> list:
    """Count the words of every text.

    The characters , . ! ? / are treated as word separators in addition to
    whitespace, so "a,b" counts as two words and a lone "?" counts as none.

    Parameters:
        text: iterable of texts (e.g. a list or a pandas Series of strings)

    Out:
        nb_words: list holding the number of words of each text, in input
        order. Entries that are not strings (None, NaN) count as 0 words.
    """
    # str.replace returns a new string, so the punctuation must be replaced on
    # the value that is split afterwards; a translation table does all the
    # replacements in a single pass.
    separators = str.maketrans({char: ' ' for char in ',.!?/'})
    nb_words: list = []
    # browse through the different texts
    for content in text:
        if not isinstance(content, str):
            # missing article or title: nothing to count
            nb_words.append(0)
            continue
        # split the cleaned text on whitespace and count the resulting words
        nb_words.append(len(content.translate(separators).split()))
    return nb_words


# Sanity check: print the word count computed for every article content.
print(nb_word(data['art_content']))

[0, 40, 157, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1340, 0, 124, 182, 0, 215, 148, 229, 0, 222, 259, 337, 0, 182, 1081, 201, 0, 0, 178, 0, 0, 0, 0, 0, 0, 164, 868, 150, 183, 432, 154, 204, 373, 266, 148, 0, 0, 65, 0, 79, 211, 0, 0, 0, 0, 54, 167, 0, 0, 463, 426, 418, 917, 419, 365, 350, 168, 181, 0, 502, 328, 263, 582, 386, 551, 515, 595, 340, 604, 454, 454, 1070, 494, 191, 849, 609, 357, 344, 615, 483, 772, 459, 360, 750, 504, 322, 747, 264, 454, 460, 707, 472, 411, 297, 389, 213, 126, 186, 637, 473, 359, 232, 84, 389, 914, 552, 416, 320, 420, 361, 581, 334, 395, 826, 181, 91, 927, 196, 371, 502, 131, 393, 519, 125, 158, 107, 278, 454, 482, 757, 83, 185, 160, 117, 630, 511, 111, 224, 98, 756, 346, 426, 227, 262, 404, 508, 510, 552, 260, 331, 62, 795, 742, 773, 603, 680, 327, 177, 478, 661, 371, 434, 1504, 41, 780, 171, 220, 489, 98, 50, 271, 151, 551, 405, 337, 487, 141, 162, 98, 170, 349, 102, 305, 126, 302, 237, 436, 557, 320, 287, 708, 243, 132, 516, 247

In [None]:
# Count how many times the words of the list appear in each text
def count_key_words(data: pd.Series, l: list) -> list:
    """Count the keyword occurrences in every text.

    Texts are lowercased and split on whitespace; each resulting token that is
    equal to one of the keywords adds one to the count of its text. Keywords
    are compared case-insensitively and without surrounding whitespace.
    Because the comparison is done token by token, a keyword that itself
    contains whitespace can never match.

    Parameters:
        data: iterable of texts (list or pandas Series), one per article
        l: iterable of keywords to look for

    Out:
        res: list where each value is the number of keyword occurrences in the
        corresponding text. Texts that are not strings (None, NaN) yield 0.
    """
    # Build the lookup set once: the texts are lowercased below, so the
    # keywords must be lowercased too, and a set gives constant-time lookups
    # instead of rebuilding and scanning a list for every token.
    keywords = {word.strip().lower() for word in l if isinstance(word, str)}
    res: list = []
    # Iterate over the values themselves so the result does not depend on the
    # index labels of a pandas Series.
    for sentence in data:
        if not isinstance(sentence, str):
            res.append(0)
            continue
        res.append(sum(token in keywords for token in sentence.lower().split()))
    return res

In [None]:
# Count the number of times a word (or character) appears
def Word_Apparition(data: pd.DataFrame, word: str, column: str) -> list:
    """Count the occurrences of `word` in every value of a column.

    Parameters:
      data: Dataframe with all the data
      word: The character or substring to count (non-overlapping occurrences)
      column: The column of the dataframe that is scanned

    Out (if exists):
        l: List where each value is the number of times `word` appears in the
        corresponding row. Missing values (None, NaN) and an empty `word`
        give 0. Other non-string values are converted with str() first.
    """
    l: list = []
    # Iterate over the values so the result does not depend on index labels.
    for comm in data[column]:
        # Missing values must be detected before any str() conversion,
        # otherwise they would be scanned as the texts "None" / "nan".
        if comm is None or (isinstance(comm, float) and comm != comm):
            l.append(0)
            continue
        if not isinstance(comm, str):
            comm = str(comm)
        # str.count("") counts the gaps between characters, so an empty
        # search word is explicitly mapped to 0.
        l.append(comm.count(word) if word else 0)
    return l

In [None]:
# Count the number of sentences
def phrases(data: pd.DataFrame, col: str) -> list:
    """Count the sentences of every value of a column.

    Runs of sentence-ending punctuation ('.', '!', '?', in any mix such as
    "...", "!!!" or "?!") are collapsed into a single '.' before the text is
    handed to sent_tokenize.

    Parameters:
        data: Dataframe with all the data
        col: The column of the dataframe that is scanned

    Out (if exists):
        l: List where each value is the number of sentences in the
        corresponding row. Missing values (None, NaN) give 0. Other
        non-string values are converted with str() first.
    """
    l: list = []
    # Iterate over the values so the result does not depend on index labels.
    for sentences in data[col]:
        # Missing values must be detected before any str() conversion,
        # otherwise they would be tokenized as the texts "None" / "nan".
        if sentences is None or (isinstance(sentences, float) and sentences != sentences):
            l.append(0)
            continue
        if not isinstance(sentences, str):
            sentences = str(sentences)
        # One regex pass replaces the chain of str.replace calls, whose order
        # left the multi-character patterns unreachable.
        sentences = re.sub(r"[.!?]+", ".", sentences)
        l.append(len(sent_tokenize(sentences)))
    return l

In [None]:
def sentiment_analisys_positive(text: str) -> float:
    """Return the positive sentiment score of a text.

    Parameters:
        text: An article
    Out (if exists):
        The 'pos' entry of the scores computed by the module-level `analyzer`,
        or 0.0 when `text` is not a string (e.g. None or NaN for a missing
        article).
    """
    # NOTE(review): the rows previewed in this notebook have art_lang 'fr';
    # confirm that the lexicon behind `analyzer` is suitable for French text.
    if not isinstance(text, str):
        return 0.0
    return analyzer.polarity_scores(text)['pos']


def sentiment_analisys_negative(text: str) -> float:
    """Return the negative sentiment score of a text.

    Parameters:
        text: An article
    Out (if exists):
        The 'neg' entry of the scores computed by the module-level `analyzer`,
        or 0.0 when `text` is not a string (e.g. None or NaN for a missing
        article).
    """
    # NOTE(review): the rows previewed in this notebook have art_lang 'fr';
    # confirm that the lexicon behind `analyzer` is suitable for French text.
    if not isinstance(text, str):
        return 0.0
    return analyzer.polarity_scores(text)['neg']


def get_polarity(text: str) -> float:
    """Return the TextBlob polarity score of a text.

    Parameters:
        text: An article
    Out (if exists):
        TextBlob(text).sentiment.polarity, or 0.0 when `text` is not a string
        (e.g. None or NaN for a missing article).
    """
    # NOTE(review): the rows previewed in this notebook have art_lang 'fr';
    # confirm that the default TextBlob analyzer is suitable for French text.
    if not isinstance(text, str):
        return 0.0
    return TextBlob(text).sentiment.polarity


def get_sentiment_sujectivity(text: str) -> float:
    """Return the TextBlob subjectivity score of a text.

    Parameters:
        text: An article
    Out (if exists):
        TextBlob(text).sentiment.subjectivity, or 0.0 when `text` is not a
        string (e.g. None or NaN for a missing article).
    """
    # NOTE(review): the rows previewed in this notebook have art_lang 'fr';
    # confirm that the default TextBlob analyzer is suitable for French text.
    if not isinstance(text, str):
        return 0.0
    return TextBlob(text).sentiment.subjectivity

In [None]:
def _safe_ratio(numerator: pd.Series, denominator: pd.Series) -> pd.Series:
    """Divide two Series element-wise, giving 0 wherever the denominator is 0."""
    # Masking the zero denominators turns the division result into NaN there
    # (instead of inf or NaN depending on the numerator), which is then zeroed.
    return numerator.div(denominator.where(denominator != 0)).fillna(0)


def compute_features(data: pd.DataFrame) -> pd.DataFrame:
    """Add the syntax, URL and sentiment features to the articles.

    Reads the columns 'art_content', 'art_title' and 'art_url', and uses the
    module-level `df_lexique` frame (column 'mots_cle') as the keyword list.

    Parameters:
        data: A dataframe with the initial content
    Out (if exists):
        data: A copy of the input dataframe with the new feature columns
        appended; the input dataframe itself is left untouched. Every ratio
        is 0 when its denominator is 0.
    """
    # Work on a copy so that calling the function does not alter the caller's
    # frame and re-running the cell always starts from the same input.
    data = data.copy()
    keywords = df_lexique["mots_cle"]

    # Compute features on the content
    data["Nb_key_words"] = count_key_words(data["art_content"], keywords)
    data["Nb_key_words_title"] = count_key_words(data["art_title"], keywords)
    data["Nb_words"] = nb_word(data['art_content'])
    data["Nb_words_title"] = nb_word(data['art_title'])
    data["Nb_sentences"] = phrases(data, 'art_content')
    data["average_word_sentence"] = _safe_ratio(
        data["Nb_words"], data["Nb_sentences"])
    data["ratio_word_title_on_word"] = _safe_ratio(
        data["Nb_words_title"], data["Nb_words"])
    data["exclamation"] = Word_Apparition(data, '!', 'art_content')
    data["interrogation"] = Word_Apparition(data, '?', 'art_content')
    data['ratio_key_words'] = _safe_ratio(
        data['Nb_key_words'], data['Nb_words'])
    data['ratio_key_sentences'] = _safe_ratio(
        data['Nb_key_words'], data['Nb_sentences'])
    data['ratio_key_word_title'] = _safe_ratio(
        data['Nb_key_words_title'], data['Nb_words_title'])

    # Compute features on URL. The network location and the path are only
    # intermediate values, so they are kept in local variables rather than
    # being added to the frame and dropped afterwards.
    netloc = data["art_url"].apply(lambda url: urlparse(url).netloc)
    path = data["art_url"].apply(lambda url: urlparse(url).path)
    # Every ".xxx" lowercase suffix found in the network location, as a list
    data["netloc.com"] = netloc.apply(
        lambda loc: re.findall(r"\.[a-z]+", loc))
    # Number of words in the path once '/', '-' and '_' are treated as spaces
    data["nb_word_path"] = path.apply(lambda p: len(
        p.replace("/", " ").replace("-", " ").replace("_", " ").split()))

    # Compute features on sentiment analysis
    data["content_postive_score"] = data['art_content'].apply(
        sentiment_analisys_positive)
    data["title_postive_score"] = data['art_title'].apply(
        sentiment_analisys_positive)

    data["content_negative_score"] = data['art_content'].apply(
        sentiment_analisys_negative)
    data["title_negative_score"] = data['art_title'].apply(
        sentiment_analisys_negative)

    data["content_polarity_score"] = data['art_content'].apply(get_polarity)
    data["title_polarity_score"] = data['art_title'].apply(get_polarity)

    data["content_subjectivity_score"] = data['art_content'].apply(
        get_sentiment_sujectivity)
    data["title_subjectivity_score"] = data['art_title'].apply(
        get_sentiment_sujectivity)

    return data

# Computing the features with `compute_features`

In [None]:
# Rebind `data` to the frame returned by compute_features (original columns
# plus the computed features).
data: pd.DataFrame  = compute_features(data)

In [None]:
# Preview the first rows to check the new feature columns.
data.head()

Unnamed: 0,art_id,art_content,art_content_html,art_extract_datetime,art_lang,art_title,art_url,src_name,src_type,src_url,src_img,art_auth,art_tag,Nb_key_words,Nb_key_words_title,Nb_words,Nb_words_title,Nb_sentences,average_word_sentence,ratio_word_title_on_word,exclamation,interrogation,ratio_key_words,ratio_key_sentences,ratio_key_word_title,netloc.com,nb_word_path,content_postive_score,title_postive_score,content_negative_score,title_negative_score,content_polarity_score,title_polarity_score,content_subjectivity_score,title_subjectivity_score
0,0,,,7 décembre 2018,fr,4ème Conférence Nationale de l’Emploi Territor...,http://fncdg.com/4eme-conference-nationale-de-...,FNCDG,xpath_source,http://fncdg.com/actualites/,http://fncdg.com/wp-content/uploads/2018/12/ra...,,,0,0,0,8,0,,inf,0,0,0.0,0.0,0.0,[.com],11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,La FNCDG et l’ANDCDG ont publié en septembre l...,"<p style=""text-align: justify;"">La FNCDG et l’...",22 septembre 2020,fr,9ème édition du Panorama de l’emploi territorial,http://fncdg.com/9eme-edition-du-panorama-de-l...,FNCDG,xpath_source,http://fncdg.com/actualites/,http://fncdg.com/wp-content/uploads/2020/09/im...,,,0,0,40,7,3,13.333333,0.175,0,0,0.0,0.0,0.0,[.com],7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,Malgré la levée des mesures de confinement le ...,"<p style=""text-align: justify;"">Malgré la levé...",17 mars 2020,fr,ACTUALITÉS FNCDG / COVID19,http://fncdg.com/actualites-covid19/,FNCDG,xpath_source,http://fncdg.com/actualites/,http://fncdg.com/wp-content/uploads/2020/03/co...,,,2,0,157,4,4,39.25,0.025478,0,0,0.012739,0.5,0.0,[.com],2,0.03,0.0,0.0,0.0,0.4,0.0,1.0,0.0
3,3,,,28 juin 2017,fr,Arrêté de création GIP,http://fncdg.com/arrete-de-creation-gip/,FNCDG,xpath_source,http://fncdg.com/actualites/,http://fncdg.com/wp-content/uploads/2017/05/fo...,,,0,0,0,4,0,,inf,0,0,0.0,0.0,0.0,[.com],4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,,,24 novembre 2017,fr,Article Acteurs Publics « Les centres de gesti...,http://fncdg.com/article-acteurs-publics-les-c...,FNCDG,xpath_source,http://fncdg.com/actualites/,,,,0,0,0,16,0,,inf,0,0,0.0,0.0,0.0,[.com],14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Save the articles and their features as CSV on the mounted Drive, without
# writing the index as a column.
data.to_csv(
    "/content/drive/MyDrive/G5 Inter-Promo 2021/Données/Input/Data_With_Features_Syntax.csv", index=False)