Created on Thursday 07 January 2021

**Group 5 - Classification**  
**Extraction features syntaxe**

@authors : J.J. and F.B

This notebook extracts syntactic features from scraped articles.

# Import

In [None]:
import re
import os
import nltk
import tqdm
import string
import warnings
import numpy as np
import pandas as pd

from string import punctuation
from textblob import TextBlob
from urllib.parse import urlparse
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# NLTK resources used below: 'punkt' by sent_tokenize and 'vader_lexicon'
# by SentimentIntensityAnalyzer
nltk.download('punkt')
nltk.download('vader_lexicon')
# Silences every Python warning for the rest of the session
warnings.filterwarnings('ignore')
# Shared VADER analyzer, read as a global by the sentiment feature functions
analyzer = SentimentIntensityAnalyzer()

# The input folder lives on Google Drive, so the Drive must be mounted before
# changing directory; without this, running the notebook top to bottom on a
# fresh kernel fails on the chdir below.
from google.colab import drive
drive.mount('/content/drive')
os.chdir("/content/drive/My Drive/G5 Inter-Promo 2021/Données/Input")



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


# Mounting Google Drive in the notebook

In [None]:
# Mount Google Drive at /content/drive: the data and resource files read by
# this notebook are addressed through paths under that folder
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Import data

In [None]:
# Load the articles from Data.json (relative to the current working directory)
# and replace every missing value with a single space
data: pd.DataFrame = pd.read_json("Data.json").fillna(' ')

In [None]:
# Load the innovation lexicon: a header-less text file read with a two-space
# separator, whose single column is then named 'mots_cle'
df_lexique: pd.DataFrame = pd.read_csv(
    "/content/drive/My Drive/G5 Inter-Promo 2021/Ressources/Lexique_Innovation.txt",
    sep="  ",
    header=None,
)
df_lexique.columns = ['mots_cle']

# Features extraction

In [None]:
# Translation table turning every ASCII punctuation character into a space,
# built once instead of running one str.replace per character on each call
_PUNCT_TO_SPACE = str.maketrans(punctuation, " " * len(punctuation))


# Counts the number of words
def nb_word(text: str) -> int:
    """Count the words of a text, treating ASCII punctuation as a separator.

    Parameters:
        text: Text of the article

    Out:
        Number of whitespace-separated tokens left once every character of
        string.punctuation has been replaced by a space. Returns 0 when
        text is not a string (e.g. None).
    """
    if not isinstance(text, str):
        return 0

    # Punctuation becomes a space, so "state-of-the-art" counts as 4 words
    # and a token made only of punctuation counts as none
    return len(text.translate(_PUNCT_TO_SPACE).split())

In [None]:
# Count the number of time where the words in the list appear
def count_key_words(text: str, key_words=None) -> int:
    """Count the tokens of a text that belong to a keyword list.

    The text is lowercased and split on whitespace; a token counts only when
    it is exactly equal to one of the keywords. Keywords are used as given
    (they are not lowercased) and punctuation is not stripped from tokens.

    Parameters:
        text: Text of the article
        key_words: Iterable of keywords to look for. When omitted, the
            module-level list_mot_cle is used.

    Out:
        Number of tokens found in the keyword list (repeated tokens are
        counted each time); 0 when text is not a string (e.g. None).
    """
    if not isinstance(text, str):
        return 0

    if key_words is None:
        # Resolved at call time, so list_mot_cle only needs to exist before
        # the first call, not before this definition
        key_words = list_mot_cle

    # A set gives constant-time membership tests instead of scanning the
    # whole keyword list for every token
    lookup = set(key_words)
    return sum(1 for token in text.lower().split() if token in lookup)

In [None]:
# Count the number of time where a word appear
def Word_Apparition(tup: tuple) -> int:
    """Count the occurrences of a word (or character) in a text.

    Parameters:
        tup: Pair (text, word): the text of the article and the string to
            look for. It must be a real pair: a plain string would be
            unpacked into its first two characters.

    Out:
        Number of non-overlapping occurrences of word in text. Returns 0
        when tup cannot be unpacked into two items, when text is None, or
        when word is empty or not a string.
    """
    try:
        comm, word = tup[0], tup[1]
    except (TypeError, IndexError, KeyError):
        # tup is not indexable or holds fewer than two items
        return 0

    if comm is None or not isinstance(word, str) or not word:
        return 0

    if not isinstance(comm, str):
        comm = str(comm)

    # str.count also handles words longer than one character, which a
    # character-by-character comparison can never match
    return comm.count(word)

In [None]:
# Count the number of sentence
def phrases(text: str) -> int:
    """Count the sentences of a text.

    Parameters:
        text: Text of the article. Non-string values other than None are
            converted with str().

    Out:
        Number of sentences returned by sent_tokenize once every run of
        '.', '!' and '?' has been collapsed into a single '.'; 0 when text
        is None.
    """
    if text is None:
        return 0

    if not isinstance(text, str):
        text = str(text)

    # Collapse "...", "!!!", "?!" and the like into one full stop in a single
    # pass. Chained str.replace calls cannot do this: once "!" has become ".",
    # a later replace of "!!" has nothing left to match.
    text = re.sub(r"[.!?]+", ".", text)

    return len(sent_tokenize(text))

In [None]:
def sentiment_analisys_positive(text: str) -> float:
    """Return the positive VADER score of a text.

    Parameters:
        text: An article
    Out:
        The 'pos' entry of the scores computed by the module-level
        SentimentIntensityAnalyzer (analyzer) for the text
    """
    return analyzer.polarity_scores(text)['pos']


def sentiment_analisys_negative(text: str) -> float:
    """Return the negative VADER score of a text.

    Parameters:
        text: An article
    Out:
        The 'neg' entry of the scores computed by the module-level
        SentimentIntensityAnalyzer (analyzer) for the text
    """
    return analyzer.polarity_scores(text)['neg']


def get_polarity(text: str) -> float:
    """Return the TextBlob polarity score of a text.

    Parameters:
        text: An article
    Out:
        The polarity component of TextBlob(text).sentiment
    """
    return TextBlob(text).sentiment.polarity


def get_sentiment_sujectivity(text: str) -> float:
    """Return the TextBlob subjectivity score of a text.

    Parameters:
        text: An article
    Out:
        The subjectivity component of TextBlob(text).sentiment
    """
    return TextBlob(text).sentiment.subjectivity

In [None]:
# Keywords of the lexicon as a plain Python list; read as a global by count_key_words
list_mot_cle: list = df_lexique["mots_cle"].tolist()

def _safe_ratio(numerator: pd.Series, denominator: pd.Series) -> pd.Series:
    """Divide element-wise, returning 0 wherever the division is undefined.

    A zero denominator yields NaN (0 / 0) or +/-inf (x / 0); both are mapped
    to 0 so that every ratio feature stays finite.
    """
    ratio = numerator / denominator
    return ratio.replace([np.inf, -np.inf], np.nan).fillna(0)


def compute_features(data: pd.DataFrame) -> pd.DataFrame:
    """Add syntax, URL and sentiment features to the articles.

    Parameters:
        data: A dataframe with the initial content. The columns
            "art_content", "art_title" and "art_url" are read.
    Out:
        data: The same dataframe object, modified in place, with the new
            feature columns added
    """

    # Compute features on the content
    data["Nb_key_words"] = data["art_content"].apply(count_key_words)
    data["Nb_key_words_title"] = data["art_title"].apply(count_key_words)
    data["Nb_words"] = data["art_content"].apply(nb_word)
    data["Nb_words_title"] = data["art_title"].apply(nb_word)
    data["Nb_sentences"] = data["art_content"].apply(phrases)

    # Empty articles give zero denominators: every ratio goes through
    # _safe_ratio so that none of them ends up NaN or infinite
    data["average_word_sentence"] = _safe_ratio(data["Nb_words"], data["Nb_sentences"])
    data["ratio_word_title_on_word"] = _safe_ratio(data["Nb_words_title"], data["Nb_words"])

    # Word_Apparition expects a (text, character) pair; handing it the bare
    # text makes it compare the first two characters of the article instead
    # of counting the punctuation mark
    data["exclamation"] = data["art_content"].apply(
        lambda text: Word_Apparition((text, "!")))
    data["interrogation"] = data["art_content"].apply(
        lambda text: Word_Apparition((text, "?")))

    data["ratio_key_words"] = _safe_ratio(data["Nb_key_words"], data["Nb_words"])
    data["ratio_key_sentences"] = _safe_ratio(data["Nb_key_words"], data["Nb_sentences"])
    data["ratio_key_word_title"] = _safe_ratio(
        data["Nb_key_words_title"], data["Nb_words_title"])

    # Compute features on URL
    data["netloc"] = data["art_url"].apply(lambda url: urlparse(url).netloc)
    # Every ".xxx" lowercase segment of the host name, kept as a list
    data["netloc.com"] = data["netloc"].apply(
        lambda host: re.findall(r"\.[a-z]+", host))
    data["nb_netloc.com"] = data["netloc.com"].apply(len)
    data["path"] = data["art_url"].apply(lambda url: urlparse(url).path)
    # Number of tokens in the URL path once "/", "-" and "_" are treated as spaces
    data["nb_word_path"] = data["path"].apply(lambda path: len(
        path.replace("/", " ").replace("-", " ").replace("_", " ").split()))

    # NOTE(review): the numeric "nb_netloc.com" is dropped while the list
    # column "netloc.com" is kept - confirm this is the intended pair.
    data.drop(["path", "netloc", "nb_netloc.com"], axis=1, inplace=True)

    # Compute features on sentiment analisys
    # Each text is scored once per library and both values are read from that
    # single result, instead of running VADER and TextBlob twice per text
    vader_content = [analyzer.polarity_scores(text) for text in data["art_content"]]
    vader_title = [analyzer.polarity_scores(text) for text in data["art_title"]]

    data["content_postive_score"] = [scores["pos"] for scores in vader_content]
    data["title_postive_score"] = [scores["pos"] for scores in vader_title]

    data["content_negative_score"] = [scores["neg"] for scores in vader_content]
    data["title_negative_score"] = [scores["neg"] for scores in vader_title]

    blob_content = [TextBlob(text).sentiment for text in data["art_content"]]
    blob_title = [TextBlob(text).sentiment for text in data["art_title"]]

    data["content_polarity_score"] = [sentiment.polarity for sentiment in blob_content]
    data["title_polarity_score"] = [sentiment.polarity for sentiment in blob_title]

    data["content_subjectivity_score"] = [
        sentiment.subjectivity for sentiment in blob_content]
    data["title_subjectivity_score"] = [
        sentiment.subjectivity for sentiment in blob_title]

    return data

# Computing the features with the function defined above

In [None]:
%%time
# Run the feature extraction on the loaded articles and time the whole cell;
# the recorded output of this cell shows a wall time of several minutes
data: pd.DataFrame  = compute_features(data)

CPU times: user 9min 11s, sys: 2.41 s, total: 9min 14s
Wall time: 9min 17s


In [None]:
# Preview the first rows of the dataframe returned by compute_features
data.head()

In [None]:
#data.query("art_lang == 'fr'").to_csv(
#    "/content/drive/MyDrive/G5 Inter-Promo 2021/Données/Input/Data_With_Features_Syntax.csv", index=False)