Created on Thursday 14 January 2021  

**Group 3 - Representation**  
**The objective of this notebook is to compute, for each `art_content`, a word embedding weighted by IDF (Inverse Document Frequency), using the polyglot model.** 

@authors : Arthur CARLET, Guillaume BERNARD, Neima MARCO, Nesrine AIDER, Lou-Ann CHAUSSE, Fannie MATHEY

# Libraries

## Install :

In [None]:
# Install polyglot together with the packages installed before it in this cell
# (icu, pyicu, pycld2, morfessor), then download polyglot's French resources
# (embeddings2, pos2 and sgns2).

!pip install icu
!pip install pyicu
!pip install pycld2
!pip install morfessor
!pip install -U polyglot
!polyglot download embeddings2.fr
!polyglot download pos2.fr
!polyglot download sgns2.fr

# French lemmatizer (FrenchLefffLemmatizer), installed directly from its GitHub repository
!pip install git+https://github.com/ClaudeCoulombe/FrenchLefffLemmatizer.git

Collecting icu
[?25l  Downloading https://files.pythonhosted.org/packages/89/d8/0972fa39747faea092e8105103f261e01d6cefe262cbe036df8b0b8ada44/icu-0.0.1-py3-none-any.whl (49kB)
[K     |██████▋                         | 10kB 17.9MB/s eta 0:00:01[K     |█████████████▏                  | 20kB 15.8MB/s eta 0:00:01[K     |███████████████████▉            | 30kB 10.2MB/s eta 0:00:01[K     |██████████████████████████▍     | 40kB 8.6MB/s eta 0:00:01[K     |████████████████████████████████| 51kB 3.1MB/s 
[?25hInstalling collected packages: icu
Successfully installed icu-0.0.1
Collecting pyicu
[?25l  Downloading https://files.pythonhosted.org/packages/31/46/fa08c8efae2951e67681ec24319f789fc1a74e2096dd74373e34c79319de/PyICU-2.6.tar.gz (233kB)
[K     |████████████████████████████████| 235kB 6.6MB/s 
[?25hBuilding wheels for collected packages: pyicu
  Building wheel for pyicu (setup.py) ... [?25l[?25hdone
  Created wheel for pyicu: filename=PyICU-2.6-cp36-cp36m-linux_x86_64.whl size=

## Imports :

In [None]:
#imports

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from string import punctuation 
import pickle

# Polyglot : 
from icu import Locale
import polyglot
from polyglot.text import Text, Word

#nltk
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('words')
from nltk.corpus import words,wordnet,stopwords
from nltk.tokenize import word_tokenize

# Gensim
import gensim
from gensim.models import Word2Vec

#lemmatizer
from french_lefff_lemmatizer.french_lefff_lemmatizer import FrenchLefffLemmatizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


# Data Import

In [None]:
# Mount Google Drive at /content/drive; the data paths used below live under that mount point
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the articles JSON file from the Drive data folder into a DataFrame
DATA_PATH = '/content/drive/MyDrive/PIP 2021/Données//Deduplicated'
df = pd.read_json(f"{DATA_PATH}/df_concat_G1_G2_v0.json")

# Lemmatization

In [None]:
# Lemmatizer instance shared by lem_word
lemmatizer = FrenchLefffLemmatizer()

# NLTK's French stop-word list, used by preprocess to filter tokens
stop = stopwords.words('french')

In [None]:
def preprocess(sentence: str) -> list:
    """Lowercase, tokenize and filter a sentence.

    Parameters:
        sentence: text to preprocess. Non-string values are converted with ``str``;
            missing content (``None`` or a float NaN) yields an empty list.

    Out:
        list: the lowercased tokens that are not in the module-level ``stop`` list,
            are purely alphabetic and are longer than two characters
    """
    if not isinstance(sentence, str):
        # Missing content has no tokens. NaN is the only float that is unequal to itself.
        # Previously these values reached ``.lower()`` and raised AttributeError because
        # the ``str`` coercion was applied after the call instead of before it.
        if sentence is None or (isinstance(sentence, float) and sentence != sentence):
            return []
        sentence = str(sentence)

    tokens = nltk.word_tokenize(sentence.lower())
    return [token for token in tokens
            if token not in stop and token.isalpha() and len(token) > 2]

In [None]:
# Translation table from the POS tags read from polyglot's tagger (see lem_word)
# to the tag names passed to the French lemmatizer.
dictio = {
    'VERB': 'v',
    'ADJ': 'adj',
    'DET': 'det',
    'NOUN': 'nc',
    'AUX': 'v',
    'ADP': 'prep',
    'ADV': 'adv',
    'CONJ': 'coo',
    'INTJ': 'nc',
    'NUM': 'nc',
    'PART': 'nc',
    'PRON': 'cln',
    'PUNCT': 'poncts',
    'PROPN': 'np',
    'SCONJ': 'csu',
    'SYM': 'nc',
    'X': 'nc',
}


def lem_word(word: str) -> (str, bool, np.ndarray):
    """Lemmatize a single word and fetch the embedding of its lemma.

    Parameters:
        word: word to lemmatize

    Out:
        resu: lemma of the word (the input word itself if tagging/lemmatization failed)
        is_valid: True only when a non-empty lemma was obtained AND its embedding
                  could be read from polyglot
        vect: embedding of the lemma if is_valid is True, else a vector of 256 zeros

    References:
        1. https://spacy.io/universe/project/spacy-lefff
        2. https://github.com/ClaudeCoulombe/FrenchLefffLemmatizer
        3. https://polyglot.readthedocs.io/en/latest/
    """
    # default values, returned as-is when nothing can be computed
    resu = word
    vect = np.zeros(256)
    is_valid = False

    # polyglot Text object, used to read the POS tag of the word
    text = Text(word, hint_language_code='fr')

    # `except Exception` (not a bare except) keeps the best-effort behaviour for
    # unknown words while still letting KeyboardInterrupt stop a long run.
    try:
        pos_tag = text.pos_tags[0][1]
        tag = dictio[pos_tag]
        resu = lemmatizer.lemmatize(text, tag)  # the lemmatizer needs the POS tag

        # The lemmatizer output is handled in two shapes: a list of (lemma, tag)
        # couples, or a single value.
        if isinstance(resu, list):
            if len(resu) == 0:
                resu = ''
            else:
                resu = resu[0][0]
                is_valid = True
        elif resu != '':
            # This branch used to assign to a misspelled `isValid`, so every
            # non-list lemma was silently reported as invalid.
            is_valid = True

        if is_valid:
            vect = np.array(Word(resu, language='fr').vector)
        return resu, is_valid, vect
    except Exception:
        # Either tagging/lemmatization failed or the lemma has no embedding:
        # never report a word as valid together with the all-zero placeholder vector.
        return resu, False, np.zeros(256)

# Word Embedding 

In [None]:
word_to_lem = dict()  # word -> lemma lookup, filled by compute_word_embeddings


def compute_word_embeddings(df: pd.core.frame.DataFrame) -> pd.core.frame.DataFrame:
    """Tokenize, lemmatize and embed the content of every article.

    Side effect: fills the module-level ``word_to_lem`` dict, keeping for each
    word the first lemma obtained for it.

    Parameters:
        df: dataframe with the columns 'art_id' and 'art_content'

    Out:
        df_res: dataframe with, for each article, art_id, art_content, art_word (the words
                kept as valid), art_lem (their lemmas), art_vect (their embeddings) and
                art_lem_join (the lemmas joined by spaces, used later to compute the IDF)
    """
    kept_words_per_article = []
    lemmas_per_article = []
    vectors_per_article = []
    joined_lemmas_per_article = []

    for article_text in tqdm(df['art_content']):
        kept_words = []  # words reported as valid by lem_word
        lemmas = []      # lemma of each kept word
        vectors = []     # embedding of each kept word's lemma

        for token in preprocess(article_text):
            lemma, in_vocab, embedding = lem_word(token)
            if not in_vocab:
                continue
            kept_words.append(token)
            # only the first lemma seen for a given word is remembered
            word_to_lem.setdefault(token, lemma)
            lemmas.append(str(lemma))
            vectors.append(embedding)

        kept_words_per_article.append(kept_words)
        joined_lemmas_per_article.append(' '.join(lemmas))
        lemmas_per_article.append(lemmas)
        vectors_per_article.append(vectors)

    df_res = pd.DataFrame({
        'art_id': df['art_id'],
        'art_content': df['art_content'],
        'art_word': kept_words_per_article,
        'art_lem': lemmas_per_article,
        'art_vect': vectors_per_article,
        'art_lem_join': joined_lemmas_per_article,
    })
    return df_res

In [None]:
# Tokenize, lemmatize and embed every article of df (this also fills word_to_lem)
df_with_embeddings = compute_word_embeddings(df)

HBox(children=(FloatProgress(value=0.0, max=7544.0), HTML(value='')))




In [None]:
# Preview the first rows of the result
df_with_embeddings.head()

Unnamed: 0,art_id,art_content,art_word,art_lem,art_vect,art_lem_join
0,1,La FNCDG et l’ANDCDG ont publié en septembre l...,"[édition, panorama, emploi, territorial, cette...","[édition, panorama, emploi, territorial, ce, é...","[[3.3952472, -4.7512593, -0.87234586, 3.186644...",édition panorama emploi territorial ce édition...
1,2,Malgré la levée des mesures de confinement le ...,"[malgré, levée, mesures, confinement, mai, plu...","[malgré, levée, mesure, confinement, mai, plup...","[[-0.2212756, 0.57220876, -0.5409405, 0.656040...",malgré levée mesure confinement mai plupart me...
2,25,Quels étaient les objectifs poursuivis par le ...,"[objectifs, gouvernement, cadre, cette, réform...","[objectif, gouvernement, cadre, ce, réforme, f...","[[2.6337967, 1.5813266, 1.1626679, -0.25885695...",objectif gouvernement cadre ce réforme fonctio...
3,27,"La journée thématique, qui aura lieu durant le...","[journée, thématique, lieu, durant, salon, thè...","[journée, thématique, lieu, durant, salon, thè...","[[2.9263377, -0.33556777, -1.4783581, 2.622460...",journée thématique lieu durant salon thème ser...
4,28,La 1ère journée thématique en région sur le th...,"[journée, thématique, région, thème, vers, nou...","[journée, thématique, région, thème, vers, nou...","[[2.9263377, -0.33556777, -1.4783581, 2.622460...",journée thématique région thème vers nouveau m...


## Compute IDF for each lemmatized word

We want to penalize the embeddings of words that are too common. To do that, we weight each word embedding by the word's IDF value.

In [None]:
# Fit a TF-IDF model to obtain the IDF value of each lemma
vectorizer = TfidfVectorizer()

# The lemmatized text is used so that all inflected forms of a word share one IDF value
tfidf_matrix = vectorizer.fit_transform(df_with_embeddings['art_lem_join'])

# Dictionary containing the idf value of each term.
# Built from vocabulary_ (term -> column index) and idf_ (one value per column) rather than
# get_feature_names(), which was removed in scikit-learn 1.2; this form works on old and
# new versions alike. Sorting by column index keeps the same term order as before.
dic_weights = {
    term: vectorizer.idf_[column]
    for term, column in sorted(vectorizer.vocabulary_.items(), key=lambda item: item[1])
}

In [None]:
len(dic_weights.keys()) # number of distinct terms that received an IDF value (approximately 10 000 lemmas)

10755

In [None]:
def get_idf(word: str) -> float:
    """Return the IDF value of the lemma associated with a word.

    Parameters:
        word: the (non-lemmatized) word whose idf is wanted

    Out:
        float: the idf value of the word's lemma; 0.0 when the word is not a key of
               ``word_to_lem`` or when its lemma has no entry in ``dic_weights``
    """
    if word not in word_to_lem:
        return 0.0
    # word_to_lem maps a word to its lemma; it is filled by compute_word_embeddings
    word_lemmatized = word_to_lem[word]
    # .get() instead of indexing: a lemma that the vectorizer did not keep as a term
    # must give a zero weight rather than raise KeyError in the middle of a run.
    return dic_weights.get(word_lemmatized, 0.0)

In [None]:
def get_weight_vector(tokens: list) -> np.ndarray:
    """Build the array of IDF weights of a list of words.

    Parameters:
        tokens: list of words

    Out:
        np.ndarray: the value of get_idf for each word, in the same order
    """
    idf_values = []
    for token in tokens:
        idf_values.append(get_idf(token))
    return np.array(idf_values)

In [None]:
# Save the fitted TF-IDF vectorizer to Drive as a pickle file.
# NOTE(review): only load this file back from a trusted location, unpickling can execute code.
with open('/content/drive/MyDrive/PIP 2021/Pos Tagging/Guillaume/tfidf_lem.pickle', 'wb') as f1:
    pickle.dump(vectorizer, f1)

## Weighting the word embeddings

In [None]:
def compute_weighted_average_embeddings(row: pd.Series, embedding_dim: int = 256):
    """Compute the IDF-weighted average of the word embeddings of one article.

    Parameters:
        row: row from the dataframe; 'art_lem' holds the lemmas of the article and
            'art_vect' the embedding of each of these lemmas (same order)
        embedding_dim: length of the zero vector returned when the row holds no
            embedding to infer the dimension from (256 is the size of the zero
            vector used in lem_word)

    Out:
        np.ndarray: the weighted average embedding, or a zero vector of the embedding
            dimension when the article has no word or only zero weights

    References:
        https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

    """
    lem_words = row["art_lem"]
    embedded_sentence = row['art_vect']

    # 'art_lem' already holds lemmas, so their IDF is read directly from dic_weights.
    # Going through get_idf (which expects a non-lemmatized word) gave a weight of 0
    # to every lemma that never occurred as a surface form.
    weights = np.array([dic_weights.get(lemma, 0.0) for lemma in lem_words], dtype=float)

    if len(embedded_sentence) > 0:
        embedding_dim = len(embedded_sentence[0])

    if weights.size == 0 or not weights.any():
        # The fallback must have the embedding's dimension, not one entry per word,
        # so that all rows of the output column have the same length.
        return np.zeros(embedding_dim)

    return np.average(embedded_sentence, weights=weights, axis=0)

In [None]:
# One weighted-average embedding per article, computed row by row with a progress bar
df_with_embeddings['word_embedding'] = list(map(
    lambda position: compute_weighted_average_embeddings(df_with_embeddings.iloc[position]),
    tqdm(range(len(df_with_embeddings))),
))

HBox(children=(FloatProgress(value=0.0, max=7534.0), HTML(value='')))




In [None]:
# Keep only the article id and its embedding.
# `df_final_2` is never defined in this notebook (NameError on Restart & Run All);
# df_with_embeddings is the frame that holds the 'word_embedding' column.
output = df_with_embeddings[["art_id", "word_embedding"]]

In [None]:
# Preview the first rows of the exported columns
output.head()

Unnamed: 0,art_id,word_embedding
0,1,"[2.224703109314392, 0.6259871201526235, -0.212..."
1,2,"[2.0636399535205308, 0.9396294129786164, 0.120..."
2,25,"[1.6940103703618323, 1.2793173774418638, 0.317..."
3,27,"[2.127060536324472, 0.7990025027405475, -0.163..."
4,28,"[1.5101993388851078, 0.6776250067902838, 0.019..."


In [None]:
# Export the article ids and their embeddings as a list of JSON records.
# `df_final_2` is never defined in this notebook; df_with_embeddings is the frame
# that holds the 'word_embedding' column.
df_with_embeddings[["art_id", "word_embedding"]].to_json(
    '/content/drive/MyDrive/PIP 2021/Données/polyglot_embeddings_lem.json', orient="records")

---