Created on Tuesday 12 January 2021  

**Group 3 - Representation**  
**The objective of this notebook is to create a word embedding representation with FastText**

@authors : Lingeshwari Ramlugon, Thibault Gallou

---

# Word Embedding with FastText - non lemmatized

## Import Libraries

In [None]:
from string import punctuation
from operator import itemgetter
import pandas as pd
import numpy as np
import unicodedata
import re

# sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# gensim
import gensim
from gensim.models.keyedvectors import KeyedVectors
from gensim.models import Word2Vec

# nltk
import nltk
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')

## Mount the drive

In [None]:
# Mount Google Drive under /content/drive (Google Colab only); the paths used
# later in the notebook for the model and the data all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

## FastText implementation

The pretrained model is the French one listed at https://fasttext.cc/docs/en/crawl-vectors.html

In [None]:
# Loading the FastText model pretrained on french data
# The vectors are read from the word2vec text format (binary=False).
model = KeyedVectors.load_word2vec_format(
    '/content/drive/MyDrive/PIP 2021/Word Embedding/Modele/cc.fr.300.txt', binary=False)

In [None]:
# Show the size of the vocabulary and a few of its words instead of printing the
# whole dictionary: the full dump is far too large to read or render.
print(len(model.vocab))
print([word for word, _ in zip(model.vocab, range(10))])

## Cleaning vocabulary

In [None]:
def strip_accents(s: str) -> str:
    """ Documentation
    Returns the given string with its accents removed
    Parameters:
        s: string to process
    Out :
        the same string without any combining mark (Unicode category 'Mn')
    """

    # NFD splits each accented character into its base letter + combining marks
    decomposed = unicodedata.normalize('NFD', s)
    kept = [char for char in decomposed if unicodedata.category(char) != 'Mn']
    return ''.join(kept)

In [None]:
# Remove the pos tag to only keep the french word, convert to lowercase and strip accents.
# Several original keys can collapse onto the same normalised key (e.g. 'Été', 'été', 'ete').
# Keep the FIRST entry met for each normalised key: a dict comprehension would keep the
# LAST one instead. NOTE(review): fastText vector files are expected to list words by
# decreasing frequency, so the first entry should be the most common variant - confirm
# that model.vocab iterates in file order.
normalized_vocab = {}
for key, entry in model.vocab.items():
    normalized_vocab.setdefault(strip_accents(key.split('_')[0].lower()), entry)
model.vocab = normalized_vocab

In [None]:
# Import of the french stopwords.
# The vocabulary keys have had their accents stripped, so the stopwords get the same
# treatment; otherwise accented stopwords (e.g. 'été', 'même') could never match a key.
# dict.fromkeys removes the duplicates created by the stripping while keeping the order.
stop = list(dict.fromkeys(strip_accents(w) for w in stopwords.words('french')))
vocab = list(model.vocab.keys())

In [None]:
# Vocabulary words contain no punctuation, no french stopword and have more than 2 characters.
# The stopwords are put in a set first: the membership test runs once per vocabulary
# word, and a set answers it in constant time instead of scanning the whole list.
stop_lookup = set(stop)
vocab = [v for v in vocab if v not in stop_lookup and v.isalpha() and len(v) > 2]

In [None]:
# New cleaned vocabulary: keep only the entries whose key survived the filtering.
# A dict comprehension also works when `vocab` holds a single word or none at all,
# whereas itemgetter(*vocab) returns a bare value for one key and raises for zero keys.
model.vocab = {word: model.vocab[word] for word in vocab}

In [None]:
# Size of the cleaned vocabulary and a small sample of its words
# (displaying the whole dictionary would flood the cell output)
print(len(model.vocab))
[word for word, _ in zip(model.vocab, range(10))]

## Import deduplicated data

In [None]:
# Import the cleaned data, without lemmatization
# (deduplicated articles, read from a JSON file on the mounted drive)
data = pd.read_json(
    '/content/drive/MyDrive/PIP 2021/Données/Deduplicated/df_concat_G1_G2_v0.json')
data.head()

In [None]:
# Keep only the id and the content of the articles.
# .copy() makes an independent frame, so the columns added to `data` later on
# do not trigger pandas' SettingWithCopyWarning.
data = data[["art_id", "art_content"]].copy()
data.head()

Unnamed: 0,art_id,art_content
0,1,La FNCDG et l’ANDCDG ont publié en septembre l...
1,2,Malgré la levée des mesures de confinement le ...
2,25,Quels étaient les objectifs poursuivis par le ...
3,27,"La journée thématique, qui aura lieu durant le..."
4,28,La 1ère journée thématique en région sur le th...


In [None]:
# Create function to clean articles
# French stopwords, accent-stripped: the cleaned articles no longer contain accents,
# so a raw NLTK entry such as 'étaient' would otherwise never match 'etaient'.
# dict.fromkeys removes the duplicates created by the stripping while keeping the order.
stop = list(dict.fromkeys(strip_accents(w) for w in stopwords.words('french')))
punctuations = punctuation+"’”“‘…„—᾿‐–‑′•›‹⁄―‚→（）『』》《。↓↵'͞ʻʿ'"+'″￼'
# Translation table sending every punctuation character to a space (built once)
_PUNCT_TO_SPACE = str.maketrans(dict.fromkeys(punctuations, " "))


def preprocess_text(sen: str) -> str:
    """ Documentation 
    Lowercases a text and removes its punctuation, accents, digits, extra white
    spaces and the words shorter than 2 or longer than 49 characters.
    Stopwords are NOT removed here and no lemmatization is done.
    Parameters:
        sen: text to clean; None / NaN is treated as an empty text
    Out :
        cleaned text, words separated by a single space
    """

    # Missing content (None or NaN) gives an empty text instead of raising
    if sen is None or (isinstance(sen, float) and sen != sen):
        return ""

    # Convert to lowercase
    sentence = str(sen).lower()

    # Remove punctuation (each punctuation character becomes a space)
    sentence = sentence.translate(_PUNCT_TO_SPACE)

    # Remove accent
    sentence = ''.join(c for c in unicodedata.normalize('NFD', sentence)
                       if unicodedata.category(c) != 'Mn')

    # Remove number
    sentence = ''.join(c for c in sentence if not c.isdigit())

    # Remove words which len are <2 or >=50; split()/join also trims the text
    # and collapses every run of white space into a single space
    return ' '.join(w for w in sentence.split() if 1 < len(w) < 50)

In [None]:
# Clean elements of column 'art_content'
# Create new column 'art_content_clean', then drop the stopwords from it.
# The stopwords are put in a set so that each word is checked in constant time
# instead of scanning the whole list.
stop_lookup = set(stop)
data['art_content_clean'] = data['art_content'].apply(preprocess_text).apply(
    lambda text: ' '.join(word for word in text.split() if word not in stop_lookup))

In [None]:
# Preview the cleaned text next to the original content
data.head()

Unnamed: 0,art_id,art_content,art_content_clean
0,1,La FNCDG et l’ANDCDG ont publié en septembre l...,fncdg andcdg publie septembre eme edition pano...
1,2,Malgré la levée des mesures de confinement le ...,malgre levee mesures confinement mai plupart m...
2,25,Quels étaient les objectifs poursuivis par le ...,quels etaient objectifs poursuivis gouvernemen...
3,27,"La journée thématique, qui aura lieu durant le...",journee thematique lieu durant salon preventic...
4,28,La 1ère journée thématique en région sur le th...,ere journee thematique region theme vers nouve...


In [None]:
# Create function to tokenize each article
def existing_tokens(sentence: str) -> list:
    """ Documentation 
    Tokenize a sentence, and returns only the tokens existing in the vocabulary of the model
    Parameters:
        sentence: sentence to preprocess
    Out :
        list of distinct words, in order of first appearance in the sentence
    """

    # model.vocab is a dict, so `in` is a constant-time lookup: there is no need to
    # copy all of its keys into a new set on every call.
    known_words = model.vocab
    # dict.fromkeys removes duplicated tokens while keeping a deterministic order
    unique_tokens = dict.fromkeys(nltk.word_tokenize(str(sentence)))
    # Keeps words of the article only if they are in the model vocabulary
    return [token for token in unique_tokens if token in known_words]

In [None]:
# Apply the function existing_tokens to the column 'art_content_clean'
# NOTE(review): the column is overwritten in place (cleaned string -> list of tokens),
# so this cell should not be run twice in a row on the same `data`.
data['art_content_clean'] = data['art_content_clean'].apply(existing_tokens)

In [None]:
# save data on drive
#data.to_json(r'/content/drive/MyDrive/PIP 2021/Données/FastText/article_with_vocab_intersection.json', orient='records')

## Import cleaned tokenized data

In [None]:
# Reload the tokenized articles from the drive; this replaces the `data` built above.
# NOTE(review): presumably the file written by the commented-out export cell - confirm
# that it is up to date with the current cleaning steps.
data = pd.read_json(
   '/content/drive/MyDrive/PIP 2021/Données/FastText/article_with_vocab_intersection.json')
data.head()

Unnamed: 0,art_id,art_content,art_content_clean
0,1,La FNCDG et l’ANDCDG ont publié en septembre l...,"[panorama, collectivites, offerts, confrontes,..."
1,2,Malgré la levée des mesures de confinement le ...,"[prises, telecharger, levee, sante, reprise, c..."
2,25,Quels étaient les objectifs poursuivis par le ...,"[directions, mutualisation, primaute, saisi, o..."
3,27,"La journée thématique, qui aura lieu durant le...","[deroulera, colloque, innovantes, domaines, du..."
4,28,La 1ère journée thématique en région sur le th...,"[communaute, deroulera, durant, publique, edit..."


In [None]:
# Remove the articles that don't contain any word of the model vocabulary.
# .copy() makes an independent frame, so adding a column to the filtered `data`
# afterwards does not trigger pandas' SettingWithCopyWarning.
data = data[data['art_content_clean'].apply(lambda x: len(x) != 0)].copy()

In [None]:
# Rebuild one space-separated string per article from its list of tokens
data['art_content_not_tokenized'] = data['art_content_clean'].map(' '.join)

In [None]:
# Preview the token lists and the sentences rebuilt from them
data.head()

## TFIDF

We want the IDF value of each word of the vocabulary, so that it can be used as the weight of that word when the word embeddings of an article are averaged.

In [None]:
# Calculation of the idf value of each word
vectorizer = TfidfVectorizer()

# fit_transform needs non-tokenized sentences
# (x is the resulting TF-IDF matrix; fitting also populates vectorizer.idf_)
x = vectorizer.fit_transform(data['art_content_not_tokenized'])

In [None]:
# Dictionary containing the idf value of each word
# vectorizer.idf_ gives the idf value of each word, in the same order as the feature names.
# get_feature_names() no longer exists in recent scikit-learn releases, where
# get_feature_names_out() replaces it; use whichever one the installed version has.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
dic_weights = dict(zip(feature_names, vectorizer.idf_))

## Compute the embeddings of each sentence

In [None]:
# Create function to calculate mean vector for each article
# Output vector can be weighted (boolean set to True) or unweighted (boolean set to False)
def vect_mean(sentence: list, weighted: bool) -> np.ndarray:
    """ Documentation 
    Returns the average of the embedding vectors of the words of a sentence,
    optionally weighted with the idf value of each word
    Parameters:
        sentence: non-empty list of words, all present in the model vocabulary
        weighted: True to weight each word vector by its idf (taken from dic_weights),
                  False for a plain mean
    Out :
      numpy array with one coordinate per dimension of the model
      (300 for the cc.fr.300 vectors loaded above)
    """

    # One row per word of the sentence
    vectors = model[sentence]

    if not weighted:
        return np.mean(vectors, axis=0)

    # Weight of each word. A list comprehension handles a single-word sentence like
    # any other, so no special case is needed (itemgetter(*sentence) returns a bare
    # value instead of a tuple when given only one key).
    poids = [dic_weights[word] for word in sentence]
    return np.average(vectors, axis=0, weights=poids)

In [None]:
# Weighted article vectors: vect_mean called with weighted=True on every token list
data['vect_content_title_weighted'] = data['art_content_clean'].apply(
    vect_mean, weighted=True)

# Unweighted article vectors: same call with weighted=False
data['vect_content_title_unweighted'] = data['art_content_clean'].apply(
    vect_mean, weighted=False)

In [None]:
# Preview the two embedding columns that were just added
data.head()

In [None]:
# Final DataFrame: the three text columns are dropped, leaving the id of the article
# (art_id) and its two vectors (vect_content_title_weighted and
# vect_content_title_unweighted)
final_data = data.drop(
    columns=['art_content', 'art_content_clean', 'art_content_not_tokenized'])
final_data.head()

Unnamed: 0,art_id,vect_content_title_weighted,vect_content_title_unweighted
0,1,"[0.003699094353281652, 0.005638245658862409, 0...","[0.0037260866, 0.005599999, 0.017443476, 0.000..."
1,2,"[0.0022219835185113062, -0.0002882939163113805...","[0.0015073532, 0.002485294, 0.0010749996, 0.00..."
2,25,"[6.080691828573314e-05, 0.015057803622787417, ...","[-0.0013683748, 0.015395545, 0.00013407573, 0...."
3,27,"[0.0015955066926961702, 0.01679796948292174, 0...","[-0.00032615347, 0.02102154, 0.0033769228, 0.0..."
4,28,"[-0.0017359903365168625, 0.006722339617114802,...","[-0.0016072913, 0.009063543, 0.0067187487, 0.0..."


## Save dataframe of weighted vector of each article

In [None]:
# Exporting the final data
#final_data.to_json(r'/content/drive/MyDrive/PIP 2021/Données/FastText/article_embeddings.json', orient='records')