Created on 7 January 2021  

**Group 3 - Representation**  
**The objective of this notebook is to create a representation of our data using a gensim model** 

@author : Jules Boutibou

# Libraries

In [None]:
from string import punctuation
from tqdm import tqdm
from operator import itemgetter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
import unicodedata
import gensim
import re
from google.colab import drive
import nltk
# Fetch the NLTK resources this notebook relies on:
# 'punkt' (tokenizer data) and 'stopwords' (stopword lists, used for French below).
nltk.download('punkt')
nltk.download('stopwords')
# Enable tqdm's pandas integration.
# NOTE(review): no progress_apply call is visible in this notebook — confirm this is still needed.
tqdm.pandas()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Link to the drive

In [None]:
# Mount Google Drive (Colab only) so that the model and data files referenced
# below under /content/drive/MyDrive are reachable.
drive.mount('/content/drive')

Mounted at /content/drive


# Word2Vec

In [None]:
# Import a pre-trained model (trained on french 2018 Wikipedia).
# References : Found on https://zenodo.org/record/3241447#.X_Wu9elKh24
# The file is in binary word2vec format, hence binary=True.
found_model = gensim.models.KeyedVectors.load_word2vec_format(
    '/content/drive/MyDrive/PIP 2021/Word Embedding/Modele/Pretrained_model/modele_simple.bin', binary=True)

# Import the model that was pre-trained on all the articles and titles.
# This one is stored in text word2vec format, hence binary=False.
our_model = gensim.models.KeyedVectors.load_word2vec_format(
    '/content/drive/MyDrive/PIP 2021/Word Embedding/Modele/Pretrained_model/model_trained_on_articles.txt', binary=False)

# Vocabulary cleaning

In [None]:
def strip_accents(s: str) -> str:
    """
    Return `s` without its accents.

    The string is decomposed with Unicode NFD normalisation and every
    combining mark (Unicode category 'Mn') is dropped, e.g. 'été' -> 'ete'.
    """

    decomposed = unicodedata.normalize('NFD', s)
    kept_chars = [char for char in decomposed
                  if unicodedata.category(char) != 'Mn']
    return ''.join(kept_chars)

In [None]:
# Remove the pos tag to only keep the french word, and lower it:
# each key is cut at its first '_', lower-cased and accent-stripped.
# NOTE(review): several original keys can map to the same cleaned key
# (e.g. the same word with different tags); the dict comprehension then
# silently keeps only the last one encountered — confirm this is acceptable.
# NOTE(review): this cell rewrites found_model.vocab in place, so it relies on
# the gensim 3.x `vocab` attribute.
found_model.vocab = {strip_accents(
    k.split('_')[0].lower()): v for k, v in found_model.vocab.items()}

In [None]:
# French stopwords importation
stop = stopwords.words('french')
# The vocabulary keys and the article texts are accent-stripped before being
# compared with this list, so an accented stopword such as 'été' would never
# match its cleaned form 'ete'. Add the accent-free form of every stopword
# (the original forms are kept, so the list only grows).
stop = sorted(set(stop) | {strip_accents(word) for word in stop})
vocab = list(found_model.vocab.keys())

In [None]:
# Single pass over the vocabulary, applying in order:
#   1. drop entries that are french stopwords
#   2. keep only entries that are purely alphabetic or contain the ' character
#   3. for entries containing ', keep the part right after the first '
#      (i.e. l'accord --> accord instead of laccord)
#   4. keep only words longer than 2 characters
# and finally keep only the words that exist in the model vocabulary.
vocab = {
    word
    for word in (
        entry.split("'")[1] if "'" in entry else entry
        for entry in vocab
        if entry not in stop and (entry.isalpha() or "'" in entry)
    )
    if len(word) > 2
} & set(found_model.vocab)

In [None]:
# New cleaned vocabulary: keep only the model entries whose key survived the
# cleaning above. A plain dict comprehension is used instead of
# itemgetter(*vocab), which returns a bare value (not a tuple) for a single
# key and raises for zero keys.
found_model.vocab = {word: found_model.vocab[word] for word in vocab}

All of these preprocessing steps have already been applied to the model we trained on the corpus.

# Word2Vec application on our data

In [None]:
# Import the cleaned data, without lemmatization
data = pd.read_json('/content/drive/MyDrive/PIP 2021/Données/Deduplicated/df_concat_G1_G2_v0.json')
# Keep only the article id, its title and its content; the other columns are not used below
data = data[['art_id', 'art_title', 'art_content']]
# Preview the first rows
data.head()

Unnamed: 0,art_id,art_title,art_content
0,1,9ème édition du Panorama de l’emploi territorial,La FNCDG et l’ANDCDG ont publié en septembre l...
1,2,ACTUALITÉS FNCDG / COVID19,Malgré la levée des mesures de confinement le ...
2,25,"Interview de M. Olivier DUSSOPT, Secretaire d’...",Quels étaient les objectifs poursuivis par le ...
3,27,Journée Thématique FNCDG « Les services de san...,"La journée thématique, qui aura lieu durant le..."
4,28,Journée Thématique FNCDG « Vers de nouveaux mo...,La 1ère journée thématique en région sur le th...


In [None]:
def _clean_text(text) -> str:
    """
    Clean a single text and return it as one space-separated string.
    See `preprocessing` for the list of steps.
    """

    # Missing values (None / NaN) become an empty text; anything else is cast
    # to str *before* any string method is called.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower().strip()

    # Typographic apostrophes (e.g. "l’emploi") are mapped to the ASCII one,
    # otherwise they would be deleted below and the article glued to the word.
    for quote in ('\u2019', '\u2018', '\u02bc'):
        text = text.replace(quote, "'")

    # Remove accents
    text = ''.join(c for c in unicodedata.normalize('NFD', text)
                   if unicodedata.category(c) != 'Mn')

    # Turn every whitespace character (newline, tab, no-break space...) into a
    # plain space so that it still separates words after the next step.
    text = re.sub(r'\s', ' ', text)

    # Delete every character that is not a letter, a space or the ' char
    # (this also removes digits). Characters are deleted, not replaced by a
    # space, so "porte-parole" becomes "porteparole".
    text = re.sub("[^a-zA-Z ']", '', text)

    # Keep word after ' char
    # i.e. l'accord --> keeps accord instead of laccord after removing punctuation
    words = [w.split("'")[1] if "'" in w else w for w in text.split()]

    # Keep words whose length is between 3 and 49 characters
    return ' '.join(w for w in words if 2 < len(w) < 50)


def preprocessing(sentences) -> list:
    """
    Takes a column containing sentences, and returns the processed texts.
    For each text: lower-cases it, strips surrounding white spaces, removes
    accents, numbers and punctuation, keeps the part following an apostrophe
    (l'accord -> accord) and drops words shorter than 3 or longer than 49
    characters.
    Stopwords are NOT removed and no lemmatization is done here.
    Parameters :
      sentences : pd.dataframe column (or any iterable of texts)
    Out :
      list of cleaned texts (str), one per input element, in the same order
    """

    return [_clean_text(sentence) for sentence in tqdm(sentences)]

In [None]:
# Clean the column art_content, then the column art_title:
# run the cleaning function, then drop the french stopwords from the result
for column in ('art_content', 'art_title'):
    data[column] = preprocessing(data[column])
    data[column] = data[column].apply(
        lambda text: ' '.join(word for word in text.split() if word not in (stop)))

100%|██████████| 7544/7544 [00:19<00:00, 389.48it/s]
100%|██████████| 7544/7544 [00:00<00:00, 20647.13it/s]


In [None]:
def existing_tokens(sentence: str, model) -> list:
    """
    Tokenize a sentence, and returns only the tokens existing in the vocabulary of the model
    Parameters :
      sentence : sentence we want to tokenize and filter 
      model : pre-trained word2vec skip-gram model (its `vocab` mapping is used)
    Out :
      list of distinct words, in order of first appearance in the sentence
    """

    # Membership is tested directly on the vocabulary mapping: rebuilding a set
    # of the whole vocabulary on every call is needlessly expensive.
    known_words = model.vocab
    tokens = nltk.word_tokenize(str(sentence))

    # dict.fromkeys removes duplicates while keeping a deterministic order
    # (a set would give an arbitrary one).
    return list(dict.fromkeys(tok for tok in tokens if tok in known_words))

In [None]:
# Apply the function existing_tokens for each duo found_model&our_model / art_content&art_title
# Each entry is (column to create, text column to read, model whose vocabulary is used)
for target, source, model in (
    ('content_found_model', 'art_content', found_model),
    ('content_our_model', 'art_content', our_model),
    ('title_found_model', 'art_title', found_model),
    ('title_our_model', 'art_title', our_model),
):
    data[target] = data[source].apply(existing_tokens, args=(model,))

In [None]:
# Remove sentences that don't contain any word of the corresponding vocabulary model
# (one filtered frame per token column; rows keep their original index)
content_found_model = data[data['content_found_model'].map(len) > 0]
content_our_model = data[data['content_our_model'].map(len) > 0]
title_found_model = data[data['title_found_model'].map(len) > 0]
title_our_model = data[data['title_our_model'].map(len) > 0]

In [None]:
# Calculation of the idf value of each word
# One vectorizer per duo found_model&our_model / art_content&art_title.
# fit_transform needs a non-tokenized sentence, so each token list is joined
# back into a single space-separated string first.
vectorizer_content_found_model = TfidfVectorizer()
x_content_found_model = vectorizer_content_found_model.fit_transform(
    content_found_model['content_found_model'].map(' '.join))

vectorizer_content_our_model = TfidfVectorizer()
x_content_our_model = vectorizer_content_our_model.fit_transform(
    content_our_model['content_our_model'].map(' '.join))

vectorizer_title_found_model = TfidfVectorizer()
x_title_found_model = vectorizer_title_found_model.fit_transform(
    title_found_model['title_found_model'].map(' '.join))

vectorizer_title_our_model = TfidfVectorizer()
x_title_our_model = vectorizer_title_our_model.fit_transform(
    title_our_model['title_our_model'].map(' '.join))

In [None]:
def dic_weights(vectorizer):
    """
    Returns a dictionary containing the corpus' words as keys, and their idf values as values
    Parameters :
      vectorizer : fitted sklearn.feature_extraction.text.TfidfVectorizer
    Out :
      dictionary : words as key, idf value as value
    """

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2
    # in favour of get_feature_names_out(); use the new method when it exists
    # and fall back to the old one on older versions.
    feature_names = getattr(vectorizer, 'get_feature_names_out', None)
    if feature_names is None:
        feature_names = vectorizer.get_feature_names

    return dict(zip(feature_names(), vectorizer.idf_))

In [None]:
# Using the dic_weights function for each duo found_model&our_model / art_content&art_title
# Each dictionary maps a word of the corresponding fitted vectorizer to its idf value.
dic_content_found_model = dic_weights(vectorizer_content_found_model)
dic_content_our_model = dic_weights(vectorizer_content_our_model)
dic_title_found_model = dic_weights(vectorizer_title_found_model)
dic_title_our_model = dic_weights(vectorizer_title_our_model)

In [None]:
def vect_mean(sentence: list, model, dic) -> np.ndarray:
    """
    Returns the sentence embedding: the average of the word vectors of the
    sentence, each one weighted with the idf value of its word
    Parameters :
      sentence : tokenized words of a sentence type list (must not be empty)
      model : pre-trained word2vec skip-gram model, indexable with a list of words
      dic : dictionary containing idf_value of each word
    Out :
      sentence embedding (1-D float64 numpy array)
    Raises :
      ValueError : if sentence is empty
      KeyError : if a word has no idf value in dic
    """

    if len(sentence) == 0:
        raise ValueError('vect_mean needs at least one word to build an embedding')

    # One row per word; cast to float64 so that every path returns the same dtype
    vectors = np.asarray(model[sentence], dtype=np.float64)

    # A single word is its own weighted average, whatever its weight
    if len(sentence) == 1:
        return vectors[0]

    # Weight of each word (a list comprehension works for any number of words,
    # unlike itemgetter(*sentence) which returns a bare value for one key)
    poids = [dic[word] for word in sentence]
    return np.average(vectors, axis=0, weights=poids)

In [None]:
# Apply the function to calculate the embeddings for each duo found_model&our_model / art_content&art_title
# The embeddings are computed on the filtered frames and assigned back to `data`
# by index, so rows without any known token get a missing value.
data['vect_art_found_model'] = content_found_model['content_found_model'].apply(
    vect_mean, args=(found_model, dic_content_found_model))
data['vect_art_our_model'] = content_our_model['content_our_model'].apply(
    vect_mean, args=(our_model, dic_content_our_model))
data['vect_title_found_model'] = title_found_model['title_found_model'].apply(
    vect_mean, args=(found_model, dic_title_found_model))
data['vect_title_our_model'] = title_our_model['title_our_model'].apply(
    vect_mean, args=(our_model, dic_title_our_model))

# Exportation

In [None]:
# Final DataFrame containing the column Id of the article (art_id) and the 4 calculated embeddings
# (article content and title, each embedded with found_model and with our_model)
final_data = data[['art_id', 'vect_art_found_model', 'vect_art_our_model', 'vect_title_found_model', 'vect_title_our_model']]
# Preview the first rows
final_data.head()

Unnamed: 0,art_id,vect_art_found_model,vect_art_our_model,vect_title_found_model,vect_title_our_model
0,1,"[-0.06533379122757822, 0.07057429621559858, 0....","[-0.12480441236156328, 0.0765136591815489, 0.1...","[-0.07113228683669678, 0.11271336567454315, 0....","[-0.12933454550715504, 0.06898164503889337, 0...."
1,2,"[-0.07426156787665815, 0.06082738646515634, 0....","[-0.09378982401034014, 0.07844326181610713, 0....","[-0.07399426, 0.036537983, 0.025026318, -0.057...","[-0.047668009415286657, -0.12919949592780508, ..."
2,25,"[-0.07710211662206251, 0.05970139053177871, 0....","[-0.0575083978669747, 0.021040486342290866, 0....","[-0.046578286884785186, 0.022779421749476936, ...","[-0.10663828383062753, 0.0007694621669294691, ..."
3,27,"[-0.045228940506052764, 0.07690330944376708, 0...","[-0.09709182256147646, -0.0007106088381618878,...","[-0.060727528655831205, 0.10974791665845304, 0...","[-0.05749720781059726, -0.04028034379641815, 0..."
4,28,"[-0.060625831064918353, 0.0752820756392532, 0....","[-0.09045600879906518, -0.008876421196522053, ...","[-0.0852355797657426, 0.10057970966708417, 0.0...","[-0.031851939993983974, -0.0339284274483017, 0..."


In [None]:
# Exporting the final data as a JSON list of records (one object per article)
# NOTE(review): articles without any in-vocabulary token have no embedding in
# the corresponding column — presumably exported as null; confirm downstream readers handle it.
final_data.to_json(r'/content/drive/MyDrive/PIP 2021/Données/Word2Vec/article_and_titles_embeddings.json', orient='records')

---