Created on Wednesday 13 January 2021  

**Group 3 - Representation**  
**The objective of this notebook is to train a word2vec (skip-gram) model on our corpus of article contents and titles.** 

@authors : Jules Boutibou

---

# Libraries

In [None]:
import os
import re
import time
import unicodedata

import gensim
import nltk
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from tqdm import tqdm
# Download the NLTK stopword corpus; the French list is loaded from it
# further down with stopwords.words('french').
nltk.download('stopwords')
# Register tqdm's pandas integration so that Series.progress_apply (used when
# building the training sentences) is available.
tqdm.pandas()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Importing and cleaning data

In [None]:
# Import dataframe without duplicates
data = pd.read_json(
    '/content/drive/MyDrive/PIP 2021/Données/Deduplicated/df_concat_G1_G2_v0.json')

# Keep only the id, content and title of the articles.
# .copy() makes the selection an independent frame: new columns are assigned
# to `data` later in the notebook, and assigning to a column subset without
# an explicit copy can raise pandas' SettingWithCopyWarning.
data = data[["art_id", "art_content", "art_title"]].copy()

# Preprocessing

In [None]:
def preprocessing(sentences) -> list:
    """
    Clean an iterable of raw texts and return one cleaned string per text.

    Each text goes through the following steps, in order:
      1. coercion to ``str`` (None / NaN become the empty string),
         lowercasing and trimming;
      2. typographic apostrophes are converted to the ASCII apostrophe;
      3. accents are removed (NFD decomposition, combining marks dropped);
      4. digits are removed;
      5. every character other than an ASCII letter, a space or an apostrophe
         is replaced by a space, so words separated only by punctuation or a
         line break are not glued together;
      6. elided forms keep only the part following the apostrophe
         (l'accord --> accord);
      7. words of length <= 2 or >= 50 are dropped.

    Stopword removal and lemmatization are NOT performed here.

    Parameters
    ----------
    sentences : iterable
        Raw texts, e.g. a DataFrame column.

    Returns
    -------
    list of str
        The cleaned texts, in input order, words separated by single spaces.
    """

    # Compiled once: the pattern is the same for every text
    non_letters = re.compile("[^a-zA-Z ']")

    processed_sentences = []

    for sentence in tqdm(sentences):

        # Missing values (None / float NaN) are treated as empty texts
        # instead of crashing on .lower() or becoming the words "none"/"nan"
        if sentence is None or (isinstance(sentence, float) and sentence != sentence):
            sentence = ''

        # Coerce to str first, then convert to lowercase and trim both ends
        sentence = str(sentence).lower().strip()

        # Typographic apostrophe (U+2019) -> ASCII apostrophe, otherwise the
        # regex below would delete it and l'accord would become laccord
        sentence = sentence.replace('\u2019', "'")

        # Remove accents
        sentence = ''.join(c for c in unicodedata.normalize('NFD', sentence)
                           if unicodedata.category(c) != 'Mn')

        # Remove numbers
        sentence = ''.join(c for c in sentence if not c.isdigit())

        # Replace the remaining non-alphabetic symbols with a space
        # (i.e. keep only letters, spaces and the ' character)
        sentence = non_letters.sub(' ', sentence)

        words = sentence.split()

        # Keep the word after the ' char
        # i.e. l'accord --> keeps accord instead of laccord
        words = [w.split("'")[1] if "'" in w else w for w in words]

        # Keep only words longer than 2 and shorter than 50 characters
        processed_sentences.append(
            ' '.join(w for w in words if 2 < len(w) < 50))

    return processed_sentences

In [None]:
# Import french stopwords
stop = stopwords.words('french')

# preprocessing() strips accents from the text, so accented stopwords
# (e.g. "été") would never match their cleaned form ("ete"). The lookup set
# therefore holds each stopword both as provided and with accents removed.
# A set also makes every membership test O(1) instead of scanning a list.
stop_words = set(stop)
stop_words.update(
    ''.join(c for c in unicodedata.normalize('NFD', word)
            if unicodedata.category(c) != 'Mn')
    for word in stop)


def remove_stopwords(text: str) -> str:
    """Return `text` without the whitespace-separated words found in `stop_words`."""
    return ' '.join(word for word in text.split() if word not in stop_words)


# Cleaning content column
data['art_content_clean'] = preprocessing(data['art_content'])
data['art_content_clean'] = data['art_content_clean'].apply(remove_stopwords)

# Cleaning title column
data['art_title_clean'] = preprocessing(data['art_title'])
data['art_title_clean'] = data['art_title_clean'].apply(remove_stopwords)

100%|██████████| 7544/7544 [00:16<00:00, 467.68it/s]
100%|██████████| 7544/7544 [00:00<00:00, 23901.01it/s]


# Model training

### Concatenation of art_content and art_title to train the model

In [None]:
# Tokenise the cleaned contents and titles on whitespace (one list of words
# per text), then stack both collections into a single training corpus.
sentences_content = data['art_content_clean'].progress_apply(str.split).tolist()
sentences_title = data['art_title_clean'].progress_apply(str.split).tolist()
train_sentences = sentences_content + sentences_title

100%|██████████| 7544/7544 [00:00<00:00, 16430.04it/s]
100%|██████████| 7544/7544 [00:00<00:00, 37602.10it/s]


### Model training

In [None]:
# Training the word2vec skip-gram model.
# gensim 4 renamed the embedding-dimension argument from `size` to
# `vector_size`; pick the keyword matching the installed version so the cell
# runs on both gensim 3.x and gensim >= 4.
size_kwarg = 'vector_size' if int(gensim.__version__.split('.')[0]) >= 4 else 'size'

model = Word2Vec(sentences=train_sentences,
                 sg=1,  # sg = 1 --> skip-gram model
                 workers=4,
                 window=5,
                 **{size_kwarg: 500})  # 500-dimensional word vectors

In [None]:
# Export the trained word vectors (model.wv) to Drive in the non-binary
# word2vec format.
model.wv.save_word2vec_format(
    fname='/content/drive/MyDrive/PIP 2021/Word Embedding/Modele/Pretrained_model/model_trained_on_articles.txt',
    binary=False)