Created on Wednesday 13 January 2021  

**Group 3 - Representation**  
**The objective of this notebook is to train a Doc2Vec model on the article corpus and export one embedding vector per article.** 

@authors : Diallo Thierno Mamadou, Thibault Gallou

---

## Import libraries

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle
from string import punctuation
import unicodedata
import multiprocessing
from sklearn import utils
from google.colab import drive

#nltk
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')

#gensim
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## Mounting the drive

In [None]:
# Mount Google Drive under /content/drive; the input and output paths used below live there.
drive.mount('/content/drive')

Mounted at /content/drive


## Importing data

Load the JSON file into a DataFrame.

In [None]:
# Read the deduplicated articles file from the mounted Drive into a DataFrame,
# then preview the first rows.
data = pd.read_json(
    '/content/drive/MyDrive/PIP 2021/Marine/Transaction/df_deduplicated_v4.json')
data.head()

Unnamed: 0,art_id,art_content,art_content_html,art_extract_datetime,art_lang,art_title,art_url,src_name,src_type,src_url,src_img,art_auth,art_tag
0,1,La FNCDG et l’ANDCDG ont publié en septembre l...,"<p style=""text-align: justify;"">La FNCDG et l’...",22 septembre 2020,fr,9ème édition du Panorama de l’emploi territorial,http://fncdg.com/9eme-edition-du-panorama-de-l...,FNCDG,xpath_source,http://fncdg.com/actualites/,http://fncdg.com/wp-content/uploads/2020/09/im...,,
1,2,Malgré la levée des mesures de confinement le ...,"<p style=""text-align: justify;"">Malgré la levé...",17 mars 2020,fr,ACTUALITÉS FNCDG / COVID19,http://fncdg.com/actualites-covid19/,FNCDG,xpath_source,http://fncdg.com/actualites/,http://fncdg.com/wp-content/uploads/2020/03/co...,,
2,25,Quels étaient les objectifs poursuivis par le ...,"<p style=""text-align: justify;""><strong>Quels ...",24 octobre 2019,fr,"Interview de M. Olivier DUSSOPT, Secretaire d’...",http://fncdg.com/interview-de-m-olivier-dussop...,FNCDG,xpath_source,http://fncdg.com/actualites/,http://fncdg.com/wp-content/uploads/2019/10/in...,,
3,27,"La journée thématique, qui aura lieu durant le...","<p style=""text-align: justify;""><strong>La jo...",31 mai 2017,fr,Journée Thématique FNCDG « Les services de san...,http://fncdg.com/journee-thematique-fncdg-les-...,FNCDG,xpath_source,http://fncdg.com/actualites/,http://fncdg.com/wp-content/uploads/2017/05/pu...,,
4,28,La 1ère journée thématique en région sur le th...,"<p style=""text-align: justify;"">La 1<sup>ère</...",13 mars 2017,fr,Journée Thématique FNCDG « Vers de nouveaux mo...,http://fncdg.com/journee-thematique-fncdg-vers...,FNCDG,xpath_source,http://fncdg.com/actualites/,http://fncdg.com/wp-content/uploads/2017/03/Sa...,,



# Preprocessing


In [None]:
# French stop-word list provided by NLTK; used further down to filter the cleaned text.
stop = stopwords.words('french')
# Characters that preprocess_text replaces with a space: the ASCII punctuation from
# `string.punctuation` plus an explicit list of typographic quotes, dashes and other symbols.
punctuations = punctuation+"’”“‘…„—᾿‐–‑′•›‹⁄―‚→（）『』》《。↓↵'͞ʻʿ'"+'″￼'

In [None]:
def preprocess_text(sentence: str) -> str:
    """ Normalise a raw text: lowercase, no punctuation, no accents, no digits.

    Parameters:
        sentence: text to clean. ``None`` and NaN (missing values) give an empty
            string; any other non-string value is converted with ``str()``.
    Out :
        sentence: lowercase text without punctuation, accents or digits, made of
            words of 2 to 49 characters separated by single spaces.
    """
    # Missing values must not crash the whole `.apply()` nor become the word "nan".
    if sentence is None or (isinstance(sentence, float) and sentence != sentence):
        return ""

    # Coerce first, then lowercase and trim (the coercion used to come too late).
    sentence = str(sentence).lower().strip()

    # Replace every punctuation character by a space in a single pass.
    sentence = sentence.translate(str.maketrans(dict.fromkeys(punctuations, " ")))

    # Remove accents: decompose (NFD) and drop the combining marks (category 'Mn').
    sentence = ''.join(
        c for c in unicodedata.normalize('NFD', sentence)
        if unicodedata.category(c) != 'Mn')

    # Remove digits (note: "covid19" becomes "covid", digits do not split words).
    sentence = ''.join(c for c in sentence if not c.isdigit())

    # Keep only words whose length is between 2 and 49 characters; splitting and
    # re-joining also collapses every run of whitespace into one space.
    return ' '.join(w for w in sentence.split() if 1 < len(w) < 50)

In [None]:
# Create the column 'art_content_clean' by applying preprocess_text to 'art_content'.
data['art_content_clean'] = data['art_content'].apply(preprocess_text)

# The cleaned text no longer has accents, so the stop words must go through the same
# normalisation before the comparison; otherwise accented stop words never match
# (e.g. "etaient" used to survive the filter). A set also makes each lookup O(1).
stop_normalized = {preprocess_text(word) for word in stop}
stop_normalized.discard('')  # one-letter stop words are reduced to '' by preprocess_text

# Remove stop words and words containing non-alphanumeric characters.
data['art_content_clean'] = data['art_content_clean'].apply(
    lambda text: ' '.join(
        word for word in text.split()
        if word not in stop_normalized and word.isalnum()))

In [None]:
# Restrict the frame to the article id, the raw text and its cleaned version.
data = data.loc[:, ['art_id', 'art_content', 'art_content_clean']].copy()
data

Unnamed: 0,art_id,art_content,art_content_clean
0,1,La FNCDG et l’ANDCDG ont publié en septembre l...,fncdg andcdg publie septembre eme edition pano...
1,2,Malgré la levée des mesures de confinement le ...,malgre levee mesures confinement mai plupart m...
2,25,Quels étaient les objectifs poursuivis par le ...,quels etaient objectifs poursuivis gouvernemen...
3,27,"La journée thématique, qui aura lieu durant le...",journee thematique lieu durant salon preventic...
4,28,La 1ère journée thématique en région sur le th...,ere journee thematique region theme vers nouve...
...,...,...,...
7485,12256,01/10/2020 - 18:20 Ouverture le 2 octobre 2020...,ouverture octobre offre publique achat volonta...
7486,12257,MEDICREA : Ouverture de l'offre publique d'ach...,medicrea ouverture offre publique achat volont...
7487,12258,© Fournis par La Tribune 14 startups différent...,fournis tribune startups differentes reussi me...
7488,12259,Ce communiqué ne constitue pas une offre d'acq...,communique constitue offre acquerir titres com...


### Tokenization

In [None]:
# Tokenize a text into lowercase words, dropping one-character tokens
# (mostly isolated punctuation marks).


def tokenize_text(text: str, language: str = "english") -> list:
    """ Split a text into a list of lowercase word tokens.

    Parameters:
        text: input text, must be a string
        language: name of the NLTK Punkt model used to split sentences and words.
            Defaults to "english" (NLTK's own default, kept for backward
            compatibility); pass "french" to use the French model.
    Out :
        list of tokens (str) of at least two characters, in reading order
    """
    return [
        word.lower()
        for sent in nltk.sent_tokenize(text, language=language)
        for word in nltk.word_tokenize(sent, language=language)
        # one-character tokens are typically punctuation such as '.' or ','
        if len(word) >= 2
    ]

Here we apply the function `tokenize_text` to the column `art_content_clean` and wrap each result in a `TaggedDocument`.

In [None]:
# Wrap every cleaned article in a TaggedDocument that carries its own tag.
# Doc2Vec learns one vector per distinct tag: tagging every document with the same
# value ('') makes the whole corpus share a single document vector. The article id
# (as a string) is used as the tag.
content_token = pd.Series(
    [
        TaggedDocument(words=tokenize_text(text), tags=[str(art_id)])
        for art_id, text in zip(data['art_id'], data['art_content_clean'])
    ],
    index=data.index,
    name='art_content_clean',
)
content_token

0       ([fncdg, andcdg, publie, septembre, eme, editi...
1       ([malgre, levee, mesures, confinement, mai, pl...
2       ([quels, etaient, objectifs, poursuivis, gouve...
3       ([journee, thematique, lieu, durant, salon, pr...
4       ([ere, journee, thematique, region, theme, ver...
                              ...                        
7485    ([ouverture, octobre, offre, publique, achat, ...
7486    ([medicrea, ouverture, offre, publique, achat,...
7487    ([fournis, tribune, startups, differentes, reu...
7488    ([communique, constitue, offre, acquerir, titr...
7489    ([ouverture, octobre, offre, publique, achat, ...
Name: art_content_clean, Length: 7490, dtype: object

## Model creation and vocabulary building

In [None]:
# One training worker per available CPU core.
cores = multiprocessing.cpu_count()

# Doc2Vec model in DBOW mode (dm=0) with negative sampling.
model_dbow = Doc2Vec(
    dm=0,
    vector_size=300,
    negative=5,
    hs=0,
    min_count=2,
    sample=0,
    workers=cores,
)

# Build the vocabulary from the tagged documents.
model_dbow.build_vocab(list(tqdm(content_token.values)))

100%|██████████| 7490/7490 [00:00<00:00, 2201495.23it/s]


## Training the model 

In [None]:
# Train in ONE call and let gensim manage the learning-rate decay.
# The previous loop called train() 30 times and subtracted 0.002 from `alpha` after
# each pass: starting from the default 0.025 the learning rate became negative after
# about a dozen passes, and the corrupted `alpha` stayed on the model for inference.
N_EPOCHS = 30

# Shuffle once with a fixed seed (the articles are stored grouped by source).
training_corpus = utils.shuffle(list(content_token.values), random_state=0)

model_dbow.train(
    training_corpus,
    total_examples=len(training_corpus),
    epochs=N_EPOCHS,
)

100%|██████████| 7490/7490 [00:00<00:00, 1408570.01it/s]
100%|██████████| 7490/7490 [00:00<00:00, 710416.70it/s]
100%|██████████| 7490/7490 [00:00<00:00, 2074989.23it/s]
100%|██████████| 7490/7490 [00:00<00:00, 1998812.56it/s]
100%|██████████| 7490/7490 [00:00<00:00, 1247432.38it/s]
100%|██████████| 7490/7490 [00:00<00:00, 941029.74it/s]
100%|██████████| 7490/7490 [00:00<00:00, 1024201.64it/s]
100%|██████████| 7490/7490 [00:00<00:00, 636324.43it/s]
100%|██████████| 7490/7490 [00:00<00:00, 2766408.68it/s]
100%|██████████| 7490/7490 [00:00<00:00, 2621877.56it/s]
100%|██████████| 7490/7490 [00:00<00:00, 1228313.14it/s]
100%|██████████| 7490/7490 [00:00<00:00, 1076273.15it/s]
100%|██████████| 7490/7490 [00:00<00:00, 786051.57it/s]
100%|██████████| 7490/7490 [00:00<00:00, 2835575.14it/s]
100%|██████████| 7490/7490 [00:00<00:00, 2265801.44it/s]
100%|██████████| 7490/7490 [00:00<00:00, 1190019.96it/s]
100%|██████████| 7490/7490 [00:00<00:00, 1190831.92it/s]
100%|██████████| 7490/7490 [00:00<0

CPU times: user 6min 28s, sys: 2.51 s, total: 6min 31s
Wall time: 3min 26s


### Saving the model

In [None]:
# Change the working directory (IPython `cd` magic) so that the relative file name
# used when saving the model below resolves inside this Drive folder.
cd /content/drive/MyDrive/PIP 2021/JSON Livrables/Word embedding

/content/drive/.shortcut-targets-by-id/1rbYUNlkKxB9Hwnq4rzA9cyLE1ylsWSGm/PIP 2021/JSON Livrables/Word embedding


In [None]:
# Serialise the trained model with pickle into the current working directory.
# The `with` block guarantees the file is flushed and closed even if dump() fails.
filename = 'doc2vec_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model_dbow, model_file)

### Function that returns the inferred vector of each document

In [None]:
def vec_for_learning(model, tagged_docs, epochs: int = 20) -> list:
    '''Infer one embedding vector per document with a trained Doc2Vec model.

    Parameters:
      model: trained Doc2Vec model on which the inference is based
      tagged_docs: iterable (pandas Series, list, ...) of TaggedDocument-like
        objects exposing a ``words`` attribute
      epochs: number of inference passes per document (default 20, the value
        that used to be hard-coded)

    Out:
      vec: list of inferred vectors, in the same order as ``tagged_docs``

    '''
    import inspect

    # gensim 4 renamed the `steps` argument of infer_vector to `epochs`;
    # use whichever name the installed version accepts.
    accepted = inspect.signature(model.infer_vector).parameters
    passes_kwarg = 'epochs' if 'epochs' in accepted else 'steps'

    # Iterating the container directly works for a Series as well as for a list.
    return [model.infer_vector(doc.words, **{passes_kwarg: epochs})
            for doc in tagged_docs]

Apply the function `vec_for_learning` to our model (`model_dbow`) and corpus (`content_token`).

In [None]:
# Infer one vector per document of the corpus with the trained model.
content_vec = vec_for_learning(model_dbow, content_token)

CPU times: user 2min 38s, sys: 178 ms, total: 2min 39s
Wall time: 2min 39s


In [None]:
# Stack the inferred vectors into an array, then store each row back on the
# frame as a plain Python list (one list per article).
content_vec_arr = np.asarray(content_vec)
data['content_vec'] = [row.tolist() for row in content_vec_arr]

# Deliverable frame: article id and its embedding only.
final_data = data.loc[:, ['art_id', 'content_vec']].copy()
final_data.head(10)

Unnamed: 0,art_id,content_vec
0,1,"[-0.31037163734436035, 0.3990325331687927, -0...."
1,2,"[-0.32287871837615967, 0.40140676498413086, -0..."
2,25,"[-0.285581111907959, 0.3660408854484558, -0.60..."
3,27,"[-0.28079551458358765, 0.33940136432647705, -0..."
4,28,"[-0.2760593295097351, 0.3473186194896698, -0.5..."
5,30,"[-0.3048103153705597, 0.3746854364871979, -0.7..."
6,31,"[-0.3086700141429901, 0.3691590130329132, -0.6..."
7,32,"[-0.30181318521499634, 0.37804317474365234, -0..."
8,34,"[-0.33670511841773987, 0.4235552251338959, -0...."
9,35,"[-0.3195928633213043, 0.3870709538459778, -0.6..."


In [None]:
# Export the (art_id, content_vec) pairs to Drive as a JSON list of records.
final_data.to_json(
    r'/content/drive/MyDrive/PIP 2021/JSON Livrables/Word embedding/Doc2Vec_v1.json', orient='records')

---