Created on Monday 11 January 2021  

**Group 3 - Representation**  
**The objective of this notebook is to create 1-gram and 2-gram TF-IDF representations.** 

@authors : Fatima Seck, Jingmeng Yang, Sacha Di Rienzo

---

In [None]:
# Install the French lemmatization dependencies used by this notebook.
# NOTE(review): none of these installs is version-pinned, so a fresh run may
# pull different releases than the ones this notebook was written against.
!pip install git+https://github.com/ClaudeCoulombe/FrenchLefffLemmatizer.git
# French spaCy model, downloaded under its full name and under the 'fr' name
# that spacy.load('fr') uses further down.
!python -m spacy download fr_core_news_sm
!python -m spacy download fr
# Package that provides the LefffLemmatizer imported below.
!pip install spacy_lefff

# Import libraries

In [None]:
import pandas as pd
import pickle
import unicodedata
from string import punctuation
from scipy.spatial import distance
from french_lefff_lemmatizer.french_lefff_lemmatizer import FrenchLefffLemmatizer
import warnings 
import spacy
from spacy_lefff import LefffLemmatizer, POSTagger
from itertools import combinations 
from tqdm import tqdm 
from collections import Counter

#nltk
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords,words
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('words')

#sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Silence every warning category for the rest of the session
warnings.simplefilter('ignore')

# I Data Import and cleaning

## I.1 Import the cleaned and lemmatized article content

In [None]:
# Work from the Drive folder that holds the input JSON files
%cd /content/drive/MyDrive/Colab Notebooks/

/content/drive/MyDrive/Colab Notebooks


In [None]:
# Read the de-duplicated articles and the cleaned + lemmatized articles
df_1 = pd.read_json('df_deduplicated_v4.json')
df_2 = pd.read_json('df_final_clean_with_lem_v0.json')

# Left-join on art_id, then in one chain:
#   - keep art_id, the original content and the lemmatized content
#   - rename the suffixed content column back to 'art_content'
#   - cast art_id to int
#   - drop articles without lemmatized content
#   - rebuild a 0..n-1 index so it matches the remaining rows
df_with_lem = (
    df_1.merge(df_2, how='left', left_on='art_id', right_on='art_id')
    .loc[:, ['art_id', 'art_content_x', 'art_content_clean_with_lem']]
    .rename(columns={'art_content_x': 'art_content'})
    .astype({'art_id': int})
    .dropna(subset=['art_content_clean_with_lem'])
    .reset_index(drop=True)
)
df_with_lem

Unnamed: 0,art_id,art_content,art_content_clean_with_lem
0,1,La FNCDG et l’ANDCDG ont publié en septembre l...,fncdg andcdg publier septembre eme edition pa...
1,2,Malgré la levée des mesures de confinement le ...,malgre levee mesure confinement mai plupart m...
2,25,Quels étaient les objectifs poursuivis par le ...,quels etaient objectif poursuivre gouvernemen...
3,27,"La journée thématique, qui aura lieu durant le...",journee thematique lieu durant salon preventi...
4,28,La 1ère journée thématique en région sur le th...,ere journee thematique region theme ver nouve...
...,...,...,...
7476,12256,01/10/2020 - 18:20 Ouverture le 2 octobre 2020...,ouverture octobre offre public achat volontai...
7477,12257,MEDICREA : Ouverture de l'offre publique d'ach...,medicrea ouverture offrir public achat volont...
7478,12258,© Fournis par La Tribune 14 startups différent...,fournir tribune startups differentes reussi m...
7479,12259,Ce communiqué ne constitue pas une offre d'acq...,communique constituer offrir acquerir titre c...


## I.2 Import and clean article titles

In [None]:
# Import dataset and keep only the article id and title.
# .copy() makes this an independent frame, so adding the 'art_title_prepd'
# column later does not write into a slice of the loaded data.
df_deduplicated = pd.read_json('df_deduplicated_v4.json')
df_deduplicated = df_deduplicated[['art_id', 'art_title']].copy()

# Spacy lemmatization: French pipeline with the Lefff lemmatizer appended
# as an extra component named 'lefff'
nlp = spacy.load('fr')
french_lemmatizer = LefffLemmatizer()
nlp.add_pipe(french_lemmatizer, name='lefff')


# Lemmatization function
def lemmatize_spacy(text: str) -> str:
    """Lemmatize a text token by token with the module-level ``nlp`` pipeline.

    Parameters:
     text: text to lemmatize

    Out:
     the lemmas concatenated in order, each one preceded by a single space
     (a non-empty result therefore starts with a space)

    """
    lemmas = []
    for token in nlp(text):
        # Use the Lefff lemma when it is set, otherwise keep the token itself
        lemmas.append(str(token._.lefff_lemma or token))
    return "".join(" " + lemma for lemma in lemmas)


def _strip_accents(text: str) -> str:
    """Return ``text`` without combining accent marks (same rule as preprocess_text)."""
    return ''.join(c for c in unicodedata.normalize('NFD', text)
                   if unicodedata.category(c) != 'Mn')


# French stop words. Titles have their accents removed before the stop-word
# filter runs, so the accent-free spelling of every stop word is added next to
# the original one; otherwise accented stop words would never match.
_french_stopwords = stopwords.words('french')
stop = sorted(set(_french_stopwords)
              | {_strip_accents(word) for word in _french_stopwords})
# Characters replaced by a space during cleaning: ASCII punctuation plus
# typographic symbols
punctuations = punctuation + "’”“‘…„—᾿‐–‑′•›‹⁄―‚→（）『』》《。↓↵'͞ʻʿ'"+'″￼'


def preprocess_text(sen: str, punct_chars: str = None) -> str:
    """Normalize a sentence for TF-IDF vectorization.

    Steps, in order: lowercase, trim, replace punctuation by spaces, strip
    accents, remove digits, then drop words shorter than 2 or longer than 49
    characters and collapse whitespace.

    Parameters:
      sen: sentence to preprocess; None or NaN is treated as an empty sentence
      punct_chars: characters to replace by a space; defaults to the
        module-level ``punctuations`` string

    Out:
      out: the cleaned sentence, words separated by single spaces

    """
    # Missing titles (None / NaN) have nothing to clean
    if sen is None or (isinstance(sen, float) and sen != sen):
        return ""
    if punct_chars is None:
        punct_chars = punctuations

    # Convert to lowercase and remove leading/trailing whitespace
    sentence = str(sen).lower().strip()

    # Replace every punctuation character by a space in a single pass
    sentence = sentence.translate({ord(p): " " for p in punct_chars})

    # Remove accents: decompose, then drop the combining marks
    sentence = ''.join(c for c in unicodedata.normalize('NFD', sentence)
                       if unicodedata.category(c) != 'Mn')

    # Remove digits
    sentence = ''.join(c for c in sentence if not c.isdigit())

    # Keep words whose length is between 2 and 49 characters
    return ' '.join(w for w in sentence.split() if 1 < len(w) < 50)


# Clean the raw titles (case, punctuation, accents, digits, word length)
df_deduplicated['art_title_prepd'] = df_deduplicated['art_title'].apply(
    preprocess_text)
# Remove French stop words
df_deduplicated['art_title_prepd'] = df_deduplicated['art_title_prepd'].apply(
    lambda title: ' '.join(word for word in title.split() if word not in stop))
# Lemmatize. progress_apply advances the bar while the titles are processed;
# wrapping the finished result in tqdm() would only time the iteration over
# an already-computed Series.
tqdm.pandas(desc='Lemmatizing titles')
df_deduplicated['art_title_prepd'] = (
    df_deduplicated['art_title_prepd'].progress_apply(lemmatize_spacy))
df_deduplicated

2021-01-14 15:30:14,386 - spacy_lefff.lefff - INFO - New LefffLemmatizer instantiated.
2021-01-14 15:30:14,388 - spacy_lefff.lefff - INFO - Reading lefff data...
2021-01-14 15:30:15,003 - spacy_lefff.lefff - INFO - Successfully loaded lefff lemmatizer


100%|██████████| 7490/7490 [00:00<00:00, 969011.01it/s]


Unnamed: 0,art_id,art_title,art_title_prepd
0,1,9ème édition du Panorama de l’emploi territorial,eme edition panorama emploi territorial
1,2,ACTUALITÉS FNCDG / COVID19,actualites fncdg covid
2,25,"Interview de M. Olivier DUSSOPT, Secretaire d’...",interview olivier dussopt secretaire etat aup...
3,27,Journée Thématique FNCDG « Les services de san...,journee thematique fncdg service sante securi...
4,28,Journée Thématique FNCDG « Vers de nouveaux mo...,journee thematique fncdg vers nouveau modes g...
...,...,...,...
7485,12256,MEDICREA,medicrea
7486,12257,La bourse en ligne : MEDICREA : Ouverture de l...,bourse ligne medicrea ouverture offrir public...
7487,12258,"Ÿnsect, Mirakl, Sendinblue, ManoMano, Doctolib...",ynsect mirakl sendinblue manomano doctolib to...
7488,12259,Medicrea International : Mise à disposition No...,medicrea international mise disposition note ...


## I.3 Import the cleaned and stemmed data

In [None]:
# Load the concatenated (G1 + G2) cleaned dataset and keep only the article id
# and the stemmed content
df_concat = (
    pd.read_json("df_concat_G1_G2_v0_clean_V0.json")
    .loc[:, ['art_id', 'art_content_clean_with_stem']]
)
df_concat

Unnamed: 0,art_id,art_content_clean_with_stem
0,1,fncdg andcdg publ septembr eme edit panoram em...
1,2,malgr leve mesur confin mai plupart mesur sani...
2,25,quel etaient object poursuiv gouvern cadr cet ...
3,27,journe themat lieu dur salon preventic them se...
4,28,ere journe themat region them ver nouveau mod ...
...,...,...
7539,G2_usine-digitale_462,etre sur vill futur besoin infrastructur resil...
7540,G2_usine-digitale_517,necessair pris conscienc vill enjeux cybersecu...
7541,G2_usine-digitale_696,etre sur vill futur besoin infrastructur resil...
7542,G2_usine-digitale_785,comment nouvel mobilit vont facon vill futur c...


# II 1-GRAM TF-IDF

## II.1 Lemmatized data

In [None]:
# Unigram TF-IDF vectorizer limited to 1500 features
vectorizer_1g = TfidfVectorizer(
    ngram_range=(1, 1),
    use_idf=True,
    max_features=1500,
)

In [None]:
# Create the 1-gram TF-IDF matrix on the lemmatized content
# (vectorizer_1g keeps the top 1500 features ordered by term frequency)
vect_tf_with_lem_1g = vectorizer_1g.fit_transform(df_with_lem['art_content_clean_with_lem'])
# One dense TF-IDF vector (list of floats) per article. The frame reuses the
# index of df_with_lem so the index-based merge below pairs each vector with
# its own article; no per-feature columns or feature names are needed.
tf_idf_1 = pd.DataFrame(
    {'tf_idf_1_gram': vect_tf_with_lem_1g.toarray().tolist()},
    index=df_with_lem.index)
# Merge DataFrames, keeping only art_id and tf_idf_1_gram
df_with_lem_tf_idf_1 = pd.merge(df_with_lem[['art_id']], tf_idf_1, left_index=True, right_index=True)
df_with_lem_tf_idf_1 = df_with_lem_tf_idf_1.reset_index(drop=True)
# Every article must end up with exactly one vector
assert len(df_with_lem_tf_idf_1) == len(df_with_lem)
df_with_lem_tf_idf_1

Unnamed: 0,art_id,tf_idf_1_gram
0,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,25,"[0.0, 0.03073946656325203, 0.0, 0.0, 0.0, 0.0,..."
3,27,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,28,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...
7476,12256,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.037..."
7477,12257,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.035..."
7478,12258,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.031461122156988694..."
7479,12259,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


## II.2 Title Data

In [None]:
# Create the 1-gram TF-IDF matrix on the preprocessed titles
# (vectorizer_1g keeps the top 1500 features ordered by term frequency)
vect_title_1g = vectorizer_1g.fit_transform(df_deduplicated['art_title_prepd'])
# One dense TF-IDF vector (list of floats) per title, indexed like
# df_deduplicated so the index-based merge below stays row-aligned; no
# per-feature columns or feature names are needed.
tf_idf_1_gram_title = pd.DataFrame(
    {'tf_idf_title_1_gram': vect_title_1g.toarray().tolist()},
    index=df_deduplicated.index)
# Merge DataFrames, keeping only art_id and tf_idf_title_1_gram
df_deduplicated_tf_idf_1g_title = pd.merge(df_deduplicated[['art_id']], tf_idf_1_gram_title, left_index=True, right_index=True)
# Every title must end up with exactly one vector
assert len(df_deduplicated_tf_idf_1g_title) == len(df_deduplicated)
df_deduplicated_tf_idf_1g_title

Unnamed: 0,art_id,tf_idf_title_1_gram
0,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,25,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,27,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,28,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...
7485,12256,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
7486,12257,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
7487,12258,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
7488,12259,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


## II.3 Stemmed Data

In [None]:
# Create the 1-gram TF-IDF matrix on the stemmed content
# (vectorizer_1g keeps the top 1500 features ordered by term frequency)
vect_concat_1g = vectorizer_1g.fit_transform(df_concat['art_content_clean_with_stem'])
# One dense TF-IDF vector (list of floats) per article, indexed like df_concat
# so the index-based merge below stays row-aligned; no per-feature columns or
# feature names are needed.
tf_idf_1_gram_concat = pd.DataFrame(
    {'tf_idf_1_gram': vect_concat_1g.toarray().tolist()},
    index=df_concat.index)
# Merge DataFrames, keeping only art_id and tf_idf_1_gram
df_concat_tf_idf_1g = pd.merge(df_concat[['art_id']], tf_idf_1_gram_concat, left_index=True, right_index=True)
df_concat_tf_idf_1g = df_concat_tf_idf_1g.reset_index(drop=True)
# Every article must end up with exactly one vector
assert len(df_concat_tf_idf_1g) == len(df_concat)
df_concat_tf_idf_1g

Unnamed: 0,art_id,tf_idf_1_gram
0,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,25,"[0.018063225702737783, 0.0, 0.0, 0.0, 0.0, 0.0..."
3,27,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,28,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...
7539,G2_usine-digitale_462,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
7540,G2_usine-digitale_517,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
7541,G2_usine-digitale_696,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
7542,G2_usine-digitale_785,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


# III 2-GRAM TF-IDF 

## III.1 Lemmatized Data

In [None]:
# Bigram-only TF-IDF vectorizer limited to 1500 features
vectorizer_2g = TfidfVectorizer(
    ngram_range=(2, 2),
    use_idf=True,
    max_features=1500,
)

In [None]:
# TF-IDF 2 gram -> ngram_range = (2, 2)
# 1500 max_features ordered by term frequency across the corpus.
vect_tf_with_lem_2g = vectorizer_2g.fit_transform(df_with_lem['art_content_clean_with_lem'])
# One dense TF-IDF vector (list of floats) per article, indexed like
# df_with_lem so the index-based merge below stays row-aligned; no per-feature
# columns or feature names are needed.
tf_idf_2 = pd.DataFrame(
    {'TF_IDF_2_gram': vect_tf_with_lem_2g.toarray().tolist()},
    index=df_with_lem.index)
# Merge DataFrames, keeping only art_id and TF_IDF_2_gram
df_with_lem_tf_idf_2 = pd.merge(df_with_lem[['art_id']], tf_idf_2, left_index=True, right_index=True)
df_with_lem_tf_idf_2 = df_with_lem_tf_idf_2.reset_index(drop=True)
# Every article must end up with exactly one vector
assert len(df_with_lem_tf_idf_2) == len(df_with_lem)
df_with_lem_tf_idf_2

Unnamed: 0,art_id,TF_IDF_2_gram
0,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,25,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,27,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,28,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...
7476,12256,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
7477,12257,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
7478,12258,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
7479,12259,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


## III.2 Title Data

In [None]:
# TF-IDF 2 gram -> ngram_range = (2, 2)
# 1500 max_features ordered by term frequency across the corpus.
vect_title_2g = vectorizer_2g.fit_transform(df_deduplicated['art_title_prepd'])
# One dense TF-IDF vector (list of floats) per title, indexed like
# df_deduplicated so the index-based merge below stays row-aligned; no
# per-feature columns or feature names are needed.
tf_idf_2_gram_title = pd.DataFrame(
    {'tf_idf_title_2_gram': vect_title_2g.toarray().tolist()},
    index=df_deduplicated.index)
# Merge DataFrames, keeping only art_id and tf_idf_title_2_gram
df_deduplicated_tf_idf_2g_title = pd.merge(df_deduplicated[['art_id']], tf_idf_2_gram_title, left_index=True, right_index=True)
# Every title must end up with exactly one vector
assert len(df_deduplicated_tf_idf_2g_title) == len(df_deduplicated)
df_deduplicated_tf_idf_2g_title

Unnamed: 0,art_id,tf_idf_title_2_gram
0,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,25,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,27,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,28,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...
7485,12256,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
7486,12257,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
7487,12258,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
7488,12259,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


## III.3 Stemmed Data

In [None]:
# TF-IDF 2 gram -> ngram_range = (2, 2)
# 1500 max_features ordered by term frequency across the corpus.
vect_concat_2g = vectorizer_2g.fit_transform(
    df_concat['art_content_clean_with_stem'])
# One dense TF-IDF vector (list of floats) per article, indexed like df_concat
# so the index-based merge below stays row-aligned; no per-feature columns or
# feature names are needed.
tf_idf_2_gram_concat = pd.DataFrame(
    {'TF_IDF_2_gram': vect_concat_2g.toarray().tolist()},
    index=df_concat.index)
# Merge DataFrames, keeping only art_id and TF_IDF_2_gram
df_concat_tf_idf_2g = pd.merge(
    df_concat[['art_id']], tf_idf_2_gram_concat, left_index=True, right_index=True)
df_concat_tf_idf_2g = df_concat_tf_idf_2g.reset_index(drop=True)
# Every article must end up with exactly one vector
assert len(df_concat_tf_idf_2g) == len(df_concat)
df_concat_tf_idf_2g

Unnamed: 0,art_id,TF_IDF_2_gram
0,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,25,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,27,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,28,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...
7539,G2_usine-digitale_462,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
7540,G2_usine-digitale_517,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
7541,G2_usine-digitale_696,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
7542,G2_usine-digitale_785,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


# IV Export data

## IV.1 1-GRAM TF-IDF models 

In [None]:
# Export each 1-gram result: the (art_id, vector) table as JSON records and the
# matching pickle file. The pickle files are written inside `with` blocks so
# every handle is flushed and closed.
# NOTE(review): the objects pickled here are the TF-IDF matrices returned by
# fit_transform, not fitted vectorizers, although the file names say "model" —
# confirm what the consumers of these .sav files expect.

# Lemmatized content
df_with_lem_tf_idf_1.to_json(
    r'df_TF_IDF_1_GRAM_df_with_lem_vf.json', orient='records')
filename = 'model_Tf-Idf_1-gram_df_clean_with_lem_vf.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(vect_tf_with_lem_1g, model_file)

# Titles
df_deduplicated_tf_idf_1g_title.to_json(
    r'df_TF_IDF_TITLE_1_GRAM_vf.json', orient='records')
filename = 'model_Tf-Idf_1-gram_title_vf.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(vect_title_1g, model_file)

# Stemmed content
df_concat_tf_idf_1g.to_json(
    r'df_TF_IDF_1_GRAM_df_concat_G1_G2_clean_stem_vf.json', orient='records')
filename = 'model_Tf-Idf_1-gram_df_concat_G1_G2_vf.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(vect_concat_1g, model_file)

## IV.2 2-GRAM TF-IDF models 

In [None]:
# Export each 2-gram result: the (art_id, vector) table as JSON records and the
# matching pickle file. The pickle files are written inside `with` blocks so
# every handle is flushed and closed.
# NOTE(review): the objects pickled here are the TF-IDF matrices returned by
# fit_transform, not fitted vectorizers, although the file names say "model" —
# confirm what the consumers of these .sav files expect.

# Lemmatized content
df_with_lem_tf_idf_2.to_json(
    r'df_TF_IDF_2_GRAM_df_with_lem_vf.json', orient='records')
filename = 'model_Tf-Idf_2-gram_df_clean_with_lem_vf.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(vect_tf_with_lem_2g, model_file)

# Titles
df_deduplicated_tf_idf_2g_title.to_json(
    r'df_TF_IDF_TITLE_2_GRAM_vf.json', orient='records')
filename = 'model_Tf-Idf_2-gram_title_vf.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(vect_title_2g, model_file)

# Stemmed content
df_concat_tf_idf_2g.to_json(
    r'df_TF_IDF_2_GRAM_df_concat_G1_G2_clean_stem_vf.json', orient='records')
filename = 'model_Tf-Idf_2-gram_df_concat_G1_G2_vf.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(vect_concat_2g, model_file)

---