Created on Monday 11 January 2021  

**Group 3 - Representation**  
**The objective of this notebook is to create a BOW (Bag Of Words) representation** 

@authors : Fatima Seck, Hassan Hadda

## Import Libraries

In [None]:
import numpy as np
import pickle
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import string    
from collections import Counter
from google.colab import drive

# I- Import & Preparation of data

In [None]:
# Mount Google Drive at /content/drive so the project files stored on Drive
# can be read and written from this Colab runtime.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Absolute path to the Drive root (Drive is mounted at /content/drive).
# An absolute path keeps the data-loading cells working even after the working
# directory is changed by the `cd` in the export section.
# The trailing slash is required: file names are appended with plain `+`.
DATA_PATH = '/content/drive/MyDrive/'

## I.1 Importing & Cleaning Data df_concat_G1_G2_clean

In [None]:
# Load the cleaned, stemmed corpus.
df_concat = pd.read_json(DATA_PATH + "df_concat_G1_G2_v0_clean_V0.json")
# Keep only the article id and its stemmed text.
df_concat = df_concat[["art_id", "art_content_clean_with_stem"]]
# Report the rows holding a missing value (this lookup used to be computed
# mid-cell and silently discarded).
rows_with_nan = df_concat[df_concat.isna().any(axis=1)]
print(f"Rows with missing values: {len(rows_with_nan)}")
# Drop articles without stemmed text, then rebuild a contiguous 0..n-1 index.
# The BOW frames built later are indexed 0..n-1 and are joined back with
# pd.concat(axis=1), which aligns on index labels: gaps left by dropna would
# attach vectors to the wrong articles.
df_concat = (
    df_concat
    .dropna(subset=['art_content_clean_with_stem'])
    .reset_index(drop=True)
)
df_concat

Unnamed: 0,art_id,art_content_clean_with_stem
0,1,fncdg andcdg publ septembr eme edit panoram em...
1,2,malgr leve mesur confin mai plupart mesur sani...
2,25,quel etaient object poursuiv gouvern cadr cet ...
3,27,journe themat lieu dur salon preventic them se...
4,28,ere journe themat region them ver nouveau mod ...
...,...,...
7539,G2_usine-digitale_462,etre sur vill futur besoin infrastructur resil...
7540,G2_usine-digitale_517,necessair pris conscienc vill enjeux cybersecu...
7541,G2_usine-digitale_696,etre sur vill futur besoin infrastructur resil...
7542,G2_usine-digitale_785,comment nouvel mobilit vont facon vill futur c...


## I.2 Importing & Cleaning Data df_without_lem

In [None]:
# Raw (deduplicated) articles and their cleaned, non-lemmatized texts.
df_1 = pd.read_json(DATA_PATH + "df_deduplicated_v4.json")
df_2 = pd.read_json(DATA_PATH + "df_final_clean_without_lem_v0.json")
# Left join on art_id: every deduplicated article is kept, with NaN in the
# cleaned column when df_2 has no matching row.
df_without_lem = df_1.merge(df_2, how='left', on='art_id')
# Keep columns: art_id, art_content_x (raw text coming from df_1) and
# art_content_clean_without_lem.
df_without_lem = df_without_lem[["art_id", "art_content_x", "art_content_clean_without_lem"]]
df_without_lem = df_without_lem.rename(columns={'art_content_x': 'art_content'})
df_without_lem["art_id"] = df_without_lem["art_id"].astype(int)

# Report the rows holding a missing value (this lookup used to be computed
# mid-cell and silently discarded).
rows_with_nan = df_without_lem[df_without_lem.isna().any(axis=1)]
print(f"Rows with missing values: {len(rows_with_nan)}")
# Drop articles without cleaned text, then rebuild a contiguous 0..n-1 index.
# The BOW frames built later are indexed 0..n-1 and are joined back with
# pd.concat(axis=1), which aligns on index labels: gaps left by dropna would
# attach vectors to the wrong articles.
df_without_lem = (
    df_without_lem
    .dropna(subset=['art_content_clean_without_lem'])
    .reset_index(drop=True)
)
df_without_lem

Unnamed: 0,art_id,art_content,art_content_clean_without_lem
0,1,La FNCDG et l’ANDCDG ont publié en septembre l...,fncdg andcdg publie septembre eme edition pano...
1,2,Malgré la levée des mesures de confinement le ...,malgre levee mesures confinement mai plupart m...
2,25,Quels étaient les objectifs poursuivis par le ...,quels etaient objectifs poursuivis gouvernemen...
3,27,"La journée thématique, qui aura lieu durant le...",journee thematique lieu durant salon preventic...
4,28,La 1ère journée thématique en région sur le th...,ere journee thematique region theme vers nouve...
...,...,...,...
7485,12256,01/10/2020 - 18:20 Ouverture le 2 octobre 2020...,ouverture octobre offre publique achat volonta...
7486,12257,MEDICREA : Ouverture de l'offre publique d'ach...,medicrea ouverture offre publique achat volont...
7487,12258,© Fournis par La Tribune 14 startups différent...,fournis tribune startups differentes reussi me...
7488,12259,Ce communiqué ne constitue pas une offre d'acq...,communique constitue offre acquerir titres com...


# II. 1-Gram Bag of Words 

## II.1 1-Gram BOW  of data with stem

In [None]:
# Unigram CountVectorizer used to build the bag-of-words matrices.
# min_df=0.009 is passed as a float, i.e. a document-frequency proportion
# threshold below which terms are ignored.
count_1g = CountVectorizer(ngram_range=(1, 1), min_df=0.009)

In [None]:
# Fit the unigram vectorizer on the stemmed articles and count term occurrences
# per document, then materialize the result as a dense array.
bag_of_words_concat_1g = count_1g.fit_transform(
    df_concat["art_content_clean_with_stem"])
matrix_bow_concat_1g = bag_of_words_concat_1g.toarray()
matrix_bow_concat_1g

In [None]:
# Column labels are the vocabulary learned by count_1g.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement and
# fall back to the old name only on releases that predate it.
if hasattr(count_1g, "get_feature_names_out"):
    feature_names = list(count_1g.get_feature_names_out())
else:
    feature_names = count_1g.get_feature_names()
# One row per article, one column per term. The frame reuses df_concat's index
# so that the later pd.concat(axis=1), which aligns on index labels, pairs each
# vector with the article it was computed from.
bow_concat_1g = pd.DataFrame(
    matrix_bow_concat_1g, columns=feature_names, index=df_concat.index)
bow_concat_1g

In [None]:
# Collapse each article's term counts into a single list stored in 'BOW_1-gram'.
# The lists are taken straight from the dense matrix (the same values the wide
# frame holds), which makes the cell safe to re-run: reading them back from
# bow_concat_1g would, on a second run, wrap the already-collapsed column in
# another list.
bow_concat_1g = pd.DataFrame(
    {'BOW_1-gram': matrix_bow_concat_1g.tolist()},
    index=bow_concat_1g.index,
)
bow_concat_1g

In [None]:
# Final table: art_id plus its 1-gram BOW vector.
# Vectors are attached by position (row i of the BOW frame belongs to row i of
# df_concat) instead of pd.concat(axis=1), which aligns on index labels and
# mispairs rows whenever df_concat's index has gaps left by dropna.
# A length mismatch raises instead of silently producing NaN rows.
df_concat_1g = df_concat[['art_id']].assign(
    **{'BOW_1-gram': bow_concat_1g['BOW_1-gram'].tolist()})
df_concat_1g  # the final data with art_id & BOW_1-gram

Unnamed: 0,art_id,BOW_1-gram
0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,25,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, ..."
3,27,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,28,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...
7539,G2_usine-digitale_462,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
7540,G2_usine-digitale_517,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
7541,G2_usine-digitale_696,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
7542,G2_usine-digitale_785,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


## II.2 1-Gram BOW of cleaned data (without lemmatization or stemming)

In [None]:
# 1-gram BOW for the cleaned text (no lemmatization, no stemming).
# NOTE(review): this refits count_1g, so the vocabulary it learned on the
# stemmed corpus is replaced by the one learned here.
bag_of_words_without_lem_1g = count_1g.fit_transform(
    df_without_lem["art_content_clean_without_lem"])
# Dense copy of the count matrix.
matrix_bow_without_lem_1g = bag_of_words_without_lem_1g.toarray()
matrix_bow_without_lem_1g

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 2],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1]])

In [None]:
# Vocabulary learned by count_1g on the cleaned corpus.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement and
# fall back to the old name only on releases that predate it.
if hasattr(count_1g, "get_feature_names_out"):
    feature_names = list(count_1g.get_feature_names_out())
else:
    feature_names = count_1g.get_feature_names()

# One 'BOW_1-gram' list of term counts per article, built directly from the
# dense matrix (the wide per-term frame was only an intermediate step).
# The frame reuses df_without_lem's index so that the later pd.concat(axis=1),
# which aligns on index labels, pairs each vector with its own article.
bow_without_lem_1g = pd.DataFrame(
    {'BOW_1-gram': matrix_bow_without_lem_1g.tolist()},
    index=df_without_lem.index,
)
bow_without_lem_1g

Unnamed: 0,BOW_1-gram
0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...
7476,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
7477,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
7478,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
7479,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
# Final table: art_id plus its 1-gram BOW vector.
# Vectors are attached by position (row i of the BOW frame belongs to row i of
# df_without_lem). pd.concat(axis=1) aligned on index labels instead; because
# dropna left gaps in df_without_lem's index, vectors were paired with the
# wrong articles and the trailing dropna only hid the unmatched rows.
# A length mismatch raises instead of silently producing NaN rows.
df_without_lem_1g = df_without_lem[['art_id']].assign(
    **{'BOW_1-gram': bow_without_lem_1g['BOW_1-gram'].tolist()})
df_without_lem_1g  # the final data with art_id & BOW_1-gram

# III. 2-Gram Bag of Words

In [None]:
# Bigram-only CountVectorizer (ngram_range=(2, 2)); min_df=0.009 is passed as a
# float, i.e. a document-frequency proportion threshold.
count_2g = CountVectorizer(min_df=0.009, ngram_range=(2, 2))

## III.1 2-Gram BOW  of data with stem

In [None]:
# 2-gram BOW for the stemmed text: fit the bigram vectorizer, count occurrences
# per article, then materialize the result as a dense array.
bag_of_words_concat_2g = count_2g.fit_transform(
    df_concat["art_content_clean_with_stem"])
matrix_bow_concat_2g = bag_of_words_concat_2g.toarray()
matrix_bow_concat_2g

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
# Vocabulary (bigrams) learned by count_2g on the stemmed corpus.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement and
# fall back to the old name only on releases that predate it.
if hasattr(count_2g, "get_feature_names_out"):
    feature_names = list(count_2g.get_feature_names_out())
else:
    feature_names = count_2g.get_feature_names()

# One 'BOW_2-gram' list of bigram counts per article, built directly from the
# dense matrix (the wide per-term frame was only an intermediate step).
# The frame reuses df_concat's index so that the later pd.concat(axis=1),
# which aligns on index labels, pairs each vector with its own article.
bow_concat_2g = pd.DataFrame(
    {'BOW_2-gram': matrix_bow_concat_2g.tolist()},
    index=df_concat.index,
)
bow_concat_2g

Unnamed: 0,BOW_2-gram
0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...
7539,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
7540,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
7541,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
7542,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
# Final table: art_id plus its 2-gram BOW vector.
# Vectors are attached by position (row i of the BOW frame belongs to row i of
# df_concat) instead of pd.concat(axis=1), which aligns on index labels and
# mispairs rows whenever df_concat's index has gaps left by dropna.
# A length mismatch raises instead of silently producing NaN rows.
df_concat_2g = df_concat[['art_id']].assign(
    **{'BOW_2-gram': bow_concat_2g['BOW_2-gram'].tolist()})
df_concat_2g

Unnamed: 0,art_id,BOW_2-gram
0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,25,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,27,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,28,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...
7539,G2_usine-digitale_462,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
7540,G2_usine-digitale_517,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
7541,G2_usine-digitale_696,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
7542,G2_usine-digitale_785,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


## III.2 2-Gram BOW of cleaned data (without lemmatization or stemming)

In [None]:
# 2-gram BOW for the cleaned text (no lemmatization, no stemming).
# NOTE(review): this refits count_2g, so the vocabulary it learned on the
# stemmed corpus is replaced by the one learned here.
bag_of_words_without_lem_2g = count_2g.fit_transform(
    df_without_lem["art_content_clean_without_lem"])
# Dense copy of the count matrix.
matrix_bow_without_lem_2g = bag_of_words_without_lem_2g.toarray()
matrix_bow_without_lem_2g

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
# Vocabulary (bigrams) learned by count_2g on the cleaned corpus.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement and
# fall back to the old name only on releases that predate it.
if hasattr(count_2g, "get_feature_names_out"):
    feature_names = list(count_2g.get_feature_names_out())
else:
    feature_names = count_2g.get_feature_names()
# One 'BOW_2-gram' list of bigram counts per article, built directly from the
# dense matrix (the wide per-term frame was only an intermediate step).
# The frame reuses df_without_lem's index so that the later pd.concat(axis=1),
# which aligns on index labels, pairs each vector with its own article.
bow_without_lem_2g = pd.DataFrame(
    {'BOW_2-gram': matrix_bow_without_lem_2g.tolist()},
    index=df_without_lem.index,
)
bow_without_lem_2g

Unnamed: 0,BOW_2-gram
0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...
7476,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
7477,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
7478,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
7479,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
# Final table: art_id plus its 2-gram BOW vector.
# Vectors are attached by position (row i of the BOW frame belongs to row i of
# df_without_lem). pd.concat(axis=1) aligned on index labels instead; because
# dropna left gaps in df_without_lem's index, vectors were paired with the
# wrong articles, art_id was upcast to float by the unmatched rows, and the
# trailing dropna only hid the problem.
# A length mismatch raises instead of silently producing NaN rows.
df_without_lem_2g = df_without_lem[['art_id']].assign(
    **{'BOW_2-gram': bow_without_lem_2g['BOW_2-gram'].tolist()})
df_without_lem_2g

Unnamed: 0,art_id,BOW_2-gram
0,1.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,2.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,25.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
3,27.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,28.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...
7476,12239.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
7477,12241.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
7478,12245.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
7479,12247.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


# IV. Export of BOW Data & Models

In [None]:
cd /content/drive/MyDrive/Colab Notebooks/

/content/drive/MyDrive/Colab Notebooks


## IV.1 Export data BOW

In [None]:
# Write the stemmed-corpus 1-gram BOW table (art_id + vector) to a JSON file
# in the current working directory.
df_concat_1g.to_json(path_or_buf="df_BOW_1-gram_df_concat_G1_G2_vf.json")

In [None]:
# Write the stemmed-corpus 2-gram BOW table (art_id + vector) to a JSON file
# in the current working directory.
df_concat_2g.to_json(path_or_buf="df_BOW_2-gram_df_concat_G1_G2_vf.json")

In [None]:
# Write the cleaned-corpus 1-gram BOW table (art_id + vector) to a JSON file
# in the current working directory.
df_without_lem_1g.to_json(path_or_buf="df_BOW_1-gram_df_clean_without_lem_vf.json")

In [None]:
# Write the cleaned-corpus 2-gram BOW table (art_id + vector) to a JSON file
# in the current working directory.
df_without_lem_2g.to_json(path_or_buf="df_BOW_2-gram_df_clean_without_lem_vf.json")

## IV.2 Model exportation

In [None]:
# Pickle bag_of_words_concat_1g to disk.
# NOTE(review): bag_of_words_concat_1g is the count matrix returned by
# count_1g.fit_transform, not the fitted vectorizer; confirm this is the object
# downstream consumers expect under the name "model".
model_df_concat_1g = 'model_BOW_1-gram_df_concat_G1_G2_vf.sav'
# The context manager closes (and flushes) the file even if pickling fails.
with open(model_df_concat_1g, 'wb') as model_file:
    pickle.dump(bag_of_words_concat_1g, model_file)

In [None]:
# Pickle bag_of_words_concat_2g to disk.
# NOTE(review): bag_of_words_concat_2g is the count matrix returned by
# count_2g.fit_transform, not the fitted vectorizer; confirm this is the object
# downstream consumers expect under the name "model".
model_df_concat_2g = 'model_BOW_2-gram_df_concat_G1_G2_vf.sav'
# The context manager closes (and flushes) the file even if pickling fails.
with open(model_df_concat_2g, 'wb') as model_file:
    pickle.dump(bag_of_words_concat_2g, model_file)

In [None]:
# Pickle bag_of_words_without_lem_1g to disk.
# NOTE(review): bag_of_words_without_lem_1g is the count matrix returned by
# count_1g.fit_transform, not the fitted vectorizer; confirm this is the object
# downstream consumers expect under the name "model".
model_df_wo_lem_1g = 'model_BOW_1-gram_df_clean_without_lem_vf.sav'
# The context manager closes (and flushes) the file even if pickling fails.
with open(model_df_wo_lem_1g, 'wb') as model_file:
    pickle.dump(bag_of_words_without_lem_1g, model_file)

In [None]:
# Pickle bag_of_words_without_lem_2g to disk.
# NOTE(review): bag_of_words_without_lem_2g is the count matrix returned by
# count_2g.fit_transform, not the fitted vectorizer; confirm this is the object
# downstream consumers expect under the name "model".
model_df_wo_lem_2g = 'model_BOW_2-gram_df_clean_without_lem_vf.sav'
# The context manager closes (and flushes) the file even if pickling fails.
with open(model_df_wo_lem_2g, 'wb') as model_file:
    pickle.dump(bag_of_words_without_lem_2g, model_file)