# Jalon 2 : Vectorisation et modélisation 

### Chargement du dataset 

In [26]:
import pandas as pd
import pickle

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text
from sklearn.decomposition import NMF



In [27]:
# Mount Google Drive at /content/drive; the dataset and the pickled artefacts
# used by the following cells are read from / written to paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [28]:
# Path of the cleaned review dataset (CSV) on the mounted Drive.
DATASET_FILE = "/content/drive/MyDrive/E3/TLN/dataset_cleaned.csv"

# Load the CSV into a DataFrame; the bare name on the last line displays it as the cell output.
dataset_cleaned = pd.read_csv(DATASET_FILE)
dataset_cleaned 

Unnamed: 0,text,stars,length,text_cleaned
0,I've only had food from here once and it wasn'...,1,68,food memorable panang curry balance flavor lik...
1,I will never return here again. Ever. I was ...,1,87,NOT_return ever sit booth wait dinner come scu...
2,I wish my experience was great as others. I di...,1,166,wish experience great others din wednesday nig...
3,Are the rosemary grapefruit scones supposed to...,1,81,rosemary grapefruit scone suppose taste like w...
4,Our takeout order was half wrong. Food was mis...,1,32,takeout order half wrong food miss portion siz...
...,...,...,...,...
24995,I was a loyal fan of Aroy before the ownership...,5,75,loyal fan aroy ownership change apprehensive v...
24996,Stopped here for a bite while wandering around...,5,55,stopped bite wander around faneuil hall pleasa...
24997,"A quiet place with excellent food, great music...",5,32,quiet place excellent food great music helpful...
24998,Super delicious food. Awesome vibe. I suffered...,5,41,super delicious food awesome vibe suffer disne...


# 1. Vectorisation 

In [29]:
# Keep only the negative reviews (1 or 2 stars) and select their cleaned text.
is_negative = dataset_cleaned["stars"].isin([1, 2])
dataset_cleaned_neg = dataset_cleaned.loc[is_negative, "text_cleaned"]
print(dataset_cleaned_neg)

0       food memorable panang curry balance flavor lik...
1       NOT_return ever sit booth wait dinner come scu...
2       wish experience great others din wednesday nig...
3       rosemary grapefruit scone suppose taste like w...
4       takeout order half wrong food miss portion siz...
                              ...                        
9995    NOT_order hot pot much vegetable NOT_enough me...
9996    work bmc hear new place open co worker decide ...
9997    went dinner drink good food could give well or...
9998    food mediocre NOT_horrible NOT_great sausage b...
9999    second time think food decent first time come ...
Name: text_cleaned, Length: 10000, dtype: object


In [30]:
# TF-IDF vectorisation of the (negative) reviews.

def vectoriser(dataset, max_df=.8, min_df=.01):
    """Fit a TF-IDF vectorizer on a collection of texts.

    Parameters
    ----------
    dataset : iterable of str
        Documents to vectorise (here, the cleaned review texts).
    max_df : float or int, default 0.8
        Forwarded to ``TfidfVectorizer``: terms whose document frequency is
        above this threshold are ignored.
    min_df : float or int, default 0.01
        Forwarded to ``TfidfVectorizer``: terms whose document frequency is
        below this threshold are ignored.

    Returns
    -------
    tuple
        ``(data_fit, data_features_name, vectorizer)``: the matrix returned
        by ``fit_transform``, the feature names returned by
        ``get_feature_names_out()``, and the fitted vectorizer itself.
    """
    vectorizer = TfidfVectorizer(max_df=max_df, min_df=min_df)
    data_fit = vectorizer.fit_transform(dataset)
    data_features_name = vectorizer.get_feature_names_out()

    return data_fit, data_features_name, vectorizer

In [31]:
# Build and display the TF-IDF matrix.
# vectoriser() is called once and its three results are unpacked together:
# calling it once per result would fit the TF-IDF model three times, and the
# vectorizer kept for later would not be the instance that produced the matrix.
tf_idf_data_fit, tf_idf_data_features_name, tf_idf_vectorizer = vectoriser(dataset_cleaned_neg)

# Dense DataFrame view of the TF-IDF matrix, one column per vocabulary term.
matrice_vectorise = pd.DataFrame(tf_idf_data_fit.toarray(), columns=tf_idf_data_features_name)
matrice_vectorise

Unnamed: 0,00,10,100,11,12,15,18,20,25,30,...,wrong,yeah,year,yell,yelp,yes,yesterday,yet,young,zero
0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.392054,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.449125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9996,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9997,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9998,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# 2. Modélisation 

In [32]:
# Helper that prints the top words of every topic of a fitted model.

def display_topics(model, feature_names, num_top_words, topic_names=None):
    """Print, for each row of ``model.components_``, its highest-weighted terms.

    Parameters
    ----------
    model : object
        Any object exposing ``components_`` (one row of term weights per topic).
    feature_names : sequence of str
        Term for each column of ``model.components_``.
    num_top_words : int
        Number of terms printed per topic, in decreasing weight order.
    topic_names : mapping or sequence, optional
        Label looked up by topic index. When it is empty/None, or the label
        for a topic is falsy, the topic index is printed instead.
    """
    for topic_idx, weights in enumerate(model.components_):
        label = topic_names[topic_idx] if topic_names else None
        if label:
            print("\nTopic: '", label, "'")
        else:
            print("\nTopic ", topic_idx)

        # argsort is ascending, so walk it backwards to get the largest weights first.
        ranked_positions = weights.argsort()[:-num_top_words - 1:-1]
        top_terms = [feature_names[pos] for pos in ranked_positions]
        print(", ".join(top_terms))

In [33]:
# Fit the NMF topic model (15 topics) on the TF-IDF matrix of the negative reviews.

# A fixed random_state makes the factorisation reproducible between runs.
# The hand-written TOPICS labels are keyed by topic index, so an unseeded
# re-run could reorder the topics and silently mismatch the labels.
# NOTE(review): re-check the labels against the topics printed by the first seeded run.
nmf_model = NMF(n_components=15, random_state=42)

# Document-topic weights: one row per review, one column per topic.
doc_topic = nmf_model.fit_transform(matrice_vectorise)
df_doc_topic = pd.DataFrame(doc_topic)
print(df_doc_topic)



            0         1         2         3         4         5         6   \
0     0.022666  0.059785  0.000000  0.000000  0.035144  0.000000  0.000916   
1     0.000000  0.000000  0.000000  0.000000  0.000000  0.023439  0.000000   
2     0.000000  0.029695  0.000000  0.003404  0.016175  0.024094  0.000000   
3     0.014106  0.059784  0.000000  0.000000  0.000000  0.014988  0.000189   
4     0.028186  0.005280  0.000000  0.051476  0.016914  0.000000  0.000000   
...        ...       ...       ...       ...       ...       ...       ...   
9995  0.000000  0.033681  0.000510  0.003406  0.000000  0.000000  0.000000   
9996  0.035065  0.055921  0.063290  0.008249  0.000000  0.001815  0.001684   
9997  0.000000  0.052679  0.000000  0.035735  0.021293  0.033942  0.000559   
9998  0.000000  0.038048  0.010037  0.000000  0.016436  0.000000  0.000000   
9999  0.000000  0.018091  0.000000  0.002683  0.013365  0.038148  0.000000   

            7         8         9         10        11        1



In [34]:
# Print the 5 highest-weighted terms of each NMF topic.

display_topics(
    nmf_model,
    tf_idf_vectorizer.get_feature_names_out(),
    num_top_words=5,
)


Topic  0
place, like, really, get, look

Topic  1
good, taste, like, dish, sauce

Topic  2
pizza, crust, cheese, topping, slice

Topic  3
order, take, delivery, get, wrong

Topic  4
food, service, bad, good, slow

Topic  5
table, come, server, waitress, ask

Topic  6
burger, fry, bun, onion, cheese

Topic  7
wait, minute, hour, 30, 15

Topic  8
chicken, rice, fry, wing, sauce

Topic  9
bar, drink, beer, bartender, night

Topic  10
time, last, first, location, second

Topic  11
say, customer, tell, call, manager

Topic  12
sandwich, salad, cheese, bread, lettuce

Topic  13
sushi, roll, fish, rice, salmon

Topic  14
go, back, get, win, use


In [35]:
# Hand-written label (in French) for each NMF topic, keyed by topic index.
# The trailing spaces inside some labels are kept exactly as they were.
TOPICS = {
    0: 'Cadre du lieu',
    1: 'Plats en sauce',
    2: 'Menu pizza ',
    3: 'Service livraison et commandes',
    4: 'Qualité des plats ',
    5: 'Qualité du service',
    6: 'Menu burger',
    7: 'Temps attente',
    8: 'Menu chicken',
    9: 'Service bar ',
    10: 'Localisation du lieu',
    11: 'Relation client',
    12: 'Menu sandwich',
    13: 'Menu sushis',
    14: 'Clients revenus',
}

In [36]:
# Persist the trained NMF model as a pickle file on the mounted Drive.
# NOTE(review): the file name contains a space ("model _entraine_Lehna"); it is
# kept unchanged here because code loading the model may rely on this exact path.

with open('/content/drive/MyDrive/E3/TLN/model _entraine_Lehna', mode='wb') as model_file:
    pickle.dump(nmf_model, model_file)

In [37]:
# Persist the fitted TF-IDF vectorizer as a pickle file on the mounted Drive,
# next to the trained model.

with open('/content/drive/MyDrive/E3/TLN/vectoriseur_utilise_Lehna', mode='wb') as vectorizer_file:
    pickle.dump(tf_idf_vectorizer, vectorizer_file)
