In [1]:
import pickle
from pickle import *

In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

from wordcloud import WordCloud

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import contractions

In [3]:
tokenizer = RegexpTokenizer(r'\w+')

def tokenize_text(text):
    text_processed = " ".join(tokenizer.tokenize(text))
    return text_processed

lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    
    tokens_tagged = nltk.pos_tag(nltk.word_tokenize(text))
    lemmatized_text_list = list()
    
    for word, tag in tokens_tagged:
        if tag.startswith('J'):
            lemmatized_text_list.append(lemmatizer.lemmatize(word,'a')) # Lemmatise adjectives. Not doing anything since we remove all adjective
        elif tag.startswith('V'):
            lemmatized_text_list.append(lemmatizer.lemmatize(word,'v')) # Lemmatise verbs
        elif tag.startswith('N'):
            lemmatized_text_list.append(lemmatizer.lemmatize(word,'n')) # Lemmatise nouns
        elif tag.startswith('R'):
            lemmatized_text_list.append(lemmatizer.lemmatize(word,'r')) # Lemmatise adverbs
        else:
            lemmatized_text_list.append(lemmatizer.lemmatize(word)) # If no tags has been found, perform a non specific lemmatisation
    
    return " ".join(lemmatized_text_list)


def normalize_text(text):
    return " ".join([word.lower() for word in text.split()])
def contraction_text(text):
    return contractions.fix(text)
negative_words = ['not', 'no', 'never', 'nor', 'hardly', 'barely']
negative_prefix = "NOT_"
def get_negative_token(text):
    tokens = text.split()
    negative_idx = [i+1 for i in range(len(tokens)-1) if tokens[i] in negative_words]
    for idx in negative_idx:
        if idx < len(tokens):
            tokens[idx]= negative_prefix + tokens[idx]
    
    tokens = [token for i,token in enumerate(tokens) if i+1 not in negative_idx]
    
    return " ".join(tokens)
from spacy.lang.en.stop_words import STOP_WORDS

def remove_stopwords(text):
    english_stopwords = stopwords.words("english") + list(STOP_WORDS) + ["tell", "restaurant"]
    
    return " ".join([word for word in text.split() if word not in english_stopwords])


In [4]:
def preprocess_text(text):
    
    # Tokenize review
    text = tokenize_text(text)
    
    # Lemmatize review
    text = lemmatize_text(text)
    
    # Normalize review
    text = normalize_text(text)
    
    # Remove contractions
    text = contraction_text(text)

    # Get negative tokens
    text = get_negative_token(text)
    
    # Remove stopwords
    text = remove_stopwords(text)
    
    return text
    

In [5]:
import pandas as pd
dataset_df=pd.read_csv("dataset.csv")

In [6]:
%%time
dataset_df["text_cleaned"] = dataset_df["text"].apply(preprocess_text)
dataset_df

CPU times: total: 2min 24s
Wall time: 2min 24s


Unnamed: 0,text,stars,text_cleaned
0,I've only had food from here once and it wasn'...,1,food memorable panang curry balance flavor lik...
1,I will never return here again. Ever. I was ...,1,NOT_return sit booth wait dinner come scurry m...
2,I wish my experience was great as others. I di...,1,wish experience great din wednesday night week...
3,Are the rosemary grapefruit scones supposed to...,1,rosemary grapefruit scone suppose taste like b...
4,Our takeout order was half wrong. Food was mis...,1,takeout order half wrong food miss portion siz...
...,...,...,...
24995,I was a loyal fan of Aroy before the ownership...,5,loyal fan aroy ownership change apprehensive v...
24996,Stopped here for a bite while wandering around...,5,stopped bite wander faneuil hall pleasantly su...
24997,"A quiet place with excellent food, great music...",5,quiet place excellent food great music helpful...
24998,Super delicious food. Awesome vibe. I suffered...,5,super delicious food awesome vibe suffer disne...


In [18]:
vectoriseur_pickle=open('C:/Users/USER/Desktop/ESEO COURS/vectoriseur_file','rb')
vectoriseur=load(vectoriseur_pickle)
print(x)

TfidfVectorizer(max_df=0.75, min_df=0.01)


In [19]:
model_pickle=open('C:/Users/USER/Desktop/ESEO COURS/model_file','rb')
model=load(model_pickle)
print(y)

NMF(n_components=15)


In [9]:
topics1={0:'les perssonels et les tables',
       1:'mauvaise gout des plats greek',
       2:'mauvaise pizza et retard de livraison',
       3:'retard de pre-commmande et de commande',
       4:'Qualite des repas et des serveurs ne sont pas au attendu',
       5:'mauvais endroit',
       6:'Burger',
       7:'beaucoup attente',
       8:'les poulets et les salades ne sont pas a la hauteur',
       9:'mauvais bar et mauvaise boisson',
       10:'prix elevé par rapport a la quantité',
       11:'livraison',
       12:'sandwich',
       13:'suchi',
       14:'mauvais environnement'}


In [10]:
text_neg=["I've only had food from here once and it wasnt good at all"]
text_pos=["I have a lot of dietary restrictions and this i very liked"]
print(type(text_neg))

<class 'list'>


In [11]:
text=x.transform(text_neg)
top = y.transform(text)
print(top)

[[0.         0.04917078 0.         0.         0.16155017 0.
  0.00053345 0.         0.         0.         0.         0.
  0.         0.         0.        ]]




In [12]:
text2=x.transform(text_pos)
top2 = y.transform(text)
print(top2)

[[0.         0.04917078 0.         0.         0.16155017 0.
  0.00053345 0.         0.         0.         0.         0.
  0.         0.         0.        ]]




In [13]:
from textblob import TextBlob
test=TextBlob("I have a lot of dietary restrictions and this i very liked")
test.sentiment.polarity

0.78

In [14]:
test2=TextBlob("I've only had food from here once and it wasnt good at all")
test2.sentiment.polarity

0.35

In [15]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [16]:
def predict_topics(model, vectorizer, n_topics, text):
        polarity=TextBlob(text).sentiment.polarity
        if polarity<0:
            text=preprocess_text(text)
            text=[text]
            vectorized=vectorizer.transform(text)
            topics_correlations=model.transform(vectorized)
            unsorted_topics_correlations=topics_correlations[0].copy()
            topics_correlations[0].sort()
            sorted=topics_correlations[0][::-1]
            print(sorted)
            topics=[]
            for i in range(n_topics):
                corr_value= sorted[i]
                result = np.where(unsorted_topics_correlations == corr_value)[0]
                topics.append(topics1.get(result[0]))
            print(topics)
        else:
            return polarity

In [21]:
predict_topics(model,vectoriseur,5,"succhi very bad")

[0.07636051 0.0194393  0.00904017 0.00832783 0.00337243 0.00244721
 0.         0.         0.         0.         0.         0.
 0.         0.         0.        ]
['Qualite des repas et des serveurs ne sont pas au attendu', 'suchi', 'livraison', 'sandwich', 'mauvaise pizza et retard de livraison']


