In [109]:
# Imports
import pandas as pd
import numpy as np

#for tokenisation
import spacy
nlp = spacy.load('en_core_web_sm')

#for lemmatisation
import nltk
from nltk.corpus import wordnet as wn

#for Topic Modelling
import gensim
from gensim import corpora
import pyLDAvis.gensim

from tqdm import tqdm_notebook as tqdm

In [85]:
review_df = pd.read_csv('reviews.csv',index_col=0)
print( review_df.shape)
review_df.head()

(92466, 8)


Unnamed: 0,overall_rating,total_reviews,date,stars,review_text,link_2_reviews,review_target,date_range
0,65.0,10.0,2019-05-01,50.0,,https://www.just-eat.co.uk/restaurants-haweli-...,Haweli,2019-05-01
1,65.0,10.0,2019-04-15,25.0,,https://www.just-eat.co.uk/restaurants-haweli-...,Haweli,2019-04-15
2,65.0,10.0,2019-04-03,100.0,very tasty and quick delivery,https://www.just-eat.co.uk/restaurants-haweli-...,Haweli,2019-04-03
3,65.0,10.0,2019-03-16,100.0,Very good food,https://www.just-eat.co.uk/restaurants-haweli-...,Haweli,2019-03-16
4,65.0,10.0,2019-03-09,75.0,Had to add scipies to an indian food myself :)...,https://www.just-eat.co.uk/restaurants-haweli-...,Haweli,2019-03-09


In [97]:
review_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49382 entries, 2 to 92464
Data columns (total 8 columns):
overall_rating    49382 non-null float64
total_reviews     49382 non-null float64
date              49382 non-null object
stars             49382 non-null float64
review_text       49382 non-null object
link_2_reviews    49382 non-null object
review_target     49382 non-null object
date_range        49382 non-null object
dtypes: float64(3), object(5)
memory usage: 3.4+ MB


In [96]:
review_df.isnull().sum()

overall_rating    0
total_reviews     0
date              0
stars             0
review_text       0
link_2_reviews    0
review_target     0
date_range        0
dtype: int64

In [95]:
print(review_df.shape)
review_df = review_df.dropna()
print(review_df.shape)

(49382, 8)
(49382, 8)


#### Keeping only Nouns, Adjectives, Adverbs and Verbs.

In [94]:
example_text = review_df.review_text[6]
example_text

'I was glad to see this restaurant was on Just Eat because I’d eaten at it a few years ago and it was excellent! Despite the few and bad reviews I made an order based on this past experience.But this order was awful! Poppadums broken to smitherines (crumbs), and the curries were absolutely tasteless, meat over done (lamb - tough) and not at all (chilli) hot! Real shame - won’t order from here again.'

In [92]:
tokenised_doc = nlp(example_text)

In [98]:
word_list = []

for token in tokenised_doc:
    
    if (token.pos_ == 'VERB') or (token.pos_ == 'NOUN') or (token.pos_ == 'ADJ') or (token.pos_ == 'ADV'):
        
        word_list.append(token.lemma_)
    
print ( word_list)

['be', 'glad', 'see', 'restaurant', 'be', 'just', 'eat', 'eat', 'few', 'year', 'ago', 'be', 'excellent', 'few', 'bad', 'review', 'make', 'order', 'base', 'past', 'experience', 'order', 'be', 'awful', 'poppadum', 'break', 'smitherine', 'crumb', 'curry', 'be', 'absolutely', 'tasteless', 'meat', 'do', 'lamb', 'tough', 'not', 'at', 'chilli', 'hot', 'real', 'shame', 'not', 'order', 'here', 'again']


In [99]:
word_list = []

for token in tokenised_doc:
    
    if (token.pos_ == 'VERB') or (token.pos_ == 'NOUN') or (token.pos_ == 'ADJ') or (token.pos_ == 'ADV'):
        
        word_list.append(token)
    
print ( word_list)

[was, glad, see, restaurant, was, Just, Eat, eaten, few, years, ago, was, excellent, few, bad, reviews, made, order, based, past, experience, order, was, awful, Poppadums, broken, smitherines, crumbs, curries, were, absolutely, tasteless, meat, done, lamb, tough, not, at, chilli, hot, Real, shame, n’t, order, here, again]


### Tokenising function

Removing URLs & Screennames, lowercasing, and filtering Nouns, Verbs, Adjectives and Adverbs

In [103]:
# Tokeniser function
def desc_tokenize(text, descriptive_words = True):
    lda_tokens = []
    tokens = nlp(text)
    for token in tokens:
        
        # In this version check if the 'descriptive_words' flag is true.  If so perform the following actions
        if descriptive_words == True:
            if token.orth_.isspace():
                continue
            elif token.like_url:
                lda_tokens.append('URL')
            elif token.orth_.startswith('@'):
                lda_tokens.append('SCREEN_NAME') 
            elif (token.pos_ == 'VERB') or (token.pos_ == 'NOUN') or (token.pos_ == 'ADJ') or (token.pos_ == 'ADV'):
                lda_tokens.append(token.lower_)
        else:
            if token.orth_.isspace():
                continue
            elif token.like_url:
                lda_tokens.append('URL')
            elif token.orth_.startswith('@'):
                lda_tokens.append('SCREEN_NAME') 
            else:
                lda_tokens.append(token.lower_)
                
    return lda_tokens

In [104]:
desc_tokenize(example_text, descriptive_words=True)

['was',
 'glad',
 'see',
 'restaurant',
 'was',
 'just',
 'eat',
 'eaten',
 'few',
 'years',
 'ago',
 'was',
 'excellent',
 'few',
 'bad',
 'reviews',
 'made',
 'order',
 'based',
 'past',
 'experience',
 'order',
 'was',
 'awful',
 'poppadums',
 'broken',
 'smitherines',
 'crumbs',
 'curries',
 'were',
 'absolutely',
 'tasteless',
 'meat',
 'done',
 'lamb',
 'tough',
 'not',
 'at',
 'chilli',
 'hot',
 'real',
 'shame',
 'n’t',
 'order',
 'here',
 'again']

### Tokenising function

Removing URLs & Screennames, lowercasing, and filtering for Nouns, Verbs, Adjectives and Adverbs, and taking the root of each word.

In [105]:
# Tokeniser function
def root_tokenize(text, descriptive_words = True, root_word = True):
    lda_tokens = []
    tokens = nlp(text)
    for token in tokens:
        
        if descriptive_words == True:
            if token.orth_.isspace():
                continue
            elif token.like_url:
                lda_tokens.append('URL')
            elif token.orth_.startswith('@'):
                lda_tokens.append('SCREEN_NAME') 
            elif (token.pos_ == 'VERB') or (token.pos_ == 'NOUN') or (token.pos_ == 'ADJ') or (token.pos_ == 'ADV'):
                
                # If root_word if True then append the lemmatised word
                if root_word == True:
                    lda_tokens.append(token.lemma_)
                else:
                    lda_tokens.append(token.lower_)
        else:
            if token.orth_.isspace():
                continue
            elif token.like_url:
                lda_tokens.append('URL')
            elif token.orth_.startswith('@'):
                lda_tokens.append('SCREEN_NAME') 
            else:
                # If root_word if True then append the lemmatised word
                if root_word == True:
                    lda_tokens.append(token.lemma_)
                else:
                    lda_tokens.append(token.lower_)
                
    return lda_tokens

In [106]:
# Example of function in action
root_tokenize(example_text, descriptive_words=True,root_word=True)

['be',
 'glad',
 'see',
 'restaurant',
 'be',
 'just',
 'eat',
 'eat',
 'few',
 'year',
 'ago',
 'be',
 'excellent',
 'few',
 'bad',
 'review',
 'make',
 'order',
 'base',
 'past',
 'experience',
 'order',
 'be',
 'awful',
 'poppadum',
 'break',
 'smitherine',
 'crumb',
 'curry',
 'be',
 'absolutely',
 'tasteless',
 'meat',
 'do',
 'lamb',
 'tough',
 'not',
 'at',
 'chilli',
 'hot',
 'real',
 'shame',
 'not',
 'order',
 'here',
 'again']

In [120]:
%%time
review_df['clean_text'] = review_df['review_text'].map(root_tokenize)

Wall time: 9min 24s


In [121]:
root_tokenize(example_text)

['be',
 'glad',
 'see',
 'restaurant',
 'be',
 'just',
 'eat',
 'eat',
 'few',
 'year',
 'ago',
 'be',
 'excellent',
 'few',
 'bad',
 'review',
 'make',
 'order',
 'base',
 'past',
 'experience',
 'order',
 'be',
 'awful',
 'poppadum',
 'break',
 'smitherine',
 'crumb',
 'curry',
 'be',
 'absolutely',
 'tasteless',
 'meat',
 'do',
 'lamb',
 'tough',
 'not',
 'at',
 'chilli',
 'hot',
 'real',
 'shame',
 'not',
 'order',
 'here',
 'again']

## Topic Modelling

In [122]:
# Building dictionary and corpus
dictionary = corpora.Dictionary(review_df.clean_text)

corpus = [dictionary.doc2bow(text) for text in review_df.clean_text]

In [125]:
corpus

[[(0, 1), (1, 1), (2, 1), (3, 1)],
 [(3, 1), (4, 1), (5, 1)],
 [(4, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1)],
 [(13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1)],
 [(11, 2),
  (16, 3),
  (19, 1),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 5),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 1),
  (32, 2),
  (33, 1),
  (34, 1),
  (35, 2),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 1),
  (48, 1),
  (49, 1),
  (50, 1),
  (51, 1),
  (52, 1),
  (53, 1)],
 [(5, 1), (11, 1)],
 [],
 [(3, 1), (5, 1), (54, 1), (55, 1), (56, 1)],
 [(7, 1),
  (14, 1),
  (26, 3),
  (57, 1),
  (58, 1),
  (59, 1),
  (60, 1),
  (61, 1),
  (62, 1),
  (63, 1),
  (64, 1),
  (65, 1),
  (66, 1)],
 [(11, 1),
  (26, 1),
  (31, 1),
  (67, 1),
  (68, 1),
  (69, 1),
  (70, 1),
  (71, 1),
  (72, 1),
  (73, 1)],
 [(3, 1),
  (5, 1),
  (7, 1),
  (11, 2),
  (16, 3),
  (20, 1),
  (26, 10),
  (

## Now look at different numbers of topics

In [126]:
%%time
number_topics = 3

lda_model = gensim.models.ldamodel.LdaModel(corpus, num_topics=number_topics, id2word=dictionary, passes=15)

Wall time: 5min 4s


In [127]:
# Look at the topics the model settled on
topics = lda_model.print_topics(num_words=10)

for topic in topics:
    print(topic)

(0, '0.091*"food" + 0.055*"be" + 0.051*"good" + 0.043*"great" + 0.030*"very" + 0.025*"always" + 0.025*"order" + 0.022*"again" + 0.022*"delivery" + 0.020*"time"')
(1, '0.118*"be" + 0.036*"not" + 0.025*"have" + 0.019*"chicken" + 0.015*"order" + 0.013*"taste" + 0.013*"very" + 0.011*"food" + 0.011*"good" + 0.009*"sauce"')
(2, '0.093*"be" + 0.042*"not" + 0.038*"order" + 0.036*"food" + 0.033*"have" + 0.020*"do" + 0.017*"time" + 0.014*"restaurant" + 0.013*"late" + 0.013*"arrive"')


In [81]:
lda_display = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary, sort_topics = False)
pyLDAvis.display(lda_display)

In [50]:
%%time
number_topics = 3

lda_model2 = gensim.models.ldamodel.LdaModel(corpus, num_topics=number_topics, id2word=dictionary, passes=15)

Wall time: 4min 41s


In [51]:
lda_display2 = pyLDAvis.gensim.prepare(lda_model2, corpus, dictionary, sort_topics = False)
pyLDAvis.display(lda_display2)

In [28]:
%%time
number_topics = 8

lda_model3 = gensim.models.ldamodel.LdaModel(corpus, num_topics=number_topics, id2word=dictionary, passes=15)

CPU times: user 3min 17s, sys: 468 ms, total: 3min 18s
Wall time: 3min 18s


In [29]:
lda_display3 = pyLDAvis.gensim.prepare(lda_model3, corpus, dictionary, sort_topics = False)
pyLDAvis.display(lda_display3)

In [30]:
%%time
number_topics = 6

lda_model3 = gensim.models.ldamodel.LdaModel(corpus, num_topics=number_topics, id2word=dictionary, passes=15)

CPU times: user 3min 7s, sys: 389 ms, total: 3min 8s
Wall time: 3min 8s


In [31]:
lda_display4 = pyLDAvis.gensim.prepare(lda_model3, corpus, dictionary, sort_topics = False)
pyLDAvis.display(lda_display4)

In [32]:
review_df.review_target.value_counts()

Tops Pizza                                    202
Pepe's Piri Piri                              178
Holy Cow                                      110
Paya                                           95
Chicken Cottage                                85
The Grill                                      77
Pizza 2 Night                                  75
Great Wall                                     71
Bengal Spice                                   65
Dixy Chicken                                   64
The Tiffin Tin                                 63
Roosters Piri Piri                             63
Peri Peri Original                             60
Perfect Fried Chicken                          59
Yum Yum                                        56
Bella Napoli Pizzeria (Wood Fire Oven)         53
Favorite Chicken & Ribs                        50
Chicken Express                                49
You Me Sushi                                   48
China Chef                                     47
