#Imports

In [2]:
# add path
import sys
sys.path.append('../Deliverable_2/')

import pandas as pd 
import numpy as np
from numpy.random import randint
import seaborn as sns
from collections import Counter, OrderedDict
import cld2

# import utils functions and classes
from DataLoader import TADataLoader
from TextPreprocessing import TextPreprocessor
from LemmatizeCorpus import LemmatizeCorpus
from stem_corpus import stem_corpus
from Embeddor import Embeddor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from ast import literal_eval

import gensim.downloader as api

import nltk
nltk.download('punkt')
nltk.download('stopwords')


import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams["figure.figsize"] = (10, 6)
sns.set_palette("husl")

ModuleNotFoundError: No module named 'utils.loader_utils'

In [4]:
import tensorflow_addons

 The versions of TensorFlow you are currently using is 2.2.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


ImportError: cannot import name 'keras_tensor' from 'tensorflow.python.keras.engine' (/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/engine/__init__.py)

#Loading the data

In [None]:
# Load restaurants and reviews (duplicates dropped by the loader), then run the
# project's TextPreprocessor on the 'review_content' column to obtain the corpus.
loader = TADataLoader()
df_restos = loader.load_restos(drop_duplicates=True)
df_reviews = loader.load_reviews(drop_duplicates=True)
# Optional: uncomment to iterate faster on the first 5000 reviews only.
# df_reviews = df_reviews.iloc[:5000]
preprocessor = TextPreprocessor(df_reviews, column_to_clean='review_content')
preprocessor.transform(n_grams=False)
corpus = preprocessor.corpus

#LSI

We first perform LSI on the reviews, keeping only the first two dimensions.

In [None]:
# LSI restricted to 2 components: the project's Embeddor is asked for a TF-IDF
# vectorisation reduced with SVD (see the vec_method / how / n arguments).
embeddor = Embeddor(corpus=corpus) # project-specific class that computes the embedding
embeddor.transform(vec_method="tfidf", how="SVD", n=2)
lsi = embeddor.review_embedding
lsi['corpus'] = corpus  # keep the source review next to its embedding
# NOTE(review): assigning a Series aligns on index labels, not on position. If
# df_reviews kept its original index after drop_duplicates, ratings could be
# misaligned or NaN here -- confirm that both indexes match.
lsi['rating'] = df_reviews['review_rating']
lsi.head()

Now, we compute LSI with as many dimensions as needed to explain a given percentage of the variance. 

In [None]:
# Same TF-IDF + SVD embedding, but the number of components is left to Embeddor
# (n="n_opt") with threshold=0.5; per the notebook text this is the share of
# variance to explain -- exact semantics live in Embeddor (TODO confirm).
embeddor = Embeddor(corpus=corpus)
embeddor.transform(vec_method="tfidf", how="SVD", n="n_opt", threshold=0.5)
lsi = embeddor.review_embedding
lsi['corpus'] = corpus
# NOTE(review): Series assignment aligns on index labels -- confirm that
# df_reviews and the embedding share the same index.
lsi['rating'] = df_reviews['review_rating']
lsi.head()

#Word2vec

Now we will use the word2vec embedding

In [None]:
# word2vec embedding reduced with PCA; the number of components is chosen by
# Embeddor (n="n_opt") with threshold=0.95.
embeddor = Embeddor(corpus=corpus)
embeddor.transform(vec_method="word2vec", how="PCA", n="n_opt", threshold=0.95)
word2vec_embed = embeddor.review_embedding
# Keep the model exposed by Embeddor so that word similarities can be queried below.
word2vec_model = embeddor.model
word2vec_embed['corpus'] = corpus
# NOTE(review): Series assignment aligns on index labels -- confirm that
# df_reviews and the embedding share the same index.
word2vec_embed['rating'] = df_reviews['review_rating']
word2vec_embed.head()

Thanks to the word2vec embedding we can see the proximity between terms in our dataset:

In [None]:
# Ten nearest neighbours of "waiter" in the word2vec model exposed by Embeddor.
word2vec_model.wv.most_similar("waiter", topn=10)

As we observe what appears to be waiters' names, the embedding is quite accurate

#Fasttext

Thanks to our class we can also use a fastText embedding.

In [None]:
# fastText embedding reduced with PCA; the number of components is chosen by
# Embeddor (n="n_opt") with threshold=0.95.
embeddor = Embeddor(corpus=corpus)
embeddor.transform(vec_method="fasttext", how="PCA", n="n_opt", threshold=0.95)
fasttext_embed = embeddor.review_embedding
# Keep the model exposed by Embeddor so that word similarities can be queried below.
fasttext_model = embeddor.model
fasttext_embed['corpus'] = corpus
# NOTE(review): Series assignment aligns on index labels -- confirm that
# df_reviews and the embedding share the same index.
fasttext_embed['rating'] = df_reviews['review_rating']
fasttext_embed.head()

In [None]:
# Ten nearest neighbours of "waiter" in the fastText model exposed by Embeddor.
fasttext_model.wv.most_similar("waiter", topn=10)

The most similar words appear to be slightly less accurate than the ones produced by the word2vec method.

#Data augmentation

We observe that our dataset is not balanced, which is a problem if we want to cluster the different topics or build a classification model.

In [None]:
# Bar chart of how many reviews carry each rating in the raw dataset.
ax = sns.countplot(x='review_rating', data=df_reviews)
ax.set_title('Number of reviews per rating')
ax.set_xlabel('Rating')
ax.set_ylabel(' ')
plt.show()

To overcome the skewness of our dataset we perform data augmentation. In order to achieve this we create new reviews with similar terms (according to the general word2vec pretrained model). 

At this point we can either load the augmented corpus that has been obtained or rerun the following steps.

In [None]:
# Load the previously saved balanced corpus; the file holds a Python literal,
# which literal_eval parses without executing arbitrary code.
path = 'balanced_corpus.txt'
with open(path, 'r') as f:
    raw_corpus = f.read()
corpus_augmented = literal_eval(raw_corpus)

##Creating augmented corpus

In [None]:
# Load the pretrained 'word2vec-google-news-300' vectors through gensim's
# downloader; `wv` is the model queried by the augmentation step.
wv = api.load('word2vec-google-news-300')


In [None]:
# Build one preprocessed corpus per rating (1..5).
# Dictionaries keyed by rating replace the former exec()-generated variables:
# the names are now visible to linters and readers, and no code is built from strings.
reviews_by_rating = {}
corpus_by_rating = {}
for rating in range(1, 6):
    reviews_by_rating[rating] = df_reviews[df_reviews.loc[:, "review_rating"] == rating]
    preprocessor = TextPreprocessor(reviews_by_rating[rating],
                                    column_to_clean='review_content')
    preprocessor.transform(n_grams=False)
    corpus_by_rating[rating] = preprocessor.corpus

# Keep the historical per-rating names (r1..r5, corpus1..corpus5) that the
# following cells rely on.
r1, r2, r3, r4, r5 = [reviews_by_rating[rating] for rating in range(1, 6)]
corpus1, corpus2, corpus3, corpus4, corpus5 = [corpus_by_rating[rating]
                                               for rating in range(1, 6)]

In [None]:
def similar(corp, topn=3, model=None):
    """Return an augmented copy of ``corp`` where words are swapped for similar ones.

    Each token is replaced by one of its ``topn`` nearest neighbours in the
    word-vector model, picked uniformly at random. Tokens unknown to the model
    are kept unchanged. The input corpus is NOT modified: a new list of new
    token lists is returned, so a shallow copy of the corpus on the caller side
    can no longer cause the original reviews to be overwritten.

    Parameters
    ----------
    corp : sequence of sequences of str
        Tokenised reviews.
    topn : int, default 3
        Number of nearest neighbours to sample the replacement from.
    model : object with ``most_similar(word, topn=...)``, optional
        Word-vector model to query. Defaults to the notebook-level ``wv``
        loaded with ``api.load('word2vec-google-news-300')``.

    Returns
    -------
    list of list of str
        The augmented corpus, same shape as ``corp``.
    """
    if model is None:
        model = wv  # pretrained vectors loaded in an earlier cell

    augmented = []
    for i, review in enumerate(corp):
        print(i)  # progress indicator: index of the review being processed
        new_review = []
        for token in review:
            try:
                candidates = model.most_similar(token, topn=topn)
            except KeyError:
                # Out-of-vocabulary token: keep it as is (best-effort augmentation).
                new_review.append(token)
                continue
            if candidates:
                # Sample among ALL returned neighbours (the previous randint(2)
                # could never select the third of the three candidates).
                new_review.append(candidates[randint(len(candidates))][0])
            else:
                new_review.append(token)
        augmented.append(new_review)
    return augmented

In [None]:
# We augment the number of reviews for ratings 1 and 2.
# list.copy() is shallow: the inner token lists would still be shared with
# corpus1 / corpus2, so any in-place word replacement made while augmenting
# would also overwrite the original reviews. Copy every review first so the
# originals and their augmented versions stay distinct.
corpus1_augmented = similar([list(review) for review in corpus1])
corpus2_augmented = similar([list(review) for review in corpus2])

In [None]:
# Create a balanced dataset by concatenating, in this order: all rating-1
# reviews, 35 augmented rating-1 reviews, all rating-2 reviews, all augmented
# rating-2 reviews, then the first 110 reviews of ratings 3, 4 and 5.
# NOTE(review): the slice [1:36] skips the first augmented review (index 0);
# confirm whether [0:35] was intended.
corpus_augmented = corpus1 + corpus1_augmented[1:36] + corpus2\
                   + corpus2_augmented + corpus3[:110] + corpus4[:110]\
                   + corpus5[:110]

##Augmented corpus results

In [None]:
# TF-IDF + SVD embedding of the balanced corpus; the number of components is
# chosen by Embeddor (n="n_opt") with threshold=0.5.
embeddor = Embeddor(corpus=corpus_augmented)
embeddor.transform(vec_method="tfidf", how="SVD", n="n_opt", threshold=0.5)
lsi = embeddor.review_embedding
lsi

In [None]:
# Ratings of the balanced corpus, in corpus order: number of reviews per rating.
rating_counts = {1: 104, 2: 116, 3: 110, 4: 110, 5: 110}
ratings = [rating for rating, count in rating_counts.items() for _ in range(count)]
# The counts are hard-coded, so fail loudly if the corpus was rebuilt with a
# different size instead of silently attaching labels to the wrong reviews.
assert len(ratings) == len(corpus_augmented), (
    f"{len(ratings)} ratings for {len(corpus_augmented)} reviews: "
    "update rating_counts to match corpus_augmented"
)

In [None]:
# TF-IDF + SVD embedding of the balanced corpus, annotated with the source
# review and its rating (`ratings` is a plain list, assigned by position).
embeddor = Embeddor(corpus=corpus_augmented)
embeddor.transform(vec_method="tfidf", how="SVD", n="n_opt", threshold=0.5)
lsi = embeddor.review_embedding
lsi['corpus'] = corpus_augmented
lsi['rating'] = ratings
lsi.head()

In [None]:
# Bar chart of the rating distribution once the corpus has been balanced.
ax = sns.countplot(x='rating', data=lsi)
ax.set_title('Number of reviews per rating after data augmentation')
ax.set_xlabel('Rating')
ax.set_ylabel(' ')
plt.show()

Let's see how the different topics are linked to the quality of the reviews (for the first 25 directions):

In [None]:
# For each of the first topics (SV columns), average the rating of the reviews
# that load most strongly on it (largest absolute component).
N_TOPICS = 25
TOP_REVIEWS_PER_TOPIC = 20
for i in range(N_TOPICS):
    column = f'SV{i+1}'
    if column not in lsi.columns:
        # Fewer components than N_TOPICS were kept: stop instead of raising KeyError.
        break
    # Rank on absolute values without overwriting the column, so that the
    # signed components of `lsi` remain available to later cells.
    top_reviews = lsi[column].abs().sort_values(ascending=False).index[:TOP_REVIEWS_PER_TOPIC]
    top_ratings = [ratings[ind] for ind in top_reviews]
    print(f"Average rating for topic {i} is : ")
    # Divide by the number of reviews actually selected rather than a fixed 20.
    print(sum(top_ratings)/len(top_ratings))

We can now visualize how the quality of a review is related to a topic. In the following example the topic 1 is mostly associated with good reviews while the second topic is mostly associated with bad reviews. 

In [None]:
# Scatter the absolute loadings on SV1 (x) and SV66 (y), one colour per rating.
fig, ax = plt.subplots(figsize=(10,10))

colors = ['red', 'orange', 'yellow', 'yellowgreen', 'green']

for val in lsi['rating'].unique():
    same_rating = lsi[lsi['rating']==val]
    ax.scatter(
        np.abs(same_rating['SV1'].values),
        np.abs(same_rating['SV66'].values),
        alpha=0.7,
        label=val,
        color=colors[val-1],  # ratings 1..5 map to colours red..green
    )

ax.set(xlabel='First Topic', ylabel='Second Topic')
# Thin reference lines through the origin.
ax.axvline(linewidth=0.5)
ax.axhline(linewidth=0.5)
ax.legend()

Let's look at the top reviews associated with each topic. We observe that "food" is often mentioned in the reviews associated with the first topic, while service appears several times in the reviews associated with the second topic. We conclude that food is the differentiating factor for restaurants and bars, while service is only a requirement. 


In [None]:
# Print the 5 reviews that load most strongly (in absolute value) on SV1 and SV66.
for i in [0, 65]:
    column = f'SV{i+1}'
    # Rank on absolute values without overwriting the column in `lsi`.
    top_reviews = lsi[column].abs().sort_values(ascending=False).index[:5]
    print(f"Top review for topic {i} is: ")
    for ind in top_reviews:
        print(lsi.loc[ind, 'corpus'])

#Classification Model

##LSI embedding

###Logistic regression

In [None]:
# Recompute the TF-IDF + SVD embedding of the balanced corpus and append the
# rating as the LAST column: the split cell relies on that column position.
embeddor = Embeddor(corpus=corpus_augmented)
embeddor.transform(vec_method="tfidf", how="SVD", n="n_opt", threshold=0.5)
lsi = embeddor.review_embedding
lsi['rating'] = ratings
lsi

In [None]:
# Prepare the data: drop incomplete rows, then hold out 20% of the reviews.
# Every column but the last is a feature; the last column is the rating.
# NOTE(review): the embedding was fitted on the whole corpus before this split,
# so the test rows already influenced the features -- keep in mind when
# reading the scores.
lsi.dropna(inplace=True)
features, target = lsi.iloc[:, :-1], lsi.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=0,
    shuffle=True,
)
# Check the distribution of ratings in both splits.
for heading, split_labels in (("Value counts for Train reviews", y_train),
                              ("Value counts for Test resviews", y_test)):
    print(heading)
    print(split_labels.value_counts())

In [None]:
# Multinomial logistic regression on the LSI features.
logreg = LogisticRegression(multi_class='multinomial', solver='lbfgs')
logreg.fit(X_train, y_train)
# 10-fold cross-validation on the training split, scored with weighted F1.
scores_log = cross_val_score(logreg, X_train, y_train,
                             scoring='f1_weighted', cv=10)
avg_score_log = scores_log.mean()
print('Cross-validation scores:{}'.format(scores_log))
print('Average cross-validation score:{}'.format(avg_score_log))

In [None]:
# Per-class precision / recall / F1 of the logistic regression on the test split.
pred_logreg = logreg.predict(X_test)
report_logreg = classification_report(y_test, pred_logreg)
print(report_logreg)

In [None]:
# Confusion matrix of the logistic regression, normalised per true label (rows sum to 1).
rating_labels = [1, 2, 3, 4, 5]
# Passing `labels` guarantees a 5x5 matrix even if a rating is missing from the
# test split; otherwise building the labelled DataFrame below would fail.
data = confusion_matrix(y_test, pred_logreg, labels=rating_labels, normalize='true')
df_cm = pd.DataFrame(data, columns=rating_labels, index=rating_labels)
df_cm.index.name = 'True label'
df_cm.columns.name = 'Predicted label'
plt.figure(figsize = (9, 6))
# Same title as the other (also normalised) confusion matrices of the notebook.
plt.title('Normalized confusion matrix')
sns.set(font_scale=1.2)
sns.heatmap(df_cm, cmap="Blues", annot=True, annot_kws={"size": 12}, fmt='.1g');

###Random Forest

In [None]:
# Random forest classifier on the LSI features.
# A fixed random_state (same seed as the train/test split) makes the forest,
# and therefore every score reported below, reproducible across runs.
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train, y_train)

# 10-fold cross-validation on the training split, scored with weighted F1.
scores_rf = cross_val_score(rf, X_train, y_train, cv = 10,
                            scoring='f1_weighted')
print('Cross-validation scores:{}'.format(scores_rf))
avg_score_rf = np.mean(scores_rf)
print('Average cross-validation score:{}'.format(avg_score_rf))

In [None]:
# Per-class precision / recall / F1 of the random forest on the test split.
pred_labels_rf = rf.predict(X_test)
report_rf = classification_report(y_test, pred_labels_rf)
print(report_rf)

In [None]:
# Confusion matrix of the random forest, normalised per true label (rows sum to 1).
rating_labels = [1, 2, 3, 4, 5]
# Passing `labels` guarantees a 5x5 matrix even if a rating is missing from the
# test split; otherwise building the labelled DataFrame below would fail.
data = confusion_matrix(y_test, pred_labels_rf, labels=rating_labels, normalize='true')
df_cm = pd.DataFrame(data, columns=rating_labels, index=rating_labels)

df_cm.index.name = 'True label'
df_cm.columns.name = 'Predicted label'
plt.figure(figsize = (9, 6))
plt.title('Normalized confusion matrix')
sns.set(font_scale=1.2)
sns.heatmap(df_cm, cmap="Blues", annot=True, annot_kws={"size": 12}, fmt='.1g');

The random forest classifier shows better results than the logistic regression classifier.

##Word2vec embedding

We use a random forest classifier with the word2vec embedding as input.

In [None]:
# word2vec embedding of the balanced corpus reduced with PCA to a fixed n=100,
# with the rating appended as the LAST column (the split cell relies on that position).
embeddor = Embeddor(corpus=corpus_augmented)
embeddor.transform(vec_method="word2vec", how="PCA", n=100) # n fixed by hand: n_opt wasn't offering good results
word2vec_embed = embeddor.review_embedding
word2vec_embed['rating'] = ratings
word2vec_embed

In [None]:
# Drop incomplete rows, then hold out 20% of the reviews.
# Every column but the last is a feature; the last column is the rating.
word2vec_embed.dropna(inplace=True)
features, target = word2vec_embed.iloc[:, :-1], word2vec_embed.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=0,
    shuffle=True,
)
# Check the distribution of ratings in both splits.
for heading, split_labels in (("Value counts for Train reviews", y_train),
                              ("Value counts for Test resviews", y_test)):
    print(heading)
    print(split_labels.value_counts())

In [None]:
# Random forest classifier on the word2vec features.
# A fixed random_state (same seed as the train/test split) makes the forest,
# and therefore every score reported below, reproducible across runs.
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train, y_train)

# 10-fold cross-validation on the training split, scored with weighted F1.
scores_rf = cross_val_score(rf, X_train, y_train, cv = 10,
                            scoring='f1_weighted')
print('Cross-validation scores:{}'.format(scores_rf))
avg_score_rf = np.mean(scores_rf)
print('Average cross-validation score:{}'.format(avg_score_rf))

In [None]:
# Per-class precision / recall / F1 of the random forest on the test split.
pred_labels_rf = rf.predict(X_test)
report_rf = classification_report(y_test, pred_labels_rf)
print(report_rf)

In [None]:
# Confusion matrix of the random forest, normalised per true label (rows sum to 1).
rating_labels = [1, 2, 3, 4, 5]
# Passing `labels` guarantees a 5x5 matrix even if a rating is missing from the
# test split; otherwise building the labelled DataFrame below would fail.
data = confusion_matrix(y_test, pred_labels_rf, labels=rating_labels, normalize='true')
df_cm = pd.DataFrame(data, columns=rating_labels, index=rating_labels)

df_cm.index.name = 'True label'
df_cm.columns.name = 'Predicted label'
plt.figure(figsize = (9, 6))
plt.title('Normalized confusion matrix')
sns.set(font_scale=1.2)
sns.heatmap(df_cm, cmap="Blues", annot=True, annot_kws={"size": 12}, fmt='.1g');

##Fasttext Embedding

We use a random forest classifier with the fastText embedding as input.

In [None]:
# fastText embedding of the balanced corpus reduced with PCA to a fixed n=100,
# with the rating appended as the LAST column (the split cell relies on that position).
embeddor = Embeddor(corpus=corpus_augmented)
embeddor.transform(vec_method="fasttext", how="PCA", n=100) # n fixed by hand: n_opt wasn't offering good results
fasttext = embeddor.review_embedding
fasttext['rating'] = ratings
fasttext

In [None]:
# Drop incomplete rows, then hold out 20% of the reviews.
# Every column but the last is a feature; the last column is the rating.
fasttext.dropna(inplace=True)
features, target = fasttext.iloc[:, :-1], fasttext.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=0,
    shuffle=True,
)
# Check the distribution of ratings in both splits.
for heading, split_labels in (("Value counts for Train reviews", y_train),
                              ("Value counts for Test resviews", y_test)):
    print(heading)
    print(split_labels.value_counts())

In [None]:
# Random forest classifier on the fastText features.
# (RandomForestClassifier is already imported in the notebook's import cell, so
# the redundant mid-notebook import was dropped.)
# A fixed random_state (same seed as the train/test split) makes the forest,
# and therefore every score reported below, reproducible across runs.
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train, y_train)

# 10-fold cross-validation on the training split, scored with weighted F1.
scores_rf = cross_val_score(rf, X_train, y_train, cv = 10,
                            scoring='f1_weighted')
print('Cross-validation scores:{}'.format(scores_rf))
avg_score_rf = np.mean(scores_rf)
print('Average cross-validation score:{}'.format(avg_score_rf))

In [None]:
# Per-class precision / recall / F1 of the random forest on the test split.
pred_labels_rf = rf.predict(X_test)
report_rf = classification_report(y_test, pred_labels_rf)
print(report_rf)

In [None]:
# Confusion matrix of the random forest, normalised per true label (rows sum to 1).
rating_labels = [1, 2, 3, 4, 5]
# Passing `labels` guarantees a 5x5 matrix even if a rating is missing from the
# test split; otherwise building the labelled DataFrame below would fail.
data = confusion_matrix(y_test, pred_labels_rf, labels=rating_labels, normalize='true')
df_cm = pd.DataFrame(data, columns=rating_labels, index=rating_labels)

df_cm.index.name = 'True label'
df_cm.columns.name = 'Predicted label'
plt.figure(figsize = (9, 6))
plt.title('Normalized confusion matrix')
sns.set(font_scale=1.2)
sns.heatmap(df_cm, cmap="Blues", annot=True, annot_kws={"size": 12}, fmt='.1g');

#Final remark

None of our classifiers shows convincing results. However, we are aware that several points can be improved: more data can be scraped, data augmentation should be performed at a larger scale, and a deep learning method could outperform the ones we have shown.

It is interesting to note that even if the dataset is balanced between the classes, it appears easier to predict accurately good reviews (rated 4 or 5) than negative ones (rated 1 or 2).