In [157]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Apply the seaborn plot theme, then read both input tables from the data folder.
sns.set_style("darkgrid")
data_path = "data/"
places = pd.read_csv(f"{data_path}places_final.csv")
reviews = pd.read_csv(f"{data_path}reviews_final.csv")

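Build the working dataframe `df` from `reviews`: attach each place's `price`, drop rows with a missing `reviewTextClean` or `price`, and keep only the columns needed for modelling.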

In [158]:
# Attach each place's price to its reviews (left join on the place id), discard
# rows lacking either the cleaned text or the price, and keep only the four
# modelling columns with a fresh 0..n-1 index.
df = (
    reviews
    .merge(places[["gPlusPlaceId", "price"]], how="left", on="gPlusPlaceId")
    .dropna(subset=["reviewTextClean", "price"])
    .loc[:, ["price", "rating", "gPlusPlaceId", "reviewTextClean"]]
    .reset_index(drop=True)
)

Create a BoW representation of `reviewTextClean`. We find that there are 54,767 words in the vocabulary of our corpus. In comparison, there are 171,476 words in current use in the [Oxford Dictionary](https://en.wikipedia.org/wiki/List_of_dictionaries_by_number_of_words).

In [159]:
from sklearn.feature_extraction.text import CountVectorizer
# Learn the vocabulary from the cleaned reviews with a default CountVectorizer
# and display the (documents, vocabulary size) shape of the count matrix.
review_texts = df['reviewTextClean'].values
count_vect = CountVectorizer()
bow_counts = count_vect.fit_transform(review_texts)
bow_counts.shape

(96008, 54767)

It is hard to imagine that 96,008 reviews use almost a third of all words in the English language. Therefore, let us have a look at the words in the vocabulary.

In [160]:
# Preview only the first vocabulary entries, ordered by their column index in
# the count matrix. Displaying every (word, index) pair would flood the
# notebook output with tens of thousands of tuples.
sorted(count_vect.vocabulary_.items(), key=lambda item: item[1])[:65]

[('00', 0),
 ('000', 1),
 ('007', 2),
 ('00s', 3),
 ('01', 4),
 ('02', 5),
 ('07', 6),
 ('07775092749', 7),
 ('09s', 8),
 ('0k', 9),
 ('0nce', 10),
 ('10', 11),
 ('100', 12),
 ('1000', 13),
 ('100000', 14),
 ('1000of', 15),
 ('1000x', 16),
 ('10011', 17),
 ('10012', 18),
 ('10018', 19),
 ('10024', 20),
 ('10028', 21),
 ('10036', 22),
 ('100ft', 23),
 ('100g', 24),
 ('100hoxton', 25),
 ('100m', 26),
 ('100pp', 27),
 ('100th', 28),
 ('100x', 29),
 ('100x100', 30),
 ('101', 31),
 ('102', 32),
 ('1020', 33),
 ('103', 34),
 ('1030', 35),
 ('1039', 36),
 ('1045am', 37),
 ('104th', 38),
 ('105', 39),
 ('106th', 40),
 ('108', 41),
 ('1080', 42),
 ('1080p', 43),
 ('1099', 44),
 ('10am', 45),
 ('10ave', 46),
 ('10cc', 47),
 ('10ft', 48),
 ('10gbp', 49),
 ('10girls', 50),
 ('10h30pm', 51),
 ('10ish', 52),
 ('10jrs', 53),
 ('10kg', 54),
 ('10lbs', 55),
 ('10livres', 56),
 ('10m', 57),
 ('10min', 58),
 ('10mins', 59),
 ('10minute', 60),
 ('10minutes', 61),
 ('10oz', 62),
 ('10p', 63),
 ('10pm', 64)

We see that there are words consisting solely of digits, which we will remove. When looking more closely at the individual words (not shown here), we found three things worth noting, each listed with a specific example.
1. Misspelling of words 
    - "establiemenet" which should have been "establishment"
2. Spoken language
    - To better express themselves, reviewers can write "aaamaaazing" instead of "amazing"
3. Languages other than English
    - Occurrence of "aangeboden" which is a Dutch word meaning "offered"

Due to these three points, we end up with a significantly larger vocabulary than what is being used in reality. As a result, we get a `MemoryError` during model building since the document-term matrix for the BoW representation has unnecessarily many columns/words. A good approach would be to use pretrained word vectors from [GloVe](https://nlp.stanford.edu/projects/glove/) to get around running out of memory. However, we cannot use this approach if a word suffers from any one of the three issues highlighted above. Therefore, we create a function `post_clean` to fix these three issues.

In [166]:
import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
import enchant
from langdetect import detect, LangDetectException
from deep_translator import GoogleTranslator
import numpy as np
# Heavy helpers (spell-check dictionary, stopword set, lemmatizer, translator)
# are built on first use and then reused, instead of once per review.
_POST_CLEAN_CACHE = {}


def _post_clean_cached(key, factory):
    """Return the cached helper stored under ``key``, building it with ``factory`` on first use."""
    if key not in _POST_CLEAN_CACHE:
        _POST_CLEAN_CACHE[key] = factory()
    return _POST_CLEAN_CACHE[key]


def _drop_numeric_tokens(tokens):
    """Return the tokens that do not consist solely of digit characters."""
    return [tok for tok in tokens if not all(ch.isdigit() for ch in tok)]


def post_clean(text):
    """Normalise one cleaned review so that its words are English dictionary words.

    The review language is detected with ``langdetect``:

    * English reviews are tokenised, all-digit tokens are dropped, and every
      token the ``en_GB`` enchant dictionary does not recognise is replaced by
      the dictionary's first suggestion (the token is kept unchanged when there
      is no suggestion).
    * Reviews in any other language are translated to English, tokenised,
      stripped of all-digit tokens, lemmatised, and stripped of English stopwords.

    Parameters
    ----------
    text : str
        The review text to post-process.

    Returns
    -------
    str or float
        The space-joined cleaned tokens, or ``np.nan`` when ``text`` is not a
        string, its language cannot be detected, or the translator does not
        return a string.

    Notes
    -----
    NOTE(review): langdetect is reported to be non-deterministic unless
    ``DetectorFactory.seed`` is set -- confirm and seed it if reproducible
    output is required. Errors raised by the translation service are not
    caught here and propagate to the caller.
    """
    # Non-string input (e.g. a missing value) cannot be language-detected.
    if not isinstance(text, str):
        return np.nan

    try:  # Reviews can consist of japanese or chinese characters that are not detectable in langdetect
        language = detect(text)
    except LangDetectException:
        return np.nan

    if language == "en":
        eng_dict = _post_clean_cached("eng_dict", lambda: enchant.Dict("en_GB"))
        corrected = []
        for token in _drop_numeric_tokens(word_tokenize(text)):
            if not eng_dict.check(token):
                suggestions = eng_dict.suggest(token)
                # No suggestions (e.g. the token contains digits): keep the token as is.
                if suggestions:
                    token = suggestions[0]
            corrected.append(token)
        return ' '.join(corrected)

    stopwords = _post_clean_cached(
        "stopwords", lambda: frozenset(nltk.corpus.stopwords.words('english'))
    )
    lemmatizer = _post_clean_cached("lemmatizer", WordNetLemmatizer)
    translator = _post_clean_cached(
        "translator", lambda: GoogleTranslator(source="auto", target="en")
    )

    translated = translator.translate(text)
    # Guard against a translator that yields no string; word_tokenize needs text.
    if not isinstance(translated, str):
        return np.nan

    lemmas = [lemmatizer.lemmatize(word) for word in _drop_numeric_tokens(word_tokenize(translated))]
    return " ".join([w for w in lemmas if w not in stopwords])  # Finally removes stopwords

In [167]:
from tqdm import tqdm
# Post-clean every review in order; tqdm wraps the column to show progress.
review_vector = [post_clean(review) for review in tqdm(df['reviewTextClean'])]

100%|██████████| 96008/96008 [3:38:32<00:00,  7.32it/s]   


In [168]:
# Replace the review text with its post-cleaned version and persist the frame
# (without the index) so the slow cleaning step does not have to be repeated.
df = df.assign(reviewTextClean=review_vector)
df.to_csv("data/NLP_data.csv", index=False)

# Model building

In [169]:
# Reload the post-cleaned reviews. post_clean returns NaN for reviews whose
# language cannot be detected, and empty strings are read back from CSV as NaN;
# CountVectorizer cannot vectorise NaN documents, so drop those rows here and
# renumber the index.
df = (
    pd.read_csv("data/NLP_data.csv")
    .dropna(subset=["reviewTextClean"])
    .reset_index(drop=True)
)

In [177]:
# Look at the raw 3-star reviews of one specific place.
is_selected_place = reviews["gPlusPlaceId"] == "107346748950819090586"
is_three_star = reviews["rating"] == 3.0
reviews[is_selected_place & is_three_star]

Unnamed: 0,rating,reviewerName,reviewText,gPlusPlaceId,gPlusUserId,reviewTextClean
63006,3.0,Kurton Asger,Delicious food coupled with top notch service....,107346748950819090586,104518909231201256766,delicious food coupled top notch service nothi...
64947,3.0,João Miranda,"Boa comida, mas pouca variadade",107346748950819090586,104652388907247352282,boa comida pouca variadade
157798,3.0,Andrew Seftel,Wonderfully authentic Mexican cuisine. Worth p...,107346748950819090586,111146438532945651873,wonderfully authentic mexican cuisine worth pe...


In [170]:
# Display the reloaded modelling frame (price, rating, place id, cleaned text).
df

Unnamed: 0,price,rating,gPlusPlaceId,reviewTextClean
0,3.0,3.0,106689630448064755324,cute hotel good amenity nice location great cr...
1,3.0,4.0,108256990636148259283,love place massage facial technician best loun...
2,2.0,5.0,105947477166033397439,service amazing line go fast
3,1.0,5.0,107098981103934500500,get chicken green salad yum
4,1.0,5.0,108585910849109169666,never falafel bar yum super crunchy sweet pota...
...,...,...,...,...
96003,2.0,4.0,110548558285915713747,delicious pizza price good downfall amount peo...
96004,3.0,1.0,107346748950819090586,went mole festival Saturday night worst restau...
96005,2.0,5.0,110403843200459675752,42nd street photo camera WA looking best price...
96006,1.0,4.0,101421411984715145689,true life saver


In [10]:
from sklearn.feature_extraction.text import CountVectorizer
# Rebuild the bag-of-words count matrix from the post-cleaned reviews and
# display its (documents, vocabulary size) shape.
cleaned_texts = df['reviewTextClean'].values
count_vect = CountVectorizer()
bow_counts = count_vect.fit_transform(cleaned_texts)
bow_counts.shape

(96008, 53176)

In [38]:
# Report the vocabulary size and the number of documents in the count matrix.
vocabulary_size = len(count_vect.vocabulary_)
corpus_size = bow_counts.shape[0]
print(f"Length of vocabulary: {vocabulary_size}")
print(f"Length of corpus: {corpus_size}")

Length of vocabulary: 53176
Length of corpus: 96008


Create tf-idf representation of `reviewTextClean`

In [39]:
from sklearn.feature_extraction.text import TfidfTransformer
# Learn the IDF weights from the count matrix, then re-weight that same matrix.
# NOTE(review): the weights are fitted on the full corpus, i.e. before any
# train/validation/test split is made.
tfidf = TfidfTransformer()
bow_tfidf = tfidf.fit(bow_counts).transform(bow_counts)

In [40]:
from sklearn.model_selection import train_test_split
# Target: the price level shifted down by one so that class labels start at 0.
y = df.price.values-1

# Fixed seed so both stratified splits (and every metric computed from them)
# are reproducible across kernel restarts.
SPLIT_SEED = 42

# Split into development and test split (80% / 20%), stratified on the target
X_bow_dev, X_bow_test, X_tfidf_dev, X_tfidf_test, y_dev, y_test = train_test_split(
    bow_counts, bow_tfidf, y, test_size=0.2, stratify=y, random_state=SPLIT_SEED
)

# Split development into training and validation (90% / 10% of development)
X_bow_train, X_bow_val, X_tfidf_train, X_tfidf_val, y_train, y_val = train_test_split(
    X_bow_dev, X_tfidf_dev, y_dev, test_size=0.1, stratify=y_dev, random_state=SPLIT_SEED
)

In [41]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
def classification_metrics(header_text, clf, X_train, X_test, y_train, y_test):
    """Print a banner followed by accuracy, weighted F1 and the confusion matrix.

    Parameters
    ----------
    header_text : str
        Title printed between two 35-character rules.
    clf : fitted estimator exposing ``predict``
        The model to evaluate.
    X_train, y_train :
        Features and labels used for the "Train Accuracy" line.
    X_test, y_test :
        Held-out features and labels used for the "Val Accuracy", "F1 Score"
        and confusion-matrix lines.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    banner_width = 35
    rule = "=" * banner_width
    # print() inserts one separating space between the padding and the title.
    left_pad = " " * int((banner_width - len(header_text)) / 2)
    print(rule)
    print(left_pad, header_text)
    print(rule)

    train_predictions = clf.predict(X_train)
    print("Train Accuracy:", accuracy_score(y_train, train_predictions))

    heldout_predictions = clf.predict(X_test)
    print("Val Accuracy:", accuracy_score(y_test, heldout_predictions))
    print("F1 Score:", f1_score(y_test, heldout_predictions, average="weighted"))
    print(confusion_matrix(y_test, heldout_predictions))

In [42]:
# Dummy classifier predicting most frequent class
from sklearn.dummy import DummyClassifier
# Baseline that always predicts the most frequent training class.
# classification_metrics computes the predictions itself, so no separate
# predict() call is needed here.
dummy = DummyClassifier(strategy="most_frequent")
dummy.fit(X_bow_train, y_train)
classification_metrics("Dummy classifier", dummy, X_bow_train, X_bow_val, y_train, y_val)

          Dummy classifier
Train Accuracy: 0.558249547920434
Val Accuracy: 0.5582606431454238
F1 Score: 0.40000361564167053
[[   0 1382    0]
 [   0 4288    0]
 [   0 2011    0]]


In [43]:
from sklearn.linear_model import LogisticRegression
# Logistic regression on the tf-idf features. The saga solver shuffles the
# data while optimising, so fix random_state to make the fit reproducible.
lr = LogisticRegression(solver="saga", random_state=42)
lr.fit(X_tfidf_train, y_train)
classification_metrics("LogisticRegression tf-idf", lr, X_tfidf_train, X_tfidf_val, y_train, y_val)

      LogisticRegression tf-idf
Train Accuracy: 0.7275226039783002
Val Accuracy: 0.6490040359328213
F1 Score: 0.6288895516058658
[[ 482  843   57]
 [ 218 3610  460]
 [  31 1087  893]]


In [None]:
import xgboost
# Gradient-boosted trees trained on the raw bag-of-words counts.
xgb_bow_params = {"eval_metric": "mlogloss", "use_label_encoder": False}
xgb = xgboost.XGBClassifier(**xgb_bow_params)
xgb.fit(X_bow_train, y_train)
classification_metrics("XGBoost BoW", xgb, X_bow_train, X_bow_val, y_train, y_val)

In [38]:
# Gradient-boosted trees trained on the tf-idf weighted features.
xgb_tfidf_params = {"eval_metric": "mlogloss", "use_label_encoder": False}
xgb = xgboost.XGBClassifier(**xgb_tfidf_params)
xgb.fit(X_tfidf_train, y_train)
classification_metrics("XGBoost tf-idf", xgb, X_tfidf_train, X_tfidf_val, y_train, y_val)

          XGBoost tf-idf
Train Accuracy: 0.6870596745027124
Val Accuracy: 0.626220544199974
F1 Score: 0.5770760772029238
[[ 362  990   30]
 [ 134 3928  226]
 [  12 1479  520]]
