#  Data Munchers

Hotel reviews project.
Natural Language Processing (NLP) - Sentiment Analysis, based on this article: https://towardsdatascience.com/detecting-bad-customer-reviews-with-nlp-d8b36134dc7e?fbclid=IwAR16qD6SyOc93UT07MhWQy5JaMZA1dqEXBF6C26sTHEEzM0z-K7WYA7bIdU

In [42]:
import pandas as pd

# Read the pre-cleaned hotel review dataset into a DataFrame.
#   "Review" column -> cleaned review text
#   "Origin" column -> original review text
reviews = pd.read_csv(filepath_or_buffer='../Data/simple_clean_data.csv')

In [43]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

# Make sure the VADER lexicon required by SentimentIntensityAnalyzer is available.
nltk.download('vader_lexicon')

# Score every original (uncleaned) review with VADER and append the four
# resulting columns (neg, neu, pos, compound) to the dataset.
sid = SentimentIntensityAnalyzer()

# One row of scores per review, aligned on the reviews' index. Building the frame
# from the list of score dicts avoids the slow row-wise Series.apply(pd.Series).
sentiment_scores = pd.DataFrame(
    [sid.polarity_scores(text) for text in reviews["Origin"]],
    index=reviews.index,
)

# Drop score columns left over from a previous execution of this cell so that
# re-running it does not create duplicate neg/neu/pos/compound columns (which
# would make later column lookups such as sort_values("pos") ambiguous).
reviews = pd.concat(
    [reviews.drop(columns=sentiment_scores.columns, errors="ignore"), sentiment_scores],
    axis=1,
)
reviews.head()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Judit\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Unnamed: 0,Reviewer_Score,Review,Origin,neg,neu,pos,compound
0,1,cookie arrival balloon cake room husband birth...,Cookie on arrival balloons and cake in our ro...,0.0,1.0,0.0,0.0
1,0,twin bed requested double booked month advance...,Twin beds when requested double and booked 9 ...,0.167,0.833,0.0,-0.5267
2,1,location far away many famous place however im...,Location far away to many famous place howeve...,0.069,0.586,0.345,0.9433
3,1,excellent location helpfull feendy staff,Excellent location Helpfull and feendy staff,0.0,0.575,0.425,0.5719
4,1,room little bit noisy noise coming street exce...,the room was a little bit noisy with the nois...,0.065,0.636,0.3,0.7334


In [44]:
# Number of characters in each original review (whitespace included).
reviews["nb_chars"] = reviews["Origin"].str.len()

# Number of words in each original review. Splitting on runs of whitespace
# (rather than on every single " ") keeps leading, trailing and repeated spaces
# from being counted as empty "words", which would inflate the count.
reviews["nb_words"] = reviews["Origin"].str.split().str.len()

In [45]:
# Ten reviews with the highest VADER "pos" score, considering only reviews
# that contain at least 5 words.
(
    reviews
    .loc[reviews["nb_words"] >= 5, ["Origin", "Review", "pos"]]
    .sort_values(by="pos", ascending=False)
    .head(10)
)

Unnamed: 0,Origin,Review,pos
82,Very comfortable beds excellent breakfast,comfortable bed excellent breakfast,0.716
13,Great hotel excellent location with very frie...,great hotel excellent location friendly staff,0.693
30,Room temperature a little warm Great location...,room temperature little warm great location fr...,0.63
73,The bed was very comfortable the staff friend...,bed comfortable staff friendly helpful,0.578
85,The room is lovely and view was amazing,room lovely view amazing,0.559
62,Front staff was friendly and spoke great Engl...,front staff friendly spoke great english,0.549
58,i had a great experience so i did like everyt...,great experience like everything handy phone g...,0.523
63,Loved the interior design all themed around L...,loved interior design themed around la scala o...,0.519
91,Awesome concept Really enjoyed it so much I d...,awesome concept really enjoyed much definitely...,0.516
89,Very small twin room and shower but still goo...,small twin room shower still good value money ...,0.511


In [46]:
# Ten reviews with the highest VADER "neg" score (most negative first),
# considering only reviews that contain at least 5 words.
(
    reviews
    .loc[reviews["nb_words"] >= 5, ["Origin", "Review", "neg"]]
    .sort_values(by="neg", ascending=False)
    .head(10)
)

Unnamed: 0,Origin,Review,neg
33,Poor wifi connection Poor quality of breakfast,poor wifi connection poor quality breakfast,0.554
22,No room service after 11 30 pm No bar working...,room service pm bar working august massage ser...,0.385
69,Couldn t open windows in room only AC not gre...,open window room ac great great location helpf...,0.319
64,Nothing Very romantic with extremely pleasant...,nothing romantic extremely pleasant staff,0.224
21,Hotel is less than its star rating Extremely ...,hotel le star rating extremely poor customer s...,0.224
70,Room maid service was not adequate Location,room maid service adequate location,0.217
98,Everything was ok no big surprises The breakf...,everything ok big surprise breakfast better ex...,0.216
65,The room could have been better Breakfast was...,room could better breakfast great shame room b...,0.213
9,Our room was TINY We had absolutely no room t...,room tiny absolutely room change kept getting ...,0.211
79,Nothing Very clean and comfortable Modern fur...,nothing clean comfortable modern furnishing sp...,0.19


In [47]:
# Disabled cell: the doc2vec feature-extraction code below is wrapped in a
# triple-quoted string, so none of it executes; the cell only evaluates (and
# echoes) the string literal itself.
# If re-enabled, the snippet would split reviews["Review"] on single spaces,
# train a Doc2Vec model with vector_size=5 on those documents, infer one vector
# per review and append the components to `reviews` as doc2vec_vector_<i> columns.
# NOTE(review): `common_texts` is imported in the snippet but never referenced.
'''
# create doc2vec vector columns
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(reviews["Review"].apply(lambda x: x.split(" ")))]

# train a Doc2Vec model with our text data
model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)

# transform each document into a vector data
doc2vec = reviews["Review"].apply(lambda x: model.infer_vector(x.split(" "))).apply(pd.Series)
doc2vec.columns = ["doc2vec_vector_" + str(x) for x in doc2vec.columns]
reviews = pd.concat([reviews, doc2vec], axis=1)

reviews
'''

'\n# create doc2vec vector columns\nfrom gensim.test.utils import common_texts\nfrom gensim.models.doc2vec import Doc2Vec, TaggedDocument\n\ndocuments = [TaggedDocument(doc, [i]) for i, doc in enumerate(reviews["Review"].apply(lambda x: x.split(" ")))]\n\n# train a Doc2Vec model with our text data\nmodel = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)\n\n# transform each document into a vector data\ndoc2vec = reviews["Review"].apply(lambda x: model.infer_vector(x.split(" "))).apply(pd.Series)\ndoc2vec.columns = ["doc2vec_vector_" + str(x) for x in doc2vec.columns]\nreviews = pd.concat([reviews, doc2vec], axis=1)\n\nreviews\n'

In [48]:
# Disabled cell: the TF-IDF code below sits inside a triple-quoted string, so it
# is not executed; the cell only evaluates (and echoes) the string literal.
# If re-enabled, the snippet would fit a TfidfVectorizer(min_df=10) on
# reviews["Review"], convert the result to a dense array and append one
# "word_<term>" column per vocabulary term to `reviews`.
# NOTE(review): TfidfVectorizer.get_feature_names() was removed in
# scikit-learn 1.2; switch to get_feature_names_out() before re-enabling.
'''
# add tf-idfs columns (Term Frequency — Inverse Document Frequency)
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(min_df = 10)
tfidf_result = tfidf.fit_transform(reviews["Review"]).toarray()
tfidf_df = pd.DataFrame(tfidf_result, columns = tfidf.get_feature_names())
tfidf_df.columns = ["word_" + str(x) for x in tfidf_df.columns]
tfidf_df.index = reviews.index
reviews = pd.concat([reviews, tfidf_df], axis=1)
reviews
'''

'\n# add tf-idfs columns (Term Frequency\u200a—\u200aInverse Document Frequency)\nfrom sklearn.feature_extraction.text import TfidfVectorizer\n\ntfidf = TfidfVectorizer(min_df = 10)\ntfidf_result = tfidf.fit_transform(reviews["Review"]).toarray()\ntfidf_df = pd.DataFrame(tfidf_result, columns = tfidf.get_feature_names())\ntfidf_df.columns = ["word_" + str(x) for x in tfidf_df.columns]\ntfidf_df.index = reviews.index\nreviews = pd.concat([reviews, tfidf_df], axis=1)\nreviews\n'