In [41]:
import pandas as pd
# Load the first 10,000 rows of the March 2018 comments CSV.
# NOTE: the path below is specific to the original author's machine.
comments_csv = "/Users/liorlavi/Downloads/nyt-comments/CommentsMarch2018.csv"
max_rows = 10000
reviews_df = pd.read_csv(comments_csv, nrows=max_rows)
# peek at the first rows
reviews_df.head()

Unnamed: 0,approveDate,articleID,articleWordCount,commentBody,commentID,commentSequence,commentTitle,commentType,createDate,depth,...,status,timespeople,trusted,updateDate,userDisplayName,userID,userLocation,userTitle,userURL,typeOfMaterial
0,1520043821,5a974697410cf7000162e8a4,1207,If the choice is between mining for bitcoin - ...,26188943.0,26188943.0,<br/>,comment,1520029445,1.0,...,approved,1,0,1520043821,Steve,46903103.0,Florida,,,News
1,1520043790,5a974697410cf7000162e8a4,1207,"<br/>To me, Bitcoin (et al) appears to be an e...",26189292.0,26189292.0,<br/>,comment,1520031265,1.0,...,approved,1,0,1520043790,MyOpinion,82778.0,NYC,,,News
2,1520043789,5a974697410cf7000162e8a4,1207,Bitcoin is a pyramid scheme backed by nothing ...,26189645.0,26189645.0,<br/>,comment,1520033172,1.0,...,approved,1,0,1520043789,Bert Gold,3013548.0,"Frederick, Maryland",,,News
3,1520043788,5a974697410cf7000162e8a4,1207,What does it cost in energy to dig up and refi...,26189102.0,26189102.0,<br/>,comment,1520030291,1.0,...,approved,1,0,1520043788,James Demers,70245222.0,Brooklyn,,,News
4,1520043787,5a974697410cf7000162e8a4,1207,You forgot to mention stock buybacks.,26189683.0,26189683.0,<br/>,comment,1520033404,1.0,...,approved,1,0,1520043787,Bill,66424344.0,California,,,News


In [42]:
# The next step consists in cleaning the text data with various operations:
# return the wordnet object value corresponding to the POS tag
import nltk
#nltk.download()
from nltk.corpus import wordnet


def get_wordnet_pos(pos_tag):
    """Translate a POS tag string into the matching WordNet POS constant.

    The tag's prefix decides the result: 'J' -> adjective, 'V' -> verb,
    'R' -> adverb. Tags starting with 'N' and every unrecognised tag are
    both treated as nouns.
    """
    prefix_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'R': wordnet.ADV,
    }
    for prefix, wn_pos in prefix_to_pos.items():
        if pos_tag.startswith(prefix):
            return wn_pos
    # noun is both the 'N' case and the fallback
    return wordnet.NOUN

In [43]:
import string
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer
import nltk
# Fetch the 'averaged_perceptron_tagger' resource (presumably the model behind the
# pos_tag call in clean_text below).
# NOTE(review): the recorded output shows this download failing with an SSL
# certificate error, so the resource was presumably already available locally — confirm.
nltk.download('averaged_perceptron_tagger')
    
_CLEAN_TEXT_RESOURCES = {}


def _clean_text_resources():
    """Return the (stop-word set, lemmatizer) pair, building it on first use."""
    if not _CLEAN_TEXT_RESOURCES:
        # a set gives constant-time membership tests, and the corpus list is
        # read once instead of once per cleaned document
        _CLEAN_TEXT_RESOURCES["stop"] = frozenset(stopwords.words('english'))
        _CLEAN_TEXT_RESOURCES["lemmatizer"] = WordNetLemmatizer()
    return _CLEAN_TEXT_RESOURCES["stop"], _CLEAN_TEXT_RESOURCES["lemmatizer"]


def clean_text(text):
    """Normalise one raw comment into a space-joined string of lemmas.

    Steps: lower-case, split on single spaces, strip leading/trailing
    punctuation from each token, drop tokens that contain a digit, are English
    stop words or are empty, POS-tag the remainder, lemmatize each token with
    the WordNet POS derived from its tag, and finally drop one-letter results.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. None or the NaN pandas uses for a
        missing cell) is treated as an empty document.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; "" when nothing survives.
    """
    if not isinstance(text, str):
        # missing values have no .lower(); treat them as empty text instead of crashing
        return ""
    stop, lemmatizer = _clean_text_resources()
    # tokenize text and remove punctuation
    tokens = [word.strip(string.punctuation) for word in text.lower().split(" ")]
    # remove empty tokens, stop words and words that contain numbers
    tokens = [
        word for word in tokens
        if word and word not in stop and not any(c.isdigit() for c in word)
    ]
    # pos tag text, then lemmatize every token according to its tag
    lemmas = [
        lemmatizer.lemmatize(word, get_wordnet_pos(tag))
        for word, tag in pos_tag(tokens)
    ]
    # remove words with only one letter and join all
    return " ".join(lemma for lemma in lemmas if len(lemma) > 1)

# build the cleaned-text column by running every raw comment through clean_text
reviews_df["commentBody_clean"] = reviews_df["commentBody"].apply(clean_text)


[nltk_data] Error loading averaged_perceptron_tagger: <urlopen error
[nltk_data]     [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify
[nltk_data]     failed: unable to get local issuer certificate
[nltk_data]     (_ssl.c:1056)>


In [44]:
# To clean textual data, we call our custom ‘clean_text’ function that performs several transformations:
#lower the text
#tokenize the text (split the text into words) and remove the punctuation
#remove useless words that contain numbers
#remove useless stop words like ‘the’, ‘a’, ‘this’ etc.
#Part-Of-Speech (POS) tagging: assign a tag to every word to define if it corresponds to a noun, a verb etc.; each tag is then mapped to the matching WordNet POS constant
#lemmatize the text: transform every word into its root form (e.g. rooms -> room, slept -> sleep)

In [45]:
# add sentiment analysis columns
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

sid = SentimentIntensityAnalyzer()
# score every raw comment; each result is a dict of scores
sentiment_scores = reviews_df["commentBody"].apply(lambda x: sid.polarity_scores(x))
# Expand the dicts into one column per score key in a single DataFrame construction.
# This replaces .apply(pd.Series), which builds a Series object for every row, and
# avoids writing and then dropping a temporary 'sentiments' column.
sentiment_df = pd.DataFrame(sentiment_scores.tolist(), index=reviews_df.index)
reviews_df = pd.concat([reviews_df, sentiment_df], axis=1)

[nltk_data] Error loading vader_lexicon: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1056)>


In [46]:
# We first start by adding sentiment analysis features because we can guess that readers' comments are highly linked to how they felt about the article they are commenting on. 
# We use Vader, which is a part of the NLTK module designed for sentiment analysis. 
# Vader uses a lexicon of words to find which ones are positive or negative. 
# It also takes into account the context of the sentences to determine the sentiment scores. 
# For each text, Vader returns 4 values:
#a neutrality score
#a positivity score
#a negativity score
#an overall score that summarizes the previous scores
#We will integrate those 4 values as features in our dataset.

In [47]:
# add number of characters column
reviews_df["nb_chars"] = reviews_df["commentBody_clean"].str.len()

# add number of words column
# split() without an argument yields no tokens for an empty string, so a comment that
# was cleaned down to "" counts as 0 words; len("".split(" ")) would report 1
reviews_df["nb_words"] = reviews_df["commentBody_clean"].str.split().str.len()

In [48]:
# Next, we add some simple metrics for every text:
#number of characters in the text
#number of words in the text

In [49]:
# create doc2vec vector columns
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# tokenise each cleaned comment on single spaces and tag it with its position
tokenized_comments = reviews_df["commentBody_clean"].apply(lambda x: x.split(" "))
documents = []
for position, tokens in enumerate(tokenized_comments):
    documents.append(TaggedDocument(tokens, [position]))

# train a Doc2Vec model with our text data
model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)

# infer a vector for every comment, then expand it into one column per component
inferred_vectors = tokenized_comments.apply(lambda tokens: model.infer_vector(tokens))
doc2vec_df = inferred_vectors.apply(pd.Series)
doc2vec_df.columns = list(map(lambda component: "doc2vec_vector_" + str(component), doc2vec_df.columns))
reviews_df = pd.concat([reviews_df, doc2vec_df], axis=1)

In [50]:
# The next step consists in extracting vector representations for every review. 
# The module Gensim creates a numerical vector representation of every word in the corpus by using the contexts in which they appear (Word2Vec). This is performed using shallow neural networks. What’s interesting is that similar words will have similar representation vectors.
# Each text can also be transformed into a numerical vector using the word vectors (Doc2Vec). 
# Similar texts will also have similar representations and that is why we can use those vectors as training features.
# We first have to train a Doc2Vec model by feeding in our text data. By applying this model on our reviews, 
# we can get those representation vectors.

In [51]:
# add tf-idfs columns
from sklearn.feature_extraction.text import TfidfVectorizer
# min_df = 10: only keep terms that occur in at least 10 documents
tfidf = TfidfVectorizer(min_df = 10)
tfidf_result = tfidf.fit_transform(reviews_df["commentBody_clean"]).toarray()
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when the
# installed version provides it and fall back to the old name otherwise
if hasattr(tfidf, "get_feature_names_out"):
    tfidf_terms = tfidf.get_feature_names_out()
else:
    tfidf_terms = tfidf.get_feature_names()
tfidf_df = pd.DataFrame(tfidf_result, columns = tfidf_terms)
tfidf_df.columns = ["word_" + str(x) for x in tfidf_df.columns]
# align on the comments' index so the column-wise concat pairs rows correctly
tfidf_df.index = reviews_df.index
reviews_df = pd.concat([reviews_df, tfidf_df], axis=1)

In [52]:
# Finally we add the TF-IDF (Term Frequency — Inverse Document Frequency) values for every word and every document.
# But why not simply counting how many times each word appears in every document? 
# The problem with this method is that it doesn’t take into account the relative importance of words in the texts. 
# A word that appears in almost every text would not likely bring useful information for analysis. 
# On the contrary, rare words may carry a lot more meaning.
# The TF-IDF metric solves this problem:
# TF computes the classic number of times the word appears in the text
# IDF computes the relative importance of this word which depends on how many texts the word can be found
# We add TF-IDF columns for every word that appears in at least 10 different texts to filter some of them and reduce the size of the final output.

In [53]:
# Spot-check ten randomly chosen rows of the enriched frame
# (no random_state is passed, so the selection changes on every run).
reviews_df.sample(10)

Unnamed: 0,approveDate,articleID,articleWordCount,commentBody,commentID,commentSequence,commentTitle,commentType,createDate,depth,...,word_yet,word_york,word_yorker,word_yorkers,word_you,word_young,word_your,word_youth,word_zero,word_zone
960,1520003388,5a97cf5d410cf7000162e9b7,1391,Louvre: €15<br/>Brooklyn Mus: $16<br/>Philadel...,26174956.0,26174956.0,<br/>,comment,1519957142,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4271,1519936219,5a981947410cf7000162eaa2,1556,The most elastic component of any economic sys...,26171145.0,26171145.0,<br/>,comment,1519936213,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5687,1519961971,5a984b45410cf7000162eb58,1115,Maybe Melania sees a threat: Hicks may be her...,26174921.0,26174921.0,<br/>,comment,1519956912,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8347,1519995106,5a98961c410cf7000162ec46,1217,So much for not being afraid of the NRA. Such ...,26177961.0,26177961.0,<br/>,comment,1519994617,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2244,1519939512,5a97d9ca410cf7000162e9da,896,How completely absurd (and trite) to categoriz...,26168836.0,26168836.0,<br/>,comment,1519927003,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2810,1519969015,5a97eb6d410cf7000162ea14,2102,I read this book about 10 times as a kid. I re...,26176271.0,26176271.0,<br/>,comment,1519969011,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5281,1519957496,5a98465d410cf7000162eb49,1711,I graduated college in 2009 and I never worrie...,26173574.0,26173574.0,<br/>,comment,1519948105,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8602,1520003933,5a989bd9410cf7000162ec56,1383,"""Lawmakers need to harden some of these norms ...",26177170.0,26177170.0,<br/>,comment,1519988269,1.0,...,0.156482,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8790,1520095994,5a98a82e410cf7000162ec77,1764,These are the clear signs of a crumbling democ...,26192761.0,26192761.0,<br/>,comment,1520080914,1.0,...,0.089708,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3511,1519934707,5a97ff69410cf7000162ea4a,1510,Washington wants absolute global military supr...,26170092.0,26170092.0,<br/>,comment,1519931793,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [54]:
from functools import reduce
def make_ngram(df, col, N):
    """Return every N-gram found in column ``col`` of ``df`` as one flat list.

    Each text has '↵' replaced by a space and '|' / ':' removed, is split on
    whitespace, and is passed to ``nltk.ngrams``; every resulting gram is
    emitted as its tokens joined with '_'. Grams appear in row order.

    Parameters
    ----------
    df : mapping of column name to an object exposing ``.values``
        In this notebook, a pandas DataFrame whose ``col`` holds strings.
    col : str
        Name of the text column.
    N : int
        Size of the n-grams.

    Returns
    -------
    list of str
        Empty when the column has no rows.
    """
    # A single flat comprehension is linear in the number of grams; the previous
    # reduce(list.__add__, ...) copied the accumulated list once per row.
    return [
        "_".join(gram)
        for text in df[col].values
        for gram in nltk.ngrams(
            text.replace('↵', ' ').replace('|', '').replace(':', '').split(), N
        )
    ]

In [None]:
# flatten the cleaned comments into one Series of 2-word grams and one of 3-word grams
bigram, trigram = (
    pd.Series(make_ngram(reviews_df, "commentBody_clean", size)) for size in (2, 3)
)

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image

In [None]:
# Bar charts of the 20 most frequent bigrams (top panel) and trigrams (bottom panel).
fig, (ax1, ax2) = plt.subplots(2,1, figsize=(10,8), sharex=True)
# Series.value_counts() replaces the top-level pd.value_counts(), which is deprecated
# in pandas 2.1+; [::-1] reverses the 20 rows before they are handed to barh
bigram.value_counts().head(20)[::-1].plot.barh(ax=ax1, rot=0)
trigram.value_counts().head(20)[::-1].plot.barh(ax=ax2, rot=0)
ax1.set_title('Bigram')
ax2.set_title('Trigram')
ax2.set_xlabel("N-gram counts");

In [None]:
# wordcloud function

from wordcloud import WordCloud
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image

In [None]:
def show_wordcloud(data, title = 'Post-booking messages Word Cloud'):
    """Render a word cloud for ``data`` with matplotlib.

    Parameters
    ----------
    data : str or iterable
        Either a single string, or an iterable of texts (e.g. a pandas Series
        or a list); the items of an iterable are converted with ``str`` and
        joined with single spaces. Non-iterable values fall back to ``str(data)``.
    title : str or None
        Figure title; a falsy value draws no title.
    """
    if isinstance(data, str):
        text = data
    else:
        try:
            items = iter(data)
        except TypeError:
            text = str(data)
        else:
            # str(series) is only the printed repr of a pandas Series (truncated to a
            # few rows for long data, plus index labels and the name/dtype footer), so
            # the cloud must be built from the joined items
            text = " ".join(str(item) for item in items)

    wordcloud = WordCloud(
        background_color = 'white',
        max_words = 100,
        max_font_size = 45,
        scale = 25,
        random_state = 1
    ).generate(text)

    fig = plt.figure(1, figsize = (20, 10))
    plt.axis('off')
    if title:
        fig.suptitle(title, fontsize = 20)
        fig.subplots_adjust(top = 0.9)

    plt.imshow(wordcloud)
    plt.show()
    
    # print wordcloud

In [None]:
# draw one cloud per corpus: the cleaned comments (default title), then the bigrams and trigrams
for corpus, extra_kwargs in (
    (reviews_df["commentBody_clean"], {}),
    (bigram, {"title": "bigram"}),
    (trigram, {"title": "Trigram"}),
):
    show_wordcloud(corpus, **extra_kwargs)

In [None]:
# Most of the words are indeed related to the hotels: room, staff, breakfast, etc. 
# Some words are more related to the customer experience with the hotel stay: perfect, loved, expensive, dislike, etc.


In [None]:
# ten highest "pos" scores among comments with at least 5 words (cleaned text shown)
min_words_mask = reviews_df["nb_words"] >= 5
ranked_by_pos = reviews_df[min_words_mask].sort_values("pos", ascending = False)
ranked_by_pos[["commentBody_clean", "pos"]].head(10)


In [None]:
# twenty highest "neg" scores among comments with at least 5 words (raw text shown)
min_words_mask = reviews_df["nb_words"] >= 5
ranked_by_neg = reviews_df[min_words_mask].sort_values("neg", ascending = False)
ranked_by_neg[["commentBody", "neg"]].head(20)


In [None]:
# Some errors can be found among the most negative reviews: 
# Vader sometimes interprets ‘no’ or ‘nothing’ as negative words whereas they are sometimes used to say that there were no problems with the hotel. 
# Fortunately, most of the reviews are indeed bad ones.