In [2]:
import pandas as pd

In [3]:
# Load the raw reviews; the file is decoded as ISO-8859-1 instead of pandas' default UTF-8.
reviews_df = pd.read_csv(
    "DisneylandReviews.csv",
    encoding='ISO-8859-1',
)

In [4]:
# preview the first rows of the freshly loaded reviews
reviews_df.head()

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch
0,670772142,4,2019-4,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong
1,670682799,4,2019-5,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong
2,670623270,4,2019-4,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong
3,670607911,4,2019-4,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong
4,670607296,4,2019-4,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong


In [5]:
# Break "Year_Month" (e.g. "2019-4") on the dash into separate "Year" and "Month" columns.
# str.split yields string pieces, so both new columns hold text, not numbers.
reviews_df[["Year", "Month"]] = (
    reviews_df["Year_Month"].str.split("-", expand=True)
)
# The combined column is redundant once split; remove it from the frame in place.
reviews_df.drop(columns=["Year_Month"], inplace=True)
reviews_df.head()

Unnamed: 0,Review_ID,Rating,Reviewer_Location,Review_Text,Branch,Year,Month
0,670772142,4,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong,2019,4
1,670682799,4,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong,2019,5
2,670623270,4,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong,2019,4
3,670607911,4,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong,2019,4
4,670607296,4,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong,2019,4


In [6]:
# same preview as the end of the previous cell (no transformation happens in between)
reviews_df.head()

Unnamed: 0,Review_ID,Rating,Reviewer_Location,Review_Text,Branch,Year,Month
0,670772142,4,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong,2019,4
1,670682799,4,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong,2019,5
2,670623270,4,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong,2019,4
3,670607911,4,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong,2019,4
4,670607296,4,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong,2019,4


In [7]:
# create the label: 1 when the rating is above 3, otherwise 0
is_good = reviews_df["Rating"] > 3
reviews_df["is_good_review"] = is_good.astype("int64")
reviews_df.head()

Unnamed: 0,Review_ID,Rating,Reviewer_Location,Review_Text,Branch,Year,Month,is_good_review
0,670772142,4,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong,2019,4,1
1,670682799,4,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong,2019,5,1
2,670623270,4,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong,2019,4,1
3,670607911,4,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong,2019,4,1
4,670607296,4,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong,2019,4,1


In [8]:
# Work on a reproducible 10% random sample, drawn without replacement (the pandas default).
# NOTE: this rebinds reviews_df, so re-running the cell samples 10% of the already-sampled frame.
reviews_df = reviews_df.sample(frac=0.1, random_state=42)

In [9]:
# return the wordnet object value corresponding to the POS tag
from nltk.corpus import wordnet

def get_wordnet_pos(pos_tag):
    """Map a POS tag string to the corresponding WordNet POS constant.

    Only the first character of ``pos_tag`` is significant:
    ``J`` -> ``wordnet.ADJ``, ``V`` -> ``wordnet.VERB``,
    ``N`` -> ``wordnet.NOUN``, ``R`` -> ``wordnet.ADV``.
    Any other tag (including an empty string) falls back to ``wordnet.NOUN``.
    """
    prefix_to_wordnet = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # pos_tag[:1] is '' for an empty tag, which simply misses the table.
    return prefix_to_wordnet.get(pos_tag[:1], wordnet.NOUN)

In [10]:
import string
import nltk
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer

def clean_text(text):
    """Normalise a raw review into a space-joined string of lemmatised words.

    Steps, in order: lower-case; split on whitespace; strip leading/trailing
    punctuation from every token (punctuation inside a token is kept); drop
    tokens containing a digit; drop English stop words and empty tokens;
    POS-tag; lemmatise each token with the WordNet category returned by
    ``get_wordnet_pos``; drop one-character results; join with single spaces.

    Parameters
    ----------
    text : str
        Raw review text. A non-string value (for example ``None`` or a float
        NaN) is treated as an empty review.

    Returns
    -------
    str
        The cleaned text, or ``""`` when no token survives the filters.
    """
    # A non-string value would crash on .lower(); treat it as "no text".
    if not isinstance(text, str):
        return ""
    # lower text
    text = text.lower()
    # tokenize text and remove punctuation; split() with no argument breaks on
    # any whitespace run, so tab- or newline-separated words are not fused
    tokens = [word.strip(string.punctuation) for word in text.split()]
    # remove words that contain numbers
    tokens = [word for word in tokens if not any(c.isdigit() for c in word)]
    # remove stop words and empty tokens (a set gives O(1) membership tests)
    stop = set(stopwords.words('english'))
    tokens = [t for t in tokens if t and t not in stop]
    if not tokens:
        return ""
    # pos tag text
    pos_tags = pos_tag(tokens)
    # lemmatize text, reusing a single lemmatizer for the whole review
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in pos_tags]
    # remove words with only one letter
    lemmas = [t for t in lemmas if len(t) > 1]
    # join all
    return " ".join(lemmas)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sherman/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/sherman/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/sherman/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [11]:
# clean text data: run clean_text over every raw review and keep the result in a new column
raw_reviews = reviews_df["Review_Text"]
reviews_df["review_clean"] = raw_reviews.apply(clean_text)

In [12]:
# preview the sampled frame, now including the review_clean column
reviews_df.head()

Unnamed: 0,Review_ID,Rating,Reviewer_Location,Review_Text,Branch,Year,Month,is_good_review,review_clean
31092,540713188,5,Malta,Disneyland is so beautiful and large.To see al...,Disneyland_Paris,2017,9,1,disneyland beautiful large.to see need stay le...
28105,119781124,1,Canada,"The lines for rides are too long. Yes, the fas...",Disneyland_California,2011,10,0,line ride long yes fast pass help little stand...
1121,576395715,5,Australia,Loved Hong Kong Disneyland although it is much...,Disneyland_HongKong,2018,4,1,love hong kong disneyland although much small ...
17687,310041955,5,United States,Love Disneyland! We are annual pass holders an...,Disneyland_California,2015,9,1,love disneyland annual pas holder love nostalg...
23059,184009554,4,United States,The California Adventure Park is much improved...,Disneyland_California,2013,11,1,california adventure park much improve additio...


In [13]:
# add sentiment analysis columns (VADER scores computed on the raw review text)
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

sid = SentimentIntensityAnalyzer()
# polarity_scores returns one dict of scores per review; build the score frame
# straight from those dicts instead of expanding row by row with .apply(pd.Series),
# which is very slow on larger frames.
sentiment_df = pd.DataFrame(
    [sid.polarity_scores(review) for review in reviews_df["Review_Text"]],
    index=reviews_df.index,  # keep the sampled frame's index so concat aligns row for row
)
# Drop score columns left over from an earlier run so re-executing the cell
# replaces them instead of appending duplicates.
reviews_df = pd.concat(
    [reviews_df.drop(columns=sentiment_df.columns, errors="ignore"), sentiment_df],
    axis=1,
)


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/sherman/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [14]:
# preview the frame with the sentiment score columns appended
reviews_df.head()

Unnamed: 0,Review_ID,Rating,Reviewer_Location,Review_Text,Branch,Year,Month,is_good_review,review_clean,neg,neu,pos,compound
31092,540713188,5,Malta,Disneyland is so beautiful and large.To see al...,Disneyland_Paris,2017,9,1,disneyland beautiful large.to see need stay le...,0.0,0.626,0.374,0.967
28105,119781124,1,Canada,"The lines for rides are too long. Yes, the fas...",Disneyland_California,2011,10,0,line ride long yes fast pass help little stand...,0.059,0.767,0.174,0.928
1121,576395715,5,Australia,Loved Hong Kong Disneyland although it is much...,Disneyland_HongKong,2018,4,1,love hong kong disneyland although much small ...,0.057,0.573,0.37,0.9186
17687,310041955,5,United States,Love Disneyland! We are annual pass holders an...,Disneyland_California,2015,9,1,love disneyland annual pas holder love nostalg...,0.0,0.669,0.331,0.9165
23059,184009554,4,United States,The California Adventure Park is much improved...,Disneyland_California,2013,11,1,california adventure park much improve additio...,0.0,0.831,0.169,0.9663


In [15]:
# add number of characters column
reviews_df["characters"] = reviews_df["Review_Text"].str.len()

# add number of words column
# split() with no separator breaks on any run of whitespace, so repeated spaces,
# tabs or newlines do not produce empty strings that would inflate the count.
reviews_df["words"] = reviews_df["Review_Text"].str.split().str.len()

In [16]:
# preview the frame with the characters and words columns appended
reviews_df.head()

Unnamed: 0,Review_ID,Rating,Reviewer_Location,Review_Text,Branch,Year,Month,is_good_review,review_clean,neg,neu,pos,compound,characters,words
31092,540713188,5,Malta,Disneyland is so beautiful and large.To see al...,Disneyland_Paris,2017,9,1,disneyland beautiful large.to see need stay le...,0.0,0.626,0.374,0.967,210,40
28105,119781124,1,Canada,"The lines for rides are too long. Yes, the fas...",Disneyland_California,2011,10,0,line ride long yes fast pass help little stand...,0.059,0.767,0.174,0.928,557,112
1121,576395715,5,Australia,Loved Hong Kong Disneyland although it is much...,Disneyland_HongKong,2018,4,1,love hong kong disneyland although much small ...,0.057,0.573,0.37,0.9186,160,27
17687,310041955,5,United States,Love Disneyland! We are annual pass holders an...,Disneyland_California,2015,9,1,love disneyland annual pas holder love nostalg...,0.0,0.669,0.331,0.9165,155,27
23059,184009554,4,United States,The California Adventure Park is much improved...,Disneyland_California,2013,11,1,california adventure park much improve additio...,0.0,0.831,0.169,0.9663,577,109


In [None]:
# create doc2vec vector columns
!pip install gensim
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Tokenise each cleaned review once; the same token lists feed both training and inference.
tokenized_reviews = [review.split(" ") for review in reviews_df["review_clean"]]
# Each document is tagged with its position in the (sampled) frame.
documents = [TaggedDocument(tokens, [i]) for i, tokens in enumerate(tokenized_reviews)]

# train a Doc2Vec model with our text data
model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)

# transform each document into a vector; build the frame in one go from the list of
# vectors instead of expanding row by row with .apply(pd.Series), which is much slower
doc2vec_df = pd.DataFrame(
    [model.infer_vector(tokens) for tokens in tokenized_reviews],
    index=reviews_df.index,  # align with the sampled frame's (non-contiguous) index
)
doc2vec_df.columns = ["doc2vec_vector_" + str(x) for x in doc2vec_df.columns]
# Drop vector columns left over from an earlier run so re-executing the cell
# replaces them instead of appending duplicates.
reviews_df = pd.concat(
    [reviews_df.drop(columns=doc2vec_df.columns, errors="ignore"), doc2vec_df],
    axis=1,
)



In [None]:
# preview the frame with the doc2vec_vector_* columns appended
reviews_df.head()

In [None]:
# add tf-idf columns (one "word_<term>" column per term that occurs in at least 10 reviews)
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(min_df = 10)
tfidf_result = tfidf.fit_transform(reviews_df["review_clean"]).toarray()
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when it exists and fall back to the old name on older installations.
if hasattr(tfidf, "get_feature_names_out"):
    vocabulary = tfidf.get_feature_names_out()
else:
    vocabulary = tfidf.get_feature_names()
tfidf_df = pd.DataFrame(
    tfidf_result,
    columns=["word_" + str(term) for term in vocabulary],
    index=reviews_df.index,  # align with the sampled frame's index for the concat below
)
# Drop tf-idf columns left over from an earlier run so re-executing the cell
# replaces them instead of appending duplicates.
reviews_df = pd.concat(
    [reviews_df.drop(columns=tfidf_df.columns, errors="ignore"), tfidf_df],
    axis=1,
)
reviews_df.head()

In [None]:
# show is_good_review distribution as fractions of the sample (normalize=True),
# i.e. the share of good (1) versus bad (0) reviews
reviews_df["is_good_review"].value_counts(normalize = True)

In [None]:
# wordcloud function

!pip install wordcloud
from wordcloud import WordCloud
import matplotlib.pyplot as plt

def show_wordcloud(data, title = None):
    """Render a word cloud for ``data`` with matplotlib.

    Parameters
    ----------
    data : str or iterable
        Either one block of text, or an iterable of texts (for example a
        pandas Series of reviews). An iterable is joined with spaces so that
        every element contributes its full text. A non-iterable object is
        converted with ``str()``.
    title : str, optional
        Figure title; when given it is drawn above the cloud.
    """
    if isinstance(data, str):
        text = data
    else:
        # str(Series) is only pandas' truncated preview (clipped values, index
        # labels, dtype footer), so join the actual elements instead.
        try:
            text = " ".join(str(item) for item in data)
        except TypeError:
            # not iterable: fall back to the plain string form
            text = str(data)

    wordcloud = WordCloud(
        background_color = 'white',
        max_words = 200,
        max_font_size = 40,
        scale = 3,
        random_state = 42
    ).generate(text)

    fig = plt.figure(1, figsize = (20, 20))
    plt.axis('off')
    if title:
        fig.suptitle(title, fontsize = 20)
        fig.subplots_adjust(top = 2.3)

    plt.imshow(wordcloud)
    plt.show()
    
# draw a word cloud from the raw review text column (no title)
show_wordcloud(reviews_df["Review_Text"])

In [None]:
# highest positive sentiment reviews (restricted to reviews with at least 5 words,
# the same cut-off used for the most-negative listing)
reviews_df[reviews_df["words"] >= 5].sort_values("pos", ascending = False)[["Review_Text", "pos"]].head(10)

In [None]:
# most negative sentiment reviews: the 10 highest "neg" scores among reviews with at least 5 words
(
    reviews_df.loc[reviews_df["words"] >= 5, ["Review_Text", "neg"]]
    .sort_values("neg", ascending=False)
    .head(10)
)

In [None]:
# plot sentiment distribution for positive and negative reviews

import seaborn as sns

# One density curve of the VADER compound score per class.
# sns.distplot is deprecated; kdeplot is the replacement for distplot(hist=False).
for flag, legend_label in [(0, "Bad reviews"), (1, "Good reviews")]:
    subset = reviews_df[reviews_df['is_good_review'] == flag]

    # Draw the density plot (both curves land on the same current axes)
    ax = sns.kdeplot(subset['compound'], label = legend_label)

# The labels are only shown once a legend is requested explicitly.
ax.set(xlabel = "compound", ylabel = "Density")
ax.legend();

In [None]:
# feature selection
label = "is_good_review"
# Columns that must not be fed to the model:
#   - the label itself, and "Rating": the label is defined as Rating > 3, so keeping
#     Rating would leak the target into the features
#   - "Review_ID": an identifier, not a property of the review
#   - the text / string columns, which the classifier cannot consume as-is
ignore_cols = [label, "Rating", "Review_ID", "Review_Text", "review_clean", "Reviewer_Location", "Branch"]
# Keep numeric columns only, so any other string-valued column (e.g. the "Year" / "Month"
# pieces produced by str.split) cannot break the fit; convert such columns to numbers
# upstream if they should become features.
numeric_cols = reviews_df.select_dtypes(include="number").columns
features = [c for c in numeric_cols if c not in ignore_cols]

# split the data into train and test (80% / 20%, fixed seed for reproducibility)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(reviews_df[features], reviews_df[label], test_size = 0.20, random_state = 42)


In [None]:
# list the dtype of every column (useful for spotting non-numeric columns before fitting)
reviews_df.dtypes

In [None]:
# train a random forest classifier
# 100 trees; random_state is fixed so repeated runs build the same forest
rf = RandomForestClassifier(n_estimators = 100, random_state = 42)
# fit on the training split only; X_test / y_test are not touched here
rf.fit(X_train, y_train)
