In [1]:
import pandas as pd

In [2]:
# read data
df = pd.read_csv("Resources/DisneylandReviews.csv", encoding='ISO-8859-1')
reviews_df = df[df["Year_Month"] != "missing"]

In [3]:
# split year_month to year and month columns
reviews_df[['Year','Month']] = reviews_df["Year_Month"].str.split("-",expand=True,)
# delete year_month column
del reviews_df['Year_Month']
reviews_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Unnamed: 0,Review_ID,Rating,Reviewer_Location,Review_Text,Branch,Year,Month
0,670772142,4,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong,2019,4
1,670682799,4,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong,2019,5
2,670623270,4,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong,2019,4
3,670607911,4,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong,2019,4
4,670607296,4,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong,2019,4


In [4]:
# create the label
reviews_df["is_good_review"] = reviews_df["Rating"].apply(lambda x: 1 if x > 3 else 0)
reviews_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Review_ID,Rating,Reviewer_Location,Review_Text,Branch,Year,Month,is_good_review
0,670772142,4,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong,2019,4,1
1,670682799,4,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong,2019,5,1
2,670623270,4,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong,2019,4,1
3,670607911,4,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong,2019,4,1
4,670607296,4,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong,2019,4,1


In [5]:
# get a sample
reviews_df = reviews_df.sample(frac = 0.1, replace = False, random_state=42)

In [6]:
# return the wordnet object value corresponding to the POS tag
from nltk.corpus import wordnet

def get_wordnet_pos(pos_tag):
    if pos_tag.startswith('J'):
        return wordnet.ADJ
    elif pos_tag.startswith('V'):
        return wordnet.VERB
    elif pos_tag.startswith('N'):
        return wordnet.NOUN
    elif pos_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [7]:
import string
import nltk
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer

def clean_text(text):
    # lower text
    text = text.lower()
    # tokenize text and remove puncutation
    text = [word.strip(string.punctuation) for word in text.split(" ")]
    # remove words that contain numbers
    text = [word for word in text if not any(c.isdigit() for c in word)]
    # remove stop words
    stop = stopwords.words('english')
    text = [x for x in text if x not in stop]
    # remove empty tokens
    text = [t for t in text if len(t) > 0]
    # pos tag text
    pos_tags = pos_tag(text)
    # lemmatize text
    text = [WordNetLemmatizer().lemmatize(t[0], get_wordnet_pos(t[1])) for t in pos_tags]
    # remove words with only one letter
    text = [t for t in text if len(t) > 1]
    # join all
    text = " ".join(text)
    return(text)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jessicaramosmolina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/jessicaramosmolina/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jessicaramosmolina/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
# clean text data
reviews_df["review_clean"] = reviews_df["Review_Text"].apply(lambda x: clean_text(x))
reviews_df.head()

Unnamed: 0,Review_ID,Rating,Reviewer_Location,Review_Text,Branch,Year,Month,is_good_review,review_clean
27680,122944085,5,United States,Spotlessly clean and so organized considering ...,Disneyland_California,2011,10,1,spotlessly clean organize consider thousand vi...
35899,290851203,5,United Kingdom,"Pricey but worth it, unfortunately the big thu...",Disneyland_Paris,2015,7,1,pricey worth unfortunately big thunder mountai...
23932,168087311,4,Canada,"Disneyland is a magical place. Everyone, parti...",Disneyland_California,2013,7,1,disneyland magical place everyone particularly...
36122,282417360,5,Pakistan,Disney land is a must when visiting Paris. The...,Disneyland_Paris,2015,6,1,disney land must visit paris frozen show best ...
8028,160953212,3,India,"The HK Disneyland is definitely small, but it'...",Disneyland_HongKong,2012,9,0,hk disneyland definitely small definitely fun ...


In [9]:
# add sentiment anaylsis columns
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

sid = SentimentIntensityAnalyzer()
reviews_df["sentiments"] = reviews_df["Review_Text"].apply(lambda x: sid.polarity_scores(x))
reviews_df = pd.concat([reviews_df.drop(['sentiments'], axis=1), reviews_df['sentiments'].apply(pd.Series)], axis=1)


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/jessicaramosmolina/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [10]:
# THIS IS THE CODE I ADDED
# ***********************
# HIGHER POLARITY = more positive
# on a scale of -1 to 1, 0 is neutral

In [11]:
# THIS PROVIDES THE POLARITY COLUMN
from textblob import TextBlob

reviews_df['polarity'] = reviews_df['review_clean'].map(lambda text: TextBlob(text).sentiment.polarity)


ModuleNotFoundError: No module named 'textblob'

In [None]:
# add number of characters column
reviews_df["characters"] = reviews_df["Review_Text"].apply(lambda x: len(x))
# copy_df1["characters"] = copy_df["Review_Text"].apply(lambda x: len(x))

# add number of words column
reviews_df["words"] = reviews_df["Review_Text"].apply(lambda x: len(x.split(" ")))
# copy_df1["words"] = copy_df["Review_Text"].apply(lambda x: len(x.split(" ")))


In [None]:
# create doc2vec vector columns
!pip install gensim
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(reviews_df["review_clean"].apply(lambda x: x.split(" ")))]

# train a Doc2Vec model with our text data
model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)

# transform each document into a vector data
doc2vec_df = reviews_df["review_clean"].apply(lambda x: model.infer_vector(x.split(" "))).apply(pd.Series)
doc2vec_df.columns = ["doc2vec_vector_" + str(x) for x in doc2vec_df.columns]
reviews_df = pd.concat([reviews_df, doc2vec_df], axis=1)

In [None]:
# add tf-idfs columns
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(min_df = 10)
tfidf_result = tfidf.fit_transform(reviews_df["review_clean"]).toarray()
tfidf_df = pd.DataFrame(tfidf_result, columns = tfidf.get_feature_names())
tfidf_df.columns = ["word_" + str(x) for x in tfidf_df.columns]
tfidf_df.index = reviews_df.index
reviews_df = pd.concat([reviews_df, tfidf_df], axis=1)
reviews_df.head()

In [None]:
reviews_df.to_csv('Resources/ReviewAnalysisRyan.csv', index=False)

In [None]:
# show is_good_review distribution
reviews_df["is_good_review"].value_counts(normalize = True)

In [None]:
# wordcloud function

!pip install wordcloud
from wordcloud import WordCloud
import matplotlib.pyplot as plt

def show_wordcloud(data, title = None):
    wordcloud = WordCloud(
        background_color = 'white',
        max_words = 200,
        max_font_size = 40, 
        scale = 3,
        random_state = 42
    ).generate(str(data))

    fig = plt.figure(1, figsize = (20, 20))
    plt.axis('off')
    if title: 
        fig.suptitle(title, fontsize = 20)
        fig.subplots_adjust(top = 2.3)

    plt.imshow(wordcloud)
    plt.show()
    
# print wordcloud
show_wordcloud(reviews_df["Review_Text"])

In [None]:
# highest positive sentiment reviews (with more than 5 words)
reviews_df[reviews_df["words"] >= 5].sort_values("pos", ascending = False)[["Review_Text", "pos", "polarity", "Rating"]].head(10)

In [None]:
# lowest negative sentiment reviews (with more than 5 words)
reviews_df[reviews_df["words"] >= 5].sort_values("neg", ascending = False)[["Review_Text", "neg", "polarity", "Rating"]].head(10)

In [None]:
# SORTING BY POLARITY (POSITIVE)
reviews_df[reviews_df["words"] >= 5].sort_values("polarity", ascending = False)[["Review_Text", "pos", "polarity", "Rating"]].head(10)

In [None]:
# SORTING BY POLARITY (NEGATIVE)
reviews_df[reviews_df["words"] >= 5].sort_values("polarity", ascending = True)[["Review_Text", "neg", "polarity", "Rating", "compound"]].head(10)

In [None]:
# plot sentiment distribution for positive and negative reviews

import seaborn as sns

for x in [0, 1]:
    subset = reviews_df[reviews_df['is_good_review'] == x]
    
    # Draw the density plot
    if x == 0:
        label = "Bad reviews"
    else:
        label = "Good reviews"
    sns.distplot(subset['compound'], hist = False, label = label)

In [None]:
# feature selection
label = "is_good_review"
ignore_cols = [label, "Review_Text", "review_clean", "Reviewer_Location","Branch",""]
features = [c for c in reviews_df.columns if c not in ignore_cols]

# split the data into train and test
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC 



X_train, X_test, y_train, y_test = train_test_split(reviews_df[features], reviews_df[label], test_size = 0.20, random_state = 42)


In [None]:
reviews_df.dtypes

In [None]:
# train a random forest classifier
rf = RandomForestClassifier(n_estimators = 100, random_state = 42)
rf.fit(X_train, y_train)


In [None]:
 rf.score(X_train, y_train), rf.score(X_test, y_test)

In [None]:
predictions = rf.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

In [None]:
# show feature importance
feature_importances_df = pd.DataFrame({"feature": features, "importance": rf.feature_importances_}).sort_values("importance", ascending = False)
feature_importances_df.head(20)

In [None]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
clf.score(X_train, y_train), clf.score(X_test, y_test)

In [None]:
train_scores = []
test_scores = []
for k in range(1, 20, 2):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    train_score = knn.score(X_train, y_train)
    test_score = knn.score(X_test, y_test)
    all_score = train_score + test_score
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}, {all_score:.3f}")

In [None]:
model = SVC(kernel='linear')
model.fit(X_train, y_train)
model.score(X_train, y_train),model.score(X_test, y_test)

In [None]:
# ROC curve

from sklearn.metrics import roc_curve, auc, roc_auc_score
import matplotlib.pyplot as plt

y_pred = [x[1] for x in rf.predict_proba(X_test)]
fpr, tpr, thresholds = roc_curve(y_test, y_pred, pos_label = 1)

roc_auc = auc(fpr, tpr)

plt.figure(1, figsize = (15, 10))
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()

In [None]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=5)
kmeans.fit(X_train)
predicted_clusters = kmeans.predict(X_test)
predicted_clusters[0]
y_test.tolist()[0]