# 1.Load data

In [30]:
# List down the columns in the dataset and the number of instances. 
import nltk
import csv
import pandas as pd
import gensim
from gensim import corpora
import operator
import re
import pyLDAvis
import pyLDAvis.gensim_models


df = pd.read_csv('DisneylandReviews.csv',encoding="latin-1")

print("The number of instances:", len(df))
df.head(3)

The number of instances: 42656


Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch
0,670772142,4,2019-4,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong
1,670682799,4,2019-5,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong
2,670623270,4,2019-4,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong


In [31]:
df1 = df.sample(n=10000, random_state=21) 
df1.reset_index(drop=True, inplace=True)

print("The number of selected instances:", len(df1))
df1.head(3)

The number of selected instances: 10000


Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch
0,162692522,4,2013-5,United Kingdom,The park is still great and more enjoyable for...,Disneyland_Paris
1,159916316,4,2013-4,United States,If you are looking for something different and...,Disneyland_California
2,315887295,4,2015-9,United States,We flew to Hong Kong on our way to the Philipp...,Disneyland_HongKong


# 2.Preprocess

In [32]:
# Convert all text from the “Review_Text” column to lowercase.
df1['Review_Text'] = df1['Review_Text'].str.lower()

In [113]:
# remove hashtags

def remove_hashtags_accounts_urls(text):
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    text = re.sub(r'\d', ' ', text)

    
    return text.strip()  

df1['CleanedReview'] = df1['Review_Text'].apply(remove_hashtags_accounts_urls)


df1.head(4)
df1.iloc[:4, 6:9]

Unnamed: 0,CleanedReview,TokenizedReview,StopwordRemovedReview
0,the park is still great and more enjoyable for...,"[the, park, is, still, great, and, more, enjoy...","[park, still, great, enjoyable, second, kids, ..."
1,if you are looking for something different and...,"[if, you, are, looking, for, something, differ...","[looking, something, different, want, entertai..."
2,we flew to hong kong on our way to the philipp...,"[we, flew, to, hong, kong, on, our, way, to, t...","[flew, way, philippines, since, never, decided..."
3,here are pictures of the original mickey and m...,"[here, are, pictures, of, the, original, micke...","[pictures, original, mickey, minnie, yes, marr..."


In [67]:
# Using NLTK in to tokenize the review text. 

df1['TokenizedReview']=''

from nltk.tokenize import WordPunctTokenizer
tok = WordPunctTokenizer()

def token(review):
    return [x for x  in tok.tokenize(review) if len(x) > 1]

df1['TokenizedReview'] = df1['CleanedReview'].apply(token)

df1.head(3)

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch,CleanedReview,TokenizedReview,StopwordRemovedReview
0,162692522,4,2013-5,United Kingdom,the park is still great and more enjoyable for...,Disneyland_Paris,the park is still great and more enjoyable for...,"[the, park, is, still, great, and, more, enjoy...","[park, still, great, enjoyable, second, kids, ..."
1,159916316,4,2013-4,United States,if you are looking for something different and...,Disneyland_California,if you are looking for something different and...,"[if, you, are, looking, for, something, differ...","[looking, something, different, want, entertai..."
2,315887295,4,2015-9,United States,we flew to hong kong on our way to the philipp...,Disneyland_HongKong,we flew to hong kong on our way to the philipp...,"[we, flew, to, hong, kong, on, our, way, to, t...","[flew, way, philippines, since, never, decided..."


In [75]:
df1['StopwordRemovedReview']=''

from nltk.corpus import stopwords
# nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
original_stop_words =  set(stopwords.words('english'))
additional_words = ['disney','disneyland','disneylands','world','rides','ride','day','time','visit','one','hk','hong','kong','paris','florida','california','us','would','could','told','went''said','say','also','even','got']
for word in additional_words:
    stop_words.add(word)

def stopword(review):
    tokens = tok.tokenize(review)
    return [x for x in tokens if x not in stop_words]
    
df1['StopwordRemovedReview'] = df1['CleanedReview'].apply(stopword)

df1.head(3)


Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch,CleanedReview,TokenizedReview,StopwordRemovedReview
0,162692522,4,2013-5,United Kingdom,the park is still great and more enjoyable for...,Disneyland_Paris,the park is still great and more enjoyable for...,"[the, park, is, still, great, and, more, enjoy...","[park, still, great, enjoyable, second, kids, ..."
1,159916316,4,2013-4,United States,if you are looking for something different and...,Disneyland_California,if you are looking for something different and...,"[if, you, are, looking, for, something, differ...","[looking, something, different, want, entertai..."
2,315887295,4,2015-9,United States,we flew to hong kong on our way to the philipp...,Disneyland_HongKong,we flew to hong kong on our way to the philipp...,"[we, flew, to, hong, kong, on, our, way, to, t...","[flew, way, philippines, since, never, decided..."


# 3.Topic Modeling

In [76]:
def topic_modeling(data,num_topics,no_below=5, no_above=.2, keep_n=5000,passes=10):
    # data = df[text_]
    # display(df.head(5))

    dictionary = corpora.Dictionary(data)
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n)
    
    corpus = [dictionary.doc2bow(text) for text in data]

    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                               id2word=dictionary,
                                               num_topics=num_topics, 
                                               passes=passes,
                                               alpha='auto')
    lda_display = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
    for i in range(num_topics):
        print("topic %s:\t%s" % (i, ' '.join([term for term, freq in lda_model.show_topic(i, topn=20)])))
    return lda_display,lda_model,dictionary

In [82]:
lda_display,lda_model,dictionary = topic_modeling(df1['StopwordRemovedReview'],4,10,0.3,4000,300)

topic 0:	food hotel queue fast tickets queues pass take good ticket early around expensive pm minutes want water eat restaurants buy
topic 1:	great place kids fun really see parade good much fireworks old characters went loved show love experience well worth amazing
topic 2:	people times line staff wait lines many like parks pass back first fast closed way long crowds trip years going
topic 3:	mountain space star pirates thunder jones indiana land buzz big coaster haunted caribbean roller mansion peter main splash wars small


In [134]:
pyLDAvis.display(lda_display)

In [135]:
# Infer topics for a new document
def topic_infer(i):
    new_doc = df1['StopwordRemovedReview'][i]
    print('Choosen review is: ',df1['Review_Text'][i])
#     print(new_doc)
    new_doc_bow = dictionary.doc2bow(new_doc)
#     print(new_doc_bow)
    tp= lda_model.get_document_topics(new_doc_bow)
    print('The review topic is: ',tp)
    return get_max_value(tp)

In [136]:
a = topic_infer(27)
a

Choosen review is:  what can i say ... we all know it's disney, there was queues for everything. the place it's very clean and tidy and the staff polite, from security, to ticket sales and rides staff.the rides are feeling a bit aged ... but it's fun if you like to queue for 50 mins for 5 mins fun.the worst thing ..... everything it's stupidly expensive. they have a monopoly, but why rip off people and families who want to have a fun day. bag of crisps 5 euro, burger meals for 2 adults, two tots over 50 euros !!!!!!have fun ... but hey it won't be cheap.
The review topic is:  [(0, 0.40574405), (1, 0.3783089), (2, 0.21057084)]


0

# 4.Rating prediction

In [137]:
y_train = df1["Rating"][:10000]
train_text = df1["Review_Text"][:10000]

In [138]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, mutual_info_classif

vectorizer_uni = CountVectorizer(ngram_range=(1, 1))
new_train_uni = vectorizer_uni.fit_transform(train_text)

selector = SelectKBest(mutual_info_classif, k=4000)
new_train_selected = selector.fit_transform(new_train_uni, y_train)


print("Train feature space before filtering:", new_train_uni.shape)
print("Train feature space after filtering:", new_train_selected.shape)

Train feature space before filtering: (10000, 21273)
Train feature space after filtering: (10000, 4000)


In [139]:
# import relevant libraries
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, auc
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import preprocessing
from sklearn.metrics import classification_report, confusion_matrix 

In [140]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
import numpy as np
from sklearn.pipeline import Pipeline

skf = StratifiedKFold(n_splits=5)

print(skf)

i=0
f1s_nb = []
accuracy_scores = []


for train_index, test_index in skf.split(new_train_selected, y_train):

    X_train = new_train_uni[train_index]
    X_test = new_train_uni[test_index]
    Y_train = y_train[train_index]
    Y_test = y_train[test_index]
    
    top2k_naive_bayes = MultinomialNB()
    top2k_naive_bayes.fit(X_train, Y_train)
    predictions_nb = top2k_naive_bayes.predict(X_test)
    
    accuracy = accuracy_score(Y_test, predictions_nb)
    accuracy_scores.append(accuracy)
    
    f1_nb = f1_score(Y_test, predictions_nb, average = "weighted")
    f1s_nb.append(f1_nb)

    # to see all the hyper parameters
    print()
    print("Accuracy score at fold ", i+1  ," for NB: ", accuracy_score(Y_test, predictions_nb))
    print("Precision score at fold ", i+1  ," for NB: ", precision_score(Y_test, predictions_nb, average="weighted"))
    print("Recall score at fold ", i+1  ," for NB: ", recall_score(Y_test, predictions_nb, average = "weighted"))
    print("F1 score at fold ", i+1  ," for NB: ", f1_score(Y_test, predictions_nb, average = "weighted"))
    i+=1
    print(".........................\n")
    

print(f'F1 score of NB: {f1s_nb}')

mean_accuracy = np.mean(accuracy_scores)
standard_error = np.std(accuracy_scores) / np.sqrt(len(accuracy_scores))
z_score = 1.959
confidence_interval = z_score * standard_error
print(f'Accuracy score of NB: {accuracy_scores}')
print(f"Mean accuracy: {mean_accuracy}")
print(f"95% confidence interval: ({mean_accuracy - confidence_interval}, {mean_accuracy + confidence_interval})")

StratifiedKFold(n_splits=5, random_state=None, shuffle=False)

Accuracy score at fold  1  for NB:  0.592
Precision score at fold  1  for NB:  0.5378722277365123
Recall score at fold  1  for NB:  0.592
F1 score at fold  1  for NB:  0.5460110396813985
.........................


Accuracy score at fold  2  for NB:  0.59
Precision score at fold  2  for NB:  0.5300116215478108
Recall score at fold  2  for NB:  0.59
F1 score at fold  2  for NB:  0.5479622638062301
.........................


Accuracy score at fold  3  for NB:  0.5885
Precision score at fold  3  for NB:  0.5357705878160514
Recall score at fold  3  for NB:  0.5885
F1 score at fold  3  for NB:  0.541165394632271
.........................


Accuracy score at fold  4  for NB:  0.5665
Precision score at fold  4  for NB:  0.49526717515061075
Recall score at fold  4  for NB:  0.5665
F1 score at fold  4  for NB:  0.5179404548281508
.........................


Accuracy score at fold  5  for NB:  0.5805
Precision score at fold  5  for 

In [141]:
#bigram selector training
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, mutual_info_classif

vectorizer_bi = CountVectorizer(ngram_range=(1, 2))
new_train_bi = vectorizer_bi.fit_transform(train_text)

selector_bi = SelectKBest(mutual_info_classif, k=6000)

In [142]:
new_train_selected_bi = selector_bi.fit_transform(new_train_bi, y_train)


print("Train feature space before filtering:", new_train_bi.shape)
print("Train feature space after filtering:", new_train_selected_bi.shape)

Train feature space before filtering: (10000, 315579)
Train feature space after filtering: (10000, 6000)


In [147]:
skf = StratifiedKFold(n_splits=5)

print(skf)

i=0
f1s_nb = []
accuracy_scores = []


for train_index, test_index in skf.split(new_train_selected_bi, y_train):

    X_train = new_train_bi[train_index]
    X_test = new_train_bi[test_index]
    Y_train = y_train[train_index]
    Y_test = y_train[test_index]
    
    top6k_naive_bayes = MultinomialNB()
    top6k_naive_bayes.fit(X_train, Y_train)
    predictions_nb = top6k_naive_bayes.predict(X_test)
    
    accuracy = accuracy_score(Y_test, predictions_nb)
    accuracy_scores.append(accuracy)
    
    f1_nb = f1_score(Y_test, predictions_nb, average = "weighted")
    f1s_nb.append(f1_nb)

    # to see all the hyper parameters
    print()
    print("Accuracy score at fold ", i+1  ," for NB: ", accuracy_score(Y_test, predictions_nb))
    print("Precision score at fold ", i+1  ," for NB: ", precision_score(Y_test, predictions_nb, average="weighted"))
    print("Recall score at fold ", i+1  ," for NB: ", recall_score(Y_test, predictions_nb, average = "weighted"))
    print("F1 score at fold ", i+1  ," for NB: ", f1_score(Y_test, predictions_nb, average = "weighted"))
    i+=1
    print(".........................\n")
    

print(f'F1 score of NB: {f1s_nb}')

mean_accuracy = np.mean(accuracy_scores)
standard_error = np.std(accuracy_scores) / np.sqrt(len(accuracy_scores))
z_score = 1.959
confidence_interval = z_score * standard_error
print(f'Accuracy score of NB: {accuracy_scores}')
print(f"Mean accuracy: {mean_accuracy}")
print(f"95% confidence interval: ({mean_accuracy - confidence_interval}, {mean_accuracy + confidence_interval})")

StratifiedKFold(n_splits=5, random_state=None, shuffle=False)

Accuracy score at fold  1  for NB:  0.553
Precision score at fold  1  for NB:  0.4580948792772625
Recall score at fold  1  for NB:  0.553
F1 score at fold  1  for NB:  0.4382351555771909
.........................


Accuracy score at fold  2  for NB:  0.5555
Precision score at fold  2  for NB:  0.3987027251864601
Recall score at fold  2  for NB:  0.5555
F1 score at fold  2  for NB:  0.4438142550877083
.........................


Accuracy score at fold  3  for NB:  0.557
Precision score at fold  3  for NB:  0.44072769550297997
Recall score at fold  3  for NB:  0.557
F1 score at fold  3  for NB:  0.4418316073304797
.........................


Accuracy score at fold  4  for NB:  0.555
Precision score at fold  4  for NB:  0.4216624620458543
Recall score at fold  4  for NB:  0.555
F1 score at fold  4  for NB:  0.4373600645809102
.........................


Accuracy score at fold  5  for NB:  0.5615
Precision score at fold  5  for

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# 5.Sentence Topic

In [149]:
from nltk.tokenize import sent_tokenize

# Sample long paragraph
paragraph = "This is a long paragraph. It contains multiple sentences. We need to split it into individual sentences. Pandas can help with that."

def to_sentences(text):
    return sent_tokenize(text)

df1['Review_Sentence'] = df1['Review_Text'].apply(to_sentences)
df1.head(4)

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch,CleanedReview,TokenizedReview,StopwordRemovedReview,Review_Sentence,Sentence_Token_list,Sentence_Topic_Rating,Sentence_Rating,Rating_by_Topic
0,162692522,4,2013-5,United Kingdom,the park is still great and more enjoyable for...,Disneyland_Paris,the park is still great and more enjoyable for...,"[the, park, is, still, great, and, more, enjoy...","[park, still, great, enjoyable, second, kids, ...",[the park is still great and more enjoyable fo...,"[[the, park, is, still, great, and, more, enjo...","[1, 0, 0, 0, 2, 3, 2, 0, 0, 1, 2, 1]","[5, 4, 4, 4, 4, 5, 4, 4, 5, 4, 1, 5]","[4.2, 4.666666666666667, 3.0, 5.0]"
1,159916316,4,2013-4,United States,if you are looking for something different and...,Disneyland_California,if you are looking for something different and...,"[if, you, are, looking, for, something, differ...","[looking, something, different, want, entertai...",[if you are looking for something different an...,"[[if, you, are, looking, for, something, diffe...","[1, 1]","[5, 4]","[None, 4.5, None, None]"
2,315887295,4,2015-9,United States,we flew to hong kong on our way to the philipp...,Disneyland_HongKong,we flew to hong kong on our way to the philipp...,"[we, flew, to, hong, kong, on, our, way, to, t...","[flew, way, philippines, since, never, decided...",[we flew to hong kong on our way to the philip...,"[[we, flew, to, hong, kong, on, our, way, to, ...","[2, 1, 1, 0, 0, 1, 1, 1, 0, 2, 1, 0, 2, 1, 2, ...","[5, 5, 5, 5, 4, 5, 5, 4, 4, 4, 5, 5, 5, 4, 5, ...","[4.5, 4.647058823529412, 4.285714285714286, 4.0]"
3,169740488,5,2012-10,United States,here are pictures of the original mickey and m...,Disneyland_California,here are pictures of the original mickey and m...,"[here, are, pictures, of, the, original, micke...","[pictures, original, mickey, minnie, yes, marr...",[here are pictures of the original mickey and ...,"[[here, are, pictures, of, the, original, mick...","[1, 1]","[5, 5]","[None, 5.0, None, None]"


In [150]:
# Tokenize review sentence list
def sentence_tokenize(sentences):
    sentence_token_list = []
    for sentence in sentences:
        sentence_token_list.append(token(sentence))
    return sentence_token_list

In [155]:
df1['Sentence_Token_list'] = ''
df1['Sentence_Token_list'] = df1['Review_Sentence'].apply(sentence_tokenize)
df1.head(4)

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch,CleanedReview,TokenizedReview,StopwordRemovedReview,Review_Sentence,Sentence_Token_list,Sentence_Topic_Rating,Sentence_Rating,Rating_by_Topic
0,162692522,4,2013-5,United Kingdom,the park is still great and more enjoyable for...,Disneyland_Paris,the park is still great and more enjoyable for...,"[the, park, is, still, great, and, more, enjoy...","[park, still, great, enjoyable, second, kids, ...",[the park is still great and more enjoyable fo...,"[[the, park, is, still, great, and, more, enjo...","[1, 0, 0, 0, 2, 3, 2, 0, 0, 1, 2, 1]","[5, 4, 4, 4, 4, 5, 4, 4, 5, 4, 1, 5]","[4.2, 4.666666666666667, 3.0, 5.0]"
1,159916316,4,2013-4,United States,if you are looking for something different and...,Disneyland_California,if you are looking for something different and...,"[if, you, are, looking, for, something, differ...","[looking, something, different, want, entertai...",[if you are looking for something different an...,"[[if, you, are, looking, for, something, diffe...","[1, 1]","[5, 4]","[None, 4.5, None, None]"
2,315887295,4,2015-9,United States,we flew to hong kong on our way to the philipp...,Disneyland_HongKong,we flew to hong kong on our way to the philipp...,"[we, flew, to, hong, kong, on, our, way, to, t...","[flew, way, philippines, since, never, decided...",[we flew to hong kong on our way to the philip...,"[[we, flew, to, hong, kong, on, our, way, to, ...","[2, 1, 1, 0, 0, 1, 1, 1, 0, 2, 1, 0, 2, 1, 2, ...","[5, 5, 5, 5, 4, 5, 5, 4, 4, 4, 5, 5, 5, 4, 5, ...","[4.5, 4.647058823529412, 4.285714285714286, 4.0]"
3,169740488,5,2012-10,United States,here are pictures of the original mickey and m...,Disneyland_California,here are pictures of the original mickey and m...,"[here, are, pictures, of, the, original, micke...","[pictures, original, mickey, minnie, yes, marr...",[here are pictures of the original mickey and ...,"[[here, are, pictures, of, the, original, mick...","[1, 1]","[5, 5]","[None, 5.0, None, None]"


In [157]:
# Infer topics for a new document
def sentence_topic_infer(sentences):
    topic_list = []
    for sentence in sentences:
        new_doc = sentence
        new_doc_bow = dictionary.doc2bow(new_doc)
        tp= lda_model.get_document_topics(new_doc_bow)
        topic_list.append(get_max_value(tp))
    return topic_list
def get_max_value(possibility_pairs):
    max_pair = max(possibility_pairs, key=lambda pair: pair[1])
    return max_pair[0]


       

In [158]:
df1['Sentence_Topic_Rating'] = ''
df1['Sentence_Topic_Rating'] = df1['Sentence_Token_list'].apply(sentence_topic_infer)
df1.head(5)

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch,CleanedReview,TokenizedReview,StopwordRemovedReview,Review_Sentence,Sentence_Token_list,Sentence_Topic_Rating,Sentence_Rating,Rating_by_Topic
0,162692522,4,2013-5,United Kingdom,the park is still great and more enjoyable for...,Disneyland_Paris,the park is still great and more enjoyable for...,"[the, park, is, still, great, and, more, enjoy...","[park, still, great, enjoyable, second, kids, ...",[the park is still great and more enjoyable fo...,"[[the, park, is, still, great, and, more, enjo...","[1, 0, 0, 0, 2, 3, 2, 0, 0, 1, 2, 1]","[5, 4, 4, 4, 4, 5, 4, 4, 5, 4, 1, 5]","[4.2, 4.666666666666667, 3.0, 5.0]"
1,159916316,4,2013-4,United States,if you are looking for something different and...,Disneyland_California,if you are looking for something different and...,"[if, you, are, looking, for, something, differ...","[looking, something, different, want, entertai...",[if you are looking for something different an...,"[[if, you, are, looking, for, something, diffe...","[1, 1]","[5, 4]","[None, 4.5, None, None]"
2,315887295,4,2015-9,United States,we flew to hong kong on our way to the philipp...,Disneyland_HongKong,we flew to hong kong on our way to the philipp...,"[we, flew, to, hong, kong, on, our, way, to, t...","[flew, way, philippines, since, never, decided...",[we flew to hong kong on our way to the philip...,"[[we, flew, to, hong, kong, on, our, way, to, ...","[2, 1, 1, 0, 0, 1, 1, 1, 0, 2, 1, 0, 2, 1, 2, ...","[5, 5, 5, 5, 4, 5, 5, 4, 4, 4, 5, 5, 5, 4, 5, ...","[4.5, 4.647058823529412, 4.285714285714286, 4.0]"
3,169740488,5,2012-10,United States,here are pictures of the original mickey and m...,Disneyland_California,here are pictures of the original mickey and m...,"[here, are, pictures, of, the, original, micke...","[pictures, original, mickey, minnie, yes, marr...",[here are pictures of the original mickey and ...,"[[here, are, pictures, of, the, original, mick...","[1, 1]","[5, 5]","[None, 5.0, None, None]"
4,296488289,4,2015-8,Japan,the park is very organized and there are thing...,Disneyland_California,the park is very organized and there are thing...,"[the, park, is, very, organized, and, there, a...","[park, organized, things, matter, wether, big,...",[the park is very organized and there are thin...,"[[the, park, is, very, organized, and, there, ...","[1, 1]","[5, 5]","[None, 5.0, None, None]"


# 6.Sentence Rating

In [159]:
def sentence_rating(sentences):
    return top2k_naive_bayes.predict(vectorizer_uni.transform(sentences))

In [160]:
df1['Sentence_Rating'] = ''
df1['Sentence_Rating'] = df1['Review_Sentence'].apply(sentence_rating)
df1.head(4)

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch,CleanedReview,TokenizedReview,StopwordRemovedReview,Review_Sentence,Sentence_Token_list,Sentence_Topic_Rating,Sentence_Rating,Rating_by_Topic
0,162692522,4,2013-5,United Kingdom,the park is still great and more enjoyable for...,Disneyland_Paris,the park is still great and more enjoyable for...,"[the, park, is, still, great, and, more, enjoy...","[park, still, great, enjoyable, second, kids, ...",[the park is still great and more enjoyable fo...,"[[the, park, is, still, great, and, more, enjo...","[1, 0, 0, 0, 2, 3, 2, 0, 0, 1, 2, 1]","[5, 4, 4, 4, 4, 5, 4, 4, 5, 4, 1, 5]","[4.2, 4.666666666666667, 3.0, 5.0]"
1,159916316,4,2013-4,United States,if you are looking for something different and...,Disneyland_California,if you are looking for something different and...,"[if, you, are, looking, for, something, differ...","[looking, something, different, want, entertai...",[if you are looking for something different an...,"[[if, you, are, looking, for, something, diffe...","[1, 1]","[5, 4]","[None, 4.5, None, None]"
2,315887295,4,2015-9,United States,we flew to hong kong on our way to the philipp...,Disneyland_HongKong,we flew to hong kong on our way to the philipp...,"[we, flew, to, hong, kong, on, our, way, to, t...","[flew, way, philippines, since, never, decided...",[we flew to hong kong on our way to the philip...,"[[we, flew, to, hong, kong, on, our, way, to, ...","[2, 1, 1, 0, 0, 1, 1, 1, 0, 2, 1, 0, 2, 1, 2, ...","[5, 5, 5, 5, 4, 5, 5, 4, 4, 4, 5, 5, 5, 4, 5, ...","[4.5, 4.647058823529412, 4.285714285714286, 4.0]"
3,169740488,5,2012-10,United States,here are pictures of the original mickey and m...,Disneyland_California,here are pictures of the original mickey and m...,"[here, are, pictures, of, the, original, micke...","[pictures, original, mickey, minnie, yes, marr...",[here are pictures of the original mickey and ...,"[[here, are, pictures, of, the, original, mick...","[1, 1]","[5, 5]","[None, 5.0, None, None]"


# 7.Rating by Topic

In [169]:
def topic_avg_rating(topiclist,ratinglist):
    rating_sum = [0,0,0,0]
    rating_count = [0,0,0,0]
    for i in range(len(topiclist)):
        rating_sum[topiclist[i]] += ratinglist[i]
        rating_count[topiclist[i]]+=1
    rating = [None,None,None,None]
    for i in range(len(rating)):
        if rating_count[i] != 0:
            rating[i] = rating_sum[i]/rating_count[i]
    return rating


In [170]:
df1['Rating_by_Topic'] = ''
df1['Rating_by_Topic'] = df1.apply(lambda row: topic_avg_rating(row['Sentence_Topic_Rating'], row['Sentence_Rating']), axis=1)

df1.head(3)

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch,CleanedReview,TokenizedReview,StopwordRemovedReview,Review_Sentence,Sentence_Token_list,Sentence_Topic_Rating,Sentence_Rating,Rating_by_Topic
0,162692522,4,2013-5,United Kingdom,the park is still great and more enjoyable for...,Disneyland_Paris,the park is still great and more enjoyable for...,"[the, park, is, still, great, and, more, enjoy...","[park, still, great, enjoyable, second, kids, ...",[the park is still great and more enjoyable fo...,"[[the, park, is, still, great, and, more, enjo...","[1, 0, 0, 0, 2, 3, 2, 0, 0, 1, 2, 1]","[5, 4, 4, 4, 4, 5, 4, 4, 5, 4, 1, 5]","[4.2, 4.666666666666667, 3.0, 5.0]"
1,159916316,4,2013-4,United States,if you are looking for something different and...,Disneyland_California,if you are looking for something different and...,"[if, you, are, looking, for, something, differ...","[looking, something, different, want, entertai...",[if you are looking for something different an...,"[[if, you, are, looking, for, something, diffe...","[1, 1]","[5, 4]","[None, 4.5, None, None]"
2,315887295,4,2015-9,United States,we flew to hong kong on our way to the philipp...,Disneyland_HongKong,we flew to hong kong on our way to the philipp...,"[we, flew, to, hong, kong, on, our, way, to, t...","[flew, way, philippines, since, never, decided...",[we flew to hong kong on our way to the philip...,"[[we, flew, to, hong, kong, on, our, way, to, ...","[2, 1, 1, 0, 0, 1, 1, 1, 0, 2, 1, 0, 2, 1, 2, ...","[5, 5, 5, 5, 4, 5, 5, 4, 4, 4, 5, 5, 5, 4, 5, ...","[4.5, 4.647058823529412, 4.285714285714286, 4.0]"


In [171]:
import pandas as pd

df2 = pd.DataFrame(df1['Rating_by_Topic'].tolist())
df2['Branch'] = df1['Branch']

average_scores = df2.groupby(['Branch']).mean()
average_scores

Unnamed: 0_level_0,0,1,2,3
Branch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Disneyland_California,4.702678,4.901188,4.572554,4.827409
Disneyland_HongKong,4.496395,4.772419,4.391607,4.655626
Disneyland_Paris,4.348546,4.777462,4.105455,4.558417
