# Topic Modeling

In [7]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from nltk.corpus import stopwords
import gensim.corpora as corpora
from pprint import pprint
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
from matplotlib import pyplot as plt

In [2]:
# Read the three review CSVs and label every row with the dataset it came from.
cali_df = pd.read_csv("cali_df.csv").assign(name='cali')
hk_df = pd.read_csv("reviews_hk.csv").assign(name='hk')
jap_df = pd.read_csv("reviews_jp.csv").assign(name='japan')

def clean(df):
    """Normalise the ``rating`` and ``date`` columns of a review frame in place.

    * ``rating`` is reduced to its first character and converted to ``int``.
    * ``date`` loses everything up to (and including) its first space; the
      remainder is parsed with the format ``'%B %d, %Y'``.

    The frame passed in is modified and the very same object is returned.
    """
    # First character of the textual rating, then cast to integer.
    df['rating'] = df['rating'].astype(str).str.get(0)
    df['rating'] = df['rating'].astype(int)

    # Keep only the part after the first space, e.g. "December 30, 2021".
    df['date'] = df['date'].str.split(' ', n=1, expand=True)[1]
    parsed_dates = list(
        map(lambda stamp: datetime.strptime(stamp, '%B %d, %Y'), df['date'])
    )
    df['date'] = parsed_dates
    return df

# clean() modifies the frame it receives and returns that same object, so after
# these calls `hongk_df` and `hk_df` are two names for one cleaned DataFrame.
jap_df = clean(jap_df)
# cali_df is not passed through clean(); the call is left commented out.
#cali_df = clean(cali_df)
hongk_df = clean(hk_df)

import re
def review_cleaning(text):
    """Reduce a Series of strings to lowercase ASCII words separated by single spaces.

    ``text`` must support the pandas ``.str`` accessor. For every element:
    ASCII digits are deleted, each run of characters that are not ASCII
    letters (punctuation, whitespace, underscores, non-ASCII characters)
    becomes one space, and the result is lowercased. Leading/trailing runs
    therefore leave a single space; nothing is stripped.
    """
    return (
        text.str.replace(r'[0-9]+', '', regex=True)       # drop digits outright
            .str.replace(r'[^a-zA-Z]+', ' ', regex=True)  # collapse every other run
            .str.lower()
    )

# Clean the review bodies of the two frames that still use lowercase column names.
for frame in (jap_df, hk_df):
    frame['review'] = review_cleaning(frame['review'])
# The cali review text is not cleaned here (call left disabled):
#cali_df['Review'] = review_cleaning(cali_df['Review'])

# Bring hk/japan column names in line with the capitalised names used below.
column_map = {"title":"Header", "review":"Review", "rating":"Rating", "date":"Date"}
for frame in (hk_df, jap_df):
    frame.rename(columns=column_map, inplace=True)

# Clean the titles of all three frames.
for frame in (jap_df, cali_df, hk_df):
    frame['Header'] = review_cleaning(frame['Header'])

# Stack everything, renumber the rows, then drop repeated (Header, Review) pairs.
df_all = pd.concat([hk_df, jap_df, cali_df]).reset_index(drop=True)
df_all = df_all.drop_duplicates(subset=['Header', 'Review'], keep='first')

# Ratings 4-5 -> "Positive", 1-3 -> "Negative", anything else stays "".
is_positive = df_all["Rating"].isin([4, 5])
is_negative = df_all["Rating"].isin([1, 2, 3])
df_all["sentiment"] = np.where(is_positive, "Positive", "")
df_all["sentiment"] = np.where(is_negative, "Negative", df_all["sentiment"])

# Calendar features derived from the review date.
df_all['Date'] = pd.to_datetime(df_all['Date'])
df_all["Month"] = df_all['Date'].dt.month
df_all["Year"] = df_all['Date'].dt.year

# Every row starts as "Winter"; the listed months overwrite that default.
df_all["Season"] = "Winter"
for season, months in (("Fall", [9, 10, 11]), ("Summer", [6, 7, 8]), ("Spring", [3, 4, 5])):
    df_all["Season"] = np.where(df_all["Month"].isin(months), season, df_all["Season"])
df_all

Unnamed: 0,Header,Review,Rating,Date,name,sentiment,Month,Year,Season
0,a disney christmas indulge in disney s festive...,christmas is right around the corner and noth...,4,2021-12-30,hk,Positive,12,2021,Winter
1,disappointed,it wss christmas day but most of the outdoor s...,1,2021-12-27,hk,Negative,12,2021,Winter
2,disappointing terrible unbelievable even for r...,i guess there s no control on the limit of the...,1,2021-12-26,hk,Negative,12,2021,Winter
3,not a magical day,wifi and network are not stable ticketing and ...,2,2021-12-26,hk,Negative,12,2021,Winter
4,very long queuing time terrible experience,unacceptable queueing time not only for the ga...,1,2021-12-23,hk,Negative,12,2021,Winter
...,...,...,...,...,...,...,...,...,...
20874,like a big comfort blanket,i have just returned from a family holiday to ...,5,2017-04-21,cali,Positive,4,2017,Spring
20875,enchanting experience,the last time we visited disneyland was a few ...,5,2017-04-20,cali,Positive,4,2017,Spring
20876,a magical day at disneyland,this was our rd trip to disneyland and every t...,5,2017-04-20,cali,Positive,4,2017,Spring
20877,dance tour,disney was as magical as ever and did not disa...,5,2017-04-20,cali,Positive,4,2017,Spring


In [224]:
#!pip install gensim
import gensim
from gensim.utils import simple_preprocess
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from gensim.models import Phrases
# Stop-word list = NLTK's English list + terms specific to this review corpus.
_nltk_stop_words = stopwords.words('english')
_domain_stop_words = ['Disney', 'Disneyland','disney','cast','member','members','highly','recommended',
                      're', 'edu', 'use','disneyland','hong','kong','japan','Los Angeles','Hong','Kong',
                      'Japan','los angeles','Los angeles','los angeles','universal','Universal','studio','Studio',
                      'birthday','ice','cream','years','year','halloween','Halloween','per','fast','pass','person',
                      'east','coast','make','sure','year','old','theme park','happiest','plan','expectations','formed',
                      'theme','Theme','park',"would","recommend", "even","though","well","better",
                      "day","back","come","main","california","mountain","first","felt","feel","angeles","china","looking",'forward',
                      #"story","king","star","jones","man"
                      ]
# The tokens compared against this list come from gensim's simple_preprocess,
# which yields lowercase single words. A capitalised or multi-word entry
# ('Disney', 'Los Angeles', 'theme park') can therefore never match as written,
# so every entry is lowercased and split on whitespace. dict.fromkeys removes
# duplicates while keeping first-seen order; the result is still a plain list.
stop_words = list(dict.fromkeys(
    token
    for entry in _nltk_stop_words + _domain_stop_words
    for token in entry.lower().split()
))
def sent_to_words(sentences):
    """Lazily tokenise each item of ``sentences``.

    Every item is converted with ``str()`` and passed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True`` (accent marks are
    stripped); one token list is yielded per item, in input order.
    """
    tokenize = gensim.utils.simple_preprocess
    yield from (tokenize(str(sentence), deacc=True) for sentence in sentences)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/taeholee/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [132]:
# Positive reviews: rated above 3
df_all_pos = df_all.loc[df_all["Rating"] > 3]

# Negative reviews: rated 3 or below
df_all_neg = df_all.loc[df_all["Rating"] <= 3]

In [146]:
# Topic model over the bigram phrases found in the titles (Header) of positive reviews.
alldata = df_all_pos.Header.values.tolist()
alldata_words = list(sent_to_words(alldata))

# Phrase detector fitted on the tokenised texts; a higher threshold yields fewer phrases.
allbigram = gensim.models.Phrases(alldata_words, min_count=4, threshold=100)
allbigram_mod = gensim.models.phrases.Phraser(allbigram)


def remove_stopwords(texts):
    """Re-tokenise every document in ``texts`` and drop tokens found in ``stop_words``."""
    # Build the lookup once: set membership instead of scanning the list per token.
    blocked = set(stop_words)
    return [[word for word in simple_preprocess(str(doc))
             if word not in blocked] for doc in texts]


def jpmake_bigrams(texts):
    """Apply the current ``allbigram_mod`` phraser to each tokenised document."""
    return [allbigram_mod[doc] for doc in texts]


# NOTE: the phraser is fitted on the unfiltered tokens but applied after
# stop-word removal, so words separated only by stop words are neighbours here.
alldata_words_nostops = remove_stopwords(alldata_words)
alldata_words_bigrams = jpmake_bigrams(alldata_words_nostops)

# Keep only merged phrases (tokens containing "_"); documents without any are
# skipped. No stop-word test is needed at this point: stop words were already
# removed above, and looking the list up per token only re-read it needlessly.
allbigramlist = []
for doc_tokens in alldata_words_bigrams:
    phrases = [token for token in doc_tokens if "_" in token]
    if phrases:
        allbigramlist.append(phrases)

# Create Dictionary
allid2word = corpora.Dictionary(allbigramlist)
# Create Corpus
alltexts = allbigramlist
# Term Document Frequency
allcorpus = [allid2word.doc2bow(text) for text in alltexts]

# number of topics
num_topics = 4
# Build LDA model (random_state seeds the initialisation so reruns are comparable)
alllda_model = gensim.models.LdaMulticore(corpus=allcorpus,
                                       id2word=allid2word,
                                       num_topics=num_topics,
                                       random_state=42)
# Print the top keywords of every topic
pprint(alllda_model.print_topics())
alldoc_lda = alllda_model[allcorpus]

pyLDAvis.enable_notebook()
jpvis = gensimvis.prepare(alllda_model, allcorpus, allid2word)
jpvis

[(0,
  '0.365*"spring_break" + 0.133*"customer_service" + 0.133*"lion_king" + '
  '0.086*"fairy_tale" + 0.059*"wish_upon" + 0.037*"every_minute" + '
  '0.035*"gluten_free" + 0.035*"grown_ups" + 0.019*"star_wars" + '
  '0.014*"bucket_list"'),
 (1,
  '0.329*"star_wars" + 0.187*"spring_break" + 0.164*"bucket_list" + '
  '0.156*"galaxy_edge" + 0.088*"every_minute" + 0.016*"fairy_tale" + '
  '0.015*"customer_service" + 0.005*"toy_story" + 0.005*"never_disappoints" + '
  '0.005*"gluten_free"'),
 (2,
  '0.207*"th_anniversary" + 0.158*"fairy_tale" + 0.157*"comes_alive" + '
  '0.132*"nothing_beats" + 0.108*"grown_ups" + 0.059*"gluten_free" + '
  '0.035*"wish_upon" + 0.029*"spring_break" + 0.020*"star_wars" + '
  '0.016*"bucket_list"'),
 (3,
  '0.258*"never_disappoints" + 0.209*"pleasantly_surprised" + '
  '0.186*"toy_story" + 0.098*"gluten_free" + 0.052*"wish_upon" + '
  '0.046*"star_wars" + 0.032*"grown_ups" + 0.025*"spring_break" + '
  '0.013*"bucket_list" + 0.012*"fairy_tale"')]


  default_term_info = default_term_info.sort_values(


In [157]:
# Topic model over the bigram phrases found in the titles (Header) of negative reviews.
alldata = df_all_neg.Header.values.tolist()
alldata_words = list(sent_to_words(alldata))

# Phrase detector fitted on the tokenised texts; a higher threshold yields fewer phrases.
allbigram = gensim.models.Phrases(alldata_words, min_count=10, threshold=10)
allbigram_mod = gensim.models.phrases.Phraser(allbigram)


def remove_stopwords(texts):
    """Re-tokenise every document in ``texts`` and drop tokens found in ``stop_words``."""
    # Build the lookup once: set membership instead of scanning the list per token.
    blocked = set(stop_words)
    return [[word for word in simple_preprocess(str(doc))
             if word not in blocked] for doc in texts]


def jpmake_bigrams(texts):
    """Apply the current ``allbigram_mod`` phraser to each tokenised document."""
    return [allbigram_mod[doc] for doc in texts]


# NOTE: the phraser is fitted on the unfiltered tokens but applied after
# stop-word removal, so words separated only by stop words are neighbours here.
alldata_words_nostops = remove_stopwords(alldata_words)
alldata_words_bigrams = jpmake_bigrams(alldata_words_nostops)

# Keep only merged phrases (tokens containing "_"); documents without any are
# skipped. No stop-word test is needed at this point: stop words were already
# removed above, and looking the list up per token only re-read it needlessly.
allbigramlist = []
for doc_tokens in alldata_words_bigrams:
    phrases = [token for token in doc_tokens if "_" in token]
    if phrases:
        allbigramlist.append(phrases)

# Create Dictionary
allid2word = corpora.Dictionary(allbigramlist)
# Create Corpus
alltexts = allbigramlist
# Term Document Frequency
allcorpus = [allid2word.doc2bow(text) for text in alltexts]

# number of topics
num_topics = 3
# Build LDA model (random_state seeds the initialisation so reruns are comparable)
alllda_model = gensim.models.LdaMulticore(corpus=allcorpus,
                                       id2word=allid2word,
                                       num_topics=num_topics,
                                       random_state=42)
# Print the top keywords of every topic
pprint(alllda_model.print_topics())
alldoc_lda = alllda_model[allcorpus]

pyLDAvis.enable_notebook()
jpvis = gensimvis.prepare(alllda_model, allcorpus, allid2word)
jpvis

[(0,
  '0.540*"many_people" + 0.319*"long_lines" + 0.058*"customer_service" + '
  '0.055*"long_queues" + 0.028*"young_children"'),
 (1,
  '0.450*"long_queues" + 0.306*"young_children" + 0.106*"long_lines" + '
  '0.073*"many_people" + 0.066*"customer_service"'),
 (2,
  '0.573*"customer_service" + 0.213*"long_lines" + 0.088*"many_people" + '
  '0.065*"long_queues" + 0.061*"young_children"')]


  default_term_info = default_term_info.sort_values(


In [178]:
# Topic model over the bigram phrases found in the text (Review) of positive reviews.
alldata = df_all_pos.Review.values.tolist()
alldata_words = list(sent_to_words(alldata))

# Phrase detector fitted on the tokenised texts; a higher threshold yields fewer phrases.
allbigram = gensim.models.Phrases(alldata_words, min_count=3, threshold=10)
allbigram_mod = gensim.models.phrases.Phraser(allbigram)


def remove_stopwords(texts):
    """Re-tokenise every document in ``texts`` and drop tokens found in ``stop_words``."""
    # Build the lookup once: set membership instead of scanning the list per token.
    blocked = set(stop_words)
    return [[word for word in simple_preprocess(str(doc))
             if word not in blocked] for doc in texts]


def jpmake_bigrams(texts):
    """Apply the current ``allbigram_mod`` phraser to each tokenised document."""
    return [allbigram_mod[doc] for doc in texts]


# NOTE: the phraser is fitted on the unfiltered tokens but applied after
# stop-word removal, so words separated only by stop words are neighbours here.
alldata_words_nostops = remove_stopwords(alldata_words)
alldata_words_bigrams = jpmake_bigrams(alldata_words_nostops)

# Keep only merged phrases (tokens containing "_"); documents without any are
# skipped. No stop-word test is needed at this point: stop words were already
# removed above, and looking the list up per token only re-read it needlessly.
allbigramlist = []
for doc_tokens in alldata_words_bigrams:
    phrases = [token for token in doc_tokens if "_" in token]
    if phrases:
        allbigramlist.append(phrases)

# Create Dictionary
allid2word = corpora.Dictionary(allbigramlist)
# Create Corpus
alltexts = allbigramlist
# Term Document Frequency
allcorpus = [allid2word.doc2bow(text) for text in alltexts]

# number of topics
num_topics = 4
# Build LDA model (random_state seeds the initialisation so reruns are comparable)
alllda_model = gensim.models.LdaMulticore(corpus=allcorpus,
                                       id2word=allid2word,
                                       num_topics=num_topics,
                                       random_state=42)
# Print the top keywords of every topic
pprint(alllda_model.print_topics())
alldoc_lda = alllda_model[allcorpus]

pyLDAvis.enable_notebook()
jpvis = gensimvis.prepare(alllda_model, allcorpus, allid2word)
jpvis

[(0,
  '0.028*"small_world" + 0.022*"mystic_manor" + 0.015*"lion_king" + '
  '0.013*"wait_times" + 0.012*"iron_man" + 0.011*"grizzly_gulch" + '
  '0.009*"young_children" + 0.009*"star_wars" + 0.009*"mickey_mouse" + '
  '0.008*"toy_story"'),
 (1,
  '0.027*"toy_story" + 0.013*"long_queue" + 0.010*"wait_times" + '
  '0.009*"really_enjoyed" + 0.008*"indiana_jones" + 0.008*"night_parade" + '
  '0.007*"star_wars" + 0.007*"adventure_land" + 0.007*"haunted_mansion" + '
  '0.006*"buy_tickets"'),
 (2,
  '0.018*"small_world" + 0.016*"two_days" + 0.013*"many_people" + '
  '0.013*"mickey_minnie" + 0.013*"popular_rides" + 0.012*"must_see" + '
  '0.011*"much_smaller" + 0.010*"haunted_mansion" + 0.010*"lion_king" + '
  '0.008*"magic_kingdom"'),
 (3,
  '0.030*"lion_king" + 0.015*"night_parade" + 0.014*"star_wars" + '
  '0.013*"roller_coaster" + 0.011*"mickey_mouse" + 0.011*"long_lines" + '
  '0.010*"grizzly_gulch" + 0.010*"toy_story" + 0.009*"long_queues" + '
  '0.007*"young_kids"')]


  default_term_info = default_term_info.sort_values(


In [228]:
# Topic model over the bigram phrases found in the text (Review) of negative reviews.
alldata = df_all_neg.Review.values.tolist()
alldata_words = list(sent_to_words(alldata))

# Phrase detector fitted on the tokenised texts; a higher threshold yields fewer phrases.
allbigram = gensim.models.Phrases(alldata_words, min_count=20, threshold=300)
allbigram_mod = gensim.models.phrases.Phraser(allbigram)


def remove_stopwords(texts):
    """Re-tokenise every document in ``texts`` and drop tokens found in ``stop_words``."""
    # Build the lookup once: set membership instead of scanning the list per token.
    blocked = set(stop_words)
    return [[word for word in simple_preprocess(str(doc))
             if word not in blocked] for doc in texts]


def jpmake_bigrams(texts):
    """Apply the current ``allbigram_mod`` phraser to each tokenised document."""
    return [allbigram_mod[doc] for doc in texts]


# NOTE: the phraser is fitted on the unfiltered tokens but applied after
# stop-word removal, so words separated only by stop words are neighbours here.
alldata_words_nostops = remove_stopwords(alldata_words)
alldata_words_bigrams = jpmake_bigrams(alldata_words_nostops)

# Keep only merged phrases (tokens containing "_"); documents without any are
# skipped. No stop-word test is needed at this point: stop words were already
# removed above, and looking the list up per token only re-read it needlessly.
allbigramlist = []
for doc_tokens in alldata_words_bigrams:
    phrases = [token for token in doc_tokens if "_" in token]
    if phrases:
        allbigramlist.append(phrases)

# Create Dictionary
allid2word = corpora.Dictionary(allbigramlist)
# Create Corpus
alltexts = allbigramlist
# Term Document Frequency
allcorpus = [allid2word.doc2bow(text) for text in alltexts]

# number of topics
num_topics = 3
# Build LDA model (random_state seeds the initialisation so reruns are comparable)
alllda_model = gensim.models.LdaMulticore(corpus=allcorpus,
                                       id2word=allid2word,
                                       num_topics=num_topics,
                                       random_state=42)
# Print the top keywords of every topic
pprint(alllda_model.print_topics())
alldoc_lda = alllda_model[allcorpus]

pyLDAvis.enable_notebook()
jpvis = gensimvis.prepare(alllda_model, allcorpus, allid2word)
jpvis

[(0,
  '0.166*"lion_king" + 0.147*"indiana_jones" + 0.144*"haunted_mansion" + '
  '0.058*"roller_coaster" + 0.057*"peter_pan" + 0.056*"jungle_cruise" + '
  '0.051*"mystic_manor" + 0.051*"iron_man" + 0.047*"star_wars" + '
  '0.045*"spring_break"'),
 (1,
  '0.245*"star_wars" + 0.195*"toy_story" + 0.066*"lion_king" + '
  '0.052*"river_cruise" + 0.051*"single_rider" + 0.044*"haunted_mansion" + '
  '0.042*"mystic_manor" + 0.040*"star_tours" + 0.038*"sleeping_beauty" + '
  '0.032*"bucket_list"'),
 (2,
  '0.158*"roller_coaster" + 0.111*"buzz_lightyear" + 0.105*"grizzly_gulch" + '
  '0.102*"toy_story" + 0.066*"selfie_stick" + 0.065*"sunny_bay" + '
  '0.050*"roller_coasters" + 0.048*"jungle_cruise" + 0.044*"lion_king" + '
  '0.039*"mystic_manor"')]


  default_term_info = default_term_info.sort_values(


In [186]:
# Topic model over the bigram phrases found in negative review texts where name == "cali".
alldata = df_all_neg[df_all_neg.name=="cali"].Review.values.tolist()
alldata_words = list(sent_to_words(alldata))

# Phrase detector fitted on the tokenised texts; a higher threshold yields fewer phrases.
allbigram = gensim.models.Phrases(alldata_words, min_count=10, threshold=100)
allbigram_mod = gensim.models.phrases.Phraser(allbigram)


def remove_stopwords(texts):
    """Re-tokenise every document in ``texts`` and drop tokens found in ``stop_words``."""
    # Build the lookup once: set membership instead of scanning the list per token.
    blocked = set(stop_words)
    return [[word for word in simple_preprocess(str(doc))
             if word not in blocked] for doc in texts]


def jpmake_bigrams(texts):
    """Apply the current ``allbigram_mod`` phraser to each tokenised document."""
    return [allbigram_mod[doc] for doc in texts]


# NOTE: the phraser is fitted on the unfiltered tokens but applied after
# stop-word removal, so words separated only by stop words are neighbours here.
alldata_words_nostops = remove_stopwords(alldata_words)
alldata_words_bigrams = jpmake_bigrams(alldata_words_nostops)

# Keep only merged phrases (tokens containing "_"); documents without any are
# skipped. No stop-word test is needed at this point: stop words were already
# removed above, and looking the list up per token only re-read it needlessly.
allbigramlist = []
for doc_tokens in alldata_words_bigrams:
    phrases = [token for token in doc_tokens if "_" in token]
    if phrases:
        allbigramlist.append(phrases)

# Create Dictionary
allid2word = corpora.Dictionary(allbigramlist)
# Create Corpus
alltexts = allbigramlist
# Term Document Frequency
allcorpus = [allid2word.doc2bow(text) for text in alltexts]

# number of topics
num_topics = 4
# Build LDA model (random_state seeds the initialisation so reruns are comparable)
alllda_model = gensim.models.LdaMulticore(corpus=allcorpus,
                                       id2word=allid2word,
                                       num_topics=num_topics,
                                       random_state=42)
# Print the top keywords of every topic
pprint(alllda_model.print_topics())
alldoc_lda = alllda_model[allcorpus]

pyLDAvis.enable_notebook()
jpvis = gensimvis.prepare(alllda_model, allcorpus, allid2word)
jpvis

[(0,
  '0.367*"star_wars" + 0.085*"parking_garage" + 0.072*"six_flags" + '
  '0.068*"peter_pan" + 0.065*"anytime_soon" + 0.061*"galaxy_edge" + '
  '0.045*"haunted_mansion" + 0.041*"mid_week" + 0.023*"magic_kingdom" + '
  '0.019*"customer_service"'),
 (1,
  '0.280*"indiana_jones" + 0.092*"haunted_mansion" + 0.073*"magic_kingdom" + '
  '0.060*"city_hall" + 0.056*"peter_pan" + 0.046*"star_tours" + '
  '0.043*"haunted_house" + 0.039*"crowd_control" + 0.033*"spring_break" + '
  '0.033*"star_wars"'),
 (2,
  '0.158*"haunted_mansion" + 0.113*"jungle_cruise" + 0.087*"berry_farm" + '
  '0.084*"blue_bayou" + 0.083*"guest_services" + 0.076*"customer_service" + '
  '0.047*"minnie_mouse" + 0.039*"galaxy_edge" + 0.039*"mid_week" + '
  '0.033*"crowd_control"'),
 (3,
  '0.222*"customer_service" + 0.179*"star_wars" + 0.087*"spring_break" + '
  '0.057*"single_rider" + 0.056*"bucket_list" + 0.050*"selfie_stick" + '
  '0.046*"indiana_jones" + 0.041*"roller_coaster" + 0.040*"haunted_mansion" + '
  '0.034*"l

  default_term_info = default_term_info.sort_values(


In [189]:
# Topic model over the bigram phrases found in negative review texts where name == "japan".
alldata = df_all_neg[df_all_neg.name=="japan"].Review.values.tolist()
alldata_words = list(sent_to_words(alldata))

# Phrase detector fitted on the tokenised texts; a higher threshold yields fewer phrases.
allbigram = gensim.models.Phrases(alldata_words, min_count=4, threshold=100)
allbigram_mod = gensim.models.phrases.Phraser(allbigram)


def remove_stopwords(texts):
    """Re-tokenise every document in ``texts`` and drop tokens found in ``stop_words``."""
    # Build the lookup once: set membership instead of scanning the list per token.
    blocked = set(stop_words)
    return [[word for word in simple_preprocess(str(doc))
             if word not in blocked] for doc in texts]


def jpmake_bigrams(texts):
    """Apply the current ``allbigram_mod`` phraser to each tokenised document."""
    return [allbigram_mod[doc] for doc in texts]


# NOTE: the phraser is fitted on the unfiltered tokens but applied after
# stop-word removal, so words separated only by stop words are neighbours here.
alldata_words_nostops = remove_stopwords(alldata_words)
alldata_words_bigrams = jpmake_bigrams(alldata_words_nostops)

# Keep only merged phrases (tokens containing "_"); documents without any are
# skipped. No stop-word test is needed at this point: stop words were already
# removed above, and looking the list up per token only re-read it needlessly.
allbigramlist = []
for doc_tokens in alldata_words_bigrams:
    phrases = [token for token in doc_tokens if "_" in token]
    if phrases:
        allbigramlist.append(phrases)

# Create Dictionary
allid2word = corpora.Dictionary(allbigramlist)
# Create Corpus
alltexts = allbigramlist
# Term Document Frequency
allcorpus = [allid2word.doc2bow(text) for text in alltexts]

# number of topics
num_topics = 4
# Build LDA model (random_state seeds the initialisation so reruns are comparable)
alllda_model = gensim.models.LdaMulticore(corpus=allcorpus,
                                       id2word=allid2word,
                                       num_topics=num_topics,
                                       random_state=42)
# Print the top keywords of every topic
pprint(alllda_model.print_topics())
alldoc_lda = alllda_model[allcorpus]

pyLDAvis.enable_notebook()
jpvis = gensimvis.prepare(alllda_model, allcorpus, allid2word)
jpvis

[(0,
  '0.123*"speak_english" + 0.120*"monsters_inc" + 0.060*"next_door" + '
  '0.055*"big_thunder" + 0.047*"small_world" + 0.046*"customer_service" + '
  '0.045*"cinderella_castle" + 0.040*"school_holidays" + 0.038*"mid_week" + '
  '0.033*"haunted_mansion"'),
 (1,
  '0.114*"small_world" + 0.110*"haunted_mansion" + 0.091*"big_thunder" + '
  '0.067*"star_tours" + 0.052*"buzz_lightyear" + 0.041*"trip_advisor" + '
  '0.040*"train_station" + 0.040*"light_show" + 0.038*"haunted_house" + '
  '0.034*"looking_forward"'),
 (2,
  '0.108*"school_holidays" + 0.092*"young_children" + 0.060*"plus_side" + '
  '0.053*"pop_corn" + 0.052*"roller_coaster" + 0.051*"gift_shops" + '
  '0.051*"foreign_tourists" + 0.042*"bucket_list" + 0.041*"looking_forward" + '
  '0.038*"snow_white"'),
 (3,
  '0.156*"walking_around" + 0.103*"peak_season" + 0.097*"star_wars" + '
  '0.085*"mickey_mouse" + 0.071*"jungle_cruise" + 0.050*"single_rider" + '
  '0.047*"never_seen" + 0.037*"small_world" + 0.037*"star_tours" + '
  '0

  default_term_info = default_term_info.sort_values(


In [215]:
# Topic model over the bigram phrases found in negative review texts where name == "hk".
alldata = df_all_neg[df_all_neg.name=="hk"].Review.values.tolist()
alldata_words = list(sent_to_words(alldata))

# Phrase detector fitted on the tokenised texts; a higher threshold yields fewer phrases.
allbigram = gensim.models.Phrases(alldata_words, min_count=15, threshold=200)
allbigram_mod = gensim.models.phrases.Phraser(allbigram)


def remove_stopwords(texts):
    """Re-tokenise every document in ``texts`` and drop tokens found in ``stop_words``."""
    # Build the lookup once: set membership instead of scanning the list per token.
    blocked = set(stop_words)
    return [[word for word in simple_preprocess(str(doc))
             if word not in blocked] for doc in texts]


def jpmake_bigrams(texts):
    """Apply the current ``allbigram_mod`` phraser to each tokenised document."""
    return [allbigram_mod[doc] for doc in texts]


# NOTE: the phraser is fitted on the unfiltered tokens but applied after
# stop-word removal, so words separated only by stop words are neighbours here.
alldata_words_nostops = remove_stopwords(alldata_words)
alldata_words_bigrams = jpmake_bigrams(alldata_words_nostops)

# Keep only merged phrases (tokens containing "_"); documents without any are
# skipped. No stop-word test is needed at this point: stop words were already
# removed above, and looking the list up per token only re-read it needlessly.
allbigramlist = []
for doc_tokens in alldata_words_bigrams:
    phrases = [token for token in doc_tokens if "_" in token]
    if phrases:
        allbigramlist.append(phrases)

# Create Dictionary
allid2word = corpora.Dictionary(allbigramlist)
# Create Corpus
alltexts = allbigramlist
# Term Document Frequency
allcorpus = [allid2word.doc2bow(text) for text in alltexts]

# number of topics
num_topics = 4
# Build LDA model (random_state seeds the initialisation so reruns are comparable)
alllda_model = gensim.models.LdaMulticore(corpus=allcorpus,
                                       id2word=allid2word,
                                       num_topics=num_topics,
                                       random_state=42)
# Print the top keywords of every topic
pprint(alllda_model.print_topics())
alldoc_lda = alllda_model[allcorpus]

pyLDAvis.enable_notebook()
jpvis = gensimvis.prepare(alllda_model, allcorpus, allid2word)
jpvis

[(0,
  '0.330*"lion_king" + 0.162*"roller_coaster" + 0.099*"jungle_river" + '
  '0.089*"buzz_lightyear" + 0.083*"mystic_manor" + 0.069*"iron_man" + '
  '0.047*"grizzly_gulch" + 0.040*"toy_story" + 0.031*"runaway_mine" + '
  '0.018*"star_wars"'),
 (1,
  '0.377*"toy_story" + 0.144*"lion_king" + 0.099*"mystic_manor" + '
  '0.063*"sleeping_beauty" + 0.057*"star_wars" + 0.037*"iron_man" + '
  '0.035*"jungle_river" + 0.035*"runaway_mine" + 0.034*"roller_coasters" + '
  '0.026*"roller_coaster"'),
 (2,
  '0.288*"lion_king" + 0.201*"iron_man" + 0.149*"buzz_lightyear" + '
  '0.070*"grizzly_gulch" + 0.068*"haunted_mansion" + 0.054*"roller_coasters" + '
  '0.047*"star_wars" + 0.032*"mystic_manor" + 0.028*"runaway_mine" + '
  '0.019*"roller_coaster"'),
 (3,
  '0.240*"roller_coaster" + 0.173*"grizzly_gulch" + 0.163*"sunny_bay" + '
  '0.143*"mystic_manor" + 0.066*"lion_king" + 0.064*"haunted_mansion" + '
  '0.035*"jungle_river" + 0.028*"roller_coasters" + 0.027*"toy_story" + '
  '0.014*"buzz_lightyea

  default_term_info = default_term_info.sort_values(
