### Import modules

In [1]:
from utils import * 

import numpy as np
import pandas as pd
from pprint import pprint
import os
import matplotlib.pyplot as plt
from collections import defaultdict

# Gensim
from gensim.test.utils import datapath
from gensim.test.utils import common_texts, get_tmpfile

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.option_context('display.max_colwidth', 500);

In [2]:
# Import dataset
df = pd.read_pickle('raw_data/netflix.pkl')
reviews = df.review

NLPpipeline

In [3]:
term_doc = pd.read_pickle('preprocessed_data/term_doc.pkl')
data_lemmatized = pd.read_pickle('preprocessed_data/data_lemmatized.pkl')
dictionary = pd.read_pickle('preprocessed_data/dictionary.pkl')
tf_idf = pd.read_pickle('preprocessed_data/tf_idf.pkl')

<string>Code for piepelining</strong>
<code style="font-size: 10px; background-color:transparent;">
nlp_pipe = NLPpipe()
term_doc = nlp_pipe.fit_transform(reviews, min_count = 3, threshold = -0.5)
tf_idf = nlp_pipe.transform(reviews, tf_idf = True)
data_lemmatized = nlp_pipe.clean_text
dictionary = create_dictionary(data_lemmatized)
</code>

### (Standard) LDA model after tuning

In [4]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=term_doc,
                                           id2word=dictionary,
                                           num_topics= 6, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=2000,
                                           passes=10,
                                           alpha= 1.5,
                                           per_word_topics=True)

#### Visualize Standard LDA model

In [5]:
vis_data = pyLDAvis.gensim.prepare(lda_model, term_doc, dictionary, sort_topics=False)
pyLDAvis.save_html(vis_data, 'std_lda_vis/std_lda_topics=6&a=1.5&batchsize=1.html')

*With chunksize = 100 after tuning, although the coherence scores are higher than stochastic one('update_every=1'), the topics are not much distinguishable and uninterpretable.*

In [6]:
lda_model_100 = gensim.models.ldamodel.LdaModel(corpus=term_doc,
                                           id2word=dictionary,
                                           num_topics= 15, 
                                           random_state=100,
                                           update_every=100,
                                           chunksize=2000,
                                           passes=10,
                                           alpha= 1.5,
                                           per_word_topics=True)

vis_data = pyLDAvis.gensim.prepare(lda_model_100, term_doc, dictionary, sort_topics=False)
pyLDAvis.save_html(vis_data, 'std_lda_vis/std_lda_topics=15&a=1.5&batchsize=100.html')

In [7]:
coherence_model = CoherenceModel(model=lda_model_100, texts=data_lemmatized, dictionary=dictionary, coherence='c_npmi')
np.mean(coherence_model.get_coherence_per_topic())

-0.07157395340265418

## <mark>Mallet's LDA model after tuning &#8592; Best Model
*The difference between Mallet and Gensim’s standard LDA is that Gensim uses a Variational Bayes sampling method which is faster but less precise that Mallet’s Gibbs Sampling.*  [link](https://towardsdatascience.com/basic-nlp-on-the-texts-of-harry-potter-topic-modeling-with-latent-dirichlet-allocation-f3c00f77b0f5)

In [8]:
# ldamallet.save(datapath("model"))
ldamallet = gensim.models.wrappers.LdaMallet.load(datapath("model"))

In [9]:
mallet_path = '../mallet-2.0.8/bin/mallet' # update this path
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, 
                                             corpus = term_doc,
                                             num_topics = 6, 
                                             random_seed = 100,
                                             id2word = dictionary,
                                             alpha = 1.5)

#### Visualize Mallet's LDA model

In [10]:
# Visualize the topics
model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(ldamallet)
pyLDAvis.enable_notebook()

In [11]:
vis = pyLDAvis.gensim.prepare(model, term_doc, dictionary)
pyLDAvis.save_html(vis, 'mallet_lda_vis/mallet_lda_topics=6&a=1.5.html')

  kernel = (topic_given_term * np.log((topic_given_term.T / topic_proportion).T))
  log_lift = np.log(topic_term_dists / term_proportion)
  log_ttd = np.log(topic_term_dists)


In [12]:
coherence_model_m = CoherenceModel(model=model, texts=data_lemmatized, dictionary=dictionary, coherence='c_npmi')
coherence_model_m.get_coherence_per_topic()

[-0.059887677995452145,
 -0.08540541347219917,
 -0.026354790404547596,
 -0.04867638381820212,
 0.002685968866744477,
 -0.08213902199226049]

In [13]:
# model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(ldamallet)
# model.top_topics(corpus = term_doc,topn=10)

#### For each topic, we could look at frequent and relevant words

In [14]:
frequencies = ldamallet.word_topics.sum(axis=0)
p_word = frequencies / ldamallet.word_topics.sum()
p_word_given_topic = ldamallet.word_topics / np.sum(ldamallet.word_topics, axis = 0)

lamda = 0.5
relevance = lamda * p_word_given_topic + (1-lamda) * p_word_given_topic / p_word 

  This is separate from the ipykernel package so we can avoid doing imports until


In [15]:
topic_dict = {0.: "Platform/Device", 1.: "User Experience", 2.: "Value", 3.: "Service", 4.: "Trouble-shooting", 5.:"Shows"}

for topic_id in range(6):
    words = []
    for id in np.argsort(relevance[topic_id,])[::-1][:15]:
        words.append(dictionary[id])
    print(f"Topic: {topic_dict[topic_id]}")
    print(words)
    print('\n')

Topic: Platform/Device
['take', 'none', 'become', 'one', 'please', 'allow', 'welcome', 'come', 'mean', 'anywhere', 'believe', 'com', 'th', 'try', 'out']


Topic: User Experience
['ok', 'everyone', 'one', 'seeing', 'que', 'believe', 'together', 'try', 'second', 'sorry', 'down', 'become', 'course', 'considering', 'trying']


Topic: Value
['sorry', 'value', 'wonder', 'last', 'etc', 'self', 'course', 'novel', 'over', 'nothing', 'together', 'try', 'd', 'help', 'like']


Topic: Service
['need', 'value', 'believe', 's', 'way', 'seeing', 'thank', 'tell', 'something', 'might', 'out', 'enough', 'other', 'whole', 'want']


Topic: Trouble-shooting
['way', 'together', 'd', 'enough', 'welcome', 'everyone', 'novel', 'etc', 'one', 'anywhere', 'overall', 'might', 'try', 'th', 'wonder']


Topic: Shows
['like', 'allow', 'need', 'd', 'up', 'enough', 'name', 'look', 'over', 'want', 'overall', 'might', 'please', 'thank', 'value']




## Interpret the topic model
1. Finding the dominant topic in each document
2. Find the most representative document for each topic
3. Topic distribution across documents
**The code used here for interpretation of the model are based on this website with a little modification by the user myself: <br>
https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#11createthedictionaryandcorpusneededfortopicmodeling**

### Standard LDA - Interpret the model

In [16]:
df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=term_doc, texts=data_lemmatized, df=df)

#### 1. Standard LDA - Finding the dominant topic in each document

In [17]:
df_dominant_topic = find_dominant_topic_in_each_doc(df_topic_sents_keywords, df=df)
print("Finding the dominant topic in each document")
df_dominant_topic.head(5).style.set_properties(subset=['review'], **{'width': '600px'})

Finding the dominant topic in each document


Unnamed: 0,Dominant_Topic,Perc_Contribution,Keywords,Text,star_rating,helpful_votes,total_votes,review
0,0.0,0.1667,"month, year, price, content, option, entertainment, money, list, title, quality",[],5,0,0,Five Stars netflix is great!
1,4.0,0.2916,"time, fire, program, issue, way, episode, computer, thing, film, subscription","['buck', 'show', 'buck']",5,0,0,GREAT Why pay per view like the others? 8 bucks for all you want to watch. Want to watch every episode of a show? 8 bucks.
2,5.0,0.2744,"tv, service, phone, kindle, cable, thank, family, streaming, documentary, cost","['navigation', 'movies_show', 'hitch', 'device']",3,0,0,"Satisfactory App runs fine on kindle fire HD. Navigation thru app efficient, movies/shows play without hitch, sound is great. Video quality needs work, expected better from an HD device."
3,0.0,0.1667,"month, year, price, content, option, entertainment, money, list, title, quality",[],5,0,0,Five Stars The best
4,5.0,0.2493,"tv, service, phone, kindle, cable, thank, family, streaming, documentary, cost",['original_series'],5,0,0,Excellent! Love netflix! Worked great on my Kindle. Watched one of the original series that I've been wanting to watch for awhile.


#### 2. Standard LDA - Find the most representative document for each topic

In [18]:
print("Find the most representative document for each topic")
sent_topics_sorteddf = find_most_representative_doc_for_each_doc(df_topic_sents_keywords,df=df)
sent_topics_sorteddf.style.set_properties(subset=['review'], **{'width': '1000px'})

Find the most representative document for each topic


Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,star_rating,helpful_votes,total_votes,review
0,0.0,0.4947,"month, year, price, content, option, entertainment, money, list, title, quality",2,0,0,"Bring back My List! The latest version of Netflix is a disaster on Kindle! Why? They got rid of My List! What is the use in adding a movie or show to My List if you can't access the list with the app? If I want to watch a show I have been streaming, I have to search for it now. Netflix, bring back My List! I HATE this app without it!"
1,1.0,0.4669,"love, app, day, tablet, account, people, access, version, friend, commercial",3,0,0,"Such Profile. Does what you expect it to do, except there are no profiles! Where are my profiles! Needs more profiles! Wow, such profile, many profile, amaze profile!!"
2,2.0,0.419,"problem, video, work, device, selection, update, screen, series, review, picture",5,0,0,movies @ home when work is done It is ready to watch when I am and it is too late to go out. I do wish they had newer releases.
3,3.0,0.469,"movie, show, use, kid, watch, watch_movie, tv_show, shows_movie, choice, product",4,0,0,"I enjoy the access to movies and the European television series I enjoy the access to movies and the European television series, the BBC in particular. The only shortcoming is there are one or two seasons available and I would like to see more."
4,4.0,0.5238,"time, fire, program, issue, way, episode, computer, thing, film, subscription",5,0,0,"Search for Movies! I can use the search function! I have Netflix on my Blu-ray player, and I can't search movies on it. So with its search function, I'm super glad. Though, I was not too happy I had to subscribe to Amazon prime in order to use Netflix on the fireTV stick..."
5,5.0,0.4663,"tv, service, phone, kindle, cable, thank, family, streaming, documentary, cost",3,2,3,"Netflix Review I love having access to Netflix. I love the movies on Netflix. I just don't like how long it takes Netflix to get a new movie. I feel like they are the bottom of the totem pole and the last resort for a new release. Netflix is a wonderful tool to have during this time of recession. I know that it makes me look good to my grandchildren, cause I can give them decent viewing options and not have to worry as to what they are watching. Thank you Netflix, for sharing with my family."


#### 3. Standard LDA - Topic distribution across documents

In [19]:
# Show
print("Topic distribution across documents")
df_dominant_topic = topic_distribution_across_docs(df_topic_sents_keywords)
df_dominant_topic.style.set_properties(subset=['Keywords'], **{'width': '400px'})

Topic distribution across documents


Unnamed: 0,Dominant_Topic,Keywords,Num_Documents,Perc_Documents
0,0.0,"month, year, price, content, option, entertainment, money, list, title, quality",4192,0.3336
1,1.0,"love, app, day, tablet, account, people, access, version, friend, commercial",2006,0.1596
2,2.0,"problem, video, work, device, selection, update, screen, series, review, picture",1678,0.1335
3,3.0,"movie, show, use, kid, watch, watch_movie, tv_show, shows_movie, choice, product",1664,0.1324
4,4.0,"time, fire, program, issue, way, episode, computer, thing, film, subscription",1627,0.1295
5,5.0,"tv, service, phone, kindle, cable, thank, family, streaming, documentary, cost",1399,0.1113


In [20]:
coherence_model = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=dictionary, coherence='c_npmi')
coherence_model.get_coherence_per_topic()

[-0.49724770057910794,
 -0.48030058871786624,
 -0.4823159450100309,
 -0.49933323555683556,
 -0.5012508814572622,
 -0.473447859465523]

### <mark>Mallet Model - Interpret the model</mark>

In [21]:
ldamallet = gensim.models.wrappers.LdaMallet.load(datapath('model'))
model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(ldamallet)

In [22]:
df_topic_sents_keywords_m = format_topics_sentences(model, term_doc, data_lemmatized, df)

Getting main topic for document...
0  1000  2000  3000  4000  5000  6000  7000  8000  9000  10000  11000  12000  

In [23]:
all_topics = model.get_document_topics(bow = term_doc)

In [24]:
df_topic_sents_keywords_m

Unnamed: 0,Dominant_Topic,Perc_Contribution,Keywords,0
0,0.0,0.1667,"time, work, app, fire, love, phone, movie, tab...",[]
1,2.0,0.7205,"movie, program, cable, tv, show, selection, lo...","[buck, show, buck]"
2,4.0,0.3689,"app, video, problem, issue, update, time, devi...","[navigation, movies_show, hitch, device]"
3,0.0,0.1667,"time, work, app, fire, love, phone, movie, tab...",[]
4,5.0,0.5000,"love, movie, tv_show, season, lot, watch, tv, ...",[original_series]
...,...,...,...,...
12561,4.0,0.6429,"app, video, problem, issue, update, time, devi...","[zone, edge]"
12562,0.0,0.4957,"time, work, app, fire, love, phone, movie, tab...",[work]
12563,3.0,0.4965,"service, streaming, time, product, year, price...",[service]
12564,0.0,0.1667,"time, work, app, fire, love, phone, movie, tab...",[]


#### 1. Mallet LDA - Finding the dominant topic in each document

In [25]:
df_dominant_topic_m = find_dominant_topic_in_each_doc(df_topic_sents_keywords_m, df)
print("Finding the dominant topic in each document")
df_dominant_topic_m.head(5).style.set_properties(subset=['review'], **{'width': '400px'})

Finding the dominant topic in each document


Unnamed: 0,Dominant_Topic,Perc_Contribution,Keywords,Text,star_rating,helpful_votes,total_votes,review
0,0.0,0.1667,"time, work, app, fire, love, phone, movie, tablet, download, kindle",[],5,0,0,Five Stars netflix is great!
1,2.0,0.7205,"movie, program, cable, tv, show, selection, love, service, year, choice","['buck', 'show', 'buck']",5,0,0,GREAT Why pay per view like the others? 8 bucks for all you want to watch. Want to watch every episode of a show? 8 bucks.
2,4.0,0.3689,"app, video, problem, issue, update, time, device, fire, work, fix","['navigation', 'movies_show', 'hitch', 'device']",3,0,0,"Satisfactory App runs fine on kindle fire HD. Navigation thru app efficient, movies/shows play without hitch, sound is great. Video quality needs work, expected better from an HD device."
3,0.0,0.1667,"time, work, app, fire, love, phone, movie, tablet, download, kindle",[],5,0,0,Five Stars The best
4,5.0,0.5,"love, movie, tv_show, season, lot, watch, tv, watch_movie, love_great, month",['original_series'],5,0,0,Excellent! Love netflix! Worked great on my Kindle. Watched one of the original series that I've been wanting to watch for awhile.


#### 2. Mallet LDA - Find the most representative document for each topic

In [26]:
print("Find the most representative document for each topic")
sent_topics_sorteddf_m = find_most_representative_doc_for_each_doc(df_topic_sents_keywords_m, df)
sent_topics_sorteddf_m.style.set_properties(subset=['review'], **{'width': '400px'})

Find the most representative document for each topic


Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,star_rating,helpful_votes,total_votes,review
0,0.0,0.8936,"time, work, app, fire, love, phone, movie, tablet, download, kindle",4,2,3,"very good I like this app overall but I don't like that their can only be two people on it at a time. It would be better if you could use it without Internet. Sometimes it takes a long to to load or buffer an it makes you shows blurry. There is not as much movies and shows that some other thing like netflix have. Netflix is very good app for sitting down with you family and watching a movie or being alone and watching a show or movie on the couch or even in your bed. Netflix has a lot of great movies and shows to watch. If you find one that you like but you don't want to watch it the you can add it to your list. Overall netflix is really great app and if  you like to watch movies then I would get this app for sure. I like the fact that it is free so for sure get it.I guess you can pay more money monthly for netflix to work on more than two devices at a time. Overall other than the long load times that we encounter once in a great while we will continue to be a netflix customer. We have been now for 4 years and it runs great on out kindles,laptops,and I pods. You can also use it where they have free WiFi like mcdonalds."
1,1.0,0.9232,"movie, kid, profile, account, selection, family, child, shows_movie, site, watch",4,10,12,"NETFLIX is great! I really enjoy using NETFLIX. It is user-friendly, very convenient and inexpensive. The one thing I would change about NETFLIX is the content availability, for example, my children, ages 7 and 8 love to watch movies on NETFLIX and they do have a great selection of children/family content, but at the same time, my kids have access to R-rated, gay-lesbian, horror, etc. I wish they had a setting which I could prevent them from viewing those genres. For now, I just closely monitor them, as any good parent should do anyway!"
2,2.0,0.9235,"movie, program, cable, tv, show, selection, love, service, year, choice",4,1,2,"It's worth the monthly fee! Considering the cost of a movie theater ticket, what a bargain! I can enjoy a movie without a considerable amount of commercials, if I am interruped, I can pause the movie and if I am hard of hearing, most movies have closed caption! I can even make my own popcorn at home! I can also just stop the movie if it think it's something not of my liking, and start another!! I don't have to drive to and from the theater, no long lines, etc. and if you happen to own a large screen .......... what more could you ask for????????"
3,3.0,0.9433,"service, streaming, time, product, year, price, customer, movie, film, option",1,21,29,"Netflix and the Arrogant App The arrogance of this application is amazing. I am sure some ""team"" thought they new enough, did enough market research, etc. to believe the credits to a film don't matter. The ""team"" also apparently believes giving people fewer options is better because it's all just too advanced for us sheep to understand. I now have to scroll more, hit the back button more, use the navigation ring on Fire TV remote more, and I don't enjoy Netflix as much as I used to. Eventually there will be increasing options outside of Netflix and that will be a good thing. I'm thinking maybe I should try Amazon Prime. Although Amazon is getting arrogant too, at least the Prime service lets the movie play to the end without shrinking the credits into oblivion. I'll decide when to stop watching a movie Netflix, you don't need to prompt me."
4,4.0,0.9348,"app, video, problem, issue, update, time, device, fire, work, fix",3,0,0,"good, browsing ui can be a bit too sluggish Streaming works fairly well. I've noticed that the browsing UI can be a bit sluggish. My guess is that some tasks are being handled in the UI thread, which should actually be delegated to a background thread (I'm a professional developer). This issue should be a high priority fix -- it really detracts from the experience of browsing the available movies and shows, when the Netfix app takes seconds to respond to a drag request. Also, this should have been caught during QA -- it's pretty basic and important stuff.."
5,5.0,0.8667,"love, movie, tv_show, season, lot, watch, tv, watch_movie, love_great, month",5,0,0,Who really doesn't love netflix. It is everything in one place Who really doesn't love netflix. It is everything in one place. I mean they could have a few more things but if not there is project TV or hulu.


#### 3. Mallet LDA - Topic distribution across documents

In [27]:
# Show
print("Topic distribution across documents")
df_dominant_topic_m = topic_distribution_across_docs(df_topic_sents_keywords_m)
df_dominant_topic_m.style.set_properties(subset=['Keywords'], **{'width': '400px'})

Topic distribution across documents


Unnamed: 0,Dominant_Topic,Keywords,Num_Documents,Perc_Documents
0,0.0,"time, work, app, fire, love, phone, movie, tablet, download, kindle",4030,0.3207
1,1.0,"movie, kid, profile, account, selection, family, child, shows_movie, site, watch",1983,0.1578
2,2.0,"movie, program, cable, tv, show, selection, love, service, year, choice",1820,0.1448
3,3.0,"service, streaming, time, product, year, price, customer, movie, film, option",1631,0.1298
4,4.0,"app, video, problem, issue, update, time, device, fire, work, fix",1617,0.1287
5,5.0,"love, movie, tv_show, season, lot, watch, tv, watch_movie, love_great, month",1485,0.1182


In [28]:
coherence_model = CoherenceModel(model=ldamallet, texts=data_lemmatized, dictionary=dictionary, coherence='c_npmi')
coherence_model.get_coherence_per_topic()

[-0.059887677995452145,
 -0.08540541347219917,
 -0.026354790404547596,
 -0.04867638381820212,
 0.002685968866744477,
 -0.08213902199226049]

## Try the model with TF-IDF dataset

### Standard LDA with TF-IDF

In [29]:
lda_model_tfidf = gensim.models.ldamodel.LdaModel(corpus=tf_idf,
                                           num_topics= 6, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=2000,
                                           passes=10,
                                           alpha=1.5,
                                           per_word_topics=True)

In [30]:
vis_data = pyLDAvis.gensim.prepare(lda_model_tfidf, tf_idf, dictionary, sort_topics=False)
pyLDAvis.save_html(vis_data, 'std_lda_vis/std_lda_vis_tfidf_num_topics=6&alpha=1.5.html')

In [31]:
coherence_model_m = CoherenceModel(model=lda_model_tfidf, texts=data_lemmatized, dictionary=dictionary, coherence='c_npmi')
coherence_model_m.get_coherence_per_topic()

[-0.12354832277346144,
 -0.27506340107114513,
 -0.21936770506896217,
 -0.15886486640839953,
 -0.315369990391699,
 -0.06717851894875705]

In [32]:
df_topic_sents_keywords_tfidf = format_topics_sentences(ldamodel=lda_model_tfidf, corpus=term_doc, texts=data_lemmatized)

Getting main topic for document...
0  1000  2000  3000  4000  5000  6000  7000  8000  9000  10000  11000  12000  

#### 1. Standard LDA with TF-IDF - Finding the dominant topic in each document

In [33]:
df_dominant_topic_tfidf = find_dominant_topic_in_each_doc(df_topic_sents_keywords_tfidf)
print("Finding the dominant topic in each document")
df_dominant_topic_tfidf.head(5).style.set_properties(subset=['review'], **{'width': '400px'})

Finding the dominant topic in each document


Unnamed: 0,Dominant_Topic,Perc_Contribution,Keywords,Text,star_rating,helpful_votes,total_votes,review
0,0.0,0.2,"time, video, service, thing, price, product, kid, watch_movie, day, cable",[],5,0,0,Five Stars netflix is great!
1,1.0,0.6345,"selection, show, year, computer, use, watch, content, series, load, commercial","['buck', 'show', 'buck']",5,0,0,GREAT Why pay per view like the others? 8 bucks for all you want to watch. Want to watch every episode of a show? 8 bucks.
2,2.0,0.3834,"movie, love, work, fire, phone, month, episode, lot, picture, title","['navigation', 'movies_show', 'hitch', 'device']",3,0,0,"Satisfactory App runs fine on kindle fire HD. Navigation thru app efficient, movies/shows play without hitch, sound is great. Video quality needs work, expected better from an HD device."
3,0.0,0.2,"time, video, service, thing, price, product, kid, watch_movie, day, cable",[],5,0,0,Five Stars The best
4,4.0,0.4197,"issue, program, way, update, season, kindle, list, money, people, access",['original_series'],5,0,0,Excellent! Love netflix! Worked great on my Kindle. Watched one of the original series that I've been wanting to watch for awhile.


#### 2. Standard LDA with TF-IDF - Find the most representative document for each topic

In [34]:
print("Find the most representative document for each topic")
sent_topics_sorteddf_tfidf = find_most_representative_doc_for_each_doc(df_topic_sents_keywords_tfidf)
sent_topics_sorteddf_tfidf.style.set_properties(subset=['review'], **{'width': '400px'})

Find the most representative document for each topic


Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,star_rating,helpful_votes,total_votes,review
0,0.0,0.9103,"time, device, service, program, watch_movie, video, thing, shows_movie, thank, kid",2,0,0,"Works about half the time. If Im lucky, I can start netflix and everythings great! But, most of the time, I have to restart the program 2 or 3 times for it to load.Once its playing, dont u dare pause it and set it down! It WONT restart. And forget about choosing which tv episode u want to watch. When its running, I luv it and use it a lot! Best of luck!"
1,1.0,0.9035,"tv, selection, show, watch, use, year, computer, four_great, price, way",5,2,2,"NETFLIX , GREATEST THING SINCE SLICED BREAD Netflix is the greatest bargain around. I recently drove from LA to Palm Springs to see Robin Thicke in concert and found out he cancelled his concert. No notice, not fair. The night before I was mesmerized by a Pink concert on Netflix. Was the greatest thing I have watched on Netflix. I think Netflix is great. I hope they don't get wise and up there online rates."
2,2.0,0.9209,"love, movie, app, problem, work, fire, tablet, tv_show, phone, kindle",4,0,1,"Love Hate Relationship Netflix has the best intuitive consumer metrics for movie/tv series rentals--yet, it doesn't do any good when on-line streaming suffers glitches during the holiday season. Get a clue Amazon! If you're going to have a marriage with Netflix, you can at least provide reliable streaming service during 'peak' periods (Christmas holidays, etc.) Work it out!"


#### 3. Standard LDA with TF-IDF - Topic distribution across documents

In [35]:
# Show
print("Topic distribution across documents")
df_dominant_topic_tfidf = topic_distribution_across_docs(df_topic_sents_keywords_tfidf)
df_dominant_topic_tfidf.style.set_properties(subset=['Keywords'], **{'width': '400px'})

Topic distribution across documents


Unnamed: 0,Dominant_Topic,Keywords,Num_Documents,Perc_Documents
0,0.0,"time, device, service, program, watch_movie, video, thing, shows_movie, thank, kid",5551,0.4417
1,1.0,"tv, selection, show, watch, use, year, computer, four_great, price, way",3712,0.2954
2,2.0,"love, movie, app, problem, work, fire, tablet, tv_show, phone, kindle",3303,0.2629


In [36]:
coherence_model_tfidf = CoherenceModel(model=lda_model_tfidf, texts=data_lemmatized, dictionary=dictionary, coherence='c_npmi')

In [37]:
coherence_model_tfidf.get_coherence_per_topic()

[-0.13766191019561025, -0.17645669284948653, -0.11804508379767793]

### Mallet's LDA with TF-IDF

In [38]:
# Download File: http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
mallet_path = '../mallet-2.0.8/bin/mallet' # update this path
ldamallet_tfidf = gensim.models.wrappers.LdaMallet(mallet_path, 
                                                   corpus=tf_idf, 
                                                   num_topics=6, 
                                                   random_seed = 100,
                                                   id2word=dictionary,
                                                   alpha = 1.5)

In [39]:
# Visualize the topics
model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(ldamallet_tfidf)
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(model, tf_idf, dictionary)
pyLDAvis.save_html(vis, 'mallet_lda_vis/mallet_lda_vis_tfidf_num_topics=6&alpha=1.5.html')

In [40]:
coherence_model_m = CoherenceModel(model=model, texts=data_lemmatized, dictionary=dictionary, coherence='c_npmi')
coherence_model_m.get_coherence_per_topic()

In [41]:
df_topic_sents_keywords_tfidf_m = format_topics_sentences(ldamodel=ldamallet_tfidf, corpus=term_doc, texts=data_lemmatized)

Getting main topic for document...
0  1000  2000  3000  4000  5000  6000  7000  8000  9000  10000  11000  12000  

#### 1. Mallet's LDA with TF-IDF - Finding the dominant topic in each document

In [42]:
df_dominant_topic_tfidf_m = find_dominant_topic_in_each_doc(df_topic_sents_keywords_tfidf_m)
print("Finding the dominant topic in each document")
df_dominant_topic_tfidf_m.head(5).style.set_properties(subset=['review'], **{'width': '400px'})

Finding the dominant topic in each document


Unnamed: 0,Dominant_Topic,Perc_Contribution,Keywords,Text,star_rating,helpful_votes,total_votes,review
0,0.0,0.2,"time, video, service, thing, price, product, kid, watch_movie, day, cable",[],5,0,0,Five Stars netflix is great!
1,1.0,0.6345,"selection, show, year, computer, use, watch, content, series, load, commercial","['buck', 'show', 'buck']",5,0,0,GREAT Why pay per view like the others? 8 bucks for all you want to watch. Want to watch every episode of a show? 8 bucks.
2,2.0,0.3834,"movie, love, work, fire, phone, month, episode, lot, picture, title","['navigation', 'movies_show', 'hitch', 'device']",3,0,0,"Satisfactory App runs fine on kindle fire HD. Navigation thru app efficient, movies/shows play without hitch, sound is great. Video quality needs work, expected better from an HD device."
3,0.0,0.2,"time, video, service, thing, price, product, kid, watch_movie, day, cable",[],5,0,0,Five Stars The best
4,4.0,0.4197,"issue, program, way, update, season, kindle, list, money, people, access",['original_series'],5,0,0,Excellent! Love netflix! Worked great on my Kindle. Watched one of the original series that I've been wanting to watch for awhile.


#### 2. Mallet's LDA with TF-IDF - Find the most representative document for each topic

In [43]:
print("Find the most representative document for each topic")
sent_topics_sorteddf_tfidf_m = find_most_representative_doc_for_each_doc(df_topic_sents_keywords_tfidf_m)
sent_topics_sorteddf_tfidf_m.style.set_properties(subset=['review'], **{'width': '400px'})

Find the most representative document for each topic


Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,star_rating,helpful_votes,total_votes,review
0,0.0,0.9487,"love, work, problem, watch, family, lot, newer_movie, kid, choice, thing",5,0,0,"Netflix is the best! We were more than pleased with our choice of getting Netflix. Prior to having it in our home, I thought it just gave us access to movies. Wow...was I ever wrong! There are tons of great old tv shows, documentaries, family films, Christian films and more. It has been a wonderful addition to our non-cable tv home and a LOT less costly!"
1,0.0,0.9487,"love, work, problem, watch, family, lot, newer_movie, kid, choice, thing",5,0,0,"most used app I love netflix on the kindle fire because its nice and portable, I haven't had any problems with it except that once in a while the sidescroll locks up but if I scroll vertically it fixes the problem this issue is annoying but is not that common and does not hinder usage of the application, overall its amazing and I love using it on my kindle."
2,0.0,0.9487,"love, work, problem, watch, family, lot, newer_movie, kid, choice, thing",2,0,0,"Inconsistent This application is esp. patchy. Delivery of content is labored and iregular--and, as Amazon Prime works fine on my Kindle Fire, my internet speed _should not_ be an issue. Maintenance of recently watched queue is irregular. Search function is poor. Cannot seem to find films that can be found on Wii interface."
3,1.0,0.9608,"movie, app, selection, four_great, time, program, tablet, shows_movie, love_great, service",5,1,1,"The gems go into my game just like my money is removed from my account I ordered a ""Bag of Gems"" for my game ""Dragonvale"" on 1-21-2015 at about 9pm for $4.99. The money was taken out of my account almost instantly, but I'm still waiting on my gems. Any idea on how long it will be? The gems go into my game just like my money is removed from my account. Hmm?"
4,2.0,0.9487,"love, tv, fire, kindle, watch_movie, show, tv_show, commercial, price, device",3,0,0,"Such Profile. Does what you expect it to do, except there are no profiles! Where are my profiles! Needs more profiles! Wow, such profile, many profile, amaze profile!!"


#### 3. Mallet's LDA with TF-IDF - Topic distribution across documents

In [44]:
# Show
print("Topic distribution across documents")
df_dominant_topic_tfidf_m = topic_distribution_across_docs(df_topic_sents_keywords_tfidf_m)
df_dominant_topic_tfidf_m.style.set_properties(subset=['Keywords'], **{'width': '400px'})

Topic distribution across documents


Unnamed: 0,Dominant_Topic,Keywords,Num_Documents,Perc_Documents
0,0.0,"love, work, problem, watch, family, lot, newer_movie, kid, choice, thing",6221,0.4951
1,1.0,"movie, app, selection, four_great, time, program, tablet, shows_movie, love_great, service",3615,0.2877
2,2.0,"love, tv, fire, kindle, watch_movie, show, tv_show, commercial, price, device",2730,0.2173


In [45]:
coherence_model_tfidf_m = CoherenceModel(model=ldamallet_tfidf, texts=data_lemmatized, dictionary=dictionary, coherence='c_npmi')

In [46]:
coherence_model_tfidf_m.get_coherence_per_topic()

[-0.25736556451423853, -0.17892124810727508, -0.2035294296894511]