In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
from collections import Counter
import warnings
import logging

import spacy
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel

import pyLDAvis.gensim

from tqdm._tqdm_notebook import tqdm, tqdm_notebook, tnrange
from S3_read_write import load_df_s3, save_df_s3

from IPython.display import Image
from IPython.core.display import HTML

In [3]:
tqdm_notebook.pandas('Progress')

In [4]:
bucket_name = 'amazon-reviews-project'

In [5]:
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.DEBUG)

# Load Amazon Reviews Data

Let us start off using only the title (`summary`) and body (`reviewText`) of each review.

In [5]:
%%time
df = load_df_s3(bucket_name, filepath='amazon_reviews/data_clean_v3', filetype='feather')

# df = load_df_s3(bucket_name, filepath='amazon_reviews/reviews_data_clean_v2.feather', filetype='feather')

CPU times: user 2.16 s, sys: 2.01 s, total: 4.17 s
Wall time: 9.81 s


In [6]:
df.dtypes

index                 int64
asin                 object
helpful              object
reviewText           object
overall             float64
summary              object
description          object
title                object
categories_clean     object
cat1                 object
cat2                 object
cat3                 object
cat4                 object
cat5                 object
cat6                 object
cat7                 object
dtype: object

In [7]:
df = df.loc[:, ['asin', 'reviewText', 'summary']]

In [8]:
df.shape

(217530, 3)

In [9]:
df.head()

Unnamed: 0,asin,reviewText,summary
0,978559088,I started taking this after both my parents di...,Bioavailability is the key
1,978559088,I really liked this product because it stayed ...,"I can't find this product any longer, and I wi..."
2,978559088,"Resveratrol is a polar compound, very insolubl...",Just the Resveratrol product we need
3,1427600228,I bought several of these bracelets for my YMC...,The kids love these bracelets
4,1427600228,I bought a few the other week just to see what...,Pleasant Surprise


In [10]:
# for each review, concatenate the review title and body
df.reviewText = df.summary + '. ' + df.reviewText

In [11]:
pd.set_option('max_colwidth', 200)
df.head()

Unnamed: 0,asin,reviewText,summary
0,978559088,Bioavailability is the key. I started taking this after both my parents died of cancer as it supposed to enhance your immune system - the story on 60 Minutes on resveratrol was incredibly inspirin...,Bioavailability is the key
1,978559088,"I can't find this product any longer, and I wish I could.. I really liked this product because it stayed in my mouth for a long time and I felt it was probably doing some good. I take a number of...","I can't find this product any longer, and I wish I could."
2,978559088,"Just the Resveratrol product we need. Resveratrol is a polar compound, very insoluble in water and hence saliva. To get sufficient Resveratrol absorbed a Resveratrol lozenge would have to stay in...",Just the Resveratrol product we need
3,1427600228,"The kids love these bracelets. I bought several of these bracelets for my YMCA kids. Everyone tells me that it brought good luck. Placebo effect, perhaps but it's a positive effect and builds conf...",The kids love these bracelets
4,1427600228,Pleasant Surprise. I bought a few the other week just to see what they're all about. The first day I wore one of the bracelets three people asked about it. They liked the look. I told them it w...,Pleasant Surprise


Let's drop the `summary` column now:

In [12]:
df.drop(['summary'], axis=1, inplace=True)

In [13]:
df.head()

Unnamed: 0,asin,reviewText
0,978559088,Bioavailability is the key. I started taking this after both my parents died of cancer as it supposed to enhance your immune system - the story on 60 Minutes on resveratrol was incredibly inspirin...
1,978559088,"I can't find this product any longer, and I wish I could.. I really liked this product because it stayed in my mouth for a long time and I felt it was probably doing some good. I take a number of..."
2,978559088,"Just the Resveratrol product we need. Resveratrol is a polar compound, very insoluble in water and hence saliva. To get sufficient Resveratrol absorbed a Resveratrol lozenge would have to stay in..."
3,1427600228,"The kids love these bracelets. I bought several of these bracelets for my YMCA kids. Everyone tells me that it brought good luck. Placebo effect, perhaps but it's a positive effect and builds conf..."
4,1427600228,Pleasant Surprise. I bought a few the other week just to see what they're all about. The first day I wore one of the bracelets three people asked about it. They liked the look. I told them it w...


In [14]:
pd.set_option('max_colwidth', 20)

## Remove Missing Reviews

In [15]:
df.reviewText.isnull().sum()    # 73 reviews have neither a review body text, nor a review title

0

Let's look at a few actual review texts:

In [16]:
df.reviewText.iloc[np.random.randint(0, len(df))]

'Great product. A friend told me about this and I love it. It works just like it says. Highly recommend this product.'

In [17]:
df.reviewText.iloc[np.random.randint(0, len(df))]

'Yes! It works fast and lowers your BP withi a month.. It works...it works...lowered my blood pressure by 30 points. I take 2 pills in the AM and 2 in the afternoon. My BP was high and now it is borderline.'

In [18]:
df.reviewText.iloc[np.random.randint(0, len(df))]

"I love it!. I am very sensitive to most things. I can't drink coffee, I can't drink soda, I can't drink most green teas. However with this, I can have just one teaspoon of the Matcha Green Tea with water and I feel a gradual alertness that stays for quite some time with no crashing. I also find it curbs my appetite. Love it!"

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 217530 entries, 0 to 217529
Data columns (total 2 columns):
asin          217530 non-null object
reviewText    217530 non-null object
dtypes: object(2)
memory usage: 3.3+ MB


## Phrase Detection

In [20]:
text = list(df.reviewText.values)    # make an iterable to store only the review text

In [21]:
[sent for sent in text if len(sent) == 0]   # there are no blank sentences

[]

In [22]:
len(text)

217530

In [23]:
# look at a few sample reviews
for rev in text[:4]:
    print(rev, '\n')

Bioavailability is the key. I started taking this after both my parents died of cancer as it supposed to enhance your immune system - the story on 60 Minutes on resveratrol was incredibly inspiring. Doing some research on the Internet, it is indicated that taking resveratrol in lozenge form is preferable as it is broken down by stomach acids.  The ez-melt formula recommended in another review is OK, but it is dissolved in the mouth much more quickly than this lozenge formula, while dissolving more slowly is preferable according to my research.This product has the greatest side effect - since taking it, I haven't had colds or sore throats.  Soon after starting to take it every day, I was starting to come down with a cold, with all my usual symptoms, and was anticipating being very sick the next day, as is my usual pattern.  But I never did get as sick as anticipated - taking this product is the only reason I can come up with.  Since then, I've had no colds or sore throats - it has been 

In [24]:
nlp = spacy.load('en')

The helper functions below are from:

http://nbviewer.jupyter.org/github/skipgram/modern-nlp-in-python/blob/master/executable/Modern_NLP_in_Python.ipynb

In order to use `gensim`'s `Phrases` class to detect natural combinations of words (like 'vanilla ice cream'), we need to format our text into a list of sentences, with each sentence being a list of words.  This process takes a large amount of processing time (for reference, the times shown under the cells are for running the tasks on a c5.18xlarge EC2 instance (equivalent spot fleet)), so `text` has been split into 3 parts.

### Generate Unigram Sentences

In [25]:
len(text)

217530

In [26]:
# split text into 9 parts
text_first  = text[:50000]
text_second = text[50000:100000]
text_third  = text[100000:150000]
text_fourth = text[150000:]

In [27]:
rev_num = 0    # review tracker
sent_num = 0   # sentence tracker
unigram_sents_pos = [] # to store lists of lemmatized tokens for each sentence

for parsed_review in tqdm(nlp.pipe(text_first, batch_size=10000, n_threads=72)):
    rev_num += 1
    for sent in parsed_review.sents:
        sent_num += 1
        # lemmatize tokens & save corresponding pos tag after filtering whitespace and punctuations
        lemmatized_sent = [(token.lemma_, token.pos_) for token in sent if not (token.is_space or token.is_punct)]
        if len(lemmatized_sent) != 0:
            unigram_sents_pos.append([rev_num, sent_num, lemmatized_sent])

print('current rev_num: ', rev_num)
print('current sent_num: ', sent_num)

50000it [06:56, 120.02it/s]

current rev_num:  50000
current sent_num:  289565





In [28]:
len(unigram_sents_pos)

289158

In [29]:
for i in range(5):
    print(unigram_sents_pos[i])

[1, 1, [('bioavailability', 'PROPN'), ('be', 'VERB'), ('the', 'DET'), ('key', 'NOUN')]]
[1, 2, [('-PRON-', 'PRON'), ('start', 'VERB'), ('take', 'VERB'), ('this', 'DET'), ('after', 'ADP'), ('both', 'CCONJ'), ('-PRON-', 'ADJ'), ('parent', 'NOUN'), ('die', 'VERB'), ('of', 'ADP'), ('cancer', 'NOUN'), ('as', 'ADP'), ('-PRON-', 'PRON'), ('suppose', 'VERB'), ('to', 'PART'), ('enhance', 'VERB'), ('-PRON-', 'ADJ'), ('immune', 'ADJ'), ('system', 'NOUN'), ('the', 'DET'), ('story', 'NOUN'), ('on', 'ADP'), ('60', 'NUM'), ('minutes', 'PROPN'), ('on', 'ADP'), ('resveratrol', 'NOUN'), ('be', 'VERB'), ('incredibly', 'ADV'), ('inspiring', 'ADJ')]]
[1, 3, [('do', 'VERB'), ('some', 'DET'), ('research', 'NOUN'), ('on', 'ADP'), ('the', 'DET'), ('internet', 'NOUN'), ('-PRON-', 'PRON'), ('be', 'VERB'), ('indicate', 'VERB'), ('that', 'ADP'), ('take', 'VERB'), ('resveratrol', 'NOUN'), ('in', 'ADP'), ('lozenge', 'NOUN'), ('form', 'NOUN'), ('be', 'VERB'), ('preferable', 'ADJ'), ('as', 'ADP'), ('-PRON-', 'PRON'), 

In [30]:
# check if there are any blank sentences
for sent in unigram_sents_pos:
    if len(sent[2]) == 0:
        print(sent)

In [31]:
# Save progress...
review_number = [row[0] for row in unigram_sents_pos]
sentence_number = [row[1] for row in unigram_sents_pos]
words_joined_all = []
pos_joined_all = []
for sent in unigram_sents_pos:
    word_pos = sent[2]
    word_list = [word for word, pos in word_pos]
    pos_list = [pos for word, pos in word_pos]
    words_joined = '+-+||+-+'.join(word for word in word_list)
    pos_joined   = '+-+||+-+'.join(pos for pos in pos_list)
    words_joined_all.append(words_joined)
    pos_joined_all.append(pos_joined)
    
unigram_sentences_savedf = pd.DataFrame({'review_number': review_number,
                                         'sentence_number': sentence_number,
                                         'unigram_sentences': words_joined_all,
                                         'unigram_pos': pos_joined_all})

save_df_s3(unigram_sentences_savedf, bucket_name, 'amazon_reviews/kk/unigram_sentences.feather')

In [32]:
for parsed_review in tqdm(nlp.pipe(text_second, batch_size=20000, n_threads=36)):
    rev_num += 1
    for sent in parsed_review.sents:
        sent_num += 1
        # lemmatize tokens & save corresponding pos tag after filtering whitespace and punctuations
        lemmatized_sent = [(token.lemma_, token.pos_) for token in sent if not (token.is_space or token.is_punct)]
        if len(lemmatized_sent) != 0:
            unigram_sents_pos.append([rev_num, sent_num, lemmatized_sent])

print('current rev_num: ', rev_num)
print('current sent_num: ', sent_num)

50000it [06:55, 120.43it/s]

current rev_num:  100000
current sent_num:  576892





In [33]:
print(len(unigram_sents_pos))

576034


In [34]:
# Save progress...
review_number = [row[0] for row in unigram_sents_pos]
sentence_number = [row[1] for row in unigram_sents_pos]
words_joined_all = []
pos_joined_all = []
for sent in unigram_sents_pos:
    word_pos = sent[2]
    word_list = [word for word, pos in word_pos]
    pos_list = [pos for word, pos in word_pos]
    words_joined = '+-+||+-+'.join(word for word in word_list)
    pos_joined   = '+-+||+-+'.join(pos for pos in pos_list)
    words_joined_all.append(words_joined)
    pos_joined_all.append(pos_joined)
    
unigram_sentences_savedf = pd.DataFrame({'review_number': review_number,
                                         'sentence_number': sentence_number,
                                         'unigram_sentences': words_joined_all,
                                         'unigram_pos': pos_joined_all})

save_df_s3(unigram_sentences_savedf, bucket_name, 'amazon_reviews/kk/unigram_sentences.feather')

In [35]:
for parsed_review in tqdm(nlp.pipe(text_third, batch_size=20000, n_threads=36)):
    rev_num += 1
    for sent in parsed_review.sents:
        sent_num += 1
        # lemmatize tokens & save corresponding pos tag after filtering whitespace and punctuations
        lemmatized_sent = [(token.lemma_, token.pos_) for token in sent if not (token.is_space or token.is_punct)]
        if len(lemmatized_sent) != 0:
            unigram_sents_pos.append([rev_num, sent_num, lemmatized_sent])

print('current rev_num: ', rev_num)
print('current sent_num: ', sent_num)

50000it [06:27, 129.16it/s]

current rev_num:  150000
current sent_num:  855567





In [36]:
# Save progress...
review_number = [row[0] for row in unigram_sents_pos]
sentence_number = [row[1] for row in unigram_sents_pos]
words_joined_all = []
pos_joined_all = []
for sent in unigram_sents_pos:
    word_pos = sent[2]
    word_list = [word for word, pos in word_pos]
    pos_list = [pos for word, pos in word_pos]
    words_joined = '+-+||+-+'.join(word for word in word_list)
    pos_joined   = '+-+||+-+'.join(pos for pos in pos_list)
    words_joined_all.append(words_joined)
    pos_joined_all.append(pos_joined)
    
unigram_sentences_savedf = pd.DataFrame({'review_number': review_number,
                                         'sentence_number': sentence_number,
                                         'unigram_sentences': words_joined_all,
                                         'unigram_pos': pos_joined_all})

save_df_s3(unigram_sentences_savedf, bucket_name, 'amazon_reviews/kk/unigram_sentences.feather')

In [37]:
for parsed_review in tqdm(nlp.pipe(text_fourth, batch_size=20000, n_threads=36)):
    rev_num += 1
    for sent in parsed_review.sents:
        sent_num += 1
        # lemmatize tokens & save corresponding pos tag after filtering whitespace and punctuations
        lemmatized_sent = [(token.lemma_, token.pos_) for token in sent if not (token.is_space or token.is_punct)]
        if len(lemmatized_sent) != 0:
            unigram_sents_pos.append([rev_num, sent_num, lemmatized_sent])

print('current rev_num: ', rev_num)
print('current sent_num: ', sent_num)

67530it [09:13, 122.02it/s]

current rev_num:  217530
current sent_num:  1243596





In [38]:
# Save progress...
review_number = [row[0] for row in unigram_sents_pos]
sentence_number = [row[1] for row in unigram_sents_pos]
words_joined_all = []
pos_joined_all = []
for sent in unigram_sents_pos:
    word_pos = sent[2]
    word_list = [word for word, pos in word_pos]
    pos_list = [pos for word, pos in word_pos]
    words_joined = '+-+||+-+'.join(word for word in word_list)
    pos_joined   = '+-+||+-+'.join(pos for pos in pos_list)
    words_joined_all.append(words_joined)
    pos_joined_all.append(pos_joined)
    
unigram_sentences_savedf = pd.DataFrame({'review_number': review_number,
                                         'sentence_number': sentence_number,
                                         'unigram_sentences': words_joined_all,
                                         'unigram_pos': pos_joined_all})

save_df_s3(unigram_sentences_savedf, bucket_name, 'amazon_reviews/kk/unigram_sentences.feather')

In [39]:
# DON'T LOAD THIS FILE - there's a _v1 version further down!
# del unigram_sentences_savedf
unigram_sentences_savedf = load_df_s3(bucket_name, 'amazon_reviews/kk/unigram_sentences.feather', filetype='feather')

In [40]:
unigram_sentences_savedf.head()

Unnamed: 0,review_number,sentence_number,unigram_pos,unigram_sentences
0,1,1,PROPN+-+||+-+VER...,bioavailability+...
1,1,2,PRON+-+||+-+VERB...,-PRON-+-+||+-+st...
2,1,3,VERB+-+||+-+DET+...,do+-+||+-+some+-...
3,1,4,DET+-+||+-+ADP+-...,the+-+||+-+ez+-+...
4,1,5,DET+-+||+-+NOUN+...,this+-+||+-+prod...


In [41]:
unigram_sentences_savedf[unigram_sentences_savedf.unigram_pos == ''].shape

(0, 4)

In [42]:
unigram_sentences_savedf[unigram_sentences_savedf.unigram_sentences == ''].shape

(0, 4)

In [43]:
unigram_sentences_savedf[unigram_sentences_savedf.unigram_pos == ''].head()  # no blank sentences

Unnamed: 0,review_number,sentence_number,unigram_pos,unigram_sentences


In [44]:
unigram_sentences_savedf[unigram_sentences_savedf.unigram_pos == ''].shape

(0, 4)

#### Additional Data Cleaning

In [45]:
def clean_up(sentence, sentence_pos, sep):
    """Expects a sentence as a single string as input 1, and its corresponding part-of-speech tags as input 2 (also single string).
    sep is the string pattern used to separate words in each sentence string
    Cleans it up and returns a single string.
    Also updates corresponding part-of-speech string.
    """
    # get rid of webpage links
    cond = ['http' in sentence, 'www' in sentence]
    if any(cond):
        words = sentence.split(sep)
        words_pos = sentence_pos.split(sep)
        to_remove = []
        for i in range(len(words)):
            cond_word = ['http' in words[i], 'www' in words[i]]
            if any(cond_word):
                to_remove.append(i)
        # remove words that are links
        for j in sorted(to_remove, reverse=True):
            del words[j]
            del words_pos[j]
        # reconstruct sentence after deleting links
        sentence = sep.join(words)
        sentence_pos = sep.join(words_pos)

    # replace underscores with blanks to avoid mix-up with paired words later
    # cannot replace with spaces because the strings are split on spaces later 
    # and this would create new words with no corresponding pos tags
    if '_' in sentence:
        sentence = sentence.replace('_', '')
    return sentence, sentence_pos

In [46]:
test_clean = ['whoa watch out for them links boy http://sup.com and also BAM! underscore_time!', 'this is a normal sentence', 
              '__ what is this ____ http', '_', 'http']
test_clean

['whoa watch out for them links boy http://sup.com and also BAM! underscore_time!',
 'this is a normal sentence',
 '__ what is this ____ http',
 '_',
 'http']

In [47]:
test_clean_pos = ['X X X X X X X X X X X X', 'X X X X X', 'X X X X X X', 'X', 'X']

In [48]:
[len(e.split(' ')) for e in test_clean]

[12, 5, 6, 1, 1]

In [49]:
[e.count('X') for e in test_clean_pos]

[12, 5, 6, 1, 1]

In [50]:
# check if clean_up works as expected
to_remove = []
for i in range(len(test_clean)):
    sentence = test_clean[i]
    sentence_pos = test_clean_pos[i]
    test_clean[i], test_clean_pos[i] = clean_up(sentence, sentence_pos, sep=' ')
    
    # mark elements to delete if empty
    if test_clean[i] == '':
        to_remove.append(i)

# delete elements that are empty
for j in sorted(to_remove, reverse=True):
    del test_clean[j]
    del test_clean_pos[j]

test_clean

['whoa watch out for them links boy and also BAM! underscoretime!',
 'this is a normal sentence',
 ' what is this ']

In [51]:
test_clean_pos

['X X X X X X X X X X X', 'X X X X X', 'X X X X X']

In [52]:
[e.count('X') for e in test_clean_pos]

[11, 5, 5]

In [53]:
[len(e.split(' ')) for e in test_clean]

[11, 5, 5]

In [54]:
words_joined_all = unigram_sentences_savedf.unigram_sentences.tolist()

In [55]:
pos_joined_all = unigram_sentences_savedf.unigram_pos.tolist()

In [56]:
len(words_joined_all)

1241850

In [57]:
len([sentence for sentence in words_joined_all if '_' in sentence])

172

In [58]:
len([sentence for sentence in words_joined_all if 'http' in sentence])

126

In [59]:
len([sentence for sentence in words_joined_all if 'www' in sentence])

157

In [60]:
unigram_sentences_savedf[unigram_sentences_savedf.unigram_sentences.str.contains('_')].head()

Unnamed: 0,review_number,sentence_number,unigram_pos,unigram_sentences
3375,607,3386,X,http://www.amazo...
8051,1454,8071,DET+-+||+-+NOUN+...,no+-+||+-+jet_la...
12199,2166,12224,ADJ+-+||+-+PART+...,easy+-+||+-+to+-...
12201,2166,12226,PRON+-+||+-+VERB...,-PRON-+-+||+-+ha...
18188,3106,18214,ADV+-+||+-+ADJ+-...,overall+-+||+-+-...


In [61]:
[sentence for sentence in words_joined_all if '_' in sentence][:10]

['http://www.amazon.com/gp/product/b0000533z8/ref=cm_cr_rev_prod_title',
 'no+-+||+-+jet_lag+-+||+-+pill',
 'easy+-+||+-+to+-+||+-+use_work+-+||+-+well',
 '-PRON-+-+||+-+have+-+||+-+have+-+||+-+pedometer+-+||+-+in+-+||+-+the+-+||+-+past_all+-+||+-+difficult+-+||+-+and+-+||+-+confusing+-+||+-+to+-+||+-+use+-+||+-+to+-+||+-+the+-+||+-+point+-+||+-+-PRON-+-+||+-+simply+-+||+-+give+-+||+-+up+-+||+-+on+-+||+-+-PRON-',
 'overall+-+||+-+-PRON-+-+||+-+mother+-+||+-+be+-+||+-+very+-+||+-+satisfied+-+||+-+with+-+||+-+this+-+||+-+product!-d_lionz',
 'this+-+||+-+inexpensive+-+||+-+strap+-+||+-+with+-+||+-+a+-+||+-+metal+-+||+-+clip+-+||+-+http://www.amazon.com/gp/product/b000bitymg/ref=oh_details_o00_s00_i00?ie=utf8&psc;=1+-+||+-+be+-+||+-+a+-+||+-+good+-+||+-+replacement+-+||+-+for+-+||+-+the+-+||+-+flimsy+-+||+-+omron+-+||+-+plastic+-+||+-+clip+-+||+-+but+-+||+-+-PRON-+-+||+-+have+-+||+-+not+-+||+-+be+-+||+-+use+-+||+-+-PRON-+-+||+-+long',
 'hj_112+-+||+-+digital+-+||+-+pemium+-+||+-+pedometer+

In [62]:
# clean up all unigrams
to_remove = []
for i in range(len(words_joined_all)):
    sentence = words_joined_all[i]
    sentence_pos = pos_joined_all[i]
    words_joined_all[i], pos_joined_all[i] = clean_up(sentence, sentence_pos, sep='+-+||+-+')
    
    # mark elements to delete if empty
    if words_joined_all[i] == '':
        to_remove.append(i)

# delete elements that are empty
for j in sorted(to_remove, reverse=True):
    del words_joined_all[j]
    del pos_joined_all[j]

In [63]:
# drop rows from unigram_sentences_savedf corresponding to the row numbers (indices) of sentences
# that will be blank after the transformation above
unigram_sentences_savedf.drop(unigram_sentences_savedf.index[to_remove], axis=0, inplace=True)

In [64]:
unigram_sentences_savedf.drop(['unigram_sentences'], axis=1, inplace=True)
unigram_sentences_savedf.drop(['unigram_pos'], axis=1, inplace=True)

In [65]:
unigram_sentences_savedf['unigram_sentences'] = words_joined_all
unigram_sentences_savedf['unigram_pos'] = pos_joined_all

In [66]:
unigram_sentences_savedf.head()

Unnamed: 0,review_number,sentence_number,unigram_sentences,unigram_pos
0,1,1,bioavailability+...,PROPN+-+||+-+VER...
1,1,2,-PRON-+-+||+-+st...,PRON+-+||+-+VERB...
2,1,3,do+-+||+-+some+-...,VERB+-+||+-+DET+...
3,1,4,the+-+||+-+ez+-+...,DET+-+||+-+ADP+-...
4,1,5,this+-+||+-+prod...,DET+-+||+-+NOUN+...


In [68]:
unigram_sentences_savedf.shape

(1241826, 4)

In [69]:
# updated, cleaned up version of unigram_sentences.feather
save_df_s3(unigram_sentences_savedf, bucket_name, 'amazon_reviews/kk/unigram_sentences_v1.feather')

### Phrase Detection

In [21]:
unigram_sentences_savedf = load_df_s3(bucket_name, 'amazon_reviews/kk/unigram_sentences_v1.feather', filetype='feather')

In [70]:
words_joined_all = unigram_sentences_savedf.unigram_sentences.tolist()

In [71]:
unigram_sentences = [sentence.split('+-+||+-+') for sentence in words_joined_all]

In [72]:
print(unigram_sentences[:4])

[['bioavailability', 'be', 'the', 'key'], ['-PRON-', 'start', 'take', 'this', 'after', 'both', '-PRON-', 'parent', 'die', 'of', 'cancer', 'as', '-PRON-', 'suppose', 'to', 'enhance', '-PRON-', 'immune', 'system', 'the', 'story', 'on', '60', 'minutes', 'on', 'resveratrol', 'be', 'incredibly', 'inspiring'], ['do', 'some', 'research', 'on', 'the', 'internet', '-PRON-', 'be', 'indicate', 'that', 'take', 'resveratrol', 'in', 'lozenge', 'form', 'be', 'preferable', 'as', '-PRON-', 'be', 'break', 'down', 'by', 'stomach', 'acid'], ['the', 'ez', 'melt', 'formula', 'recommend', 'in', 'another', 'review', 'be', 'ok', 'but', '-PRON-', 'be', 'dissolve', 'in', 'the', 'mouth', 'much', 'more', 'quickly', 'than', 'this', 'lozenge', 'formula', 'while', 'dissolve', 'more', 'slowly', 'be', 'preferable', 'accord', 'to', '-PRON-', 'research']]


In [73]:
len(words_joined_all)

1241826

In [74]:
%%time
# The common_terms parameter add a way to give special treatment to common terms 
# (aka stop words) such that their presence between two words won’t prevent bigram detection. 
# It allows to detect expressions like “bank of america”
common_terms = ["of", "with", "without", "and", "or"]

# Train a first-order phrase detector
bigram_model = Phrases(unigram_sentences, threshold=0.6, scoring='npmi', common_terms=common_terms)

# Transform unigram sentences into bigram sentences
# Paired words are connected by an underscore, e.g. ice_cream
bigram_sentences = []
for sentence in unigram_sentences:
    bigram_sentences.append(bigram_model[sentence])



CPU times: user 1min 24s, sys: 1.12 s, total: 1min 25s
Wall time: 1min 25s


In [75]:
%%time
# Train a second-order phrase detector
# trigram_model = Phrases(bigram_sentences, min_count=5)
trigram_model = Phrases(bigram_sentences, threshold=0.5, scoring='npmi')

# Transform bigram sentences into trigram sentences
trigram_sentences = []
for sentence in bigram_sentences:
    trigram_sentences.append(trigram_model[sentence])

# remove any remaining stopwords
# trigram_sentences = [[word for word in sentence if word not in nlp.Defaults.stop_words] for sentence in trigram_sentences]



CPU times: user 1min 23s, sys: 1.29 s, total: 1min 24s
Wall time: 1min 24s


In [76]:
# the trigrams will be saved in a dataframe with a single column.
# each row is one sentence from any review
# each sentence is a single string separated by a single space.
trigram_sentences_savedf = pd.DataFrame([u'+-+||+-+'.join(sentence) for sentence in trigram_sentences], columns=['preprocessed_review'])
save_df_s3(trigram_sentences_savedf, bucket_name, 'amazon_reviews/kk/preprocessed_reviews.feather')

In [5]:
trigram_sentences_savedf = load_df_s3(bucket_name, 'amazon_reviews/kk/preprocessed_reviews.feather', filetype='feather')

In [77]:
trigram_sentences_savedf.head()

Unnamed: 0,preprocessed_review
0,bioavailability+...
1,-PRON-+-+||+-+st...
2,do+-+||+-+some+-...
3,the+-+||+-+ez+-+...
4,this_product+-+|...


In [78]:
del unigram_sentences_savedf
unigram_sents_pos_df = load_df_s3(bucket_name, 'amazon_reviews/kk/unigram_sentences_v1.feather', filetype='feather')

In [79]:
unigram_sents_pos_df.head()

Unnamed: 0,review_number,sentence_number,unigram_sentences,unigram_pos
0,1,1,bioavailability+...,PROPN+-+||+-+VER...
1,1,2,-PRON-+-+||+-+st...,PRON+-+||+-+VERB...
2,1,3,do+-+||+-+some+-...,VERB+-+||+-+DET+...
3,1,4,the+-+||+-+ez+-+...,DET+-+||+-+ADP+-...
4,1,5,this+-+||+-+prod...,DET+-+||+-+NOUN+...


In [80]:
unigram_sents_pos_df[unigram_sents_pos_df.unigram_pos == ''].shape

(0, 4)

In [81]:
unigram_sents_pos_df.shape

(1241826, 4)

In [82]:
unigram_sents_pos_df = pd.merge(unigram_sents_pos_df, trigram_sentences_savedf, how='inner', left_index=True, right_index=True)

In [83]:
unigram_sents_pos_df.head(10)

Unnamed: 0,review_number,sentence_number,unigram_sentences,unigram_pos,preprocessed_review
0,1,1,bioavailability+...,PROPN+-+||+-+VER...,bioavailability+...
1,1,2,-PRON-+-+||+-+st...,PRON+-+||+-+VERB...,-PRON-+-+||+-+st...
2,1,3,do+-+||+-+some+-...,VERB+-+||+-+DET+...,do+-+||+-+some+-...
3,1,4,the+-+||+-+ez+-+...,DET+-+||+-+ADP+-...,the+-+||+-+ez+-+...
4,1,5,this+-+||+-+prod...,DET+-+||+-+NOUN+...,this_product+-+|...
5,1,6,soon+-+||+-+afte...,ADV+-+||+-+ADP+-...,soon+-+||+-+afte...
6,1,7,but+-+||+-+-PRON...,CCONJ+-+||+-+PRO...,but+-+||+-+-PRON...
7,1,8,since+-+||+-+the...,ADP+-+||+-+ADV+-...,since+-+||+-+the...
8,1,9,-PRON-+-+||+-+re...,PRON+-+||+-+VERB...,-PRON-+-+||+-+re...
9,2,10,-PRON-+-+||+-+ca...,PRON+-+||+-+VERB...,-PRON-+-+||+-+ca...


In [84]:
save_df_s3(unigram_sents_pos_df, bucket_name, 'amazon_reviews/kk/preprocessed_reviews_v1.feather')

In [113]:
unigram_sents_pos_df = load_df_s3(bucket_name, 'amazon_reviews/kk/preprocessed_reviews_v1.feather', filetype='feather')

In [85]:
unigram_sents_pos_df.shape

(1241826, 5)

In [86]:
unigram_sents_pos_df.iloc[400:410]

Unnamed: 0,review_number,sentence_number,unigram_sentences,unigram_pos,preprocessed_review
400,73,401,-PRON-+-+||+-+ha...,PRON+-+||+-+VERB...,-PRON-+-+||+-+ha...
401,73,402,so+-+||+-+just+-...,ADV+-+||+-+ADV+-...,so+-+||+-+just+-...
402,73,403,-PRON-+-+||+-+ca...,PRON+-+||+-+VERB...,-PRON-+-+||+-+ca...
403,73,404,-PRON-+-+||+-+do...,PRON+-+||+-+VERB...,-PRON-+-+||+-+do...
404,73,405,but+-+||+-+-PRON...,CCONJ+-+||+-+PRO...,but+-+||+-+-PRON...
405,74,406,excellent+-+||+-...,INTJ+-+||+-+PRON...,excellent+-+||+-...
406,74,407,-PRON-+-+||+-+ha...,PRON+-+||+-+VERB...,-PRON-+-+||+-+ha...
407,74,408,-PRON-+-+||+-+ac...,PRON+-+||+-+ADV+...,-PRON-+-+||+-+ac...
408,74,409,-PRON-+-+||+-+be...,PRON+-+||+-+VERB...,-PRON-+-+||+-+be...
409,74,410,one+-+||+-+pill+...,NUM+-+||+-+NOUN+...,one+-+||+-+pill+...


In [87]:
unigram_sents_pos_df.isnull().sum()

review_number          0
sentence_number        0
unigram_sentences      0
unigram_pos            0
preprocessed_review    0
dtype: int64

In [88]:
unigram_sents_pos_df['has_paired_words'] = 0

In [89]:
unigram_sents_pos_df.loc[unigram_sents_pos_df.preprocessed_review.str.contains('_'), ['has_paired_words']] = 1

In [90]:
unigram_sents_pos_df.has_paired_words.sum()  # number of sentences with paired words

532356

In [91]:
unigram_sents_pos_df.head()

Unnamed: 0,review_number,sentence_number,unigram_sentences,unigram_pos,preprocessed_review,has_paired_words
0,1,1,bioavailability+...,PROPN+-+||+-+VER...,bioavailability+...,0
1,1,2,-PRON-+-+||+-+st...,PRON+-+||+-+VERB...,-PRON-+-+||+-+st...,1
2,1,3,do+-+||+-+some+-...,VERB+-+||+-+DET+...,do+-+||+-+some+-...,1
3,1,4,the+-+||+-+ez+-+...,DET+-+||+-+ADP+-...,the+-+||+-+ez+-+...,0
4,1,5,this+-+||+-+prod...,DET+-+||+-+NOUN+...,this_product+-+|...,1


In [92]:
%%time
unigram_sents_pos_df.unigram_pos = [sent.split('+-+||+-+') for sent in unigram_sents_pos_df.unigram_pos.tolist()]
unigram_sents_pos_df.unigram_sentences = [sent.split('+-+||+-+') for sent in unigram_sents_pos_df.unigram_sentences.tolist()]
unigram_sents_pos_df.preprocessed_review = [sent.split('+-+||+-+') for sent in unigram_sents_pos_df.preprocessed_review.tolist()]

CPU times: user 13.5 s, sys: 2.31 s, total: 15.8 s
Wall time: 15.8 s


In [93]:
unigram_sents_pos_df.iloc[400:410]

Unnamed: 0,review_number,sentence_number,unigram_sentences,unigram_pos,preprocessed_review,has_paired_words
400,73,401,"[-PRON-, have, u...","[PRON, VERB, VER...","[-PRON-, have, u...",1
401,73,402,"[so, just, take,...","[ADV, ADV, VERB,...","[so, just, take,...",0
402,73,403,"[-PRON-, can, no...","[PRON, VERB, ADV...","[-PRON-, can_not...",1
403,73,404,"[-PRON-, do, not...","[PRON, VERB, ADV...","[-PRON-, do_not,...",1
404,73,405,"[but, -PRON-, be...","[CCONJ, PRON, VE...","[but, -PRON-, be...",0
405,74,406,"[excellent, prod...","[INTJ, PRON, ADJ...","[excellent, prod...",0
406,74,407,"[-PRON-, have, t...","[PRON, VERB, VER...","[-PRON-, have, t...",1
407,74,408,"[-PRON-, actuall...","[PRON, ADV, VERB...","[-PRON-, actuall...",1
408,74,409,"[-PRON-, be, con...","[PRON, VERB, ADJ...","[-PRON-, be, con...",0
409,74,410,"[one, pill, seem...","[NUM, NOUN, VERB...","[one, pill, seem...",1


In [94]:
unigram_sents_pos_df.isnull().sum()

review_number          0
sentence_number        0
unigram_sentences      0
unigram_pos            0
preprocessed_review    0
has_paired_words       0
dtype: int64

Let's look at an arbitrary sentence and it's transformation:

In [95]:
print(unigram_sents_pos_df.unigram_sentences.iloc[105])

['-PRON-', 'have', 'buy', 'nu', 'skin', 'product', 'about', '5', 'year', 'ago', 'from', 'a', 'beauty', 'salon', 'and', '-PRON-', 'love', '-PRON-']


In [96]:
print(unigram_sents_pos_df.unigram_pos.iloc[105])

['PRON', 'VERB', 'VERB', 'PROPN', 'PROPN', 'NOUN', 'ADV', 'NUM', 'NOUN', 'ADV', 'ADP', 'DET', 'NOUN', 'NOUN', 'CCONJ', 'PRON', 'VERB', 'PRON']


In [97]:
print(unigram_sents_pos_df.preprocessed_review.iloc[105])

['-PRON-', 'have', 'buy', 'nu', 'skin', 'product', 'about', '5', 'year_ago', 'from', 'a', 'beauty', 'salon', 'and', '-PRON-', 'love', '-PRON-']


In [98]:
gramlist = [word for sent in trigram_sentences for word in sent if '_' in word]

In [99]:
paired_words_frq = Counter(gramlist)
paired_words_frq.most_common(100)

[('this_product', 69494),
 ('do_not', 59653),
 ('seem_to', 14671),
 ('can_not', 14216),
 ('fish_oil', 12919),
 ('highly_recommend', 12415),
 ('as_well', 10388),
 ('lot_of', 9337),
 ('so_far', 6983),
 ('would_recommend', 6173),
 ('will_continue', 5383),
 ('every_day', 5290),
 ('side_effect', 5008),
 ('along_with', 4968),
 ('vitamin_d', 4601),
 ('at_least', 4305),
 ('per_day', 4262),
 ('high_quality', 4237),
 ('year_ago', 4147),
 ('at_night', 3881),
 ('very_happy', 3827),
 ('vitamin_c', 3800),
 ('suffer_from', 3715),
 ('immune_system', 3494),
 ('five_star', 3471),
 ('omega_3', 3426),
 ('run_out', 3420),
 ('year_old', 3134),
 ('long_time', 3126),
 ('no_longer', 3036),
 ('krill_oil', 3001),
 ('make_sure', 2684),
 ('out_there', 2672),
 ('very_pleased', 2620),
 ('wake_up', 2582),
 ('blood_pressure', 2492),
 ('energy_level', 2438),
 ('no_side_effect', 2353),
 ('get_sick', 2260),
 ('anyone_who', 2208),
 ('every_morning', 2101),
 ('go_away', 2018),
 ('joint_pain', 1996),
 ('weight_loss', 1992),

In [100]:
# Find the 100 most infrequent paired words
paired_words_frq.most_common()[::-1][:100]

[('wishful_thinking!ftc_disclosure', 1),
 ('none_after!.', 1),
 ('34;healthy_fat&#34', 1),
 ('atrail_fibrillationso', 1),
 ('superior_product!paula', 1),
 ('ever!!!highly_recommended', 1),
 ('george_flansbaum_whom', 1),
 ('side_effects).this', 1),
 ('side_effects!so', 1),
 ('onelife_pharma_sound', 1),
 ('coco_mak_seriously', 1),
 ('34;off_days&#34', 1),
 ('i&#8217;m_assuming', 1),
 ('yeast_infection_every2', 1),
 ('plant_derived', 1),
 ('protease_enzym', 1),
 ('virtually_untreatable', 1),
 ('con_su_rodilla', 1),
 ('mi_mama', 1),
 ('ayuda_mucho', 1),
 ('productmuy_buen_producto', 1),
 ('sodium_free.&#34_wow', 1),
 ('1000_mg_serving!.', 1),
 ('countless_cortizone', 1),
 ('tons_of_benefits!.', 1),
 ('34;clear_out&#34', 1),
 ('bri_nutrition&#8217;s_unconditional_guarantee', 1),
 ('dieting_pilling', 1),
 ('nutrition!._bri_nutrition', 1),
 ('garage_and_repaint', 1),
 ('bri_nutrition_triphalia', 1),
 ('unhygienic_colon', 1),
 ('slouchy_and_drain', 1),
 ('8220;bowel_issues&#8221_lately', 1),
 

In [101]:
len(paired_words_frq)  # number of paired terms  (this drops down to 46,785 after further processing)

83377

In [102]:
unigram_sents_pos_df.head()

Unnamed: 0,review_number,sentence_number,unigram_sentences,unigram_pos,preprocessed_review,has_paired_words
0,1,1,[bioavailability...,"[PROPN, VERB, DE...",[bioavailability...,0
1,1,2,"[-PRON-, start, ...","[PRON, VERB, VER...","[-PRON-, start, ...",1
2,1,3,"[do, some, resea...","[VERB, DET, NOUN...","[do, some, resea...",1
3,1,4,"[the, ez, melt, ...","[DET, ADP, NOUN,...","[the, ez, melt, ...",0
4,1,5,"[this, product, ...","[DET, NOUN, VERB...","[this_product, h...",1


In [103]:
def handle_failed_pairing(i, skip, num_paired, sent, sent_paired, to_remove):
    # split up paired words failing our format requirements
    to_remove.extend([i])
    sent_paired.extend(sent[i + skip: i + skip + num_paired])


def filter_pairs(sent, sent_paired, sent_pos):
    """modify sent_paired in place"""
    paired_sent_len = len(sent_paired)
    skip = 0
    to_remove = []
    
    for i in range(paired_sent_len):
        word = sent_paired[i]
        if '_' in word:
            num_paired = word.count('_') + 1
            
            # more than 3 words paired - ignore pairing
            if num_paired > 3:
                handle_failed_pairing(i, skip, num_paired, sent, sent_paired, to_remove)
                skip += num_paired - 1
                continue
            
            # bigrams: noun/adj, noun
            elif num_paired == 2:
                pos_word_1 = sent_pos[i + skip]
                pos_word_2 = sent_pos[i + skip + 1]
                cond = (pos_word_1 in ('NOUN', 'ADJ'), pos_word_2 == 'NOUN')
                if not all(cond):
                    handle_failed_pairing(i, skip, num_paired, sent, sent_paired, to_remove)
                    skip += num_paired - 1
                    continue
            
            # trigrams: noun/adj, all types, noun/adj
            elif num_paired == 3:
                pos_word_1 = sent_pos[i + skip]
                pos_word_2 = sent_pos[i + skip + 1]
                pos_word_3 = sent_pos[i + skip + 2]
                cond = (pos_word_1 in ('NOUN', 'ADJ'), pos_word_3 in ('NOUN', 'ADJ'))
                if not all(cond):
                    handle_failed_pairing(i, skip, num_paired, sent, sent_paired, to_remove)
                    skip += num_paired - 1
                    continue
        
            # num. of words to skip indexing over sent and sent_pos in the next iter
            skip += num_paired - 1
        
    # remove rejected pairs that are already split and added back individually
    if len(to_remove) > 0:
        for j in sorted(to_remove, reverse=True):
            del sent_paired[j]

**Test the filtering function:**

Test 1:

In [104]:
sent = ['liver', 'support', 'supports', 'liver', 'function', 'stimulate', 'des', 'intoxication', 'and', 'restore', 'liver', 'function', 'eliminate', 'harmful', 'metabolite']
print(sent)

['liver', 'support', 'supports', 'liver', 'function', 'stimulate', 'des', 'intoxication', 'and', 'restore', 'liver', 'function', 'eliminate', 'harmful', 'metabolite']


In [105]:
sent_pos = ['PROPN', 'PROPN', 'PROPN', 'NOUN', 'NOUN', 'VERB', 'X', 'NOUN', 'CCONJ', 'VERB', 'NOUN', 'NOUN', 'VERB', 'ADJ', 'NOUN']
print(sent_pos)

['PROPN', 'PROPN', 'PROPN', 'NOUN', 'NOUN', 'VERB', 'X', 'NOUN', 'CCONJ', 'VERB', 'NOUN', 'NOUN', 'VERB', 'ADJ', 'NOUN']


In [106]:
sent_paired = ['liver', 'support', 'supports', 'liver_function', 'stimulate_des_intoxication_and_restore', 'liver_function', 'eliminate', 'harmful', 'metabolite']
print(sent_paired)

['liver', 'support', 'supports', 'liver_function', 'stimulate_des_intoxication_and_restore', 'liver_function', 'eliminate', 'harmful', 'metabolite']


In [107]:
filter_pairs(sent, sent_paired, sent_pos)

In [108]:
# Expected output:
print(['liver', 'support', 'supports', 'liver_function', 'liver_function', 'eliminate', 'harmful', 'metabolite', 'stimulate', 'des', 'intoxication', 'and', 'restore'])

['liver', 'support', 'supports', 'liver_function', 'liver_function', 'eliminate', 'harmful', 'metabolite', 'stimulate', 'des', 'intoxication', 'and', 'restore']


In [109]:
print(sent_paired)

['liver', 'support', 'supports', 'liver_function', 'liver_function', 'eliminate', 'harmful', 'metabolite', 'stimulate', 'des', 'intoxication', 'and', 'restore']


Test 2:

In [110]:
sent = ['-PRON-', 'have', 'a', 'lot', 'more', 'energy', 'and', 'have', 'not', 'be', 'sick', 'at', 'all']
print(sent)

['-PRON-', 'have', 'a', 'lot', 'more', 'energy', 'and', 'have', 'not', 'be', 'sick', 'at', 'all']


In [111]:
sent_pos = ['PRON', 'VERB', 'DET', 'NOUN', 'ADJ', 'NOUN', 'CCONJ', 'VERB', 'ADV', 'VERB', 'ADJ', 'ADV', 'ADV']
print(sent_pos)

['PRON', 'VERB', 'DET', 'NOUN', 'ADJ', 'NOUN', 'CCONJ', 'VERB', 'ADV', 'VERB', 'ADJ', 'ADV', 'ADV']


In [112]:
sent_paired = ['-PRON-', 'have', 'a_lot', 'more_energy', 'and', 'have', 'not', 'be', 'sick', 'at_all']
print(sent_paired)

['-PRON-', 'have', 'a_lot', 'more_energy', 'and', 'have', 'not', 'be', 'sick', 'at_all']


In [113]:
filter_pairs(sent, sent_paired, sent_pos)

In [114]:
print(sent_paired)

['-PRON-', 'have', 'more_energy', 'and', 'have', 'not', 'be', 'sick', 'a', 'lot', 'at', 'all']


### Filter Phrases

In [115]:
unigram_sents_pos_df.head()

Unnamed: 0,review_number,sentence_number,unigram_sentences,unigram_pos,preprocessed_review,has_paired_words
0,1,1,[bioavailability...,"[PROPN, VERB, DE...",[bioavailability...,0
1,1,2,"[-PRON-, start, ...","[PRON, VERB, VER...","[-PRON-, start, ...",1
2,1,3,"[do, some, resea...","[VERB, DET, NOUN...","[do, some, resea...",1
3,1,4,"[the, ez, melt, ...","[DET, ADP, NOUN,...","[the, ez, melt, ...",0
4,1,5,"[this, product, ...","[DET, NOUN, VERB...","[this_product, h...",1


In [116]:
preprocessed_reviews = unigram_sents_pos_df.preprocessed_review.tolist()
unigram_sentences = unigram_sents_pos_df.unigram_sentences.tolist()
unigram_pos = unigram_sents_pos_df.unigram_pos.tolist()
has_paired_words = unigram_sents_pos_df.has_paired_words.tolist()

In [117]:
# get rid of paired words from the corpus which
# (1) have more than 3 words joined
# (2) bigrams not in the format: noun/adj, noun
# (3) trigrams not in the format: noun/adj, all types, noun/adj
for i in tqdm(range(len(preprocessed_reviews))):
    if has_paired_words[i] == 1:
        filter_pairs(sent=unigram_sentences[i], sent_paired=preprocessed_reviews[i], sent_pos=unigram_pos[i])

100%|██████████| 1241826/1241826 [00:03<00:00, 393305.14it/s]


In [118]:
# save picked dataframe to S3.  Pickle format allows the columns to store lists
save_df_s3(unigram_sents_pos_df, bucket_name, filepath='amazon_reviews/kk/preprocessed_reviews_v2.pkl', filetype='pickle')

In [5]:
# load from the pickled dataframe on S3
unigram_sents_pos_df = load_df_s3(bucket_name, filepath='amazon_reviews/kk/preprocessed_reviews_v2.pkl', filetype='pickle')

In [119]:
unigram_sents_pos_df.head()

Unnamed: 0,review_number,sentence_number,unigram_sentences,unigram_pos,preprocessed_review,has_paired_words
0,1,1,[bioavailability...,"[PROPN, VERB, DE...",[bioavailability...,0
1,1,2,"[-PRON-, start, ...","[PRON, VERB, VER...","[-PRON-, start, ...",1
2,1,3,"[do, some, resea...","[VERB, DET, NOUN...","[do, some, resea...",1
3,1,4,"[the, ez, melt, ...","[DET, ADP, NOUN,...","[the, ez, melt, ...",0
4,1,5,"[this, product, ...","[DET, NOUN, VERB...","[have, the, grea...",1


In [120]:
unigram_sents_pos_df.shape

(1241826, 6)

In [121]:
preprocessed_review_updated = unigram_sents_pos_df.preprocessed_review.tolist()

In [122]:
len(preprocessed_review_updated)

1241826

In [123]:
preprocessed_review_updated[:3]

[['bioavailability', 'be', 'the', 'key'],
 ['-PRON-',
  'start',
  'take',
  'this',
  'after',
  'both',
  '-PRON-',
  'parent',
  'die',
  'of',
  'cancer',
  'as',
  '-PRON-',
  'suppose',
  'to',
  'enhance',
  '-PRON-',
  'immune_system',
  'the',
  'story',
  'on',
  'on',
  'resveratrol',
  'be',
  '60',
  'minutes',
  'incredibly',
  'inspiring'],
 ['do',
  'some',
  'research',
  'on',
  'the',
  'internet',
  '-PRON-',
  'be',
  'indicate',
  'that',
  'take',
  'resveratrol',
  'in',
  'lozenge',
  'form',
  'be',
  'preferable',
  'as',
  '-PRON-',
  'be',
  'by',
  'stomach',
  'acid',
  'break',
  'down']]

In [124]:
gramlist_updated = [word for sent in preprocessed_review_updated for word in sent if '_' in word]

In [125]:
paired_words_frq_updated = Counter(gramlist_updated)
paired_words_frq_updated.most_common(100)

[('fish_oil', 10809),
 ('side_effect', 4969),
 ('high_quality', 3908),
 ('vitamin_d', 3852),
 ('immune_system', 3374),
 ('long_time', 3112),
 ('vitamin_c', 2660),
 ('blood_pressure', 2457),
 ('energy_level', 2408),
 ('anyone_who', 2207),
 ('joint_pain', 1932),
 ('weight_loss', 1891),
 ('hot_flash', 1758),
 ('health_benefit', 1577),
 ('blood_sugar', 1465),
 ('digestive_system', 1354),
 ('krill_oil', 1312),
 ('people_who', 1227),
 ('vitamin_e', 1203),
 ('customer_service', 1112),
 ('long_term', 1076),
 ('acid_reflux', 1044),
 ('health_food_store', 1038),
 ('little_bit', 1038),
 ('fast_shipping', 1035),
 ('blood_test', 956),
 ('big_difference', 951),
 ('huge_difference', 950),
 ('reasonable_price', 950),
 ('coconut_oil', 928),
 ('second_bottle', 908),
 ('small_amount', 867),
 ('multi_vitamin', 808),
 ('green_tea', 807),
 ('daily_basis', 788),
 ('sore_throat', 785),
 ('local_health', 784),
 ('digestive_issue', 781),
 ('food_store', 763),
 ('soft_gel', 753),
 ('love_it!.', 725),
 ('leg_cram

In [127]:
len(paired_words_frq_updated)   # final number of cleaned-up paired words in the specified phrase format

27137

#### Final Clean-up: Remove Stop Words

In [128]:
unigram_sents_pos_df.head()

Unnamed: 0,review_number,sentence_number,unigram_sentences,unigram_pos,preprocessed_review,has_paired_words
0,1,1,[bioavailability...,"[PROPN, VERB, DE...",[bioavailability...,0
1,1,2,"[-PRON-, start, ...","[PRON, VERB, VER...","[-PRON-, start, ...",1
2,1,3,"[do, some, resea...","[VERB, DET, NOUN...","[do, some, resea...",1
3,1,4,"[the, ez, melt, ...","[DET, ADP, NOUN,...","[the, ez, melt, ...",0
4,1,5,"[this, product, ...","[DET, NOUN, VERB...","[have, the, grea...",1


In [129]:
unigram_sents_pos_df.shape

(1241826, 6)

In [130]:
preprocessed_review_final = [[word for word in sentence if word not in nlp.Defaults.stop_words] for sentence in preprocessed_review_updated]

In [131]:
unigram_sents_pos_df.drop(['preprocessed_review'], axis=1, inplace=True)
unigram_sents_pos_df['preprocessed_review'] = preprocessed_review_final
unigram_sents_pos_df.head()

Unnamed: 0,review_number,sentence_number,unigram_sentences,unigram_pos,has_paired_words,preprocessed_review
0,1,1,[bioavailability...,"[PROPN, VERB, DE...",0,[bioavailability...
1,1,2,"[-PRON-, start, ...","[PRON, VERB, VER...",1,"[-PRON-, start, ..."
2,1,3,"[do, some, resea...","[VERB, DET, NOUN...",1,"[research, inter..."
3,1,4,"[the, ez, melt, ...","[DET, ADP, NOUN,...",0,"[ez, melt, formu..."
4,1,5,"[this, product, ...","[DET, NOUN, VERB...",1,"[great, side_eff..."


In [132]:
# save picked dataframe to S3.  Pickle format allows the columns to store lists
save_df_s3(unigram_sents_pos_df, bucket_name, filepath='amazon_reviews/kk/preprocessed_reviews_v3.pkl', filetype='pickle')

In [6]:
# load from the pickled dataframe on S3
unigram_sents_pos_df = load_df_s3(bucket_name, filepath='amazon_reviews/preprocessed_reviews_v3.pkl', filetype='pickle')

# Training the LDA Model

In [7]:
unigram_sents_pos_df.head(10)

Unnamed: 0,review_number,sentence_number,unigram_sentences,unigram_pos,has_paired_words,preprocessed_review
0,1,1,"[dpe, the, job, well]","[NOUN, DET, NOUN, ADV]",0,"[dpe, job]"
1,1,2,"[b, flax, d, be, a, regular, at, -PRON-, house]","[NOUN, NOUN, NOUN, VERB, DET, ADJ, ADP, ADJ, N...",0,"[b, flax, d, regular, -PRON-, house]"
2,1,3,"[-PRON-, do, -PRON-, job, simply, and, with, g...","[PRON, VERB, ADJ, NOUN, ADV, CCONJ, ADP, ADJ, ...",0,"[-PRON-, -PRON-, job, simply, good, result]"
3,1,4,"[-PRON-, be, reasonable, last, a, long, time, ...","[PRON, VERB, ADJ, VERB, DET, ADJ, NOUN, CCONJ,...",1,"[-PRON-, reasonable, long, time, able, obtain,..."
4,1,5,"[good, product, good, price, good, result]","[ADJ, NOUN, ADJ, NOUN, ADJ, NOUN]",0,"[good, product, good, price, good, result]"
5,2,6,"[fast, shipping, good, communication]","[ADJ, NOUN, ADJ, NOUN]",1,"[fast_shipping, good, communication]"
6,2,7,"[study, show, that, resveratrol, be, poorly, a...","[NOUN, VERB, ADP, PROPN, VERB, ADV, VERB, ADV,...",0,"[study, resveratrol, poorly, absorb, pill, loz..."
7,2,8,"[hardly, any, company, be, sell, lozenge]","[ADV, DET, NOUN, VERB, VERB, NOUN]",0,"[hardly, company, sell, lozenge]"
8,2,9,"[this, company, promise, 99, purity, and, have...","[DET, NOUN, VERB, NUM, NOUN, CCONJ, VERB, ADJ,...",1,"[company, promise, 99, purity, fast_shipping, ..."
9,2,10,"[-PRON-, can, not, comment, on, the, quality, ...","[PRON, VERB, ADV, VERB, ADP, DET, NOUN, ADP, N...",1,"[-PRON-, comment, quality, product, -PRON-, ch..."


In [8]:
review_num = unigram_sents_pos_df.review_number.tolist()

In [9]:
tokenized_reviews = unigram_sents_pos_df.preprocessed_review.tolist()

In [10]:
print(tokenized_reviews[:3])

[['dpe', 'job'], ['b', 'flax', 'd', 'regular', '-PRON-', 'house'], ['-PRON-', '-PRON-', 'job', 'simply', 'good', 'result']]


In [11]:
tokenized_reviews = unigram_sents_pos_df.groupby(('review_number'))['preprocessed_review'].sum()

In [12]:
tokenized_reviews = tokenized_reviews.tolist()

In [13]:
len(tokenized_reviews)

584828

In [14]:
print(tokenized_reviews[0])

['dpe', 'job', 'b', 'flax', 'd', 'regular', '-PRON-', 'house', '-PRON-', '-PRON-', 'job', 'simply', 'good', 'result', '-PRON-', 'reasonable', 'long', 'time', 'able', 'obtain', 'free_shipping', '-PRON-', 'hunt', 'good', 'product', 'good', 'price', 'good', 'result']


In [148]:
save_df_s3(tokenized_reviews, bucket_name, filepath='amazon_reviews/kk/tokenized_reviews_v1.pkl', filetype='pickle')

In [15]:
%%time
# we need to learn the full vocabulary of the corpus to be modeled
# learn the dictionary by iterating over all of the reviews
vocab_dictionary = Dictionary(tokenized_reviews)

INFO : adding document #0 to Dictionary(0 unique tokens: [])
INFO : adding document #10000 to Dictionary(17822 unique tokens: ['-PRON-', 'able', 'b', 'd', 'dpe']...)
INFO : adding document #20000 to Dictionary(30475 unique tokens: ['-PRON-', 'able', 'b', 'd', 'dpe']...)
INFO : adding document #30000 to Dictionary(39820 unique tokens: ['-PRON-', 'able', 'b', 'd', 'dpe']...)
INFO : adding document #40000 to Dictionary(47546 unique tokens: ['-PRON-', 'able', 'b', 'd', 'dpe']...)
INFO : adding document #50000 to Dictionary(54542 unique tokens: ['-PRON-', 'able', 'b', 'd', 'dpe']...)
INFO : adding document #60000 to Dictionary(62429 unique tokens: ['-PRON-', 'able', 'b', 'd', 'dpe']...)
INFO : adding document #70000 to Dictionary(68571 unique tokens: ['-PRON-', 'able', 'b', 'd', 'dpe']...)
INFO : adding document #80000 to Dictionary(75729 unique tokens: ['-PRON-', 'able', 'b', 'd', 'dpe']...)
INFO : adding document #90000 to Dictionary(82207 unique tokens: ['-PRON-', 'able', 'b', 'd', 'dpe'

CPU times: user 23 s, sys: 768 ms, total: 23.8 s
Wall time: 23.7 s


In [10]:
# filter tokens that are very rare or too common from
# the dictionary (filter_extremes) and reassign integer ids (compactify)
# vocab_dictionary.filter_extremes(no_below=1000, no_above=0.6)
# vocab_dictionary.compactify()   # remove gaps in id sequence after words that were removed

INFO : discarding 291004 tokens: [('dpe', 1), ('-PRON-', 2487546), ('hunt', 239), ('obtain', 980), ('communication', 262), ('lozenge', 785), ('poorly', 616), ('99', 534), ('chemist', 209), ('legitimate', 307)]...
INFO : keeping 1934 tokens which were in no less than 1000 and no more than 2159571 (=60.0%) documents
DEBUG : rebuilding dictionary, shrinking gaps
INFO : resulting dictionary: Dictionary(1934 unique tokens: ['job', 'b', 'd', 'flax', 'house']...)
DEBUG : rebuilding dictionary, shrinking gaps


In [16]:
print(vocab_dictionary)

Dictionary(292938 unique tokens: ['-PRON-', 'able', 'b', 'd', 'dpe']...)


In [146]:
save_df_s3(vocab_dictionary, bucket_name, filepath='amazon_reviews/kk/vocab_dictionary_v1.dict', filetype='pickle')

In [182]:
# vocab_dictionary = Dictionary.load('../vocab_dictionary.dict')  # load the finished dictionary from disk

In [17]:
# bag-of-words representation of the corpus/ doc-term matrix
bow_corpus = [vocab_dictionary.doc2bow(review) for review in tokenized_reviews]

In [18]:
%%time

# Set training parameters.
# num_topics_list = np.arange(3, 21, 1)
num_topics = 10
chunksize = 10000    # number of docs processed at a time
passes = 3
iterations = 400
eval_every = 1


coherenceList_umass = []

# We set alpha = 'auto' and eta = 'auto'. Thus, we are automatically learning 
# 2 parameters in the model that we usually would have to specify explicitly.
# for num_topics in tqdm(num_topics_list):
# training the model
lda = LdaMulticore(bow_corpus, num_topics=num_topics, id2word=vocab_dictionary, chunksize=chunksize, 
                   passes=passes, eta='auto', eval_every = eval_every, iterations=iterations)
# performance metric
cm = CoherenceModel(model=lda, corpus=bow_corpus, dictionary=vocab_dictionary, coherence='u_mass')
coherenceList_umass.append(cm.get_coherence())

# visualization
vis = pyLDAvis.gensim.prepare(lda, bow_corpus, vocab_dictionary)
pyLDAvis.save_html(vis, 'pyLDAvis_{num_topics}.html')

INFO : using symmetric alpha at 0.1
INFO : using serial LDA version on this node
INFO : running online LDA training, 10 topics, 3 passes over the supplied corpus of 584828 documents, updating every 350000 documents, evaluating every ~350000 documents, iterating 400x with a convergence threshold of 0.001000
INFO : training LDA model using 35 processes
DEBUG : worker process entering E-step loop
DEBUG : getting a new job
DEBUG : worker process entering E-step loop
DEBUG : getting a new job
DEBUG : worker process entering E-step loop
DEBUG : getting a new job
DEBUG : worker process entering E-step loop
DEBUG : getting a new job
DEBUG : worker process entering E-step loop
DEBUG : getting a new job
DEBUG : worker process entering E-step loop
DEBUG : getting a new job
DEBUG : worker process entering E-step loop
DEBUG : getting a new job
DEBUG : worker process entering E-step loop
DEBUG : getting a new job
DEBUG : worker process entering E-step loop
DEBUG : getting a new job
DEBUG : worker pr

INFO : PROGRESS: pass 0, dispatched chunk #36 = documents up to #370000/584828, outstanding queue size 37
INFO : PROGRESS: pass 0, dispatched chunk #37 = documents up to #380000/584828, outstanding queue size 38
DEBUG : processing chunk #14 of 10000 documents
DEBUG : performing inference on a chunk of 10000 documents
INFO : PROGRESS: pass 0, dispatched chunk #38 = documents up to #390000/584828, outstanding queue size 39
INFO : PROGRESS: pass 0, dispatched chunk #39 = documents up to #400000/584828, outstanding queue size 40
INFO : PROGRESS: pass 0, dispatched chunk #40 = documents up to #410000/584828, outstanding queue size 41
DEBUG : processing chunk #15 of 10000 documents
DEBUG : performing inference on a chunk of 10000 documents
INFO : PROGRESS: pass 0, dispatched chunk #41 = documents up to #420000/584828, outstanding queue size 42
INFO : PROGRESS: pass 0, dispatched chunk #42 = documents up to #430000/584828, outstanding queue size 43
INFO : PROGRESS: pass 0, dispatched chunk #4

DEBUG : performing inference on a chunk of 10000 documents
DEBUG : 9724/10000 documents converged within 400 iterations
DEBUG : processed chunk, queuing the result
DEBUG : result put
DEBUG : getting a new job
DEBUG : 9766/10000 documents converged within 400 iterations
DEBUG : processed chunk, queuing the result
DEBUG : performing inference on a chunk of 10000 documents
DEBUG : result put
DEBUG : getting a new job
DEBUG : processing chunk #49 of 10000 documents
DEBUG : processing chunk #50 of 10000 documents
DEBUG : processed chunk, queuing the result
DEBUG : performing inference on a chunk of 10000 documents
DEBUG : 9786/10000 documents converged within 400 iterations
DEBUG : result put
DEBUG : getting a new job
DEBUG : processing chunk #51 of 10000 documents
DEBUG : performing inference on a chunk of 10000 documents
DEBUG : 9757/10000 documents converged within 400 iterations
DEBUG : processed chunk, queuing the result
DEBUG : 9792/10000 documents converged within 400 iterations
DEBU

DEBUG : processed chunk, queuing the result
DEBUG : result put
DEBUG : getting a new job
DEBUG : 9782/10000 documents converged within 400 iterations
DEBUG : processed chunk, queuing the result
DEBUG : result put
DEBUG : getting a new job
DEBUG : 9746/10000 documents converged within 400 iterations
DEBUG : processed chunk, queuing the result
DEBUG : result put
DEBUG : getting a new job
DEBUG : 9758/10000 documents converged within 400 iterations
DEBUG : processed chunk, queuing the result
DEBUG : result put
DEBUG : getting a new job
DEBUG : 9808/10000 documents converged within 400 iterations
DEBUG : processed chunk, queuing the result
DEBUG : result put
DEBUG : getting a new job
DEBUG : 9720/10000 documents converged within 400 iterations
DEBUG : processed chunk, queuing the result
DEBUG : result put
DEBUG : getting a new job
DEBUG : updating topics
INFO : merging changes from 234828 documents into a model of 584828 documents
INFO : topic #6 (0.100): 0.134*"-PRON-" + 0.015*"great" + 0

DEBUG : performing inference on a chunk of 10000 documents
INFO : PROGRESS: pass 1, dispatched chunk #51 = documents up to #520000/584828, outstanding queue size 52
INFO : PROGRESS: pass 1, dispatched chunk #52 = documents up to #530000/584828, outstanding queue size 53
INFO : PROGRESS: pass 1, dispatched chunk #53 = documents up to #540000/584828, outstanding queue size 54
INFO : PROGRESS: pass 1, dispatched chunk #54 = documents up to #550000/584828, outstanding queue size 55
INFO : PROGRESS: pass 1, dispatched chunk #55 = documents up to #560000/584828, outstanding queue size 56
INFO : PROGRESS: pass 1, dispatched chunk #56 = documents up to #570000/584828, outstanding queue size 57
INFO : PROGRESS: pass 1, dispatched chunk #57 = documents up to #580000/584828, outstanding queue size 58
INFO : PROGRESS: pass 1, dispatched chunk #58 = documents up to #584828/584828, outstanding queue size 59
DEBUG : processing chunk #8 of 10000 documents
DEBUG : performing inference on a chunk of 100

DEBUG : processing chunk #45 of 10000 documents
DEBUG : 9947/10000 documents converged within 400 iterations
DEBUG : getting a new job
DEBUG : processed chunk, queuing the result
DEBUG : result put
DEBUG : getting a new job
DEBUG : 9948/10000 documents converged within 400 iterations
DEBUG : processed chunk, queuing the result
DEBUG : processing chunk #46 of 10000 documents
DEBUG : performing inference on a chunk of 10000 documents
DEBUG : result put
DEBUG : 9958/10000 documents converged within 400 iterations
DEBUG : performing inference on a chunk of 10000 documents
DEBUG : processed chunk, queuing the result
DEBUG : result put
DEBUG : getting a new job
DEBUG : processing chunk #47 of 10000 documents
DEBUG : 9950/10000 documents converged within 400 iterations
DEBUG : performing inference on a chunk of 10000 documents
DEBUG : processed chunk, queuing the result
DEBUG : result put
DEBUG : getting a new job
DEBUG : processing chunk #48 of 10000 documents
DEBUG : processing chunk #49 of

INFO : topic #8 (0.100): 0.211*"-PRON-" + 0.025*"product" + 0.016*"good" + 0.013*"use" + 0.012*"like" + 0.008*"great" + 0.008*"day" + 0.007*"help" + 0.007*"work" + 0.007*"recommend"
INFO : topic #6 (0.100): 0.128*"-PRON-" + 0.015*"great" + 0.014*"product" + 0.013*"good" + 0.010*"day" + 0.010*"use" + 0.010*"energy" + 0.009*"body" + 0.008*"supplement" + 0.007*"help"
INFO : topic #1 (0.100): 0.106*"-PRON-" + 0.030*"product" + 0.017*"help" + 0.015*"good" + 0.009*"use" + 0.007*"excellent" + 0.006*"work" + 0.005*"day" + 0.005*"taste" + 0.005*"recommend"
INFO : topic #4 (0.100): 0.061*"-PRON-" + 0.043*"product" + 0.019*"great" + 0.018*"good" + 0.015*"use" + 0.012*"work" + 0.010*"recommend" + 0.008*"vitamin" + 0.008*"star" + 0.007*"price"
INFO : topic diff=0.267372, rho=0.128583
DEBUG : bound: at document #0
INFO : -6.303 per-word bound, 78.9 perplexity estimate based on a held-out corpus of 4828 documents with 193875 words
INFO : PROGRESS: pass 2, dispatched chunk #0 = documents up to #10000/

DEBUG : processing chunk #10 of 10000 documents
DEBUG : performing inference on a chunk of 10000 documents
DEBUG : processing chunk #11 of 10000 documents
DEBUG : performing inference on a chunk of 10000 documents
DEBUG : processing chunk #12 of 10000 documents
DEBUG : performing inference on a chunk of 10000 documents
DEBUG : processing chunk #13 of 10000 documents
DEBUG : performing inference on a chunk of 10000 documents
DEBUG : processing chunk #14 of 10000 documents
DEBUG : performing inference on a chunk of 10000 documents
DEBUG : processing chunk #15 of 10000 documents
DEBUG : performing inference on a chunk of 10000 documents
DEBUG : processing chunk #16 of 10000 documents
DEBUG : performing inference on a chunk of 10000 documents
DEBUG : processing chunk #17 of 10000 documents
DEBUG : performing inference on a chunk of 10000 documents
DEBUG : 9974/10000 documents converged within 400 iterations
DEBUG : processed chunk, queuing the result
DEBUG : result put
DEBUG : getting a ne

DEBUG : 9961/10000 documents converged within 400 iterations
DEBUG : getting a new job
DEBUG : processed chunk, queuing the result
DEBUG : processing chunk #46 of 10000 documents
DEBUG : performing inference on a chunk of 10000 documents
DEBUG : result put
DEBUG : getting a new job
DEBUG : 9964/10000 documents converged within 400 iterations
DEBUG : processed chunk, queuing the result
DEBUG : result put
DEBUG : 9957/10000 documents converged within 400 iterations
DEBUG : performing inference on a chunk of 10000 documents
DEBUG : 9967/10000 documents converged within 400 iterations
DEBUG : processed chunk, queuing the result
DEBUG : processed chunk, queuing the result
DEBUG : result put
DEBUG : getting a new job
DEBUG : result put
DEBUG : getting a new job
DEBUG : processing chunk #47 of 10000 documents
DEBUG : processing chunk #48 of 10000 documents
DEBUG : performing inference on a chunk of 10000 documents
DEBUG : result put
DEBUG : processing chunk #49 of 10000 documents
DEBUG : perf

INFO : CorpusAccumulator accumulated stats from 1000 documents
INFO : CorpusAccumulator accumulated stats from 2000 documents
INFO : CorpusAccumulator accumulated stats from 3000 documents
INFO : CorpusAccumulator accumulated stats from 4000 documents
INFO : CorpusAccumulator accumulated stats from 5000 documents
INFO : CorpusAccumulator accumulated stats from 6000 documents
INFO : CorpusAccumulator accumulated stats from 7000 documents
INFO : CorpusAccumulator accumulated stats from 8000 documents
INFO : CorpusAccumulator accumulated stats from 9000 documents
INFO : CorpusAccumulator accumulated stats from 10000 documents
INFO : CorpusAccumulator accumulated stats from 11000 documents
INFO : CorpusAccumulator accumulated stats from 12000 documents
INFO : CorpusAccumulator accumulated stats from 13000 documents
INFO : CorpusAccumulator accumulated stats from 14000 documents
INFO : CorpusAccumulator accumulated stats from 15000 documents
INFO : CorpusAccumulator accumulated stats from 1

INFO : CorpusAccumulator accumulated stats from 129000 documents
INFO : CorpusAccumulator accumulated stats from 130000 documents
INFO : CorpusAccumulator accumulated stats from 131000 documents
INFO : CorpusAccumulator accumulated stats from 132000 documents
INFO : CorpusAccumulator accumulated stats from 133000 documents
INFO : CorpusAccumulator accumulated stats from 134000 documents
INFO : CorpusAccumulator accumulated stats from 135000 documents
INFO : CorpusAccumulator accumulated stats from 136000 documents
INFO : CorpusAccumulator accumulated stats from 137000 documents
INFO : CorpusAccumulator accumulated stats from 138000 documents
INFO : CorpusAccumulator accumulated stats from 139000 documents
INFO : CorpusAccumulator accumulated stats from 140000 documents
INFO : CorpusAccumulator accumulated stats from 141000 documents
INFO : CorpusAccumulator accumulated stats from 142000 documents
INFO : CorpusAccumulator accumulated stats from 143000 documents
INFO : CorpusAccumulator 

INFO : CorpusAccumulator accumulated stats from 256000 documents
INFO : CorpusAccumulator accumulated stats from 257000 documents
INFO : CorpusAccumulator accumulated stats from 258000 documents
INFO : CorpusAccumulator accumulated stats from 259000 documents
INFO : CorpusAccumulator accumulated stats from 260000 documents
INFO : CorpusAccumulator accumulated stats from 261000 documents
INFO : CorpusAccumulator accumulated stats from 262000 documents
INFO : CorpusAccumulator accumulated stats from 263000 documents
INFO : CorpusAccumulator accumulated stats from 264000 documents
INFO : CorpusAccumulator accumulated stats from 265000 documents
INFO : CorpusAccumulator accumulated stats from 266000 documents
INFO : CorpusAccumulator accumulated stats from 267000 documents
INFO : CorpusAccumulator accumulated stats from 268000 documents
INFO : CorpusAccumulator accumulated stats from 269000 documents
INFO : CorpusAccumulator accumulated stats from 270000 documents
INFO : CorpusAccumulator 

INFO : CorpusAccumulator accumulated stats from 383000 documents
INFO : CorpusAccumulator accumulated stats from 384000 documents
INFO : CorpusAccumulator accumulated stats from 385000 documents
INFO : CorpusAccumulator accumulated stats from 386000 documents
INFO : CorpusAccumulator accumulated stats from 387000 documents
INFO : CorpusAccumulator accumulated stats from 388000 documents
INFO : CorpusAccumulator accumulated stats from 389000 documents
INFO : CorpusAccumulator accumulated stats from 390000 documents
INFO : CorpusAccumulator accumulated stats from 391000 documents
INFO : CorpusAccumulator accumulated stats from 392000 documents
INFO : CorpusAccumulator accumulated stats from 393000 documents
INFO : CorpusAccumulator accumulated stats from 394000 documents
INFO : CorpusAccumulator accumulated stats from 395000 documents
INFO : CorpusAccumulator accumulated stats from 396000 documents
INFO : CorpusAccumulator accumulated stats from 397000 documents
INFO : CorpusAccumulator 

INFO : CorpusAccumulator accumulated stats from 510000 documents
INFO : CorpusAccumulator accumulated stats from 511000 documents
INFO : CorpusAccumulator accumulated stats from 512000 documents
INFO : CorpusAccumulator accumulated stats from 513000 documents
INFO : CorpusAccumulator accumulated stats from 514000 documents
INFO : CorpusAccumulator accumulated stats from 515000 documents
INFO : CorpusAccumulator accumulated stats from 516000 documents
INFO : CorpusAccumulator accumulated stats from 517000 documents
INFO : CorpusAccumulator accumulated stats from 518000 documents
INFO : CorpusAccumulator accumulated stats from 519000 documents
INFO : CorpusAccumulator accumulated stats from 520000 documents
INFO : CorpusAccumulator accumulated stats from 521000 documents
INFO : CorpusAccumulator accumulated stats from 522000 documents
INFO : CorpusAccumulator accumulated stats from 523000 documents
INFO : CorpusAccumulator accumulated stats from 524000 documents
INFO : CorpusAccumulator 

CPU times: user 8min 32s, sys: 34.4 s, total: 9min 6s
Wall time: 9min 12s


In [None]:
coherenceList_umass

In [19]:
%%time

# Set training parameters.
# num_topics_list = np.arange(3, 21, 1)
num_topics = 15
chunksize = 10000    # number of docs processed at a time
passes = 3
iterations = 400
eval_every = 1


coherenceList_umass = []

# We set alpha = 'auto' and eta = 'auto'. Thus, we are automatically learning 
# 2 parameters in the model that we usually would have to specify explicitly.
# for num_topics in tqdm(num_topics_list):
# training the model
lda = LdaMulticore(bow_corpus, num_topics=num_topics, id2word=vocab_dictionary, chunksize=chunksize, 
                   passes=passes, eta='auto', eval_every = eval_every, iterations=iterations)
# performance metric
cm = CoherenceModel(model=lda, corpus=bow_corpus, dictionary=vocab_dictionary, coherence='u_mass')
coherenceList_umass.append(cm.get_coherence())

# visualization
vis = pyLDAvis.gensim.prepare(lda, bow_corpus, vocab_dictionary)
pyLDAvis.save_html(vis, 'pyLDAvis_{num_topics}.html')

INFO : using symmetric alpha at 0.06666666666666667
INFO : using serial LDA version on this node
INFO : running online LDA training, 15 topics, 3 passes over the supplied corpus of 584828 documents, updating every 350000 documents, evaluating every ~350000 documents, iterating 400x with a convergence threshold of 0.001000
INFO : training LDA model using 35 processes
DEBUG : worker process entering E-step loop
DEBUG : getting a new job
DEBUG : worker process entering E-step loop
DEBUG : getting a new job
DEBUG : worker process entering E-step loop
DEBUG : getting a new job
DEBUG : worker process entering E-step loop
DEBUG : getting a new job
DEBUG : getting a new job
DEBUG : worker process entering E-step loop
DEBUG : worker process entering E-step loop
DEBUG : getting a new job
DEBUG : worker process entering E-step loop
DEBUG : worker process entering E-step loop
DEBUG : getting a new job
DEBUG : getting a new job
DEBUG : worker process entering E-step loop
DEBUG : getting a new job
D

DEBUG : processing chunk #13 of 10000 documents
DEBUG : performing inference on a chunk of 10000 documents
INFO : PROGRESS: pass 0, dispatched chunk #37 = documents up to #380000/584828, outstanding queue size 38
INFO : PROGRESS: pass 0, dispatched chunk #38 = documents up to #390000/584828, outstanding queue size 39
INFO : PROGRESS: pass 0, dispatched chunk #39 = documents up to #400000/584828, outstanding queue size 40
DEBUG : processing chunk #14 of 10000 documents
DEBUG : performing inference on a chunk of 10000 documents
INFO : PROGRESS: pass 0, dispatched chunk #40 = documents up to #410000/584828, outstanding queue size 41
INFO : PROGRESS: pass 0, dispatched chunk #41 = documents up to #420000/584828, outstanding queue size 42
INFO : PROGRESS: pass 0, dispatched chunk #42 = documents up to #430000/584828, outstanding queue size 43
DEBUG : processing chunk #15 of 10000 documents
DEBUG : performing inference on a chunk of 10000 documents
INFO : PROGRESS: pass 0, dispatched chunk #

DEBUG : processed chunk, queuing the result
DEBUG : result put
DEBUG : getting a new job
DEBUG : result put
DEBUG : getting a new job
DEBUG : processing chunk #48 of 10000 documents
DEBUG : performing inference on a chunk of 10000 documents
DEBUG : 9700/10000 documents converged within 400 iterations
DEBUG : getting a new job
DEBUG : processed chunk, queuing the result
DEBUG : 9705/10000 documents converged within 400 iterations
DEBUG : processed chunk, queuing the result
DEBUG : result put
DEBUG : getting a new job
DEBUG : result put
DEBUG : getting a new job
DEBUG : processing chunk #49 of 10000 documents
DEBUG : performing inference on a chunk of 10000 documents
DEBUG : 9699/10000 documents converged within 400 iterations
DEBUG : processed chunk, queuing the result
DEBUG : result put
DEBUG : processing chunk #50 of 10000 documents
DEBUG : performing inference on a chunk of 10000 documents
DEBUG : 9639/10000 documents converged within 400 iterations
DEBUG : processed chunk, queuing t

DEBUG : getting a new job
DEBUG : 9752/10000 documents converged within 400 iterations
DEBUG : 9677/10000 documents converged within 400 iterations
DEBUG : processed chunk, queuing the result
DEBUG : processed chunk, queuing the result
DEBUG : result put
DEBUG : getting a new job
DEBUG : result put
DEBUG : getting a new job
DEBUG : 9728/10000 documents converged within 400 iterations
DEBUG : processed chunk, queuing the result
DEBUG : 9748/10000 documents converged within 400 iterations
DEBUG : processed chunk, queuing the result
DEBUG : result put
DEBUG : getting a new job
DEBUG : result put
DEBUG : getting a new job
DEBUG : 9681/10000 documents converged within 400 iterations
DEBUG : processed chunk, queuing the result
DEBUG : 9728/10000 documents converged within 400 iterations
DEBUG : result put
DEBUG : getting a new job
DEBUG : processed chunk, queuing the result
DEBUG : result put
DEBUG : getting a new job
DEBUG : updating topics
INFO : merging changes from 234828 documents into 

INFO : PROGRESS: pass 1, dispatched chunk #52 = documents up to #530000/584828, outstanding queue size 53
INFO : PROGRESS: pass 1, dispatched chunk #53 = documents up to #540000/584828, outstanding queue size 54
INFO : PROGRESS: pass 1, dispatched chunk #54 = documents up to #550000/584828, outstanding queue size 55
INFO : PROGRESS: pass 1, dispatched chunk #55 = documents up to #560000/584828, outstanding queue size 56
INFO : PROGRESS: pass 1, dispatched chunk #56 = documents up to #570000/584828, outstanding queue size 57
INFO : PROGRESS: pass 1, dispatched chunk #57 = documents up to #580000/584828, outstanding queue size 58
DEBUG : processing chunk #6 of 10000 documents
DEBUG : performing inference on a chunk of 10000 documents
INFO : PROGRESS: pass 1, dispatched chunk #58 = documents up to #584828/584828, outstanding queue size 59
DEBUG : processing chunk #7 of 10000 documents
DEBUG : performing inference on a chunk of 10000 documents
DEBUG : processing chunk #8 of 10000 documents

DEBUG : performing inference on a chunk of 10000 documents
DEBUG : processing chunk #45 of 10000 documents
DEBUG : 9950/10000 documents converged within 400 iterations
DEBUG : processed chunk, queuing the result
DEBUG : processing chunk #46 of 10000 documents
DEBUG : performing inference on a chunk of 10000 documents
DEBUG : result put
DEBUG : getting a new job
DEBUG : 9955/10000 documents converged within 400 iterations
DEBUG : processed chunk, queuing the result
DEBUG : performing inference on a chunk of 10000 documents
DEBUG : 9933/10000 documents converged within 400 iterations
DEBUG : result put
DEBUG : getting a new job
DEBUG : processed chunk, queuing the result
DEBUG : result put
DEBUG : getting a new job
DEBUG : 9954/10000 documents converged within 400 iterations
DEBUG : processing chunk #47 of 10000 documents
DEBUG : processed chunk, queuing the result
DEBUG : 9951/10000 documents converged within 400 iterations
DEBUG : processed chunk, queuing the result
DEBUG : result put


INFO : topic #11 (0.067): 0.179*"-PRON-" + 0.019*"use" + 0.017*"work" + 0.017*"day" + 0.016*"product" + 0.010*"great" + 0.007*"supplement" + 0.006*"week" + 0.005*"recommend" + 0.005*"start"
INFO : topic #3 (0.067): 0.203*"-PRON-" + 0.025*"good" + 0.024*"product" + 0.013*"use" + 0.012*"great" + 0.011*"work" + 0.009*"feel" + 0.007*"try" + 0.007*"find" + 0.006*"help"
INFO : topic #1 (0.067): 0.143*"-PRON-" + 0.061*"product" + 0.032*"use" + 0.019*"good" + 0.012*"great" + 0.010*"taste" + 0.009*"work" + 0.009*"help" + 0.008*"year" + 0.007*"easy"
INFO : topic #7 (0.067): 0.087*"-PRON-" + 0.025*"use" + 0.023*"good" + 0.022*"product" + 0.012*"price" + 0.009*"work" + 0.008*"great" + 0.006*"month" + 0.005*"try" + 0.005*"brand"
INFO : topic #10 (0.067): 0.227*"-PRON-" + 0.016*"product" + 0.012*"help" + 0.012*"use" + 0.008*"love" + 0.008*"day" + 0.007*"great" + 0.006*"work" + 0.006*"like" + 0.006*"time"
INFO : topic diff=0.275763, rho=0.128583
DEBUG : bound: at document #0
INFO : -6.298 per-word bo

INFO : PROGRESS: pass 2, dispatched chunk #58 = documents up to #584828/584828, outstanding queue size 59
DEBUG : processing chunk #10 of 10000 documents
DEBUG : performing inference on a chunk of 10000 documents
DEBUG : performing inference on a chunk of 10000 documents
DEBUG : performing inference on a chunk of 10000 documents
DEBUG : processing chunk #11 of 10000 documents
DEBUG : performing inference on a chunk of 10000 documents
DEBUG : processing chunk #12 of 10000 documents
DEBUG : processing chunk #13 of 10000 documents
DEBUG : processing chunk #14 of 10000 documents
DEBUG : performing inference on a chunk of 10000 documents
DEBUG : processing chunk #15 of 10000 documents
DEBUG : performing inference on a chunk of 10000 documents
DEBUG : processing chunk #16 of 10000 documents
DEBUG : performing inference on a chunk of 10000 documents
DEBUG : processing chunk #17 of 10000 documents
DEBUG : performing inference on a chunk of 10000 documents
DEBUG : 9967/10000 documents converged

DEBUG : 9963/10000 documents converged within 400 iterations
DEBUG : processed chunk, queuing the result
DEBUG : result put
DEBUG : getting a new job
DEBUG : 9978/10000 documents converged within 400 iterations
DEBUG : result put
DEBUG : getting a new job
DEBUG : processing chunk #47 of 10000 documents
DEBUG : performing inference on a chunk of 10000 documents
DEBUG : processed chunk, queuing the result
DEBUG : result put
DEBUG : getting a new job
DEBUG : 9977/10000 documents converged within 400 iterations
DEBUG : 9971/10000 documents converged within 400 iterations
DEBUG : processed chunk, queuing the result
DEBUG : result put
DEBUG : getting a new job
DEBUG : processing chunk #48 of 10000 documents
DEBUG : performing inference on a chunk of 10000 documents
DEBUG : processed chunk, queuing the result
DEBUG : result put
DEBUG : performing inference on a chunk of 10000 documents
DEBUG : getting a new job
DEBUG : 9977/10000 documents converged within 400 iterations
DEBUG : processed chu

DEBUG : Setting topics to those of the model: LdaModel(num_terms=292938, num_topics=15, decay=0.5, chunksize=10000)
INFO : CorpusAccumulator accumulated stats from 1000 documents
INFO : CorpusAccumulator accumulated stats from 2000 documents
INFO : CorpusAccumulator accumulated stats from 3000 documents
INFO : CorpusAccumulator accumulated stats from 4000 documents
INFO : CorpusAccumulator accumulated stats from 5000 documents
INFO : CorpusAccumulator accumulated stats from 6000 documents
INFO : CorpusAccumulator accumulated stats from 7000 documents
INFO : CorpusAccumulator accumulated stats from 8000 documents
INFO : CorpusAccumulator accumulated stats from 9000 documents
INFO : CorpusAccumulator accumulated stats from 10000 documents
INFO : CorpusAccumulator accumulated stats from 11000 documents
INFO : CorpusAccumulator accumulated stats from 12000 documents
INFO : CorpusAccumulator accumulated stats from 13000 documents
INFO : CorpusAccumulator accumulated stats from 14000 documen

INFO : CorpusAccumulator accumulated stats from 127000 documents
INFO : CorpusAccumulator accumulated stats from 128000 documents
INFO : CorpusAccumulator accumulated stats from 129000 documents
INFO : CorpusAccumulator accumulated stats from 130000 documents
INFO : CorpusAccumulator accumulated stats from 131000 documents
INFO : CorpusAccumulator accumulated stats from 132000 documents
INFO : CorpusAccumulator accumulated stats from 133000 documents
INFO : CorpusAccumulator accumulated stats from 134000 documents
INFO : CorpusAccumulator accumulated stats from 135000 documents
INFO : CorpusAccumulator accumulated stats from 136000 documents
INFO : CorpusAccumulator accumulated stats from 137000 documents
INFO : CorpusAccumulator accumulated stats from 138000 documents
INFO : CorpusAccumulator accumulated stats from 139000 documents
INFO : CorpusAccumulator accumulated stats from 140000 documents
INFO : CorpusAccumulator accumulated stats from 141000 documents
INFO : CorpusAccumulator 

INFO : CorpusAccumulator accumulated stats from 254000 documents
INFO : CorpusAccumulator accumulated stats from 255000 documents
INFO : CorpusAccumulator accumulated stats from 256000 documents
INFO : CorpusAccumulator accumulated stats from 257000 documents
INFO : CorpusAccumulator accumulated stats from 258000 documents
INFO : CorpusAccumulator accumulated stats from 259000 documents
INFO : CorpusAccumulator accumulated stats from 260000 documents
INFO : CorpusAccumulator accumulated stats from 261000 documents
INFO : CorpusAccumulator accumulated stats from 262000 documents
INFO : CorpusAccumulator accumulated stats from 263000 documents
INFO : CorpusAccumulator accumulated stats from 264000 documents
INFO : CorpusAccumulator accumulated stats from 265000 documents
INFO : CorpusAccumulator accumulated stats from 266000 documents
INFO : CorpusAccumulator accumulated stats from 267000 documents
INFO : CorpusAccumulator accumulated stats from 268000 documents
INFO : CorpusAccumulator 

INFO : CorpusAccumulator accumulated stats from 381000 documents
INFO : CorpusAccumulator accumulated stats from 382000 documents
INFO : CorpusAccumulator accumulated stats from 383000 documents
INFO : CorpusAccumulator accumulated stats from 384000 documents
INFO : CorpusAccumulator accumulated stats from 385000 documents
INFO : CorpusAccumulator accumulated stats from 386000 documents
INFO : CorpusAccumulator accumulated stats from 387000 documents
INFO : CorpusAccumulator accumulated stats from 388000 documents
INFO : CorpusAccumulator accumulated stats from 389000 documents
INFO : CorpusAccumulator accumulated stats from 390000 documents
INFO : CorpusAccumulator accumulated stats from 391000 documents
INFO : CorpusAccumulator accumulated stats from 392000 documents
INFO : CorpusAccumulator accumulated stats from 393000 documents
INFO : CorpusAccumulator accumulated stats from 394000 documents
INFO : CorpusAccumulator accumulated stats from 395000 documents
INFO : CorpusAccumulator 

INFO : CorpusAccumulator accumulated stats from 508000 documents
INFO : CorpusAccumulator accumulated stats from 509000 documents
INFO : CorpusAccumulator accumulated stats from 510000 documents
INFO : CorpusAccumulator accumulated stats from 511000 documents
INFO : CorpusAccumulator accumulated stats from 512000 documents
INFO : CorpusAccumulator accumulated stats from 513000 documents
INFO : CorpusAccumulator accumulated stats from 514000 documents
INFO : CorpusAccumulator accumulated stats from 515000 documents
INFO : CorpusAccumulator accumulated stats from 516000 documents
INFO : CorpusAccumulator accumulated stats from 517000 documents
INFO : CorpusAccumulator accumulated stats from 518000 documents
INFO : CorpusAccumulator accumulated stats from 519000 documents
INFO : CorpusAccumulator accumulated stats from 520000 documents
INFO : CorpusAccumulator accumulated stats from 521000 documents
INFO : CorpusAccumulator accumulated stats from 522000 documents
INFO : CorpusAccumulator 

CPU times: user 9min 36s, sys: 43.2 s, total: 10min 19s
Wall time: 10min 4s


In [20]:
coherenceList_umass

[-2.0752244412473058]

In [None]:
# topic coherence plot
plotData = pd.DataFrame({'Number of topics':num_topics_list,
                         'CoherenceScore':coherenceList_umass})
f,ax = plt.subplots(figsize=(10,6))
sns.set_style("darkgrid")
sns.pointplot(x='Number of topics',y= 'CoherenceScore',data=plotData)
plt.axhline(y=-3.9)
plt.title('Topic coherence')
plt.savefig('Topic coherence plot.png')