In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
from collections import Counter
import warnings

import spacy
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore
# import pyLDAvis
# import pyLDAvis.gensim

# import cPickle as pickle

from tqdm._tqdm_notebook import tqdm, tqdm_notebook, tnrange
from S3_read_write import load_df_s3, save_df_s3

from IPython.display import Image
from IPython.core.display import HTML 

In [3]:
tqdm_notebook.pandas('Progress')

In [4]:
bucket_name = 'amazon-reviews-project'

# Load Amazon Reviews Data

In [86]:
reviews = load_df_s3(bucket_name, 'amazon_reviews/reviews_data_clean', filetype='text', sep='|')

In [31]:
reviews.shape    # 585,444 records

(585444, 8)

In [32]:
reviews.head()

Unnamed: 0,asin,helpful,reviewText,overall,summary,description,title,categories_clean
0,929619730,"[0, 0]",B-flax-D is a re...,5.0,Dpes the job well,Contains Organic...,New Generation B...,Health & Persona...
1,978559088,"[1, 1]",Studies show tha...,4.0,"Fast shipping, g...",Everyone knows t...,Nutrihill Resver...,Health & Persona...
2,978559088,"[1, 1]",I started taking...,5.0,Bioavailability ...,Everyone knows t...,Nutrihill Resver...,Health & Persona...
3,978559088,"[0, 1]",I tried Nutrihil...,1.0,Other Resveratro...,Everyone knows t...,Nutrihill Resver...,Health & Persona...
4,978559088,"[0, 0]",I really liked t...,5.0,I can't find thi...,Everyone knows t...,Nutrihill Resver...,Health & Persona...


In [33]:
reviews.dtypes

asin                 object
helpful              object
reviewText           object
overall             float64
summary              object
description          object
title                object
categories_clean     object
dtype: object

## Data Cleaning

In [52]:
reviews.categories_clean.unique()[:10]

array(['Health & Personal Care, Vitamins & Dietary Supplements, Multi & Prenatal Vitamins, Multiple Vitamin-Mineral Supplements',
       'Health & Personal Care, Vitamins & Dietary Supplements, Supplements, Antioxidants, Resveratrol',
       'Health & Personal Care, Vitamins & Dietary Supplements, Multi & Prenatal Vitamins, Multivitamins',
       'Health & Personal Care, Vitamins & Dietary Supplements, Vitamins, Vitamin B, B3 (Niacin)',
       'Health & Personal Care, Vitamins & Dietary Supplements, Herbal Supplements',
       'Health & Personal Care, Vitamins & Dietary Supplements, Herbal Supplements, Green Tea',
       'Health & Personal Care, Vitamins & Dietary Supplements, Weight Loss, Supplements, Green Coffee Bean Extract',
       'Health & Personal Care, Vitamins & Dietary Supplements, Weight Loss, Supplements',
       'Health & Personal Care, Vitamins & Dietary Supplements, Supplements, Antioxidants, CoQ10',
       'Health & Personal Care, Vitamins & Dietary Supplements, Herbal

The catergories' list indicates that there may be some reviews in the dataset unrelated to health supplements.  Let's get rid of these.

In [35]:
reviews[reviews.categories_clean.str.contains('CDs & Vinyl')].title.unique()

array(['Liturgy of St. John Chrysostom', 'Origins',
       'Sounds of the Earth: Soft Ocean Sounds', 'Bali',
       'Tranquil Waters', 'Bach: St. John Passion, BWV 245',
       '21st Century Soul', 'Bodies for Strontium', "John's Bunch",
       'An Evening of Paganini', "John's Other Bunch",
       'Sus Mas Grandes Exitos', 'Complex Simplicity',
       'Kidnapped By Neptune', 'Roman Chant / Easter Vespers', 'Dead 60s',
       "Cilla in the 60's", 'Chromium', 'Letters From the Vitamin Sea',
       'The Stinging Nettles', 'Tendres Annees 60', 'Wehiwehi Hawaii',
       'none'], dtype=object)

In [36]:
len(reviews[reviews.categories_clean.str.contains('CDs & Vinyl')])

263

The product titles shown above are all music albums/songs.

In [37]:
reviews_filt = reviews[~(reviews.categories_clean.str.contains('CDs & Vinyl'))]   # remove rows with category including 'CDs & Vinyl'

In [51]:
reviews_filt.categories_clean.unique()[:10]

array(['Health & Personal Care, Vitamins & Dietary Supplements, Multi & Prenatal Vitamins, Multiple Vitamin-Mineral Supplements',
       'Health & Personal Care, Vitamins & Dietary Supplements, Supplements, Antioxidants, Resveratrol',
       'Health & Personal Care, Vitamins & Dietary Supplements, Multi & Prenatal Vitamins, Multivitamins',
       'Health & Personal Care, Vitamins & Dietary Supplements, Vitamins, Vitamin B, B3 (Niacin)',
       'Health & Personal Care, Vitamins & Dietary Supplements, Herbal Supplements',
       'Health & Personal Care, Vitamins & Dietary Supplements, Herbal Supplements, Green Tea',
       'Health & Personal Care, Vitamins & Dietary Supplements, Weight Loss, Supplements, Green Coffee Bean Extract',
       'Health & Personal Care, Vitamins & Dietary Supplements, Weight Loss, Supplements',
       'Health & Personal Care, Vitamins & Dietary Supplements, Supplements, Antioxidants, CoQ10',
       'Health & Personal Care, Vitamins & Dietary Supplements, Herbal

In [39]:
reviews_filt[reviews_filt.categories_clean.str.contains('Software')]

Unnamed: 0,asin,helpful,reviewText,overall,summary,description,title,categories_clean
3639,B00009QP4Q,"[2, 2]",The company has ...,5.0,lives up to its ...,Alpha Five's QLi...,none,Health & Persona...
50015,B0002TIEQQ,"[0, 0]",I ordered this f...,1.0,waste of money,Self help tutori...,none,Health & Persona...


In [40]:
reviews_filt = reviews_filt[~(reviews_filt.categories_clean.str.contains('Software'))]

In [41]:
len(reviews_filt)

585179

In [53]:
# Get rid of reviews of pet-related products
search_for = [' pet ', ' cat ', ' dog ']
pattern = '|'.join(search_for)
reviews_filt.title.str.contains(pattern, case=False).sum()

277

In [50]:
reviews_filt[reviews_filt.title.str.contains(pattern, case=False)]['title'].values[:10]

array(['Power - Mune Tuna Flavor Pet Herbal Supplement From Vetvittles.com',
       'Power - Mune Tuna Flavor Pet Herbal Supplement From Vetvittles.com',
       'Power - Mune Tuna Flavor Pet Herbal Supplement From Vetvittles.com',
       'AniMed Witch Hazel 86-Percent Multi-Species Pet Supplement',
       'AniMed Witch Hazel 86-Percent Multi-Species Pet Supplement',
       'AniMed Witch Hazel 86-Percent Multi-Species Pet Supplement',
       'AniMed Witch Hazel 86-Percent Multi-Species Pet Supplement',
       'AniMed Witch Hazel 86-Percent Multi-Species Pet Supplement',
       'AniMed Witch Hazel 86-Percent Multi-Species Pet Supplement',
       'Composure Liquid for Dogs and Cat (188 SERVINGS)'], dtype=object)

In [54]:
# Get rid of all pet products
reviews_filt = reviews_filt[~(reviews_filt.title.str.contains(pattern, case=False))]

In [55]:
# saving the cleaned dataframe
save_df_s3(df=reviews_filt, bucket_name=bucket_name, filepath='amazon_reviews/reviews_data_clean_v2.feather')

In [56]:
reviews_filt.asin.nunique()     # 48,535 unique products and 585,179 reviews

48501

## Examine One Observation

In [57]:
example = reviews_filt.iloc[0]

In [58]:
example.asin     # Amazon Standard Identification Number

'0929619730'

In [59]:
example.title     # this is the product's name

'New Generation B-Flax-D'

In [60]:
example.categories_clean   # previously filtered/curated categories of interest

'Health & Personal Care, Vitamins & Dietary Supplements, Multi & Prenatal Vitamins, Multiple Vitamin-Mineral Supplements'

In [61]:
example.description       # product description provided by the seller

'Contains Organic Cold-Milled Flaxseed\nValuable source of soluble and insoluble fiber\nProvides Omega-3 essential fats, and many other nutrients to help achieve and maintain optimal bowel function.\n\nContains Vitamin B12\nB12 helps prevent nerve damage\nB12 aids in healthy cell formation.\nB12 helps prevent anemia\n\nContains Vitamin D\nVitamin D assists the body in the absorption of important minerals like calcium.\n\nContains Seleno-yeast\nA source of selenium, a mineral with powerful anti-viral and disease-fighting properties.\n\nContains Vitamin K2\nMenaQ7TM provides vitamin K2 (menaquinone), extracted and concentrated from natto without solvents. Vitamin K2 prevents arterial calcification and promotes strong bones by improving cross-linking of osteocalcin, a protein found in bones. The amount here has been clinically shown not to interfere with blood anti-coagulant medication. \n\nServing Size:\n1/4 Cup (30 Grams)\n\nServings Per Container:\n30 Servings per container\n\nNet Wt. 

In [62]:
example.summary      # review title

'Dpes the job well'

In [63]:
example.reviewText   # review content

'B-flax-D is a regular at our house. It does its job simply and with good results. It is reasonable, lasts a long time, and is able to be obtained with free shipping if you hunt around. Good product, good price, good results.'

Here's what the actual review looks like:

In [64]:
example.overall     # the rating provided by the reviewer

5.0

In [29]:
example.helpful

'[0, 0]'

In [27]:
Image(url= "images/amazon_review_screenshot.png")

# Data Pre-processing

Let us start off using only the title (`summary`) and body (`reviewText`) of each review.

In [5]:
%%time
df = load_df_s3(bucket_name, filepath='amazon_reviews/reviews_data_clean_v2.feather', filetype='feather')

CPU times: user 3.18 s, sys: 4.05 s, total: 7.23 s
Wall time: 29.9 s


In [6]:
df.dtypes

asin                 object
helpful              object
reviewText           object
overall             float64
summary              object
description          object
title                object
categories_clean     object
dtype: object

In [7]:
df.drop(['helpful', 'overall', 'title', 'categories_clean', 'description'], axis=1, inplace=True)

In [8]:
df.head()

Unnamed: 0,asin,reviewText,summary
0,929619730,B-flax-D is a regular at our house. It does it...,Dpes the job well
1,978559088,Studies show that Resveratrol is poorly absorb...,"Fast shipping, good communication"
2,978559088,I started taking this after both my parents di...,Bioavailability is the key
3,978559088,"I tried Nutrihill, but did not feel any of the...",Other Resveratrol Supplements are Better
4,978559088,I really liked this product because it stayed ...,"I can't find this product any longer, and I wi..."


In [9]:
# for each review, concatenate the review title and body
df.reviewText = df.summary + '. ' + df.reviewText

In [10]:
pd.set_option('max_colwidth', 200)
df.head()

Unnamed: 0,asin,reviewText,summary
0,929619730,"Dpes the job well. B-flax-D is a regular at our house. It does its job simply and with good results. It is reasonable, lasts a long time, and is able to be obtained with free shipping if you hunt ...",Dpes the job well
1,978559088,"Fast shipping, good communication. Studies show that Resveratrol is poorly absorbed when taken by pill, but lozenges are very effectively absorbed. Hardly any companies are selling lozenges. This ...","Fast shipping, good communication"
2,978559088,Bioavailability is the key. I started taking this after both my parents died of cancer as it supposed to enhance your immune system - the story on 60 Minutes on resveratrol was incredibly inspirin...,Bioavailability is the key
3,978559088,"Other Resveratrol Supplements are Better. I tried Nutrihill, but did not feel any of the supposed health benefits. I started reading and realized that even though buccal delivery is the best, the ...",Other Resveratrol Supplements are Better
4,978559088,"I can't find this product any longer, and I wish I could.. I really liked this product because it stayed in my mouth for a long time and I felt it was probably doing some good. I take a number of...","I can't find this product any longer, and I wish I could."


Let's drop the `summary` column now:

In [11]:
df.drop(['summary'], axis=1, inplace=True)

In [12]:
df.head()

Unnamed: 0,asin,reviewText
0,929619730,"Dpes the job well. B-flax-D is a regular at our house. It does its job simply and with good results. It is reasonable, lasts a long time, and is able to be obtained with free shipping if you hunt ..."
1,978559088,"Fast shipping, good communication. Studies show that Resveratrol is poorly absorbed when taken by pill, but lozenges are very effectively absorbed. Hardly any companies are selling lozenges. This ..."
2,978559088,Bioavailability is the key. I started taking this after both my parents died of cancer as it supposed to enhance your immune system - the story on 60 Minutes on resveratrol was incredibly inspirin...
3,978559088,"Other Resveratrol Supplements are Better. I tried Nutrihill, but did not feel any of the supposed health benefits. I started reading and realized that even though buccal delivery is the best, the ..."
4,978559088,"I can't find this product any longer, and I wish I could.. I really liked this product because it stayed in my mouth for a long time and I felt it was probably doing some good. I take a number of..."


In [13]:
pd.set_option('max_colwidth', 20)

## Remove Missing Reviews

In [14]:
df.reviewText.isnull().sum()    # 73 reviews have neither a review body text, nor a review title

73

In [15]:
# drop reviews with no text
df = df[~(df.reviewText.isnull())]

In [16]:
df.asin.isnull().sum()

0

Let's look at a few actual review texts:

In [17]:
df.reviewText.iloc[np.random.randint(0, len(df))]

"Didn't help with hypoplasia. This was recommended to my by my lactation consultant.  She said she had great success with goat's rue for women with hypoplasia.  But it didn't work for me."

In [18]:
df.reviewText.iloc[np.random.randint(0, len(df))]

"Pure Cordyceps Capsules 525 mg. Okay....I have taken cordyceps for about 20+ years...I use this for lung health, as well as stamina support regarding bike rides and training @ the gym.....I am 63 years, and have been running a labour  concern for roughly 20 years,my workers are in their early 20s' and I am moving 28 foot ladders around with the best of them...a few years ago I had an environmental infection in a lung, the cordyceps were on my list for medications....Cordyceps has been my vitamin of choice...especially the Aloha Medicinals product...Once I discovered them, I have been extremely pleased with the quality of this chimaera, and the purity of it is quite noticeable, there are no other ''fillers'' added to the capsules...I have always recieved what I have ordered as well as quite quickly  thru the regular post..when the pachage arrives, there is no real damage to the contents ever.Pure Cordyceps Capsules 525 mg (3 Bottles)..I would highly reccomend this prodect to another us

In [19]:
df.reviewText.iloc[np.random.randint(0, len(df))]

"Rockie says. I've been taking this vitamin for years -- wouldn't be without it -- I would recommend it to everyone that wants to stay healthy."

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 584829 entries, 0 to 584901
Data columns (total 2 columns):
asin          584829 non-null object
reviewText    584829 non-null object
dtypes: object(2)
memory usage: 13.4+ MB


## Phrase Detection

In [21]:
text = list(df.reviewText.values)    # make an iterable to store only the review text

In [22]:
len(text)

584829

In [23]:
# look at a few sample reviews
for rev in text[:4]:
    print(rev, '\n')

Dpes the job well. B-flax-D is a regular at our house. It does its job simply and with good results. It is reasonable, lasts a long time, and is able to be obtained with free shipping if you hunt around. Good product, good price, good results. 

Fast shipping, good communication. Studies show that Resveratrol is poorly absorbed when taken by pill, but lozenges are very effectively absorbed. Hardly any companies are selling lozenges. This company promises 99% purity and has fast shipping and good communication. I can't comment on the quality of product because I'm not a chemist but they seem to be legitimate. 

Bioavailability is the key. I started taking this after both my parents died of cancer as it supposed to enhance your immune system - the story on 60 Minutes on resveratrol was incredibly inspiring. Doing some research on the Internet, it is indicated that taking resveratrol in lozenge form is preferable as it is broken down by stomach acids.  The ez-melt formula recommended in a

In [24]:
nlp = spacy.load('en')

The helper functions below are from:

http://nbviewer.jupyter.org/github/skipgram/modern-nlp-in-python/blob/master/executable/Modern_NLP_in_Python.ipynb

In order to use `gensim`'s `Phrases` class to detect natural combinations of words (like 'vanilla ice cream'), we need to format our text into a list of sentences, with each sentence being a list of words.  This process takes a large amount of processing time (for reference, the times shown under the cells are for running the tasks on a c5.18xlarge EC2 instance (equivalent spot fleet)), so `text` has been split into 3 parts.

### Generate Unigram Sentences

In [25]:
len(text)

584829

In [26]:
# split text into 9 parts
text_first  = text[:50000]
text_second = text[50000:100000]
text_third  = text[100000:150000]
text_fourth = text[150000:300000]
text_fifth  = text[300000:350000]
text_sixth  = text[350000:400000]
text_seventh= text[400000:450000]
text_eighth = text[450000:500000]
text_ninth = text[500000:]

In [30]:
rev_num = 0    # review tracker
sent_num = 0   # sentence tracker
unigram_sents_pos = [] # to store lists of lemmatized tokens for each sentence

for parsed_review in tqdm(nlp.pipe(text_first, batch_size=20000, n_threads=72)):
    rev_num += 1
    for sent in parsed_review.sents:
        sent_num += 1
        # lemmatize tokens & save corresponding pos tag after filtering whitespace and punctuations
        lemmatized_sent = [(token.lemma_, token.pos_) for token in sent if not (token.is_space or token.is_punct)]
        unigram_sents_pos.append([rev_num, sent_num, lemmatized_sent])

print('current rev_num: ', rev_num)
print('current sent_num: ', sent_num)

50000it [08:06, 102.75it/s]

current rev_num:  50000
current sent_num:  305895





In [32]:
len(unigram_sents_pos)

305895

In [33]:
for i in range(5):
    print(unigram_sents_pos[i])

[1, 1, [('dpe', 'NOUN'), ('the', 'DET'), ('job', 'NOUN'), ('well', 'ADV')]]
[1, 2, [('b', 'NOUN'), ('flax', 'NOUN'), ('d', 'NOUN'), ('be', 'VERB'), ('a', 'DET'), ('regular', 'ADJ'), ('at', 'ADP'), ('-PRON-', 'ADJ'), ('house', 'NOUN')]]
[1, 3, [('-PRON-', 'PRON'), ('do', 'VERB'), ('-PRON-', 'ADJ'), ('job', 'NOUN'), ('simply', 'ADV'), ('and', 'CCONJ'), ('with', 'ADP'), ('good', 'ADJ'), ('result', 'NOUN')]]
[1, 4, [('-PRON-', 'PRON'), ('be', 'VERB'), ('reasonable', 'ADJ'), ('last', 'VERB'), ('a', 'DET'), ('long', 'ADJ'), ('time', 'NOUN'), ('and', 'CCONJ'), ('be', 'VERB'), ('able', 'ADJ'), ('to', 'PART'), ('be', 'VERB'), ('obtain', 'VERB'), ('with', 'ADP'), ('free', 'ADJ'), ('shipping', 'NOUN'), ('if', 'ADP'), ('-PRON-', 'PRON'), ('hunt', 'VERB'), ('around', 'ADV')]]
[1, 5, [('good', 'ADJ'), ('product', 'NOUN'), ('good', 'ADJ'), ('price', 'NOUN'), ('good', 'ADJ'), ('result', 'NOUN')]]


In [124]:
# Save progress...
review_number = [row[0] for row in unigram_sents_pos]
sentence_number = [row[1] for row in unigram_sents_pos]
words_joined_all = []
pos_joined_all = []
for sent in unigram_sents_pos:
    word_pos = sent[2]
    word_list = [word for word, pos in word_pos]
    pos_list = [pos for word, pos in word_pos]
    words_joined = ' '.join(word for word in word_list)
    pos_joined   = ' '.join(pos for pos in pos_list)
    words_joined_all.append(words_joined)
    pos_joined_all.append(pos_joined)
    
unigram_sentences_savedf = pd.DataFrame({'review_number': review_number,
                                         'sentence_number': sentence_number,
                                         'unigram_sentences': words_joined_all,
                                         'unigram_pos': pos_joined_all})

save_df_s3(unigram_sentences_savedf, bucket_name, 'amazon_reviews/unigram_sentences.feather')

In [131]:
unigram_sentences_savedf.head()

Unnamed: 0,review_number,sentence_number,unigram_pos,unigram_sentences
0,1,1,NOUN DET NOUN ADV,dpe the job well
1,1,2,NOUN NOUN NOUN V...,b flax d be a re...
2,1,3,PRON VERB ADJ NO...,-PRON- do -PRON-...
3,1,4,PRON VERB ADJ VE...,-PRON- be reason...
4,1,5,ADJ NOUN ADJ NOU...,good product goo...


In [134]:
for parsed_review in tqdm(nlp.pipe(text_second, batch_size=20000, n_threads=72)):
    rev_num += 1
    for sent in parsed_review.sents:
        sent_num += 1
        # lemmatize tokens & save corresponding pos tag after filtering whitespace and punctuations
        lemmatized_sent = [(token.lemma_, token.pos_) for token in sent if not (token.is_space or token.is_punct)]
        unigram_sents_pos.append([rev_num, sent_num, lemmatized_sent])

print('current rev_num: ', rev_num)
print('current sent_num: ', sent_num)

50000it [08:04, 103.18it/s]

current rev_num:  100000
current sent_num:  616751





In [136]:
print(len(unigram_sents_pos))

616751


In [137]:
# Save progress...
review_number = [row[0] for row in unigram_sents_pos]
sentence_number = [row[1] for row in unigram_sents_pos]
words_joined_all = []
pos_joined_all = []
for sent in unigram_sents_pos:
    word_pos = sent[2]
    word_list = [word for word, pos in word_pos]
    pos_list = [pos for word, pos in word_pos]
    words_joined = ' '.join(word for word in word_list)
    pos_joined   = ' '.join(pos for pos in pos_list)
    words_joined_all.append(words_joined)
    pos_joined_all.append(pos_joined)
    
unigram_sentences_savedf = pd.DataFrame({'review_number': review_number,
                                         'sentence_number': sentence_number,
                                         'unigram_sentences': words_joined_all,
                                         'unigram_pos': pos_joined_all})

save_df_s3(unigram_sentences_savedf, bucket_name, 'amazon_reviews/unigram_sentences.feather')

In [138]:
for parsed_review in tqdm(nlp.pipe(text_third, batch_size=20000, n_threads=72)):
    rev_num += 1
    for sent in parsed_review.sents:
        sent_num += 1
        # lemmatize tokens & save corresponding pos tag after filtering whitespace and punctuations
        lemmatized_sent = [(token.lemma_, token.pos_) for token in sent if not (token.is_space or token.is_punct)]
        unigram_sents_pos.append([rev_num, sent_num, lemmatized_sent])

print('current rev_num: ', rev_num)
print('current sent_num: ', sent_num)

50000it [07:55, 105.06it/s]

current rev_num:  150000
current sent_num:  923642





In [139]:
# Save progress...
review_number = [row[0] for row in unigram_sents_pos]
sentence_number = [row[1] for row in unigram_sents_pos]
words_joined_all = []
pos_joined_all = []
for sent in unigram_sents_pos:
    word_pos = sent[2]
    word_list = [word for word, pos in word_pos]
    pos_list = [pos for word, pos in word_pos]
    words_joined = ' '.join(word for word in word_list)
    pos_joined   = ' '.join(pos for pos in pos_list)
    words_joined_all.append(words_joined)
    pos_joined_all.append(pos_joined)
    
unigram_sentences_savedf = pd.DataFrame({'review_number': review_number,
                                         'sentence_number': sentence_number,
                                         'unigram_sentences': words_joined_all,
                                         'unigram_pos': pos_joined_all})

save_df_s3(unigram_sentences_savedf, bucket_name, 'amazon_reviews/unigram_sentences.feather')

In [140]:
for parsed_review in tqdm(nlp.pipe(text_fourth, batch_size=20000, n_threads=72)):
    rev_num += 1
    for sent in parsed_review.sents:
        sent_num += 1
        # lemmatize tokens & save corresponding pos tag after filtering whitespace and punctuations
        lemmatized_sent = [(token.lemma_, token.pos_) for token in sent if not (token.is_space or token.is_punct)]
        unigram_sents_pos.append([rev_num, sent_num, lemmatized_sent])

print('current rev_num: ', rev_num)
print('current sent_num: ', sent_num)

150000it [23:51, 104.82it/s]

current rev_num:  300000
current sent_num:  1843092





In [141]:
# Save progress...
review_number = [row[0] for row in unigram_sents_pos]
sentence_number = [row[1] for row in unigram_sents_pos]
words_joined_all = []
pos_joined_all = []
for sent in unigram_sents_pos:
    word_pos = sent[2]
    word_list = [word for word, pos in word_pos]
    pos_list = [pos for word, pos in word_pos]
    words_joined = ' '.join(word for word in word_list)
    pos_joined   = ' '.join(pos for pos in pos_list)
    words_joined_all.append(words_joined)
    pos_joined_all.append(pos_joined)
    
unigram_sentences_savedf = pd.DataFrame({'review_number': review_number,
                                         'sentence_number': sentence_number,
                                         'unigram_sentences': words_joined_all,
                                         'unigram_pos': pos_joined_all})

save_df_s3(unigram_sentences_savedf, bucket_name, 'amazon_reviews/unigram_sentences.feather')

In [142]:
for parsed_review in tqdm(nlp.pipe(text_fifth, batch_size=20000, n_threads=72)):
    rev_num += 1
    for sent in parsed_review.sents:
        sent_num += 1
        # lemmatize tokens & save corresponding pos tag after filtering whitespace and punctuations
        lemmatized_sent = [(token.lemma_, token.pos_) for token in sent if not (token.is_space or token.is_punct)]
        unigram_sents_pos.append([rev_num, sent_num, lemmatized_sent])

print('current rev_num: ', rev_num)
print('current sent_num: ', sent_num)

50000it [07:43, 107.98it/s]

current rev_num:  350000
current sent_num:  2144424





In [143]:
# Save progress...
review_number = [row[0] for row in unigram_sents_pos]
sentence_number = [row[1] for row in unigram_sents_pos]
words_joined_all = []
pos_joined_all = []
for sent in unigram_sents_pos:
    word_pos = sent[2]
    word_list = [word for word, pos in word_pos]
    pos_list = [pos for word, pos in word_pos]
    words_joined = ' '.join(word for word in word_list)
    pos_joined   = ' '.join(pos for pos in pos_list)
    words_joined_all.append(words_joined)
    pos_joined_all.append(pos_joined)
    
unigram_sentences_savedf = pd.DataFrame({'review_number': review_number,
                                         'sentence_number': sentence_number,
                                         'unigram_sentences': words_joined_all,
                                         'unigram_pos': pos_joined_all})

save_df_s3(unigram_sentences_savedf, bucket_name, 'amazon_reviews/unigram_sentences.feather')

In [144]:
for parsed_review in tqdm(nlp.pipe(text_sixth, batch_size=20000, n_threads=72)):
    rev_num += 1
    for sent in parsed_review.sents:
        sent_num += 1
        # lemmatize tokens & save corresponding pos tag after filtering whitespace and punctuations
        lemmatized_sent = [(token.lemma_, token.pos_) for token in sent if not (token.is_space or token.is_punct)]
        unigram_sents_pos.append([rev_num, sent_num, lemmatized_sent])

print('current rev_num: ', rev_num)
print('current sent_num: ', sent_num)

50000it [07:46, 107.22it/s]

current rev_num:  400000
current sent_num:  2447985





In [145]:
# Save progress...
review_number = [row[0] for row in unigram_sents_pos]
sentence_number = [row[1] for row in unigram_sents_pos]
words_joined_all = []
pos_joined_all = []
for sent in unigram_sents_pos:
    word_pos = sent[2]
    word_list = [word for word, pos in word_pos]
    pos_list = [pos for word, pos in word_pos]
    words_joined = ' '.join(word for word in word_list)
    pos_joined   = ' '.join(pos for pos in pos_list)
    words_joined_all.append(words_joined)
    pos_joined_all.append(pos_joined)
    
unigram_sentences_savedf = pd.DataFrame({'review_number': review_number,
                                         'sentence_number': sentence_number,
                                         'unigram_sentences': words_joined_all,
                                         'unigram_pos': pos_joined_all})

save_df_s3(unigram_sentences_savedf, bucket_name, 'amazon_reviews/unigram_sentences.feather')

In [146]:
for parsed_review in tqdm(nlp.pipe(text_seventh, batch_size=20000, n_threads=72)):
    rev_num += 1
    for sent in parsed_review.sents:
        sent_num += 1
        # lemmatize tokens & save corresponding pos tag after filtering whitespace and punctuations
        lemmatized_sent = [(token.lemma_, token.pos_) for token in sent if not (token.is_space or token.is_punct)]
        unigram_sents_pos.append([rev_num, sent_num, lemmatized_sent])

print('current rev_num: ', rev_num)
print('current sent_num: ', sent_num)

50000it [07:41, 108.43it/s]

current rev_num:  450000
current sent_num:  2754623





In [147]:
# Save progress...
review_number = [row[0] for row in unigram_sents_pos]
sentence_number = [row[1] for row in unigram_sents_pos]
words_joined_all = []
pos_joined_all = []
for sent in unigram_sents_pos:
    word_pos = sent[2]
    word_list = [word for word, pos in word_pos]
    pos_list = [pos for word, pos in word_pos]
    words_joined = ' '.join(word for word in word_list)
    pos_joined   = ' '.join(pos for pos in pos_list)
    words_joined_all.append(words_joined)
    pos_joined_all.append(pos_joined)
    
unigram_sentences_savedf = pd.DataFrame({'review_number': review_number,
                                         'sentence_number': sentence_number,
                                         'unigram_sentences': words_joined_all,
                                         'unigram_pos': pos_joined_all})

save_df_s3(unigram_sentences_savedf, bucket_name, 'amazon_reviews/unigram_sentences.feather')

In [148]:
for parsed_review in tqdm(nlp.pipe(text_eighth, batch_size=20000, n_threads=72)):
    rev_num += 1
    for sent in parsed_review.sents:
        sent_num += 1
        # lemmatize tokens & save corresponding pos tag after filtering whitespace and punctuations
        lemmatized_sent = [(token.lemma_, token.pos_) for token in sent if not (token.is_space or token.is_punct)]
        unigram_sents_pos.append([rev_num, sent_num, lemmatized_sent])

print('current rev_num: ', rev_num)
print('current sent_num: ', sent_num)

50000it [08:04, 103.24it/s]


current rev_num:  500000
current sent_num:  3073060


In [149]:
# Save progress...
review_number = [row[0] for row in unigram_sents_pos]
sentence_number = [row[1] for row in unigram_sents_pos]
words_joined_all = []
pos_joined_all = []
for sent in unigram_sents_pos:
    word_pos = sent[2]
    word_list = [word for word, pos in word_pos]
    pos_list = [pos for word, pos in word_pos]
    words_joined = ' '.join(word for word in word_list)
    pos_joined   = ' '.join(pos for pos in pos_list)
    words_joined_all.append(words_joined)
    pos_joined_all.append(pos_joined)
    
unigram_sentences_savedf = pd.DataFrame({'review_number': review_number,
                                         'sentence_number': sentence_number,
                                         'unigram_sentences': words_joined_all,
                                         'unigram_pos': pos_joined_all})

save_df_s3(unigram_sentences_savedf, bucket_name, 'amazon_reviews/unigram_sentences.feather')

In [150]:
for parsed_review in tqdm(nlp.pipe(text_ninth, batch_size=20000, n_threads=72)):
    rev_num += 1
    for sent in parsed_review.sents:
        sent_num += 1
        # lemmatize tokens & save corresponding pos tag after filtering whitespace and punctuations
        lemmatized_sent = [(token.lemma_, token.pos_) for token in sent if not (token.is_space or token.is_punct)]
        unigram_sents_pos.append([rev_num, sent_num, lemmatized_sent])

print('current rev_num: ', rev_num)
print('current sent_num: ', sent_num)

84829it [13:30, 104.70it/s]

current rev_num:  584829
current sent_num:  3605491





In [151]:
# Save progress...
review_number = [row[0] for row in unigram_sents_pos]
sentence_number = [row[1] for row in unigram_sents_pos]
words_joined_all = []
pos_joined_all = []
for sent in unigram_sents_pos:
    word_pos = sent[2]
    word_list = [word for word, pos in word_pos]
    pos_list = [pos for word, pos in word_pos]
    words_joined = ' '.join(word for word in word_list)
    pos_joined   = ' '.join(pos for pos in pos_list)
    words_joined_all.append(words_joined)
    pos_joined_all.append(pos_joined)
    
unigram_sentences_savedf = pd.DataFrame({'review_number': review_number,
                                         'sentence_number': sentence_number,
                                         'unigram_sentences': words_joined_all,
                                         'unigram_pos': pos_joined_all})

save_df_s3(unigram_sentences_savedf, bucket_name, 'amazon_reviews/unigram_sentences.feather')

In [5]:
unigram_sentences_savedf = load_df_s3(bucket_name, 'amazon_reviews/unigram_sentences.feather', filetype='feather')

### Phrase Detection

In [160]:
len(words_joined_all)

3605491

In [159]:
len(unigram_sents_pos)

3605491

In [162]:
unigram_sentences = [sentence.split(' ') for sentence in words_joined_all]

In [165]:
print(unigram_sentences[:4])

[['dpe', 'the', 'job', 'well'], ['b', 'flax', 'd', 'be', 'a', 'regular', 'at', '-PRON-', 'house'], ['-PRON-', 'do', '-PRON-', 'job', 'simply', 'and', 'with', 'good', 'result'], ['-PRON-', 'be', 'reasonable', 'last', 'a', 'long', 'time', 'and', 'be', 'able', 'to', 'be', 'obtain', 'with', 'free', 'shipping', 'if', '-PRON-', 'hunt', 'around']]


In [166]:
# The common_terms parameter add a way to give special treatment to common terms 
# (aka stop words) such that their presence between two words won’t prevent bigram detection. 
# It allows to detect expressions like “bank of america” or “eye of the beholder”.
common_terms = ["of", "with", "without", "and", "or", "the", "a"]

# Train a first-order phrase detector
bigram_model = Phrases(unigram_sentences, threshold=0.5, scoring='npmi', common_terms=common_terms)

# Transform unigram sentences into bigram sentences
# Paired words are connected by an underscore, e.g. ice_cream
bigram_sentences = []
for sentence in unigram_sentences:
    bigram_sentences.append(bigram_model[sentence])



In [167]:
%%time
# Train a second-order phrase detector
# trigram_model = Phrases(bigram_sentences, min_count=5)
trigram_model = Phrases(bigram_sentences, threshold=0.5, scoring='npmi', common_terms=common_terms)

# Transform bigram sentences into trigram sentences
trigram_sentences = []
for sentence in bigram_sentences:
    trigram_sentences.append(trigram_model[sentence])

# remove any remaining stopwords
trigram_sentences = [[word for word in sentence if word not in nlp.Defaults.stop_words] for sentence in trigram_sentences]



CPU times: user 3min 42s, sys: 5.11 s, total: 3min 47s
Wall time: 3min 47s


In [168]:
# the trigrams will be saved in a dataframe with a single column.
# each row is one sentence from any review
# each sentence is a single string separated by a single space.
trigram_sentences_savedf = pd.DataFrame([u' '.join(sentence) for sentence in trigram_sentences], columns=['preprocessed_review'])
save_df_s3(trigram_sentences_savedf, bucket_name, 'amazon_reviews/preprocessed_reviews.feather')

In [7]:
trigram_sentences_savedf = load_df_s3(bucket_name, 'amazon_reviews/preprocessed_reviews.feather', filetype='feather')

In [8]:
trigram_sentences_savedf.head()

Unnamed: 0,preprocessed_review
0,dpe job
1,b flax d regular -PRON- house
2,-PRON- -PRON- job simply good result
3,-PRON- reasonable long time able obtain free_s...
4,good product good price good result


In [9]:
trigram_sentences_savedf = trigram_sentences_savedf.preprocessed_review.tolist()[:100]

In [10]:
trigram_sentences_savedf

['dpe job',
 'b flax d regular -PRON- house',
 '-PRON- -PRON- job simply good result',
 '-PRON- reasonable long time able obtain free_shipping -PRON- hunt',
 'good product good price good result',
 'fast_shipping good communication',
 'study resveratrol poorly absorb pill lozenge effectively absorb',
 'hardly company sell lozenge',
 'company promise 99 purity fast_shipping good communication',
 '-PRON- can_not comment quality product -PRON- chemist',
 '-PRON- seem_to legitimate',
 'bioavailability key',
 '-PRON- start -PRON- parent die cancer -PRON- suppose enhance -PRON- immune_system story 60 minutes resveratrol incredibly inspiring',
 'research internet -PRON- indicate resveratrol lozenge form preferable -PRON- break stomach acid',
 'ez melt formula recommend review ok -PRON- dissolve mouth quickly lozenge formula dissolve slowly preferable accord -PRON- research',
 'this_product great side_effect -PRON- -PRON- cold sore_throat',
 'soon start -PRON- every_day -PRON- start come cold 

In [170]:
len(trigram_sentences)

3605491

In [158]:
# vocabulary size with unigrams
len([word for sentence in unigram_sentences for word in sentence])

43362695

In [159]:
# vocabulary size with trigrams
len([word for sentence in trigram_sentences for word in sentence])

21960569

In [160]:
trigrams_flat = [word for sentence in trigram_sentences for word in sentence]

In [161]:
len(trigrams_flat)

21960569

In [162]:
print(trigrams_flat[:15])

['dpe', 'job', 'b', 'flax', 'd', 'regular', '-PRON-', 'house', '-PRON-', '-PRON-', 'job', 'simply', 'good', 'result', '-PRON-']


In [163]:
paired_words = set([word for word in trigrams_flat if '_' in word])

In [173]:
len(paired_words)

203277

In [164]:
print(trigrams_flat[100:150])

['mouth', 'quickly', 'lozenge', 'formula', 'dissolve', 'slowly', 'preferable', 'accord', '-PRON-', 'research', 'this_product', 'great', 'side_effect', '-PRON-', '-PRON-', 'cold', 'sore_throat', 'soon', 'start', '-PRON-', 'every_day', '-PRON-', 'start', 'come', 'cold', '-PRON-', 'usual', 'symptom', 'anticipate', 'sick', 'day', '-PRON-', 'usual', 'pattern', '-PRON-', 'sick', 'anticipate', 'taking', 'this_product', 'reason', '-PRON-', 'come', '-PRON-', 'cold', 'sore_throat', '-PRON-', 'great', '-PRON-', 'recommend', 'this_product']


In [165]:
# print trigrams containing 'no' or 'not'
for w in paired_words:
    if ('_no_' in w or 'not_' in w):
        print(w)

night.not_a_miracle_cure
pinot_noir
solublenot_certify_kosher_or_halal$8.99
count)*****fat_solublenot_certify_kosher
240_softgels)****fat_solublenot_certify
solublenot_certify_kosher_or_halal$13.78
8220;not_hungry&#8221
distilledmercury_freenot_enteric_coatednot
cholesterolmolecularly_distilledmercury_freenot_enteric
estafa!not_worth_the_money
34;not_guilty&#34
hacking_snot_fill
solublenot_certify_kosher_or_halal$27.99
each)****triglycerides_formnot_certify_kosher
freshness_34;not_rancid&#34
formnot_certify_kosher_or_halal$45.82
formnot_certify_kosher_or_halal$45.46
hungry.not_a_stimulant
240_softgels,)fat_solublenot_certify
stearateschelatedvegetariannot_enteric_coatedcontain_laxative
34;not_work&#34;.
enteric_coatednot_vegetarianone
supply).)ethyl_ester_formnot_certify
22.8=_78.6not_373i_freak
cholesterolmolecularly_distilledno_mercurynot_enteric
solublenot_certify_kosher_or_halal$27.77
90-count)*****ubiquinolfat_solublenot_certify_kosher
mercurynot_enteric_coatednot_vegetarianphosph

In [166]:
unigram_text = unigram_sentences_savedf.unigram_sentences.tolist()

In [167]:
# search for one of the weird paired terms in the list above: 'solublenot_certify_kosher'
# this shows the review it was a part of before getting paired
[sent for sent in unigram_text if 'not certify kosher' in sent][0]

"magnesium malate magnesium glycinatewater solublenot certify kosher or halal$ n a for 120 200 mg capsule on amazonrecommended serving two capsulesprice per gel cap $ n a use amazon 's price)price per 100 mgs magnesium $ n a use amazon 's price)no soyno gmosno cholesterolno stearateschelatedvegetariannot enteric coatedno laxative propertiesno ingredient source from chinaphone number 800 476 3542manufactur in the u.s.a.ingredient magnesium malate chelate magnesium glycinate and vegetarian capsule non gmo plant cellulose)doctor 's good high absorption 100 chelated magnesium"

**Clearly, there was a problem in the unigram terms as well since `soluble` and `not` are joined together (along with other words).**

In [168]:
# find the same review in the original unprocessed reviews dataset
[sent for sent in text if '$17.09' in sent][0][:2000]

"KAL Magnesium Glycinate 400 vs Nine Leading Magnesium Supplements. ***Here is a side-by-side comparison of ten leading magnesium supplements: Nutrigold Magnesium Gold, Doctor's Best High Absorption 100% Chelated Magnesium, JigSaw Magnesium w/SRT, Now Foods Magnesium Citrate (200 mgs), Now Foods Magnesium Capsules (400 mgs), Solgar Magnesium Citrate, Life Extension Magnesium Caps, Thorne Research Magnesium Citrate, Bluebonnet Nutrition Albion Chelated Magnesium, and KAL Magnesium Glycinate 400.Magnesium is needed for more than 300 biochemical reactions in the body. It helps maintain normal muscle and nerve function, keeps heart rhythm steady, supports a healthy immune system, and keeps bones strong. Magnesium also helps regulate blood sugar levels, promotes normal blood pressure, and is known to be involved in energy metabolism and protein synthesis. There is an increased interest in the role of magnesium in preventing and managing disorders such as hypertension, cardiovascular disease

**In the unprocessed reviews as well, `soluble` and `not` are joined together (along with other words).  This is a problem with the data itself; not an outcome of the preprocessing.**

In [169]:
q1 = reviews[reviews.asin.str.contains('B00013YZ1Q')]
q2 = q1[q1.summary.str.contains('KAL Magnesium Glycinate 400 vs Nine Leading Magnesium')]

In [170]:
# let's find the product from the review above:
q2.summary.values

array(['KAL Magnesium Glycinate 400 vs Nine Leading Magnesium Supplements'], dtype=object)

In [171]:
# let's look at the 100 most frequent paired words
paired_words_frq = Counter([word for word in trigrams_flat if '_' in word])
paired_words_frq.most_common(100)

[('do_not', 268437),
 ('this_product', 207554),
 ('seem_to', 45681),
 ('can_not', 45528),
 ('great_product', 41158),
 ('weight_loss', 35438),
 ('so_far', 29550),
 ('at_all', 25321),
 ('this_stuff', 23679),
 ('highly_recommend', 23157),
 ('lose_weight', 23118),
 ('fish_oil', 21909),
 ('side_effect', 17800),
 ('as_well', 17148),
 ('would_recommend', 16050),
 ('in_the_morning', 15725),
 ('at_least', 14776),
 ('will_continue', 14454),
 ('more_than', 13908),
 ('more_energy', 13045),
 ('per_day', 11691),
 ('every_day', 11147),
 ('garcinia_cambogia', 10203),
 ('as_well_as', 9379),
 ('at_night', 8951),
 ('very_happy', 8328),
 ('too_much', 8297),
 ('year_ago', 8001),
 ('no_side_effect', 7872),
 ('high_quality', 7664),
 ('energy_level', 7583),
 ('vitamin_d', 7473),
 ('vitamin_c', 7400),
 ('year_old', 7201),
 ('run_out', 7056),
 ('no_longer', 7043),
 ('five_star', 6781),
 ('suffer_from', 6679),
 ('dr._oz', 6578),
 ('wake_up', 6439),
 ('immune_system', 6167),
 ('twice_a_day', 6086),
 ('on_the_mark

In [175]:
# Find the 100 most infrequent paired words
paired_words_frq.most_common()[::-1][:100]

[('overturn_conventional_wisdom', 1),
 ('eat&#8221_the_wrong_combo', 1),
 ('tub_of_humus_with_veggie', 1),
 ('veep_university', 1),
 ('consumer_of_cookies!!it', 1),
 ('portion_veep_university---', 1),
 ('expereienc_with_veep', 1),
 ('visual_representation_veep', 1),
 ('veep_lookcut_program', 1),
 ('fitness_fanatic_veep_university', 1),
 ('outdoor_enthusiast_mtn', 1),
 ('mountain_biking_rowing', 1),
 ('trx_training', 1),
 ('lilttle_longer', 1),
 ('double_decker_cheeseburger', 1),
 ('marathon_and_a_tri_atholon', 1),
 ('8220_recommended&#8221', 1),
 ('trade_show&#8230', 1),
 ('go!upon_arrival', 1),
 ('hydroxycitric_acid_hca).this', 1),
 ('sharp_edges2', 1),
 ('crash_dieting).in_conclusion', 1),
 ('nuline_nutritionals_and_tomoson', 1),
 ('wishful_thinking!ftc_disclosure', 1),
 ('savor_the_taste).as', 1),
 ('34;healthy_fat&#34', 1),
 ('atrail_fibrillationso', 1),
 ('holy_cr*p', 1),
 ('w700_and_the_ubersurge', 1),
 ("bootle_of_uberday_women_'s", 1),
 ('detail_and_a_superior_product!paula', 1

In [174]:
len(paired_words_frq)   # number of paired words

203277

# Training the LDA Model

In [181]:
%%time

# we need to learn the full vocabulary of the corpus to be modeled
# learn the dictionary by iterating over all of the reviews
trigram_dictionary = Dictionary(trigram_sentences)

CPU times: user 29.5 s, sys: 0 ns, total: 29.5 s
Wall time: 29.5 s


In [182]:
# filter tokens that are very rare or too common from
# the dictionary (filter_extremes) and reassign integer ids (compactify)
trigram_dictionary.filter_extremes(no_below=10, no_above=0.6)
trigram_dictionary.compactify()   # remove gaps in id sequence after words that were removed

In [182]:
trigram_dictionary.save('../vocab_dictionary.dict')     # save vocabulary dict locally

In [182]:
trigram_dictionary = Dictionary.load('../vocab_dictionary.dict')  # load the finished dictionary from disk