In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
from collections import Counter
import warnings

import spacy
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore
# import pyLDAvis
# import pyLDAvis.gensim

from tqdm._tqdm_notebook import tqdm, tqdm_notebook, tnrange
from S3_read_write import load_df_s3, save_df_s3

from IPython.display import Image
from IPython.core.display import HTML

In [3]:
tqdm_notebook.pandas('Progress')

In [4]:
bucket_name = 'amazon-reviews-project'

# Load Amazon Reviews Data

In [179]:
reviews = load_df_s3(bucket_name, 'amazon_reviews/reviews_data_clean', filetype='text', sep='|')

In [31]:
reviews.shape    # 585,444 records

(585444, 8)

In [32]:
reviews.head()

Unnamed: 0,asin,helpful,reviewText,overall,summary,description,title,categories_clean
0,929619730,"[0, 0]",B-flax-D is a re...,5.0,Dpes the job well,Contains Organic...,New Generation B...,Health & Persona...
1,978559088,"[1, 1]",Studies show tha...,4.0,"Fast shipping, g...",Everyone knows t...,Nutrihill Resver...,Health & Persona...
2,978559088,"[1, 1]",I started taking...,5.0,Bioavailability ...,Everyone knows t...,Nutrihill Resver...,Health & Persona...
3,978559088,"[0, 1]",I tried Nutrihil...,1.0,Other Resveratro...,Everyone knows t...,Nutrihill Resver...,Health & Persona...
4,978559088,"[0, 0]",I really liked t...,5.0,I can't find thi...,Everyone knows t...,Nutrihill Resver...,Health & Persona...


In [33]:
reviews.dtypes

asin                 object
helpful              object
reviewText           object
overall             float64
summary              object
description          object
title                object
categories_clean     object
dtype: object

## Data Cleaning

In [52]:
reviews.categories_clean.unique()[:10]

array(['Health & Personal Care, Vitamins & Dietary Supplements, Multi & Prenatal Vitamins, Multiple Vitamin-Mineral Supplements',
       'Health & Personal Care, Vitamins & Dietary Supplements, Supplements, Antioxidants, Resveratrol',
       'Health & Personal Care, Vitamins & Dietary Supplements, Multi & Prenatal Vitamins, Multivitamins',
       'Health & Personal Care, Vitamins & Dietary Supplements, Vitamins, Vitamin B, B3 (Niacin)',
       'Health & Personal Care, Vitamins & Dietary Supplements, Herbal Supplements',
       'Health & Personal Care, Vitamins & Dietary Supplements, Herbal Supplements, Green Tea',
       'Health & Personal Care, Vitamins & Dietary Supplements, Weight Loss, Supplements, Green Coffee Bean Extract',
       'Health & Personal Care, Vitamins & Dietary Supplements, Weight Loss, Supplements',
       'Health & Personal Care, Vitamins & Dietary Supplements, Supplements, Antioxidants, CoQ10',
       'Health & Personal Care, Vitamins & Dietary Supplements, Herbal

The catergories' list indicates that there may be some reviews in the dataset unrelated to health supplements.  Let's get rid of these.

In [35]:
reviews[reviews.categories_clean.str.contains('CDs & Vinyl')].title.unique()

array(['Liturgy of St. John Chrysostom', 'Origins',
       'Sounds of the Earth: Soft Ocean Sounds', 'Bali',
       'Tranquil Waters', 'Bach: St. John Passion, BWV 245',
       '21st Century Soul', 'Bodies for Strontium', "John's Bunch",
       'An Evening of Paganini', "John's Other Bunch",
       'Sus Mas Grandes Exitos', 'Complex Simplicity',
       'Kidnapped By Neptune', 'Roman Chant / Easter Vespers', 'Dead 60s',
       "Cilla in the 60's", 'Chromium', 'Letters From the Vitamin Sea',
       'The Stinging Nettles', 'Tendres Annees 60', 'Wehiwehi Hawaii',
       'none'], dtype=object)

In [36]:
len(reviews[reviews.categories_clean.str.contains('CDs & Vinyl')])

263

The product titles shown above are all music albums/songs.

In [37]:
reviews_filt = reviews[~(reviews.categories_clean.str.contains('CDs & Vinyl'))]   # remove rows with category including 'CDs & Vinyl'

In [51]:
reviews_filt.categories_clean.unique()[:10]

array(['Health & Personal Care, Vitamins & Dietary Supplements, Multi & Prenatal Vitamins, Multiple Vitamin-Mineral Supplements',
       'Health & Personal Care, Vitamins & Dietary Supplements, Supplements, Antioxidants, Resveratrol',
       'Health & Personal Care, Vitamins & Dietary Supplements, Multi & Prenatal Vitamins, Multivitamins',
       'Health & Personal Care, Vitamins & Dietary Supplements, Vitamins, Vitamin B, B3 (Niacin)',
       'Health & Personal Care, Vitamins & Dietary Supplements, Herbal Supplements',
       'Health & Personal Care, Vitamins & Dietary Supplements, Herbal Supplements, Green Tea',
       'Health & Personal Care, Vitamins & Dietary Supplements, Weight Loss, Supplements, Green Coffee Bean Extract',
       'Health & Personal Care, Vitamins & Dietary Supplements, Weight Loss, Supplements',
       'Health & Personal Care, Vitamins & Dietary Supplements, Supplements, Antioxidants, CoQ10',
       'Health & Personal Care, Vitamins & Dietary Supplements, Herbal

In [39]:
reviews_filt[reviews_filt.categories_clean.str.contains('Software')]

Unnamed: 0,asin,helpful,reviewText,overall,summary,description,title,categories_clean
3639,B00009QP4Q,"[2, 2]",The company has ...,5.0,lives up to its ...,Alpha Five's QLi...,none,Health & Persona...
50015,B0002TIEQQ,"[0, 0]",I ordered this f...,1.0,waste of money,Self help tutori...,none,Health & Persona...


In [40]:
reviews_filt = reviews_filt[~(reviews_filt.categories_clean.str.contains('Software'))]

In [41]:
len(reviews_filt)

585179

In [53]:
# Get rid of reviews of pet-related products
search_for = [' pet ', ' cat ', ' dog ']
pattern = '|'.join(search_for)
reviews_filt.title.str.contains(pattern, case=False).sum()

277

In [50]:
reviews_filt[reviews_filt.title.str.contains(pattern, case=False)]['title'].values[:10]

array(['Power - Mune Tuna Flavor Pet Herbal Supplement From Vetvittles.com',
       'Power - Mune Tuna Flavor Pet Herbal Supplement From Vetvittles.com',
       'Power - Mune Tuna Flavor Pet Herbal Supplement From Vetvittles.com',
       'AniMed Witch Hazel 86-Percent Multi-Species Pet Supplement',
       'AniMed Witch Hazel 86-Percent Multi-Species Pet Supplement',
       'AniMed Witch Hazel 86-Percent Multi-Species Pet Supplement',
       'AniMed Witch Hazel 86-Percent Multi-Species Pet Supplement',
       'AniMed Witch Hazel 86-Percent Multi-Species Pet Supplement',
       'AniMed Witch Hazel 86-Percent Multi-Species Pet Supplement',
       'Composure Liquid for Dogs and Cat (188 SERVINGS)'], dtype=object)

In [54]:
# Get rid of all pet products
reviews_filt = reviews_filt[~(reviews_filt.title.str.contains(pattern, case=False))]

In [55]:
# saving the cleaned dataframe
save_df_s3(df=reviews_filt, bucket_name=bucket_name, filepath='amazon_reviews/reviews_data_clean_v2.feather')

In [56]:
reviews_filt.asin.nunique()     # 48,535 unique products and 585,179 reviews

48501

## Examine One Observation

In [57]:
example = reviews_filt.iloc[0]

In [58]:
example.asin     # Amazon Standard Identification Number

'0929619730'

In [59]:
example.title     # this is the product's name

'New Generation B-Flax-D'

In [60]:
example.categories_clean   # previously filtered/curated categories of interest

'Health & Personal Care, Vitamins & Dietary Supplements, Multi & Prenatal Vitamins, Multiple Vitamin-Mineral Supplements'

In [61]:
example.description       # product description provided by the seller

'Contains Organic Cold-Milled Flaxseed\nValuable source of soluble and insoluble fiber\nProvides Omega-3 essential fats, and many other nutrients to help achieve and maintain optimal bowel function.\n\nContains Vitamin B12\nB12 helps prevent nerve damage\nB12 aids in healthy cell formation.\nB12 helps prevent anemia\n\nContains Vitamin D\nVitamin D assists the body in the absorption of important minerals like calcium.\n\nContains Seleno-yeast\nA source of selenium, a mineral with powerful anti-viral and disease-fighting properties.\n\nContains Vitamin K2\nMenaQ7TM provides vitamin K2 (menaquinone), extracted and concentrated from natto without solvents. Vitamin K2 prevents arterial calcification and promotes strong bones by improving cross-linking of osteocalcin, a protein found in bones. The amount here has been clinically shown not to interfere with blood anti-coagulant medication. \n\nServing Size:\n1/4 Cup (30 Grams)\n\nServings Per Container:\n30 Servings per container\n\nNet Wt. 

In [62]:
example.summary      # review title

'Dpes the job well'

In [63]:
example.reviewText   # review content

'B-flax-D is a regular at our house. It does its job simply and with good results. It is reasonable, lasts a long time, and is able to be obtained with free shipping if you hunt around. Good product, good price, good results.'

Here's what the actual review looks like:

In [64]:
example.overall     # the rating provided by the reviewer

5.0

In [29]:
example.helpful

'[0, 0]'

In [27]:
Image(url= "images/amazon_review_screenshot.png")

# Data Pre-processing

Let us start off using only the title (`summary`) and body (`reviewText`) of each review.

In [5]:
%%time
df = load_df_s3(bucket_name, filepath='amazon_reviews/reviews_data_clean_v2.feather', filetype='feather')

CPU times: user 3.03 s, sys: 4.25 s, total: 7.28 s
Wall time: 20 s


In [6]:
df.dtypes

asin                 object
helpful              object
reviewText           object
overall             float64
summary              object
description          object
title                object
categories_clean     object
dtype: object

In [7]:
df.drop(['helpful', 'overall', 'title', 'categories_clean', 'description'], axis=1, inplace=True)

In [8]:
df.head()

Unnamed: 0,asin,reviewText,summary
0,929619730,B-flax-D is a regular at our house. It does it...,Dpes the job well
1,978559088,Studies show that Resveratrol is poorly absorb...,"Fast shipping, good communication"
2,978559088,I started taking this after both my parents di...,Bioavailability is the key
3,978559088,"I tried Nutrihill, but did not feel any of the...",Other Resveratrol Supplements are Better
4,978559088,I really liked this product because it stayed ...,"I can't find this product any longer, and I wi..."


In [9]:
# for each review, concatenate the review title and body
df.reviewText = df.summary + '. ' + df.reviewText

In [10]:
pd.set_option('max_colwidth', 200)
df.head()

Unnamed: 0,asin,reviewText,summary
0,929619730,"Dpes the job well. B-flax-D is a regular at our house. It does its job simply and with good results. It is reasonable, lasts a long time, and is able to be obtained with free shipping if you hunt ...",Dpes the job well
1,978559088,"Fast shipping, good communication. Studies show that Resveratrol is poorly absorbed when taken by pill, but lozenges are very effectively absorbed. Hardly any companies are selling lozenges. This ...","Fast shipping, good communication"
2,978559088,Bioavailability is the key. I started taking this after both my parents died of cancer as it supposed to enhance your immune system - the story on 60 Minutes on resveratrol was incredibly inspirin...,Bioavailability is the key
3,978559088,"Other Resveratrol Supplements are Better. I tried Nutrihill, but did not feel any of the supposed health benefits. I started reading and realized that even though buccal delivery is the best, the ...",Other Resveratrol Supplements are Better
4,978559088,"I can't find this product any longer, and I wish I could.. I really liked this product because it stayed in my mouth for a long time and I felt it was probably doing some good. I take a number of...","I can't find this product any longer, and I wish I could."


Let's drop the `summary` column now:

In [11]:
df.drop(['summary'], axis=1, inplace=True)

In [12]:
df.head()

Unnamed: 0,asin,reviewText
0,929619730,"Dpes the job well. B-flax-D is a regular at our house. It does its job simply and with good results. It is reasonable, lasts a long time, and is able to be obtained with free shipping if you hunt ..."
1,978559088,"Fast shipping, good communication. Studies show that Resveratrol is poorly absorbed when taken by pill, but lozenges are very effectively absorbed. Hardly any companies are selling lozenges. This ..."
2,978559088,Bioavailability is the key. I started taking this after both my parents died of cancer as it supposed to enhance your immune system - the story on 60 Minutes on resveratrol was incredibly inspirin...
3,978559088,"Other Resveratrol Supplements are Better. I tried Nutrihill, but did not feel any of the supposed health benefits. I started reading and realized that even though buccal delivery is the best, the ..."
4,978559088,"I can't find this product any longer, and I wish I could.. I really liked this product because it stayed in my mouth for a long time and I felt it was probably doing some good. I take a number of..."


In [13]:
pd.set_option('max_colwidth', 20)

## Remove Missing Reviews

In [14]:
df.reviewText.isnull().sum()    # 73 reviews have neither a review body text, nor a review title

73

In [15]:
# drop reviews with no text
df = df[~(df.reviewText.isnull())]

In [16]:
df.asin.isnull().sum()

0

Let's look at a few actual review texts:

In [17]:
df.reviewText.iloc[np.random.randint(0, len(df))]

'Nice protein powder. Vegan, clean-burning; nice flavor and texture; blends well; low sugar. This is a high-quality protein powder product: I highly recommend it.'

In [18]:
df.reviewText.iloc[np.random.randint(0, len(df))]

"So far so good. I have only been on this for a week and have lost a pound so far.  At first I wasn't taking the right dosage but after reading the reviews and usage requirments again on here, I started doing it right.  I am going to continue through the whole first bottle, the free bottle and if I see it is working, I am going to order more.  It has helped with my appetite some but not as much as I had thought.  I do feel better when I take it though.  I have faith and hope to lose much much more."

In [19]:
df.reviewText.iloc[np.random.randint(0, len(df))]

'Great product. My son loves these vitamins.  He prefers these over the regular gummy vites (which he loves). He asks for them multiple times during the day.  Much better than the Flinstones.'

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 584829 entries, 0 to 584901
Data columns (total 2 columns):
asin          584829 non-null object
reviewText    584829 non-null object
dtypes: object(2)
memory usage: 13.4+ MB


## Phrase Detection

In [21]:
text = list(df.reviewText.values)    # make an iterable to store only the review text

In [22]:
[sent for sent in text if len(sent) == 0]   # there are no blank sentences

[]

In [23]:
len(text)

584829

In [24]:
# look at a few sample reviews
for rev in text[:4]:
    print(rev, '\n')

Dpes the job well. B-flax-D is a regular at our house. It does its job simply and with good results. It is reasonable, lasts a long time, and is able to be obtained with free shipping if you hunt around. Good product, good price, good results. 

Fast shipping, good communication. Studies show that Resveratrol is poorly absorbed when taken by pill, but lozenges are very effectively absorbed. Hardly any companies are selling lozenges. This company promises 99% purity and has fast shipping and good communication. I can't comment on the quality of product because I'm not a chemist but they seem to be legitimate. 

Bioavailability is the key. I started taking this after both my parents died of cancer as it supposed to enhance your immune system - the story on 60 Minutes on resveratrol was incredibly inspiring. Doing some research on the Internet, it is indicated that taking resveratrol in lozenge form is preferable as it is broken down by stomach acids.  The ez-melt formula recommended in a

In [11]:
nlp = spacy.load('en')

The helper functions below are from:

http://nbviewer.jupyter.org/github/skipgram/modern-nlp-in-python/blob/master/executable/Modern_NLP_in_Python.ipynb

In order to use `gensim`'s `Phrases` class to detect natural combinations of words (like 'vanilla ice cream'), we need to format our text into a list of sentences, with each sentence being a list of words.  This process takes a large amount of processing time (for reference, the times shown under the cells are for running the tasks on a c5.18xlarge EC2 instance (equivalent spot fleet)), so `text` has been split into 3 parts.

### Generate Unigram Sentences

In [26]:
len(text)

584829

In [28]:
# split text into 9 parts
text_first  = text[:50000]
text_second = text[50000:100000]
text_third  = text[100000:150000]
text_fourth = text[150000:300000]
text_fifth  = text[300000:350000]
text_sixth  = text[350000:400000]
text_seventh= text[400000:450000]
text_eighth = text[450000:500000]
text_ninth = text[500000:]

In [29]:
rev_num = 0    # review tracker
sent_num = 0   # sentence tracker
unigram_sents_pos = [] # to store lists of lemmatized tokens for each sentence

for parsed_review in tqdm(nlp.pipe(text_first, batch_size=10000, n_threads=72)):
    rev_num += 1
    for sent in parsed_review.sents:
        sent_num += 1
        # lemmatize tokens & save corresponding pos tag after filtering whitespace and punctuations
        lemmatized_sent = [(token.lemma_, token.pos_) for token in sent if not (token.is_space or token.is_punct)]
        if len(lemmatized_sent) != 0:
            unigram_sents_pos.append([rev_num, sent_num, lemmatized_sent])

print('current rev_num: ', rev_num)
print('current sent_num: ', sent_num)

50000it [08:06, 102.71it/s]

current rev_num:  50000
current sent_num:  305895





In [30]:
len(unigram_sents_pos)

305455

In [31]:
for i in range(5):
    print(unigram_sents_pos[i])

[1, 1, [('dpe', 'NOUN'), ('the', 'DET'), ('job', 'NOUN'), ('well', 'ADV')]]
[1, 2, [('b', 'NOUN'), ('flax', 'NOUN'), ('d', 'NOUN'), ('be', 'VERB'), ('a', 'DET'), ('regular', 'ADJ'), ('at', 'ADP'), ('-PRON-', 'ADJ'), ('house', 'NOUN')]]
[1, 3, [('-PRON-', 'PRON'), ('do', 'VERB'), ('-PRON-', 'ADJ'), ('job', 'NOUN'), ('simply', 'ADV'), ('and', 'CCONJ'), ('with', 'ADP'), ('good', 'ADJ'), ('result', 'NOUN')]]
[1, 4, [('-PRON-', 'PRON'), ('be', 'VERB'), ('reasonable', 'ADJ'), ('last', 'VERB'), ('a', 'DET'), ('long', 'ADJ'), ('time', 'NOUN'), ('and', 'CCONJ'), ('be', 'VERB'), ('able', 'ADJ'), ('to', 'PART'), ('be', 'VERB'), ('obtain', 'VERB'), ('with', 'ADP'), ('free', 'ADJ'), ('shipping', 'NOUN'), ('if', 'ADP'), ('-PRON-', 'PRON'), ('hunt', 'VERB'), ('around', 'ADV')]]
[1, 5, [('good', 'ADJ'), ('product', 'NOUN'), ('good', 'ADJ'), ('price', 'NOUN'), ('good', 'ADJ'), ('result', 'NOUN')]]


In [32]:
# check if there are any blank sentences
for sent in unigram_sents_pos:
    if len(sent[2]) == 0:
        print(sent)

In [33]:
# Save progress...
review_number = [row[0] for row in unigram_sents_pos]
sentence_number = [row[1] for row in unigram_sents_pos]
words_joined_all = []
pos_joined_all = []
for sent in unigram_sents_pos:
    word_pos = sent[2]
    word_list = [word for word, pos in word_pos]
    pos_list = [pos for word, pos in word_pos]
    words_joined = '+-+||+-+'.join(word for word in word_list)
    pos_joined   = '+-+||+-+'.join(pos for pos in pos_list)
    words_joined_all.append(words_joined)
    pos_joined_all.append(pos_joined)
    
unigram_sentences_savedf = pd.DataFrame({'review_number': review_number,
                                         'sentence_number': sentence_number,
                                         'unigram_sentences': words_joined_all,
                                         'unigram_pos': pos_joined_all})

save_df_s3(unigram_sentences_savedf, bucket_name, 'amazon_reviews/unigram_sentences.feather')

In [34]:
for parsed_review in tqdm(nlp.pipe(text_second, batch_size=20000, n_threads=72)):
    rev_num += 1
    for sent in parsed_review.sents:
        sent_num += 1
        # lemmatize tokens & save corresponding pos tag after filtering whitespace and punctuations
        lemmatized_sent = [(token.lemma_, token.pos_) for token in sent if not (token.is_space or token.is_punct)]
        if len(lemmatized_sent) != 0:
            unigram_sents_pos.append([rev_num, sent_num, lemmatized_sent])

print('current rev_num: ', rev_num)
print('current sent_num: ', sent_num)

50000it [08:14, 101.20it/s]

current rev_num:  100000
current sent_num:  616751





In [35]:
print(len(unigram_sents_pos))

615760


In [36]:
# Save progress...
review_number = [row[0] for row in unigram_sents_pos]
sentence_number = [row[1] for row in unigram_sents_pos]
words_joined_all = []
pos_joined_all = []
for sent in unigram_sents_pos:
    word_pos = sent[2]
    word_list = [word for word, pos in word_pos]
    pos_list = [pos for word, pos in word_pos]
    words_joined = '+-+||+-+'.join(word for word in word_list)
    pos_joined   = '+-+||+-+'.join(pos for pos in pos_list)
    words_joined_all.append(words_joined)
    pos_joined_all.append(pos_joined)
    
unigram_sentences_savedf = pd.DataFrame({'review_number': review_number,
                                         'sentence_number': sentence_number,
                                         'unigram_sentences': words_joined_all,
                                         'unigram_pos': pos_joined_all})

save_df_s3(unigram_sentences_savedf, bucket_name, 'amazon_reviews/unigram_sentences.feather')

In [37]:
for parsed_review in tqdm(nlp.pipe(text_third, batch_size=20000, n_threads=72)):
    rev_num += 1
    for sent in parsed_review.sents:
        sent_num += 1
        # lemmatize tokens & save corresponding pos tag after filtering whitespace and punctuations
        lemmatized_sent = [(token.lemma_, token.pos_) for token in sent if not (token.is_space or token.is_punct)]
        if len(lemmatized_sent) != 0:
            unigram_sents_pos.append([rev_num, sent_num, lemmatized_sent])

print('current rev_num: ', rev_num)
print('current sent_num: ', sent_num)

50000it [08:08, 102.38it/s]

current rev_num:  150000
current sent_num:  923642





In [38]:
# Save progress...
review_number = [row[0] for row in unigram_sents_pos]
sentence_number = [row[1] for row in unigram_sents_pos]
words_joined_all = []
pos_joined_all = []
for sent in unigram_sents_pos:
    word_pos = sent[2]
    word_list = [word for word, pos in word_pos]
    pos_list = [pos for word, pos in word_pos]
    words_joined = '+-+||+-+'.join(word for word in word_list)
    pos_joined   = '+-+||+-+'.join(pos for pos in pos_list)
    words_joined_all.append(words_joined)
    pos_joined_all.append(pos_joined)
    
unigram_sentences_savedf = pd.DataFrame({'review_number': review_number,
                                         'sentence_number': sentence_number,
                                         'unigram_sentences': words_joined_all,
                                         'unigram_pos': pos_joined_all})

save_df_s3(unigram_sentences_savedf, bucket_name, 'amazon_reviews/unigram_sentences.feather')

In [39]:
for parsed_review in tqdm(nlp.pipe(text_fourth, batch_size=20000, n_threads=72)):
    rev_num += 1
    for sent in parsed_review.sents:
        sent_num += 1
        # lemmatize tokens & save corresponding pos tag after filtering whitespace and punctuations
        lemmatized_sent = [(token.lemma_, token.pos_) for token in sent if not (token.is_space or token.is_punct)]
        if len(lemmatized_sent) != 0:
            unigram_sents_pos.append([rev_num, sent_num, lemmatized_sent])

print('current rev_num: ', rev_num)
print('current sent_num: ', sent_num)

150000it [24:10, 103.43it/s]

current rev_num:  300000
current sent_num:  1843092





In [40]:
# Save progress...
review_number = [row[0] for row in unigram_sents_pos]
sentence_number = [row[1] for row in unigram_sents_pos]
words_joined_all = []
pos_joined_all = []
for sent in unigram_sents_pos:
    word_pos = sent[2]
    word_list = [word for word, pos in word_pos]
    pos_list = [pos for word, pos in word_pos]
    words_joined = '+-+||+-+'.join(word for word in word_list)
    pos_joined   = '+-+||+-+'.join(pos for pos in pos_list)
    words_joined_all.append(words_joined)
    pos_joined_all.append(pos_joined)
    
unigram_sentences_savedf = pd.DataFrame({'review_number': review_number,
                                         'sentence_number': sentence_number,
                                         'unigram_sentences': words_joined_all,
                                         'unigram_pos': pos_joined_all})

save_df_s3(unigram_sentences_savedf, bucket_name, 'amazon_reviews/unigram_sentences.feather')

In [41]:
for parsed_review in tqdm(nlp.pipe(text_fifth, batch_size=20000, n_threads=72)):
    rev_num += 1
    for sent in parsed_review.sents:
        sent_num += 1
        # lemmatize tokens & save corresponding pos tag after filtering whitespace and punctuations
        lemmatized_sent = [(token.lemma_, token.pos_) for token in sent if not (token.is_space or token.is_punct)]
        if len(lemmatized_sent) != 0:
            unigram_sents_pos.append([rev_num, sent_num, lemmatized_sent])

print('current rev_num: ', rev_num)
print('current sent_num: ', sent_num)

50000it [07:44, 107.72it/s]

current rev_num:  350000
current sent_num:  2144424





In [42]:
# Save progress...
review_number = [row[0] for row in unigram_sents_pos]
sentence_number = [row[1] for row in unigram_sents_pos]
words_joined_all = []
pos_joined_all = []
for sent in unigram_sents_pos:
    word_pos = sent[2]
    word_list = [word for word, pos in word_pos]
    pos_list = [pos for word, pos in word_pos]
    words_joined = '+-+||+-+'.join(word for word in word_list)
    pos_joined   = '+-+||+-+'.join(pos for pos in pos_list)
    words_joined_all.append(words_joined)
    pos_joined_all.append(pos_joined)
    
unigram_sentences_savedf = pd.DataFrame({'review_number': review_number,
                                         'sentence_number': sentence_number,
                                         'unigram_sentences': words_joined_all,
                                         'unigram_pos': pos_joined_all})

save_df_s3(unigram_sentences_savedf, bucket_name, 'amazon_reviews/unigram_sentences.feather')

In [43]:
for parsed_review in tqdm(nlp.pipe(text_sixth, batch_size=20000, n_threads=72)):
    rev_num += 1
    for sent in parsed_review.sents:
        sent_num += 1
        # lemmatize tokens & save corresponding pos tag after filtering whitespace and punctuations
        lemmatized_sent = [(token.lemma_, token.pos_) for token in sent if not (token.is_space or token.is_punct)]
        if len(lemmatized_sent) != 0:
            unigram_sents_pos.append([rev_num, sent_num, lemmatized_sent])

print('current rev_num: ', rev_num)
print('current sent_num: ', sent_num)

50000it [07:48, 106.78it/s]

current rev_num:  400000
current sent_num:  2447985





In [44]:
# Save progress...
review_number = [row[0] for row in unigram_sents_pos]
sentence_number = [row[1] for row in unigram_sents_pos]
words_joined_all = []
pos_joined_all = []
for sent in unigram_sents_pos:
    word_pos = sent[2]
    word_list = [word for word, pos in word_pos]
    pos_list = [pos for word, pos in word_pos]
    words_joined = '+-+||+-+'.join(word for word in word_list)
    pos_joined   = '+-+||+-+'.join(pos for pos in pos_list)
    words_joined_all.append(words_joined)
    pos_joined_all.append(pos_joined)
    
unigram_sentences_savedf = pd.DataFrame({'review_number': review_number,
                                         'sentence_number': sentence_number,
                                         'unigram_sentences': words_joined_all,
                                         'unigram_pos': pos_joined_all})

save_df_s3(unigram_sentences_savedf, bucket_name, 'amazon_reviews/unigram_sentences.feather')

In [45]:
for parsed_review in tqdm(nlp.pipe(text_seventh, batch_size=20000, n_threads=72)):
    rev_num += 1
    for sent in parsed_review.sents:
        sent_num += 1
        # lemmatize tokens & save corresponding pos tag after filtering whitespace and punctuations
        lemmatized_sent = [(token.lemma_, token.pos_) for token in sent if not (token.is_space or token.is_punct)]
        if len(lemmatized_sent) != 0:
            unigram_sents_pos.append([rev_num, sent_num, lemmatized_sent])

print('current rev_num: ', rev_num)
print('current sent_num: ', sent_num)

50000it [07:49, 106.46it/s]

current rev_num:  450000
current sent_num:  2754623





In [46]:
# Save progress...
review_number = [row[0] for row in unigram_sents_pos]
sentence_number = [row[1] for row in unigram_sents_pos]
words_joined_all = []
pos_joined_all = []
for sent in unigram_sents_pos:
    word_pos = sent[2]
    word_list = [word for word, pos in word_pos]
    pos_list = [pos for word, pos in word_pos]
    words_joined = '+-+||+-+'.join(word for word in word_list)
    pos_joined   = '+-+||+-+'.join(pos for pos in pos_list)
    words_joined_all.append(words_joined)
    pos_joined_all.append(pos_joined)
    
unigram_sentences_savedf = pd.DataFrame({'review_number': review_number,
                                         'sentence_number': sentence_number,
                                         'unigram_sentences': words_joined_all,
                                         'unigram_pos': pos_joined_all})

save_df_s3(unigram_sentences_savedf, bucket_name, 'amazon_reviews/unigram_sentences.feather')

In [47]:
for parsed_review in tqdm(nlp.pipe(text_eighth, batch_size=20000, n_threads=72)):
    rev_num += 1
    for sent in parsed_review.sents:
        sent_num += 1
        # lemmatize tokens & save corresponding pos tag after filtering whitespace and punctuations
        lemmatized_sent = [(token.lemma_, token.pos_) for token in sent if not (token.is_space or token.is_punct)]
        if len(lemmatized_sent) != 0:
            unigram_sents_pos.append([rev_num, sent_num, lemmatized_sent])

print('current rev_num: ', rev_num)
print('current sent_num: ', sent_num)

50000it [08:05, 103.01it/s]

current rev_num:  500000
current sent_num:  3073060





In [48]:
# Save progress...
review_number = [row[0] for row in unigram_sents_pos]
sentence_number = [row[1] for row in unigram_sents_pos]
words_joined_all = []
pos_joined_all = []
for sent in unigram_sents_pos:
    word_pos = sent[2]
    word_list = [word for word, pos in word_pos]
    pos_list = [pos for word, pos in word_pos]
    words_joined = '+-+||+-+'.join(word for word in word_list)
    pos_joined   = '+-+||+-+'.join(pos for pos in pos_list)
    words_joined_all.append(words_joined)
    pos_joined_all.append(pos_joined)
    
unigram_sentences_savedf = pd.DataFrame({'review_number': review_number,
                                         'sentence_number': sentence_number,
                                         'unigram_sentences': words_joined_all,
                                         'unigram_pos': pos_joined_all})

save_df_s3(unigram_sentences_savedf, bucket_name, 'amazon_reviews/unigram_sentences.feather')

In [49]:
for parsed_review in tqdm(nlp.pipe(text_ninth, batch_size=20000, n_threads=72)):
    rev_num += 1
    for sent in parsed_review.sents:
        sent_num += 1
        # lemmatize tokens & save corresponding pos tag after filtering whitespace and punctuations
        lemmatized_sent = [(token.lemma_, token.pos_) for token in sent if not (token.is_space or token.is_punct)]
        if len(lemmatized_sent) != 0:
            unigram_sents_pos.append([rev_num, sent_num, lemmatized_sent])

print('current rev_num: ', rev_num)
print('current sent_num: ', sent_num)

84829it [13:31, 104.52it/s]

current rev_num:  584829
current sent_num:  3605491





In [50]:
# Save progress...
review_number = [row[0] for row in unigram_sents_pos]
sentence_number = [row[1] for row in unigram_sents_pos]
words_joined_all = []
pos_joined_all = []
for sent in unigram_sents_pos:
    word_pos = sent[2]
    word_list = [word for word, pos in word_pos]
    pos_list = [pos for word, pos in word_pos]
    words_joined = '+-+||+-+'.join(word for word in word_list)
    pos_joined   = '+-+||+-+'.join(pos for pos in pos_list)
    words_joined_all.append(words_joined)
    pos_joined_all.append(pos_joined)
    
unigram_sentences_savedf = pd.DataFrame({'review_number': review_number,
                                         'sentence_number': sentence_number,
                                         'unigram_sentences': words_joined_all,
                                         'unigram_pos': pos_joined_all})

save_df_s3(unigram_sentences_savedf, bucket_name, 'amazon_reviews/unigram_sentences.feather')

In [21]:
# DON'T LOAD THIS FILE - there's a _v1 version further down!
# del unigram_sentences_savedf
unigram_sentences_savedf = load_df_s3(bucket_name, 'amazon_reviews/unigram_sentences.feather', filetype='feather')

In [22]:
unigram_sentences_savedf.head()

Unnamed: 0,review_number,sentence_number,unigram_pos,unigram_sentences
0,1,1,NOUN+-+||+-+DET+...,dpe+-+||+-+the+-...
1,1,2,NOUN+-+||+-+NOUN...,b+-+||+-+flax+-+...
2,1,3,PRON+-+||+-+VERB...,-PRON-+-+||+-+do...
3,1,4,PRON+-+||+-+VERB...,-PRON-+-+||+-+be...
4,1,5,ADJ+-+||+-+NOUN+...,good+-+||+-+prod...


In [23]:
unigram_sentences_savedf[unigram_sentences_savedf.unigram_pos == ''].shape

(0, 4)

In [24]:
unigram_sentences_savedf[unigram_sentences_savedf.unigram_sentences == ''].shape

(0, 4)

In [25]:
unigram_sentences_savedf[unigram_sentences_savedf.unigram_pos == ''].head()  # no blank sentences

Unnamed: 0,review_number,sentence_number,unigram_pos,unigram_sentences


In [26]:
unigram_sentences_savedf[unigram_sentences_savedf.unigram_pos == ''].shape

(0, 4)

#### Additional Data Cleaning

In [27]:
def clean_up(sentence, sentence_pos, sep):
    """Expects a sentence as a single string as input 1, and its corresponding part-of-speech tags as input 2 (also single string).
    sep is the string pattern used to separate words in each sentence string
    Cleans it up and returns a single string.
    Also updates corresponding part-of-speech string.
    """
    # get rid of webpage links
    cond = ['http' in sentence, 'www' in sentence]
    if any(cond):
        words = sentence.split(sep)
        words_pos = sentence_pos.split(sep)
        to_remove = []
        for i in range(len(words)):
            cond_word = ['http' in words[i], 'www' in words[i]]
            if any(cond_word):
                to_remove.append(i)
        # remove words that are links
        for j in sorted(to_remove, reverse=True):
            del words[j]
            del words_pos[j]
        # reconstruct sentence after deleting links
        sentence = sep.join(words)
        sentence_pos = sep.join(words_pos)

    # replace underscores with blanks to avoid mix-up with paired words later
    # cannot replace with spaces because the strings are split on spaces later 
    # and this would create new words with no corresponding pos tags
    if '_' in sentence:
        sentence = sentence.replace('_', '')
    return sentence, sentence_pos

In [29]:
test_clean = ['whoa watch out for them links boy http://sup.com and also BAM! underscore_time!', 'this is a normal sentence', 
              '__ what is this ____ http', '_', 'http']
test_clean

['whoa watch out for them links boy http://sup.com and also BAM! underscore_time!',
 'this is a normal sentence',
 '__ what is this ____ http',
 '_',
 'http']

In [30]:
test_clean_pos = ['X X X X X X X X X X X X', 'X X X X X', 'X X X X X X', 'X', 'X']

In [31]:
[len(e.split(' ')) for e in test_clean]

[12, 5, 6, 1, 1]

In [32]:
[e.count('X') for e in test_clean_pos]

[12, 5, 6, 1, 1]

In [33]:
# check if clean_up works as expected
to_remove = []
for i in range(len(test_clean)):
    sentence = test_clean[i]
    sentence_pos = test_clean_pos[i]
    test_clean[i], test_clean_pos[i] = clean_up(sentence, sentence_pos, sep=' ')
    
    # mark elements to delete if empty
    if test_clean[i] == '':
        to_remove.append(i)

# delete elements that are empty
for j in sorted(to_remove, reverse=True):
    del test_clean[j]
    del test_clean_pos[j]

test_clean

['whoa watch out for them links boy and also BAM! underscoretime!',
 'this is a normal sentence',
 ' what is this ']

In [34]:
test_clean_pos

['X X X X X X X X X X X', 'X X X X X', 'X X X X X']

In [35]:
[e.count('X') for e in test_clean_pos]

[11, 5, 5]

In [36]:
[len(e.split(' ')) for e in test_clean]

[11, 5, 5]

In [37]:
words_joined_all = unigram_sentences_savedf.unigram_sentences.tolist()

In [38]:
pos_joined_all = unigram_sentences_savedf.unigram_pos.tolist()

In [39]:
len(words_joined_all)

3599392

In [40]:
len([sentence for sentence in words_joined_all if '_' in sentence])

605

In [41]:
len([sentence for sentence in words_joined_all if 'http' in sentence])

513

In [42]:
len([sentence for sentence in words_joined_all if 'www' in sentence])

630

In [43]:
unigram_sentences_savedf[unigram_sentences_savedf.unigram_sentences.str.contains('_')].head()

Unnamed: 0,review_number,sentence_number,unigram_pos,unigram_sentences
7532,1290,7548,X,http://www.amazo...
16153,2775,16180,DET+-+||+-+NOUN+...,no+-+||+-+jet_la...
16602,2837,16629,PRON+-+||+-+VERB...,-PRON-+-+||+-+do...
22975,3833,23010,PRON+-+||+-+VERB...,-PRON-+-+||+-+be...
25080,4169,25118,ADJ+-+||+-+PART+...,easy+-+||+-+to+-...


In [44]:
[sentence for sentence in words_joined_all if '_' in sentence][:10]

['http://www.amazon.com/gp/product/b0000533z8/ref=cm_cr_rev_prod_title',
 'no+-+||+-+jet_lag+-+||+-+pill',
 "-PRON-+-+||+-+do+-+||+-+recommend+-+||+-+women+-+||+-+'s+-+||+-+one+-+||+-+a_day+-+||+-+though+-+||+-+with+-+||+-+extra+-+||+-+calcium",
 '-PRON-+-+||+-+be+-+||+-+less+-+||+-+money+-+||+-+and+-+||+-+good+-+||+-+quality+-+||+-+https://www.amazon.com/review/review-your-purchases/ref=pe_6680_116681230_cm_add_2_star3?_encoding=utf8&asins;=b0000ccw1n%3a3%2cb000sar2dk&channel;=ec_phy&crauthtoken;=ge5g%2bbf%2btr%2f%2fdliytbmmzxn6ajjlfxjdtx902p0aaaadaaaaafnfv%2bbyyxcaaaaa&customerid;=a1pansxlpbgvng#top',
 'easy+-+||+-+to+-+||+-+use_work+-+||+-+well',
 '-PRON-+-+||+-+have+-+||+-+have+-+||+-+pedometer+-+||+-+in+-+||+-+the+-+||+-+past_all+-+||+-+difficult+-+||+-+and+-+||+-+confusing+-+||+-+to+-+||+-+use+-+||+-+to+-+||+-+the+-+||+-+point+-+||+-+-PRON-+-+||+-+simply+-+||+-+give+-+||+-+up+-+||+-+on+-+||+-+-PRON-',
 'overall+-+||+-+-PRON-+-+||+-+mother+-+||+-+be+-+||+-+very+-+||+-+satisfied+-+

In [45]:
# clean up all unigrams
to_remove = []
for i in range(len(words_joined_all)):
    sentence = words_joined_all[i]
    sentence_pos = pos_joined_all[i]
    words_joined_all[i], pos_joined_all[i] = clean_up(sentence, sentence_pos, sep='+-+||+-+')
    
    # mark elements to delete if empty
    if words_joined_all[i] == '':
        to_remove.append(i)

# delete elements that are empty
for j in sorted(to_remove, reverse=True):
    del words_joined_all[j]
    del pos_joined_all[j]

In [46]:
# drop rows from unigram_sentences_savedf corresponding to the row numbers (indices) of sentences
# that will be blank after the transformation above
unigram_sentences_savedf.drop(unigram_sentences_savedf.index[to_remove], axis=0, inplace=True)

In [47]:
unigram_sentences_savedf.drop(['unigram_sentences'], axis=1, inplace=True)
unigram_sentences_savedf.drop(['unigram_pos'], axis=1, inplace=True)

In [48]:
unigram_sentences_savedf['unigram_sentences'] = words_joined_all
unigram_sentences_savedf['unigram_pos'] = pos_joined_all

In [49]:
unigram_sentences_savedf.head()

Unnamed: 0,review_number,sentence_number,unigram_sentences,unigram_pos
0,1,1,dpe+-+||+-+the+-...,NOUN+-+||+-+DET+...
1,1,2,b+-+||+-+flax+-+...,NOUN+-+||+-+NOUN...
2,1,3,-PRON-+-+||+-+do...,PRON+-+||+-+VERB...
3,1,4,-PRON-+-+||+-+be...,PRON+-+||+-+VERB...
4,1,5,good+-+||+-+prod...,ADJ+-+||+-+NOUN+...


In [50]:
unigram_sentences_savedf.shape

(3599286, 4)

In [51]:
# updated, cleaned up version of unigram_sentences.feather
save_df_s3(unigram_sentences_savedf, bucket_name, 'amazon_reviews/unigram_sentences_v1.feather')

### Phrase Detection

In [21]:
unigram_sentences_savedf = load_df_s3(bucket_name, 'amazon_reviews/unigram_sentences_v1.feather', filetype='feather')

In [22]:
words_joined_all = unigram_sentences_savedf.unigram_sentences.tolist()

In [52]:
unigram_sentences = [sentence.split('+-+||+-+') for sentence in words_joined_all]

In [53]:
print(unigram_sentences[:4])

[['dpe', 'the', 'job', 'well'], ['b', 'flax', 'd', 'be', 'a', 'regular', 'at', '-PRON-', 'house'], ['-PRON-', 'do', '-PRON-', 'job', 'simply', 'and', 'with', 'good', 'result'], ['-PRON-', 'be', 'reasonable', 'last', 'a', 'long', 'time', 'and', 'be', 'able', 'to', 'be', 'obtain', 'with', 'free', 'shipping', 'if', '-PRON-', 'hunt', 'around']]


In [54]:
len(words_joined_all)

3599286

In [55]:
%%time
# The common_terms parameter add a way to give special treatment to common terms 
# (aka stop words) such that their presence between two words won’t prevent bigram detection. 
# It allows to detect expressions like “bank of america”
common_terms = ["of", "with", "without", "and", "or"]

# Train a first-order phrase detector
bigram_model = Phrases(unigram_sentences, threshold=0.6, scoring='npmi', common_terms=common_terms)

# Transform unigram sentences into bigram sentences
# Paired words are connected by an underscore, e.g. ice_cream
bigram_sentences = []
for sentence in unigram_sentences:
    bigram_sentences.append(bigram_model[sentence])



CPU times: user 3min 36s, sys: 2.58 s, total: 3min 39s
Wall time: 3min 39s


In [56]:
%%time
# Train a second-order phrase detector
# trigram_model = Phrases(bigram_sentences, min_count=5)
trigram_model = Phrases(bigram_sentences, threshold=0.5, scoring='npmi')

# Transform bigram sentences into trigram sentences
trigram_sentences = []
for sentence in bigram_sentences:
    trigram_sentences.append(trigram_model[sentence])

# remove any remaining stopwords
# trigram_sentences = [[word for word in sentence if word not in nlp.Defaults.stop_words] for sentence in trigram_sentences]



CPU times: user 3min 36s, sys: 3.18 s, total: 3min 40s
Wall time: 3min 40s


In [57]:
# the trigrams will be saved in a dataframe with a single column.
# each row is one sentence from any review
# each sentence is a single string separated by a single space.
trigram_sentences_savedf = pd.DataFrame([u'+-+||+-+'.join(sentence) for sentence in trigram_sentences], columns=['preprocessed_review'])
save_df_s3(trigram_sentences_savedf, bucket_name, 'amazon_reviews/preprocessed_reviews.feather')

In [5]:
trigram_sentences_savedf = load_df_s3(bucket_name, 'amazon_reviews/preprocessed_reviews.feather', filetype='feather')

In [58]:
trigram_sentences_savedf.head()

Unnamed: 0,preprocessed_review
0,dpe+-+||+-+the+-...
1,b+-+||+-+flax+-+...
2,-PRON-+-+||+-+do...
3,-PRON-+-+||+-+be...
4,good+-+||+-+prod...


In [8]:
# trigram_sentences = trigram_sentences_savedf.preprocessed_review.tolist()

In [10]:
# len(trigram_sentences)

3605491

In [59]:
del unigram_sentences_savedf
unigram_sents_pos_df = load_df_s3(bucket_name, 'amazon_reviews/unigram_sentences_v1.feather', filetype='feather')

In [60]:
unigram_sents_pos_df.head()

Unnamed: 0,review_number,sentence_number,unigram_sentences,unigram_pos
0,1,1,dpe+-+||+-+the+-...,NOUN+-+||+-+DET+...
1,1,2,b+-+||+-+flax+-+...,NOUN+-+||+-+NOUN...
2,1,3,-PRON-+-+||+-+do...,PRON+-+||+-+VERB...
3,1,4,-PRON-+-+||+-+be...,PRON+-+||+-+VERB...
4,1,5,good+-+||+-+prod...,ADJ+-+||+-+NOUN+...


In [61]:
unigram_sents_pos_df[unigram_sents_pos_df.unigram_pos == ''].shape

(0, 4)

In [62]:
unigram_sents_pos_df.shape

(3599286, 4)

In [63]:
unigram_sents_pos_df = pd.merge(unigram_sents_pos_df, trigram_sentences_savedf, how='inner', left_index=True, right_index=True)

In [64]:
unigram_sents_pos_df.head(10)

Unnamed: 0,review_number,sentence_number,unigram_sentences,unigram_pos,preprocessed_review
0,1,1,dpe+-+||+-+the+-...,NOUN+-+||+-+DET+...,dpe+-+||+-+the+-...
1,1,2,b+-+||+-+flax+-+...,NOUN+-+||+-+NOUN...,b+-+||+-+flax+-+...
2,1,3,-PRON-+-+||+-+do...,PRON+-+||+-+VERB...,-PRON-+-+||+-+do...
3,1,4,-PRON-+-+||+-+be...,PRON+-+||+-+VERB...,-PRON-+-+||+-+be...
4,1,5,good+-+||+-+prod...,ADJ+-+||+-+NOUN+...,good+-+||+-+prod...
5,2,6,fast+-+||+-+ship...,ADJ+-+||+-+NOUN+...,fast_shipping+-+...
6,2,7,study+-+||+-+sho...,NOUN+-+||+-+VERB...,study+-+||+-+sho...
7,2,8,hardly+-+||+-+an...,ADV+-+||+-+DET+-...,hardly+-+||+-+an...
8,2,9,this+-+||+-+comp...,DET+-+||+-+NOUN+...,this+-+||+-+comp...
9,2,10,-PRON-+-+||+-+ca...,PRON+-+||+-+VERB...,-PRON-+-+||+-+ca...


In [65]:
save_df_s3(unigram_sents_pos_df, bucket_name, 'amazon_reviews/preprocessed_reviews_v1.feather')

In [113]:
unigram_sents_pos_df = load_df_s3(bucket_name, 'amazon_reviews/preprocessed_reviews_v1.feather', filetype='feather')

In [66]:
unigram_sents_pos_df.shape

(3599286, 5)

In [68]:
unigram_sents_pos_df.iloc[400:410]

Unnamed: 0,review_number,sentence_number,unigram_sentences,unigram_pos,preprocessed_review
400,70,401,-PRON-+-+||+-+do...,PRON+-+||+-+VERB...,-PRON-+-+||+-+do...
401,70,402,-PRON-+-+||+-+th...,PRON+-+||+-+VERB...,-PRON-+-+||+-+th...
402,70,403,do+-+||+-+not+-+...,VERB+-+||+-+ADV+...,do_not+-+||+-+re...
403,71,404,mould+-+||+-+mot...,VERB+-+||+-+PROP...,mould_motion+-+|...
404,71,405,-PRON-+-+||+-+do...,PRON+-+||+-+VERB...,-PRON-+-+||+-+do...
405,71,406,and+-+||+-+i+-+|...,CCONJ+-+||+-+PRO...,and+-+||+-+i+-+|...
406,71,407,and+-+||+-+besid...,CCONJ+-+||+-+ADP...,and+-+||+-+besid...
407,71,408,just+-+||+-+too+...,ADV+-+||+-+ADV+-...,just+-+||+-+too_...
408,71,409,do+-+||+-+not+-+...,VERB+-+||+-+ADV+...,do_not+-+||+-+buy
409,72,410,be+-+||+-+a+-+||...,VERB+-+||+-+DET+...,be+-+||+-+a+-+||...


In [69]:
unigram_sents_pos_df.isnull().sum()

review_number          0
sentence_number        0
unigram_sentences      0
unigram_pos            0
preprocessed_review    0
dtype: int64

In [70]:
unigram_sents_pos_df['has_paired_words'] = 0

In [71]:
unigram_sents_pos_df.loc[unigram_sents_pos_df.preprocessed_review.str.contains('_'), ['has_paired_words']] = 1

In [72]:
unigram_sents_pos_df.has_paired_words.sum()  # number of sentences with paired words

1565595

In [73]:
unigram_sents_pos_df.head()

Unnamed: 0,review_number,sentence_number,unigram_sentences,unigram_pos,preprocessed_review,has_paired_words
0,1,1,dpe+-+||+-+the+-...,NOUN+-+||+-+DET+...,dpe+-+||+-+the+-...,0
1,1,2,b+-+||+-+flax+-+...,NOUN+-+||+-+NOUN...,b+-+||+-+flax+-+...,0
2,1,3,-PRON-+-+||+-+do...,PRON+-+||+-+VERB...,-PRON-+-+||+-+do...,0
3,1,4,-PRON-+-+||+-+be...,PRON+-+||+-+VERB...,-PRON-+-+||+-+be...,1
4,1,5,good+-+||+-+prod...,ADJ+-+||+-+NOUN+...,good+-+||+-+prod...,0


In [74]:
%%time
unigram_sents_pos_df.unigram_pos = [sent.split('+-+||+-+') for sent in unigram_sents_pos_df.unigram_pos.tolist()]
unigram_sents_pos_df.unigram_sentences = [sent.split('+-+||+-+') for sent in unigram_sents_pos_df.unigram_sentences.tolist()]
unigram_sents_pos_df.preprocessed_review = [sent.split('+-+||+-+') for sent in unigram_sents_pos_df.preprocessed_review.tolist()]

CPU times: user 30 s, sys: 5.74 s, total: 35.7 s
Wall time: 35.7 s


In [75]:
unigram_sents_pos_df.iloc[400:410]

Unnamed: 0,review_number,sentence_number,unigram_sentences,unigram_pos,preprocessed_review,has_paired_words
400,70,401,"[-PRON-, do, not...","[PRON, VERB, ADV...","[-PRON-, do_not,...",1
401,70,402,"[-PRON-, think, ...","[PRON, VERB, ADJ...","[-PRON-, think, ...",0
402,70,403,"[do, not, recomm...","[VERB, ADV, VERB...","[do_not, recomme...",1
403,71,404,"[mould, motion, ...","[VERB, PROPN, NU...","[mould_motion, 5...",1
404,71,405,"[-PRON-, do, not...","[PRON, VERB, ADV...","[-PRON-, do_not,...",1
405,71,406,"[and, i, have, -...","[CCONJ, PRON, VE...","[and, i, have, -...",0
406,71,407,"[and, besides, -...","[CCONJ, ADP, PRO...","[and, besides, -...",1
407,71,408,"[just, too, much...","[ADV, ADV, ADJ, ...","[just, too_much,...",1
408,71,409,"[do, not, buy]","[VERB, ADV, VERB]","[do_not, buy]",1
409,72,410,"[be, a, gift]","[VERB, DET, NOUN]","[be, a, gift]",0


In [76]:
unigram_sents_pos_df.isnull().sum()

review_number          0
sentence_number        0
unigram_sentences      0
unigram_pos            0
preprocessed_review    0
has_paired_words       0
dtype: int64

Let's look at an arbitrary sentence and it's transformation:

In [77]:
print(unigram_sents_pos_df.unigram_sentences.iloc[105])

['liver', 'support', 'supports', 'liver', 'function', 'stimulate', 'des', 'intoxication', 'and', 'restore', 'liver', 'function', 'eliminate', 'harmful', 'metabolite']


In [78]:
print(unigram_sents_pos_df.unigram_pos.iloc[105])

['PROPN', 'PROPN', 'PROPN', 'NOUN', 'NOUN', 'VERB', 'X', 'NOUN', 'CCONJ', 'VERB', 'NOUN', 'NOUN', 'VERB', 'ADJ', 'NOUN']


In [79]:
print(unigram_sents_pos_df.preprocessed_review.iloc[105])

['liver', 'support', 'supports', 'liver_function', 'stimulate_des_intoxication', 'and', 'restore', 'liver_function', 'eliminate', 'harmful', 'metabolite']


In [80]:
gramlist = [word for sent in trigram_sentences for word in sent if '_' in word]

In [81]:
paired_words_frq = Counter(gramlist)
paired_words_frq.most_common(100)

[('do_not', 268437),
 ('this_product', 207554),
 ('can_not', 45528),
 ('great_product', 41158),
 ('a_few', 38755),
 ('weight_loss', 35341),
 ('so_far', 29550),
 ('lot_of', 28975),
 ('as_well', 26527),
 ('at_all', 25321),
 ('this_stuff', 23679),
 ('highly_recommend', 23157),
 ('lose_weight', 23118),
 ('fish_oil', 22805),
 ('side_effect', 17882),
 ('would_recommend', 16050),
 ('at_least', 14776),
 ('will_continue', 14454),
 ('along_with', 13198),
 ('per_day', 11691),
 ('every_day', 11147),
 ('garcinia_cambogia', 10208),
 ('at_night', 8951),
 ('very_happy', 8355),
 ('too_much', 8303),
 ('year_ago', 8008),
 ('no_side_effect', 7872),
 ('high_quality', 7664),
 ('energy_level', 7583),
 ('vitamin_d', 7483),
 ('vitamin_c', 7408),
 ('year_old', 7247),
 ('run_out', 7056),
 ('no_longer', 7045),
 ('five_star', 6783),
 ('dr._oz', 6583),
 ('suffer_from', 6446),
 ('wake_up', 6439),
 ('immune_system', 6191),
 ('krill_oil', 6048),
 ('customer_service', 5996),
 ('even_though', 5805),
 ('omega_3', 5641),


In [82]:
# Find the 100 most infrequent paired words
paired_words_frq.most_common()[::-1][:100]

[('overturn_conventional', 1),
 ('portion_veep_university---', 1),
 ('expereienc_with_veep', 1),
 ('representation_veep', 1),
 ('veep_lookcut_program', 1),
 ('enthusiast_mtn_bike', 1),
 ('mountain_biking_rowing', 1),
 ('decker_cheeseburger', 1),
 ('tri_atholon', 1),
 ('8220_recommended&#8221', 1),
 ('trade_show&#8230', 1),
 ('go!upon_arrival', 1),
 ('sharp_edges2', 1),
 ('crash_dieting).in_conclusion', 1),
 ('wishful_thinking!ftc_disclosure', 1),
 ('34;healthy_fat&#34', 1),
 ('atrail_fibrillationso', 1),
 ('holy_cr*p', 1),
 ('bootle_of_uberday_women', 1),
 ('superior_product!paula', 1),
 ('deem_morbidly_obese', 1),
 ('ever!!!highly_recommended', 1),
 ('george_flansbaum_whom', 1),
 ('onelife_pharma', 1),
 ('melissa_jones', 1),
 ('coco_mak_seriously', 1),
 ('i&#8217;m_assuming', 1),
 ('yeast_infection_every2', 1),
 ('a++standadrized_forskolin_excellent!.', 1),
 ('brazilian_jiujitsu_brown', 1),
 ('wrestling_and_bjj', 1),
 ('garcinia_cambhogia', 1),
 ('protease_enzym', 1),
 ('camp_induceme

In [83]:
len(paired_words_frq)  # number of paired terms  (this drops down to 46,785 after further processing)

161028

In [84]:
unigram_sents_pos_df.head()

Unnamed: 0,review_number,sentence_number,unigram_sentences,unigram_pos,preprocessed_review,has_paired_words
0,1,1,"[dpe, the, job, ...","[NOUN, DET, NOUN...","[dpe, the, job, ...",0
1,1,2,"[b, flax, d, be,...","[NOUN, NOUN, NOU...","[b, flax, d, be,...",0
2,1,3,"[-PRON-, do, -PR...","[PRON, VERB, ADJ...","[-PRON-, do, -PR...",0
3,1,4,"[-PRON-, be, rea...","[PRON, VERB, ADJ...","[-PRON-, be, rea...",1
4,1,5,"[good, product, ...","[ADJ, NOUN, ADJ,...","[good, product, ...",0


In [90]:
def handle_failed_pairing(i, skip, num_paired, sent, sent_paired, to_remove):
    # split up paired words failing our format requirements
    to_remove.extend([i])
    sent_paired.extend(sent[i + skip: i + skip + num_paired])


def filter_pairs(sent, sent_paired, sent_pos):
    """modify sent_paired in place"""
    paired_sent_len = len(sent_paired)
    skip = 0
    to_remove = []
    
    for i in range(paired_sent_len):
        word = sent_paired[i]
        if '_' in word:
            num_paired = word.count('_') + 1
            
            # more than 3 words paired - ignore pairing
            if num_paired > 3:
                handle_failed_pairing(i, skip, num_paired, sent, sent_paired, to_remove)
                skip += num_paired - 1
                continue
            
            # bigrams: noun/adj, noun
            elif num_paired == 2:
                pos_word_1 = sent_pos[i + skip]
                pos_word_2 = sent_pos[i + skip + 1]
                cond = (pos_word_1 in ('NOUN', 'ADJ'), pos_word_2 == 'NOUN')
                if not all(cond):
                    handle_failed_pairing(i, skip, num_paired, sent, sent_paired, to_remove)
                    skip += num_paired - 1
                    continue
            
            # trigrams: noun/adj, all types, noun/adj
            elif num_paired == 3:
                pos_word_1 = sent_pos[i + skip]
                pos_word_2 = sent_pos[i + skip + 1]
                pos_word_3 = sent_pos[i + skip + 2]
                cond = (pos_word_1 in ('NOUN', 'ADJ'), pos_word_3 in ('NOUN', 'ADJ'))
                if not all(cond):
                    handle_failed_pairing(i, skip, num_paired, sent, sent_paired, to_remove)
                    skip += num_paired - 1
                    continue
        
            # num. of words to skip indexing over sent and sent_pos in the next iter
            skip += num_paired - 1
        
    # remove rejected pairs that are already split and added back individually
    if len(to_remove) > 0:
        for j in sorted(to_remove, reverse=True):
            del sent_paired[j]

**Test the filtering function:**

Test 1:

In [91]:
sent = ['liver', 'support', 'supports', 'liver', 'function', 'stimulate', 'des', 'intoxication', 'and', 'restore', 'liver', 'function', 'eliminate', 'harmful', 'metabolite']
print(sent)

['liver', 'support', 'supports', 'liver', 'function', 'stimulate', 'des', 'intoxication', 'and', 'restore', 'liver', 'function', 'eliminate', 'harmful', 'metabolite']


In [92]:
sent_pos = ['PROPN', 'PROPN', 'PROPN', 'NOUN', 'NOUN', 'VERB', 'X', 'NOUN', 'CCONJ', 'VERB', 'NOUN', 'NOUN', 'VERB', 'ADJ', 'NOUN']
print(sent_pos)

['PROPN', 'PROPN', 'PROPN', 'NOUN', 'NOUN', 'VERB', 'X', 'NOUN', 'CCONJ', 'VERB', 'NOUN', 'NOUN', 'VERB', 'ADJ', 'NOUN']


In [93]:
sent_paired = ['liver', 'support', 'supports', 'liver_function', 'stimulate_des_intoxication_and_restore', 'liver_function', 'eliminate', 'harmful', 'metabolite']
print(sent_paired)

['liver', 'support', 'supports', 'liver_function', 'stimulate_des_intoxication_and_restore', 'liver_function', 'eliminate', 'harmful', 'metabolite']


In [94]:
filter_pairs(sent, sent_paired, sent_pos)

In [95]:
# Expected output:
print(['liver', 'support', 'supports', 'liver_function', 'liver_function', 'eliminate', 'harmful', 'metabolite', 'stimulate', 'des', 'intoxication', 'and', 'restore'])

['liver', 'support', 'supports', 'liver_function', 'liver_function', 'eliminate', 'harmful', 'metabolite', 'stimulate', 'des', 'intoxication', 'and', 'restore']


In [96]:
print(sent_paired)

['liver', 'support', 'supports', 'liver_function', 'liver_function', 'eliminate', 'harmful', 'metabolite', 'stimulate', 'des', 'intoxication', 'and', 'restore']


Test 2:

In [97]:
sent = ['-PRON-', 'have', 'a', 'lot', 'more', 'energy', 'and', 'have', 'not', 'be', 'sick', 'at', 'all']
print(sent)

['-PRON-', 'have', 'a', 'lot', 'more', 'energy', 'and', 'have', 'not', 'be', 'sick', 'at', 'all']


In [98]:
sent_pos = ['PRON', 'VERB', 'DET', 'NOUN', 'ADJ', 'NOUN', 'CCONJ', 'VERB', 'ADV', 'VERB', 'ADJ', 'ADV', 'ADV']
print(sent_pos)

['PRON', 'VERB', 'DET', 'NOUN', 'ADJ', 'NOUN', 'CCONJ', 'VERB', 'ADV', 'VERB', 'ADJ', 'ADV', 'ADV']


In [99]:
sent_paired = ['-PRON-', 'have', 'a_lot', 'more_energy', 'and', 'have', 'not', 'be', 'sick', 'at_all']
print(sent_paired)

['-PRON-', 'have', 'a_lot', 'more_energy', 'and', 'have', 'not', 'be', 'sick', 'at_all']


In [100]:
filter_pairs(sent, sent_paired, sent_pos)

In [101]:
print(sent_paired)

['-PRON-', 'have', 'more_energy', 'and', 'have', 'not', 'be', 'sick', 'a', 'lot', 'at', 'all']


### Filter Phrases

In [102]:
unigram_sents_pos_df.head()

Unnamed: 0,review_number,sentence_number,unigram_sentences,unigram_pos,preprocessed_review,has_paired_words
0,1,1,"[dpe, the, job, ...","[NOUN, DET, NOUN...","[dpe, the, job, ...",0
1,1,2,"[b, flax, d, be,...","[NOUN, NOUN, NOU...","[b, flax, d, be,...",0
2,1,3,"[-PRON-, do, -PR...","[PRON, VERB, ADJ...","[-PRON-, do, -PR...",0
3,1,4,"[-PRON-, be, rea...","[PRON, VERB, ADJ...","[-PRON-, be, rea...",1
4,1,5,"[good, product, ...","[ADJ, NOUN, ADJ,...","[good, product, ...",0


In [103]:
preprocessed_reviews = unigram_sents_pos_df.preprocessed_review.tolist()
unigram_sentences = unigram_sents_pos_df.unigram_sentences.tolist()
unigram_pos = unigram_sents_pos_df.unigram_pos.tolist()
has_paired_words = unigram_sents_pos_df.has_paired_words.tolist()

In [107]:
# get rid of paired words from the corpus which
# (1) have more than 3 words joined
# (2) bigrams not in the format: noun/adj, noun
# (3) trigrams not in the format: noun/adj, all types, noun/adj
for i in tqdm(range(len(preprocessed_reviews))):
    if has_paired_words[i] == 1:
        filter_pairs(sent=unigram_sentences[i], sent_paired=preprocessed_reviews[i], sent_pos=unigram_pos[i])

100%|██████████| 3599286/3599286 [00:07<00:00, 503726.93it/s]


In [None]:
# save picked dataframe to S3.  Pickle format allows the columns to store lists
save_df_s3(unigram_sents_pos_df, bucket_name, filepath='amazon_reviews/preprocessed_reviews_v2.pkl', filetype='pickle')

In [5]:
# load from the pickled dataframe on S3
unigram_sents_pos_df = load_df_s3(bucket_name, filepath='amazon_reviews/preprocessed_reviews_v2.pkl', filetype='pickle')

In [6]:
unigram_sents_pos_df.head()

Unnamed: 0,review_number,sentence_number,unigram_sentences,unigram_pos,preprocessed_review,has_paired_words
0,1,1,"[dpe, the, job, well]","[NOUN, DET, NOUN, ADV]","[dpe, the, job, well]",0
1,1,2,"[b, flax, d, be, a, regular, at, -PRON-, house]","[NOUN, NOUN, NOUN, VERB, DET, ADJ, ADP, ADJ, N...","[b, flax, d, be, a, regular, at, -PRON-, house]",0
2,1,3,"[-PRON-, do, -PRON-, job, simply, and, with, g...","[PRON, VERB, ADJ, NOUN, ADV, CCONJ, ADP, ADJ, ...","[-PRON-, do, -PRON-, job, simply, and, with, g...",0
3,1,4,"[-PRON-, be, reasonable, last, a, long, time, ...","[PRON, VERB, ADJ, VERB, DET, ADJ, NOUN, CCONJ,...","[-PRON-, be, reasonable, last, a, long, time, ...",1
4,1,5,"[good, product, good, price, good, result]","[ADJ, NOUN, ADJ, NOUN, ADJ, NOUN]","[good, product, good, price, good, result]",0


In [7]:
unigram_sents_pos_df.shape

(3599286, 6)

In [9]:
preprocessed_review_updated = unigram_sents_pos_df.preprocessed_review.tolist()

In [109]:
len(preprocessed_review_updated)

3599286

In [110]:
preprocessed_review_updated[:3]

[['dpe', 'the', 'job', 'well'],
 ['b', 'flax', 'd', 'be', 'a', 'regular', 'at', '-PRON-', 'house'],
 ['-PRON-', 'do', '-PRON-', 'job', 'simply', 'and', 'with', 'good', 'result']]

In [111]:
gramlist_updated = [word for sent in preprocessed_review_updated for word in sent if '_' in word]

In [112]:
paired_words_frq_updated = Counter(gramlist_updated)
paired_words_frq_updated.most_common(100)

[('great_product', 34189),
 ('weight_loss', 32942),
 ('fish_oil', 18131),
 ('side_effect', 17720),
 ('energy_level', 7491),
 ('high_quality', 7143),
 ('vitamin_d', 6331),
 ('immune_system', 5981),
 ('blood_pressure', 5431),
 ('customer_service', 5425),
 ('anyone_who', 5374),
 ('vitamin_c', 5201),
 ('waste_of_money', 4609),
 ('multi_vitamin', 4236),
 ('people_who', 4156),
 ('blood_sugar', 3939),
 ('little_bit', 3616),
 ('second_bottle', 3613),
 ('food_store', 3512),
 ('health_benefit', 3287),
 ('hot_flash', 3246),
 ('long_term', 3235),
 ('joint_pain', 3143),
 ('raspberry_ketone', 3095),
 ('appetite_suppressant', 2795),
 ('vitamin_e', 2624),
 ('gel_cap', 2529),
 ('green_tea', 2501),
 ('krill_oil', 2492),
 ('digestive_system', 2487),
 ('glass_of_water', 2234),
 ('small_amount', 2179),
 ('fast_shipping', 2164),
 ('whole_food', 2146),
 ('blood_test', 2113),
 ('expiration_date', 2030),
 ('fat_burner', 2026),
 ('huge_difference', 1994),
 ('protein_powder', 1952),
 ('acid_reflux', 1902),
 ('ne

In [113]:
len(paired_words_frq_updated)   # final number of cleaned-up paired words in the specified phrase format

46785

#### Final Clean-up: Remove Stop Words

In [8]:
unigram_sents_pos_df.head()

Unnamed: 0,review_number,sentence_number,unigram_sentences,unigram_pos,preprocessed_review,has_paired_words
0,1,1,"[dpe, the, job, well]","[NOUN, DET, NOUN, ADV]","[dpe, the, job, well]",0
1,1,2,"[b, flax, d, be, a, regular, at, -PRON-, house]","[NOUN, NOUN, NOUN, VERB, DET, ADJ, ADP, ADJ, N...","[b, flax, d, be, a, regular, at, -PRON-, house]",0
2,1,3,"[-PRON-, do, -PRON-, job, simply, and, with, g...","[PRON, VERB, ADJ, NOUN, ADV, CCONJ, ADP, ADJ, ...","[-PRON-, do, -PRON-, job, simply, and, with, g...",0
3,1,4,"[-PRON-, be, reasonable, last, a, long, time, ...","[PRON, VERB, ADJ, VERB, DET, ADJ, NOUN, CCONJ,...","[-PRON-, be, reasonable, last, a, long, time, ...",1
4,1,5,"[good, product, good, price, good, result]","[ADJ, NOUN, ADJ, NOUN, ADJ, NOUN]","[good, product, good, price, good, result]",0


In [21]:
unigram_sents_pos_df.shape

(3599286, 6)

In [12]:
preprocessed_review_final = [[word for word in sentence if word not in nlp.Defaults.stop_words] for sentence in preprocessed_review_updated]

In [14]:
unigram_sents_pos_df.drop(['preprocessed_review'], axis=1, inplace=True)
unigram_sents_pos_df['preprocessed_review'] = preprocessed_review_final
unigram_sents_pos_df.head()

Unnamed: 0,review_number,sentence_number,unigram_sentences,unigram_pos,has_paired_words,preprocessed_review
0,1,1,"[dpe, the, job, well]","[NOUN, DET, NOUN, ADV]",0,"[dpe, job]"
1,1,2,"[b, flax, d, be, a, regular, at, -PRON-, house]","[NOUN, NOUN, NOUN, VERB, DET, ADJ, ADP, ADJ, N...",0,"[b, flax, d, regular, -PRON-, house]"
2,1,3,"[-PRON-, do, -PRON-, job, simply, and, with, g...","[PRON, VERB, ADJ, NOUN, ADV, CCONJ, ADP, ADJ, ...",0,"[-PRON-, -PRON-, job, simply, good, result]"
3,1,4,"[-PRON-, be, reasonable, last, a, long, time, ...","[PRON, VERB, ADJ, VERB, DET, ADJ, NOUN, CCONJ,...",1,"[-PRON-, reasonable, long, time, able, obtain,..."
4,1,5,"[good, product, good, price, good, result]","[ADJ, NOUN, ADJ, NOUN, ADJ, NOUN]",0,"[good, product, good, price, good, result]"


In [17]:
# save picked dataframe to S3.  Pickle format allows the columns to store lists
save_df_s3(unigram_sents_pos_df, bucket_name, filepath='amazon_reviews/preprocessed_reviews_v3.pkl', filetype='pickle')

In [None]:
# load from the pickled dataframe on S3
unigram_sents_pos_df = load_df_s3(bucket_name, filepath='amazon_reviews/preprocessed_reviews_v3.pkl', filetype='pickle')

# Training the LDA Model

In [24]:
tokenized_reviews = unigram_sents_pos_df.preprocessed_review.tolist()

In [25]:
tokenized_reviews[:3]

[['dpe', 'job'],
 ['b', 'flax', 'd', 'regular', '-PRON-', 'house'],
 ['-PRON-', '-PRON-', 'job', 'simply', 'good', 'result']]

In [18]:
%%time
# we need to learn the full vocabulary of the corpus to be modeled
# learn the dictionary by iterating over all of the reviews
vocab_dictionary = Dictionary(tokenized_reviews)

CPU times: user 31.6 s, sys: 600 ms, total: 32.2 s
Wall time: 32.3 s


In [22]:
# filter tokens that are very rare or too common from
# the dictionary (filter_extremes) and reassign integer ids (compactify)
vocab_dictionary.filter_extremes(no_below=1000, no_above=0.6)
vocab_dictionary.compactify()   # remove gaps in id sequence after words that were removed

In [23]:
save_df_s3(vocab_dictionary, bucket_name, filepath='amazon_reviews/vocab_dictionary.dict', filetype='pickle')

In [182]:
vocab_dictionary = Dictionary.load('../vocab_dictionary.dict')  # load the finished dictionary from disk

In [26]:
bow_corpus = []    # bag-of-words representation of the corpus
for review in tokenized_reviews:
    bow_corpus.append(vocab_dictionary.doc2bow(review))

In [28]:
type(bow_corpus)

list

In [29]:
bow_corpus[:10]

[[(0, 1)],
 [(1, 1), (2, 1), (3, 1), (4, 1), (5, 1)],
 [(0, 1), (6, 1), (7, 1), (8, 1)],
 [(9, 1), (10, 1), (11, 1), (12, 1), (13, 1)],
 [(6, 3), (7, 1), (14, 1), (15, 1)],
 [(6, 1), (16, 1)],
 [(17, 2), (18, 1), (19, 1), (20, 1), (21, 1)],
 [(22, 1), (23, 1), (24, 1)],
 [(6, 1), (16, 1), (22, 1), (25, 1), (26, 1)],
 [(15, 1), (27, 1), (28, 1)]]