In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
from collections import Counter
import warnings

import spacy
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore
# import pyLDAvis
# import pyLDAvis.gensim

# import cPickle as pickle

from tqdm._tqdm_notebook import tqdm, tqdm_notebook, tnrange
from S3_read_write import load_df_s3, save_df_s3

from IPython.display import Image
from IPython.core.display import HTML

In [3]:
tqdm_notebook.pandas('Progress')

In [4]:
bucket_name = 'amazon-reviews-project'

# Load Amazon Reviews Data

In [65]:
reviews = load_df_s3(bucket_name, 'amazon_reviews/reviews_data_clean', filetype='text', sep='|')

In [31]:
reviews.shape    # 585,444 records

(585444, 8)

In [32]:
reviews.head()

Unnamed: 0,asin,helpful,reviewText,overall,summary,description,title,categories_clean
0,929619730,"[0, 0]",B-flax-D is a re...,5.0,Dpes the job well,Contains Organic...,New Generation B...,Health & Persona...
1,978559088,"[1, 1]",Studies show tha...,4.0,"Fast shipping, g...",Everyone knows t...,Nutrihill Resver...,Health & Persona...
2,978559088,"[1, 1]",I started taking...,5.0,Bioavailability ...,Everyone knows t...,Nutrihill Resver...,Health & Persona...
3,978559088,"[0, 1]",I tried Nutrihil...,1.0,Other Resveratro...,Everyone knows t...,Nutrihill Resver...,Health & Persona...
4,978559088,"[0, 0]",I really liked t...,5.0,I can't find thi...,Everyone knows t...,Nutrihill Resver...,Health & Persona...


In [33]:
reviews.dtypes

asin                 object
helpful              object
reviewText           object
overall             float64
summary              object
description          object
title                object
categories_clean     object
dtype: object

## Data Cleaning

In [52]:
reviews.categories_clean.unique()[:10]

array(['Health & Personal Care, Vitamins & Dietary Supplements, Multi & Prenatal Vitamins, Multiple Vitamin-Mineral Supplements',
       'Health & Personal Care, Vitamins & Dietary Supplements, Supplements, Antioxidants, Resveratrol',
       'Health & Personal Care, Vitamins & Dietary Supplements, Multi & Prenatal Vitamins, Multivitamins',
       'Health & Personal Care, Vitamins & Dietary Supplements, Vitamins, Vitamin B, B3 (Niacin)',
       'Health & Personal Care, Vitamins & Dietary Supplements, Herbal Supplements',
       'Health & Personal Care, Vitamins & Dietary Supplements, Herbal Supplements, Green Tea',
       'Health & Personal Care, Vitamins & Dietary Supplements, Weight Loss, Supplements, Green Coffee Bean Extract',
       'Health & Personal Care, Vitamins & Dietary Supplements, Weight Loss, Supplements',
       'Health & Personal Care, Vitamins & Dietary Supplements, Supplements, Antioxidants, CoQ10',
       'Health & Personal Care, Vitamins & Dietary Supplements, Herbal

The category list indicates that there may be some reviews in the dataset unrelated to health supplements.  Let's get rid of these.

In [35]:
reviews[reviews.categories_clean.str.contains('CDs & Vinyl')].title.unique()

array(['Liturgy of St. John Chrysostom', 'Origins',
       'Sounds of the Earth: Soft Ocean Sounds', 'Bali',
       'Tranquil Waters', 'Bach: St. John Passion, BWV 245',
       '21st Century Soul', 'Bodies for Strontium', "John's Bunch",
       'An Evening of Paganini', "John's Other Bunch",
       'Sus Mas Grandes Exitos', 'Complex Simplicity',
       'Kidnapped By Neptune', 'Roman Chant / Easter Vespers', 'Dead 60s',
       "Cilla in the 60's", 'Chromium', 'Letters From the Vitamin Sea',
       'The Stinging Nettles', 'Tendres Annees 60', 'Wehiwehi Hawaii',
       'none'], dtype=object)

In [36]:
len(reviews[reviews.categories_clean.str.contains('CDs & Vinyl')])

263

The product titles shown above are all music albums/songs.

In [37]:
reviews_filt = reviews[~(reviews.categories_clean.str.contains('CDs & Vinyl'))]   # remove rows with category including 'CDs & Vinyl'

In [51]:
reviews_filt.categories_clean.unique()[:10]

array(['Health & Personal Care, Vitamins & Dietary Supplements, Multi & Prenatal Vitamins, Multiple Vitamin-Mineral Supplements',
       'Health & Personal Care, Vitamins & Dietary Supplements, Supplements, Antioxidants, Resveratrol',
       'Health & Personal Care, Vitamins & Dietary Supplements, Multi & Prenatal Vitamins, Multivitamins',
       'Health & Personal Care, Vitamins & Dietary Supplements, Vitamins, Vitamin B, B3 (Niacin)',
       'Health & Personal Care, Vitamins & Dietary Supplements, Herbal Supplements',
       'Health & Personal Care, Vitamins & Dietary Supplements, Herbal Supplements, Green Tea',
       'Health & Personal Care, Vitamins & Dietary Supplements, Weight Loss, Supplements, Green Coffee Bean Extract',
       'Health & Personal Care, Vitamins & Dietary Supplements, Weight Loss, Supplements',
       'Health & Personal Care, Vitamins & Dietary Supplements, Supplements, Antioxidants, CoQ10',
       'Health & Personal Care, Vitamins & Dietary Supplements, Herbal

In [39]:
reviews_filt[reviews_filt.categories_clean.str.contains('Software')]

Unnamed: 0,asin,helpful,reviewText,overall,summary,description,title,categories_clean
3639,B00009QP4Q,"[2, 2]",The company has ...,5.0,lives up to its ...,Alpha Five's QLi...,none,Health & Persona...
50015,B0002TIEQQ,"[0, 0]",I ordered this f...,1.0,waste of money,Self help tutori...,none,Health & Persona...


In [40]:
reviews_filt = reviews_filt[~(reviews_filt.categories_clean.str.contains('Software'))]

In [41]:
len(reviews_filt)

585179

In [53]:
# Count the reviews whose product title mentions a pet-related word
# (this cell only counts them; the rows are dropped in a later cell using the same `pattern`).
# NOTE(review): the spaces around each word are literal, so a title that starts or ends with
# "pet"/"cat"/"dog", or that uses a plural such as "Dogs", is not matched by that alternative.
search_for = [' pet ', ' cat ', ' dog ']
pattern = '|'.join(search_for)
reviews_filt.title.str.contains(pattern, case=False).sum()

277

In [50]:
reviews_filt[reviews_filt.title.str.contains(pattern, case=False)]['title'].values[:10]

array(['Power - Mune Tuna Flavor Pet Herbal Supplement From Vetvittles.com',
       'Power - Mune Tuna Flavor Pet Herbal Supplement From Vetvittles.com',
       'Power - Mune Tuna Flavor Pet Herbal Supplement From Vetvittles.com',
       'AniMed Witch Hazel 86-Percent Multi-Species Pet Supplement',
       'AniMed Witch Hazel 86-Percent Multi-Species Pet Supplement',
       'AniMed Witch Hazel 86-Percent Multi-Species Pet Supplement',
       'AniMed Witch Hazel 86-Percent Multi-Species Pet Supplement',
       'AniMed Witch Hazel 86-Percent Multi-Species Pet Supplement',
       'AniMed Witch Hazel 86-Percent Multi-Species Pet Supplement',
       'Composure Liquid for Dogs and Cat (188 SERVINGS)'], dtype=object)

In [54]:
# Get rid of all pet products
reviews_filt = reviews_filt[~(reviews_filt.title.str.contains(pattern, case=False))]

In [55]:
# saving the cleaned dataframe
save_df_s3(df=reviews_filt, bucket_name=bucket_name, filepath='amazon_reviews/reviews_data_clean_v2.feather')

In [56]:
reviews_filt.asin.nunique()     # 48,501 unique products across 584,902 reviews (585,179 minus the 277 pet-product reviews)

48501

## Examine One Observation

In [57]:
example = reviews_filt.iloc[0]

In [58]:
example.asin     # Amazon Standard Identification Number

'0929619730'

In [59]:
example.title     # this is the product's name

'New Generation B-Flax-D'

In [60]:
example.categories_clean   # previously filtered/curated categories of interest

'Health & Personal Care, Vitamins & Dietary Supplements, Multi & Prenatal Vitamins, Multiple Vitamin-Mineral Supplements'

In [61]:
example.description       # product description provided by the seller

'Contains Organic Cold-Milled Flaxseed\nValuable source of soluble and insoluble fiber\nProvides Omega-3 essential fats, and many other nutrients to help achieve and maintain optimal bowel function.\n\nContains Vitamin B12\nB12 helps prevent nerve damage\nB12 aids in healthy cell formation.\nB12 helps prevent anemia\n\nContains Vitamin D\nVitamin D assists the body in the absorption of important minerals like calcium.\n\nContains Seleno-yeast\nA source of selenium, a mineral with powerful anti-viral and disease-fighting properties.\n\nContains Vitamin K2\nMenaQ7TM provides vitamin K2 (menaquinone), extracted and concentrated from natto without solvents. Vitamin K2 prevents arterial calcification and promotes strong bones by improving cross-linking of osteocalcin, a protein found in bones. The amount here has been clinically shown not to interfere with blood anti-coagulant medication. \n\nServing Size:\n1/4 Cup (30 Grams)\n\nServings Per Container:\n30 Servings per container\n\nNet Wt. 

In [62]:
example.summary      # review title

'Dpes the job well'

In [63]:
example.reviewText   # review content

'B-flax-D is a regular at our house. It does its job simply and with good results. It is reasonable, lasts a long time, and is able to be obtained with free shipping if you hunt around. Good product, good price, good results.'

Here's what the actual review looks like:

In [64]:
example.overall     # the rating provided by the reviewer

5.0

In [29]:
example.helpful

'[0, 0]'

In [27]:
Image(url= "images/amazon_review_screenshot.png")

# Data Pre-processing

Let us start off using only the title (`summary`) and body (`reviewText`) of each review.

In [147]:
%%time
df = load_df_s3(bucket_name, filepath='amazon_reviews/reviews_data_clean_v2.feather', filetype='feather')

CPU times: user 3.3 s, sys: 3.88 s, total: 7.18 s
Wall time: 24 s


In [148]:
df.dtypes

asin                 object
helpful              object
reviewText           object
overall             float64
summary              object
description          object
title                object
categories_clean     object
dtype: object

In [149]:
df.drop(['helpful', 'overall', 'title', 'categories_clean', 'description'], axis=1, inplace=True)

In [150]:
df.head()

Unnamed: 0,asin,reviewText,summary
0,929619730,B-flax-D is a regular at our house. It does it...,Dpes the job well
1,978559088,Studies show that Resveratrol is poorly absorb...,"Fast shipping, good communication"
2,978559088,I started taking this after both my parents di...,Bioavailability is the key
3,978559088,"I tried Nutrihill, but did not feel any of the...",Other Resveratrol Supplements are Better
4,978559088,I really liked this product because it stayed ...,"I can't find this product any longer, and I wi..."


In [151]:
# for each review, concatenate the review title and body
# (a missing summary or a missing body propagates, leaving the combined text null for that row)
df.reviewText = df.summary + '. ' + df.reviewText

In [152]:
pd.set_option('max_colwidth', 200)
df.head()

Unnamed: 0,asin,reviewText,summary
0,929619730,"Dpes the job well. B-flax-D is a regular at our house. It does its job simply and with good results. It is reasonable, lasts a long time, and is able to be obtained with free shipping if you hunt ...",Dpes the job well
1,978559088,"Fast shipping, good communication. Studies show that Resveratrol is poorly absorbed when taken by pill, but lozenges are very effectively absorbed. Hardly any companies are selling lozenges. This ...","Fast shipping, good communication"
2,978559088,Bioavailability is the key. I started taking this after both my parents died of cancer as it supposed to enhance your immune system - the story on 60 Minutes on resveratrol was incredibly inspirin...,Bioavailability is the key
3,978559088,"Other Resveratrol Supplements are Better. I tried Nutrihill, but did not feel any of the supposed health benefits. I started reading and realized that even though buccal delivery is the best, the ...",Other Resveratrol Supplements are Better
4,978559088,"I can't find this product any longer, and I wish I could.. I really liked this product because it stayed in my mouth for a long time and I felt it was probably doing some good. I take a number of...","I can't find this product any longer, and I wish I could."


Let's drop the `summary` column now:

In [153]:
df.drop(['summary'], axis=1, inplace=True)

In [154]:
df.head()

Unnamed: 0,asin,reviewText
0,929619730,"Dpes the job well. B-flax-D is a regular at our house. It does its job simply and with good results. It is reasonable, lasts a long time, and is able to be obtained with free shipping if you hunt ..."
1,978559088,"Fast shipping, good communication. Studies show that Resveratrol is poorly absorbed when taken by pill, but lozenges are very effectively absorbed. Hardly any companies are selling lozenges. This ..."
2,978559088,Bioavailability is the key. I started taking this after both my parents died of cancer as it supposed to enhance your immune system - the story on 60 Minutes on resveratrol was incredibly inspirin...
3,978559088,"Other Resveratrol Supplements are Better. I tried Nutrihill, but did not feel any of the supposed health benefits. I started reading and realized that even though buccal delivery is the best, the ..."
4,978559088,"I can't find this product any longer, and I wish I could.. I really liked this product because it stayed in my mouth for a long time and I felt it was probably doing some good. I take a number of..."


In [155]:
pd.set_option('max_colwidth', 20)

## Remove Missing Reviews

In [156]:
df.reviewText.isnull().sum()    # 73 reviews lack a title and/or a body text, so their combined text is null

73

In [157]:
# drop reviews with no text
df = df[~(df.reviewText.isnull())]

In [158]:
df.asin.isnull().sum()

0

Let's look at a few actual review texts:

In [159]:
df.reviewText.iloc[np.random.randint(0, len(df))]

"Had high hopes but very dissapointed.. After listening to Dr. Oz speak highly of Collagen +C benefits on skin, I ordered and tried for two weeks.  Only took half dose, 3 a day instead of 6, very hard to swallow these horse pills.  Can cause terrible pain in esophagus if not enough water is taken along with pills.I must have had a very bad reaction to Super Collagen because canker sores started appearing on tongue and  top of tongue got very sore.  Haven't been able to eat for a few days.  Stopped taking and still waiting for tongue to heal."

In [160]:
df.reviewText.iloc[np.random.randint(0, len(df))]

"Too Strong!. I just had a bad reaction from it. Very light headed. I'm not blaming the product, for someone else it might work well, but for me it was too strong..."

In [161]:
df.reviewText.iloc[np.random.randint(0, len(df))]

"rotten oil. I take flax seed oil, and when the local supermarket stopped selling it, I was happy to find it on Amazon. But I find this brand of oil gives me bad gas when I take it, which I believe is due to the oil being bad. I bought two bottles which I don't know what I am going to do with, hate to throw them away."

In [162]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 584829 entries, 0 to 584901
Data columns (total 2 columns):
asin          584829 non-null object
reviewText    584829 non-null object
dtypes: object(2)
memory usage: 13.4+ MB


## Phrase Detection

In [21]:
text = list(df.reviewText.values)    # make an iterable to store only the review text

In [22]:
len(text)

584829

In [23]:
# look at a few sample reviews
for rev in text[:4]:
    print(rev, '\n')

Dpes the job well. B-flax-D is a regular at our house. It does its job simply and with good results. It is reasonable, lasts a long time, and is able to be obtained with free shipping if you hunt around. Good product, good price, good results. 

Fast shipping, good communication. Studies show that Resveratrol is poorly absorbed when taken by pill, but lozenges are very effectively absorbed. Hardly any companies are selling lozenges. This company promises 99% purity and has fast shipping and good communication. I can't comment on the quality of product because I'm not a chemist but they seem to be legitimate. 

Bioavailability is the key. I started taking this after both my parents died of cancer as it supposed to enhance your immune system - the story on 60 Minutes on resveratrol was incredibly inspiring. Doing some research on the Internet, it is indicated that taking resveratrol in lozenge form is preferable as it is broken down by stomach acids.  The ez-melt formula recommended in a

In [24]:
nlp = spacy.load('en')

The helper functions below are from:

http://nbviewer.jupyter.org/github/skipgram/modern-nlp-in-python/blob/master/executable/Modern_NLP_in_Python.ipynb

In order to use `gensim`'s `Phrases` class to detect natural combinations of words (like 'vanilla ice cream'), we need to format our text into a list of sentences, with each sentence being a list of words.  This process takes a large amount of processing time (for reference, the times shown under the cells are from running the tasks on a c5.18xlarge EC2 instance, or an equivalent spot fleet), so `text` has been split into 9 parts.

### Generate Unigram Sentences

In [25]:
len(text)

584829

In [26]:
# split text into 9 parts
# consecutive boundaries delimit one chunk each; the fourth chunk spans 150,000 reviews
# and the trailing None lets the last chunk run to the end of `text`
cut_points = [0, 50000, 100000, 150000, 300000, 350000, 400000, 450000, 500000, None]
(text_first, text_second, text_third,
 text_fourth, text_fifth, text_sixth,
 text_seventh, text_eighth, text_ninth) = [
    text[lo:hi] for lo, hi in zip(cut_points[:-1], cut_points[1:])
]

In [30]:
rev_num = 0             # running count of reviews parsed so far
sent_num = 0            # running count of sentences seen so far
unigram_sents_pos = []  # one [rev_num, sent_num, [(lemma, pos), ...]] row per sentence

for parsed_review in tqdm(nlp.pipe(text_first, batch_size=20000, n_threads=72)):
    rev_num = rev_num + 1
    for sent in parsed_review.sents:
        sent_num = sent_num + 1
        # (lemma, POS tag) for every token that is neither whitespace nor punctuation
        lemmatized_sent = [
            (token.lemma_, token.pos_)
            for token in sent
            if not token.is_space and not token.is_punct
        ]
        unigram_sents_pos.append([rev_num, sent_num, lemmatized_sent])

print('current rev_num: ', rev_num)
print('current sent_num: ', sent_num)

50000it [08:06, 102.75it/s]

current rev_num:  50000
current sent_num:  305895





In [32]:
len(unigram_sents_pos)

305895

In [33]:
for i in range(5):
    print(unigram_sents_pos[i])

[1, 1, [('dpe', 'NOUN'), ('the', 'DET'), ('job', 'NOUN'), ('well', 'ADV')]]
[1, 2, [('b', 'NOUN'), ('flax', 'NOUN'), ('d', 'NOUN'), ('be', 'VERB'), ('a', 'DET'), ('regular', 'ADJ'), ('at', 'ADP'), ('-PRON-', 'ADJ'), ('house', 'NOUN')]]
[1, 3, [('-PRON-', 'PRON'), ('do', 'VERB'), ('-PRON-', 'ADJ'), ('job', 'NOUN'), ('simply', 'ADV'), ('and', 'CCONJ'), ('with', 'ADP'), ('good', 'ADJ'), ('result', 'NOUN')]]
[1, 4, [('-PRON-', 'PRON'), ('be', 'VERB'), ('reasonable', 'ADJ'), ('last', 'VERB'), ('a', 'DET'), ('long', 'ADJ'), ('time', 'NOUN'), ('and', 'CCONJ'), ('be', 'VERB'), ('able', 'ADJ'), ('to', 'PART'), ('be', 'VERB'), ('obtain', 'VERB'), ('with', 'ADP'), ('free', 'ADJ'), ('shipping', 'NOUN'), ('if', 'ADP'), ('-PRON-', 'PRON'), ('hunt', 'VERB'), ('around', 'ADV')]]
[1, 5, [('good', 'ADJ'), ('product', 'NOUN'), ('good', 'ADJ'), ('price', 'NOUN'), ('good', 'ADJ'), ('result', 'NOUN')]]


In [124]:
# Save progress...
# unpack every [review id, sentence id, [(lemma, pos), ...]] row into four parallel columns
review_number, sentence_number = [], []
words_joined_all, pos_joined_all = [], []
for rev_id, sent_id, word_pos in unigram_sents_pos:
    review_number.append(rev_id)
    sentence_number.append(sent_id)
    # lemmas and POS tags are stored as space-delimited strings, one string per sentence
    words_joined_all.append(' '.join([word for word, _ in word_pos]))
    pos_joined_all.append(' '.join([pos for _, pos in word_pos]))

unigram_sentences_savedf = pd.DataFrame({
    'review_number': review_number,
    'sentence_number': sentence_number,
    'unigram_sentences': words_joined_all,
    'unigram_pos': pos_joined_all,
})

save_df_s3(unigram_sentences_savedf, bucket_name, 'amazon_reviews/unigram_sentences.feather')

In [131]:
unigram_sentences_savedf.head()

Unnamed: 0,review_number,sentence_number,unigram_pos,unigram_sentences
0,1,1,NOUN DET NOUN ADV,dpe the job well
1,1,2,NOUN NOUN NOUN V...,b flax d be a re...
2,1,3,PRON VERB ADJ NO...,-PRON- do -PRON-...
3,1,4,PRON VERB ADJ VE...,-PRON- be reason...
4,1,5,ADJ NOUN ADJ NOU...,good product goo...


In [134]:
# continues the running rev_num / sent_num counters and appends to unigram_sents_pos
for parsed_review in tqdm(nlp.pipe(text_second, batch_size=20000, n_threads=72)):
    rev_num = rev_num + 1
    for sent in parsed_review.sents:
        sent_num = sent_num + 1
        # (lemma, POS tag) for every token that is neither whitespace nor punctuation
        lemmatized_sent = [
            (token.lemma_, token.pos_)
            for token in sent
            if not token.is_space and not token.is_punct
        ]
        unigram_sents_pos.append([rev_num, sent_num, lemmatized_sent])

print('current rev_num: ', rev_num)
print('current sent_num: ', sent_num)

50000it [08:04, 103.18it/s]

current rev_num:  100000
current sent_num:  616751





In [136]:
print(len(unigram_sents_pos))

616751


In [137]:
# Save progress...
# unpack every [review id, sentence id, [(lemma, pos), ...]] row into four parallel columns
review_number, sentence_number = [], []
words_joined_all, pos_joined_all = [], []
for rev_id, sent_id, word_pos in unigram_sents_pos:
    review_number.append(rev_id)
    sentence_number.append(sent_id)
    # lemmas and POS tags are stored as space-delimited strings, one string per sentence
    words_joined_all.append(' '.join([word for word, _ in word_pos]))
    pos_joined_all.append(' '.join([pos for _, pos in word_pos]))

unigram_sentences_savedf = pd.DataFrame({
    'review_number': review_number,
    'sentence_number': sentence_number,
    'unigram_sentences': words_joined_all,
    'unigram_pos': pos_joined_all,
})

save_df_s3(unigram_sentences_savedf, bucket_name, 'amazon_reviews/unigram_sentences.feather')

In [138]:
# continues the running rev_num / sent_num counters and appends to unigram_sents_pos
for parsed_review in tqdm(nlp.pipe(text_third, batch_size=20000, n_threads=72)):
    rev_num = rev_num + 1
    for sent in parsed_review.sents:
        sent_num = sent_num + 1
        # (lemma, POS tag) for every token that is neither whitespace nor punctuation
        lemmatized_sent = [
            (token.lemma_, token.pos_)
            for token in sent
            if not token.is_space and not token.is_punct
        ]
        unigram_sents_pos.append([rev_num, sent_num, lemmatized_sent])

print('current rev_num: ', rev_num)
print('current sent_num: ', sent_num)

50000it [07:55, 105.06it/s]

current rev_num:  150000
current sent_num:  923642





In [139]:
# Save progress...
# unpack every [review id, sentence id, [(lemma, pos), ...]] row into four parallel columns
review_number, sentence_number = [], []
words_joined_all, pos_joined_all = [], []
for rev_id, sent_id, word_pos in unigram_sents_pos:
    review_number.append(rev_id)
    sentence_number.append(sent_id)
    # lemmas and POS tags are stored as space-delimited strings, one string per sentence
    words_joined_all.append(' '.join([word for word, _ in word_pos]))
    pos_joined_all.append(' '.join([pos for _, pos in word_pos]))

unigram_sentences_savedf = pd.DataFrame({
    'review_number': review_number,
    'sentence_number': sentence_number,
    'unigram_sentences': words_joined_all,
    'unigram_pos': pos_joined_all,
})

save_df_s3(unigram_sentences_savedf, bucket_name, 'amazon_reviews/unigram_sentences.feather')

In [140]:
# continues the running rev_num / sent_num counters and appends to unigram_sents_pos
for parsed_review in tqdm(nlp.pipe(text_fourth, batch_size=20000, n_threads=72)):
    rev_num = rev_num + 1
    for sent in parsed_review.sents:
        sent_num = sent_num + 1
        # (lemma, POS tag) for every token that is neither whitespace nor punctuation
        lemmatized_sent = [
            (token.lemma_, token.pos_)
            for token in sent
            if not token.is_space and not token.is_punct
        ]
        unigram_sents_pos.append([rev_num, sent_num, lemmatized_sent])

print('current rev_num: ', rev_num)
print('current sent_num: ', sent_num)

150000it [23:51, 104.82it/s]

current rev_num:  300000
current sent_num:  1843092





In [141]:
# Save progress...
# unpack every [review id, sentence id, [(lemma, pos), ...]] row into four parallel columns
review_number, sentence_number = [], []
words_joined_all, pos_joined_all = [], []
for rev_id, sent_id, word_pos in unigram_sents_pos:
    review_number.append(rev_id)
    sentence_number.append(sent_id)
    # lemmas and POS tags are stored as space-delimited strings, one string per sentence
    words_joined_all.append(' '.join([word for word, _ in word_pos]))
    pos_joined_all.append(' '.join([pos for _, pos in word_pos]))

unigram_sentences_savedf = pd.DataFrame({
    'review_number': review_number,
    'sentence_number': sentence_number,
    'unigram_sentences': words_joined_all,
    'unigram_pos': pos_joined_all,
})

save_df_s3(unigram_sentences_savedf, bucket_name, 'amazon_reviews/unigram_sentences.feather')

In [142]:
# continues the running rev_num / sent_num counters and appends to unigram_sents_pos
for parsed_review in tqdm(nlp.pipe(text_fifth, batch_size=20000, n_threads=72)):
    rev_num = rev_num + 1
    for sent in parsed_review.sents:
        sent_num = sent_num + 1
        # (lemma, POS tag) for every token that is neither whitespace nor punctuation
        lemmatized_sent = [
            (token.lemma_, token.pos_)
            for token in sent
            if not token.is_space and not token.is_punct
        ]
        unigram_sents_pos.append([rev_num, sent_num, lemmatized_sent])

print('current rev_num: ', rev_num)
print('current sent_num: ', sent_num)

50000it [07:43, 107.98it/s]

current rev_num:  350000
current sent_num:  2144424





In [143]:
# Save progress...
# unpack every [review id, sentence id, [(lemma, pos), ...]] row into four parallel columns
review_number, sentence_number = [], []
words_joined_all, pos_joined_all = [], []
for rev_id, sent_id, word_pos in unigram_sents_pos:
    review_number.append(rev_id)
    sentence_number.append(sent_id)
    # lemmas and POS tags are stored as space-delimited strings, one string per sentence
    words_joined_all.append(' '.join([word for word, _ in word_pos]))
    pos_joined_all.append(' '.join([pos for _, pos in word_pos]))

unigram_sentences_savedf = pd.DataFrame({
    'review_number': review_number,
    'sentence_number': sentence_number,
    'unigram_sentences': words_joined_all,
    'unigram_pos': pos_joined_all,
})

save_df_s3(unigram_sentences_savedf, bucket_name, 'amazon_reviews/unigram_sentences.feather')

In [144]:
# continues the running rev_num / sent_num counters and appends to unigram_sents_pos
for parsed_review in tqdm(nlp.pipe(text_sixth, batch_size=20000, n_threads=72)):
    rev_num = rev_num + 1
    for sent in parsed_review.sents:
        sent_num = sent_num + 1
        # (lemma, POS tag) for every token that is neither whitespace nor punctuation
        lemmatized_sent = [
            (token.lemma_, token.pos_)
            for token in sent
            if not token.is_space and not token.is_punct
        ]
        unigram_sents_pos.append([rev_num, sent_num, lemmatized_sent])

print('current rev_num: ', rev_num)
print('current sent_num: ', sent_num)

50000it [07:46, 107.22it/s]

current rev_num:  400000
current sent_num:  2447985





In [145]:
# Save progress...
# unpack every [review id, sentence id, [(lemma, pos), ...]] row into four parallel columns
review_number, sentence_number = [], []
words_joined_all, pos_joined_all = [], []
for rev_id, sent_id, word_pos in unigram_sents_pos:
    review_number.append(rev_id)
    sentence_number.append(sent_id)
    # lemmas and POS tags are stored as space-delimited strings, one string per sentence
    words_joined_all.append(' '.join([word for word, _ in word_pos]))
    pos_joined_all.append(' '.join([pos for _, pos in word_pos]))

unigram_sentences_savedf = pd.DataFrame({
    'review_number': review_number,
    'sentence_number': sentence_number,
    'unigram_sentences': words_joined_all,
    'unigram_pos': pos_joined_all,
})

save_df_s3(unigram_sentences_savedf, bucket_name, 'amazon_reviews/unigram_sentences.feather')

In [146]:
# continues the running rev_num / sent_num counters and appends to unigram_sents_pos
for parsed_review in tqdm(nlp.pipe(text_seventh, batch_size=20000, n_threads=72)):
    rev_num = rev_num + 1
    for sent in parsed_review.sents:
        sent_num = sent_num + 1
        # (lemma, POS tag) for every token that is neither whitespace nor punctuation
        lemmatized_sent = [
            (token.lemma_, token.pos_)
            for token in sent
            if not token.is_space and not token.is_punct
        ]
        unigram_sents_pos.append([rev_num, sent_num, lemmatized_sent])

print('current rev_num: ', rev_num)
print('current sent_num: ', sent_num)

50000it [07:41, 108.43it/s]

current rev_num:  450000
current sent_num:  2754623





In [147]:
# Save progress...
# unpack every [review id, sentence id, [(lemma, pos), ...]] row into four parallel columns
review_number, sentence_number = [], []
words_joined_all, pos_joined_all = [], []
for rev_id, sent_id, word_pos in unigram_sents_pos:
    review_number.append(rev_id)
    sentence_number.append(sent_id)
    # lemmas and POS tags are stored as space-delimited strings, one string per sentence
    words_joined_all.append(' '.join([word for word, _ in word_pos]))
    pos_joined_all.append(' '.join([pos for _, pos in word_pos]))

unigram_sentences_savedf = pd.DataFrame({
    'review_number': review_number,
    'sentence_number': sentence_number,
    'unigram_sentences': words_joined_all,
    'unigram_pos': pos_joined_all,
})

save_df_s3(unigram_sentences_savedf, bucket_name, 'amazon_reviews/unigram_sentences.feather')

In [148]:
# continues the running rev_num / sent_num counters and appends to unigram_sents_pos
for parsed_review in tqdm(nlp.pipe(text_eighth, batch_size=20000, n_threads=72)):
    rev_num = rev_num + 1
    for sent in parsed_review.sents:
        sent_num = sent_num + 1
        # (lemma, POS tag) for every token that is neither whitespace nor punctuation
        lemmatized_sent = [
            (token.lemma_, token.pos_)
            for token in sent
            if not token.is_space and not token.is_punct
        ]
        unigram_sents_pos.append([rev_num, sent_num, lemmatized_sent])

print('current rev_num: ', rev_num)
print('current sent_num: ', sent_num)

50000it [08:04, 103.24it/s]


current rev_num:  500000
current sent_num:  3073060


In [149]:
# Save progress...
# unpack every [review id, sentence id, [(lemma, pos), ...]] row into four parallel columns
review_number, sentence_number = [], []
words_joined_all, pos_joined_all = [], []
for rev_id, sent_id, word_pos in unigram_sents_pos:
    review_number.append(rev_id)
    sentence_number.append(sent_id)
    # lemmas and POS tags are stored as space-delimited strings, one string per sentence
    words_joined_all.append(' '.join([word for word, _ in word_pos]))
    pos_joined_all.append(' '.join([pos for _, pos in word_pos]))

unigram_sentences_savedf = pd.DataFrame({
    'review_number': review_number,
    'sentence_number': sentence_number,
    'unigram_sentences': words_joined_all,
    'unigram_pos': pos_joined_all,
})

save_df_s3(unigram_sentences_savedf, bucket_name, 'amazon_reviews/unigram_sentences.feather')

In [150]:
# continues the running rev_num / sent_num counters and appends to unigram_sents_pos
for parsed_review in tqdm(nlp.pipe(text_ninth, batch_size=20000, n_threads=72)):
    rev_num = rev_num + 1
    for sent in parsed_review.sents:
        sent_num = sent_num + 1
        # (lemma, POS tag) for every token that is neither whitespace nor punctuation
        lemmatized_sent = [
            (token.lemma_, token.pos_)
            for token in sent
            if not token.is_space and not token.is_punct
        ]
        unigram_sents_pos.append([rev_num, sent_num, lemmatized_sent])

print('current rev_num: ', rev_num)
print('current sent_num: ', sent_num)

84829it [13:30, 104.70it/s]

current rev_num:  584829
current sent_num:  3605491





In [151]:
# Save progress...
# unpack every [review id, sentence id, [(lemma, pos), ...]] row into four parallel columns
review_number, sentence_number = [], []
words_joined_all, pos_joined_all = [], []
for rev_id, sent_id, word_pos in unigram_sents_pos:
    review_number.append(rev_id)
    sentence_number.append(sent_id)
    # lemmas and POS tags are stored as space-delimited strings, one string per sentence
    words_joined_all.append(' '.join([word for word, _ in word_pos]))
    pos_joined_all.append(' '.join([pos for _, pos in word_pos]))

unigram_sentences_savedf = pd.DataFrame({
    'review_number': review_number,
    'sentence_number': sentence_number,
    'unigram_sentences': words_joined_all,
    'unigram_pos': pos_joined_all,
})

save_df_s3(unigram_sentences_savedf, bucket_name, 'amazon_reviews/unigram_sentences.feather')

In [5]:
# DON'T LOAD THIS FILE - there's a _v1 version further down!
unigram_sentences_savedf = load_df_s3(bucket_name, 'amazon_reviews/unigram_sentences.feather', filetype='feather')

In [6]:
unigram_sentences_savedf.head()

Unnamed: 0,review_number,sentence_number,unigram_pos,unigram_sentences
0,1,1,NOUN DET NOUN ADV,dpe the job well
1,1,2,NOUN NOUN NOUN VERB DET ADJ ADP ADJ NOUN,b flax d be a regular at -PRON- house
2,1,3,PRON VERB ADJ NOUN ADV CCONJ ADP ADJ NOUN,-PRON- do -PRON- job simply and with good result
3,1,4,PRON VERB ADJ VERB DET ADJ NOUN CCONJ VERB ADJ...,-PRON- be reasonable last a long time and be a...
4,1,5,ADJ NOUN ADJ NOUN ADJ NOUN,good product good price good result


#### Additional Data Cleaning

In [12]:
def clean_up(sentence, sentence_pos):
    """Strip web links and underscores from a space-separated sentence.

    Both arguments are single strings: ``sentence`` holds the words and
    ``sentence_pos`` the matching part-of-speech tags, one tag per word.
    Returns the cleaned ``(sentence, sentence_pos)`` pair, again as two strings.
    """
    # Any word containing 'http' or 'www' is treated as a link and dropped,
    # together with the tag sitting at the same position.
    if 'http' in sentence or 'www' in sentence:
        tokens = sentence.split(' ')
        tags = sentence_pos.split(' ')
        link_positions = [pos for pos, token in enumerate(tokens)
                          if 'http' in token or 'www' in token]
        # delete from the back so the positions still to be deleted stay valid
        for pos in reversed(link_positions):
            del tokens[pos]
            del tags[pos]
        sentence = ' '.join(tokens)
        sentence_pos = ' '.join(tags)

    # Underscores are deleted outright so they cannot be mistaken for the
    # phrase delimiter later. They are not turned into spaces: the strings are
    # split on spaces downstream, and that would create words without a tag.
    return sentence.replace('_', ''), sentence_pos

In [25]:
test_clean = ['whoa watch out for them links boy http://sup.com and also BAM! underscore_time!', 'this is a normal sentence', 
              '__ what is this ____ http', '_', 'http']
test_clean

['whoa watch out for them links boy http://sup.com and also BAM! underscore_time!',
 'this is a normal sentence',
 '__ what is this ____ http',
 '_',
 'http']

In [29]:
test_clean_pos = ['X X X X X X X X X X X X', 'X X X X X', 'X X X X X X', 'X', 'X']

In [30]:
[len(e.split(' ')) for e in test_clean]

[12, 5, 6, 1, 1]

In [31]:
[e.count('X') for e in test_clean_pos]

[12, 5, 6, 1, 1]

In [32]:
# check if clean_up works as expected
for i in range(len(test_clean)):
    sentence = test_clean[i]
    sentence_pos = test_clean_pos[i]
    test_clean[i], test_clean_pos[i] = clean_up(sentence, sentence_pos)

test_clean

['whoa watch out for them links boy and also BAM! underscoretime!',
 'this is a normal sentence',
 ' what is this ',
 '',
 '']

In [33]:
test_clean_pos

['X X X X X X X X X X X', 'X X X X X', 'X X X X X', 'X', '']

In [34]:
[e.count('X') for e in test_clean_pos]

[11, 5, 5, 1, 0]

In [35]:
[len(e.split(' ')) for e in test_clean]

[11, 5, 5, 1, 1]

In [36]:
words_joined_all = unigram_sentences_savedf.unigram_sentences.tolist()

In [37]:
pos_joined_all = unigram_sentences_savedf.unigram_pos.tolist()

In [38]:
len(words_joined_all)

3605491

In [39]:
len([sentence for sentence in words_joined_all if '_' in sentence])

605

In [40]:
len([sentence for sentence in words_joined_all if 'http' in sentence])

513

In [41]:
len([sentence for sentence in words_joined_all if 'www' in sentence])

630

In [42]:
unigram_sentences_savedf[unigram_sentences_savedf.unigram_sentences.str.contains('_')].head()

Unnamed: 0,review_number,sentence_number,unigram_pos,unigram_sentences
7547,1290,7548,X,http://www.amazon.com/gp/product/b0000533z8/re...
16179,2775,16180,DET NOUN NOUN,no jet_lag pill
16628,2837,16629,PRON VERB VERB PROPN PART NUM PROPN ADP ADP AD...,-PRON- do recommend women 's one a_day though ...
23009,3833,23010,PRON VERB ADJ NOUN CCONJ ADJ NOUN NOUN,-PRON- be less money and good quality https://...
25117,4169,25118,ADJ PART NOUN ADV,easy to use_work well


In [43]:
[sentence for sentence in words_joined_all if '_' in sentence][:10]

['http://www.amazon.com/gp/product/b0000533z8/ref=cm_cr_rev_prod_title',
 'no jet_lag pill',
 "-PRON- do recommend women 's one a_day though with extra calcium",
 '-PRON- be less money and good quality https://www.amazon.com/review/review-your-purchases/ref=pe_6680_116681230_cm_add_2_star3?_encoding=utf8&asins;=b0000ccw1n%3a3%2cb000sar2dk&channel;=ec_phy&crauthtoken;=ge5g%2bbf%2btr%2f%2fdliytbmmzxn6ajjlfxjdtx902p0aaaadaaaaafnfv%2bbyyxcaaaaa&customerid;=a1pansxlpbgvng#top',
 'easy to use_work well',
 '-PRON- have have pedometer in the past_all difficult and confusing to use to the point -PRON- simply give up on -PRON-',
 'overall -PRON- mother be very satisfied with this product!-d_lionz',
 'this inexpensive strap with a metal clip http://www.amazon.com/gp/product/b000bitymg/ref=oh_details_o00_s00_i00?ie=utf8&psc;=1 be a good replacement for the flimsy omron plastic clip but -PRON- have not be use -PRON- long',
 'hj_112 digital pemium pedometer update',
 'accordingly for 100gr serving s

In [44]:
# clean up all unigrams
for i in range(len(words_joined_all)):
    sentence = words_joined_all[i]
    sentence_pos = pos_joined_all[i]
    words_joined_all[i], pos_joined_all[i] = clean_up(sentence, sentence_pos)

In [45]:
len([sentence for sentence in words_joined_all if 'http' in sentence])

0

In [46]:
len([sentence for sentence in words_joined_all if '_' in sentence])

0

In [47]:
unigram_sentences_savedf.unigram_sentences.iloc[:5]

0                                     dpe the job well
1                b flax d be a regular at -PRON- house
2     -PRON- do -PRON- job simply and with good result
3    -PRON- be reasonable last a long time and be a...
4                  good product good price good result
Name: unigram_sentences, dtype: object

In [48]:
words_joined_all[:5]

['dpe the job well',
 'b flax d be a regular at -PRON- house',
 '-PRON- do -PRON- job simply and with good result',
 '-PRON- be reasonable last a long time and be able to be obtain with free shipping if -PRON- hunt around',
 'good product good price good result']

In [49]:
unigram_sentences_savedf.drop(['unigram_sentences'], axis=1, inplace=True)
unigram_sentences_savedf.drop(['unigram_pos'], axis=1, inplace=True)

In [50]:
unigram_sentences_savedf['unigram_sentences'] = words_joined_all
unigram_sentences_savedf['unigram_pos'] = pos_joined_all

In [51]:
unigram_sentences_savedf.head()

Unnamed: 0,review_number,sentence_number,unigram_sentences,unigram_pos
0,1,1,dpe the job well,NOUN DET NOUN ADV
1,1,2,b flax d be a regular at -PRON- house,NOUN NOUN NOUN VERB DET ADJ ADP ADJ NOUN
2,1,3,-PRON- do -PRON- job simply and with good result,PRON VERB ADJ NOUN ADV CCONJ ADP ADJ NOUN
3,1,4,-PRON- be reasonable last a long time and be a...,PRON VERB ADJ VERB DET ADJ NOUN CCONJ VERB ADJ...
4,1,5,good product good price good result,ADJ NOUN ADJ NOUN ADJ NOUN


In [52]:
# updated, cleaned up version of unigram_sentences.feather
save_df_s3(unigram_sentences_savedf, bucket_name, 'amazon_reviews/unigram_sentences_v1.feather')

### Phrase Detection

In [21]:
unigram_sentences_savedf = load_df_s3(bucket_name, 'amazon_reviews/unigram_sentences_v1.feather', filetype='feather')

In [22]:
words_joined_all = unigram_sentences_savedf.unigram_sentences.tolist()

In [53]:
unigram_sentences = [sentence.split(' ') for sentence in words_joined_all]

In [54]:
print(unigram_sentences[:4])

[['dpe', 'the', 'job', 'well'], ['b', 'flax', 'd', 'be', 'a', 'regular', 'at', '-PRON-', 'house'], ['-PRON-', 'do', '-PRON-', 'job', 'simply', 'and', 'with', 'good', 'result'], ['-PRON-', 'be', 'reasonable', 'last', 'a', 'long', 'time', 'and', 'be', 'able', 'to', 'be', 'obtain', 'with', 'free', 'shipping', 'if', '-PRON-', 'hunt', 'around']]


In [55]:
len(words_joined_all)

3605491

In [56]:
%%time
# common_terms lists connector words (stop words) that may sit between two words
# without blocking phrase detection, so expressions like "bank of america" can be found.
common_terms = ["of", "with", "without", "and", "or"]

# First-order phrase detector, trained on the unigram sentences
bigram_model = Phrases(unigram_sentences, threshold=0.7, scoring='npmi', common_terms=common_terms)

# Apply the detector to every unigram sentence.
# Paired words come back joined by an underscore, e.g. ice_cream
bigram_sentences = [bigram_model[unigram_sentence] for unigram_sentence in unigram_sentences]



CPU times: user 3min 54s, sys: 1.5 s, total: 3min 55s
Wall time: 3min 55s


In [57]:
%%time
# Second-order phrase detector, trained on the bigram sentences
# (earlier variant: trigram_model = Phrases(bigram_sentences, min_count=5))
trigram_model = Phrases(bigram_sentences, threshold=0.7, scoring='npmi')

# Apply the detector to every bigram sentence to obtain the trigram sentences
trigram_sentences = [trigram_model[bigram_sentence] for bigram_sentence in bigram_sentences]

# remove any remaining stopwords (disabled)
# trigram_sentences = [[word for word in sentence if word not in nlp.Defaults.stop_words] for sentence in trigram_sentences]



CPU times: user 3min 59s, sys: 1.44 s, total: 4min
Wall time: 4min


In [58]:
# the trigrams will be saved in a dataframe with a single column.
# each row is one sentence from any review
# each sentence is a single string separated by a single space.
trigram_sentences_savedf = pd.DataFrame([u' '.join(sentence) for sentence in trigram_sentences], columns=['preprocessed_review'])
# note: v1 is the version with higher threshold (0.7); the file without v1 uses 0.5.
save_df_s3(trigram_sentences_savedf, bucket_name, 'amazon_reviews/preprocessed_reviews_v1.feather')

In [5]:
trigram_sentences_savedf = load_df_s3(bucket_name, 'amazon_reviews/preprocessed_reviews_v1.feather', filetype='feather')

In [59]:
trigram_sentences_savedf.head()

Unnamed: 0,preprocessed_review
0,dpe the job well
1,b flax d be a regular at -PRON- house
2,-PRON- do -PRON- job simply and with good result
3,-PRON- be reasonable last a long time and be a...
4,good product good price good result


In [8]:
# trigram_sentences = trigram_sentences_savedf.preprocessed_review.tolist()

In [10]:
# len(trigram_sentences)

3605491

In [60]:
unigram_sents_pos_df = load_df_s3(bucket_name, 'amazon_reviews/unigram_sentences_v1.feather', filetype='feather')

In [61]:
unigram_sents_pos_df.head()

Unnamed: 0,review_number,sentence_number,unigram_sentences,unigram_pos
0,1,1,dpe the job well,NOUN DET NOUN ADV
1,1,2,b flax d be a regular at -PRON- house,NOUN NOUN NOUN VERB DET ADJ ADP ADJ NOUN
2,1,3,-PRON- do -PRON- job simply and with good result,PRON VERB ADJ NOUN ADV CCONJ ADP ADJ NOUN
3,1,4,-PRON- be reasonable last a long time and be a...,PRON VERB ADJ VERB DET ADJ NOUN CCONJ VERB ADJ...
4,1,5,good product good price good result,ADJ NOUN ADJ NOUN ADJ NOUN


In [62]:
unigram_sents_pos_df.shape

(3605491, 4)

In [63]:
del unigram_sentences_savedf

In [64]:
unigram_sents_pos_df = pd.merge(unigram_sents_pos_df, trigram_sentences_savedf, how='inner', left_index=True, right_index=True)

In [65]:
unigram_sents_pos_df.head(10)

Unnamed: 0,review_number,sentence_number,unigram_sentences,unigram_pos,preprocessed_review
0,1,1,dpe the job well,NOUN DET NOUN ADV,dpe the job well
1,1,2,b flax d be a regular at -PRON- house,NOUN NOUN NOUN VERB DET ADJ ADP ADJ NOUN,b flax d be a regular at -PRON- house
2,1,3,-PRON- do -PRON- job simply and with good result,PRON VERB ADJ NOUN ADV CCONJ ADP ADJ NOUN,-PRON- do -PRON- job simply and with good result
3,1,4,-PRON- be reasonable last a long time and be a...,PRON VERB ADJ VERB DET ADJ NOUN CCONJ VERB ADJ...,-PRON- be reasonable last a long time and be a...
4,1,5,good product good price good result,ADJ NOUN ADJ NOUN ADJ NOUN,good product good price good result
5,2,6,fast shipping good communication,ADJ NOUN ADJ NOUN,fast shipping good communication
6,2,7,study show that resveratrol be poorly absorb w...,NOUN VERB ADP PROPN VERB ADV VERB ADV VERB ADP...,study show that resveratrol be poorly absorb w...
7,2,8,hardly any company be sell lozenge,ADV DET NOUN VERB VERB NOUN,hardly any company be sell lozenge
8,2,9,this company promise 99 purity and have fast s...,DET NOUN VERB NUM NOUN CCONJ VERB ADJ NOUN CCO...,this company promise 99 purity and have fast s...
9,2,10,-PRON- can not comment on the quality of produ...,PRON VERB ADV VERB ADP DET NOUN ADP NOUN ADP P...,-PRON- can not comment on the quality of produ...


In [66]:
# NOTE: this is the same key the trigram-only dataframe was saved under earlier in this section,
# so this call replaces that file with the merged dataframe (review/sentence numbers, unigram
# sentences, POS tags and phrase-detected sentences).
save_df_s3(unigram_sents_pos_df, bucket_name, 'amazon_reviews/preprocessed_reviews_v1.feather')

In [5]:
unigram_sents_pos_df = load_df_s3(bucket_name, 'amazon_reviews/preprocessed_reviews_v1.feather', filetype='feather')

In [67]:
unigram_sents_pos_df.iloc[400:410]

Unnamed: 0,review_number,sentence_number,unigram_sentences,unigram_pos,preprocessed_review
400,70,401,-PRON- do not know buy this product will becom...,PRON VERB ADV VERB VERB DET NOUN VERB VERB DET...,-PRON- do_not know buy this product will becom...
401,70,402,-PRON- think -PRON- just fraud,PRON VERB ADJ ADJ NOUN,-PRON- think -PRON- just fraud
402,70,403,do not recommend this product,VERB ADV VERB DET NOUN,do_not recommend this product
403,71,404,mould motion 5 do not work!!. -PRON- recently ...,VERB PROPN NUM VERB ADV ADJ PRON ADV VERB DET ...,mould_motion 5 do_not work!!. -PRON- recently ...
404,71,405,-PRON- do not sweat or ne thing just burn,PRON VERB ADV VERB CCONJ NOUN NOUN ADV VERB,-PRON- do_not sweat or ne thing just burn
405,71,406,and i have -PRON- over -PRON- shirt because -P...,CCONJ PRON VERB PRON ADP ADJ NOUN ADP PRON VER...,and i have -PRON- over -PRON- shirt because -P...
406,71,407,and besides -PRON- still have to boil -PRON- f...,CCONJ ADP PRON ADV VERB PART VERB PRON ADP NUM...,and besides -PRON- still have to boil -PRON- f...
407,71,408,just too much to wait til -PRON- put -PRON- on,ADV ADV ADJ PART VERB ADV PRON VERB PRON PART,just too much to wait til -PRON- put -PRON- on
408,71,409,do not buy,VERB ADV VERB,do_not buy
409,72,410,be a gift,VERB DET NOUN,be a gift


In [68]:
unigram_sents_pos_df.isnull().sum()

review_number          0
sentence_number        0
unigram_sentences      0
unigram_pos            0
preprocessed_review    0
dtype: int64

In [69]:
unigram_sents_pos_df['has_paired_words'] = 0

In [70]:
unigram_sents_pos_df.loc[unigram_sents_pos_df.preprocessed_review.str.contains('_'), ['has_paired_words']] = 1

In [71]:
unigram_sents_pos_df.has_paired_words.sum()  # number of sentences with paired words

650163

In [72]:
unigram_sents_pos_df.head()

Unnamed: 0,review_number,sentence_number,unigram_sentences,unigram_pos,preprocessed_review,has_paired_words
0,1,1,dpe the job well,NOUN DET NOUN ADV,dpe the job well,0
1,1,2,b flax d be a regular at -PRON- house,NOUN NOUN NOUN VERB DET ADJ ADP ADJ NOUN,b flax d be a regular at -PRON- house,0
2,1,3,-PRON- do -PRON- job simply and with good result,PRON VERB ADJ NOUN ADV CCONJ ADP ADJ NOUN,-PRON- do -PRON- job simply and with good result,0
3,1,4,-PRON- be reasonable last a long time and be a...,PRON VERB ADJ VERB DET ADJ NOUN CCONJ VERB ADJ...,-PRON- be reasonable last a long time and be a...,0
4,1,5,good product good price good result,ADJ NOUN ADJ NOUN ADJ NOUN,good product good price good result,0


In [73]:
%%time
unigram_sents_pos_df.unigram_pos = unigram_sents_pos_df.unigram_pos.str.split(' ')
unigram_sents_pos_df.unigram_sentences = unigram_sents_pos_df.unigram_sentences.str.split(' ')
unigram_sents_pos_df.preprocessed_review = unigram_sents_pos_df.preprocessed_review.str.split(' ')

CPU times: user 25.9 s, sys: 2.94 s, total: 28.9 s
Wall time: 28.9 s


In [74]:
unigram_sents_pos_df.iloc[400:410]

Unnamed: 0,review_number,sentence_number,unigram_sentences,unigram_pos,preprocessed_review,has_paired_words
400,70,401,"[-PRON-, do, not, know, buy, this, product, wi...","[PRON, VERB, ADV, VERB, VERB, DET, NOUN, VERB,...","[-PRON-, do_not, know, buy, this, product, wil...",1
401,70,402,"[-PRON-, think, -PRON-, just, fraud]","[PRON, VERB, ADJ, ADJ, NOUN]","[-PRON-, think, -PRON-, just, fraud]",0
402,70,403,"[do, not, recommend, this, product]","[VERB, ADV, VERB, DET, NOUN]","[do_not, recommend, this, product]",1
403,71,404,"[mould, motion, 5, do, not, work!!., -PRON-, r...","[VERB, PROPN, NUM, VERB, ADV, ADJ, PRON, ADV, ...","[mould_motion, 5, do_not, work!!., -PRON-, rec...",1
404,71,405,"[-PRON-, do, not, sweat, or, ne, thing, just, ...","[PRON, VERB, ADV, VERB, CCONJ, NOUN, NOUN, ADV...","[-PRON-, do_not, sweat, or, ne, thing, just, b...",1
405,71,406,"[and, i, have, -PRON-, over, -PRON-, shirt, be...","[CCONJ, PRON, VERB, PRON, ADP, ADJ, NOUN, ADP,...","[and, i, have, -PRON-, over, -PRON-, shirt, be...",0
406,71,407,"[and, besides, -PRON-, still, have, to, boil, ...","[CCONJ, ADP, PRON, ADV, VERB, PART, VERB, PRON...","[and, besides, -PRON-, still, have, to, boil, ...",0
407,71,408,"[just, too, much, to, wait, til, -PRON-, put, ...","[ADV, ADV, ADJ, PART, VERB, ADV, PRON, VERB, P...","[just, too, much, to, wait, til, -PRON-, put, ...",0
408,71,409,"[do, not, buy]","[VERB, ADV, VERB]","[do_not, buy]",1
409,72,410,"[be, a, gift]","[VERB, DET, NOUN]","[be, a, gift]",0


In [75]:
unigram_sents_pos_df.isnull().sum()

review_number          0
sentence_number        0
unigram_sentences      0
unigram_pos            0
preprocessed_review    0
has_paired_words       0
dtype: int64

Let's look at an arbitrary sentence and its transformation:

In [76]:
print(unigram_sents_pos_df.unigram_sentences.iloc[105])

['liver', 'support', 'supports', 'liver', 'function', 'stimulate', 'des', 'intoxication', 'and', 'restore', 'liver', 'function', 'eliminate', 'harmful', 'metabolite']


In [77]:
print(unigram_sents_pos_df.unigram_pos.iloc[105])

['PROPN', 'PROPN', 'PROPN', 'NOUN', 'NOUN', 'VERB', 'X', 'NOUN', 'CCONJ', 'VERB', 'NOUN', 'NOUN', 'VERB', 'ADJ', 'NOUN']


In [78]:
print(unigram_sents_pos_df.preprocessed_review.iloc[105])

['liver', 'support', 'supports', 'liver', 'function', 'stimulate', 'des_intoxication', 'and', 'restore', 'liver', 'function', 'eliminate', 'harmful', 'metabolite']


In [80]:
# Collect every paired term (tokens containing '_') across all phrase-detected sentences.
# Read them from the dataframe column (already split into token lists above) instead of the
# in-memory `trigram_sentences` list, which only exists when the phrase-detection cells were
# run in the current session and is not restored when the data is loaded back from S3.
gramlist = [word for sent in unigram_sents_pos_df.preprocessed_review for word in sent if '_' in word]

In [81]:
paired_words_frq = Counter(gramlist)
paired_words_frq.most_common(100)

[('do_not', 268437),
 ('weight_loss', 35842),
 ('side_effect', 28079),
 ('fish_oil', 24658),
 ('highly_recommend', 23157),
 ('garcinia_cambogia', 11913),
 ('dr._oz', 7675),
 ('blood_pressure', 6993),
 ('five_star', 6845),
 ('krill_oil', 6513),
 ('immune_system', 6365),
 ('customer_service', 6246),
 ('gel_cap', 6239),
 ('green_coffee_bean_extract', 5419),
 ('blood_sugar', 5019),
 ('raspberry_ketone', 4497),
 ('look_forward', 4409),
 ('green_tea', 4303),
 ('empty_stomach', 4001),
 ('500_mg', 3867),
 ('30_minute', 3716),
 ('appetite_suppressant', 3521),
 ('raspberry_ketones', 3372),
 ('hot_flash', 3326),
 ('green_coffee', 3264),
 ('1000_mg', 3197),
 ('fat_burner', 2647),
 ('dr_oz', 2513),
 ('new_chapter', 2357),
 ('fatty_acid', 2238),
 ('green_coffee_bean', 2222),
 ('expiration_date', 2139),
 ('fall_asleep', 2138),
 ('acid_reflux', 2104),
 ('pre_workout', 2028),
 ('b_complex', 1989),
 ('jarrow_formulas', 1824),
 ('bowel_movement', 1757),
 ('anti_inflammatory', 1740),
 ('milk_thistle', 173

In [83]:
# Find the 100 most infrequent paired words
paired_words_frq.most_common()[::-1][:100]

[('veep_university---', 1),
 ('expereienc_with_veep', 1),
 ('veep_lookcut', 1),
 ('8220_recommended&#8221', 1),
 ('wishful_thinking!ftc', 1),
 ('atrail_fibrillationso', 1),
 ('george_flansbaum', 1),
 ('34;daily_supplements&#34', 1),
 ('rebecca_peagler', 1),
 ('channel_uctciyg3wusbfxkgyjfpz8og', 1),
 ('su_rodilla', 1),
 ('productmuy_buen_producto', 1),
 ('bri_nutrition&#8217;s_unconditional', 1),
 ('8220;bowel_issues&#8221', 1),
 ('34;all_natural&#34;i', 1),
 ('occurrence_of_fosmon', 1),
 ('34;last_diet.&#34', 1),
 ('34;total_diet&#34', 1),
 ('defy_reccomemd', 1),
 ('greg_bastin', 1),
 ('navaho_teas', 1),
 ('tea!!._braniac', 1),
 ('34;truth_of_reality&#34;.', 1),
 ('hott_natural!!.', 1),
 ('34;roller_coaster&#34;.', 1),
 ('ashley_sutherland', 1),
 ('pleasantely_surprised!.', 1),
 ('xanthoparmelia_scabrosa', 1),
 ('34;xanthoparmelia_cautioni', 1),
 ('34;caffeine_blues&#34', 1),
 ('3-in1_solution!.', 1),
 ('slowness_and_harshness', 1),
 ('alb_flatten', 1),
 ('fabled_freshman', 1),
 ('eric

In [84]:
len(paired_words_frq)  # number of paired terms

48329

In [85]:
unigram_sents_pos_df.head()

Unnamed: 0,review_number,sentence_number,unigram_sentences,unigram_pos,preprocessed_review,has_paired_words
0,1,1,"[dpe, the, job, well]","[NOUN, DET, NOUN, ADV]","[dpe, the, job, well]",0
1,1,2,"[b, flax, d, be, a, regular, at, -PRON-, house]","[NOUN, NOUN, NOUN, VERB, DET, ADJ, ADP, ADJ, N...","[b, flax, d, be, a, regular, at, -PRON-, house]",0
2,1,3,"[-PRON-, do, -PRON-, job, simply, and, with, g...","[PRON, VERB, ADJ, NOUN, ADV, CCONJ, ADP, ADJ, ...","[-PRON-, do, -PRON-, job, simply, and, with, g...",0
3,1,4,"[-PRON-, be, reasonable, last, a, long, time, ...","[PRON, VERB, ADJ, VERB, DET, ADJ, NOUN, CCONJ,...","[-PRON-, be, reasonable, last, a, long, time, ...",0
4,1,5,"[good, product, good, price, good, result]","[ADJ, NOUN, ADJ, NOUN, ADJ, NOUN]","[good, product, good, price, good, result]",0


In [141]:
def handle_failed_pairing(i, skip, num_paired, sent, sent_paired, to_remove):
    """Undo one rejected phrase.

    Records position ``i`` of ``sent_paired`` in ``to_remove`` and appends the
    phrase's ``num_paired`` individual words, read from ``sent`` starting at
    ``i + skip``, to the end of ``sent_paired``. Both lists are modified in place.
    """
    first = i + skip
    to_remove.append(i)
    sent_paired.extend(sent[first:first + num_paired])


def filter_pairs(k, sent, sent_paired, sent_pos):
    """Keep only phrases with an accepted part-of-speech pattern; modifies ``sent_paired`` in place.

    k           -- identifier of the sentence, only used in the mismatch diagnostics
    sent        -- the sentence as a list of single words
    sent_paired -- the same sentence after phrase detection ('_' joins paired words)
    sent_pos    -- one part-of-speech tag per word of ``sent``

    Accepted patterns:
      * two words:   NOUN/ADJ followed by NOUN
      * three words: NOUN/ADJ, anything, NOUN/ADJ
    Phrases of more than three words are always rejected. A rejected phrase is
    removed from its position and its individual words are appended to the end
    of ``sent_paired``.

    When ``sent`` and ``sent_pos`` differ in length, diagnostics are printed and
    ``sent_paired`` is left untouched. Always returns None.
    """
    if len(sent) != len(sent_pos):
        print('len(sent): ', len(sent))
        print('len(sent_pos): ', len(sent_pos))
        print('sent: ', sent)
        print(' pos: ', sent_pos)
        print('k: ', k)
        return

    noun_like = ('NOUN', 'ADJ')
    rejected = []
    # how far positions in sent / sent_pos run ahead of positions in sent_paired
    offset = 0

    # Walk over a snapshot: words of rejected phrases get appended to sent_paired
    # while looping and must not be visited themselves.
    for position, token in enumerate(tuple(sent_paired)):
        if '_' not in token:
            continue

        n_words = token.count('_') + 1
        start = position + offset

        if n_words == 2:
            first_tag, last_tag = sent_pos[start], sent_pos[start + 1]
            accepted = first_tag in noun_like and last_tag == 'NOUN'
        elif n_words == 3:
            first_tag, last_tag = sent_pos[start], sent_pos[start + 2]
            accepted = first_tag in noun_like and last_tag in noun_like
        else:
            # more than 3 words paired - ignore pairing
            accepted = False

        if not accepted:
            handle_failed_pairing(position, offset, n_words, sent, sent_paired, rejected)

        # a phrase occupies one slot in sent_paired but n_words slots in sent / sent_pos
        offset += n_words - 1

    # drop the rejected phrases; their words were already added back individually
    for position in reversed(rejected):
        del sent_paired[position]

**Test the filtering function:**

Test 1:

In [97]:
sent = ['liver', 'support', 'supports', 'liver', 'function', 'stimulate', 'des', 'intoxication', 'and', 'restore', 'liver', 'function', 'eliminate', 'harmful', 'metabolite']
print(sent)

['liver', 'support', 'supports', 'liver', 'function', 'stimulate', 'des', 'intoxication', 'and', 'restore', 'liver', 'function', 'eliminate', 'harmful', 'metabolite']


In [98]:
sent_pos = ['PROPN', 'PROPN', 'PROPN', 'NOUN', 'NOUN', 'VERB', 'X', 'NOUN', 'CCONJ', 'VERB', 'NOUN', 'NOUN', 'VERB', 'ADJ', 'NOUN']
print(sent_pos)

['PROPN', 'PROPN', 'PROPN', 'NOUN', 'NOUN', 'VERB', 'X', 'NOUN', 'CCONJ', 'VERB', 'NOUN', 'NOUN', 'VERB', 'ADJ', 'NOUN']


In [99]:
sent_paired = ['liver', 'support', 'supports', 'liver_function', 'stimulate_des_intoxication_and_restore', 'liver_function', 'eliminate', 'harmful', 'metabolite']
print(sent_paired)

['liver', 'support', 'supports', 'liver_function', 'stimulate_des_intoxication_and_restore', 'liver_function', 'eliminate', 'harmful', 'metabolite']


In [100]:
# filter_pairs takes a sentence identifier as its first argument (only used in its
# mismatch diagnostics); any value works for this hand-made example.
filter_pairs(0, sent, sent_paired, sent_pos)

In [101]:
# Expected output:
print(['liver', 'support', 'supports', 'liver_function', 'liver_function', 'eliminate', 'harmful', 'metabolite', 'stimulate', 'des', 'intoxication', 'and', 'restore'])

['liver', 'support', 'supports', 'liver_function', 'liver_function', 'eliminate', 'harmful', 'metabolite', 'stimulate', 'des', 'intoxication', 'and', 'restore']


In [102]:
print(sent_paired)

['liver', 'support', 'supports', 'liver_function', 'liver_function', 'eliminate', 'harmful', 'metabolite', 'stimulate', 'des', 'intoxication', 'and', 'restore']


Test 2:

In [103]:
sent = ['-PRON-', 'have', 'a', 'lot', 'more', 'energy', 'and', 'have', 'not', 'be', 'sick', 'at', 'all']
print(sent)

['-PRON-', 'have', 'a', 'lot', 'more', 'energy', 'and', 'have', 'not', 'be', 'sick', 'at', 'all']


In [104]:
sent_pos = ['PRON', 'VERB', 'DET', 'NOUN', 'ADJ', 'NOUN', 'CCONJ', 'VERB', 'ADV', 'VERB', 'ADJ', 'ADV', 'ADV']
print(sent_pos)

['PRON', 'VERB', 'DET', 'NOUN', 'ADJ', 'NOUN', 'CCONJ', 'VERB', 'ADV', 'VERB', 'ADJ', 'ADV', 'ADV']


In [105]:
sent_paired = ['-PRON-', 'have', 'a_lot', 'more_energy', 'and', 'have', 'not', 'be', 'sick', 'at_all']
print(sent_paired)

['-PRON-', 'have', 'a_lot', 'more_energy', 'and', 'have', 'not', 'be', 'sick', 'at_all']


In [106]:
# filter_pairs takes a sentence identifier as its first argument (only used in its
# mismatch diagnostics); any value works for this hand-made example.
filter_pairs(0, sent, sent_paired, sent_pos)

In [107]:
print(sent_paired)

['-PRON-', 'have', 'more_energy', 'and', 'have', 'not', 'be', 'sick', 'a', 'lot', 'at', 'all']


### Filter Phrases

In [108]:
unigram_sents_pos_df.head()

Unnamed: 0,review_number,sentence_number,unigram_sentences,unigram_pos,preprocessed_review,has_paired_words
0,1,1,"[dpe, the, job, well]","[NOUN, DET, NOUN, ADV]","[dpe, the, job, well]",0
1,1,2,"[b, flax, d, be, a, regular, at, -PRON-, house]","[NOUN, NOUN, NOUN, VERB, DET, ADJ, ADP, ADJ, N...","[b, flax, d, be, a, regular, at, -PRON-, house]",0
2,1,3,"[-PRON-, do, -PRON-, job, simply, and, with, g...","[PRON, VERB, ADJ, NOUN, ADV, CCONJ, ADP, ADJ, ...","[-PRON-, do, -PRON-, job, simply, and, with, g...",0
3,1,4,"[-PRON-, be, reasonable, last, a, long, time, ...","[PRON, VERB, ADJ, VERB, DET, ADJ, NOUN, CCONJ,...","[-PRON-, be, reasonable, last, a, long, time, ...",0
4,1,5,"[good, product, good, price, good, result]","[ADJ, NOUN, ADJ, NOUN, ADJ, NOUN]","[good, product, good, price, good, result]",0


In [146]:
unigram_sents_pos_df.iloc[115644].values
# 115644    

array([18396, 115645,
       list(['the', 'bad', 'news', 'about', 'possible', 'copy', 'of', 'the', 'derry', 'New', 'Hampshire', 'product']),
       list(['DET', 'ADJ', 'NOUN', 'ADP', 'ADJ', 'NOUN', 'ADP', 'DET', 'PROPN', 'PROPN', 'NOUN']),
       list(['the', 'bad', 'news', 'about', 'possible', 'copy', 'of', 'the', 'derry_New_Hampshire', 'product']),
       1], dtype=object)

In [179]:
unigram_sents_pos_df.iloc[115645].values

array([18396, 115646,
       list(['-PRON-', 'first', 'experience', 'with', 'buy', 'this', 'product', 'online', 'be', 'talk', 'to', 'the', 'people', 'at', 'derry', 'n.h.my', '1st', 'buy', 'be', 'from', 'a', 'health', 'products', 'store', 'but', 'derry', 'refer', '-PRON-', 'to', 'buy', '-PRON-', 'online']),
       list(['ADJ', 'ADJ', 'NOUN', 'ADP', 'VERB', 'DET', 'NOUN', 'ADV', 'VERB', 'VERB', 'ADP', 'DET', 'NOUN', 'ADP', 'PROPN', 'ADJ', 'NOUN', 'NOUN', 'VERB', 'ADP', 'DET', 'PROPN', 'PROPN', 'NOUN', 'CCONJ', 'PROPN', 'VERB', 'PRON', 'ADP', 'VERB', 'PRON', 'ADV']),
       list(['-PRON-', 'first', 'experience', 'with', 'buy', 'this', 'product', 'online', 'be', 'talk', 'to', 'the', 'people', 'at', '1st', 'buy', 'be', 'from', 'a', 'health', 'products', 'store', 'but', 'derry', 'refer', '-PRON-', 'to', 'buy', '-PRON-', 'online', 'derry', 'n.h.my']),
       1], dtype=object)

In [186]:
doc = "The bad news about possible copies of the Derry, N.H. product.. My first experience with buying this product online was talking to the people at Derry, N.H.My 1st buy was from a Health Products store, but Derry referred me to buying it online.My biggest hope is that this is the same products because it's fantastic.Your delivery was fine.  I tend to trust Amazon, but check everything; even counted the # of tabs I recv'd : )"

In [190]:
[token.lemma_ for token in nlp(doc)][10:15]

['New Hampshire', 'product', '..', '-PRON-', 'first']

In [191]:
[token.pos_ for token in nlp(doc)][10:15]

['PROPN', 'NOUN', 'PUNCT', 'ADJ', 'ADJ']

In [193]:
[token.orth_ for token in nlp(doc)][10:15]

['N.H.', 'product', '..', 'My', 'first']

In [181]:
nlp = spacy.load('en')

In [178]:
df.loc[df.reviewText.str.contains('bad news about'), ['reviewText']].reviewText.values

array([ "The bad news about possible copies of the Derry, N.H. product.. My first experience with buying this product online was talking to the people at Derry, N.H.My 1st buy was from a Health Products store, but Derry referred me to buying it online.My biggest hope is that this is the same products because it's fantastic.Your delivery was fine.  I tend to trust Amazon, but check everything; even counted the # of tabs I recv'd : )",
       "Jarrow is A++++. Jarrow is my preferred brand.  The price and quality can't be beat. I've been told by my doctors to take my vitamin D3. I do not tolerate sunlight, and I am a homebody.  I was taking vitamin D, but my dermatologist has me taking high doses of D3.  D3 is better than D alone.  He is a vitamin nut, and he has me taking all kinds of supplements.  D3 is very important for bones, cells, immune system.  And, I'm sure more.I just had my vitamin levels checked and I am doing really good. I'm 58, and haven't worked out in 2 years.  I worked 

In [167]:
df.reviewText.iloc[0]

'Dpes the job well. B-flax-D is a regular at our house. It does its job simply and with good results. It is reasonable, lasts a long time, and is able to be obtained with free shipping if you hunt around. Good product, good price, good results.'

In [142]:
preprocessed_reviews = unigram_sents_pos_df.preprocessed_review.tolist()
unigram_sentences = unigram_sents_pos_df.unigram_sentences.tolist()
unigram_pos = unigram_sents_pos_df.unigram_pos.tolist()

In [143]:
# Filter the detected phrases of every sentence by their part-of-speech pattern.
# filter_pairs edits each preprocessed_reviews[i] list in place; for sentences whose word and
# tag counts differ it prints diagnostics (including the row index i) and leaves them unfiltered.
# NOTE(review): these lists look like the same objects held in unigram_sents_pos_df (they come
# from .tolist() above), so the dataframe column presumably changes as well - confirm.
for i in tqdm(range(len(preprocessed_reviews))):
    filter_pairs(i, sent=unigram_sentences[i], sent_paired=preprocessed_reviews[i], sent_pos=unigram_pos[i])

  5%|▍         | 169409/3605491 [00:00<00:06, 564657.72it/s]

len(sent):  29
len(sent_pos):  28
sent:  ['-PRON-', 'North', 'Dakota', 'turn', '-PRON-', 'on', 'to', 'triphala', 'as', 'a', 'way', 'to', 'stop', 'the', 'pain', 'and', 'discomfort', 'associate', 'with', 'these', 'incident', 'and', 'the', 'herbs', 'work', 'within', '20', '30', 'minute']
 pos:  ['ADJ', 'PROPN', 'VERB', 'PRON', 'PART', 'ADP', 'PROPN', 'ADP', 'DET', 'NOUN', 'PART', 'VERB', 'DET', 'NOUN', 'CCONJ', 'NOUN', 'VERB', 'ADP', 'DET', 'NOUN', 'CCONJ', 'DET', 'NOUN', 'NOUN', 'ADP', 'NUM', 'NUM', 'NOUN']
k:  105365
len(sent):  12
len(sent_pos):  11
sent:  ['the', 'bad', 'news', 'about', 'possible', 'copy', 'of', 'the', 'derry', 'New', 'Hampshire', 'product']
 pos:  ['DET', 'ADJ', 'NOUN', 'ADP', 'ADJ', 'NOUN', 'ADP', 'DET', 'PROPN', 'PROPN', 'NOUN']
k:  115644
len(sent):  9
len(sent_pos):  8
sent:  ['so', '-PRON-', 'be', 'going', 'to', 'cut', '2', 'da', 'chase']
 pos:  ['ADV', 'PRON', 'VERB', 'DET', 'NOUN', 'NUM', 'PROPN', 'NOUN']
k:  120909
len(sent):  10
len(sent_pos):  9
sent:  ['th

 11%|█         | 404989/3605491 [00:00<00:05, 578534.79it/s]

len(sent):  24
len(sent_pos):  23
sent:  ['dear', 'fellow', 'readers', '-PRON-', 'first', 'hear', 'about', 'this', 'product', 'from', 'dr.', 'rosenfeld', 'm.d.', 'professor', 'of', 'clinical', 'medicine', 'at', 'wild', 'cornell', 'university', 'in', 'New', 'York']
 pos:  ['ADJ', 'PROPN', 'PROPN', 'PRON', 'ADV', 'VERB', 'ADP', 'DET', 'NOUN', 'ADP', 'PROPN', 'PROPN', 'PROPN', 'NOUN', 'ADP', 'PROPN', 'PROPN', 'ADP', 'PROPN', 'PROPN', 'PROPN', 'ADP', 'PROPN']
k:  327453
len(sent):  6
len(sent_pos):  5
sent:  ['dr.', 'marlborough', 's.', 'nichols', 'North', 'Dakota']
 pos:  ['PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN']
k:  349535
len(sent):  14
len(sent_pos):  13
sent:  ['in3weeks', 'i', 'take', 'before', 'and', 'after', 'pic', 'totally', 'amazing', 'non', 'believer', 'fayetteville', 'North', 'Carolina']
 pos:  ['ADJ', 'PRON', 'VERB', 'ADV', 'CCONJ', 'ADP', 'NOUN', 'ADV', 'ADJ', 'ADJ', 'NOUN', 'PROPN', 'PROPN']
k:  355290
len(sent):  12
len(sent_pos):  11
sent:  ['have', 'buy', '-PRON-', 'f

 18%|█▊        | 642316/3605491 [00:01<00:05, 583906.98it/s]

len(sent):  12
len(sent_pos):  11
sent:  ['taste', 'like', 'crap', 'but', '-PRON-', 'be', 'going', 'to', 'drink', '-PRON-', 'this', 'once']
 pos:  ['VERB', 'ADP', 'NOUN', 'CCONJ', 'PRON', 'VERB', 'DET', 'NOUN', 'PRON', 'DET', 'ADV']
k:  544200
len(sent):  29
len(sent_pos):  28
sent:  ['going', 'to', 'also', 'say', 'this', '-PRON-', 'have', 'to', 'work', 'twice', 'as', 'hard', 'on', '-PRON-', 'core', 'then', '-PRON-', 'arm', 'because', '-PRON-', 'knock', 'weight', 'off', 'the', 'area', '-PRON-', 'always', 'work', 'on']
 pos:  ['DET', 'ADV', 'VERB', 'DET', 'PRON', 'VERB', 'PART', 'VERB', 'ADV', 'ADV', 'ADJ', 'ADP', 'ADJ', 'NOUN', 'ADV', 'ADJ', 'NOUN', 'ADP', 'PRON', 'VERB', 'NOUN', 'ADP', 'DET', 'NOUN', 'ADJ', 'ADV', 'VERB', 'ADP']
k:  545289
len(sent):  19
len(sent_pos):  18
sent:  ['-PRON-', 'be', 'going', 'to', 'go', 'out', 'on', 'a', 'limb', 'and', 'say', '-PRON-', 'be', 'not', '34;just', '-PRON-', 'and', '-PRON-', 'mixing&#34']
 pos:  ['PRON', 'VERB', 'DET', 'NOUN', 'PART', 'ADP', '

 23%|██▎       | 822561/3605491 [00:01<00:04, 587422.39it/s]

len(sent):  17
len(sent_pos):  16
sent:  ['but', '-PRON-', 'be', 'enteric', 'coat', 'and', 'enteric', 'coating', 'have', 'some', 'bad', 'effect', 'accord', 'to', '-PRON-', 'North', 'Dakota']
 pos:  ['CCONJ', 'PRON', 'VERB', 'ADJ', 'VERB', 'CCONJ', 'ADJ', 'NOUN', 'VERB', 'DET', 'ADJ', 'NOUN', 'VERB', 'ADP', 'ADJ', 'PROPN']
k:  762711
len(sent):  23
len(sent_pos):  22
sent:  ['so', 'many', 'people', 'be', 'deficient', 'in', 'magnesium', 'and', 'the', 'effect', 'be', 'far', 'reach', 'read', 'the', 'magnesium', 'miracle', 'by', 'carolyn', 'dean', 'm.d.', 'North', 'Dakota']
 pos:  ['ADV', 'ADJ', 'NOUN', 'VERB', 'ADJ', 'ADP', 'NOUN', 'CCONJ', 'DET', 'NOUN', 'VERB', 'ADV', 'VERB', 'VERB', 'DET', 'PROPN', 'PROPN', 'ADP', 'PROPN', 'PROPN', 'PROPN', 'PROPN']
k:  767548
len(sent):  22
len(sent_pos):  21
sent:  ['-PRON-', 'tell', 'everyone', 'about', 'this', 'product', 'and', 'refer', '-PRON-', 'to', 'the', 'book', 'the', 'miracle', 'of', 'magnesium', 'by', 'carolyn', 'deane', 'm.d.', 'North', 'Da

 31%|███       | 1123573/3605491 [00:01<00:04, 589629.36it/s]

len(sent):  44
len(sent_pos):  43
sent:  ['michael', 'murray', 'North', 'Dakota', 'author', 'of', '`', 'the', 'pill', 'book', 'guide', 'to', 'natural', 'medicines', 'write', '`', 'subject', 'take', 'carnitine', 'show', 'significant', 'improvement', 'in', 'heart', 'rate', 'blood', 'pressure', 'angina', 'attack', 'rhythm', 'disturbance', 'and', 'clinical', 'sign', 'of', 'impaired', 'heart', 'function', 'compare', 'to', 'the', 'subject', 'take', 'placebo']
 pos:  ['PROPN', 'PROPN', 'PROPN', 'NOUN', 'ADP', 'PUNCT', 'DET', 'PROPN', 'PROPN', 'PROPN', 'ADP', 'PROPN', 'PROPN', 'VERB', 'PUNCT', 'NOUN', 'VERB', 'NOUN', 'VERB', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'CCONJ', 'ADJ', 'NOUN', 'ADP', 'ADJ', 'NOUN', 'NOUN', 'VERB', 'ADP', 'DET', 'NOUN', 'VERB', 'NOUN']
k:  1037336
len(sent):  2
len(sent_pos):  1
sent:  ['North', 'Dakota']
 pos:  ['PROPN']
k:  1060330


 38%|███▊      | 1365770/3605491 [00:02<00:03, 591953.91it/s]

len(sent):  28
len(sent_pos):  27
sent:  ['-PRON-', 'be', 'very', 'happy', 'to', 'find', 'j.crow', "'s", 'lugol', "'s", '2', 'iodine', 'solution', 'online', 'and', 'at', 'a', 'great', 'price!it', 'be', 'exactly', 'what', '-PRON-', 'North', 'Dakota', 'm.d', 'doctor', 'prescribe']
 pos:  ['PRON', 'VERB', 'ADV', 'ADJ', 'PART', 'VERB', 'PROPN', 'PART', 'PROPN', 'PART', 'NUM', 'PROPN', 'PROPN', 'ADV', 'CCONJ', 'ADP', 'DET', 'ADJ', 'NOUN', 'VERB', 'ADV', 'NOUN', 'ADJ', 'PROPN', 'PROPN', 'NOUN', 'VERB']
k:  1248947
len(sent):  21
len(sent_pos):  20
sent:  ['decrease', 'absorptionone', 'of', 'the', 'main', 'health', 'risk', 'of', 'magnesium', 'stearate', 'accord', 'to', 'ron', 'schmid', 'North', 'Dakota', 'in', '-PRON-', 'article', 'dietary', 'supplements']
 pos:  ['VERB', 'PROPN', 'ADP', 'DET', 'ADJ', 'NOUN', 'NOUN', 'ADP', 'NOUN', 'ADJ', 'VERB', 'ADP', 'PROPN', 'PROPN', 'PROPN', 'ADP', 'ADJ', 'NOUN', 'PROPN', 'PROPN']
k:  1253984
len(sent):  28
len(sent_pos):  27
sent:  ['unlike', 'other', '

 41%|████      | 1486742/3605491 [00:02<00:03, 592732.98it/s]

len(sent):  14
len(sent_pos):  13
sent:  ['manufacture', 'New', 'York', 'in', 'the', 'usai', 'do', 'not', 'see', 'anywhere', 'if', 'this', 'be', 'kosher']
 pos:  ['VERB', 'PROPN', 'ADP', 'DET', 'PROPN', 'VERB', 'ADV', 'VERB', 'ADV', 'ADP', 'DET', 'VERB', 'PROPN']
k:  1375316
len(sent):  18
len(sent_pos):  17
sent:  ['-PRON-', 'be', 'look', 'for', 'this', 'product', 'here', 'in', 'North', 'Carolina', 'and', 'to', '-PRON-', 'surprise', 'no', 'one', 'carry', '-PRON-']
 pos:  ['PRON', 'VERB', 'VERB', 'ADP', 'DET', 'NOUN', 'ADV', 'ADP', 'PROPN', 'CCONJ', 'ADP', 'ADJ', 'NOUN', 'DET', 'NOUN', 'VERB', 'PRON']
k:  1428448
len(sent):  6
len(sent_pos):  5
sent:  ['cathy', 'hopkins', 'North', 'Dakota', 'cnc', 'bep']
 pos:  ['PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN']
k:  1490211
len(sent):  5
len(sent_pos):  4
sent:  ['v', 'gambill', 'hendersonville', 'North', 'Carolina']
 pos:  ['PROPN', 'PROPN', 'PROPN', 'PROPN']
k:  1492454
len(sent):  18
len(sent_pos):  17
sent:  ['-PRON-', 'next', 'thought',

 45%|████▍     | 1607644/3605491 [00:02<00:03, 593021.78it/s]

len(sent):  6
len(sent_pos):  5
sent:  ['-PRON-', 'be', 'going', 'to', 'happy', 'cu']
 pos:  ['PRON', 'VERB', 'DET', 'ADJ', 'ADP']
k:  1514981


 48%|████▊     | 1727903/3605491 [00:02<00:03, 593575.51it/s]

len(sent):  26
len(sent_pos):  25
sent:  ['-PRON-', 'doctor', 'a', 'North', 'Dakota', 'prescribe', 'pgx', 'ultra', 'along', 'with', 'supplement', 'glysen', 'and', 'photoglysen', 'to', 'help', 'control', '-PRON-', 'blood', 'sugar', 'and', 'to', 'help', '-PRON-', 'lose', 'weight']
 pos:  ['ADJ', 'NOUN', 'DET', 'PROPN', 'VERB', 'PROPN', 'PROPN', 'ADP', 'ADP', 'NOUN', 'PROPN', 'CCONJ', 'PROPN', 'PART', 'VERB', 'VERB', 'ADJ', 'NOUN', 'NOUN', 'CCONJ', 'PART', 'VERB', 'PRON', 'VERB', 'NOUN']
k:  1649079
len(sent):  12
len(sent_pos):  11
sent:  ['-PRON-', 'North', 'Dakota', 'say', '-PRON-', 'body', 'process', 'the', 'fish', 'oil', 'better', 'though']
 pos:  ['ADJ', 'PROPN', 'VERB', 'ADJ', 'NOUN', 'VERB', 'DET', 'NOUN', 'NOUN', 'ADV', 'ADV']
k:  1709543


 51%|█████▏    | 1849569/3605491 [00:03<00:02, 594523.63it/s]

len(sent):  64
len(sent_pos):  63
sent:  ['well', '20', 'capsules!speaking', 'of', 'which', 'the', 'member', 'of', 'gaia', 'herb', "'s", 'scientific', 'advisory', 'board', 'be', 'all', 'say', 'to', 'doctor', 'of', 'naturopathy', 'and', '-PRON-', 'even', 'have', 'the', 'designation', 'North', 'Dakota', 'after', '-PRON-', 'name', 'and', 'call', '-PRON-', 'doctor', 'and', '-PRON-', 'take', 'money', 'for', 'diagnose', 'and', 'treat', 'people', 'people', 'who', 'may', 'really', 'have', 'a', 'disease', 'that', 'real', 'doctor', 'could', 'treat', 'use', 'ludicrous', 'belief', 'that', 'flatly', 'contradict', 'science']
 pos:  ['INTJ', 'NUM', 'NOUN', 'ADP', 'ADJ', 'DET', 'NOUN', 'ADP', 'PROPN', 'PROPN', 'PART', 'PROPN', 'PROPN', 'PROPN', 'VERB', 'DET', 'VERB', 'ADP', 'NOUN', 'ADP', 'PROPN', 'CCONJ', 'PRON', 'ADV', 'VERB', 'DET', 'NOUN', 'PROPN', 'ADP', 'ADJ', 'NOUN', 'CCONJ', 'VERB', 'PRON', 'NOUN', 'CCONJ', 'PRON', 'VERB', 'NOUN', 'ADP', 'VERB', 'CCONJ', 'VERB', 'NOUN', 'NOUN', 'NOUN', 'VERB',

 56%|█████▋    | 2031789/3605491 [00:03<00:02, 595396.89it/s]

len(sent):  15
len(sent_pos):  14
sent:  ['-PRON-', 'be', 'going', 'to', 'skeptic', 'so', '-PRON-', 'can', 'imagine', 'just', 'how', 'high', '-PRON-', 'hope', 'be']
 pos:  ['PRON', 'VERB', 'DET', 'NOUN', 'ADP', 'PRON', 'VERB', 'VERB', 'ADV', 'ADV', 'ADJ', 'ADJ', 'NOUN', 'VERB']
k:  1954788
len(sent):  11
len(sent_pos):  10
sent:  ['-PRON-', 'be', 'going', 'to', 'always', 'buy', 'this', 'juice.cactus', 'juice', 'for', 'life']
 pos:  ['PRON', 'VERB', 'DET', 'ADV', 'VERB', 'DET', 'NOUN', 'NOUN', 'ADP', 'NOUN']
k:  2010305
len(sent):  18
len(sent_pos):  17
sent:  ['-PRON-', 'next', 'thought', 'be', 'to', 'contact', 'the', 'bbb', 'of', 'New', 'Jersey', 'to', 'see', 'if', '-PRON-', 'could', 'assist', '-PRON-']
 pos:  ['ADJ', 'ADJ', 'NOUN', 'VERB', 'PART', 'VERB', 'DET', 'NOUN', 'ADP', 'PROPN', 'PART', 'VERB', 'ADP', 'PRON', 'VERB', 'VERB', 'PRON']
k:  2023289
len(sent):  9
len(sent_pos):  8
sent:  ['this', 'be', 'the', 'one.author', 'jan', 'mcbarron', 'm.d.', 'North', 'Dakota']
 pos:  ['DET'

 63%|██████▎   | 2273434/3605491 [00:03<00:02, 595953.90it/s]

len(sent):  19
len(sent_pos):  18
sent:  ['for', 'other', 'cope', 'with', 'adrenal', 'stress', '-PRON-', 'recommend', 'james', 'l.', 'wilson', 'North', 'Dakota', 'd.c.', 'ph.d.', "'s", 'book', 'adrenal', 'fatigue']
 pos:  ['ADP', 'NOUN', 'VERB', 'ADP', 'ADJ', 'NOUN', 'PRON', 'VERB', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'NOUN', 'PART', 'NOUN', 'PROPN', 'PROPN']
k:  2185746
len(sent):  12
len(sent_pos):  11
sent:  ['try', 'and', '-PRON-', 'will', 'see', 'what', '-PRON-', 'be', 'going', 'to', 'talkin', 'about']
 pos:  ['VERB', 'CCONJ', 'PRON', 'VERB', 'VERB', 'NOUN', 'PRON', 'VERB', 'DET', 'NOUN', 'ADP']
k:  2260921
len(sent):  8
len(sent_pos):  7
sent:  ['-PRON-', 'North', 'Dakota', 'recommend', '-PRON-', 'for', 'candida', 'issue']
 pos:  ['ADJ', 'PROPN', 'VERB', 'PRON', 'ADP', 'ADJ', 'NOUN']
k:  2280891


 73%|███████▎  | 2641208/3605491 [00:04<00:01, 597734.27it/s]

len(sent):  35
len(sent_pos):  34
sent:  ['34;the', 'u.s.', 'food', 'and', 'drug', 'administration', 'be', 'warn', 'consumer', 'to', 'immediately', 'stop', 'use', 'hydroxycut', 'product', 'by', 'iovate', 'health', 'sciences', 'inc.', 'of', 'oakville', 'ontario', 'and', 'distribute', 'by', 'iovate', 'health', 'sciences', 'usa', 'inc.', 'of', 'blasdell', 'New', 'York']
 pos:  ['NUM', 'PROPN', 'PROPN', 'CCONJ', 'PROPN', 'PROPN', 'VERB', 'VERB', 'NOUN', 'PART', 'ADV', 'VERB', 'VERB', 'PROPN', 'NOUN', 'ADP', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'ADP', 'PROPN', 'PROPN', 'CCONJ', 'VERB', 'ADP', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'ADP', 'PROPN', 'PROPN']
k:  2522354
len(sent):  42
len(sent_pos):  41
sent:  ['-PRON-', 'have', 'be', 'on', '500', 'mg', 'of', 'acetyl', 'l', 'carnitine', 'and', '200', 'mg', 'of', 'alpha', 'lipoic', 'acid', 'ala', 'to', 'fight', 'off', 'effect', 'of', 'age', 'on', 'the', 'brain', 'nov', '2003', 'reader', 'digest', 'article', 'the', 'end', 'of', 'aging.")i',

 77%|███████▋  | 2762711/3605491 [00:04<00:01, 597912.52it/s]

len(sent):  7
len(sent_pos):  6
sent:  ['be', 'going', 'to', 'on', '-PRON-', '3rd', 'patch']
 pos:  ['VERB', 'DET', 'ADP', 'ADJ', 'ADJ', 'NOUN']
k:  2664790
len(sent):  13
len(sent_pos):  12
sent:  ['going', 'to', 'order', '-PRON-', 'again.why', 'pay', 'money', 'for', 'something', 'that', 'do', 'not', 'work']
 pos:  ['DET', 'NOUN', 'PRON', 'INTJ', 'VERB', 'NOUN', 'ADP', 'NOUN', 'ADJ', 'VERB', 'ADV', 'VERB']
k:  2710977
len(sent):  18
len(sent_pos):  17
sent:  ['this', 'brand', 'of', 'magnesium', 'be', 'prescribe', 'to', '-PRON-', 'by', '-PRON-', 'North', 'Dakota', 'and', 'have', 'truly', 'make', 'a', 'difference']
 pos:  ['DET', 'NOUN', 'ADP', 'NOUN', 'VERB', 'VERB', 'ADP', 'PRON', 'ADP', 'ADJ', 'PROPN', 'CCONJ', 'VERB', 'ADV', 'VERB', 'DET', 'NOUN']
k:  2720310


 82%|████████▏ | 2945322/3605491 [00:04<00:01, 598397.64it/s]

len(sent):  4
len(sent_pos):  3
sent:  ['love', 'North', 'Carolina', 'wholemega']
 pos:  ['VERB', 'PROPN', 'PROPN']
k:  2870783
len(sent):  5
len(sent_pos):  4
sent:  ['-PRON-', 'be', 'going', 'to', 'female']
 pos:  ['PRON', 'VERB', 'DET', 'NOUN']
k:  2941161
len(sent):  14
len(sent_pos):  13
sent:  ['unit', 'j', 'greenfield', 'North', 'Carolina', '29607i', 'follow', 'the', 'direction', 'and', 'have', 'absolutely', 'no', 'result']
 pos:  ['NOUN', 'PROPN', 'NOUN', 'PROPN', 'NOUN', 'VERB', 'DET', 'NOUN', 'CCONJ', 'VERB', 'ADV', 'DET', 'NOUN']
k:  2988696


 87%|████████▋ | 3127623/3605491 [00:05<00:00, 598794.90it/s]

len(sent):  8
len(sent_pos):  7
sent:  ['no', 'filler', 'and', 'produce', 'in', 'brevard', 'North', 'Carolina']
 pos:  ['DET', 'NOUN', 'CCONJ', 'VERB', 'ADP', 'PROPN', 'PROPN']
k:  3033584


 93%|█████████▎| 3370414/3605491 [00:05<00:00, 599077.94it/s]

len(sent):  8
len(sent_pos):  7
sent:  ['but', '-PRON-', 'be', 'going', 'to', 'keep', 'use', '-PRON-']
 pos:  ['CCONJ', 'PRON', 'VERB', 'DET', 'NOUN', 'VERB', 'PRON']
k:  3252070
len(sent):  11
len(sent_pos):  10
sent:  ['going', 'to', 'try', 'one', 'more', 'month', 'and', 'see', 'what', 'happen', 'hopefully']
 pos:  ['DET', 'NOUN', 'NUM', 'ADJ', 'NOUN', 'CCONJ', 'VERB', 'NOUN', 'VERB', 'ADV']
k:  3275967
len(sent):  5
len(sent_pos):  4
sent:  ['-PRON-', 'be', 'going', 'to', 'fan']
 pos:  ['PRON', 'VERB', 'DET', 'NOUN']
k:  3323704


 97%|█████████▋| 3491877/3605491 [00:05<00:00, 598960.33it/s]

len(sent):  55
len(sent_pos):  54
sent:  ['guideline', 'fromm.d.', "'s", 'and', 'North', 'Dakota', "'s", 'who', 'have', 'use', 'nutritional', 'iodine', 'for', 'year', 'in', '-PRON-', 'own', 'practice', 'not', 'only', 'enable', '-PRON-', 'to', 'understand', 'the', 'process', 'which', '-PRON-', 'appreciate', 'but', 'help', '-PRON-', 'feel', 'confident', 'that', '-PRON-', 'supplementation', 'could', 'be', 'do', 'safely', 'and', 'in', 'the', 'manner', 'most', 'likely', 'to', 'provide', 'the', 'result', '-PRON-', 'be', 'look', 'for']
 pos:  ['NOUN', 'NOUN', 'PART', 'CCONJ', 'PROPN', 'PART', 'NOUN', 'VERB', 'VERB', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'ADJ', 'ADJ', 'NOUN', 'ADV', 'ADV', 'VERB', 'PRON', 'PART', 'VERB', 'DET', 'NOUN', 'ADJ', 'PRON', 'VERB', 'CCONJ', 'VERB', 'PRON', 'VERB', 'ADJ', 'ADP', 'ADJ', 'NOUN', 'VERB', 'VERB', 'VERB', 'ADV', 'CCONJ', 'ADP', 'DET', 'NOUN', 'ADV', 'ADJ', 'PART', 'VERB', 'DET', 'NOUN', 'PRON', 'VERB', 'VERB', 'ADP']
k:  3407908
len(sent):  4
len(sent_pos):

100%|██████████| 3605491/3605491 [00:06<00:00, 599260.98it/s]


## BUG: something is off with the row above where it fails - why is `sent` shorter than `sent_paired`?
## TODO: also, only apply `filter_pos` to rows with `has_paired = 1`

In [345]:
# Debug inspection: POS tags of the sentence being investigated for the BUG noted above.
# NOTE(review): `sent_pos` is not assigned in any cell visible here -- presumably
# left over from an interactive debugging session; confirm it exists on a fresh run.
sent_pos

['PRON',
 'VERB',
 'VERB',
 'PROPN',
 'PART',
 'NUM',
 'PROPN',
 'ADP',
 'ADP',
 'ADJ',
 'NOUN']

In [346]:
# Debug inspection: tokens of the sentence being investigated (11 items in the recorded output).
# NOTE(review): `sent` is not assigned in any cell visible here -- presumably
# left over from an interactive debugging session; confirm it exists on a fresh run.
sent

['-PRON-',
 'do',
 'recommend',
 'women',
 "'s",
 'one',
 'a_day',
 'though',
 'with',
 'extra',
 'calcium']

In [347]:
# Debug inspection: the paired version of the same sentence. In the recorded
# output it has 12 items, one more than `sent`, which is the mismatch under investigation.
# NOTE(review): `sent_paired` is not assigned in any cell visible here -- confirm
# it exists on a fresh run.
sent_paired

['-PRON-',
 'do',
 'recommend',
 'one',
 'though',
 'with',
 'extra',
 'calcium',
 'women',
 "'s",
 'a_day',
 'though']

In [170]:
# number of entries (sentences) in the trigram corpus
len(trigram_sentences)

3605491

In [158]:
# total number of tokens across all unigram sentences
# (a token count, not a vocabulary size: repeated words are counted every time).
# Summing the per-sentence lengths avoids materialising one huge throwaway list.
sum(len(sentence) for sentence in unigram_sentences)

43362695

In [159]:
# total number of tokens across all trigram sentences
# (a token count, not a vocabulary size: repeated terms are counted every time).
# Summing the per-sentence lengths avoids materialising one huge throwaway list.
sum(len(sentence) for sentence in trigram_sentences)

21960569

In [160]:
# Flatten the list of trigram sentences into one long list of tokens,
# preserving sentence order and token order within each sentence.
trigrams_flat = []
for sentence in trigram_sentences:
    trigrams_flat.extend(sentence)

In [161]:
# total number of tokens in the flattened trigram list
len(trigrams_flat)

21960569

In [162]:
# peek at the first 15 tokens of the flattened trigram list
print(trigrams_flat[:15])

['dpe', 'job', 'b', 'flax', 'd', 'regular', '-PRON-', 'house', '-PRON-', '-PRON-', 'job', 'simply', 'good', 'result', '-PRON-']


In [163]:
# Distinct tokens that contain an underscore, i.e. terms joined by the phrase models.
paired_words = {token for token in trigrams_flat if '_' in token}

In [173]:
# number of distinct paired (underscore-joined) terms
len(paired_words)

203277

In [164]:
# peek at tokens 100-149 of the flattened list to see paired terms in context
print(trigrams_flat[100:150])

['mouth', 'quickly', 'lozenge', 'formula', 'dissolve', 'slowly', 'preferable', 'accord', '-PRON-', 'research', 'this_product', 'great', 'side_effect', '-PRON-', '-PRON-', 'cold', 'sore_throat', 'soon', 'start', '-PRON-', 'every_day', '-PRON-', 'start', 'come', 'cold', '-PRON-', 'usual', 'symptom', 'anticipate', 'sick', 'day', '-PRON-', 'usual', 'pattern', '-PRON-', 'sick', 'anticipate', 'taking', 'this_product', 'reason', '-PRON-', 'come', '-PRON-', 'cold', 'sore_throat', '-PRON-', 'great', '-PRON-', 'recommend', 'this_product']


In [165]:
# Print every paired term whose text contains '_no_' or 'not_'.
# This is a plain substring test, so it also picks up terms where 'not_' is
# only the tail of a longer word (e.g. 'pinot_noir' in the output below).
for term in paired_words:
    if any(marker in term for marker in ('_no_', 'not_')):
        print(term)

night.not_a_miracle_cure
pinot_noir
solublenot_certify_kosher_or_halal$8.99
count)*****fat_solublenot_certify_kosher
240_softgels)****fat_solublenot_certify
solublenot_certify_kosher_or_halal$13.78
8220;not_hungry&#8221
distilledmercury_freenot_enteric_coatednot
cholesterolmolecularly_distilledmercury_freenot_enteric
estafa!not_worth_the_money
34;not_guilty&#34
hacking_snot_fill
solublenot_certify_kosher_or_halal$27.99
each)****triglycerides_formnot_certify_kosher
freshness_34;not_rancid&#34
formnot_certify_kosher_or_halal$45.82
formnot_certify_kosher_or_halal$45.46
hungry.not_a_stimulant
240_softgels,)fat_solublenot_certify
stearateschelatedvegetariannot_enteric_coatedcontain_laxative
34;not_work&#34;.
enteric_coatednot_vegetarianone
supply).)ethyl_ester_formnot_certify
22.8=_78.6not_373i_freak
cholesterolmolecularly_distilledno_mercurynot_enteric
solublenot_certify_kosher_or_halal$27.77
90-count)*****ubiquinolfat_solublenot_certify_kosher
mercurynot_enteric_coatednot_vegetarianphosph

In [166]:
# Plain-list copy of the saved unigram text column, used for substring searches below.
unigram_text = unigram_sentences_savedf['unigram_sentences'].tolist()

In [167]:
# search for one of the weird paired terms in the list above: 'solublenot_certify_kosher'
# (written as space-separated unigrams); this shows the text it was a part of
# before getting paired.
# next() stops at the first hit instead of collecting every match in the corpus
# first; it raises StopIteration if nothing matches.
next(sent for sent in unigram_text if 'not certify kosher' in sent)

"magnesium malate magnesium glycinatewater solublenot certify kosher or halal$ n a for 120 200 mg capsule on amazonrecommended serving two capsulesprice per gel cap $ n a use amazon 's price)price per 100 mgs magnesium $ n a use amazon 's price)no soyno gmosno cholesterolno stearateschelatedvegetariannot enteric coatedno laxative propertiesno ingredient source from chinaphone number 800 476 3542manufactur in the u.s.a.ingredient magnesium malate chelate magnesium glycinate and vegetarian capsule non gmo plant cellulose)doctor 's good high absorption 100 chelated magnesium"

**Clearly, there was a problem in the unigram terms as well since `soluble` and `not` are joined together (along with other words).**

In [168]:
# find the same review in the original unprocessed reviews dataset
# (located via the literal '$17.09') and show its first 2000 characters.
# next() stops at the first hit instead of collecting every match first;
# it raises StopIteration if nothing matches.
next(sent for sent in text if '$17.09' in sent)[:2000]

"KAL Magnesium Glycinate 400 vs Nine Leading Magnesium Supplements. ***Here is a side-by-side comparison of ten leading magnesium supplements: Nutrigold Magnesium Gold, Doctor's Best High Absorption 100% Chelated Magnesium, JigSaw Magnesium w/SRT, Now Foods Magnesium Citrate (200 mgs), Now Foods Magnesium Capsules (400 mgs), Solgar Magnesium Citrate, Life Extension Magnesium Caps, Thorne Research Magnesium Citrate, Bluebonnet Nutrition Albion Chelated Magnesium, and KAL Magnesium Glycinate 400.Magnesium is needed for more than 300 biochemical reactions in the body. It helps maintain normal muscle and nerve function, keeps heart rhythm steady, supports a healthy immune system, and keeps bones strong. Magnesium also helps regulate blood sugar levels, promotes normal blood pressure, and is known to be involved in energy metabolism and protein synthesis. There is an increased interest in the role of magnesium in preventing and managing disorders such as hypertension, cardiovascular disease

**In the unprocessed reviews as well, `soluble` and `not` are joined together (along with other words). This is a problem with the data itself, not an outcome of the preprocessing.**

In [169]:
# Narrow the reviews down to the one quoted above: first by product id (asin),
# then by the start of its summary line.
# regex=False -> the search strings are matched literally, not as regex patterns
# na=False    -> rows with a missing value count as "no match" instead of
#                producing NaN in the boolean mask (which would break indexing)
q1 = reviews[reviews.asin.str.contains('B00013YZ1Q', regex=False, na=False)]
q2 = q1[q1.summary.str.contains('KAL Magnesium Glycinate 400 vs Nine Leading Magnesium', regex=False, na=False)]

In [170]:
# show the summary line(s) of the review(s) selected in q2, to confirm that
# the filter above picked out the review quoted earlier
q2.summary.values

array(['KAL Magnesium Glycinate 400 vs Nine Leading Magnesium Supplements'], dtype=object)

In [171]:
# Tally how often each paired (underscore-joined) term occurs in the corpus,
# then list the 100 most frequent ones with their counts.
paired_words_frq = Counter(token for token in trigrams_flat if '_' in token)
paired_words_frq.most_common(100)

[('do_not', 268437),
 ('this_product', 207554),
 ('seem_to', 45681),
 ('can_not', 45528),
 ('great_product', 41158),
 ('weight_loss', 35438),
 ('so_far', 29550),
 ('at_all', 25321),
 ('this_stuff', 23679),
 ('highly_recommend', 23157),
 ('lose_weight', 23118),
 ('fish_oil', 21909),
 ('side_effect', 17800),
 ('as_well', 17148),
 ('would_recommend', 16050),
 ('in_the_morning', 15725),
 ('at_least', 14776),
 ('will_continue', 14454),
 ('more_than', 13908),
 ('more_energy', 13045),
 ('per_day', 11691),
 ('every_day', 11147),
 ('garcinia_cambogia', 10203),
 ('as_well_as', 9379),
 ('at_night', 8951),
 ('very_happy', 8328),
 ('too_much', 8297),
 ('year_ago', 8001),
 ('no_side_effect', 7872),
 ('high_quality', 7664),
 ('energy_level', 7583),
 ('vitamin_d', 7473),
 ('vitamin_c', 7400),
 ('year_old', 7201),
 ('run_out', 7056),
 ('no_longer', 7043),
 ('five_star', 6781),
 ('suffer_from', 6679),
 ('dr._oz', 6578),
 ('wake_up', 6439),
 ('immune_system', 6167),
 ('twice_a_day', 6086),
 ('on_the_mark

In [175]:
# The 100 least frequent paired terms: take the full frequency ranking,
# reverse it so the rarest come first, and keep the first 100.
# Every entry in the recorded output has count 1, so which 100 appear is
# decided by tie order rather than by frequency.
list(reversed(paired_words_frq.most_common()))[:100]

[('overturn_conventional_wisdom', 1),
 ('eat&#8221_the_wrong_combo', 1),
 ('tub_of_humus_with_veggie', 1),
 ('veep_university', 1),
 ('consumer_of_cookies!!it', 1),
 ('portion_veep_university---', 1),
 ('expereienc_with_veep', 1),
 ('visual_representation_veep', 1),
 ('veep_lookcut_program', 1),
 ('fitness_fanatic_veep_university', 1),
 ('outdoor_enthusiast_mtn', 1),
 ('mountain_biking_rowing', 1),
 ('trx_training', 1),
 ('lilttle_longer', 1),
 ('double_decker_cheeseburger', 1),
 ('marathon_and_a_tri_atholon', 1),
 ('8220_recommended&#8221', 1),
 ('trade_show&#8230', 1),
 ('go!upon_arrival', 1),
 ('hydroxycitric_acid_hca).this', 1),
 ('sharp_edges2', 1),
 ('crash_dieting).in_conclusion', 1),
 ('nuline_nutritionals_and_tomoson', 1),
 ('wishful_thinking!ftc_disclosure', 1),
 ('savor_the_taste).as', 1),
 ('34;healthy_fat&#34', 1),
 ('atrail_fibrillationso', 1),
 ('holy_cr*p', 1),
 ('w700_and_the_ubersurge', 1),
 ("bootle_of_uberday_women_'s", 1),
 ('detail_and_a_superior_product!paula', 1

In [174]:
# one Counter key per distinct paired term, so this is the distinct-term count
len(paired_words_frq)   # number of paired words

203277

# Training the LDA Model

In [181]:
%%time

# we need to learn the full vocabulary of the corpus to be modeled
# learn the dictionary by iterating over all of the reviews
trigram_dictionary = Dictionary(trigram_sentences)

CPU times: user 29.5 s, sys: 0 ns, total: 29.5 s
Wall time: 29.5 s


In [182]:
# filter tokens that are very rare or too common from
# the dictionary (filter_extremes) and reassign integer ids (compactify)
# Both calls mutate trigram_dictionary in place.
# NOTE(review): per the gensim docs, no_below is an absolute document count and
# no_above a fraction of documents; filter_extremes also has a keep_n cap
# (documented default 100000) that is not overridden here -- confirm that is intended.
# NOTE(review): the gensim docs say filter_extremes already shrinks id gaps, so
# the explicit compactify() looks redundant (harmless) -- confirm.
trigram_dictionary.filter_extremes(no_below=10, no_above=0.6)
trigram_dictionary.compactify()   # remove gaps in id sequence after words that were removed

In [182]:
# Persist the filtered vocabulary next to the notebook's parent directory so
# later runs can reload it instead of rebuilding it.
trigram_dictionary.save('../vocab_dictionary.dict')     # save vocabulary dict locally

In [182]:
# Reload the saved vocabulary; this rebinds trigram_dictionary, so the cells
# above need not be re-run once the file exists.
# NOTE(review): gensim's save/load is presumably pickle-based -- only load files you trust.
trigram_dictionary = Dictionary.load('../vocab_dictionary.dict')  # load the finished dictionary from disk