In [2]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
import numpy as np
import spacy
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence

from tqdm._tqdm_notebook import tqdm, tqdm_notebook, tnrange
from S3_read_write import load_df_s3, save_df_s3

from IPython.display import Image
from IPython.core.display import HTML 

In [4]:
tqdm_notebook.pandas('Progress')

In [5]:
bucket_name = 'amazon-reviews-project'

# Load Amazon Reviews Data

In [30]:
reviews = load_df_s3(bucket_name, 'amazon_reviews/reviews_data_clean', filetype='text', sep='|')

In [31]:
reviews.shape    # 585,444 records

(585444, 8)

In [32]:
reviews.head()

Unnamed: 0,asin,helpful,reviewText,overall,summary,description,title,categories_clean
0,929619730,"[0, 0]",B-flax-D is a re...,5.0,Dpes the job well,Contains Organic...,New Generation B...,Health & Persona...
1,978559088,"[1, 1]",Studies show tha...,4.0,"Fast shipping, g...",Everyone knows t...,Nutrihill Resver...,Health & Persona...
2,978559088,"[1, 1]",I started taking...,5.0,Bioavailability ...,Everyone knows t...,Nutrihill Resver...,Health & Persona...
3,978559088,"[0, 1]",I tried Nutrihil...,1.0,Other Resveratro...,Everyone knows t...,Nutrihill Resver...,Health & Persona...
4,978559088,"[0, 0]",I really liked t...,5.0,I can't find thi...,Everyone knows t...,Nutrihill Resver...,Health & Persona...


In [33]:
reviews.dtypes

asin                 object
helpful              object
reviewText           object
overall             float64
summary              object
description          object
title                object
categories_clean     object
dtype: object

## Data Cleaning

In [52]:
reviews.categories_clean.unique()[:10]

array(['Health & Personal Care, Vitamins & Dietary Supplements, Multi & Prenatal Vitamins, Multiple Vitamin-Mineral Supplements',
       'Health & Personal Care, Vitamins & Dietary Supplements, Supplements, Antioxidants, Resveratrol',
       'Health & Personal Care, Vitamins & Dietary Supplements, Multi & Prenatal Vitamins, Multivitamins',
       'Health & Personal Care, Vitamins & Dietary Supplements, Vitamins, Vitamin B, B3 (Niacin)',
       'Health & Personal Care, Vitamins & Dietary Supplements, Herbal Supplements',
       'Health & Personal Care, Vitamins & Dietary Supplements, Herbal Supplements, Green Tea',
       'Health & Personal Care, Vitamins & Dietary Supplements, Weight Loss, Supplements, Green Coffee Bean Extract',
       'Health & Personal Care, Vitamins & Dietary Supplements, Weight Loss, Supplements',
       'Health & Personal Care, Vitamins & Dietary Supplements, Supplements, Antioxidants, CoQ10',
       'Health & Personal Care, Vitamins & Dietary Supplements, Herbal

The catergories' list indicates that there may be some reviews in the dataset unrelated to health supplements.  Let's get rid of these.

In [35]:
reviews[reviews.categories_clean.str.contains('CDs & Vinyl')].title.unique()

array(['Liturgy of St. John Chrysostom', 'Origins',
       'Sounds of the Earth: Soft Ocean Sounds', 'Bali',
       'Tranquil Waters', 'Bach: St. John Passion, BWV 245',
       '21st Century Soul', 'Bodies for Strontium', "John's Bunch",
       'An Evening of Paganini', "John's Other Bunch",
       'Sus Mas Grandes Exitos', 'Complex Simplicity',
       'Kidnapped By Neptune', 'Roman Chant / Easter Vespers', 'Dead 60s',
       "Cilla in the 60's", 'Chromium', 'Letters From the Vitamin Sea',
       'The Stinging Nettles', 'Tendres Annees 60', 'Wehiwehi Hawaii',
       'none'], dtype=object)

In [36]:
len(reviews[reviews.categories_clean.str.contains('CDs & Vinyl')])

263

The product titles shown above are all music albums/songs.

In [37]:
reviews_filt = reviews[~(reviews.categories_clean.str.contains('CDs & Vinyl'))]   # remove rows with category including 'CDs & Vinyl'

In [51]:
reviews_filt.categories_clean.unique()[:10]

array(['Health & Personal Care, Vitamins & Dietary Supplements, Multi & Prenatal Vitamins, Multiple Vitamin-Mineral Supplements',
       'Health & Personal Care, Vitamins & Dietary Supplements, Supplements, Antioxidants, Resveratrol',
       'Health & Personal Care, Vitamins & Dietary Supplements, Multi & Prenatal Vitamins, Multivitamins',
       'Health & Personal Care, Vitamins & Dietary Supplements, Vitamins, Vitamin B, B3 (Niacin)',
       'Health & Personal Care, Vitamins & Dietary Supplements, Herbal Supplements',
       'Health & Personal Care, Vitamins & Dietary Supplements, Herbal Supplements, Green Tea',
       'Health & Personal Care, Vitamins & Dietary Supplements, Weight Loss, Supplements, Green Coffee Bean Extract',
       'Health & Personal Care, Vitamins & Dietary Supplements, Weight Loss, Supplements',
       'Health & Personal Care, Vitamins & Dietary Supplements, Supplements, Antioxidants, CoQ10',
       'Health & Personal Care, Vitamins & Dietary Supplements, Herbal

In [39]:
reviews_filt[reviews_filt.categories_clean.str.contains('Software')]

Unnamed: 0,asin,helpful,reviewText,overall,summary,description,title,categories_clean
3639,B00009QP4Q,"[2, 2]",The company has ...,5.0,lives up to its ...,Alpha Five's QLi...,none,Health & Persona...
50015,B0002TIEQQ,"[0, 0]",I ordered this f...,1.0,waste of money,Self help tutori...,none,Health & Persona...


In [40]:
reviews_filt = reviews_filt[~(reviews_filt.categories_clean.str.contains('Software'))]

In [41]:
len(reviews_filt)

585179

In [53]:
# Get rid of reviews of pet-related products
search_for = [' pet ', ' cat ', ' dog ']
pattern = '|'.join(search_for)
reviews_filt.title.str.contains(pattern, case=False).sum()

277

In [50]:
reviews_filt[reviews_filt.title.str.contains(pattern, case=False)]['title'].values[:10]

array(['Power - Mune Tuna Flavor Pet Herbal Supplement From Vetvittles.com',
       'Power - Mune Tuna Flavor Pet Herbal Supplement From Vetvittles.com',
       'Power - Mune Tuna Flavor Pet Herbal Supplement From Vetvittles.com',
       'AniMed Witch Hazel 86-Percent Multi-Species Pet Supplement',
       'AniMed Witch Hazel 86-Percent Multi-Species Pet Supplement',
       'AniMed Witch Hazel 86-Percent Multi-Species Pet Supplement',
       'AniMed Witch Hazel 86-Percent Multi-Species Pet Supplement',
       'AniMed Witch Hazel 86-Percent Multi-Species Pet Supplement',
       'AniMed Witch Hazel 86-Percent Multi-Species Pet Supplement',
       'Composure Liquid for Dogs and Cat (188 SERVINGS)'], dtype=object)

In [54]:
# Get rid of all pet products
reviews_filt = reviews_filt[~(reviews_filt.title.str.contains(pattern, case=False))]

In [55]:
# saving the cleaned dataframe
save_df_s3(df=reviews_filt, bucket_name=bucket_name, filepath='amazon_reviews/reviews_data_clean_v2.feather')

In [56]:
reviews_filt.asin.nunique()     # 48,535 unique products and 585,179 reviews

48501

## Examine One Observation

In [57]:
example = reviews_filt.iloc[0]

In [58]:
example.asin     # Amazon Standard Identification Number

'0929619730'

In [59]:
example.title     # this is the product's name

'New Generation B-Flax-D'

In [60]:
example.categories_clean   # previously filtered/curated categories of interest

'Health & Personal Care, Vitamins & Dietary Supplements, Multi & Prenatal Vitamins, Multiple Vitamin-Mineral Supplements'

In [61]:
example.description       # product description provided by the seller

'Contains Organic Cold-Milled Flaxseed\nValuable source of soluble and insoluble fiber\nProvides Omega-3 essential fats, and many other nutrients to help achieve and maintain optimal bowel function.\n\nContains Vitamin B12\nB12 helps prevent nerve damage\nB12 aids in healthy cell formation.\nB12 helps prevent anemia\n\nContains Vitamin D\nVitamin D assists the body in the absorption of important minerals like calcium.\n\nContains Seleno-yeast\nA source of selenium, a mineral with powerful anti-viral and disease-fighting properties.\n\nContains Vitamin K2\nMenaQ7TM provides vitamin K2 (menaquinone), extracted and concentrated from natto without solvents. Vitamin K2 prevents arterial calcification and promotes strong bones by improving cross-linking of osteocalcin, a protein found in bones. The amount here has been clinically shown not to interfere with blood anti-coagulant medication. \n\nServing Size:\n1/4 Cup (30 Grams)\n\nServings Per Container:\n30 Servings per container\n\nNet Wt. 

In [62]:
example.summary      # review title

'Dpes the job well'

In [63]:
example.reviewText   # review content

'B-flax-D is a regular at our house. It does its job simply and with good results. It is reasonable, lasts a long time, and is able to be obtained with free shipping if you hunt around. Good product, good price, good results.'

Here's what the actual review looks like:

In [64]:
example.overall     # the rating provided by the reviewer

5.0

In [29]:
example.helpful

'[0, 0]'

In [27]:
Image(url= "images/amazon_review_screenshot.png")

# Data Pre-processing

Let us start off using only the title (`summary`) and body (`reviewText`) of each review.

In [6]:
%%time
df = load_df_s3(bucket_name, filepath='amazon_reviews/reviews_data_clean_v2.feather', filetype='feather')

CPU times: user 2.87 s, sys: 4.27 s, total: 7.14 s
Wall time: 22 s


In [7]:
df.dtypes

asin                 object
helpful              object
reviewText           object
overall             float64
summary              object
description          object
title                object
categories_clean     object
dtype: object

In [8]:
df.drop(['helpful', 'overall', 'title', 'categories_clean', 'description'], axis=1, inplace=True)

In [9]:
df.head()

Unnamed: 0,asin,reviewText,summary
0,929619730,B-flax-D is a regular at our house. It does it...,Dpes the job well
1,978559088,Studies show that Resveratrol is poorly absorb...,"Fast shipping, good communication"
2,978559088,I started taking this after both my parents di...,Bioavailability is the key
3,978559088,"I tried Nutrihill, but did not feel any of the...",Other Resveratrol Supplements are Better
4,978559088,I really liked this product because it stayed ...,"I can't find this product any longer, and I wi..."


In [10]:
# for each review, concatenate the review title and body
df.reviewText = df.summary + '. ' + df.reviewText

In [11]:
pd.set_option('max_colwidth', 200)
df.head()

Unnamed: 0,asin,reviewText,summary
0,929619730,"Dpes the job well. B-flax-D is a regular at our house. It does its job simply and with good results. It is reasonable, lasts a long time, and is able to be obtained with free shipping if you hunt ...",Dpes the job well
1,978559088,"Fast shipping, good communication. Studies show that Resveratrol is poorly absorbed when taken by pill, but lozenges are very effectively absorbed. Hardly any companies are selling lozenges. This ...","Fast shipping, good communication"
2,978559088,Bioavailability is the key. I started taking this after both my parents died of cancer as it supposed to enhance your immune system - the story on 60 Minutes on resveratrol was incredibly inspirin...,Bioavailability is the key
3,978559088,"Other Resveratrol Supplements are Better. I tried Nutrihill, but did not feel any of the supposed health benefits. I started reading and realized that even though buccal delivery is the best, the ...",Other Resveratrol Supplements are Better
4,978559088,"I can't find this product any longer, and I wish I could.. I really liked this product because it stayed in my mouth for a long time and I felt it was probably doing some good. I take a number of...","I can't find this product any longer, and I wish I could."


Let's drop the `summary` column now:

In [12]:
df.drop(['summary'], axis=1, inplace=True)

In [13]:
df.head()

Unnamed: 0,asin,reviewText
0,929619730,"Dpes the job well. B-flax-D is a regular at our house. It does its job simply and with good results. It is reasonable, lasts a long time, and is able to be obtained with free shipping if you hunt ..."
1,978559088,"Fast shipping, good communication. Studies show that Resveratrol is poorly absorbed when taken by pill, but lozenges are very effectively absorbed. Hardly any companies are selling lozenges. This ..."
2,978559088,Bioavailability is the key. I started taking this after both my parents died of cancer as it supposed to enhance your immune system - the story on 60 Minutes on resveratrol was incredibly inspirin...
3,978559088,"Other Resveratrol Supplements are Better. I tried Nutrihill, but did not feel any of the supposed health benefits. I started reading and realized that even though buccal delivery is the best, the ..."
4,978559088,"I can't find this product any longer, and I wish I could.. I really liked this product because it stayed in my mouth for a long time and I felt it was probably doing some good. I take a number of..."


In [14]:
pd.set_option('max_colwidth', 20)

## Remove Missing Reviews

In [15]:
df.reviewText.isnull().sum()    # 73 reviews have neither a review body text, nor a review title

73

In [16]:
# drop reviews with no text
df = df[~(df.reviewText.isnull())]

In [17]:
df.asin.isnull().sum()

0

Let's look at a few actual review texts:

In [18]:
df.reviewText.iloc[np.random.randint(0, len(df))]

'Great Fish Oil Formula!. This product has certainly met all my expectations. I have tried many different type formulations in the past, some good, some not so good, but this is a winner. I am very picky about fish oil blends and you will like this one. I do recommend this product!'

In [19]:
df.reviewText.iloc[np.random.randint(0, len(df))]

'Does not work for focus. My 9 year old has has taken this for four weeks now and the only thing I have noticed is a slight calming effect. That is why I gave it the two stars. It does absolutely nothing for the rest of his ADHD symptoms. His focus is still almost non existent. The teachers at school confirmed this.  My next step is to visit the doctor for a script.'

In [20]:
df.reviewText.iloc[np.random.randint(0, len(df))]

'No aftertaste. My husband and I both take this, and it is very painless since there is no fishy taste or aftertaste.'

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 584829 entries, 0 to 584901
Data columns (total 2 columns):
asin          584829 non-null object
reviewText    584829 non-null object
dtypes: object(2)
memory usage: 13.4+ MB


## Pre-processing for LDA

In [22]:
text = list(df.reviewText.values)    # make an iterable to store only the review text

In [23]:
len(text)

584829

In [24]:
# look at a few sample reviews
for rev in text[:4]:
    print(rev, '\n')

Dpes the job well. B-flax-D is a regular at our house. It does its job simply and with good results. It is reasonable, lasts a long time, and is able to be obtained with free shipping if you hunt around. Good product, good price, good results. 

Fast shipping, good communication. Studies show that Resveratrol is poorly absorbed when taken by pill, but lozenges are very effectively absorbed. Hardly any companies are selling lozenges. This company promises 99% purity and has fast shipping and good communication. I can't comment on the quality of product because I'm not a chemist but they seem to be legitimate. 

Bioavailability is the key. I started taking this after both my parents died of cancer as it supposed to enhance your immune system - the story on 60 Minutes on resveratrol was incredibly inspiring. Doing some research on the Internet, it is indicated that taking resveratrol in lozenge form is preferable as it is broken down by stomach acids.  The ez-melt formula recommended in a

In [25]:
nlp = spacy.load('en')

The helper functions below are from:

http://nbviewer.jupyter.org/github/skipgram/modern-nlp-in-python/blob/master/executable/Modern_NLP_in_Python.ipynb

### Phrase Detection

In order to use `gensim`'s `Phrases` class to detect natural combinations of words (like 'vanilla ice cream'), we need to format our text into a list of sentences, with each sentence being a list of words.

In [40]:
def lemmatized_sentence_corpus(text_seq):
    """Generator function to use spaCy to parse reviews, 
    lemmatize the text, and yield sentences.
    Returns one sentence at a time, continuously, 
    without demarcating the baoundary between two reviews.
    Parameters:
        text_seq: a sequence (list) of text (e.g. reviews) to parse.
    """
    for parsed_review in nlp.pipe(text_seq, batch_size=10000, n_threads=18):
        for sent in parsed_review.sents:
            # yield makes this process memory efficient
            yield [token.lemma_ for token in sent if not (token.is_space or token.is_punct)]

In [None]:
# organize reviews into a list of sentences made of lists of words
unigram_sentences = []
num_reviews = len(text)
for sentence in tqdm(lemmatized_sentence_corpus(text_seq=text), total = num_reviews):
    unigram_sentences.append(sentence)




  0%|          | 0/584829 [00:00<?, ?it/s][A[A[A


  0%|          | 1/584829 [01:30<14780:06:14, 90.98s/it][A[A[A


  2%|▏         | 10797/584829 [01:31<1:20:42, 118.54it/s][A[A[A


  4%|▎         | 21640/584829 [01:31<39:33, 237.33it/s]  [A[A[A


  6%|▌         | 32807/584829 [01:31<25:35, 359.41it/s][A[A[A


  8%|▊         | 44482/584829 [01:31<18:30, 486.77it/s][A[A[A
[A


  9%|▉         | 53034/584829 [01:31<15:18, 578.90it/s][A[A[A


 11%|█         | 62905/584829 [03:04<25:27, 341.58it/s][A[A[A


 13%|█▎        | 73216/584829 [03:04<21:27, 397.36it/s][A[A[A


 14%|█▍        | 83598/584829 [03:04<18:25, 453.46it/s][A[A[A


 16%|█▌        | 94177/584829 [03:04<16:01, 510.56it/s][A[A[A


 17%|█▋        | 101050/584829 [03:04<14:44, 547.24it/s][A[A[A


 19%|█▉        | 111687/584829 [03:04<13:02, 604.52it/s][A[A[A


 21%|██        | 122809/584829 [03:04<11:35, 664.35it/s][A[A[A


 22%|██▏       | 131217/584829 [04:35<15:51, 476.98it/s][A

In [None]:
len(unigram_sentences)     # sentences in the review corpus

In [208]:
# Train a first-order phrase detector
bigram_model = Phrases(unigram_sentences, min_count=5)

# Transform unigram sentences into bigram sentences
# Paired words are connected by an underscore, e.g. ice_cream
bigram_sentences = []
for sentence in unigram_sentences:
    bigram_sentences.append(bigram_model[sentence])

In [267]:
# Train a second-order phrase detector
trigram_model = Phrases(bigram_sentences, min_count=5)

# Transform bigram sentences into trigram sentences
trigram_sentences = []
for sentence in bigram_sentences:
    trigram_sentences.append(trigram_model[sentence])

# remove any remaining stopwords
trigram_sentences = [[word for word in sentence if word not in nlp.Defaults.stop_words] for sentence in trigram_sentences]



In [292]:
# the trigrams will be saved in a dataframe with a single column.
# each row is one sentence from any review
# each sentence is a single string separated by a single space.
trigram_sentences_savedf = pd.DataFrame([u' '.join(sentence) for sentence in trigram_sentences], columns=['preprocessed_review'])
save_df_s3(trigram_sentences_savedf, bucket_name, 'amazon_reviews/preprocessed_reviews.feather')

In [293]:
trigram_sentences_savedf.head()

Unnamed: 0,preprocessed_review
0,dpe the job well
1,b flax d be a re...
2,-PRON- do -PRON-...
3,-PRON- be reason...
4,good product goo...


In [209]:
unigram_sentences[0:4]

[['dpe', 'the', 'job', 'well'],
 ['b', 'flax', 'd', 'be', 'a', 'regular', 'at', '-PRON-', 'house'],
 ['-PRON-', 'do', '-PRON-', 'job', 'simply', 'and', 'with', 'good', 'result'],
 ['-PRON-',
  'be',
  'reasonable',
  'last',
  'a',
  'long',
  'time',
  'and',
  'be',
  'able',
  'to',
  'be',
  'obtain',
  'with',
  'free',
  'shipping',
  'if',
  '-PRON-',
  'hunt',
  'around']]

In [210]:
bigram_sentences[0:4]

[['dpe', 'the', 'job', 'well'],
 ['b', 'flax', 'd', 'be', 'a', 'regular', 'at', '-PRON-', 'house'],
 ['-PRON-', 'do', '-PRON-', 'job', 'simply', 'and', 'with', 'good', 'result'],
 ['-PRON-',
  'be',
  'reasonable',
  'last',
  'a',
  'long_time',
  'and',
  'be',
  'able_to',
  'be',
  'obtain',
  'with',
  'free_shipping',
  'if',
  '-PRON-',
  'hunt',
  'around']]

In [211]:
example = [word for sentence in unigram_sentences for word in sentence]  # flattened list

In [212]:
len(example)

373578

In [213]:
example[:10]

['dpe', 'the', 'job', 'well', 'b', 'flax', 'd', 'be', 'a', 'regular']

In [214]:
sample_bigr = bigram_model[example]



In [215]:
len(sample_bigr)

354716

In [216]:
sample_bigr[:10]

['dpe', 'the', 'job', 'well', 'b', 'flax', 'd', 'be', 'a', 'regular']

In [217]:
sample_trigr = trigram_model[sample_bigr]



In [218]:
len(sample_trigr)

349647

In [219]:
sample_trigr[:10]

['dpe', 'the', 'job', 'well', 'b', 'flax', 'd', 'be', 'a', 'regular']

In [241]:
bigr_set = set([word for word in sample_bigr if '_' in word])   # bi-grams

# print arbitrary 20 bigrams
count = 0
for e in bigr_set:
    if count <= 20:
        print(e)
        count += 1
    else:
        break

cell_phone
allergic_reaction
prostate_health
in_conjunction
what_happen
knee_pain
fat_loss
very_disappointed
point_where
whole_food
bald_spot
zone_perfect
sugar_free
brown_spot
pycnogenol_gel
fine_line
fda_approve
next_day
such_as
belt_clip
extra_energy


In [224]:
trigr_set = set([word for word in sample_trigr if '_' in word])   # tri-grams

In [247]:
# print 20 trigrams containing 'no' or 'not'
for e in (trigr_set - bigr_set):
    if ('no_' in e or 'not_' in e):
        print(e)

not_go_wrong
not_live_without
no_longer_carry
no_matter_what
have_no_idea
do_not_know_why
do_not_know_how
no_side_affect
no_jet_lag
do_not_notice_any
no_issue
no_side_effect
no_improvement
do_not_bother
not_sure


The list below shows the additional "trigrams" captured by the model.  Note that all of these are not exactly trigrams - there are some bigrams here as well.  This is because in the formula for calculating the threshold for forming a phrase, the total vocabulary of the text corpus is a factor.  We trained the trigrams model on a different corpus than the one we used for the bigrams model.  Because of this, some additional two-word phrases also get detected.

Some potentially useful examples of phrases detected here:

* no_jet_lag
* not_go_wrong
* no_complaint
* no_side_effect

The meanings of the above words change if we don't treat them as part of a single phrase.

In [None]:
for parsed_review in tqdm_notebook(nlp.pipe(text, batch_size=10000, n_threads=36)):
    unigram_review = [token.lemma_ for token in parsed_review if not (token.is_space or token.is_punct)]

In [None]:
%%time
unigram_sentences_all = [sentence for sentence in lemmatized_sentence_corpus(text)]

In [None]:
%%time

for parsed_review in nlp.pipe(text, batch_size=1000, n_threads=4):

    # lemmatize the text, removing punctuation and whitespace
    unigram_review = [token.lemma_ for token in parsed_review if not (token.is_space or token.is_punct)]

    # apply the first-order and second-order phrase models
    bigram_review = bigram_model[unigram_review]
    trigram_review = trigram_model[bigram_review]

    # remove any remaining stopwords
    trigram_review = [term for term in trigram_review if term not in spacy.en.STOPWORDS]

    # write the transformed review as a line in the new file
    trigram_review = u' '.join(trigram_review)
    f.write(trigram_review + '\n')

In [None]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 0 == 1:

    with codecs.open(trigram_reviews_filepath, 'w', encoding='utf_8') as f:
        
        for parsed_review in nlp.pipe(line_review(review_txt_filepath),
                                      batch_size=10000, n_threads=4):
            
            # lemmatize the text, removing punctuation and whitespace
            unigram_review = [token.lemma_ for token in parsed_review
                              if not punct_space(token)]
            
            # apply the first-order and second-order phrase models
            bigram_review = bigram_model[unigram_review]
            trigram_review = trigram_model[bigram_review]
            
            # remove any remaining stopwords
            trigram_review = [term for term in trigram_review
                              if term not in spacy.en.STOPWORDS]
            
            # write the transformed review as a line in the new file
            trigram_review = u' '.join(trigram_review)
            f.write(trigram_review + '\n')