## Set up

In [2]:
from pymongo import MongoClient
from pprint import pprint

import pandas as pd
import re
import nltk

import pickle
import json

In [3]:
from bson.objectid import ObjectId

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
# stemming
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/Jocelyn/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
# This creates a client that uses the default port on localhost.
# If connecting to AWS, you need a connection string.
# Can do the same thing with MongoClient("mongodb://localhost:27017")
# client = MongoClient()
client = MongoClient("mongodb://localhost:27017")

In [7]:
# Makes it look similar to shell mongo
db = client.amazon_lb

In [8]:
db.list_collection_names()

['product_reviews',
 'skin_care_face_rev_agg',
 'product_metadata',
 'skin_care_face_reviews',
 'skin_care_face_prod_rev',
 'skin_care_face_products']

In [9]:
# cursor = db.skin_care_face_products.find()
# products = list(cursor)
# products[0]

In [10]:
# cursor = db.skin_care_face_prod_rev.find()
# products = list(cursor)
# products[0]

In [20]:
# import product list
with open('pickles/product_list.pickle', 'rb') as to_read:
    product_list = pickle.load(to_read)

with open('pickles/review_list.pickle', 'rb') as to_read:
    review_list = pickle.load(to_read)
    
with open('pickles/all_review_list.pickle', 'rb') as to_read:
    all_review_list = pickle.load(to_read)
# product_list.head(5)

In [22]:
review_list.head(5)

Unnamed: 0,asin,reviewer_id,review_id,rating,review_text
0,B0000Y3NO6,A3AAWP3AWC4UR1,5dc881afaf3db5220c9582ee,2.0,I gave it a shot for a while until the small b...
1,B0000Y3NO6,A2DKQ5CLJ2KWM3,5dc881afaf3db5220c9582ef,5.0,After trying all the drugstore rosacea product...
2,B0000Y3NO6,ALZPAFF6H463C,5dc881afaf3db5220c9582f0,3.0,"DERMAdoctor Calm, Cool & Corrected 2N1 Rosacea..."
3,B0000Y3NO6,A3UKAU8GVKQMBR,5dc881afaf3db5220c9582f1,1.0,I was very excited to try this as nothing else...
4,B0000Y3NO6,ARTQZJRQRW6NB,5dc881afaf3db5220c9582f2,5.0,Been using this for over 3 years. Helped long ...


In [23]:
pos_revs = review_list[review_list['rating'] >=4]
neg_revs = review_list[review_list['rating'] <=3]

In [24]:
neg_revs.shape

(16582, 5)

In [21]:
review_list.shape

(81038, 5)

#### group by product

In [39]:
def join(x):
    return ' '.join(x)
pos_revs2 = pos_revs.groupby('asin')['review_text'].apply(join)
neg_revs2 = neg_revs.groupby('asin')['review_text'].apply(join)

In [45]:
# pos_revs[pos_revs['asin']=='B0000Y3NO6']

pandas.core.series.Series

In [47]:
neg_revs2 = pd.DataFrame(neg_revs2).reset_index()
neg_revs2
pos_revs2 = pd.DataFrame(pos_revs2).reset_index()
pos_revs2

Unnamed: 0,asin,review_text
0,B0000Y3NO6,After trying all the drugstore rosacea product...
1,B00012C5RS,"I haven't splurged on a full-size bottle yet, ..."
2,B0001EKTTC,Great product . Gets the job done. Leaves your...
3,B0001EL0WC,I love this night cream. It is smooth and soa...
4,B0001EL4M8,"I was gifted with this originally, and when I ..."
...,...,...
2165,B01HBS7WW2,Nice face cream for the dry seasons. New packa...
2166,B01HBS7XP8,Great product! Removes layer of dirt that most...
2167,B01HBS87ZS,an esthetician recommended this product to me ...
2168,B01HEESSHG,Everything in this pack is exceThe perfume see...


#### text processing

In [48]:
#Compound phrases
from nltk.tokenize import word_tokenize
from nltk.tokenize import MWETokenizer # multi-word expression

mwe_tokenizer = MWETokenizer([('times','a','day'),('holy','grail'),('t','zone')])

In [49]:
# Text preprocessing steps - remove numbers, captial letters and punctuation
import re
import string

remove_links = lambda x: re.sub('<[^<]+?>', '', x)
alphanumeric = lambda x: re.sub('\w*\d\w*', ' ', x)
punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())

#custom stop words
more_stop_words = lambda x: re.sub('oz|ml|mls|ounce|ounces','',x)
more_stop_words2 = lambda x: re.sub('use|using|product|products|work|works|working|buy|buys|buying|stuff','',x)

#custom replacements
mwe = lambda x: ' '.join(mwe_tokenizer.tokenize(word_tokenize(x)))

neg_revs2['review_text'] = neg_revs2.review_text.map(remove_links).map(alphanumeric).map(punc_lower).map(more_stop_words).map(more_stop_words2)
pos_revs2['review_text'] = pos_revs2.review_text.map(remove_links).map(alphanumeric).map(punc_lower).map(more_stop_words).map(more_stop_words2)


In [50]:
# Stemmers

stemmer = LancasterStemmer()
# stemmer = PorterStemmer
# stemmer = SnowballStemmer('english')
# lemmatizer = WordNetLemmatizer()

neg_revs2['review_text']=[' '.join([stemmer.stem(word) for word in text.split(' ')])
          for text in neg_revs2['review_text']]
pos_revs2['review_text']=[' '.join([stemmer.stem(word) for word in text.split(' ')])
          for text in pos_revs2['review_text']]

#### vectorizing

In [56]:
tfidf_n = TfidfVectorizer(stop_words='english', min_df = 20)

rev_td_n = tfidf_n.fit_transform(neg_revs2['review_text'])
rev_td_n_matrix = pd.DataFrame(rev_td_n.toarray(),columns=tfidf_n.get_feature_names())

rev_td_n_matrix.shape

(1664, 1342)

In [57]:
rev_td_n_matrix.head(5)

Unnamed: 0,abl,abov,absolv,absorb,acceiv,accid,accord,achiev,acid,acn,...,yo,young,youth,youtub,yr,yuck,zero,zint,zit,zon
0,0.0,0.047094,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.050528,...,0.078123,0.0,0.0,0.0,0.0,0.0,0.0,0.050549,0.0,0.084763
1,0.0,0.0,0.104303,0.0,0.0,0.0,0.076081,0.0,0.0,0.043134,...,0.033346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.101823,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.061351,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.050458,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.032263,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [58]:
tfidf_p = TfidfVectorizer(stop_words='english', min_df = 20)

rev_td_p = tfidf_p.fit_transform(pos_revs2['review_text'])
rev_td_p_matrix = pd.DataFrame(rev_td_p.toarray(),columns=tfidf_p.get_feature_names())

rev_td_p_matrix.shape

(2170, 2151)

In [59]:
rev_td_p_matrix.head(5)

Unnamed: 0,aa,abl,abov,abras,absolv,absorb,ac,acc,acceiv,access,...,yo,york,young,youth,youtub,yr,zero,zint,zit,zon
0,0.0,0.015647,0.022882,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.036924,0.0,0.0,0.0,0.0,0.0,0.0,0.026237,0.0,0.0
1,0.0,0.020291,0.0,0.0,0.0,0.0,0.0,0.0,0.034572,0.0,...,0.035911,0.0,0.018975,0.0,0.0,0.0,0.0,0.0,0.028468,0.0
2,0.0,0.0,0.0,0.044297,0.024453,0.0,0.0,0.0,0.0,0.0,...,0.138615,0.0,0.027465,0.0,0.0,0.0,0.0,0.0,0.0,0.034839
3,0.0,0.019657,0.0,0.0,0.049098,0.034914,0.0,0.0,0.0,0.0,...,0.092771,0.0,0.018382,0.024524,0.0,0.0,0.031675,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.011351,0.0,0.0,0.0,0.0,0.0,...,0.152822,0.0,0.0,0.01701,0.0,0.018693,0.0,0.0,0.0,0.0


#### topic modeling

In [60]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [65]:
nmf_n = NMF(5)
doc_topic = nmf_n.fit_transform(rev_td_n_matrix)
topic_word = pd.DataFrame(nmf_n.components_.round(3),
#                         index = ["component_1","component_2","component_3","component_4"],
                        columns = tfidf_n.get_feature_names())
# print(topic_word)

display_topics(nmf_n, tfidf_n.get_feature_names(), 10)


Topic  0
skin, thi, moist, hav, cream, dry, fac, feel, sensit, lik

Topic  1
cleans, fac, thi, makeup, cle, skin, wash, remov, feel, lik

Topic  2
thi, hav, bottl, diff, purchas, wil, tim, money, ord, ar

Topic  3
smel, lik, cream, thi, scent, strong, bad, don, real, perfum

Topic  4
mask, fac, yo, clay, peel, ar, ther, box, thi, didn


In [66]:
nmf_p = NMF(5)
doc_topic_p = nmf_p.fit_transform(rev_td_p_matrix)
topic_word_p = pd.DataFrame(nmf_p.components_.round(3),
#                         index = ["component_1","component_2","component_3","component_4"],
                        columns = tfidf_p.get_feature_names())
# print(topic_word_p)

display_topics(nmf_p, tfidf_p.get_feature_names(), 10)


Topic  0
thi, skin, hav, acn, ser, week, wil, look, ar, aft

Topic  1
cleans, skin, fac, thi, cle, wash, dry, feel, hav, gentl

Topic  2
lov, gre, thi, year, smel, good, excel, pric, wond, purchas

Topic  3
mask, fac, skin, aft, thi, feel, minut, thes, yo, soft

Topic  4
skin, moist, cream, thi, hav, feel, fac, lik, dry, greasy
