## Set up

In [1]:
from pymongo import MongoClient
from pprint import pprint

import pandas as pd
import re
import nltk

import pickle
import json

In [2]:
from bson.objectid import ObjectId

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
# stemming
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/Jocelyn/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
# This creates a client that uses the default port on localhost.
# If connecting to AWS, you need a connection string.
# Can do the same thing with MongoClient("mongodb://localhost:27017")
# client = MongoClient()
client = MongoClient("mongodb://localhost:27017")

In [6]:
# Makes it look similar to shell mongo
db = client.amazon_lb

In [7]:
db.list_collection_names()

['product_reviews',
 'skin_care_face_rev_agg',
 'product_metadata',
 'skin_care_face_reviews',
 'skin_care_face_prod_rev',
 'skin_care_face_products']

In [8]:
# cursor = db.skin_care_face_products.find()
# products = list(cursor)
# products[0]

In [9]:
# cursor = db.skin_care_face_prod_rev.find()
# products = list(cursor)
# products[0]

In [199]:
# import product list
with open('pickles/product_list.pickle', 'rb') as to_read:
    product_list = pickle.load(to_read)

with open('pickles/review_list.pickle', 'rb') as to_read:
    review_list = pickle.load(to_read)
    
with open('pickles/all_review_list.pickle', 'rb') as to_read:
    all_review_list = pickle.load(to_read)
# product_list.head(5)

In [200]:
print(product_list['description'][8])

A 5% liquid benzoyl peroxide acne treatment used to spot-treat affected areas, penetrate pores to clear existing acne blemishes and prevent the occurrence of future breakouts. PCA SKIN is a trusted innovator in the development of highly effective skincare products. Our vision is to improve peoples lives by providing results-oriented skin care solutions for the health of your unique skin. 


### Text processing

In [201]:
#Compound phrases
from nltk.tokenize import word_tokenize
from nltk.tokenize import MWETokenizer # multi-word expression

mwe_tokenizer = MWETokenizer([('times','a','day'),('holy','grail'),('t','zone')])

In [202]:
# Text preprocessing steps - remove numbers, captial letters and punctuation
import re
import string

remove_links = lambda x: re.sub('<[^<]+?>', '', x)
alphanumeric = lambda x: re.sub('\w*\d\w*', ' ', x)
punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())

#custom stop words
more_stop_words = lambda x: re.sub('oz|ml|mls|ounce|ounces','',x)
# more_stop_words2 = lambda x: re.sub('use|using|product|products|work|works|working|buy|buys|buying|stuff','',x)
more_stop_words2 = lambda x: re.sub('use|using|product|products|work|works|working|buy|buys|buying|stuff|good|excellent|great|bad|terrible|like|love|really','',x)
more_stop_words3 = lambda x: re.sub('face|skin','',x)
#custom replacements
mwe = lambda x: ' '.join(mwe_tokenizer.tokenize(word_tokenize(x)))

product_list['description'] = product_list.description.map(remove_links).map(alphanumeric).map(punc_lower)
all_review_list['all_review_text'] = all_review_list.all_review_text.map(remove_links).map(alphanumeric).map(punc_lower).map(more_stop_words).map(more_stop_words2).map(mwe)


  alphanumeric = lambda x: re.sub('\w*\d\w*', ' ', x)


In [203]:
# Stemmers

stemmer = LancasterStemmer()
# stemmer = PorterStemmer()
# stemmer = SnowballStemmer('english')
# lemmatizer = WordNetLemmatizer()

all_review_list['all_review_text']=[' '.join([stemmer.stem(word) for word in text.split(' ')])
          for text in all_review_list['all_review_text']]

### Vectorize

In [204]:
tfidf_d = TfidfVectorizer(stop_words='english', min_df = 20)

rev_td_d = tfidf_d.fit_transform(product_list['description'])
rev_td_d_matrix = pd.DataFrame(rev_td_d.toarray(),columns=tfidf_d.get_feature_names())
# rev_td_d_matrix.head(5)
rev_td_d_matrix.shape

(2233, 1272)

In [205]:
rev_td_d_matrix.head(5)

Unnamed: 0,ability,absolute,absorb,absorbed,absorbing,absorbs,absorption,acetate,achieve,acid,...,worldwide,wrinkle,wrinkles,xanthan,year,years,younger,youth,youthful,zinc
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.051457,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.131415,0.0,0.0,0.0,0.0,0.0


In [206]:
tfidf_r = TfidfVectorizer(stop_words='english', min_df = 20)

rev_td_r = tfidf_r.fit_transform(all_review_list['all_review_text'])
rev_td_r_matrix = pd.DataFrame(rev_td_r.toarray(),columns=tfidf_r.get_feature_names())
# rev_td_r_matrix.head(5)
rev_td_r_matrix.shape

(2241, 2538)

In [207]:
rev_td_r_matrix.head(5)

Unnamed: 0,ab,abl,abov,abras,absolv,absorb,ac,acc,acceiv,access,...,young,youth,youtub,yr,yuck,yummy,zero,zint,zit,zon
0,0.0,0.011502,0.033792,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038877,0.0,0.0
1,0.0,0.015829,0.0,0.0,0.026817,0.0,0.0,0.0,0.025723,0.0,...,0.015414,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.022444,0.0
2,0.0,0.0,0.0,0.04415,0.024126,0.0,0.0,0.0,0.0,0.0,...,0.027734,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.054831
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.040096,0.0,0.0,0.05743,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.062505,0.116967,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.218245,0.0,0.0


### Reduce dimensionality

In [208]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [209]:
nmf = NMF(10)
doc_topic = nmf.fit_transform(rev_td_r_matrix)
topic_word = pd.DataFrame(nmf.components_.round(3),
#                         index = ["component_1","component_2","component_3","component_4"],
                        columns = tfidf_r.get_feature_names())
print(topic_word)

display_topics(nmf, tfidf_r.get_feature_names(), 10)

      ab    abl   abov  abras  absolv  absorb     ac    acc  acceiv  access  \
0  0.000  0.010  0.009  0.000   0.032   0.236  0.001  0.005   0.010   0.000   
1  0.000  0.008  0.003  0.019   0.018   0.000  0.000  0.003   0.000   0.002   
2  0.004  0.012  0.001  0.002   0.015   0.000  0.007  0.000   0.001   0.001   
3  0.000  0.006  0.000  0.007   0.007   0.000  0.000  0.000   0.000   0.000   
4  0.000  0.018  0.000  0.011   0.041   0.000  0.000  0.003   0.001   0.003   
5  0.002  0.048  0.020  0.010   0.031   0.072  0.004  0.002   0.007   0.012   
6  0.004  0.030  0.003  0.000   0.027   0.117  0.000  0.002   0.013   0.003   
7  0.003  0.011  0.000  0.000   0.060   0.012  0.000  0.000   0.003   0.000   
8  0.002  0.009  0.000  0.016   0.028   0.000  0.000  0.000   0.001   0.002   
9  0.000  0.025  0.002  0.000   0.044   0.000  0.003  0.002   0.000   0.000   

   ...  young  youth  youtub     yr   yuck  yummy   zero   zint    zit    zon  
0  ...  0.000  0.000   0.003  0.000  0.001  0.000 

In [34]:
H_r = pd.DataFrame(doc_topic.round(5),
             index = all_review_list['asin'])
H_r

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
B0000Y3NO6,0.05427,0.00904,0.00433,0.02289,0.10064,0.03511,0.04922,0.06427,0.00000,0.00097
B00012C5RS,0.05172,0.01705,0.00866,0.01602,0.04922,0.01919,0.12587,0.00586,0.00748,0.01431
B0001EKTTC,0.04839,0.02002,0.04258,0.07218,0.10511,0.00331,0.00418,0.00000,0.01376,0.00000
B0001EL5Q8,0.07574,0.00000,0.00000,0.08743,0.01846,0.00526,0.08463,0.01515,0.01931,0.00000
B0001EL5JA,0.03347,0.00424,0.00000,0.06206,0.03392,0.10670,0.02497,0.00439,0.00000,0.00969
...,...,...,...,...,...,...,...,...,...,...
B01HBS87ZS,0.03087,0.00000,0.00198,0.02127,0.07257,0.00000,0.04794,0.00000,0.00925,0.00000
B01HBS7WW2,0.08177,0.00252,0.00628,0.02420,0.00000,0.03299,0.00000,0.02416,0.03790,0.01069
B01HEESSHG,0.04060,0.00000,0.00492,0.00000,0.00000,0.01717,0.03695,0.03207,0.03331,0.03320
B01HBS7XP8,0.06500,0.02685,0.00368,0.03079,0.00000,0.00000,0.00000,0.00000,0.00880,0.02398


In [131]:
# save out dictionary of topic words
def save_topic_words(model, feature_names, no_top_words, topic_names=None):
    topic_words = {}
    for ix, topic in enumerate(model.components_):
        lst = [feature_names[i]
            for i in topic.argsort()[:-no_top_words - 1:-1]]
        topic_words.update({ix:lst})
    return topic_words

In [132]:
topic_dict = save_topic_words(nmf,tfidf_r.get_feature_names(), 10)
topic_dict[0]

['thi',
 'year',
 'hav',
 'ord',
 'purchas',
 'recommend',
 've',
 'tim',
 'amazon',
 'pric']

### Cosine similarity on product description

In [22]:
from sklearn.metrics.pairwise import cosine_similarity

In [23]:
# calculate the cosine similarity between all combinations of documents
from itertools import combinations

# # list all of the combinations of 5 take 2 as well as the pairs of phrases
# corpus = []
# for i in product_list['product'][0:100]:
#     corpus.append(i)
    
# pairs = list(combinations(enumerate(corpus),2))
# combos = [(a[0], b[0]) for a, b in pairs]
# phrases = [(a[1], b[1]) for a, b in pairs]

In [134]:
# results_tfidf = [cosine_similarity(rev_td_d_matrix.iloc[a].values.reshape(1,-1), rev_td_d_matrix.iloc[b].values.reshape(1,-1)) for a, b in combos]

# results_tfidf
# sorted(zip(results_tfidf, phrases), reverse=True)

### Cosine similarity on product reviews

In [25]:
# calculate the cosine similarity between all combinations of documents
from itertools import combinations

# list all of the combinations of 5 take 2 as well as the pairs of phrases
corpus = []
for i in all_review_list['asin']:
    corpus.append(i)
    
pairs = list(combinations(enumerate(corpus),2))
combos = [(a[0], b[0]) for a, b in pairs]
phrases = [(a[1], b[1]) for a, b in pairs]

In [26]:
results_tfidf = [cosine_similarity(H_r.iloc[a].values.reshape(1,-1), H_r.iloc[b].values.reshape(1,-1)) for a, b in combos]

results_tfidf
sim_asin = sorted(zip(results_tfidf, phrases), reverse=True)

In [27]:
sim_asin[0:5]

[(array([[1.]]), ('B00E6QU4WS', 'B01EM45E4G')),
 (array([[1.]]), ('B00AZJDNR8', 'B00JLQOO7O')),
 (array([[1.]]), ('B00AGOOFYM', 'B00IS41WTS')),
 (array([[1.]]), ('B0075RKSR6', 'B00IS41WTS')),
 (array([[1.]]), ('B0075RKSR6', 'B00AGOOFYM'))]

In [28]:
# save out all_review_list
with open('pickles/sim_asin.pickle', 'wb') as to_write:
    pickle.dump(sim_asin, to_write)

In [35]:
# save out all_review_list
with open('pickles/doc_topic.pickle', 'wb') as to_write:
    pickle.dump(H_r, to_write)

In [133]:
# save out topic words
with open('pickles/topic_words_dict.pickle', 'wb') as to_write:
    pickle.dump(topic_dict, to_write)

###  Visualizing w/ pyLDAvis

In [42]:

import pyLDAvis

import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

In [69]:
# need to drop documents that don't match any of the topics (the row sums to zero)
a = np.sum(rev_td_r_matrix,axis=1)[np.sum(rev_td_r_matrix,axis=1)==0].index
rev_matrix2 = rev_td_r_matrix.drop(index=a)

In [85]:
# turn back into sparse matrix
from scipy import sparse

In [86]:
sparse_rev = sparse.csr_matrix(rev_matrix2.values)
type(sparse_rev)

In [91]:
# dimension model
# vectorizer model
# doc-term matrix
pyLDAvis.sklearn.prepare(nmf, sparse_rev, tfidf_r)

  kernel = (topic_given_term * np.log((topic_given_term.T / topic_proportion).T))
  log_lift = np.log(topic_term_dists / term_proportion)
  log_ttd = np.log(topic_term_dists)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
