## Set up

In [1]:
from pymongo import MongoClient
from pprint import pprint

import pandas as pd
import re
import nltk

import pickle
import json

In [2]:
from bson.objectid import ObjectId

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
# stemming
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/Jocelyn/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
# This creates a client that uses the default port on localhost.
# If connecting to AWS, you need a connection string.
# Can do the same thing with MongoClient("mongodb://localhost:27017")
# client = MongoClient()
client = MongoClient("mongodb://localhost:27017")

In [6]:
# Makes it look similar to shell mongo
db = client.amazon_lb

In [7]:
db.list_collection_names()

['product_reviews',
 'skin_care_face_rev_agg',
 'product_metadata',
 'skin_care_face_reviews',
 'skin_care_face_prod_rev',
 'skin_care_face_products']

In [8]:
# cursor = db.skin_care_face_products.find()
# products = list(cursor)
# products[0]

In [9]:
# cursor = db.skin_care_face_prod_rev.find()
# products = list(cursor)
# products[0]

In [10]:
# import product list
with open('pickles/product_list.pickle', 'rb') as to_read:
    product_list = pickle.load(to_read)

with open('pickles/review_list.pickle', 'rb') as to_read:
    review_list = pickle.load(to_read)
    
with open('pickles/all_review_list.pickle', 'rb') as to_read:
    all_review_list = pickle.load(to_read)
# product_list.head(5)

In [11]:
print(product_list['description'][8])

A 5% liquid benzoyl peroxide acne treatment used to spot-treat affected areas, penetrate pores to clear existing acne blemishes and prevent the occurrence of future breakouts. PCA SKIN is a trusted innovator in the development of highly effective skincare products. Our vision is to improve peoples lives by providing results-oriented skin care solutions for the health of your unique skin. 


### Text processing

In [12]:
#Compound phrases
from nltk.tokenize import word_tokenize
from nltk.tokenize import MWETokenizer # multi-word expression

mwe_tokenizer = MWETokenizer([('times','a','day'),('holy','grail'),('t','zone')])

In [13]:
# Text preprocessing steps - remove numbers, captial letters and punctuation
import re
import string

remove_links = lambda x: re.sub('<[^<]+?>', '', x)
alphanumeric = lambda x: re.sub('\w*\d\w*', ' ', x)
punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())

#custom stop words
more_stop_words = lambda x: re.sub('oz|ml|mls|ounce|ounces','',x)
# more_stop_words2 = lambda x: re.sub('use|using|product|products|work|works|working|buy|buys|buying|stuff','',x)
more_stop_words2 = lambda x: re.sub('use|using|product|products|work|works|working|buy|buys|buying|stuff|good|excellent|great|bad|terrible|like|love|really','',x)
more_stop_words3 = lambda x: re.sub('face|skin','',x)
#custom replacements
mwe = lambda x: ' '.join(mwe_tokenizer.tokenize(word_tokenize(x)))

product_list['description'] = product_list.description.map(remove_links).map(alphanumeric).map(punc_lower)
all_review_list['all_review_text'] = all_review_list.all_review_text.map(remove_links).map(alphanumeric).map(punc_lower).map(more_stop_words).map(more_stop_words2).map(mwe)


In [14]:
# Stemmers

stemmer = LancasterStemmer()
# stemmer = PorterStemmer()
# stemmer = SnowballStemmer('english')
# lemmatizer = WordNetLemmatizer()

all_review_list['all_review_text']=[' '.join([stemmer.stem(word) for word in text.split(' ')])
          for text in all_review_list['all_review_text']]

### Vectorize

In [15]:
tfidf_d = TfidfVectorizer(stop_words='english', min_df = 20)

rev_td_d = tfidf_d.fit_transform(product_list['description'])
rev_td_d_matrix = pd.DataFrame(rev_td_d.toarray(),columns=tfidf_d.get_feature_names())
# rev_td_d_matrix.head(5)
rev_td_d_matrix.shape

(2233, 1272)

In [16]:
rev_td_d_matrix.head(5)

Unnamed: 0,ability,absolute,absorb,absorbed,absorbing,absorbs,absorption,acetate,achieve,acid,...,worldwide,wrinkle,wrinkles,xanthan,year,years,younger,youth,youthful,zinc
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.051457,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.131415,0.0,0.0,0.0,0.0,0.0


In [17]:
tfidf_r = TfidfVectorizer(stop_words='english', min_df = 20)

rev_td_r = tfidf_r.fit_transform(all_review_list['all_review_text'])
rev_td_r_matrix = pd.DataFrame(rev_td_r.toarray(),columns=tfidf_r.get_feature_names())
# rev_td_r_matrix.head(5)
rev_td_r_matrix.shape

(2241, 2522)

In [18]:
rev_td_r_matrix.head(5)

Unnamed: 0,ab,abl,abov,abras,absolv,absorb,ac,acc,acceiv,access,...,young,youth,youtub,yr,yuck,yummy,zero,zint,zit,zon
0,0.0,0.011598,0.034075,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.039204,0.0,0.0
1,0.0,0.016109,0.0,0.0,0.027272,0.0,0.0,0.0,0.026177,0.0,...,0.015673,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02284,0.0
2,0.0,0.0,0.0,0.044894,0.024515,0.0,0.0,0.0,0.0,0.0,...,0.028177,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055755
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.041171,0.0,0.0,0.059019,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.062652,0.117234,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.218908,0.0,0.0


### Reduce dimensionality

In [19]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [20]:
nmf = NMF(10)
doc_topic = nmf.fit_transform(rev_td_r_matrix)
topic_word = pd.DataFrame(nmf.components_.round(3),
#                         index = ["component_1","component_2","component_3","component_4"],
                        columns = tfidf_r.get_feature_names())
print(topic_word)

display_topics(nmf, tfidf_r.get_feature_names(), 10)

      ab    abl   abov  abras  absolv  absorb     ac    acc  acceiv  access  \
0  0.000  0.047  0.000  0.000   0.055   0.000  0.001  0.002   0.000   0.014   
1  0.000  0.005  0.004  0.000   0.031   0.000  0.000  0.005   0.000   0.000   
2  0.000  0.008  0.000  0.000   0.015   0.000  0.000  0.000   0.000   0.000   
3  0.005  0.013  0.000  0.000   0.075   0.012  0.000  0.000   0.004   0.000   
4  0.005  0.011  0.001  0.000   0.010   0.000  0.009  0.002   0.003   0.000   
5  0.000  0.009  0.012  0.000   0.055   0.307  0.001  0.007   0.014   0.000   
6  0.002  0.037  0.023  0.000   0.031   0.094  0.004  0.002   0.010   0.007   
7  0.005  0.030  0.004  0.000   0.025   0.138  0.000  0.002   0.014   0.000   
8  0.000  0.004  0.002  0.102   0.000   0.000  0.000  0.000   0.000   0.000   
9  0.000  0.020  0.001  0.000   0.043   0.000  0.002  0.003   0.000   0.000   

   ...  young  youth  youtub     yr   yuck  yummy   zero   zint    zit    zon  
0  ...  0.032  0.000   0.002  0.017  0.003  0.000 

In [21]:
H_r = pd.DataFrame(doc_topic.round(5),
             index = all_review_list['asin'])
H_r

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
B0000Y3NO6,0.05425,0.00904,0.00433,0.02283,0.10079,0.03511,0.04920,0.06421,0.00000,0.00097
B00012C5RS,0.05170,0.01705,0.00866,0.01598,0.04929,0.01919,0.12582,0.00585,0.00746,0.01432
B0001EKTTC,0.04838,0.02001,0.04257,0.07200,0.10526,0.00331,0.00418,0.00000,0.01372,0.00000
B0001EL5Q8,0.07571,0.00000,0.00000,0.08721,0.01848,0.00526,0.08459,0.01514,0.01924,0.00000
B0001EL5JA,0.03345,0.00424,0.00000,0.06189,0.03397,0.10668,0.02496,0.00438,0.00000,0.00970
...,...,...,...,...,...,...,...,...,...,...
B01HBS87ZS,0.03086,0.00000,0.00198,0.02121,0.07267,0.00000,0.04792,0.00000,0.00922,0.00000
B01HBS7WW2,0.08174,0.00252,0.00628,0.02414,0.00000,0.03298,0.00000,0.02414,0.03777,0.01070
B01HEESSHG,0.04058,0.00000,0.00492,0.00000,0.00000,0.01716,0.03694,0.03204,0.03320,0.03323
B01HBS7XP8,0.06497,0.02684,0.00368,0.03071,0.00000,0.00000,0.00000,0.00000,0.00876,0.02400


In [22]:
# save out dictionary of topic words
def save_topic_words(model, feature_names, no_top_words, topic_names=None):
    topic_words = {}
    for ix, topic in enumerate(model.components_):
        lst = [feature_names[i]
            for i in topic.argsort()[:-no_top_words - 1:-1]]
        topic_words.update({ix:lst})
    return topic_words

In [23]:
topic_dict = save_topic_words(nmf,tfidf_r.get_feature_names(), 10)
topic_dict[0]

['thi',
 'year',
 'hav',
 'ord',
 'purchas',
 'recommend',
 've',
 'tim',
 'amazon',
 'pric']

### Cosine similarity on product description

In [24]:
from sklearn.metrics.pairwise import cosine_similarity

In [25]:
# calculate the cosine similarity between all combinations of documents
from itertools import combinations

# # list all of the combinations of 5 take 2 as well as the pairs of phrases
# corpus = []
# for i in product_list['product'][0:100]:
#     corpus.append(i)
    
# pairs = list(combinations(enumerate(corpus),2))
# combos = [(a[0], b[0]) for a, b in pairs]
# phrases = [(a[1], b[1]) for a, b in pairs]

In [26]:
# results_tfidf = [cosine_similarity(rev_td_d_matrix.iloc[a].values.reshape(1,-1), rev_td_d_matrix.iloc[b].values.reshape(1,-1)) for a, b in combos]

# results_tfidf
# sorted(zip(results_tfidf, phrases), reverse=True)

### Cosine similarity on product reviews

In [27]:
# calculate the cosine similarity between all combinations of documents
from itertools import combinations

# list all of the combinations of 5 take 2 as well as the pairs of phrases
corpus = []
for i in all_review_list['asin']:
    corpus.append(i)
    
pairs = list(combinations(enumerate(corpus),2))
combos = [(a[0], b[0]) for a, b in pairs]
phrases = [(a[1], b[1]) for a, b in pairs]

In [28]:
results_tfidf = [cosine_similarity(H_r.iloc[a].values.reshape(1,-1), H_r.iloc[b].values.reshape(1,-1)) for a, b in combos]

results_tfidf
sim_asin = sorted(zip(results_tfidf, phrases), reverse=True)

In [29]:
sim_asin[0:5]

[(array([[1.]]), ('B00E6QU4WS', 'B01EM45E4G')),
 (array([[1.]]), ('B00AZJDNR8', 'B00JLQOO7O')),
 (array([[1.]]), ('B00AGOOFYM', 'B00IS41WTS')),
 (array([[1.]]), ('B0075RKSR6', 'B00IS41WTS')),
 (array([[1.]]), ('B0075RKSR6', 'B00AGOOFYM'))]

In [30]:
# save out all_review_list
with open('pickles/sim_asin.pickle', 'wb') as to_write:
    pickle.dump(sim_asin, to_write)

In [31]:
# save out all_review_list
with open('pickles/doc_topic.pickle', 'wb') as to_write:
    pickle.dump(H_r, to_write)

In [32]:
# save out topic words
with open('pickles/topic_words_dict.pickle', 'wb') as to_write:
    pickle.dump(topic_dict, to_write)

###  Visualizing w/ pyLDAvis

In [35]:
import numpy as np
import pyLDAvis

import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

In [36]:
# need to drop documents that don't match any of the topics (the row sums to zero)
a = np.sum(rev_td_r_matrix,axis=1)[np.sum(rev_td_r_matrix,axis=1)==0].index
rev_matrix2 = rev_td_r_matrix.drop(index=a)

In [37]:
# turn back into sparse matrix
from scipy import sparse

In [38]:
sparse_rev = sparse.csr_matrix(rev_matrix2.values)
type(sparse_rev)

scipy.sparse.csr.csr_matrix

In [39]:
# dimension model
# vectorizer model
# doc-term matrix
pyLDAvis.sklearn.prepare(nmf, sparse_rev, tfidf_r)

  kernel = (topic_given_term * np.log((topic_given_term.T / topic_proportion).T))
  log_lift = np.log(topic_term_dists / term_proportion)
  log_ttd = np.log(topic_term_dists)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
