## Set up

In [2]:
from pymongo import MongoClient
from pprint import pprint

import pandas as pd
import re
import nltk

import pickle
import json

In [3]:
from bson.objectid import ObjectId

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
# stemming
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /Users/Jocelyn/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/Jocelyn/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
# This creates a client that uses the default port on localhost.
# If connecting to AWS, you need a connection string.
# Can do the same thing with MongoClient("mongodb://localhost:27017")
# client = MongoClient()
client = MongoClient("mongodb://localhost:27017")

In [7]:
# Makes it look similar to shell mongo
db = client.amazon_lb

In [8]:
db.list_collection_names()

['skin_care_face_reviews',
 'product_reviews',
 'skin_care_face_rev_agg',
 'skin_care_face_products',
 'skin_care_face_prod_rev',
 'product_metadata']

In [9]:
# cursor = db.skin_care_face_products.find()
# products = list(cursor)
# products[0]

In [10]:
cursor = db.skin_care_face_prod_rev.find()
products = list(cursor)
products[0]

{'_id': ObjectId('5dc881769f9b98109203bca6'),
 'title': 'DERMAdoctor Calm, Cool & Corrected anti-redness tranquility cream - 1.7 Oz',
 'also_buy': ['B019EKOK6G',
  'B01EM44IEI',
  'B0186FLPUE',
  'B00RORUNQI',
  'B006TD38ZG',
  'B01EM4D85I',
  'B002VM7ILE',
  'B0000ZREXG',
  'B000OMPQ76',
  'B00VHJ13EA',
  'B00E1KLNW4',
  'B01EM45YNW',
  'B00RORUL6U',
  'B000FJU4HK',
  'B00VHJ0ZUS'],
 'image': ['https://images-na.ssl-images-amazon.com/images/I/41sTOlcsmjL._SX50_SY65_CR,0,0,50,65_.jpg',
  'https://images-na.ssl-images-amazon.com/images/I/81K-8p2lItL.SX50_SY65_CR,0,0,50,65_PKmb-play-button-overlay-thumb_.jpg'],
 'rank': '186,829inBeautyamp;PersonalCare(',
 'also_view': ['B01ETQBD6U',
  'B019EKOK6G',
  'B01EM45YNW',
  'B0186FLPUE',
  'B07B8BN8H6',
  'B000FJU4HK',
  'B01EM44IEI',
  'B00RORUL6U',
  'B01FO1IT02',
  'B001FXOCT6',
  'B01EM45E4G',
  'B00RORUNQI',
  'B000Z61SSM',
  'B00VHJ13EA',
  'B002CML1XE',
  'B0000ZREXG',
  'B07B7HVPCB',
  'B071JS38YF',
  'B00ZPWR0N8',
  'B00E1KLNW4'],
 'de

In [11]:
# import product list
with open('pickles/product_list.pickle', 'rb') as to_read:
    product_list = pickle.load(to_read)

product_list.head(5)

Unnamed: 0,id,asin,product,overall_rating,review_count,description,category2,category3
0,5dc881769f9b98109203bca6,B0000Y3NO6,"DERMAdoctor Calm, Cool & Corrected anti-rednes...",3.9,34,,Treatments & Masks,Not listed
1,5dc881769f9b98109203bca9,B00012C5RS,"DERMAdoctor Picture Porefect Pore Minimizer, 1...",3.2,27,,"Exfoliators, Polishes & Scrubs",Exfoliators
2,5dc881769f9b98109203bcb9,B0001EKTTC,"Glytone Rejuvenating Mask, 3 oz.",4.3,26,Promotes a supple texture and bright tone Glyt...,Treatments & Masks,Masks
3,5dc881769f9b98109203bcc4,B0001EL5Q8,"PCA SKIN Rejuvenating Serum, 1 fl. oz.",3.6,55,Formulated with grape fruit stem cell extract ...,Treatments & Masks,Serums
4,5dc881769f9b98109203bcc5,B0001EL5JA,PCA SKIN Protecting Hydrator Broad Spectrum S...,4.3,41,This non-oily daily hydrator and sunscreen pr...,Creams & Moisturizers,Face Moisturizers


In [12]:
product_list.shape

(2269, 8)

In [13]:
product_list['id'].nunique()
product_list['asin'].nunique()

2269

### cleaning

In [14]:
products[8]['description']

'A 5% liquid benzoyl peroxide acne treatment used to spot-treat affected areas, penetrate pores to clear existing acne blemishes and prevent the occurrence of future breakouts. PCA SKIN is a trusted innovator in the development of highly effective skincare products. Our vision is to improve peoples lives by providing results-oriented skin care solutions for the health of your unique skin. '

In [15]:
all_reviews = pd.DataFrame(columns=['asin','reviewer_id','review_id','rating','review_text'])

for prod in products:
    df = pd.DataFrame()
    df['asin'] = [rev['asin'] for rev in prod['reviews']]
    df['reviewer_id'] = [rev['reviewerID'] for rev in prod['reviews']]
    df['review_id'] = [rev['_id'] for rev in prod['reviews']]
    df['rating'] = [rev['overall'] for rev in prod['reviews']]
    df['review_text'] = [rev['reviewText'] for rev in prod['reviews']]
    all_reviews = pd.concat([all_reviews,df],axis=0,ignore_index=True)
    

In [16]:
print(len(all_reviews))
all_reviews.tail(10)
# all_reviews['asin'].nunique()

84036


Unnamed: 0,asin,reviewer_id,review_id,rating,review_text
84026,B01HGSJPMW,ALUV2YKWZENKK,5dc881bfaf3db5220c9cf9c9,4.0,"This smells delicious (like rosehips), goes on..."
84027,B01HGSJPMW,A3RL0RFSSLJGKT,5dc881bfaf3db5220c9cf9cd,4.0,The first thing I noticed when I opened this w...
84028,B01HGSJPMW,A2PGT5T1M9UJCJ,5dc881c1af3db5220c9e3fe7,5.0,I love the feel of this product on my face is ...
84029,B01HGSJPMW,AAF5D1LTFGB7L,5dc881c1af3db5220c9e3fe9,5.0,I love all of the Elemis products.
84030,B01HGSJPMW,A1ZPA7IPK9H6OL,5dc881c1af3db5220c9e3fea,5.0,Wonderful product. You will see results immed...
84031,B01HGSJPMW,A1T0YIVHV7PMCA,5dc881c1af3db5220c9e3feb,5.0,"My go to Facial Oil, used 2x per day, morning ..."
84032,B01HGSJPMW,A3GFF90WVBT8H2,5dc881c1af3db5220c9e3fec,5.0,"Very nice facial oil, will buy again."
84033,B01HGSJPMW,A1KP9URYX9EFXQ,5dc881c1af3db5220c9e3fed,4.0,I work in an aesthetics practice. I regularly...
84034,B01HGSJPMW,AT218D5LS35QU,5dc881c1af3db5220c9e3fee,5.0,I loooove! My skin just eats this up. Only ne...
84035,B01HGSJPMW,A1CY917NEV8TAF,5dc881c1af3db5220c9e3fef,5.0,Excellent Product !


### text processing

In [17]:
df = pd.merge(product_list,all_reviews,on='asin',how='inner')
df.head(5)
df.shape

(84027, 12)

In [18]:
print(all_reviews['review_text'][1021])

I am an avid mustela user for me and baby. Mustela is my favorite scent in the world. This is not the same scent as other mustela products.


In [19]:
#descriptive reviews: 5
#has the phrase 'worth the money': 17
print(df['review_text'][1021])

I am an avid mustela user for me and baby. Mustela is my favorite scent in the world. This is not the same scent as other mustela products.


In [20]:
#Spelling correction

In [21]:
#Compound phrases
from nltk.tokenize import word_tokenize
from nltk.tokenize import MWETokenizer # multi-word expression

# my_text = "You all are the greatest students of all time."
# mwe_tokenizer = MWETokenizer([('You','all'), ('of', 'all', 'time')])

# mwe_tokens = mwe_tokenizer.tokenize(word_tokenize(my_text))
# ' '.join(mwe_tokens)

mwe_tokenizer = MWETokenizer([('times','a','day'),('holy','grail'),('t','zone')])



In [22]:
#Proper nouns
#list of brand names
#list of products


In [23]:
# Text preprocessing steps - remove numbers, captial letters and punctuation
import re
import string

remove_links = lambda x: re.sub('<[^<]+?>', '', x)
alphanumeric = lambda x: re.sub('\w*\d\w*', ' ', x)
punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())

#custom stop words
more_stop_words = lambda x: re.sub('oz|ml|mls|ounce|ounces','',x)
more_stop_words2 = lambda x: re.sub('use|using|product|products|work|works|working|buy|buys|buying|stuff|good|excellent|great|bad|terrible|like|love|really','',x)

#custom replacements
# 'T-zone','t zone'
mwe = lambda x: ' '.join(mwe_tokenizer.tokenize(word_tokenize(x)))


df['review_text'] = df.review_text.map(remove_links).map(alphanumeric).map(punc_lower).map(more_stop_words).map(more_stop_words2).map(mwe)

In [24]:
# Stemmers

# stemmer = LancasterStemmer()
# stemmer = PorterStemmer
# stemmer = SnowballStemmer('english')
# lemmatizer = WordNetLemmatizer()

# df['review_text']=[' '.join([stemmer.stem(word) for word in text.split(' ')])
#           for text in df['review_text']]

In [25]:
#check
print(df.iloc[1018][11])

i think this is a i don t the smell as much as the hydra bebe s so i am taking a star off i don t think the smell is but my husband thinks it smells jalapenos i definitely don t but i guess every nose is different and it is something to consider if you are considering a purchase i don t it as shampoo only body wash on my month old she s baths so we do one every night as part of her bedtime routine i wanted a mild body cleanser and this is it keeps her skin soft and hydrated but i do also the hydra bebe after her bath i would order again


In [26]:
# for i,_ in enumerate(df.iterrows()):
#     if 'href' in df['review_text'][i]:
#         print(i,df['review_id'][i],df['review_text'][i])
#         pass

In [27]:
# test = db.skin_care_face_reviews.find({'_id':ObjectId('5dc881afaf3db5220c9584e4')},)
# list(test)

In [28]:
# df.iloc[1][1]
df_low = df.loc[df['rating']<=2] 
df_high = df.loc[df['rating']>=3]

In [29]:
# dataframes by category

def df_cat(category):
    df1 = df.loc[df['category2']==category]
    print(category, df1.shape)
    return df1

df_cleanser = df_cat('Cleansers')
df_moisturizer = df_cat('Creams & Moisturizers')
df_toner = df_cat('Toners & Astringents')
df_EPS = df_cat('Exfoliators, Polishes & Scrubs')
df_tx = df_cat('Treatments & Masks')

Cleansers (21763, 12)
Creams & Moisturizers (30958, 12)
Toners & Astringents (5852, 12)
Exfoliators, Polishes & Scrubs (2160, 12)
Treatments & Masks (19032, 12)


In [30]:
df_cleanser_low = df_cleanser.loc[df_cleanser['rating']<=3] 
df_cleanser_high = df_cleanser.loc[df_cleanser['rating']>=4]
print(df_cleanser_low.shape,df_cleanser_high.shape)

(3877, 12) (17886, 12)


In [31]:
# save out link_list
with open('pickles/review_list.pickle', 'wb') as to_write:
    pickle.dump(df, to_write)

### Count Vectorizer

In [33]:
# The first document-term matrix has default Count Vectorizer values - counts of unigrams
cv1 = CountVectorizer(stop_words='english')

rev_cv1 = cv1.fit_transform(df_cleanser['review_text'][0:1000])

rev_cv1_matrix = pd.DataFrame(rev_cv1.toarray(),columns=cv1.get_feature_names())
# rev_cv1_matrix.head(5)
rev_cv1_matrix.shape

(1000, 2965)

In [85]:
cv2 = CountVectorizer(ngram_range=(1,2), binary=True, stop_words='english')

rev_cv2 = cv2.fit_transform(df_cleanser['review_text'][0:1000])

rev_cv2_matrix = pd.DataFrame(rev_cv2.toarray(),columns=cv2.get_feature_names())
# rev_cv2_matrix.head(5)
rev_cv2_matrix.shape

(1000, 8306)

### TD-IDF

In [87]:
# Create TF-IDF versions of the Count Vectorizers created earlier in the exercise
tfidf1 = TfidfVectorizer(stop_words='english')

rev_td1 = tfidf1.fit_transform(df_cleanser['review_text'][0:1000])
rev_td1_matrix = pd.DataFrame(rev_td1.toarray(),columns=tfidf1.get_feature_names())
# rev_td1_matrix.head(5)
rev_td1_matrix.shape

(1000, 1824)

In [94]:
tfidf2 = TfidfVectorizer(ngram_range=(1,2), binary=False, stop_words='english', min_df = 3)

rev_td2 = tfidf2.fit_transform(df_tx['review_text'][0:1000])
rev_td2_matrix = pd.DataFrame(rev_td2.toarray(),columns=tfidf2.get_feature_names())
# rev_td2_matrix.head(5)
rev_td2_matrix.shape

(1000, 4680)

### Topic Modeling

In [33]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

#### Compare variances

In [None]:
# vectors = [rev_cv_matrix, rev_cv2_matrix, rev_td_matrix, rev_td2_matrix]
vectors = [rev_cv_matrix]
for vec in vectors:
    lsa = TruncatedSVD(5)
    lsa.fit_transform(vec)
    print(lsa.explained_variance_ratio_)

#### Display topics

In [95]:
# NMF

vectors = [rev_td2_matrix]
vectorizers = [tfidf2]

for i,vec in enumerate(vectors):
    nmf = NMF(3)
    nmf.fit_transform(vec)
#     print(nmf.explained_variance_ratio_)
#     topic_word = pd.DataFrame(lsa.components_.round(3),
# #              index = ["component_1","component_2","component_3"],
#              columns = cv1.get_feature_names())
#     print(topic_word)
    display_topics(nmf, vectorizers[i].get_feature_names(), 10)


Topic  0
skin, sensitive, sensitive skin, oily, oily skin, dry, dry skin, didn, cream, pca

Topic  1
years, ve years, ve, favorite years, favorite, day years, ve day, better skin, day, better

Topic  2
face, wash, cleanser, face wash, bottle, facial, does, clean, facial cleanser, doesn


In [89]:
# Acronynms: Latent Semantic Analysis (LSA) is just another name for 
#  Signular Value Decomposition (SVD) applied to Natural Language Processing (NLP)

vectors = [rev_cv1_matrix, rev_cv2_matrix, rev_td_matrix, rev_td2_matrix]
vectorizers = [cv1,cv2,tfidf1,tfidf2]
# vectors = [rev_cv1_matrix,rev_cv2_matrix]
# vectorizers = [cv1,cv2]

for i,vec in enumerate(vectors):
    lsa = TruncatedSVD(3)
    lsa.fit_transform(vec)
    print(lsa.explained_variance_ratio_)
#     topic_word = pd.DataFrame(lsa.components_.round(3),
# #              index = ["component_1","component_2","component_3"],
#              columns = cv1.get_feature_names())
#     print(topic_word)
    display_topics(lsa, vectorizers[i].get_feature_names(), 10)

[0.13172661 0.04712319 0.03478123]

Topic  0
skin, acne, cream, pca, really, face, day, just, dry, little

Topic  1
skin, nbsp, baby, said, wash, service, looks, neova, best, fresh

Topic  2
baby, scent, like, love, just, mustela, smell, face, little, old
[0.03143141 0.03045427 0.0272021 ]

Topic  0
skin, acne, face, cream, really, pca, just, little, like, days

Topic  1
radiant, neova, radiant skin, service, customer service, idea, early, fresh, treated, followed

Topic  2
acne scars, online, scars, believe, price little, old acne, visible, gotten, pricey, beauty
[0.00970453 0.02999891 0.02330085]

Topic  0
secrets, mild, perception, acne, faithful, hesitant, comfortable, phase, extremely, rinses

Topic  1
secrets, faithful, medicine, money, competitive, dispenses, acts, doctor, experiencing, felt

Topic  2
faithful, hesitant, phase, avid, disorders, extremely, old, infection, chanel, expected
[0.00391598 0.01116866 0.00873359]

Topic  0
love, skin, acne, cream, smell, face, pca, acne

In [90]:
# comparing high and low
tfidf2_high = TfidfVectorizer(ngram_range=(1,3), binary=True, stop_words='english')
rev_td2_high = tfidf2_high.fit_transform(df_cleanser_high['review_text'][0:1000])
rev_td2_high_matrix = pd.DataFrame(rev_td2_high.toarray(),columns=tfidf2_high.get_feature_names())

tfidf2_low = TfidfVectorizer(ngram_range=(1,3), binary=True, stop_words='english')
rev_td2_low = tfidf2_low.fit_transform(df_cleanser_low['review_text'])
rev_td2_low_matrix = pd.DataFrame(rev_td2_low.toarray(),columns=tfidf2_low.get_feature_names())

In [91]:
# Acronynms: Latent Semantic Analysis (LSA) is just another name for 
#  Signular Value Decomposition (SVD) applied to Natural Language Processing (NLP)

vectors = [rev_td2_low_matrix,rev_td2_high_matrix]
vectorizers = [tfidf2_low,tfidf2_high]

for i,vec in enumerate(vectors):
    lsa = TruncatedSVD(3)
    lsa.fit_transform(vec)
    print(lsa.explained_variance_ratio_)
#     topic_word = pd.DataFrame(lsa.components_.round(3),
# #              index = ["component_1","component_2","component_3"],
#              columns = cv1.get_feature_names())
#     print(topic_word)
    display_topics(lsa, vectorizers[i].get_feature_names(), 10)

[0.00090668 0.00365253 0.00269552]

Topic  0
skin, did, like, smell, ok, face, didn, does, don, really

Topic  1
ok, just ok, ok couldn raves, ok couldn, couldn raves, ok does, raves, ok does lather, does lather, ok moisturizing

Topic  2
smell, doesn smell, doesn, smell strong, strong, nice smell, nice, like smell, know, did like smell
[0.01173085 0.00508871 0.00763634]

Topic  0
love, love smell, smell, years love, years, skin, baby, gentle, mustela, nice

Topic  1
skin, acne, sensitive, sensitive skin, cream, face, pca, really, little, long

Topic  2
received, thank, item, received described, ship, received described love, transaction, future thank smooth, quick ship item, quick ship


In [98]:
# Acronynms: Latent Semantic Analysis (LSA) is just another name for 
#  Signular Value Decomposition (SVD) applied to Natural Language Processing (NLP)

# vectors = [rev_td2_low_matrix,rev_td2_high_matrix]
# vectorizers = [tfidf2_low,tfidf2_high]

# vectors = [rev_cv1_matrix, rev_cv2_matrix, rev_td_matrix, rev_td2_matrix]
# vectorizers = [cv1,cv2,tfidf1,tfidf2]
vectors = [rev_td2_matrix]
vectorizers = [tfidf2]

for i,vec in enumerate(vectors):
    nmf = NMF(4)
    nmf.fit_transform(vec)
#     print(nmf.explained_variance_ratio_)
#     topic_word = pd.DataFrame(lsa.components_.round(3),
# #              index = ["component_1","component_2","component_3"],
#              columns = cv1.get_feature_names())
#     print(topic_word)
    display_topics(nmf, vectorizers[i].get_feature_names(), 10)


Topic  0
acne, cream, acne cream, pca, ed, skin, helps, spot, really, face

Topic  1
love, smell, love smell, years love, baby, mustela, years, love mustela, scent, ve

Topic  2
skin, gentle, sensitive, nice, sensitive skin, way, little, long, leaves, cleanser

Topic  3
received, item, thank, ship item, future thank, quick ship, smooth transaction, ship, described love, item received


### Count vectorize all reviews

In [76]:
# The first document-term matrix has default Count Vectorizer values - counts of unigrams
cv3 = CountVectorizer(stop_words='english', min_df=3)

rev_cv3 = cv3.fit_transform(df['review_text'])

rev_cv3_matrix = pd.DataFrame(rev_cv3.toarray(),columns=cv3.get_feature_names())

rev_cv3_matrix.shape

(84027, 11790)

In [77]:
rev_cv3_matrix.head(5)

Unnamed: 0,aa,aaa,aactin,ab,aback,abandon,abandoned,abated,abc,abd,...,zingiber,zip,ziplock,zirh,zit,zits,zone,zones,zoom,zyme
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [43]:
# for i in df['review_text']:
#     if '00' in i:
#         print(i)

In [44]:
tfidf3 = TfidfVectorizer(stop_words='english', min_df = 3)

rev_td3 = tfidf3.fit_transform(df['review_text'])
rev_td3_matrix = pd.DataFrame(rev_td3.toarray(),columns=tfidf3.get_feature_names())
# rev_td3_matrix.head(5)
rev_td3_matrix.shape

(84027, 11790)

In [45]:
rev_td3_matrix.head(5)

Unnamed: 0,aa,aaa,aactin,ab,aback,abandon,abandoned,abated,abc,abd,...,zingiber,zip,ziplock,zirh,zit,zits,zone,zones,zoom,zyme
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
from sklearn.metrics.pairwise import cosine_similarity

In [71]:
# calculate the cosine similarity between all combinations of documents
from itertools import combinations

# list all of the combinations of 5 take 2 as well as the pairs of phrases
corpus = []
for i in df['review_text'][0:10]:
    corpus.append(i)
    
pairs = list(combinations(enumerate(corpus),2))
combos = [(a[0], b[0]) for a, b in pairs]
phrases = [(a[1], b[1]) for a, b in pairs]

In [67]:
# pairs = list(combinations(range(0,10),2))

In [74]:
results_tfidf = [cosine_similarity(rev_td3_matrix.iloc[a].values.reshape(1,-1), rev_td3_matrix.iloc[b].values.reshape(1,-1)) for a, b in combos]

results_tfidf
sorted(zip(results, phrases), reverse=True)

[(array([[0.40824829]]),
  ('i gave it a shot for a while until the small bottle was completely empty and it just didn t seem to make a difference for me the cream feels cool on the skin which is nice but overall i didn t notice a change',
   'i have d this cream for a few months now i don t wear makeup but i try to take care of my skin i have mild rosacea in a few spots i have been to the dermatologist and gotten lots of prescription creams even a steroid cream that ed pretty well this s almost as as the steroid it keeps the redness managed without all the chemicals i m sure it has chemicals in it but you don t need a prescription so i would think that these do not have the side effects of a prescription cream i a very small amount and it keeps the redness down i m still on my first container of it i would guess a jar would last months if you d it sparingly it s worth it if you can afford it')),
 (array([[0.40824829]]),
  ('i gave it a shot for a while until the small bottle was compl

In [54]:
type(rev_td3_matrix.iloc[0])

pandas.core.series.Series

In [56]:
corpus = ['The weather is hot under the sun',
          'I make my hot chocolate with milk',
          'One hot encoding',
          'I will have a chai latte with milk',
          'There is a hot sale today']
# create the document-term matrix with count vectorizer

cv = CountVectorizer(stop_words="english")
X = cv.fit_transform(corpus).toarray()
dt = pd.DataFrame(X, columns=cv.get_feature_names())
dt

Unnamed: 0,chai,chocolate,encoding,hot,latte,make,milk,sale,sun,today,weather
0,0,0,0,1,0,0,0,0,1,0,1
1,0,1,0,1,0,1,1,0,0,0,0
2,0,0,1,1,0,0,0,0,0,0,0
3,1,0,0,0,1,0,1,0,0,0,0
4,0,0,0,1,0,0,0,1,0,1,0


In [62]:
# calculate the cosine similarity between all combinations of documents
from itertools import combinations

# list all of the combinations of 5 take 2 as well as the pairs of phrases
pairs = list(combinations(enumerate(corpus),2))
combos = [(a[0], b[0]) for a, b in pairs]
phrases = [(a[1], b[1]) for a, b in pairs]

# calculate the cosine similarity for all pairs of phrases and sort by most similar
results = [cosine_similarity(dt.iloc[a].values.reshape(1,-1), dt.iloc[b].values.reshape(1,-1)) for a, b in combos]
sorted(zip(results, phrases), reverse=True)

[(array([[0.40824829]]),
  ('The weather is hot under the sun', 'One hot encoding')),
 (array([[0.40824829]]), ('One hot encoding', 'There is a hot sale today')),
 (array([[0.35355339]]),
  ('I make my hot chocolate with milk', 'One hot encoding')),
 (array([[0.33333333]]),
  ('The weather is hot under the sun', 'There is a hot sale today')),
 (array([[0.28867513]]),
  ('The weather is hot under the sun', 'I make my hot chocolate with milk')),
 (array([[0.28867513]]),
  ('I make my hot chocolate with milk', 'There is a hot sale today')),
 (array([[0.28867513]]),
  ('I make my hot chocolate with milk', 'I will have a chai latte with milk')),
 (array([[0.]]),
  ('The weather is hot under the sun', 'I will have a chai latte with milk')),
 (array([[0.]]), ('One hot encoding', 'I will have a chai latte with milk')),
 (array([[0.]]),
  ('I will have a chai latte with milk', 'There is a hot sale today'))]