# Read in Data

In [19]:
# Import data.
import pandas as pd
import numpy as np
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt

nltk.download('punkt')
nltk.download('stopwords')

df = pd.read_csv('watch_reviews.tsv', sep='\t', error_bad_lines=False)

[nltk_data] Downloading package punkt to /Users/judychen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/judychen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
b'Skipping line 8704: expected 15 fields, saw 22\nSkipping line 16933: expected 15 fields, saw 22\nSkipping line 23726: expected 15 fields, saw 22\n'
b'Skipping line 85637: expected 15 fields, saw 22\n'
b'Skipping line 132136: expected 15 fields, saw 22\nSkipping line 158070: expected 15 fields, saw 22\nSkipping line 166007: expected 15 fields, saw 22\nSkipping line 171877: expected 15 fields, saw 22\nSkipping line 177756: expected 15 fields, saw 22\nSkipping line 181773: expected 15 fields, saw 22\nSkipping line 191085: expected 15 fields, saw 22\nSkipping line 196273: expected 15 fields, saw 22\nSkipping line 196331: expected 15 fields, saw 22\n'
b'Skipping line 197000: expected 15 fields, saw 22\nSkipping line 197011: exp

In [20]:
df.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,3653882,R3O9SGZBVQBV76,B00FALQ1ZC,937001370,"Invicta Women's 15150 ""Angel"" 18k Yellow Gold ...",Watches,5,0,0,N,Y,Five Stars,Absolutely love this watch! Get compliments al...,2015-08-31
1,US,14661224,RKH8BNC3L5DLF,B00D3RGO20,484010722,Kenneth Cole New York Women's KC4944 Automatic...,Watches,5,0,0,N,Y,I love thiswatch it keeps time wonderfully,I love this watch it keeps time wonderfully.,2015-08-31
2,US,27324930,R2HLE8WKZSU3NL,B00DKYC7TK,361166390,Ritche 22mm Black Stainless Steel Bracelet Wat...,Watches,2,1,1,N,Y,Two Stars,Scratches,2015-08-31
3,US,7211452,R31U3UH5AZ42LL,B000EQS1JW,958035625,Citizen Men's BM8180-03E Eco-Drive Stainless S...,Watches,5,0,0,N,Y,Five Stars,"It works well on me. However, I found cheaper ...",2015-08-31
4,US,12733322,R2SV659OUJ945Y,B00A6GFD7S,765328221,Orient ER27009B Men's Symphony Automatic Stain...,Watches,4,0,0,N,Y,"Beautiful face, but cheap sounding links",Beautiful watch face. The band looks nice all...,2015-08-31


In [21]:
# Check missing values.
df.isnull().sum()

marketplace            0
customer_id            0
review_id              0
product_id             0
product_parent         0
product_title          2
product_category       0
star_rating            0
helpful_votes          0
total_votes            0
vine                   0
verified_purchase      0
review_headline        7
review_body          148
review_date            4
dtype: int64

In [22]:
# Remove if the review without review boday.
df.dropna(subset=['review_body'], inplace=True)
df.reset_index(inplace=True, drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 960056 entries, 0 to 960055
Data columns (total 15 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   marketplace        960056 non-null  object
 1   customer_id        960056 non-null  int64 
 2   review_id          960056 non-null  object
 3   product_id         960056 non-null  object
 4   product_parent     960056 non-null  int64 
 5   product_title      960054 non-null  object
 6   product_category   960056 non-null  object
 7   star_rating        960056 non-null  int64 
 8   helpful_votes      960056 non-null  int64 
 9   total_votes        960056 non-null  int64 
 10  vine               960056 non-null  object
 11  verified_purchase  960056 non-null  object
 12  review_headline    960049 non-null  object
 13  review_body        960056 non-null  object
 14  review_date        960052 non-null  object
dtypes: int64(5), object(10)
memory usage: 109.9+ MB


In [40]:
# Use the first 15000 data as training data.
data = df.loc[:14999, 'review_body'].tolist()

# Tokenizing and Stemming

Load stopwords and stemmer function from NLTK library.

In [41]:
# Use nltk's English stopwords.
stopwords = nltk.corpus.stopwords.words('english')
stopwords.append("'s")
stopwords.append("'m")
stopwords.append("br") #html <br>
stopwords.append("watch")

print("Stopwords that we use from nltk library: ")
print(stopwords)

Stopwords that we use from nltk library: 
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'no

In [42]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

# Define functions to tokenize and stem reviews.
def tokenization_and_stemming(text):
    # exclude stop words and tokenize the document, generate a list of string
    tokens = []
    for word in nltk.word_tokenize(text):
        if word.lower() not in stopwords:
            tokens.append(word.lower())
    
    # filter out any tokens not containing letters such as numeric tokens and raw punctuation.
    filtered_tokens = []
    for token in tokens:
        if token.isalpha():
            filtered_tokens.append(token)
    
    # stemming.
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems

# Test function with the first review.
print(data[0])
print(tokenization_and_stemming(data[0]))


Absolutely love this watch! Get compliments almost every time I wear it. Dainty.
['absolut', 'love', 'get', 'compliment', 'almost', 'everi', 'time', 'wear', 'dainti']


# Term Frequency - Inverse Document Frequency

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Define vectorizer parameters, and use TfidfVectorizer to create tf-idf matrix
tfidf_model = TfidfVectorizer(max_df=0.99, max_features=1000, min_df=0.01, stop_words='english', use_idf=True, 
                                tokenizer=tokenization_and_stemming, ngram_range=(1,1))

# Fit the vectorizer to synopses
tfidf_matrix = tfidf_model.fit_transform(data)
tfidf_matrix



<15000x227 sparse matrix of type '<class 'numpy.float64'>'
	with 100770 stored elements in Compressed Sparse Row format>

In [44]:
tfidf_matrix.toarray()

array([[0.        , 0.52538715, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [45]:
tfidf_matrix.todense()

matrix([[0.        , 0.52538715, 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ]])

In [46]:
# Save terms identified by TF-IDF
tf_selected_words = tfidf_model.get_feature_names()
print(tf_selected_words)

['abl', 'absolut', 'accur', 'actual', 'adjust', 'alarm', 'alreadi', 'alway', 'amaz', 'amazon', 'anoth', 'anyth', 'appear', 'arriv', 'attract', 'automat', 'awesom', 'bad', 'band', 'batteri', 'beauti', 'best', 'better', 'big', 'bit', 'black', 'blue', 'bought', 'box', 'bracelet', 'brand', 'broke', 'button', 'buy', 'ca', 'came', 'case', 'casio', 'chang', 'cheap', 'clasp', 'classi', 'clear', 'clock', 'color', 'come', 'comfort', 'compliment', 'cool', 'cost', 'coupl', 'crystal', 'cute', 'dark', 'date', 'day', 'deal', 'definit', 'design', 'dial', 'differ', 'difficult', 'digit', 'disappoint', 'display', 'durabl', 'easi', 'easili', 'eleg', 'end', 'everi', 'everyday', 'everyth', 'exact', 'excel', 'expect', 'expens', 'face', 'far', 'fast', 'favorit', 'featur', 'feel', 'fell', 'figur', 'fine', 'fit', 'function', 'gave', 'gift', 'glass', 'goe', 'gold', 'good', 'got', 'great', 'hand', 'happi', 'hard', 'heavi', 'help', 'high', 'hold', 'hope', 'hour', 'howev', 'husband', 'instruct', 'invicta', 'issu', 

# K-means Clustering

In [47]:
# k-means clustering
from sklearn.cluster import KMeans

# define number of clusters
num_clusters = 5
km = KMeans(n_clusters= num_clusters)
km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

In [48]:
# Analyze K-means Result
# Create dataframe films from all of the input files.
product = {'review': df[:15000].review_body, 'cluster': clusters}
frame = pd.DataFrame(product, columns = ['review', 'cluster'])
frame.head(10)

Unnamed: 0,review,cluster
0,Absolutely love this watch! Get compliments al...,4
1,I love this watch it keeps time wonderfully.,1
2,Scratches,4
3,"It works well on me. However, I found cheaper ...",4
4,Beautiful watch face. The band looks nice all...,4
5,"i love this watch for my purpose, about the pe...",4
6,"for my wife and she loved it, looks great and ...",2
7,I was about to buy this thinking it was a Swis...,4
8,Watch is perfect. Rugged with the metal &#34;B...,2
9,Great quality and build.<br />The motors are r...,4


In [49]:
# Number of reviews included in each cluster.
frame['cluster'].value_counts().to_frame()

Unnamed: 0,cluster
4,10587
2,1569
1,1022
0,975
3,847


In [50]:
# km.cluster_centers_ denotes the importances of each items in centroid
km.cluster_centers_

array([[0.        , 0.        , 0.00499989, ..., 0.00340436, 0.00460536,
        0.00128474],
       [0.0004098 , 0.01794603, 0.00047436, ..., 0.        , 0.0049906 ,
        0.00216037],
       [0.00101928, 0.00412852, 0.00435933, ..., 0.00439511, 0.00812147,
        0.01049133],
       [0.        , 0.        , 0.00313669, ..., 0.00556213, 0.00751155,
        0.00090661],
       [0.00482522, 0.00620183, 0.00619551, ..., 0.00902359, 0.02522583,
        0.01657956]])

In [51]:
# Clustering result by K-means
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

cluster_keywords_summary = {}
for i in range(num_clusters):
    print ("Cluster " + str(i) + " words:", end='')
    cluster_keywords_summary[i] = []
    for ind in order_centroids[i, :6]: #replace 6 with n words per cluster
        cluster_keywords_summary[i].append(tf_selected_words[ind])
        print (tf_selected_words[ind] + ",", end='')
    print ()

    cluster_reviews = frame[frame.cluster==i].review.tolist()
    print ("Cluster " + str(i) + " reviews (" + str(len(cluster_reviews)) + " reviews): ")

Cluster 0 words:good,product,look,qualiti,price,recommend,
Cluster 0 reviews (975 reviews): 
Cluster 1 words:love,gift,husband,beauti,wife,bought,
Cluster 1 reviews (1022 reviews): 
Cluster 2 words:great,look,price,love,work,product,
Cluster 2 reviews (1569 reviews): 
Cluster 3 words:nice,look,price,love,realli,like,
Cluster 3 reviews (847 reviews): 
Cluster 4 words:look,like,time,band,work,love,
Cluster 4 reviews (10587 reviews): 


# Topic Modeling - Latent Dirichlet Allocation

In [53]:
# Use LDA for clustering
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components=5)

In [54]:
# document topic matrix for tfida_matrix_lda
lda_output = lda.fit_transform(tfidf_matrix)
print(lda_output)

[[0.75441685 0.0595699  0.06297061 0.06310917 0.05993348]
 [0.08436316 0.08412686 0.08735153 0.6569227  0.08723575]
 [0.10000078 0.10000088 0.59998525 0.1000009  0.10001218]
 ...
 [0.10018696 0.59968791 0.10011057 0.10000007 0.10001448]
 [0.08730112 0.08399738 0.08495703 0.65884727 0.0848972 ]
 [0.06800343 0.06847111 0.72205934 0.06916363 0.0723025 ]]


In [55]:
# Topics and words matrix
topic_word = lda.components_
print(topic_word)

[[  0.5423918   25.84042365   0.2105005  ...   0.67426972  94.26972084
   20.96506909]
 [  0.20180693   3.56771416   0.20141299 ...   9.91245663  91.09386261
    1.21040106]
 [ 40.89954817   6.60404461  50.89000497 ...  43.03387802 110.65889129
   91.86710687]
 [  0.20050495  54.85240854  22.43111231 ...   0.201233     0.53673661
    0.2017021 ]
 [ 12.25840155   0.61268357   7.71510929 ...  57.63718758   0.20213397
   82.97287656]]


In [56]:
topic_names = ["Topic" + str(i) for i in range(lda.n_components)]
doc_names = ['Doc' + str(i) for i in range(len(data))]
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topic_names, index=doc_names)

# get dominant topic for each document
topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['topic'] = topic
df_document_topic.head(10)

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,topic
Doc0,0.75,0.06,0.06,0.06,0.06,0
Doc1,0.08,0.08,0.09,0.66,0.09,3
Doc2,0.1,0.1,0.6,0.1,0.1,2
Doc3,0.06,0.06,0.74,0.07,0.07,2
Doc4,0.15,0.04,0.73,0.04,0.04,2
Doc5,0.7,0.07,0.07,0.07,0.07,0
Doc6,0.07,0.06,0.06,0.74,0.06,3
Doc7,0.06,0.06,0.75,0.06,0.07,2
Doc8,0.33,0.05,0.53,0.05,0.05,2
Doc9,0.06,0.07,0.76,0.06,0.06,2


In [57]:
df_document_topic['topic'].value_counts().to_frame()

Unnamed: 0,topic
2,5105
0,3389
4,2537
3,2194
1,1775


In [58]:
# topic word matrix
print(lda.components_)
df_topic_words = pd.DataFrame(lda.components_)

# column and index
df_topic_words.columns = tfidf_model.get_feature_names()
df_topic_words.index = topic_names
df_topic_words.head()

[[  0.5423918   25.84042365   0.2105005  ...   0.67426972  94.26972084
   20.96506909]
 [  0.20180693   3.56771416   0.20141299 ...   9.91245663  91.09386261
    1.21040106]
 [ 40.89954817   6.60404461  50.89000497 ...  43.03387802 110.65889129
   91.86710687]
 [  0.20050495  54.85240854  22.43111231 ...   0.201233     0.53673661
    0.2017021 ]
 [ 12.25840155   0.61268357   7.71510929 ...  57.63718758   0.20213397
   82.97287656]]


Unnamed: 0,abl,absolut,accur,actual,adjust,alarm,alreadi,alway,amaz,amazon,...,week,weight,white,wife,wish,work,worn,worth,wrist,year
Topic0,0.542392,25.840424,0.2105,21.34963,21.327569,0.200346,2.778315,31.808451,0.20157,10.511356,...,0.20184,29.893737,4.987575,0.986931,15.38232,15.479009,9.994829,0.67427,94.269721,20.965069
Topic1,0.201807,3.567714,0.201413,0.21809,2.236834,0.200886,0.228981,0.292713,123.212693,2.38127,...,0.201249,0.208684,0.201973,0.200902,8.695506,47.665976,0.203069,9.912457,91.093863,1.210401
Topic2,40.899548,6.604045,50.890005,46.004562,70.804714,35.912878,52.138604,36.609637,0.306053,33.261173,...,109.793904,28.746361,47.905246,10.079687,39.345207,127.914272,46.565126,43.033878,110.658891,91.867107
Topic3,0.200505,54.852409,22.431112,0.201249,0.200886,0.201924,0.200658,0.201395,0.200698,0.201077,...,0.200748,13.654278,0.201438,99.804981,5.008292,27.761517,0.201782,0.201233,0.536737,0.201702
Topic4,12.258402,0.612684,7.715109,4.547084,0.888337,21.85178,2.42789,13.151569,0.20132,51.207135,...,34.492387,0.200802,0.20082,0.20159,5.641893,318.969858,4.486652,57.637188,0.202134,82.972877


In [59]:
# print top n keywords for each topic
def print_topic_words(tfidf_model, lda_model, n_words):
    words = np.array(tfidf_model.get_feature_names())
    topic_words = []
    # for each topic, we have words weight
    for topic_words_weights in lda_model.components_:
        top_words = topic_words_weights.argsort()[::-1][:n_words]
        topic_words.append(words.take(top_words))
    return topic_words

topic_keywords = print_topic_words(tfidf_model=tfidf_model, lda_model=lda, n_words=15)        

df_topic_words = pd.DataFrame(topic_keywords)
df_topic_words.columns = ['Word '+str(i) for i in range(df_topic_words.shape[1])]
df_topic_words.index = ['Topic '+str(i) for i in range(df_topic_words.shape[0])]
df_topic_words

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14
Topic 0,great,beauti,look,love,awesom,big,compliment,husband,like,pictur,lot,wrist,watch,bought,happi
Topic 1,perfect,qualiti,fit,band,recommend,amaz,small,cute,ok,great,look,high,good,eleg,wrist
Topic 2,nice,time,band,look,like,day,wear,use,cheap,strap,realli,face,hand,broke,read
Topic 3,love,good,excel,price,gift,like,great,look,wife,pretti,nice,stylish,easi,simpl,durabl
Topic 4,work,product,thank,great,batteri,expect,time,arriv,item,fast,cool,ship,money,best,valu
