In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np
import random

In [2]:
df = pd.read_csv('../data/clorox_data.csv')
by_subcategory = df.groupby('subcategory')['review_text']

# sample 20 subcategories
random.seed(13)
sample_subcategories = random.sample(list(pd.unique(df['subcategory'])), 20)
sample_subcategories

['CONSUMABLE TOOLS SPONGES',
 'MOISTURE ABSORBER',
 'FACE CARE FACE CARE TOOLS',
 'CONSUMABLE TOOLS',
 'FACE CARE FACIAL PRIMER',
 'DILUTABLES NATURAL/CONCENTRATED',
 "MEN'S CARE SHAVE",
 'CONSUMABLE TOOLS CONSUMABLE SCRUBBERS',
 'WIPES OTHER WIPES',
 'HAIR CARE HAIR TREATMENTS',
 'BODY CARE HAIR REMOVAL',
 'BATHROOM CLEANERS LIMESCALE/HARDWATER CLEANERS',
 'SPRAY CLEANERS BLEACH CLEANERS',
 'CORE GIFTS HOLIDAY KITS',
 'TOILET BOWL CLEANERS MANUAL TB CLEANERS',
 'ODOR CONTROLLING AIR FRESHENERS',
 'FACE CARE FACIAL TOWELETTES',
 'BODY CARE DEODORANT',
 'BATHROOM CLEANERS MILDEW CLEANERS',
 'ABRASIVE CLEANERS']

In [3]:
category = sample_subcategories[0]
text = by_subcategory.get_group(category)
text

145       I bought this to clean my hot tub and it did a...
255                               loved the product. Strong
261       I cannot imagine not having a Scrub Daddy. The...
352               The scrub daddy is easier for me to hold!
420       I love that it softens in hot water and harden...
                                ...                        
211066    i originally gave scrub daddy a chance because...
211365    It’s amazing and I love using both sides to cl...
213277    Love the scub daddy. It's my fave out of all t...
213325    Me and my brother got one of these they clean ...
213804                                    Best sponge ever!
Name: review_text, Length: 2479, dtype: object

In [4]:
import string

def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation + '’')
    return text.translate(translator)

In [5]:
import nltk
from nltk.stem import WordNetLemmatizer

# Download required NLTK data
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    # Tokenize the text
    words = nltk.word_tokenize(text)
    
    # Perform part-of-speech tagging
    pos_tags = nltk.pos_tag(words)
    
    # Lemmatize words based on their part of speech
    lemmatized_words = []
    for word, pos in pos_tags:
        if pos.startswith('N'):
            lemmatized_words.append(lemmatizer.lemmatize(word, pos='n'))
        elif pos.startswith('V'):
            lemmatized_words.append(lemmatizer.lemmatize(word, pos='v'))
        elif pos.startswith('R'):
            lemmatized_words.append(lemmatizer.lemmatize(word, pos='r'))
        elif pos.startswith('J'):
            lemmatized_words.append(lemmatizer.lemmatize(word, pos='a'))
        else:
            lemmatized_words.append(word)
    
    return ' '.join(lemmatized_words)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jessicaluo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/jessicaluo/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [21]:
671000/2500*3.8/60

16.998666666666665

In [6]:
# lowercase, remove punctuation, and lemmatize
text = text.apply(lambda x: x.lower())
text = text.apply(lambda x: remove_punctuation(x))
text = text.apply(lambda x: lemmatize_text(x))
text

145       i buy this to clean my hot tub and it do an in...
255                                 love the product strong
261       i can not imagine not have a scrub daddy the s...
352                  the scrub daddy be easy for me to hold
420       i love that it soften in hot water and hardens...
                                ...                        
211066    i originally give scrub daddy a chance because...
211365    its amazing and i love use both side to clean ...
213277    love the scub daddy its my fave out of all the...
213325    me and my brother get one of these they clean ...
213804                                     best sponge ever
Name: review_text, Length: 2479, dtype: object

In [7]:
vectorizer = TfidfVectorizer(stop_words='english', 
                             use_idf=True, 
                             ngram_range=(1, 2),
                             smooth_idf=True)
n_topics = 15

# SVD to reduce dimensionality: 
svd_model = TruncatedSVD(n_components=n_topics,        
                         algorithm='randomized',
                         n_iter=20)
# pipeline of tf-idf + SVD, fit to and applied to documents:
svd_transformer = Pipeline([('tfidf', vectorizer), 
                            ('svd', svd_model)])

svd_matrix = svd_transformer.fit_transform(text)

terms = vectorizer.get_feature_names_out()
topic_top_words_weights = []
# Get the top 10 words for each topic
for i, comp in enumerate(svd_model.components_):
    terms_comp = zip(terms, comp)
    sorted_terms = sorted(terms_comp, key=lambda x: x[1], reverse=True)[:10]
    topic_top_words_weights.append(sorted_terms)
    print(f"Topic {i+1}:")
    for t in sorted_terms:
        # print(f"{t[0]} ({t[1]:.4f})", end='\n')  # Include weights for better understanding
        print(f"{t[0]}", end=', ')
    print("\n")

Topic 1:
love, scrub, sponge, daddy, scrub daddy, love scrub, use, great, mommy, clean, 

Topic 2:
scrub, daddy, scrub daddy, love scrub, mommy, scrub mommy, daddy product, daddy scrub, product, love, 

Topic 3:
best, sponge, best sponge, sponge use, use, ive, sponge ive, love sponge, ive use, daddy best, 

Topic 4:
love, love sponge, love love, love product, sponge, product, love scrub, absolutely love, absolutely, sponge work, 

Topic 5:
clean, dish, easy, use, good, like, make, clean dish, scratch, doesnt, 

Topic 6:
long, long time, time, smell, sponge, doesnt, like, doesnt smell, great long, like sponge, 

Topic 7:
sponge, love sponge, like, mommy, scrub mommy, work great, work, sponge work, scrub, like sponge, 

Topic 8:
product, sponge, great product, love product, daddy product, scrub daddy, daddy, like, great, like sponge, 

Topic 9:
scrubber, like, good, doesnt, like sponge, product, water, best scrubber, little, really, 

Topic 10:
use, mommy, scrub mommy, sponge use, produc

In [8]:
# display representative reviews for each topic
for i in range(n_topics):
    print(f"Topic {i+1}:")
    print(text.iloc[np.argsort(svd_matrix[:,i])[::-1][:3]].values)
    print("\n")

Topic 1:
['i love the scrub daddy so much' 'love some scrub daddy and scrub mommy'
 'love the scrub daddy sponge']


Topic 2:
['i love the scrub daddy so much' 'love some scrub daddy and scrub mommy'
 'i love scrub mommy and scrub daddy']


Topic 3:
['best sponge ever' 'cant go without one the best sponge out there'
 'best sponge you will ever use']


Topic 4:
['i love everything about this' 'love that it have 2 side'
 'love this one more than any others']


Topic 5:
['clean up well' 'how well it clean the dish'
 'easy to use and to keep clean']


Topic 6:
['these last a long time and when one be out' 'last a long time'
 'it last a long time and doesnt smell']


Topic 7:
['its a sponge'
 'i love this sponge so much i now have 4 of them they work great on everything ️'
 'love this sponge work great for everything']


Topic 8:
['great product would buy again' 'thus be my go to love this product'
 'i love these product']


Topic 9:
['would never go back to any other scrubber'
 'good scrub

In [9]:
# create a dataframe of words and their weights for each topic
topic_words = pd.DataFrame(svd_model.components_, columns=terms).T
topic_words.columns = [f"Topic {i+1}" for i in range(n_topics)]

# remove words with weights close to zero
topic_words = topic_words[topic_words.abs().max(axis=1) > 0.1]
topic_words

Unnamed: 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9,Topic 10,Topic 11,Topic 12,Topic 13,Topic 14,Topic 15
best,0.134044,-0.172814,0.399439,-0.232881,-0.280127,-0.181274,-0.30437,-0.085041,-0.067847,-0.129102,0.110485,-0.182666,-0.078979,0.01174,-0.019944
best scrubber,0.012379,-0.029689,0.042179,-0.045013,-0.031736,-0.039011,-0.103708,-0.094403,0.103046,-0.025717,-0.015832,-0.026888,0.061903,0.02561,-0.00341
best sponge,0.087369,-0.138666,0.30888,-0.138816,-0.217476,-0.099898,-0.097576,0.053224,-0.155681,-0.062555,0.106931,-0.152937,-0.080423,0.050591,0.048254
buy,0.077646,-0.047094,0.009041,-0.033523,-0.014041,-0.026722,-0.052766,0.070552,0.068333,0.064628,0.138004,0.053403,-0.044109,0.236823,-0.245975
clean,0.151789,-0.134658,-0.042817,-0.006388,0.447013,-0.079242,-0.034209,-0.024447,-0.328911,-0.209311,0.085762,-0.17349,0.228461,0.255836,-0.057712
clean dish,0.031737,-0.046265,-0.003778,-0.006085,0.109867,-0.030439,-0.00241,-0.017104,-0.103636,-0.099823,0.045296,0.067833,-0.003459,0.002301,0.018334
cleaning,0.049382,-0.064378,-0.005093,-0.036554,-0.023374,-0.099013,-0.009994,-0.052614,0.013664,-0.067845,0.106306,0.151766,-0.058933,-0.224963,-0.135906
cold,0.021934,-0.011849,0.000678,-0.017573,0.038348,0.00327,0.009137,-0.00322,0.057286,0.059379,-0.044454,-0.046481,-0.061393,-0.048091,-0.13927
cold water,0.015896,-0.009205,-0.001842,-0.01175,0.03129,0.003005,0.007016,-0.005345,0.044939,0.056803,-0.040089,-0.037882,-0.052345,-0.042328,-0.116615
daddy,0.266695,0.302722,0.026348,-0.142443,-0.035559,-0.001483,0.00327,0.133678,-0.017413,-0.155566,-0.283045,0.043186,0.00405,-0.0124,-0.034528


In [10]:
# remove space between 2 gram words
topic_top_words_weights = [[(w.replace(' ', ''), v) for w, v in topic] for topic in topic_top_words_weights]
topic_top_words_weights

[[('love', np.float64(0.37520064029420647)),
  ('scrub', np.float64(0.36807701117954705)),
  ('sponge', np.float64(0.3083298715251952)),
  ('daddy', np.float64(0.26669524993906807)),
  ('scrubdaddy', np.float64(0.255714982895419)),
  ('lovescrub', np.float64(0.206293310035861)),
  ('use', np.float64(0.18701568573189697)),
  ('great', np.float64(0.15852963994633917)),
  ('mommy', np.float64(0.1534711040491157)),
  ('clean', np.float64(0.15178862762659676))],
 [('scrub', np.float64(0.35300584677061586)),
  ('daddy', np.float64(0.302722277230593)),
  ('scrubdaddy', np.float64(0.29951979728016626)),
  ('lovescrub', np.float64(0.2840133715837418)),
  ('mommy', np.float64(0.1802159621297943)),
  ('scrubmommy', np.float64(0.17534836611779664)),
  ('daddyproduct', np.float64(0.08819396265016287)),
  ('daddyscrub', np.float64(0.06705478741133547)),
  ('product', np.float64(0.06117626700071164)),
  ('love', np.float64(0.059772541219407935))],
 [('best', np.float64(0.3994393322452179)),
  ('spong

In [11]:
topic_top_words = []
for i, topic in enumerate(topic_top_words_weights):
    topic_top_words.append(str.join(' ', [word for word, _ in topic]))

topic_top_words

['love scrub sponge daddy scrubdaddy lovescrub use great mommy clean',
 'scrub daddy scrubdaddy lovescrub mommy scrubmommy daddyproduct daddyscrub product love',
 'best sponge bestsponge spongeuse use ive spongeive lovesponge iveuse daddybest',
 'love lovesponge lovelove loveproduct sponge product lovescrub absolutelylove absolutely spongework',
 'clean dish easy use good like make cleandish scratch doesnt',
 'long longtime time smell sponge doesnt like doesntsmell greatlong likesponge',
 'sponge lovesponge like mommy scrubmommy workgreat work spongework scrub likesponge',
 'product sponge greatproduct loveproduct daddyproduct scrubdaddy daddy like great likesponge',
 'scrubber like good doesnt likesponge product water bestscrubber little really',
 'use mommy scrubmommy spongeuse product loveproduct water iveuse ive usescrub',
 'mommy scrubmommy product buy loveproduct greatproduct best dish bestsponge cleaning',
 'dish wash good washdish cleaning sponge favorite scrubber spongeuse cle

In [12]:
# vectorize words for each topic
vectorizer = TfidfVectorizer()
topic_word_vectors = vectorizer.fit_transform(topic_top_words)
topic_word_vectors

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 150 stored elements and shape (15, 65)>

In [13]:
from sklearn.cluster import KMeans

# Convert topics to vectors
topic_vectors = svd_model.components_

# Perform K-means clustering
n_clusters = 10  # Adjust as needed
kmeans = KMeans(n_clusters=n_clusters, random_state=12)
cluster_labels = kmeans.fit_predict(topic_word_vectors)
cluster_labels

array([1, 1, 0, 5, 3, 6, 5, 1, 4, 0, 9, 8, 7, 2, 3], dtype=int32)

In [14]:
# show the cluster assignment for each topic
topic_cluster = pd.DataFrame({'topic': topic_top_words, 'cluster': cluster_labels})
topic_cluster

Unnamed: 0,topic,cluster
0,love scrub sponge daddy scrubdaddy lovescrub u...,1
1,scrub daddy scrubdaddy lovescrub mommy scrubmo...,1
2,best sponge bestsponge spongeuse use ive spong...,0
3,love lovesponge lovelove loveproduct sponge pr...,5
4,clean dish easy use good like make cleandish s...,3
5,long longtime time smell sponge doesnt like do...,6
6,sponge lovesponge like mommy scrubmommy workgr...,5
7,product sponge greatproduct loveproduct daddyp...,1
8,scrubber like good doesnt likesponge product w...,4
9,use mommy scrubmommy spongeuse product lovepro...,0


In [15]:
# merge topic_top_words that have the same cluster label 
topic_cluster = topic_cluster.groupby('cluster')['topic'].apply(lambda x: ' '.join(x)).reset_index()
topic_cluster

Unnamed: 0,cluster,topic
0,0,best sponge bestsponge spongeuse use ive spong...
1,1,love scrub sponge daddy scrubdaddy lovescrub u...
2,2,work good clean buy easy really scrubber makec...
3,3,clean dish easy use good like make cleandish s...
4,4,scrubber like good doesnt likesponge product w...
5,5,love lovesponge lovelove loveproduct sponge pr...
6,6,long longtime time smell sponge doesnt like do...
7,7,scrubber great clean greatproduct sponge loves...
8,8,dish wash good washdish cleaning sponge favori...
9,9,mommy scrubmommy product buy loveproduct great...


In [16]:
# remove duplicate words in each cluster
for i, row in topic_cluster.iterrows():
    words = row['topic'].split(' ')
    set_words = list(set(words))
    if len(words) != len(set_words):
        print(f'before removing duplicates: {len(words)} words')
        print(f'after removing duplicates: {len(set_words)} words')
    print(set_words)
    topic_cluster.at[i, 'topic'] = ' '.join(set_words)

before removing duplicates: 20 words
after removing duplicates: 16 words
['sponge', 'use', 'ive', 'scrubmommy', 'usescrub', 'iveuse', 'best', 'spongeive', 'lovesponge', 'loveproduct', 'water', 'bestsponge', 'mommy', 'spongeuse', 'daddybest', 'product']
before removing duplicates: 30 words
after removing duplicates: 18 words
['clean', 'sponge', 'use', 'scrubmommy', 'greatproduct', 'likesponge', 'scrub', 'great', 'scrubdaddy', 'lovescrub', 'daddyproduct', 'like', 'loveproduct', 'daddy', 'daddyscrub', 'mommy', 'love', 'product']
['clean', 'makeclean', 'really', 'easy', 'good', 'easyclean', 'goodsponge', 'scrubber', 'work', 'buy']
before removing duplicates: 20 words
after removing duplicates: 17 words
['clean', 'use', 'easy', 'good', 'cleandish', 'pot', 'potpan', 'like', 'doesnt', 'scratch', 'work', 'doesntscratch', 'dish', 'make', 'pan', 'spongeuse', 'product']
['little', 'really', 'likesponge', 'good', 'bestscrubber', 'like', 'scrubber', 'doesnt', 'water', 'product']
before removing dup

In [17]:
# save topic_cluster to a csv file
topic_cluster.to_csv('outputs/lsa_topic_cluster.csv', index=False)