In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pandas as pd
import numpy as np
import random

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


In [6]:
# Load the review data set.
df = pd.read_csv('../data/clorox_data.csv')

# pd.unique() keeps NaN, but groupby() drops NaN keys by default. Exclude
# missing subcategories up front so a sampled value can always be looked up
# with get_group() instead of raising KeyError.
subcategories = list(df['subcategory'].dropna().unique())
by_subcategory = df.groupby('subcategory')['review_text']

# Sample up to 20 subcategories; the seed keeps the selection reproducible.
# Capping the sample size avoids a ValueError when fewer than 20 exist.
random.seed(13)
sample_subcategories = random.sample(subcategories, min(20, len(subcategories)))

# Work with the reviews of the first sampled subcategory.
category = sample_subcategories[0]
text = by_subcategory.get_group(category)
text

145       I bought this to clean my hot tub and it did a...
255                               loved the product. Strong
261       I cannot imagine not having a Scrub Daddy. The...
352               The scrub daddy is easier for me to hold!
420       I love that it softens in hot water and harden...
                                ...                        
211066    i originally gave scrub daddy a chance because...
211365    It’s amazing and I love using both sides to cl...
213277    Love the scub daddy. It's my fave out of all t...
213325    Me and my brother got one of these they clean ...
213804                                    Best sponge ever!
Name: review_text, Length: 2479, dtype: object

In [3]:
# Preprocess the reviews: remove punctuation, lowercase, trim surrounding
# whitespace and drop duplicate reviews. No lemmatization is done here.
import re

# Anything that is neither a word character nor whitespace is treated as
# punctuation. Compiled with Python's `re`, so `\w` is Unicode-aware.
PUNCTUATION_RE = re.compile(r'[^\w\s]')

text = (
    text
    .dropna()        # a missing review cannot be cleaned or vectorized
    .astype(str)     # guard against stray non-string values
    .str.replace(PUNCTUATION_RE, '', regex=True)
    .str.lower()
    .str.strip()
    .drop_duplicates()
)

text

145       i bought this to clean my hot tub and it did a...
255                                loved the product strong
261       i cannot imagine not having a scrub daddy the ...
352                the scrub daddy is easier for me to hold
420       i love that it softens in hot water and harden...
                                ...                        
211066    i originally gave scrub daddy a chance because...
211365    its amazing and i love using both sides to cle...
213277    love the scub daddy its my fave out of all the...
213325    me and my brother got one of these they clean ...
213804                                     best sponge ever
Name: review_text, Length: 2476, dtype: object

In [4]:
# Build the bag-of-words document-term matrix (English stop words removed).
count_vectorizer = CountVectorizer(stop_words='english')
count_data = count_vectorizer.fit_transform(text)

# Fit LDA on the term counts with a fixed number of topics and a fixed seed.
n_topics = 10
lda = LatentDirichletAllocation(n_components=n_topics, random_state=12)
lda.fit(count_data)

# Vocabulary terms, looked up by document-term matrix column.
num_words = 10
feature_names = count_vectorizer.get_feature_names_out()

# Print the num_words highest-weighted terms of every topic, heaviest first.
for i, topic in enumerate(lda.components_):
    # argsort is ascending: reverse it, then keep the leading num_words columns
    ranked_columns = topic.argsort()[::-1][:num_words]
    top_words = [feature_names[column] for column in ranked_columns]
    print(f"Topic {i + 1}:")
    print(" ".join(top_words))
    print("\n")

Topic 1:
best dishes cleaning sponge sponges used ive scrubber washing love


Topic 2:
scrub daddy love mommy products sponge sponges brand time long


Topic 3:
like dont water im use sponge scrubbing got great just


Topic 4:
long use great sponge time product scrubbing kitchen lasting bought


Topic 5:
love sponge dishes scrubber worth use favorite great wash item


Topic 6:
water soft sponge cold hot love hard scrubber scrub used


Topic 7:
sponge like sponges scrub love really regular daddy better using


Topic 8:
works great doesnt love does like smell scrubber job use


Topic 9:
sponge love product buy use great used best apart cleans


Topic 10:
love use clean sponge dishes great easy long lasts time




In [8]:
# Display the most representative reviews for each topic, i.e. the reviews
# whose topic distribution puts the largest weight on that topic.
n_reviews = 3
doc_topic = lda.transform(count_data)  # one row per document, one column per topic

# Rows of doc_topic are matched to `text` by position, so `text` must still be
# the exact series count_data was built from. This catches the case where
# `text` was reassigned (e.g. the loading cell was re-run) after vectorizing.
assert doc_topic.shape[0] == len(text), (
    f"text has {len(text)} rows but count_data has {doc_topic.shape[0]}; "
    "re-run the preprocessing and vectorizing cells in order"
)

for i in range(n_topics):
    print(f"Topic {i + 1}:")
    # argsort is ascending, so reverse it to list the strongest match first
    # (the same heaviest-first order used for each topic's top words).
    top_positions = doc_topic[:, i].argsort()[::-1][:n_reviews]
    top_reviews = text.iloc[top_positions]
    for review in top_reviews:
        print(review)
    print("\n")

Topic 1:
these are our favorite to sponges for washing dishes weve bought them consistently for over a year now we dont have a dishwasher so we use these sponges frequently i love their duality and they last a good while as long as theyre used properly and with care i also love that they have a recycling program so thats a huge plus
always wanted to try this sponge for the hype and advertising i bought one and its not all that i like that it doesnt scratch my pots but its harder to get grease off i use dawn not very flexible for edges on your tupperware took me longer to wash my dishes i went back to the usual dishwashing sponges but ill keep it for surfaces shoes fabrics cleaning
we dont have a dish washer so its all done by hand weve tried so many different types of sponges rags and scrubbers these are the ones we return to over 26 over i love they dont hold onto moisture that leads to mildew 26 mold ive never had an issue with them smelling the main issue i have is i go thru them fa