In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pandas as pd
import numpy as np
import random

# LDA for topic modeling

In [5]:
df = pd.read_csv('../data/clorox_data.csv')
by_subcategory = df.groupby('subcategory')['review_text']

# choose 1 subcategory
category = random.sample(list(pd.unique(df['subcategory'])), 1)[0]
text = by_subcategory.get_group(category)
text

278              Works great for funny smell... recommended
282                         access was easy, product superb
311                              Only this part in the box.
312       This stuff definitely gets rid of all the mois...
609         Get all the moister out the air and smells good
                                ...                        
198602    My only complaint is I should have gotten thes...
199257           Great product. Walmart delivered promptly.
200056    These are excellent, definitely pulls excess m...
200402    The bags fill up pretty quickly depending on l...
202086    No problems with the item - it's great. Not ha...
Name: review_text, Length: 822, dtype: object

In [6]:
# preprocess text removing punctuation, lowercasing, and lemmitizing
import re

# use regex to remove punctuation
text = text.apply(lambda x: re.sub(r'[^\w\s]', '', x))

# lower case
text = text.apply(lambda x: x.lower())

# strip leading and trailing white space
text = text.apply(lambda x: x.strip())

# drop duplicates
text = text.drop_duplicates()

text

278                 works great for funny smell recommended
282                          access was easy product superb
311                               only this part in the box
312       this stuff definitely gets rid of all the mois...
609         get all the moister out the air and smells good
                                ...                        
198602    my only complaint is i should have gotten thes...
199257             great product walmart delivered promptly
200056    these are excellent definitely pulls excess mo...
200402    the bags fill up pretty quickly depending on l...
202086    no problems with the item  its great not happy...
Name: review_text, Length: 822, dtype: object

In [7]:
# 1. Convert text to a document-term matrix using CountVectorizer
count_vectorizer = CountVectorizer(stop_words='english')
count_data = count_vectorizer.fit_transform(text)

# 2. Fit the LDA model with a specified number of topics
n_topics = 10
lda = LatentDirichletAllocation(n_components=n_topics, random_state=12)
lda.fit(count_data)

# 3. Extract the top words for each topic
num_words = 10
feature_names = count_vectorizer.get_feature_names_out()

# Display the top words for each topic
for i, topic in enumerate(lda.components_):
    print(f"Topic {i + 1}:")
    top_words = [feature_names[i] for i in topic.argsort()[:-num_words - 1:-1]]
    print(" ".join(top_words))
    print("\n")

Topic 1:
closet moisture product like works scent clothes smell great work


Topic 2:
moisture bags work bag smell water air like time hanging


Topic 3:
moisture love great closet smell keeps fresh really product closets


Topic 4:
moisture use just product hang great using works closet smell


Topic 5:
did home bathroom working great time problem water order price


Topic 6:
product smell damp areas excellent 26 rid basement moisture fulltime


Topic 7:
moisture damp closets works great use used closet easy room


Topic 8:
basement work damp smell rid use love good used product


Topic 9:
closet closets use moisture just product clothes room like hang


Topic 10:
works great moisture does product job really humidity good smell




In [8]:
# display representative reviews for each topic
n_reviews = 3
doc_topic = lda.transform(count_data)
for i in range(n_topics):
    print(f"Topic {i + 1}:")
    top_reviews = text.iloc[doc_topic[:, i].argsort()[-n_reviews:]]
    for review in top_reviews:
        print(review)
    print("\n")

Topic 1:
it definitely pulls out the moisture i keep this in our master closet as fall starts to approach in throughout winter we live in a manufactured home where in the master closet there are vents for airing out the underside of the home in the crawlspace during the wet season i absolutely hate it because it starts to smell musty like it needs to open up this definitely helps pull out that moisture and keeps that musty smell away from our clothes this works for now until we are able to pay the money to have our crawlspace encapsulated
there is not a way to tip of the local driver when the delivery order is split between two parties such as the local spark driver or uber and the rest comes by fedex or ups delivery trucks the local spark or uber person cannot receive a tip fix this now for although walmart pays them anyway as two representatives explained to me on tuesday andor wednesday of thos week they still deserve a partial tip for the part of the order that they delivered and f