In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np
import random

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


# open preprocessed reviews as a df

In [3]:
# open preprocessed data
df = pd.read_csv('../data/processed_reviews.csv')

In [49]:
# group reviews by subcategory and convert to dictionary
grouped = df.groupby('subcategory')['review_text'].apply(list).to_dict()
grouped.keys()

dict_keys(['ABRASIVE CLEANERS', 'AIR FRESHENER', 'BATHROOM CLEANERS', 'BATHROOM CLEANERS DAILY SHOWER CLEANERS', 'BATHROOM CLEANERS GENERAL BATHROOM CLEANERS', 'BATHROOM CLEANERS LIMESCALE/HARDWATER CLEANERS', 'BATHROOM CLEANERS MILDEW CLEANERS', 'BODY CARE', 'BODY CARE BAR SOAP', 'BODY CARE BATH SOAP', 'BODY CARE BODY LOTION', 'BODY CARE BODY OIL', 'BODY CARE BODY TOOLS', 'BODY CARE BODY WASH', 'BODY CARE BODY WIPES', 'BODY CARE DEODORANT', 'BODY CARE FOOT CARE', 'BODY CARE HAIR REMOVAL', 'BODY CARE HAND CARE', 'CONSUMABLE TOOLS', 'CONSUMABLE TOOLS CLEANING CLOTHS', 'CONSUMABLE TOOLS CONSUMABLE SCRUBBERS', 'CONSUMABLE TOOLS SOAP PADS/STEEL WOOL', 'CONSUMABLE TOOLS SPONGES', 'CORE GIFTS', 'CORE GIFTS EVERYDAY KITS', 'CORE GIFTS HOLIDAY KITS', 'DILUTABLES', 'DILUTABLES NATURAL/CONCENTRATED', 'DILUTABLES PINE/DISINFECTING DILUTABLES', 'DILUTABLES SCENTED/NON-DISINFECTING DILUTABLES', 'DISH CARE', 'DISH CARE LIQUID DISH DETERGENT', 'DRAIN CARE', 'FACE CARE', 'FACE CARE ACNE TREATMENTS', '

# LSA

In [50]:
# create lsa pipeline
def create_components(n_topics):
    vectorizer = TfidfVectorizer(stop_words='english', 
                                 use_idf=True, 
                                 ngram_range=(1, 1),
                                 smooth_idf=True)
    svd_model = TruncatedSVD(n_components=n_topics,        
                             algorithm='randomized',
                             n_iter=20)
    svd_transformer = Pipeline([('tfidf', vectorizer), 
                                ('svd', svd_model)])
    return svd_transformer, vectorizer, svd_model

In [51]:
# dictionary of first 2 subcategories
groups = {k: grouped[k] for k in list(grouped)[:2]}

In [52]:
# for each subcategory, fit the pipeline to the reviews and save results to lsa_df with columns: subcategory, topic_number, top_words, sample_reviews
lsa_df = pd.DataFrame(columns=['subcategory', 'topic_number', 'top_words', 'sample_reviews'])

for subcategory, reviews in grouped.items():
    reviews = df[df['subcategory'] == subcategory]['review_text']
    n_topics = int(min(15, len(reviews)))
    # if n_topics >= 2:
    svd_transformer, vectorizer, svd_model = create_components(n_topics)
    svd_matrix = svd_transformer.fit_transform(reviews)
    terms = vectorizer.get_feature_names_out()

    for i, topic in enumerate(svd_model.components_):
        top_words = [terms[j] for j in topic.argsort()[:-10 - 1:-1]]
        n_samples = min(5, len(reviews))
        rep_reviews = reviews.iloc[np.argsort(svd_matrix[:,i])[::-1][:3]].values
        new_row = pd.DataFrame({'subcategory': [subcategory], 'topic_number': [i], 'top_words': [top_words], 'sample_reviews': [rep_reviews]})
        lsa_df = pd.concat([lsa_df, new_row], ignore_index=True)

lsa_df

  self.explained_variance_ratio_ = exp_var / full_var


Unnamed: 0,subcategory,topic_number,top_words,sample_reviews
0,ABRASIVE CLEANERS,0,"[clean, great, use, product, work, sink, good,...","[product work great for what i use it for, ive..."
1,ABRASIVE CLEANERS,1,"[use, year, comet, product, love, ive, good, j...","[ive use comet for year, how be use this produ..."
2,ABRASIVE CLEANERS,2,"[great, work, product, price, year, good, job,...","[work on almost everything great product, have..."
3,ABRASIVE CLEANERS,3,"[good, clean, smell, bathroom, comet, price, w...","[work good for clean my bathroom, i love the s..."
4,ABRASIVE CLEANERS,4,"[good, product, stainless, steel, price, pot, ...","[very very good product, good price good produ..."
...,...,...,...,...
1372,WOOD/FURNITURE/DUST WOOD/FURNITURE CLEANER,10,"[wood, love, cleaner, best, product, surface, ...","[best wood cleaner i have ever use, love how i..."
1373,WOOD/FURNITURE/DUST WOOD/FURNITURE CLEANER,11,"[cleaner, best, smell, like, favorite, pledge,...","[love the smell of this cleaner, murphy be alw..."
1374,WOOD/FURNITURE/DUST WOOD/FURNITURE CLEANER,12,"[smell, use, nice, easy, work, year, receive, ...","[easy to use clean well smell nice, smell so g..."
1375,WOOD/FURNITURE/DUST WOOD/FURNITURE CLEANER,13,"[make, surface, look, spray, wood, easy, good,...",[make wood look wonderful and make the house s...


In [53]:
# save lsa_df to csv
lsa_df.to_csv('outputs/lsa_1gram_results.csv', index=False)

# Cluster similar topics

In [54]:
# remove space between 2 gram words in top_words so that when vectorized, they are treated as one entity
lsa_df['top_words'] = lsa_df['top_words'].apply(lambda x: [word.replace(' ', '') for word in x])

# create dict of subcategories and array of top words joined to one string for each topic
lsa_dict = {}
for subcategory in lsa_df['subcategory'].unique():
    sub_df = lsa_df[lsa_df['subcategory'] == subcategory]
    sub_dict = {}
    for topic in sub_df['topic_number']:
        topic_df = sub_df[sub_df['topic_number'] == topic]
        sub_dict[topic] = ' '.join(topic_df['top_words'].values[0])
    lsa_dict[subcategory] = sub_dict
lsa_dict['ABRASIVE CLEANERS']

{0: 'clean great use product work sink good love year comet',
 1: 'use year comet product love ive good job ajax cleanser',
 2: 'great work product price year good job value wonder ive',
 3: 'good clean smell bathroom comet price work love like toilet',
 4: 'good product stainless steel price pot pan excellent job really',
 5: 'best work cleaner comet cleanser good stain like ive water',
 6: 'best cleaner great product bathroom price cleaning kitchen job excellent',
 7: 'comet great stainless steel like best price love cleaner pot',
 8: 'love best stain product smell water remove hard cleaner comet',
 9: 'clean best like product pot cleaner pan thing scratch really',
 10: 'bathroom work kitchen product best use cleaning pot pan smell',
 11: 'cleaner love year smell good use stuff great tub clean',
 12: 'bathroom job cleanser like bar keeper friend kitchen scrub cleaning',
 13: 'cleaner like product scrub kitchen sink job comet ajax excellent',
 14: 'like cleanser sink smell tub product

In [55]:
# cluster top words representations for each subcategory
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt

vectorizer = TfidfVectorizer()
n_clusters = 10

# kmeans applied to a subcategory, return cluster labels
def apply_kmeans(topics_dict, vectorizer, n_clusters):
    if len(topics_dict) > n_clusters:
        topics_vectorized = vectorizer.fit_transform(topics_dict.values())
        kmeans_model = KMeans(n_clusters=n_clusters, random_state=12)
        cluster_labels = kmeans_model.fit_predict(topics_vectorized)
        return cluster_labels
    return None

# merge topics in subcategory with the same cluster labels
def merge_topics(topics_dict, cluster_labels):
    merged_topics = {}
    for i in range(len(cluster_labels)):
        if cluster_labels[i] not in merged_topics:
            merged_topics[cluster_labels[i]] = topics_dict[i]
        else:
            merged_topics[cluster_labels[i]] += ' ' + topics_dict[i]
    return merged_topics

# remove duplicate words in merged topics
def remove_duplicates(merged_topics):
    for label, words in merged_topics.items():
        merged_topics[label] = ' '.join(list(set(words.split())))

In [56]:
# apply kmeans to each subcategory and merge topics with the same cluster labels

clustered_dict = {}
for subcategory, topics_dict in lsa_dict.items():
    cluster_labels = apply_kmeans(topics_dict, vectorizer, n_clusters)
    if cluster_labels is not None:
        merged_topics = merge_topics(topics_dict, cluster_labels)
        remove_duplicates(merged_topics)
        clustered_dict[subcategory] = merged_topics
    else:
        clustered_dict[subcategory] = topics_dict

clustered_dict['ABRASIVE CLEANERS']

{np.int32(1): 'comet product work sink cleanser smell year job cleaner love use tub great clean ive stuff good ajax',
 np.int32(0): 'work product price year job value great ive good wonder',
 np.int32(6): 'comet work smell price like bathroom love clean good toilet',
 np.int32(3): 'comet really excellent product price job like cleaner steel love pan great stainless best good pot',
 np.int32(4): 'comet remove work stain cleanser product smell like cleaner love ive hard best good water',
 np.int32(7): 'excellent product work smell price job cleaner bathroom cleaning kitchen use great pan best pot',
 np.int32(8): 'really product like cleaner pan thing clean scratch best pot',
 np.int32(5): 'bar cleanser job like keeper bathroom friend kitchen cleaning scrub',
 np.int32(2): 'comet excellent product sink like job cleaner kitchen scrub ajax',
 np.int32(9): 'product smell sink cleanser like look tub scrub best new'}

In [57]:
# save clustered_dict to csv
clustered_df = pd.DataFrame(columns=['subcategory', 'topic_number', 'top_words'])
for subcategory, topics_dict in clustered_dict.items():
    for topic, words in topics_dict.items():
        new_row = pd.DataFrame({'subcategory': [subcategory], 'topic_number': [topic], 'top_words': [words]})
        clustered_df = pd.concat([clustered_df, new_row], ignore_index=True)
clustered_df.to_csv('outputs/lsa_1gram_clustered_results.csv', index=False)

In [58]:
lsa_dict['BODY CARE HAIR REMOVAL']

{0: 'work small dry come box'}