In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np
import random

# open preprocessed reviews as a df

In [2]:
# open preprocessed data
df = pd.read_csv('../data/processed_reviews.csv')

In [29]:
# group reviews by subcategory and convert to dictionary
grouped_dict = df.groupby('subcategory')['review_text'].apply(list).to_dict()
grouped_dict.keys()

dict_keys(['ABRASIVE CLEANERS', 'AIR FRESHENER', 'BATHROOM CLEANERS', 'BATHROOM CLEANERS DAILY SHOWER CLEANERS', 'BATHROOM CLEANERS GENERAL BATHROOM CLEANERS', 'BATHROOM CLEANERS LIMESCALE/HARDWATER CLEANERS', 'BATHROOM CLEANERS MILDEW CLEANERS', 'BODY CARE', 'BODY CARE BAR SOAP', 'BODY CARE BATH SOAP', 'BODY CARE BODY LOTION', 'BODY CARE BODY OIL', 'BODY CARE BODY TOOLS', 'BODY CARE BODY WASH', 'BODY CARE BODY WIPES', 'BODY CARE DEODORANT', 'BODY CARE FOOT CARE', 'BODY CARE HAIR REMOVAL', 'BODY CARE HAND CARE', 'CONSUMABLE TOOLS', 'CONSUMABLE TOOLS CLEANING CLOTHS', 'CONSUMABLE TOOLS CONSUMABLE SCRUBBERS', 'CONSUMABLE TOOLS SOAP PADS/STEEL WOOL', 'CONSUMABLE TOOLS SPONGES', 'CORE GIFTS', 'CORE GIFTS EVERYDAY KITS', 'CORE GIFTS HOLIDAY KITS', 'DILUTABLES', 'DILUTABLES NATURAL/CONCENTRATED', 'DILUTABLES PINE/DISINFECTING DILUTABLES', 'DILUTABLES SCENTED/NON-DISINFECTING DILUTABLES', 'DISH CARE', 'DISH CARE LIQUID DISH DETERGENT', 'DRAIN CARE', 'FACE CARE', 'FACE CARE ACNE TREATMENTS', '

# LSA

In [22]:
# create lsa pipeline
def create_components(n_topics):
    vectorizer = TfidfVectorizer(stop_words='english', 
                                 use_idf=True, 
                                 ngram_range=(1, 2),
                                 smooth_idf=True)
    svd_model = TruncatedSVD(n_components=n_topics,        
                             algorithm='randomized',
                             n_iter=20)
    svd_transformer = Pipeline([('tfidf', vectorizer), 
                                ('svd', svd_model)])
    return svd_transformer, vectorizer, svd_model

In [23]:
# dictionary of first 2 subcategories
groups = {k: grouped_dict[k] for k in list(grouped_dict)[:2]}

In [25]:
# for each subcategory, fit the pipeline to the reviews and save results to lsa_df with columns: subcategory, topic_number, top_words, sample_reviews
lsa_df = pd.DataFrame(columns=['subcategory', 'topic_number', 'top_words', 'sample_reviews'])

for subcategory, reviews in grouped_dict.items():
    # change grouped_dict to df
    reviews = pd.DataFrame(reviews)
    n_topics = int(min(15, len(reviews)))
    # if n_topics >= 2:
    svd_transformer, vectorizer, svd_model = create_components(n_topics)
    svd_matrix = svd_transformer.fit_transform(reviews)
    terms = vectorizer.get_feature_names_out()

    for i, topic in enumerate(svd_model.components_):
        top_words = [terms[j] for j in topic.argsort()[:-10 - 1:-1]]
        n_samples = min(5, len(reviews))
        # get representative 3 reviews for each topic
        rep_reviews = 
            
        new_row = pd.DataFrame({'subcategory': [subcategory], 'topic_number': [i], 'top_words': [top_words], 'sample_reviews': [rep_reviews]})
        
        lsa_df = pd.concat([lsa_df, new_row], ignore_index=True)

lsa_df

TypeError: only integer scalar arrays can be converted to a scalar index

In [31]:
import numpy as np

lsa_df = pd.DataFrame(columns=['subcategory', 'topic_number', 'top_words', 'sample_reviews'])

for subcategory, reviews in grouped_dict.items():
    # change grouped_dict to df
    reviews = df[df['subcategory'] == subcategory]['review_text']
    n_topics = int(min(15, len(reviews)))
    svd_transformer, vectorizer, svd_model = create_components(n_topics)
    svd_matrix = svd_transformer.fit_transform(reviews)
    terms = vectorizer.get_feature_names_out()

    for i, topic in enumerate(svd_model.components_):
        top_words = [terms[j] for j in topic.argsort()[:-10 - 1:-1]]
        
        # Calculate topic scores for each review
        topic_scores = svd_matrix[:, i]
        
        # Sort reviews by topic score
        sorted_indices = np.argsort(topic_scores)[::-1]
        
        # Get top 3 representative reviews
        top_review_indices = sorted_indices[:3]
        rep_reviews = reviews.iloc[top_review_indices].tolist()
        
        new_row = pd.DataFrame({
            'subcategory': [subcategory], 
            'topic_number': [i], 
            'top_words': [top_words], 
            'sample_reviews': [rep_reviews]
        })
        
        lsa_df = pd.concat([lsa_df, new_row], ignore_index=True)

lsa_df

  self.explained_variance_ratio_ = exp_var / full_var


Unnamed: 0,subcategory,topic_number,top_words,sample_reviews
0,ABRASIVE CLEANERS,0,"[clean, great, use, product, work, sink, work ...",[this product work great on stainless steel si...
1,ABRASIVE CLEANERS,1,"[use, year, use year, comet, product, ive, ive...","[i have use this for many year, good product h..."
2,ABRASIVE CLEANERS,2,"[great, work, work great, great product, produ...","[have always work great, work on almost everyt..."
3,ABRASIVE CLEANERS,3,"[clean, bathroom, clean bathroom, kitchen, sme...","[clean what you put in on, cant clean without ..."
4,ABRASIVE CLEANERS,4,"[good, product, good product, price, good pric...","[very very good product, good price good produ..."
...,...,...,...,...
1372,WOOD/FURNITURE/DUST WOOD/FURNITURE CLEANER,10,"[clean, product, great product, surface, clean...","[love how it clean my wood floor, i love this ..."
1373,WOOD/FURNITURE/DUST WOOD/FURNITURE CLEANER,11,"[make, like, look, furniture, shine, make floo...","[make my floor look new again, this make my fl..."
1374,WOOD/FURNITURE/DUST WOOD/FURNITURE CLEANER,12,"[cleaner, hardwood, hardwood floor, best, furn...","[best hardwood floor cleaner, best cleaner for..."
1375,WOOD/FURNITURE/DUST WOOD/FURNITURE CLEANER,13,"[furniture, clean, leave, wood furniture, leat...",[i use on my leather and wood furniture work g...


In [32]:
# save lsa_df to csv
lsa_df.to_csv('outputs/lsa_1-2_gram_results.csv', index=False)

# Cluster similar topics

In [14]:
# remove space between 2 gram words in top_words so that when vectorized, they are treated as one entity
lsa_df['top_words'] = lsa_df['top_words'].apply(lambda x: [word.replace(' ', '') for word in x])

# create dict of subcategories and array of top words joined to one string for each topic
lsa_dict = {}
for subcategory in lsa_df['subcategory'].unique():
    sub_df = lsa_df[lsa_df['subcategory'] == subcategory]
    sub_dict = {}
    for topic in sub_df['topic_number']:
        topic_df = sub_df[sub_df['topic_number'] == topic]
        sub_dict[topic] = ' '.join(topic_df['top_words'].values[0])
    lsa_dict[subcategory] = sub_dict
lsa_dict['ABRASIVE CLEANERS']

{0: 'clean great use product work sink workgreat year good love',
 1: 'use year useyear comet product ive iveuse love usecomet useproduct',
 2: 'great work workgreat greatproduct product price productwork greatprice greatclean workgood',
 3: 'clean bathroom cleanbathroom kitchen smell comet good tub toilet greatclean',
 4: 'good product goodproduct price goodprice greatproduct productclean loveproduct job excellent',
 5: 'good work comet cleaner best cleanser stain workgood goodproduct like',
 6: 'comet stainless good stainlesssteel usecomet steel like smell clean workgreat',
 7: 'best cleaner great bestcleaner price cleaning bathroom comet kitchen bestproduct',
 8: 'comet great bar keeper love barkeeper friend keeperfriend price job',
 9: 'love product best work comet loveproduct productwork pot bestproduct pan',
 10: 'clean cleaner best pot pan potpan useyear year bestcleaner ive',
 11: 'stain comet sink water hard job hardwater great remove waterstain',
 12: 'love cleaner loveproduc

In [15]:
# cluster top words representations for each subcategory
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt

vectorizer = TfidfVectorizer()
n_clusters = 10

# kmeans applied to a subcategory, return cluster labels
def apply_kmeans(topics_dict, vectorizer, n_clusters):
    if len(topics_dict) > n_clusters:
        topics_vectorized = vectorizer.fit_transform(topics_dict.values())
        kmeans_model = KMeans(n_clusters=n_clusters, random_state=12)
        cluster_labels = kmeans_model.fit_predict(topics_vectorized)
        return cluster_labels
    return None

# merge topics in subcategory with the same cluster labels
def merge_topics(topics_dict, cluster_labels):
    merged_topics = {}
    for i in range(len(cluster_labels)):
        if cluster_labels[i] not in merged_topics:
            merged_topics[cluster_labels[i]] = topics_dict[i]
        else:
            merged_topics[cluster_labels[i]] += ' ' + topics_dict[i]
    return merged_topics

# remove duplicate words in merged topics
def remove_duplicates(merged_topics):
    for label, words in merged_topics.items():
        merged_topics[label] = ' '.join(list(set(words.split())))

In [16]:
# apply kmeans to each subcategory and merge topics with the same cluster labels

clustered_dict = {}
for subcategory, topics_dict in lsa_dict.items():
    cluster_labels = apply_kmeans(topics_dict, vectorizer, n_clusters)
    if cluster_labels is not None:
        merged_topics = merge_topics(topics_dict, cluster_labels)
        remove_duplicates(merged_topics)
        clustered_dict[subcategory] = merged_topics
    else:
        clustered_dict[subcategory] = topics_dict

clustered_dict['ABRASIVE CLEANERS']

{np.int32(1): 'work product workgreat use comet useyear year sink iveuse good ive useproduct great clean usecomet love',
 np.int32(0): 'work productclean product greatclean goodprice greatprice loveproduct excellent goodproduct price job greatproduct productwork good great workgreat workgood',
 np.int32(6): 'smell toilet kitchen greatclean comet tub cleanbathroom good bathroom clean',
 np.int32(7): 'work comet cleanser goodproduct best like good cleaner workgood stain',
 np.int32(8): 'smell clean workgreat comet stainlesssteel like stainless steel good usecomet',
 np.int32(4): 'bestcleaner kitchen comet friend barkeeper keeperfriend price job best keeper bathroom bestproduct bar great cleaner cleaning love',
 np.int32(5): 'work bestcleaner ive product comet cleaner loveproduct useyear year potpan best productwork pot bestproduct clean pan love',
 np.int32(2): 'water remove kitchen comet waterstain hardwater potpan job hard sink bathroom pot great pan stain',
 np.int32(9): 'smell stuff 

In [17]:
# save clustered_dict to csv
clustered_df = pd.DataFrame(columns=['subcategory', 'topic_number', 'top_words'])
for subcategory, topics_dict in clustered_dict.items():
    for topic, words in topics_dict.items():
        new_row = pd.DataFrame({'subcategory': [subcategory], 'topic_number': [topic], 'top_words': [words]})
        clustered_df = pd.concat([clustered_df, new_row], ignore_index=True)
clustered_df.to_csv('outputs/lsa_1-2_gram_clustered_results.csv', index=False)

In [18]:
lsa_dict['BODY CARE HAIR REMOVAL']

{0: 'box worksmall work smallcome small drybox dry comedry come'}