In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF
import pandas as pd
from sklearn.preprocessing import normalize
import spacy

In [37]:
nlp = spacy.load("en_core_web_sm")

In [38]:
df = pd.read_csv("../data/clorox_data.csv")

In [39]:
df['subcategory'].value_counts().tail(50)

subcategory
WOOD/FURNITURE/DUST WOOD/FURNITURE CLEANER        1592
HAIR CARE                                         1591
WOOD/FURNITURE/DUST POLISH                        1530
BATHROOM CLEANERS LIMESCALE/HARDWATER CLEANERS    1482
WOOD/FURNITURE/DUST CONVENIENCE WOOD/FURN/DUST    1447
CONSUMABLE TOOLS SOAP PADS/STEEL WOOL             1435
SUNCARE & FIRST AID SUNCARE                       1406
HAIR CARE SHAMPOO                                 1110
CORE GIFTS EVERYDAY KITS                          1035
WIPES OTHER WIPES                                 1033
LIP CARE                                          1028
FLOOR CLEANERS                                    1028
HAIR CARE CONDITIONER                              990
BODY CARE BATH SOAP                                903
MEN'S CARE BODY WASH                               895
ODOR CONTROLLING FABRIC REFRESHERS                 823
MOISTURE ABSORBER                                  822
TOILET BOWL CLEANERS                               81

In [40]:
# look at hair care subcategory (1591 values)
df_sample = df[df['subcategory']=='HAIR CARE']
df_sample

Unnamed: 0,brand,product_title,proxy_date,retailer,category,subcategory,review_text,star_rating,topic,brand_type
214354,Nexxus,Nexxus Advanced Therappe Shampoo and Humectres...,2024-07-01,Costco,PERSONAL CARE,HAIR CARE,We have been using this duo for several years....,5.0,Positive reviews for a makeup remover (undeter...,Competitor
214364,Nexxus,Nexxus Advanced Therappe Shampoo and Humectres...,2024-07-01,Costco,PERSONAL CARE,HAIR CARE,"I have a very sensitive scalp, tried hypoaller...",4.0,Positive reviews for a makeup remover (undeter...,Competitor
214655,Head & Shoulders,Head & Shoulders Complete Scalp Care 2-in-1 Da...,2024-07-01,Costco,PERSONAL CARE,HAIR CARE,We go through a lot of shampoo at our house an...,5.0,Shopping for alternative hair products,Competitor
214659,Nexxus,Nexxus Advanced Therappe Shampoo and Humectres...,2024-07-01,Costco,PERSONAL CARE,HAIR CARE,Great product made my hair healthier and stronger,5.0,Hair growth using conditioning products,Competitor
214662,Nexxus,Nexxus Advanced Therappe Shampoo and Humectres...,2024-07-01,Costco,PERSONAL CARE,HAIR CARE,Great shampoo and conditioner. Your hair will ...,5.0,Experiences with different hair products and b...,Competitor
...,...,...,...,...,...,...,...,...,...,...
642758,CVS Health,"CVS Health Hair Detangling Spray, 10oz",2022-11-07,CVS,PERSONAL CARE,HAIR CARE,Awesome stuff!! I bought CVS brand and love it...,5.0,Positive reviews for a makeup remover (undeter...,Competitor
646780,Amazon Basics,Amazon Basics 2-in-1 Dandruff Shampoo and Cond...,2022-10-24,Amazon,PERSONAL CARE,HAIR CARE,smells better than headnshoulders,5.0,Fragrances and scents,Competitor
656828,Amazon Basics,Amazon Basics 2-in-1 Dandruff Shampoo and Cond...,2022-10-03,Amazon,PERSONAL CARE,HAIR CARE,"This shampoo is fantastic, no more white speck...",5.0,Positive reviews for a makeup remover (undeter...,Competitor
661227,CVS Health,"CVS Health Hair Detangling Spray, 10oz",2022-09-19,CVS,PERSONAL CARE,HAIR CARE,I use this on my granddaughters hair. I've tri...,5.0,Positive reviews for a makeup remover (undeter...,Competitor


In [41]:
# function for creating custom stopwords (from clorox code)
def create_custom_stopwords(df):

    unique_words = set()
    df['product_title'].drop_duplicates().str.split().apply(unique_words.update)
    df['brand'].drop_duplicates().str.split().apply(unique_words.update)
    unique_words = list(unique_words)

    custom_stopwords = unique_words + [word.lower() for word in unique_words] + [
        word.upper() for word in unique_words] + [word.capitalize() for word in unique_words]
    custom_stopwords = list(set(custom_stopwords))

    return custom_stopwords

In [42]:
# function for lemmetization
def preprocess_text(text):
        doc = nlp(text.lower())
        return " ".join([token.lemma_ for token in doc if not token.is_stop and not token.is_punct])

# create new cleaned review text 
df_sample['cleaned_review_text'] = df_sample['review_text'].apply(preprocess_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sample['cleaned_review_text'] = df_sample['review_text'].apply(preprocess_text)


In [43]:
df_sample.head(5)

Unnamed: 0,brand,product_title,proxy_date,retailer,category,subcategory,review_text,star_rating,topic,brand_type,cleaned_review_text
214354,Nexxus,Nexxus Advanced Therappe Shampoo and Humectres...,2024-07-01,Costco,PERSONAL CARE,HAIR CARE,We have been using this duo for several years....,5.0,Positive reviews for a makeup remover (undeter...,Competitor,duo year disappoint reduce size container price
214364,Nexxus,Nexxus Advanced Therappe Shampoo and Humectres...,2024-07-01,Costco,PERSONAL CARE,HAIR CARE,"I have a very sensitive scalp, tried hypoaller...",4.0,Positive reviews for a makeup remover (undeter...,Competitor,sensitive scalp try hypoallergenic cause actua...
214655,Head & Shoulders,Head & Shoulders Complete Scalp Care 2-in-1 Da...,2024-07-01,Costco,PERSONAL CARE,HAIR CARE,We go through a lot of shampoo at our house an...,5.0,Shopping for alternative hair products,Competitor,lot shampoo house size container great
214659,Nexxus,Nexxus Advanced Therappe Shampoo and Humectres...,2024-07-01,Costco,PERSONAL CARE,HAIR CARE,Great product made my hair healthier and stronger,5.0,Hair growth using conditioning products,Competitor,great product hair healthy strong
214662,Nexxus,Nexxus Advanced Therappe Shampoo and Humectres...,2024-07-01,Costco,PERSONAL CARE,HAIR CARE,Great shampoo and conditioner. Your hair will ...,5.0,Experiences with different hair products and b...,Competitor,great shampoo conditioner hair feel amazing


In [44]:
def identify_topics(df, num_topics = 10, max_features=500):
    topic_list = []

    # get subcategory reviews
    subcategory_reviews = df['cleaned_review_text']

    # create stopwords (custom and default)
    custom_stopwords = create_custom_stopwords(df_sample)
    default_stopwords = list(CountVectorizer(
        stop_words="english").get_stop_words())
    all_stopwords = default_stopwords + custom_stopwords
 
    # create a term-document matrix
    vectorizer = TfidfVectorizer(stop_words=all_stopwords, max_features=max_features)
    tfidf_matrix = vectorizer.fit_transform(subcategory_reviews)

    # apply matrix factorization
    nmf_model = NMF(n_components = num_topics, random_state=42, solver='mu', beta_loss='kullback-leibler', max_iter=100)
    W = nmf_model.fit_transform(tfidf_matrix)
    H = nmf_model.components_
        
    # normalize and find dominant topic for each document
    doc_topic = normalize(W, norm='l1', axis=1)
    dominant_topic = doc_topic.argmax(axis=1)

    # add topic to list
    for topic_idx in dominant_topic:
        topic_words = " ".join([vectorizer.get_feature_names_out()[i] for i in H[topic_idx].argsort()[:-6:-1]])
        topic_list.append(topic_words)

    # add topic as new col to df
    df['NMF_topic'] = topic_list
    return df

In [45]:
df_with_topics = identify_topics(df_sample, num_topics=7)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['NMF_topic'] = topic_list


In [46]:
df_with_topics.to_csv('reviews_nmf.csv', index=False)

In [47]:
df_with_topics['NMF_topic'].unique()

array(['good price quality product costco', 'work like brand use buy',
       'great work price value wonderful',
       'smell scent nice help recommend',
       'product bottle review receive marketing',
       'feel like clean leave year', 'soft leave dry use make'],
      dtype=object)

In [48]:
df_with_topics['topic'].unique()

array(['Positive reviews for a makeup remover (undetermined topic)',
       'Shopping for alternative hair products',
       'Hair growth using conditioning products',
       'Experiences with different hair products and brands',
       'Skincare products and their effects on skin',
       'Consumer complaints about changes in product ingredients and quality',
       'Hair care and scent preferences', 'Reviews of water bottles',
       'Skincare routines and product recommendations',
       'Beauty product reviews and recommendations',
       'Effective dandruff treatments',
       'Customer satisfaction and experience', 'Good value for money',
       'Long-time users of a product',
       'Quality and Shipping Issues with Beauty Products',
       'Hair growth and strengthening with oils',
       'Packaging and shipping quality issues with beauty products',
       'Favorite soaps and their reviews',
       'Review of Maui Moisture hair products',
       'Great deals on excellent produc