In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sentence_transformers import SentenceTransformer
from torch.nn.functional import cosine_similarity


  from tqdm.autonotebook import tqdm, trange


# Compute similarity scores between reviews and their original BERTopic topics

In [10]:
# Read the processed review table from disk and preview its first five rows.
path = '../data/processed_reviews.csv'
df = pd.read_csv(filepath_or_buffer=path)
df.head(n=5)

Unnamed: 0,brand,product_title,proxy_date,retailer,category,subcategory,review_text,star_rating,topic,brand_type
0,Clorox,Clorox Wring Clean Cotton Mop,2024-07-08,Target,CLEANING,FLOOR CLEANERS,the mop be cheap the wringing attachment doesn...,1.0,Product review: Click n Clean multi-surface sp...,Clorox
1,Rapid,"32 oz. Rapid Clean Remediation, Trigger Spray ...",2024-07-08,Home Depot,CLEANING,BATHROOM CLEANERS MILDEW CLEANERS,this product be recommend for a front load was...,1.0,Mold removal and prevention products,Competitor
2,Mrs. Meyer's Clean Day,Mrs Meyers Clean Day Cleaner Plastic Bottle Mu...,2024-07-08,Target,CLEANING,SPRAY CLEANERS ALL PURPOSE CLEANERS,smell so good definitely my favorite scent,5.0,Fall scents and their popularity,Competitor
3,Pourri,Poo Pourri Plastic Spray Bottle Toilet Oil Fre...,2024-07-08,Target,CLEANING,ODOR CONTROLLING AIR FRESHENERS,i keep multiple bottle in the bathroom and car...,5.0,Household cleaning and sanitization tips,Competitor
4,Twist & Shout,Twist & Shout Spin Mop & Bucket System with 1 ...,2024-07-01,Costco,CLEANING,FLOOR CLEANERS,i recently just get this mop love how easy it ...,5.0,Product review: Click n Clean multi-surface sp...,Competitor


In [12]:
# Sub-categories kept for this analysis.
subcategories = ['SPRAY CLEANERS BLEACH CLEANERS', 'BODY CARE BODY LOTION']
# The frame preview lists this column as `subcategory`, while this cell originally indexed
# `sub_category`; use whichever is present so the cell also runs on a fresh kernel.
subcategory_col = 'sub_category' if 'sub_category' in df.columns else 'subcategory'
# .copy() makes df_filtered an independent frame, so adding columns to it later does not
# raise pandas' SettingWithCopyWarning.
df_filtered = df[df[subcategory_col].isin(subcategories)].copy()
df_filtered.shape

(25012, 10)

In [13]:
# Loaded models keyed by name, so callers that score data chunk by chunk do not rebuild
# the model on every call.
_MODEL_CACHE = {}


def _get_model(model_name):
    """Return the SentenceTransformer for `model_name`, constructing it only on first use."""
    if model_name not in _MODEL_CACHE:
        _MODEL_CACHE[model_name] = SentenceTransformer(model_name)
    return _MODEL_CACHE[model_name]


def similarity_scores(model_name, reviews, topics):
    """
    Compute the cosine similarity between each review and the topic at the same position.

    model_name: the name of the pre-trained SentenceTransformer model to use, or an
        already-loaded model object exposing `encode(texts, convert_to_tensor=True)`
    reviews: a list of review texts
    topics: a list of topic phrases, paired position-by-position with `reviews`

    return: a 1D numpy array in which element i is the cosine similarity between
        reviews[i] and topics[i]; an empty array when both inputs are empty.
        torch's cosine_similarity compares along dim=1 (row against row), so this is
        NOT a reviews x topics matrix.
    """
    # Nothing to score: skip encoding entirely and hand back an empty result.
    if len(reviews) == 0 and len(topics) == 0:
        return np.empty(0, dtype=np.float32)

    # A string is resolved through the cache; anything else is used as the model itself.
    model = _get_model(model_name) if isinstance(model_name, str) else model_name

    review_embeddings = model.encode(reviews, convert_to_tensor=True)
    topic_embeddings = model.encode(topics, convert_to_tensor=True)
    scores = cosine_similarity(review_embeddings, topic_embeddings)
    # Move to host memory before converting, in case the embeddings live on an accelerator.
    return scores.cpu().numpy()

In [19]:
# Name of the pre-trained SentenceTransformer model handed to similarity_scores().
model = 'all-MiniLM-L6-v2'
# Parallel lists taken from the same frame: position i holds a review and the topic
# stored on that review's row.
reviews = df_filtered['review_text'].to_list()
topics = df_filtered['topic'].to_list()

In [22]:
# Score the reviews in consecutive chunks of `chunk_size` rows and collect every
# per-review score, in order, into one flat list.
chunk_size = 300
n = len(reviews)
similarity_scores_list = []
for start in range(0, n, chunk_size):
    end = start + chunk_size
    # The last chunk may be short, so the reported upper bound is capped at n.
    print(f'Processing reviews {start} to {min(end, n)}')
    chunk_scores = similarity_scores(model, reviews[start:end], topics[start:end])
    similarity_scores_list.extend(chunk_scores)

Processing reviews 0 to 300




Processing reviews 300 to 600
Processing reviews 600 to 900
Processing reviews 900 to 1200
Processing reviews 1200 to 1500
Processing reviews 1500 to 1800
Processing reviews 1800 to 2100
Processing reviews 2100 to 2400
Processing reviews 2400 to 2700
Processing reviews 2700 to 3000
Processing reviews 3000 to 3300
Processing reviews 3300 to 3600
Processing reviews 3600 to 3900
Processing reviews 3900 to 4200
Processing reviews 4200 to 4500
Processing reviews 4500 to 4800
Processing reviews 4800 to 5100
Processing reviews 5100 to 5400
Processing reviews 5400 to 5700
Processing reviews 5700 to 6000
Processing reviews 6000 to 6300
Processing reviews 6300 to 6600
Processing reviews 6600 to 6900
Processing reviews 6900 to 7200
Processing reviews 7200 to 7500
Processing reviews 7500 to 7800
Processing reviews 7800 to 8100
Processing reviews 8100 to 8400
Processing reviews 8400 to 8700
Processing reviews 8700 to 9000
Processing reviews 9000 to 9300
Processing reviews 9300 to 9600
Processing re

In [24]:
# Attach one score per filtered review. assign() builds a new frame instead of writing
# into what may be a slice of `df`, which avoids SettingWithCopyWarning and keeps this
# cell safe to re-run.
df_filtered = df_filtered.assign(similarity_score=similarity_scores_list)

25012

In [28]:
# Write the scored reviews to CSV; index=False leaves the row index out of the file.
# NOTE(review): the input was read from '../data/' but this writes under 'data/' relative
# to the working directory — confirm that is the intended location.
output_path = 'data/lotion+spray_bertopic_similarity_scores.csv'
df_filtered.to_csv(output_path, index=False)