# Imports

In [None]:
!pip install bertopic

In [None]:
import pandas as pd
import os

In [None]:
# Colab-only setup: mount Google Drive and switch the working directory to the
# project folder so the relative CSV paths used further down resolve against it.
from google.colab import drive
drive.mount('/content/drive')
# NOTE(review): hard-coded Drive location — adjust if the project folder lives elsewhere.
path = "/content/drive/My Drive/DS_project"
os.chdir(path)

Mounted at /content/drive


In [None]:
from bertopic import BERTopic
#from top2vec import Top2Vec
import re
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
import numpy as np
#import umap
#from mpl_toolkits.mplot3d import Axes3D

# Fetch the NLTK resources used by the cleaning code in this notebook:
#   stopwords -> stopwords.words('english')
#   punkt     -> tokenizer models used by word_tokenize on older NLTK releases
#   punkt_tab -> tokenizer tables that recent NLTK releases require for word_tokenize
#   wordnet   -> WordNetLemmatizer
# Without 'punkt_tab' a fresh kernel on a recent NLTK raises LookupError on the
# first call to word_tokenize.
for _nltk_resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(_nltk_resource)

# load csv

In [None]:
# Load the two product tables (paths are relative to the current working directory).
sus_data = pd.read_csv('sus_data.csv')
non_sus_data = pd.read_csv('non_sus_data.csv')
# Preview the first rows of the "sus" table.
sus_data.head()

Unnamed: 0,product_id,full_price,discounted,current_price,product_url,description,ratings,reviews
0,DX7615-601,57,True,38.97,https://www.nike.com/t/star-runner-4-big-kids-...,"Because ice-cream trucks, games of tag and rac...",4.5 Stars,94
1,DX9176-107,95,True,71.97,https://www.nike.com/t/gamma-force-womens-shoe...,Layers upon layers of dimensional style—that's...,4.6 Stars,381
2,DH3162-100,70,False,70.0,https://www.nike.com/t/court-legacy-mens-shoes...,"Honoring a history rooted in tennis culture, t...",4.6 Stars,312
3,FN6344-001,120,False,120.0,https://www.nike.com/t/dunk-low-next-nature-se...,"The Dunk returns with classic construction, th...",5 Stars,4
4,DJ6525-002,32,False,32.0,https://www.nike.com/t/alpha-huarache-4-keysto...,These game-changers aren’t your grandaddy’s ol...,4.9 Stars,107


# clean up sus descriptions (remove stopwords, lemmatize)

In [None]:
# Domain words added on top of NLTK's English stop-word list.
additional_stopwords = {'nike', 'air', 'shoe', 'design'}
# Full stop-word set consulted by preprocess_text.
stop_words = set(stopwords.words('english')) | additional_stopwords
# Single lemmatizer instance shared by every preprocess_text call.
lemmatizer = WordNetLemmatizer()

def preprocess_text(text: str) -> str:
    """Normalise one free-text document for topic modelling.

    Pipeline: drop URLs, turn every run of non-letter characters into a single
    space, lower-case, tokenize, remove stop words, lemmatize, remove stop words
    again, and re-join the surviving tokens with single spaces.

    Args:
        text: Raw text. Any non-string value (for example the float NaN pandas
            uses for an empty CSV cell) is treated as an empty document.

    Returns:
        The cleaned text, or "" when nothing survives the filtering.
    """
    # Missing values reach this function through Series.apply; re.sub would
    # raise TypeError on them.
    if not isinstance(text, str):
        return ""
    # Remove links.
    text = re.sub(r"http\S+", " ", text)
    # Replace (rather than delete) punctuation, digits and other non-letters so
    # neighbouring words are not fused: "ice-cream" -> "ice cream" instead of
    # "icecream", "style—that's" -> "style that s" instead of "stylethats".
    # Contraction fragments such as "s", "t", "ve" are in NLTK's English
    # stop-word list and are dropped by the filter below.
    text = re.sub(r"[^A-Za-z]+", " ", text)
    tokens = nltk.word_tokenize(text.lower())
    # First pass: drop stop words before lemmatizing so function words are not
    # mangled by the lemmatizer.
    lemmas = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    # Second pass: inflected forms only match the stop-word list once reduced
    # to their lemma (e.g. "shoes" -> "shoe").
    kept = [lemma for lemma in lemmas if lemma not in stop_words]
    return " ".join(kept)

# Store a cleaned version of every "sus" product description in a new column,
# leaving the raw 'description' column untouched.
sus_data['clean_description'] = sus_data['description'].apply(preprocess_text)
sus_data.head()

Unnamed: 0,product_id,full_price,discounted,current_price,product_url,description,ratings,reviews,clean_description
0,DX7615-601,57,True,38.97,https://www.nike.com/t/star-runner-4-big-kids-...,"Because ice-cream trucks, games of tag and rac...",4.5 Stars,94,icecream truck game tag race end street back w...
1,DX9176-107,95,True,71.97,https://www.nike.com/t/gamma-force-womens-shoe...,Layers upon layers of dimensional style—that's...,4.6 Stars,381,layer upon layer dimensional stylethats force ...
2,DH3162-100,70,False,70.0,https://www.nike.com/t/court-legacy-mens-shoes...,"Honoring a history rooted in tennis culture, t...",4.6 Stars,312,honoring history rooted tennis culture court l...
3,FN6344-001,120,False,120.0,https://www.nike.com/t/dunk-low-next-nature-se...,"The Dunk returns with classic construction, th...",5 Stars,4,dunk return classic construction throwback hoo...
4,DJ6525-002,32,False,32.0,https://www.nike.com/t/alpha-huarache-4-keysto...,These game-changers aren’t your grandaddy’s ol...,4.9 Stars,107,gamechangers arent grandaddys oldschool metal ...


## clean up descriptions on non sus data (stop words, lemmatize)

In [None]:
# Same cleaning for the "non sus" table; the result goes into a new column.
non_sus_data['clean_description'] = non_sus_data['description'].apply(preprocess_text)
non_sus_data.head()

Unnamed: 0,product_id,full_price,discounted,current_price,product_url,description,ratings,reviews,clean_description
0,BQ3207-002,60,False,60.0,https://www.nike.com/t/revolution-5-womens-roa...,"When the road beckons, answer the call in a li...",4.6 Stars,2019,road beckons answer call lightweight pair that...
1,DH3158-108,80,True,60.97,https://www.nike.com/t/court-vision-low-next-n...,In love with the classic look of '80s basketba...,4.7 Stars,440,love classic look basketball thing fastpaced c...
2,FV6603-101,110,False,110.0,https://www.nike.com/t/p-6000-shoes-SGxVgg/FV6...,"A mash-up of past Pegasus sneakers, the P-6000...",4.9 Stars,185,mashup past pegasus sneaker p take early runni...
3,FZ4178-010,170,True,102.97,https://www.nike.com/t/jordan-6-rings-mens-sho...,You don't win six championships without some s...,4.7 Stars,9,dont win six championship without serious skil...
4,511417-136,100,False,100.0,https://www.nike.com/t/air-max-correlate-women...,The Nike Air Max Correlate sets you up in thro...,4.6 Stars,99,max correlate set throwback style multitexture...


# descriptions as lists

In [None]:
# BERTopic is fed plain Python lists of documents, so convert the cleaned columns.
non_sus_descriptions = non_sus_data['clean_description'].tolist()
sus_descriptions = sus_data['clean_description'].tolist()

In [None]:
# Extra filler words dropped from each corpus before topic modelling.
words_to_remove_sus = {"nike", "air", "youve", 'made', 'shoe', 'jr', 'even', 'take', 'dj', 'looking', 'youre', 'whether','fj','pas'}
words_to_remove_non_sus = {'youll', 'next', 'logo', 'aj', 'youre', 'mj','take','give','whether'}


def _without_words(documents, unwanted):
    """Return `documents` with every whitespace-separated token found in `unwanted` removed."""
    cleaned = []
    for document in documents:
        kept = [token for token in document.split() if token not in unwanted]
        cleaned.append(' '.join(kept))
    return cleaned


non_sus_filtered_descriptions = _without_words(non_sus_descriptions, words_to_remove_non_sus)
sus_filtered_descriptions = _without_words(sus_descriptions, words_to_remove_sus)

# Initial attempt on sus descriptions

In [None]:
# Fit BERTopic with its default sub-models on the filtered "sus" descriptions.
# fit_transform returns the topic assigned to each document and, because
# calculate_probabilities=True, the per-document topic probabilities.
# NOTE(review): no random seed is passed to the model — topics may differ between runs; confirm whether that matters.
topic_model = BERTopic(calculate_probabilities=True, verbose=True)
topics, probs = topic_model.fit_transform(sus_filtered_descriptions)

2024-05-01 01:14:54,244 - BERTopic - Embedding - Transforming documents to embeddings.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

2024-05-01 01:15:08,931 - BERTopic - Embedding - Completed ✓
2024-05-01 01:15:08,932 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-01 01:15:19,875 - BERTopic - Dimensionality - Completed ✓
2024-05-01 01:15:19,877 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-01 01:15:19,902 - BERTopic - Cluster - Completed ✓
2024-05-01 01:15:19,911 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-01 01:15:19,953 - BERTopic - Representation - Completed ✓


### shows what words represent each topic

In [None]:
# Bar charts of the terms representing each topic of topic_model (10 terms per topic).
topic_model.visualize_barchart(n_words=10)

# second attempt BERTopic model on sus descriptions

In [None]:
# Split the filtered "sus" descriptions 50/50; random_state fixes the shuffle so the split is repeatable.
train_sus, test_sus = train_test_split(sus_filtered_descriptions, test_size=0.5, random_state=40)

In [None]:
# Learn the topics from the training half only.
model_sus = BERTopic(calculate_probabilities=True, verbose=True)
topics, probs = model_sus.fit_transform(train_sus)
# Assign the held-out half to the topics learned above. `fit` must not be used
# here: it retrains the model from scratch on the test documents and throws
# away the training fit, so nothing would actually be evaluated on unseen data.
test_sus_topics, test_sus_probs = model_sus.transform(test_sus)

2024-05-08 21:44:49,525 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-08 21:44:54,056 - BERTopic - Embedding - Completed ✓
2024-05-08 21:44:54,058 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-08 21:44:56,570 - BERTopic - Dimensionality - Completed ✓
2024-05-08 21:44:56,573 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-08 21:44:56,587 - BERTopic - Cluster - Completed ✓
2024-05-08 21:44:56,593 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-08 21:44:56,616 - BERTopic - Representation - Completed ✓
2024-05-08 21:44:56,648 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-08 21:45:00,559 - BERTopic - Embedding - Completed ✓
2024-05-08 21:45:00,562 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-08 21:45:04,918 - BERTopic - Dimensionality - Completed ✓
2024-05-08 21:45:04,924 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-08 21:45:04,942 - BERTopic - Cluster - Completed ✓
2024-05-08 21:45:04,950 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-08 21:45:04,991 - BERTopic - Representation - Completed ✓


<bertopic._bertopic.BERTopic at 0x7a508803bf70>

In [None]:
# Bar charts of the terms representing each topic of model_sus (10 terms per topic).
model_sus.visualize_barchart(n_words=10)

# initial BERTopic on non sus descriptions

In [None]:
# Fit BERTopic on the filtered "non sus" descriptions; nr_topics=5 asks BERTopic
# to reduce the discovered topics down to 5 after clustering.
# NOTE: topics1 / probs1 are reassigned later by the model fitted on sus reviews.
topic_model_1 = BERTopic(nr_topics = 5, calculate_probabilities=True, verbose=True)
topics1, probs1 = topic_model_1.fit_transform(non_sus_filtered_descriptions)

2024-05-01 01:27:41,077 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/19 [00:00<?, ?it/s]

2024-05-01 01:28:00,157 - BERTopic - Embedding - Completed ✓
2024-05-01 01:28:00,159 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-01 01:28:04,360 - BERTopic - Dimensionality - Completed ✓
2024-05-01 01:28:04,361 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-01 01:28:04,411 - BERTopic - Cluster - Completed ✓
2024-05-01 01:28:04,413 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-01 01:28:04,487 - BERTopic - Representation - Completed ✓
2024-05-01 01:28:04,489 - BERTopic - Topic reduction - Reducing number of topics
2024-05-01 01:28:04,555 - BERTopic - Topic reduction - Reduced number of topics from 16 to 5


In [None]:
# get_document_info pairs each document with the topic the model assigned to it,
# so it must receive the exact documents the model was fitted on (the filtered
# descriptions), not the pre-filter 'clean_description' column.
topic_model_1.get_document_info(non_sus_filtered_descriptions).head()

In [None]:
# Interactive overview plot of the topics found by topic_model_1.
topic_model_1.visualize_topics()

In [None]:
# Bar charts of the terms representing each topic of topic_model_1 (10 terms per topic).
topic_model_1.visualize_barchart(n_words = 10)

# Second BERTopic attempt on non sus descriptions

In [None]:
# Split the filtered "non sus" descriptions 50/50; random_state fixes the shuffle so the split is repeatable.
train_non_sus, test_non_sus = train_test_split(non_sus_filtered_descriptions, test_size=0.5, random_state=41)

In [None]:
# Learn (at most) 5 topics from the training half only.
model_non_sus = BERTopic(nr_topics=5, calculate_probabilities=True, verbose=True)
topics, probs = model_non_sus.fit_transform(train_non_sus)
# Assign the held-out half to the topics learned above. `fit` must not be used
# here: it retrains the model from scratch on the test documents and throws
# away the training fit.
test_non_sus_topics, test_non_sus_probs = model_non_sus.transform(test_non_sus)

2024-05-08 21:50:30,565 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/10 [00:00<?, ?it/s]

2024-05-08 21:50:42,217 - BERTopic - Embedding - Completed ✓
2024-05-08 21:50:42,220 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-08 21:50:45,120 - BERTopic - Dimensionality - Completed ✓
2024-05-08 21:50:45,122 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-08 21:50:45,151 - BERTopic - Cluster - Completed ✓
2024-05-08 21:50:45,154 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-08 21:50:45,198 - BERTopic - Representation - Completed ✓
2024-05-08 21:50:45,200 - BERTopic - Topic reduction - Reducing number of topics
2024-05-08 21:50:45,253 - BERTopic - Topic reduction - Reduced number of topics from 7 to 5
2024-05-08 21:50:45,294 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/10 [00:00<?, ?it/s]

2024-05-08 21:50:55,369 - BERTopic - Embedding - Completed ✓
2024-05-08 21:50:55,371 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-08 21:50:58,659 - BERTopic - Dimensionality - Completed ✓
2024-05-08 21:50:58,663 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-08 21:50:58,691 - BERTopic - Cluster - Completed ✓
2024-05-08 21:50:58,693 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-08 21:50:58,736 - BERTopic - Representation - Completed ✓
2024-05-08 21:50:58,738 - BERTopic - Topic reduction - Reducing number of topics
2024-05-08 21:50:58,784 - BERTopic - Topic reduction - Reduced number of topics from 8 to 5


<bertopic._bertopic.BERTopic at 0x7a507ddded70>

In [None]:
# Bar charts of the terms representing each topic of model_non_sus (10 terms per topic).
model_non_sus.visualize_barchart(n_words=10)

# BERTopic for reviews

In [None]:
# Load the review tables; both are expected to have a 'review' text column (used below).
sus = pd.read_csv('product_reviews.csv')
# NOTE(review): the file name suggests this may hold reviews for *all* products —
# confirm it really is the non-sus subset before comparing the two corpora.
non_sus = pd.read_csv('product_reviews_all.csv')

In [None]:
# Built once at definition time instead of on every call.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U00002702-\U000027B0"  # Dingbats
    # The original single range U+24C2..U+1F251 also swallowed every CJK,
    # Hangul, Hiragana/Katakana, etc. character. It is replaced by the
    # emoji-bearing sub-blocks it was meant to cover:
    "\U000024C2"             # circled M
    "\U00002600-\U000026FF"  # Miscellaneous Symbols
    "\U00002B00-\U00002BFF"  # Miscellaneous Symbols and Arrows (stars, arrows)
    "\U0000FE00-\U0000FE0F"  # variation selectors (emoji presentation)
    "\U0001F000-\U0001F0FF"  # mahjong tiles, dominoes, playing cards
    "\U0001F100-\U0001F251"  # enclosed alphanumerics, flags, enclosed ideographs
    "]+",
    flags=re.UNICODE,
)


def remove_emojis(text):
    """Return `text` with emoji and pictographic symbols removed.

    Only characters in the emoji-related Unicode blocks listed in
    `_EMOJI_PATTERN` are deleted; ordinary text in any script is kept.

    Args:
        text: The string to clean. Any non-string value (for example the float
            NaN pandas uses for an empty CSV cell) is treated as empty text.

    Returns:
        The cleaned string, or "" for non-string input.
    """
    if not isinstance(text, str):
        return ""
    return _EMOJI_PATTERN.sub("", text)

# Clean each review in two passes — emoji removal, then the shared text
# normalisation — and overwrite the raw 'review' column with the result.
sus['review'] = sus['review'].apply(remove_emojis).apply(preprocess_text)
non_sus['review'] = non_sus['review'].apply(remove_emojis).apply(preprocess_text)

In [None]:
# Filler / boilerplate tokens dropped from every review before topic modelling.
# (The ' ' entry can never match: str.split() does not yield whitespace tokens.)
words_to_remove = {'available', 'review', 'url','page', 'please', 'part','instead','collected', 'though', 'af', 'came', 'bur', 'mind', 'excelente', 'luv','tbh', ' ', 'comodos', 'muy'}


def _strip_review_noise(review):
    """Re-join `review` without any whitespace-separated token found in words_to_remove."""
    return ' '.join(filter(lambda word: word not in words_to_remove, review.split()))


sus_reviews = [_strip_review_noise(review) for review in sus['review']]
non_sus_reviews = [_strip_review_noise(review) for review in non_sus['review']]

## Initial sus reviews


In [None]:
# Fit BERTopic with default settings on the cleaned "sus" reviews.
# NOTE: this overwrites topics1 / probs1 from the earlier non-sus description model.
topic_model_sus = BERTopic(calculate_probabilities=True, verbose=True)
topics1, probs1 = topic_model_sus.fit_transform(sus_reviews)

2024-05-01 03:30:57,227 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/19 [00:00<?, ?it/s]

2024-05-01 03:31:05,195 - BERTopic - Embedding - Completed ✓
2024-05-01 03:31:05,201 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-01 03:31:10,892 - BERTopic - Dimensionality - Completed ✓
2024-05-01 03:31:10,894 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-01 03:31:10,937 - BERTopic - Cluster - Completed ✓
2024-05-01 03:31:10,944 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-01 03:31:10,973 - BERTopic - Representation - Completed ✓


In [None]:
# Bar charts of the terms representing each topic of topic_model_sus (10 terms per topic).
topic_model_sus.visualize_barchart(n_words = 10)

## Second attempt sus reviews

In [None]:
# Split the cleaned "sus" reviews 50/50; random_state fixes the shuffle so the split is repeatable.
train_sus1, test_sus1 = train_test_split(sus_reviews, test_size=0.5, random_state=41)

In [None]:
# Learn the topics from the training half of the sus reviews only.
model_sus1 = BERTopic(calculate_probabilities=True, verbose=True)
topics, probs = model_sus1.fit_transform(train_sus1)
# Assign the held-out half to the topics learned above. `fit` must not be used
# here: it retrains the model from scratch on the test documents and throws
# away the training fit.
test_sus1_topics, test_sus1_probs = model_sus1.transform(test_sus1)

2024-05-09 00:07:10,820 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/10 [00:00<?, ?it/s]

2024-05-09 00:07:18,510 - BERTopic - Embedding - Completed ✓
2024-05-09 00:07:18,512 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-09 00:07:21,431 - BERTopic - Dimensionality - Completed ✓
2024-05-09 00:07:21,433 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-09 00:07:21,459 - BERTopic - Cluster - Completed ✓
2024-05-09 00:07:21,466 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-09 00:07:21,488 - BERTopic - Representation - Completed ✓
2024-05-09 00:07:21,508 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/10 [00:00<?, ?it/s]

2024-05-09 00:07:25,385 - BERTopic - Embedding - Completed ✓
2024-05-09 00:07:25,386 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-09 00:07:29,449 - BERTopic - Dimensionality - Completed ✓
2024-05-09 00:07:29,456 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-09 00:07:29,490 - BERTopic - Cluster - Completed ✓
2024-05-09 00:07:29,501 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-09 00:07:29,539 - BERTopic - Representation - Completed ✓


<bertopic._bertopic.BERTopic at 0x7a502c801720>

In [None]:
# Bar charts of the terms representing each topic of model_sus1 (10 terms per topic).
model_sus1.visualize_barchart(n_words=10)

## initial non-sus reviews

In [None]:
# Fit BERTopic on all cleaned "non sus" reviews; nr_topics=5 asks BERTopic to
# reduce the discovered topics down to 5 after clustering.
# NOTE: this overwrites the topics / probs variables assigned by earlier cells.
topic_model_non_sus = BERTopic(nr_topics= 5, calculate_probabilities=True, verbose=True)
topics, probs = topic_model_non_sus.fit_transform(non_sus_reviews)

2024-05-01 03:29:28,913 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/46 [00:00<?, ?it/s]

2024-05-01 03:29:51,849 - BERTopic - Embedding - Completed ✓
2024-05-01 03:29:51,852 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-01 03:30:01,230 - BERTopic - Dimensionality - Completed ✓
2024-05-01 03:30:01,234 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-01 03:30:01,579 - BERTopic - Cluster - Completed ✓
2024-05-01 03:30:01,581 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-01 03:30:01,747 - BERTopic - Representation - Completed ✓
2024-05-01 03:30:01,749 - BERTopic - Topic reduction - Reducing number of topics
2024-05-01 03:30:01,866 - BERTopic - Topic reduction - Reduced number of topics from 33 to 5


In [None]:
# Interactive overview plot of the topics found by topic_model_non_sus.
topic_model_non_sus.visualize_topics()

In [None]:
# Bar charts of the terms representing each topic of topic_model_non_sus (10 terms per topic).
topic_model_non_sus.visualize_barchart(n_words = 10)

## Second attempt non sus reviews

In [None]:
# Split the cleaned "non sus" reviews 50/50; random_state fixes the shuffle so the split is repeatable.
train_nonsus_r, test_nonsus_r = train_test_split(non_sus_reviews, test_size=0.5, random_state=45)

In [None]:
# Learn (at most) 5 topics from the training half of the non-sus reviews only.
# NOTE: `topic_model_nonsus` is a different object from `topic_model_non_sus`,
# which was fitted on all non-sus reviews.
topic_model_nonsus = BERTopic(nr_topics=5, calculate_probabilities=True, verbose=True)
topics_r, probs_r = topic_model_nonsus.fit_transform(train_nonsus_r)
# Assign the held-out half to the topics learned above. `fit` must not be used
# here: it retrains the model from scratch on the test documents and throws
# away the training fit.
test_topics_r, test_probs_r = topic_model_nonsus.transform(test_nonsus_r)

2024-05-09 00:38:36,824 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

2024-05-09 00:38:49,847 - BERTopic - Embedding - Completed ✓
2024-05-09 00:38:49,850 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-09 00:38:54,515 - BERTopic - Dimensionality - Completed ✓
2024-05-09 00:38:54,517 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-09 00:38:54,573 - BERTopic - Cluster - Completed ✓
2024-05-09 00:38:54,575 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-09 00:38:54,610 - BERTopic - Representation - Completed ✓
2024-05-09 00:38:54,612 - BERTopic - Topic reduction - Reducing number of topics
2024-05-09 00:38:54,617 - BERTopic - Topic reduction - Reduced number of topics from 3 to 3
2024-05-09 00:38:54,639 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

2024-05-09 00:39:06,797 - BERTopic - Embedding - Completed ✓
2024-05-09 00:39:06,799 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-09 00:39:10,946 - BERTopic - Dimensionality - Completed ✓
2024-05-09 00:39:10,948 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-09 00:39:11,013 - BERTopic - Cluster - Completed ✓
2024-05-09 00:39:11,016 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-09 00:39:11,061 - BERTopic - Representation - Completed ✓
2024-05-09 00:39:11,062 - BERTopic - Topic reduction - Reducing number of topics
2024-05-09 00:39:11,065 - BERTopic - Topic reduction - Reduced number of topics from 3 to 3


<bertopic._bertopic.BERTopic at 0x7a50637bd810>

In [None]:
# Bar charts of the terms representing each topic of topic_model_nonsus (10 terms per topic).
topic_model_nonsus.visualize_barchart(n_words=10)