In [39]:
"""
project references: 
https://maartengr.github.io/BERTopic/getting_started/best_practices/best_practices.html
https://medium.com/rapids-ai/faster-topic-modeling-with-bertopic-and-rapids-cuml-5c7559aba898
https://towardsdatascience.com/topic-modeling-with-lsa-plsa-lda-nmf-bertopic-top2vec-a-comparison-5e6ce4b1e4a5
https://hdbscan.readthedocs.io/en/latest/parameter_selection.html
"""

# installs
# !pip install bertopic 

"""
NOTE: to use GPU-accelerated UMAP and HDBSCAN, you need to install RAPIDS cuML.
Ensure you have the proper cupy version (cupy-cuda11x for Cheaha)
For more installation info, see: https://docs.rapids.ai/install
"""

# imports
import os
import gzip
import json
import torch
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer, util
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, PartOfSpeech

# GPU accelerated
from cuml.manifold import UMAP
from cuml.cluster import HDBSCAN

# not GPU accelerated
# from umap import UMAP
# from hdbscan import HDBSCAN
    
# Sanity marker: reaching this line means every import above succeeded,
# including the cuML (GPU) imports of UMAP and HDBSCAN.
print("imports compelete")


imports compelete


In [2]:
## Download and decompress data

# uncomment below to initially download compressed data file
# !wget https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/categoryFilesSmall/Gift_Cards_5.json.gz


# Number of decompressed bytes requested per read (5 MiB), so the archive is
# streamed to disk instead of being held in memory in one piece.
chunk_size = 5 * 1024 * 1024

# Stream-decompress the gzip archive into a plain file alongside it.
with gzip.open("Gift_Cards_5.json.gz", 'rb') as f, open("Gift_Cards_5.json", 'wb') as f_out:
    # An empty read marks end-of-stream and ends the loop.
    while chunk := f.read(chunk_size):
        f_out.write(chunk)

print("done")

done


In [3]:
%%time

# Build the training corpus: one string per review, made of the review's
# summary followed by its body text.

training_data_gift_cards = []

# The file is read as one JSON object per line. The encoding is pinned so the
# result does not depend on the platform's default locale encoding.
with open('Gift_Cards_5.json', encoding='utf-8') as f:
    for line in f:
        # Tolerate blank lines (e.g. a trailing newline) instead of letting
        # json.loads raise on them.
        if not line.strip():
            continue
        record = json.loads(line)
        # `or ""` also covers fields that are present but null, which the
        # default argument of .get() alone would not.
        text = (record.get("reviewText") or "").strip()
        summary = (record.get("summary") or "").strip()
        review = summary + " " + text
        # Skip records that have neither a summary nor any body text.
        if review.strip():
            training_data_gift_cards.append(review)

print("done")


done
CPU times: user 11.5 ms, sys: 2.69 ms, total: 14.2 ms
Wall time: 45.9 ms


In [4]:
%%time

# Encode the review corpus into sentence embeddings, on the GPU when one exists.

# Choose the compute device once; it is reused for loading and for encoding.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
if device == 'cuda':
    print("using gpu")

# Load the sentence-embedding model, then move it onto the chosen device.
model = SentenceTransformer('all-MiniLM-L6-v2')
model = model.to(device)

# Embed the whole corpus in one call, with a batch progress bar.
embeddings = model.encode(
    training_data_gift_cards,
    device=device,
    show_progress_bar=True,
)



using gpu


Batches: 100%|██████████| 93/93 [00:05<00:00, 17.68it/s]

CPU times: user 8.14 s, sys: 1.58 s, total: 9.73 s
Wall time: 25.4 s





In [34]:
%%time
# train and save model
#
# Assembles the BERTopic pipeline (dimensionality reduction -> clustering ->
# vectorizer -> topic representations), fits it on the review corpus using the
# embeddings computed in the previous cell, and saves the fitted model.

# NOTE(review): presumably set to silence the HuggingFace tokenizers
# fork/parallelism warning — confirm.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

print(f"training data amount: {len(training_data_gift_cards)}")


# Dimensionality reduction with a fixed seed. UMAP here is the cuML (GPU)
# implementation imported at the top of the notebook.
umap_model = UMAP(random_state=42)

# does clustering
# (cuML HDBSCAN; for the meaning of these parameters see the HDBSCAN
# parameter-selection guide linked in the references at the top.)
hdbscan_model = HDBSCAN(min_cluster_size=45, min_samples=20, cluster_selection_epsilon=1)

# tokenize, remove stopwords
# Unigrams only, English stop-word list; min_df=2 presumably drops terms seen
# in fewer than two documents (scikit-learn semantics — confirm).
vectorizer_model = CountVectorizer(ngram_range=(1, 1), stop_words="english", min_df = 2)


# Three alternative topic-representation models; each one is registered under
# its own key in `representation_model` below.
keybert_model = KeyBERTInspired()
pos_model = PartOfSpeech("en_core_web_sm")
mmr_model = MaximalMarginalRelevance(diversity=0.3)

# All representation models
representation_model = {
    "KeyBERT": keybert_model,
    "MMR": mmr_model,
    "POS": pos_model
}

# `model` is the SentenceTransformer created in the embedding cell.
# NOTE(review): the recorded log for this run reads "Reduced number of topics
# from 8 to 8", so nr_topics=25 had no effect here.
bert_model = BERTopic(
    embedding_model=model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,
    top_n_words=10,
    nr_topics=25,
    verbose=True
)

# The precomputed `embeddings` are passed in alongside the documents.
topics, probs = bert_model.fit_transform(training_data_gift_cards, embeddings)

# NOTE(review): save() is called with only a path, so the library's default
# serialization format is used — confirm that is the intended format.
bert_model.save("bertopic_model_gift_cards_full")

print("model trained")

training data amount: 2972


2023-11-26 20:46:00,604 - BERTopic - Reduced dimensionality
2023-11-26 20:46:00,656 - BERTopic - Clustered reduced embeddings
2023-11-26 20:46:01,069 - BERTopic - Reduced number of topics from 8 to 8


model trained
CPU times: user 2.72 s, sys: 77.9 ms, total: 2.8 s
Wall time: 1.69 s


In [36]:
# visualize results
#
# Each visualize_* call returns a figure, which is written to a standalone
# HTML file under the current working directory.

bert_model.visualize_topics().write_html("./intertopic_dist_gift_cards_full.html")
bert_model.visualize_barchart(top_n_topics = 25).write_html("./barchart_gift_cards_full.html")
# The misspelled filename is kept as-is (sic) so that anything already
# pointing at this file keeps working.
bert_model.visualize_hierarchy().write_html("./hieararchy_gift_cards_full.html")
# Reuse the embeddings computed earlier: without `embeddings=`, BERTopic
# re-encodes every document with the embedding model just to draw this plot.
bert_model.visualize_documents(
    training_data_gift_cards,
    embeddings=embeddings,
).write_html("./projections_gift_cards_full.html")

print(bert_model.get_topic_info())

   Topic  Count                           Name  \
0     -1    261  -1_stars_worked_arrived_thank   
1      0     61     0_great_stars_weekend_guft   
2      1    148    1_stars_great_good_delivery   
3      2     97     2_stars_loved_love_awesome   
4      3   1620        3_gift_card_cards_great   
5      4    634   4_stars_gift_christmas_great   
6      5     53       5_perfect_pretty_good_ok   
7      6     98   6_product_great_seller_stars   

                                      Representation  \
0  [stars, worked, arrived, thank, upload, works,...   
1  [great, stars, weekend, guft, hand, movies, go...   
2  [stars, great, good, delivery, fast, worked, g...   
3  [stars, loved, love, awesome, girlfriend, enjo...   
4  [gift, card, cards, great, amazon, love, good,...   
5  [stars, gift, christmas, great, cute, gifts, e...   
6  [perfect, pretty, good, ok, ty, nice, great, s...   
7  [product, great, seller, stars, deal, good, pu...   

                                            