# BERTopic test with G2 data

In [5]:
import pandas as pd

## Get sample data

In [7]:
# Read the G2 reviews export into a DataFrame using pandas' Python parser engine.
df = pd.read_csv(
    "../data/G2_Reviews_2023-06-29.csv",
    engine='python',
)

# Display (rows, columns) of the loaded frame as the cell output.
df.shape


(2207053, 48)

In [8]:
# Display the column labels of the loaded reviews frame.
df.columns

Index(['survey_response_id', 'product_id', 'product', 'star rating', 'title',
       'likely to reccommend', 'what do you like best?',
       'what do you dislike?', 'business problems solved',
       'meets requirements', 'ease of use', 'ease of setup', 'ease of admin',
       'quality of support', 'ease of doing business with',
       'what is your level of experience with this product?',
       'frequency_of_use', 'is this product headed in the right direction?',
       'recommendations to others considering?',
       'did you deploy in the cloud or on-premise?', 'go_live_time_text',
       'what % of your users have fully adopted the system?',
       'how did you implement?', 'implementation_year',
       'number of users purchased', 'price',
       'your one time costs for setting up this product',
       'estimated time to roi', 'annual recurring cost',
       'what are the term of your contract?',
       'what % discount off list price did you recieve?',
       'did you switch f

In [4]:
# Shape (rows, columns) of the subset of reviews whose product_id equals 501.
df.loc[df['product_id'] == 501].shape

(123, 48)

In [9]:
# Select sample reviews - positive reviews (first 5000 rows).
# The column is addressed by its label instead of by position (6), so the
# selection keeps working if the column order of the CSV ever changes.
df_sample_like = df['what do you like best?'].iloc[0:5000]
# df_sample_like.head()

In [10]:
# Select sample reviews - negative reviews (first 5000 rows).
# The column is addressed by its label instead of by position (7), so the
# selection keeps working if the column order of the CSV ever changes.
df_sample_dislike = df['what do you dislike?'].iloc[0:5000]
# df_sample_dislike.head()

## Train one model for all products

In [7]:
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer


# Seed for the stochastic steps (PCA solver, KMeans initialisation) so that
# re-running the notebook yields the same clusters and topic ids.
RANDOM_SEED = 42

# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 2 - Reduce dimensionality
# (UMAP is kept as the alternative reducer; it is not passed to BERTopic below)
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')
# Step 2.1 - PCA
pca_model = PCA(n_components=5, random_state=RANDOM_SEED)

# Step 3 - Cluster reduced embeddings
# (HDBSCAN is kept as the alternative clusterer; it is not passed to BERTopic below)
hdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
# Step 3.1 - KMeans
kmeans_model = KMeans(n_clusters=100, random_state=RANDOM_SEED)

# Step 4 - Tokenize topics
vectorizer_model = CountVectorizer(stop_words="english")

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer()

# Step 6 - (Optional) Fine-tune topic representations with
# a `bertopic.representation` model
representation_model = KeyBERTInspired()

# All steps together
topic_model = BERTopic(
  embedding_model=embedding_model,          # Step 1 - Extract embeddings
  # umap_model=umap_model,                    # Step 2 - Reduce dimensionality
  umap_model=pca_model,                    # Step 2 - Reduce dimensionality - PCA
  # hdbscan_model=hdbscan_model,              # Step 3 - Cluster reduced embeddings
  hdbscan_model=kmeans_model,              # Step 3 - Cluster reduced embeddings - Kmeans
  vectorizer_model=vectorizer_model,        # Step 4 - Tokenize topics
  ctfidf_model=ctfidf_model,                # Step 5 - Extract topic words
  representation_model=representation_model, # Step 6 - (Optional) Fine-tune topic represenations
  verbose=True
)


  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


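Fix for this issue: https://github.com/MaartenGr/BERTopic/issues/1412. The commented-out `pip` commands in the next three cells (uninstall BERTopic, upgrade hdbscan from GitHub, reinstall BERTopic) are the steps used; uncomment and run them only if needed.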

In [8]:
# !pip uninstall BERTopic

In [9]:
# !pip install --upgrade git+https://github.com/scikit-learn-contrib/hdbscan.git

In [10]:
# !pip install --upgrade BERTopic

In [11]:
# Turn the sampled "like" reviews into a plain Python list, then fit the
# topic model on them and keep the returned topics and probabilities.
docs_like = df_sample_like.tolist()
topics_like, probabilities_like = topic_model.fit_transform(docs_like)

Batches:   0%|          | 0/157 [00:00<?, ?it/s]

2023-08-15 13:27:06,486 - BERTopic - Transformed documents to Embeddings
2023-08-15 13:27:06,651 - BERTopic - Reduced dimensionality
2023-08-15 13:27:09,326 - BERTopic - Clustered reduced embeddings


In [21]:
# Save the fitted "likes" topic model to disk under a versioned name so it can
# be reloaded later without retraining.
topic_model.save("G2_likes_topics_model_v1") # v1 - 5000 reviews / kmeans 100 topics
# topic_model.save("G2_likes_topics_model_v2") # v2 - 500 reviews / hdbscan

In [13]:
# Show the first rows (up to 11) of the topic frequency table.
topic_model.get_topic_freq().head(11)

Unnamed: 0,Topic,Count
9,0,77
27,1,75
44,2,72
42,3,70
61,4,69
11,5,68
53,6,67
99,7,66
8,8,65
72,9,65


In [14]:
# Inspect the (term, score) pairs that represent topic 1.
topic_model.get_topic(1)

[('studio', 0.49346614),
 ('visuals', 0.4610408),
 ('microsoft', 0.45853853),
 ('ui', 0.43951738),
 ('tools', 0.43834805),
 ('photoshop', 0.43458575),
 ('developer', 0.4257627),
 ('presentations', 0.42078912),
 ('visual', 0.40001923),
 ('customizations', 0.39443448)]

In [15]:
# Render BERTopic's topic visualization for the fitted model.
topic_model.visualize_topics()

In [16]:
# Render BERTopic's bar chart visualization for the fitted model.
topic_model.visualize_barchart()

## Label new document

In [19]:
# Take two "like" reviews (rows 5000-5001) that lie outside the 0:5000 training
# sample, to use as new documents. The column is addressed by its label
# instead of by position (6) so the selection survives column reordering.
df_new_like = df['what do you like best?'].iloc[5000:5002]
df_new_like.head(2)

5000    Like that the product is constantly looking fo...
5001    Have used Firebug on a few occasions in the pa...
Name: what do you like best?, dtype: object

In [20]:
# Plain Python list of the new review texts.
docs_new = df_new_like.tolist()

In [22]:
# Load the previously saved "likes" topic model (v1) back from disk.
# NOTE(review): only load model files you created yourself — confirm the
# serialization format in use is safe for files from other sources.
loaded_model = BERTopic.load("G2_likes_topics_model_v1")

In [23]:
# Get topics for the new reviews using the loaded model.
new_topics = loaded_model.transform(docs_new)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2023-08-15 16:58:28,179 - BERTopic - Reduced dimensionality
2023-08-15 16:58:28,304 - BERTopic - Predicted clusters


In [25]:
# Display the raw result of transform(): a tuple whose first element holds the
# predicted topic ids (the second element is None in the recorded output).
new_topics

([40, 66], None)

In [26]:
# Print every predicted topic id followed by that topic's top terms.
# new_topics[0] holds the predicted topic ids, one per new document.
for i_top in new_topics[0]:
    try:
        print(i_top)
        print(f'{loaded_model.get_topic(i_top)}')
    except Exception as exc:
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt/SystemExit still propagate, and report what
        # actually went wrong instead of hiding it.
        print(f'-x- ERROR -x- {exc!r}')
    print('--------------------')

40
[('vlc', 0.612945), ('videos', 0.50458306), ('youtube', 0.46602434), ('audio', 0.43218714), ('video', 0.36829388), ('dvds', 0.3284006), ('animations', 0.31548744), ('ios', 0.30591714), ('play', 0.27532205), ('android', 0.2748412)]
--------------------
66
[('debuggers', 0.512261), ('firebug', 0.49422243), ('firefox', 0.49062353), ('html', 0.45907712), ('debugging', 0.45610794), ('browser', 0.44012266), ('javascript', 0.43939668), ('chrome', 0.40166944), ('css', 0.3466862), ('webpage', 0.3399068)]
--------------------


In [40]:
# loaded_model.get_document_info(docs_new)

In [12]:
# loaded_model.get_topic_info()

## Train one model per product

In [37]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Seed for the stochastic steps (PCA solver, KMeans initialisation) so that
# re-running the notebook yields the same clusters and topic ids.
RANDOM_SEED_ONE = 42

# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 2 - Reduce dimensionality
# (UMAP is kept as the alternative reducer; it is not passed to BERTopic below)
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')
# Step 2.1 - PCA
pca_model = PCA(n_components=5, random_state=RANDOM_SEED_ONE)

# Step 3 - Cluster reduced embeddings
# (HDBSCAN is kept as the alternative clusterer; it is not passed to BERTopic below)
hdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
# Step 3.1 - KMeans (fewer clusters than the all-products model: 50)
kmeans_model = KMeans(n_clusters=50, random_state=RANDOM_SEED_ONE)

# Step 4 - Tokenize topics
vectorizer_model = CountVectorizer(stop_words="english")

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer()

# Step 6 - (Optional) Fine-tune topic representations with
# a `bertopic.representation` model
representation_model = KeyBERTInspired()

# All steps together
topic_model_one = BERTopic(
  embedding_model=embedding_model,          # Step 1 - Extract embeddings
  # umap_model=umap_model,                    # Step 2 - Reduce dimensionality
  umap_model=pca_model,                    # Step 2 - Reduce dimensionality - PCA
  # hdbscan_model=hdbscan_model,              # Step 3 - Cluster reduced embeddings
  hdbscan_model=kmeans_model,              # Step 3 - Cluster reduced embeddings - Kmeans
  vectorizer_model=vectorizer_model,        # Step 4 - Tokenize topics
  ctfidf_model=ctfidf_model,                # Step 5 - Extract topic words
  representation_model=representation_model, # Step 6 - (Optional) Fine-tune topic represenations
  verbose=True
)

In [13]:
# Get the "what do you like best?" answers for one product (product_id 501).
# A single label-based .loc selection replaces the two-step version that
# first bound df_one_like to a DataFrame and then to a Series picked by the
# positional column index 6.
df_one_like = df.loc[df['product_id'] == 501, 'what do you like best?']
df_one_like.shape

(123,)

**TODO:** preprocessing here would be good, to get rid of words that we don't want to appear in the topics (e.g. the product name).

In [14]:
# Use every "like" review of the selected product, as a plain Python list.
docs_like_one = df_one_like.tolist()
# docs_like_one

In [40]:
# Train the per-product topic model on the single product's "like" reviews and
# keep the returned topics and probabilities.
topics_one_like, probabilities_one_like = topic_model_one.fit_transform(docs_like_one)

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2023-08-15 18:48:09,806 - BERTopic - Transformed documents to Embeddings
2023-08-15 18:48:09,816 - BERTopic - Reduced dimensionality
2023-08-15 18:48:11,102 - BERTopic - Clustered reduced embeddings


In [41]:
# Get top topics: show the first rows (up to 11) of the per-product topic
# frequency table.
topic_model_one.get_topic_freq().head(11)

Unnamed: 0,Topic,Count
29,0,6
7,1,6
11,2,6
15,3,4
22,4,4
17,5,4
25,6,4
18,7,4
32,8,3
33,9,3


In [42]:
# Inspect the (term, score) pairs that represent topic 1 of the per-product model.
topic_model_one.get_topic(1)

[('heroku', 0.5590676),
 ('cloud', 0.4289738),
 ('rails', 0.39145768),
 ('hosting', 0.37133536),
 ('nosql', 0.34346235),
 ('server', 0.326001),
 ('deploys', 0.31266207),
 ('postgresql', 0.29791492),
 ('plugins', 0.29462257),
 ('database', 0.27606076)]

In [43]:
# Visualize the per-product model's topics as a bar chart.
topic_model_one.visualize_barchart()