<a href="https://colab.research.google.com/github/lbk209/topic_modeling/blob/main/tm_wine_reviews_cab6_params.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setting

In [3]:
%%capture
!pip install bertopic accelerate adjustText

In [4]:
import os
import pandas as pd
import plotly.express as px
from tqdm import tqdm

In [5]:
# Make the preferred-encoding lookup always report UTF-8
# (kept from the original: needed to work with path names containing blanks).
import locale


def _always_utf8():
    return "UTF-8"


locale.getpreferredencoding = _always_utf8

Run the next cell to mount Google Drive so the data files can be copied from it.

In [None]:
# Mount Google Drive under /content/drive; the zipped dataset is read from there (see `path_src`).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Alternatively, run the next cell to upload the data files from your local machine.

In [None]:
# Upload files from the local machine through google.colab; the return value is kept in `uploaded`.
from google.colab import files
uploaded = files.upload()

# 🗂️ Data

In [6]:
# Base name shared by the zip archive and the CSV inside it.
file = 'wine_reviews_cab6_transl2'
# Local directory the archive is extracted into.
path = 'sample_data'
# Drive folder holding the archive. The backslash escapes the blank for the shell
# (`!unzip` interpolates this value). A raw string keeps that backslash literal
# instead of relying on Python tolerating the invalid escape sequence `\ `.
path_src = r'/content/drive/MyDrive/Colab\ Notebooks/'

In [7]:
!unzip {path_src}/{file}.zip -d {path}

Archive:  /content/drive/MyDrive/Colab Notebooks//wine_reviews_cab6_transl2.zip
  inflating: sample_data/wine_reviews_cab6_transl2.csv  


In [8]:
import pandas as pd

# Read the extracted CSV (the `date` column is parsed as datetimes) and preview it.
f = '/'.join((path, file + '.csv'))
df_reviews = pd.read_csv(f, parse_dates=['date'])
df_reviews.head()

Unnamed: 0,id,wid,wine,date,review,lang,review_transl,review_len,length_group
0,0,0,Casillero del Diablo Cabernet Sauvignon (Reserva),2024-01-24,Little too cherry on the front end for me,en,Little too cherry on the front end for me,41,> 10
1,1,0,Casillero del Diablo Cabernet Sauvignon (Reserva),2024-01-20,"En liten skarp knekk i smaken. Ok fredagsvin,m...",no,A small sharp crack in the taste. Ok Friday wi...,62,> 10
2,2,0,Casillero del Diablo Cabernet Sauvignon (Reserva),2024-01-19,Aight,en,Aight,5,<= 10
3,3,0,Casillero del Diablo Cabernet Sauvignon (Reserva),2024-01-18,무난한 맛 가성비 좋은듯,ko,Good taste and good value for money,35,> 10
4,4,0,Casillero del Diablo Cabernet Sauvignon (Reserva),2024-01-17,oak cherry black cherry chocolate blackcurrant...,en,oak cherry black cherry chocolate blackcurrant...,77,> 10


## Review data

In [9]:
# Number of reviews per (wid, wine); `wine` is moved back to a column so the frame is indexed by wid.
df = (
    df_reviews
    .groupby(by=['wid', 'wine'])['id']
    .count()
    .rename('count')
    .reset_index(level=1)
)
# Wines with more than 100 reviews.
swid = df[df['count'].gt(100)]
swid

Unnamed: 0_level_0,wine,count
wid,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Casillero del Diablo Cabernet Sauvignon (Reserva),472
1,Yellow Tail Cabernet Sauvignon,136
3,San Pedro Gato Negro Cabernet Sauvignon,879


In [10]:
# Keep only the reviews of the wines selected in `swid` (which is indexed by wid).
df_reviews = df_reviews[df_reviews['wid'].isin(swid.index)]

In [11]:
# The position of a document in `docs` is its id in the topic model (topic_model.topics_),
# so DO NOT reorder or filter this list afterwards.
docs = df_reviews['review_transl'].tolist()

# 🗨️ **BERTopic**

## Params

### Embedding

In [44]:
# Sentence-transformers model id (12 layers, hidden size 384).
st_id = "all-MiniLM-L12-v2"

### Dimensionality Reduction

In [45]:
# UMAP settings.
n_components, n_neighbors = 15, 10

# min_dist: the minimum distance points are allowed to have in the low dimensional
# representation. Low values give clumpier embeddings, which helps clustering and
# finer topological structure; larger values stop UMAP from packing points together
# and favour the broad topological structure instead.
min_dist = 0  # alternative: 0.1

random_state = 42

### Clustering

In [46]:
# HDBSCAN settings.
# Lowering min_cluster_size generates more topics.
min_cluster_size = 100  # alternative: 50

# min_samples: when unspecified the implementation falls back to min_cluster_size.
# Larger values make the clustering more conservative -- more points are declared
# noise and clusters are restricted to progressively denser areas.
# Here it is derived as the fraction `r` of min_cluster_size (alternative: min_samples = None).
r = 0.6
min_samples = round(r * min_cluster_size)

# Needed to avoid an AttributeError when plugging a custom HDBSCAN step into BERTopic.
prediction_data = True

# Can improve the resultant clusters.
gen_min_span_tree = True

### BERTopic

In [47]:
# Number of top words taken from the combined documents of a cluster.
top_n_words = 5  # alternative: 10

# True  -> probabilities of all topics per document;
# False -> only the probability of the assigned topic.
# Computing all of them can slow down topic extraction on large corpora (> 100_000 documents).
calculate_probabilities = False

## **Sub-models**

In [None]:
from sentence_transformers import SentenceTransformer

# Load the sentence-transformers model named by `st_id` and embed every document once;
# `embeddings` is reused by the 2-D UMAP projection and by BERTopic's fit_transform.
embedding_model = SentenceTransformer(st_id)
embeddings = embedding_model.encode(docs, show_progress_bar=True)

In [49]:
from umap import UMAP

# Dimensionality-reduction sub-model handed to BERTopic (cosine metric, seeded).
umap_model = UMAP(
    metric='cosine',
    min_dist=min_dist,
    n_neighbors=n_neighbors,
    n_components=n_components,
    random_state=random_state,
)

In [19]:
# 2-D projection of the document embeddings, computed up front for visualization only.
reduced_embeddings = UMAP(
    n_components=2,
    n_neighbors=n_neighbors,
    min_dist=min_dist,
    metric='cosine',
    random_state=random_state,
).fit_transform(embeddings)

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


In [66]:
from hdbscan import HDBSCAN

# Clustering sub-model handed to BERTopic; sizes and flags come from the "Clustering" parameters.
hdbscan_model = HDBSCAN(
    metric='euclidean',
    cluster_selection_method='eom',
    min_samples=min_samples,
    min_cluster_size=min_cluster_size,
    gen_min_span_tree=gen_min_span_tree,
    prediction_data=prediction_data,
)

In [67]:
from bertopic.representation import KeyBERTInspired

# Additional topic representation, registered under the "KeyBERT" aspect name.
keybert = KeyBERTInspired()
representation_model = dict(KeyBERT=keybert)

Pass a `CountVectorizer` before training the topic model to minimize the size of the resulting c-TF-IDF matrix:

In [60]:
# min_df given as a fraction of the documents; display the corpus size next to
# the absolute document count that fraction corresponds to.
min_df = 0.01
(len(docs), min_df * len(docs))

(1487, 14.870000000000001)

In [61]:
from sklearn.feature_extraction.text import CountVectorizer

# Vectorizer used during training (set `vectorizer_model = None` to pass it only after training).
vectorizer_model = CountVectorizer(
    min_df=min_df,
    ngram_range=(1, 3),
    stop_words="english",
)

## 🔥 **Training**

Now that we have our models prepared, we can start training our topic model! We supply BERTopic with the sub-models of interest, run `.fit_transform`, and see what kind of topics we get.

In [68]:
from bertopic import BERTopic

# Assemble BERTopic from the settings and the sub-models prepared in the cells above.
topic_model = BERTopic(
    top_n_words=top_n_words,
    calculate_probabilities=calculate_probabilities,
    verbose=True,

    # Sub-models
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,
    hdbscan_model=hdbscan_model,
    umap_model=umap_model,
    embedding_model=embedding_model,
)

# Fit on the documents, reusing the embeddings computed earlier.
topics, probs = topic_model.fit_transform(docs, embeddings=embeddings)

2024-02-06 00:46:11,967 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-02-06 00:46:19,630 - BERTopic - Dimensionality - Completed ✓
2024-02-06 00:46:19,636 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-02-06 00:46:19,809 - BERTopic - Cluster - Completed ✓
2024-02-06 00:46:19,819 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-02-06 00:46:22,488 - BERTopic - Representation - Completed ✓


In [63]:
# Number of topics, not counting the outlier topic -1.
# Counting the ids explicitly stays correct when no document ended up as an outlier;
# `len(...) - 1` would under-count by one in that case.
sum(1 for topic_id in topic_model.get_topics() if topic_id != -1)

2

In [64]:
# Show the topic overview table (first 7 rows)
topic_model.get_topic_info().head(7)

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,Representative_Docs
0,-1,693,-1_good_cabernet_wine_medium,"[good, cabernet, wine, medium, red]","[good, great, excellent, best, bad, decent, ni...","[Good, Very good, Good]"
1,0,448,0_oak_cherry_chocolate_fruity,"[oak, cherry, chocolate, fruity, good]","[oak cherry, black cherry, cherry black cherry...","[oak cherry black cherry, oak cherry black che..."
2,1,346,1_wine_good_red_good wine,"[wine, good, red, good wine, price]","[good wine, excellent wine, nice wine, wine go...","[Good wine!, A good wine!, Very good wine]"


## 💦 **Post-processing**

### Vectorizer
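Alternatively, pass the `CountVectorizer` after training, so that any LLM-based step used during training sees the full context. => ??? (open question: confirm whether this applies here)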

In [43]:
# Report whether `vectorizer_model` already exists, i.e. whether it was passed before training.
# Only NameError is caught: it is the error an undefined name raises, whereas a bare
# `except` would also swallow unrelated failures such as KeyboardInterrupt.
try:
    vectorizer_model
except NameError:
    print('passing vectorizer_model ater training')
else:
    print('vectorizer_model assigned before!')

vectorizer_model assigned before!


In [27]:
# Fetch NLTK's 'stopwords' corpus; a later cell reads it through nltk.corpus.stopwords.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [25]:
# Extra stop words appended to the NLTK list,
# e.g. ['cabernet', 'sauvignon', 'cab', 'wine']; none by default.
stop_additional = list()

ngram_range = (1, 3)

# min_df ("cut-off" in the literature): terms whose document frequency is strictly
# lower than this threshold are left out of the vocabulary.
# A float is read as a proportion of documents, an integer as an absolute count.
min_df = 10

In [37]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

# Fine-tune topic representations after training BERTopic.
# The CountVectorizer can be passed before and after training the topic model;
# passing it before training minimizes the size of the resulting c-TF-IDF matrix.

# NLTK English stop words plus the user-supplied additions. The list gets its own
# name so that the imported `stopwords` corpus reader is not replaced by a list.
stop_words = list(stopwords.words('english')) + stop_additional

vectorizer_model = CountVectorizer(stop_words=stop_words, ngram_range=ngram_range, min_df=min_df)

topic_model.update_topics(docs, vectorizer_model=vectorizer_model, top_n_words=top_n_words)

In [38]:
# Topic overview table after update_topics (first 7 rows)
topic_model.get_topic_info().head(7)

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,Representative_Docs
0,-1,693,-1_good_wine_cabernet_medium,"[good, wine, cabernet, medium, red]","[strawberry, ruby, flavor, fruit, tasty, fruit...",[Full. Taste of plum and vanilla. Good for the...
1,0,448,0_oak_cherry_chocolate_fruity,"[oak, cherry, chocolate, fruity, good]","[strawberry, berries, cherries, flavor, fruit,...","[chocolate, oak, vanilla, cherry and blackberr..."
2,1,346,1_wine_good_red_good wine,"[wine, good, red, good wine, price]","[wine, wines, alcohol, bottle, drink, deliciou...","[Good wine!, Good wine!!, Light wine, good val..."


### Outlier reduction

In [None]:
# Reassign outlier documents with the "c-TF-IDF" strategy;
# `threshold` is the minimum similarity.
new_topics = topic_model.reduce_outliers(docs, topics, strategy="c-tf-idf", threshold=0.1)

# How many documents are still outliers (negative topic id)?
sum(1 for topic_id in new_topics if topic_id < 0)

6455

In [None]:
# Starting assignment for the reduction (use `t = new_topics` to chain on a previous strategy).
t = topics

new_topics = topic_model.reduce_outliers(docs, t, strategy="distributions", threshold=0.5)

# How many documents are still outliers (negative topic id)?
sum(1 for topic_id in new_topics if topic_id < 0)

100%|██████████| 7/7 [00:22<00:00,  3.25s/it]


6231

In [None]:
# Starting assignment for the reduction (use `t = new_topics` to chain on a previous strategy).
t = topics
new_topics = topic_model.reduce_outliers(docs, t, strategy="embeddings", threshold=0.5)

# How many documents are still outliers (negative topic id)?
sum(1 for topic_id in new_topics if topic_id < 0)

1988

In [None]:
# Starting assignment for the reduction (use `t = new_topics` to chain on a previous strategy).
t = topics

# With strategy="probabilities" the threshold is the minimum probability.
# NOTE(review): `probs` comes from a model trained with calculate_probabilities=False;
# this strategy presumably needs the full document-topic probabilities -- confirm.
new_topics = topic_model.reduce_outliers(
    docs, t, strategy="probabilities", probabilities=probs, threshold=0.05
)

# How many documents are still outliers (negative topic id)?
sum(1 for topic_id in new_topics if topic_id < 0)

6678

#### update with new_topics

In [None]:
# Recompute the topic representations for the reassigned documents.
# update_topics() falls back to its own defaults for every model or setting that is
# not passed, so the notebook's vectorizer, representation model and top_n_words are
# handed over again to keep the representations comparable with the trained ones.
topic_model.update_topics(
    docs,
    topics=new_topics,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,
    top_n_words=top_n_words,
)



# 📊 Visualization

In [None]:
# Hover labels: the first 100 characters of each document.
titles = [doc[:100] for doc in docs]
topics_to_visualize = range(20)

# Plot the documents on the precomputed 2-D projection (custom_labels left at its default).
topic_model.visualize_documents(
    titles,
    topics=topics_to_visualize,
    reduced_embeddings=reduced_embeddings,
    hide_document_hover=False,
    hide_annotations=True,
)

## Topics per Class

In [None]:
# One class label (the wine name) per document, in the same order as `docs`.
classes = df_reviews['wine'].tolist()
topics_per_class = topic_model.topics_per_class(docs, classes=classes)

3it [00:00, 16.52it/s]


In [None]:
# Options shared by the topics-per-class plots.
custom_labels, normalize_frequency = True, False

In [None]:
# Topic frequencies per wine.
topic_model.visualize_topics_per_class(
    topics_per_class,
    top_n_topics=10,
    custom_labels=custom_labels,
    normalize_frequency=normalize_frequency,
    height=500,
    width=1000,
)

Share of reviews

In [None]:
# Number of documents per class (wine).
total_freq = df_reviews.groupby('wine')['id'].count().to_dict()


def _share_of_class(row):
    # Topic frequency relative to the number of reviews of the same wine.
    return row.Frequency / total_freq[row.Class]


df = topics_per_class.assign(Frequency=topics_per_class.apply(_share_of_class, axis=1))

# Display the sum of the per-class totals next to the number of documents.
sum(total_freq.values()), len(docs)

(1487, 1487)

In [None]:
# Per-wine document counts (the denominators of the review shares).
total_freq

{'Casillero del Diablo Cabernet Sauvignon (Reserva)': 472,
 'San Pedro Gato Negro Cabernet Sauvignon': 879,
 'Yellow Tail Cabernet Sauvignon': 136}

In [None]:
# Same plot, with each frequency expressed as a share of the wine's reviews (`df`).
topic_model.visualize_topics_per_class(
    df,
    top_n_topics=10,
    custom_labels=custom_labels,
    normalize_frequency=normalize_frequency,
    height=500,
    width=1000,
)

# 🎚️ **Parameter Study**

In [None]:
#-- Embedding
st_id = "all-MiniLM-L12-v2"

#-- UMAP
n_components, n_neighbors = 15, 10

# min_dist: the minimum distance points are allowed to have in the low dimensional
# representation. Low values give clumpier embeddings, which helps clustering and
# finer topological structure; larger values stop UMAP from packing points together
# and favour the broad topological structure instead.
min_dist = 0  # alternative: 0.1

random_state = 42

#-- HDBSCAN
# Lowering min_cluster_size generates more topics.
min_cluster_size = 100  # alternative: 50

# min_samples: when unspecified the implementation falls back to min_cluster_size.
# Larger values make the clustering more conservative -- more points are declared
# noise and clusters are restricted to progressively denser areas.
# Here it is derived as the fraction `r` of min_cluster_size (alternative: min_samples = None).
r = 0.6
min_samples = round(r * min_cluster_size)

# Needed to avoid an AttributeError when plugging a custom HDBSCAN step into BERTopic.
prediction_data = True

# Can improve the resultant clusters.
gen_min_span_tree = True


#-- BERTopic
# Number of top words taken from the combined documents of a cluster.
top_n_words = 5  # alternative: 10

# True  -> probabilities of all topics per document;
# False -> only the probability of the assigned topic.
# Computing all of them can slow down topic extraction on large corpora (> 100_000 documents).
calculate_probabilities = False

In [71]:
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic.representation import KeyBERTInspired
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic

def train_bertopic(docs,
                   min_df, # CountVectorizer
                   n_components, n_neighbors, min_dist, random_state, # UMAP
                   min_cluster_size, min_samples, # HDBSCAN
                   st_id = 'all-MiniLM-L12-v2',
                   ngram_range=(1, 3),
                   show_progress_bar=False,
                   prediction_data=True,
                   gen_min_span_tree=True,
                   top_n_words=5,
                   calculate_probabilities=False,
                   verbose=False,
                   embeddings=None
                   ):
    """Train a BERTopic model on `docs` and return its topic overview.

    Parameters
    ----------
    docs : documents to fit the topic model on.
    min_df : `min_df` of the CountVectorizer.
    n_components, n_neighbors, min_dist, random_state : UMAP settings.
    min_cluster_size, min_samples : HDBSCAN settings.
    st_id : name of the SentenceTransformer model to load.
    ngram_range : `ngram_range` of the CountVectorizer.
    show_progress_bar : forwarded to `SentenceTransformer.encode`.
    prediction_data, gen_min_span_tree : forwarded to HDBSCAN.
    top_n_words, calculate_probabilities, verbose : forwarded to BERTopic.
    embeddings : optional precomputed embeddings of `docs`. When given, the
        documents are not encoded again, which lets a parameter sweep reuse one
        set of embeddings. They are expected to come from the `st_id` model.
        With the default `None` the documents are encoded on every call.

    Returns
    -------
    The result of `topic_model.get_topic_info()` for the fitted model.
    """

    #-- sub-models
    embedding_model = SentenceTransformer(st_id)
    if embeddings is None:
        # Encoding does not depend on any of the swept settings; it is only done
        # here when the caller did not supply embeddings.
        embeddings = embedding_model.encode(docs, show_progress_bar=show_progress_bar)

    vectorizer_model = CountVectorizer(stop_words="english", ngram_range=ngram_range, min_df=min_df)

    umap_model = UMAP(
        n_components=n_components,
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        metric='cosine', random_state=random_state)

    hdbscan_model = HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        metric='euclidean',
        cluster_selection_method='eom',
        prediction_data=prediction_data,
        gen_min_span_tree=gen_min_span_tree
        )

    # additional representation, stored under the "KeyBERT" aspect
    representation_model = {
        "KeyBERT": KeyBERTInspired()
    }

    #-- train bertopic
    topic_model = BERTopic(
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        representation_model=representation_model,

        top_n_words=top_n_words,
        calculate_probabilities=calculate_probabilities,
        verbose=verbose
    )

    # Train model (the returned topics/probabilities are not needed here)
    topic_model.fit_transform(docs, embeddings)

    # Show topics
    return topic_model.get_topic_info()

In [None]:


# NOTE(review): `params` is not defined in any cell visible in this notebook and the
# loop variable `p` is not used -- define the sweep values (and use `p`) before running.
for p in params:
    # Arguments are passed by keyword: train_bertopic() expects `min_df` right after
    # `docs` and takes `st_id` as a keyword option, so passing `st_id` second
    # positionally shifted every following value onto the wrong parameter.
    t = train_bertopic(docs,
                       min_df=min_df,
                       n_components=n_components, n_neighbors=n_neighbors,
                       min_dist=min_dist, random_state=random_state,
                       min_cluster_size=min_cluster_size, min_samples=min_samples,
                       st_id=st_id)

In [87]:
# Fixed settings for the study.
min_df = 0.01
n_components = 15
n_neighbors = 10
min_dist = 0
random_state = 42
min_cluster_size = 100
min_samples = None

# min_cluster_size values to compare
param_set = [50, 100]

# One single-row frame (topic 0) per case. The frames are concatenated once after
# the loop instead of growing a DataFrame inside it.
rows = []
#for i, min_cluster_size in tqdm(enumerate(param_set)):
for i, min_cluster_size in enumerate(param_set):
    # `df` keeps the full topic table of the latest run: the cells below filter it
    # by `Topic`, so it must not be replaced by the reduced topic-0 row.
    df = train_bertopic(docs,
                    min_df, # CountVectorizer
                    n_components, n_neighbors, min_dist, random_state, # UMAP
                    min_cluster_size, min_samples)

    # Count the real topics only; the outlier topic -1 may or may not be present.
    n = int((df.Topic != -1).sum())
    print(f'{i}) num of topics: {n}')

    rows.append(df.loc[df.Topic==0, ['Count', 'Representation']].assign(case=i))

df_cs = pd.concat(rows) if rows else pd.DataFrame()

df_cs

1it [00:54, 54.41s/it]

num of topics: 6


2it [02:08, 64.42s/it]

num of topics: 2





Unnamed: 0,Count,Representation,case
1,345,"[wine, good, good wine, red, price]",0
1,497,"[oak, cherry, chocolate, fruity, red]",1


In [84]:
# Print topic 0 (count and representation) as a markdown table.
print(df.loc[df['Topic'].eq(0), ['Count', 'Representation']].to_markdown())

|    |   Count | Representation                                  |
|---:|--------:|:------------------------------------------------|
|  1 |     497 | ['oak', 'cherry', 'chocolate', 'fruity', 'red'] |


In [86]:
# Topic 0 (count and representation) tagged with a `case` column.
df.loc[df['Topic'].eq(0), ['Count', 'Representation']].assign(case=1)

Unnamed: 0,Count,Representation,case
1,497,"[oak, cherry, chocolate, fruity, red]",1


TODO: check the "Set Params" section in https://github.com/lbk209/gradient_boosting/blob/main/02s_training_signals_LgbmTuning.ipynb