<a href="https://colab.research.google.com/github/lbk209/topic_modeling/blob/main/tm_wine_reviews_params.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setting

In [1]:
%%capture
!pip install bertopic accelerate adjustText

In [1]:
import os
import pandas as pd
import plotly.express as px
from tqdm import tqdm

In [2]:
# to work with path name having blank
import locale
locale.getpreferredencoding = lambda: "UTF-8"

run to copy files from google drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


run to upload files from local

In [None]:
from google.colab import files
uploaded = files.upload()

# 🗂️ Data

In [3]:
file = 'wine_reviews'
path = 'sample_data'
path_src = '/content/drive/MyDrive/Colab\ Notebooks/'

In [4]:
!unzip {path_src}/{file}.zip -d {path}

Archive:  /content/drive/MyDrive/Colab Notebooks//wine_reviews.zip
replace sample_data/wine_reviews_240124.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [5]:
import os

files = [x for x in os.listdir(path) if x.startswith(file)]
files

['wine_reviews_240124.csv', 'wine_reviews_240207.csv']

In [6]:
import pandas as pd

df_reviews = pd.DataFrame()
for f in files:
    df = pd.read_csv(f'{path}/{f}', parse_dates=['date'])
    df_reviews = pd.concat([df_reviews, df])

df_reviews = df_reviews.reset_index(drop=True).rename_axis('id').reset_index()
df_reviews.head()

Unnamed: 0,id,wid,wine,date,review,source,lang,review_transl
0,0,0,Casillero del Diablo Cabernet Sauvignon (Reserva),2024-01-24,Little too cherry on the front end for me,vivino,en,Little too cherry on the front end for me
1,1,0,Casillero del Diablo Cabernet Sauvignon (Reserva),2024-01-20,"En liten skarp knekk i smaken. Ok fredagsvin,m...",vivino,no,A small sharp crack in the taste. Ok Friday wi...
2,2,0,Casillero del Diablo Cabernet Sauvignon (Reserva),2024-01-19,Aight,vivino,en,Aight
3,3,0,Casillero del Diablo Cabernet Sauvignon (Reserva),2024-01-18,무난한 맛 가성비 좋은듯,vivino,ko,Good taste and good value for money
4,4,0,Casillero del Diablo Cabernet Sauvignon (Reserva),2024-01-17,oak cherry black cherry chocolate blackcurrant...,vivino,en,oak cherry black cherry chocolate blackcurrant...


## Review data

In [7]:
import numpy as np

#df_reviews.loc[df_reviews.lang.str.contains('ERROR')]

cond = df_reviews.review_transl.isna()
df_reviews.loc[cond]

Unnamed: 0,id,wid,wine,date,review,source,lang,review_transl
1745,1745,7,Kendall-Jackson Vintner's Reserve Cabernet Sau...,2023-03-18,:-),vivino,<-- ERROR -->,
2019,2019,9,Frontera Cabernet Sauvignon,2022-05-21,😡,vivino,<-- ERROR -->,
2065,2065,9,Frontera Cabernet Sauvignon,2022-01-24,🤌🏻,vivino,<-- ERROR -->,
2352,2352,12,The 7th Generation - G7 Chardonnay,2022-01-31,🌵🍏🥝🧈,vivino,<-- ERROR -->,
2556,2556,13,San Pedro Gato Negro Chardonnay,2022-04-27,… …. ….,vivino,<-- ERROR -->,
2941,2941,18,Montes Montes Alpha Cabernet Sauvignon,2022-12-19,♡♡♡,vivino,<-- ERROR -->,
3057,3057,18,Montes Montes Alpha Cabernet Sauvignon,2021-12-24,.,vivino,<-- ERROR -->,


In [8]:
df_reviews = df_reviews.loc[~cond]

In [9]:
#df_reviews.groupby(['wid', 'wine']).review_transl.count()
df_reviews.groupby(['wid', 'wine']).id.count()

wid  wine                                                
0    Casillero del Diablo Cabernet Sauvignon (Reserva)       472
1    Yellow Tail Cabernet Sauvignon                          136
2    Roche Mazet Cuvée Spéciale Cabernet Sauvignon            61
3    San Pedro Gato Negro Cabernet Sauvignon                 879
4    Aguirre Dos Copas Cabernet Sauvignon                      7
5    The 7th Generation - G7 Cabernet Sauvignon               26
6    Casillero del Diablo Chardonnay (Reserva)               122
7    Kendall-Jackson Vintner's Reserve Cabernet Sauvignon    121
8    Viña Santa Helena Reservado Cabernet Sauvignon'          51
9    Frontera Cabernet Sauvignon                             207
10   Long Barn Chardonnay                                    142
11   Cono Sur Bicicleta Reserva Unoaked Chardonnay           109
12   The 7th Generation - G7 Chardonnay                       42
13   San Pedro Gato Negro Chardonnay                         230
14   Roche Mazet Cuvée Spéciale 

In [10]:
df = df_reviews.groupby(by=['wid','wine']).id.count().rename('count').reset_index(1)
swid = df.loc[df['count']>100]
swid

Unnamed: 0_level_0,wine,count
wid,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Casillero del Diablo Cabernet Sauvignon (Reserva),472
1,Yellow Tail Cabernet Sauvignon,136
3,San Pedro Gato Negro Cabernet Sauvignon,879
6,Casillero del Diablo Chardonnay (Reserva),122
7,Kendall-Jackson Vintner's Reserve Cabernet Sau...,121
9,Frontera Cabernet Sauvignon,207
10,Long Barn Chardonnay,142
11,Cono Sur Bicicleta Reserva Unoaked Chardonnay,109
13,San Pedro Gato Negro Chardonnay,230
15,Yellow Tail Chardonnay,122


In [11]:
df_reviews = df_reviews.loc[df_reviews.wid.isin(swid.index)]

In [12]:
# DO NOT CHANGE the (document) id as it is index to topics of topic model (topic_model.topics_)
docs = df_reviews.review_transl.tolist()

# 🗨️ **BERTopic**

## Params

### Embedding

In [13]:
# 12-layer, 384-hidden
st_id = 'all-MiniLM-L12-v2'

### Dimensionality Reduction

In [14]:
n_components = 15
n_neighbors = 10

# the minimum distance apart that points are allowed to be in the low dimensional representation.
# This means that low values of min_dist will result in clumpier embeddings.
# This can be useful if you are interested in clustering, or in finer topological structure.
# Larger values of min_dist will prevent UMAP from packing points together and will focus on the preservation of the broad topological structure instead.
min_dist = 0 #0.1

random_state=42

### Clustering

In [16]:
# a lower min_cluster_size will generate more topics
min_cluster_size = 50

# The implementation defaults this value (if it is unspecified) to whatever min_cluster_size is set to.
# The larger the value of min_samples you provide, the more conservative the clustering – more points will be declared as noise,
# and clusters will be restricted to progressively more dense area
#min_samples = None
r = 0.6
min_samples = round(min_cluster_size*r)

# We need this to avoid an AttributeError when integrating our custom HDBSCAN step with BERTopic
prediction_data=True

# can improve the resultant clusters
gen_min_span_tree=True

### BERTopic

In [17]:
# top n words in combined documents in a cluster
top_n_words = 5 #10

# Calculate the probabilities of all topics per document instead of the probability of the assigned topic per document.
# This could slow down the extraction of topics if you have many documents (> 100_000).
calculate_probabilities=False

## **Sub-models**

In [18]:
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer(st_id)
embeddings = embedding_model.encode(docs, show_progress_bar=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Batches:   0%|          | 0/87 [00:00<?, ?it/s]

In [19]:
from umap import UMAP

umap_model = UMAP(
    n_components=n_components,
    n_neighbors=n_neighbors,
    min_dist=min_dist,
    metric='cosine', random_state=random_state)

In [20]:
# Pre-reduce embeddings for visualization purposes
reduced_embeddings = UMAP(n_neighbors=n_neighbors, n_components=2, min_dist=min_dist, metric='cosine', random_state=random_state).fit_transform(embeddings)

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


In [21]:
from hdbscan import HDBSCAN

hdbscan_model = HDBSCAN(
    min_cluster_size=min_cluster_size,
    min_samples=min_samples,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=prediction_data,
    gen_min_span_tree=gen_min_span_tree
    )

In [22]:
from bertopic.representation import KeyBERTInspired

keybert = KeyBERTInspired()

representation_model = {
    "KeyBERT": keybert
}

CountVectorizer before training the topic model to minimize the size of the resulting c-TF-IDF matrix:

In [23]:
min_df = 0.001
len(docs), min_df * len(docs)

(2760, 2.7600000000000002)

In [24]:
from sklearn.feature_extraction.text import CountVectorizer

#vectorizer_model = None
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 3), min_df=min_df)

## 🔥 **Training**

Now that we have our models prepared, we can start training our topic model! We supply BERTopic with the sub-models of interest, run `.fit_transform`, and see what kind of topics we get.

In [25]:
from bertopic import BERTopic

topic_model = BERTopic(
  # Sub-models
  embedding_model=embedding_model,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
  representation_model=representation_model,
  vectorizer_model=vectorizer_model,

  top_n_words=top_n_words,
  calculate_probabilities=calculate_probabilities,
  verbose=True
)

# Train model
topics, probs = topic_model.fit_transform(docs, embeddings)

2024-02-07 10:16:28,985 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-02-07 10:16:47,455 - BERTopic - Dimensionality - Completed ✓
2024-02-07 10:16:47,457 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-02-07 10:16:47,671 - BERTopic - Cluster - Completed ✓
2024-02-07 10:16:47,678 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-02-07 10:16:50,375 - BERTopic - Representation - Completed ✓


In [26]:
# num of topics
len(topic_model.get_topics()) - 1

2

In [27]:
# Show topics
topic_model.get_topic_info().head(7)

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,Representative_Docs
0,-1,96,-1_good_good good_gato_gato negro,"[good, good good, gato, gato negro, negro]","[good, good good, good good good, best, good l...","[Very good, Very good, Good]"
1,0,2573,0_wine_good_oak_taste,"[wine, good, oak, taste, red]","[good wine, wine, good, alcohol, bad, good val...","[Good wine!!, Good wine., A good wine.]"
2,1,91,1_chardonnay_citrus_acidity_good,"[chardonnay, citrus, acidity, good, tropical]","[nice chardonnay, chardonnay, good chardonnay,...","[Fruity Chardonnay, Nice chardonnay, Not my Ch..."


## 💦 **Post-processing**

### Vectorizer
Pass the CountVectorizer after training where llm used full context in training => ???

In [28]:
try:
    vectorizer_model
    print('vectorizer_model assigned before!')
except:
    print('passing vectorizer_model ater training')

vectorizer_model assigned before!


In [80]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [81]:
#stop_additional = ['cabernet', 'sauvignon', 'cab', 'wine']
stop_additional = []

ngram_range = (1, 3)

# When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold.
# This value is also called cut-off in the literature.
# If float, the parameter represents a proportion of documents, integer absolute counts
min_df = 10

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

# Fine-tune topic representations after training BERTopic
# you can pass the CountVectorizer before and after training your topic model.
# Passing it before training allows you to minimize the size of the resulting c-TF-IDF matrix

stopwords = list(stopwords.words('english')) + stop_additional

vectorizer_model = CountVectorizer(stop_words=stopwords, ngram_range=ngram_range, min_df=min_df)

topic_model.update_topics(docs, vectorizer_model=vectorizer_model, top_n_words=top_n_words)

In [83]:
topic_model.get_topic_info().head(7)

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,Representative_Docs
0,-1,580,-1_chardonnay_good_citrus_acidity,"[chardonnay, good, citrus, acidity, wine]","[good, great, best, nice, better, good cost, g...","[Good, Very good, Not my Chardonnay]"
1,0,1166,0_oak_vanilla_good_taste,"[oak, vanilla, good, taste, cherry]","[chocolate oak, oak blackberry, oak vanilla, o...",[oak cherry black cherry chocolate blackcurran...
2,1,781,1_wine_good_cabernet_red,"[wine, good, cabernet, red, price]","[good wine, wine good, nice wine, great wine, ...","[Very good wine, Good wine., Good wine!!]"
3,2,233,2_good_value_money_value money,"[good, value, money, value money, bad]","[good value money, good value, excellent value...","[Good value for money, Good value for money, V..."


### Outlier reduction

In [None]:
# Use the "c-TF-IDF" strategy with a threshold
# threshold is the minimum similarity.
new_topics = topic_model.reduce_outliers(docs, topics , strategy="c-tf-idf", threshold=0.1)

# count outliers
len([x for x in new_topics if x < 0])

6455

In [None]:
t = topics
#t = new_topics

new_topics = topic_model.reduce_outliers(docs, t , strategy="distributions", threshold=0.5)

# count outliers
len([x for x in new_topics if x < 0])

100%|██████████| 7/7 [00:22<00:00,  3.25s/it]


6231

In [None]:
t = topics
#t = new_topics
new_topics = topic_model.reduce_outliers(docs, t, strategy="embeddings", threshold=0.5)

# count outliers
len([x for x in new_topics if x < 0])

1988

In [None]:
t = topics
#t = new_topics

# the threshold is minimum probability when strategy="probabilities"
new_topics = topic_model.reduce_outliers(docs, t, strategy="probabilities", probabilities=probs, threshold=0.05)

# count outliers
len([x for x in new_topics if x < 0])

6678

#### update with new_topics

In [None]:
topic_model.update_topics(docs, topics=new_topics)



# 📊 Visualization

In [29]:
titles = [x[:100] for x in docs]
topics_to_visualize = range(20)

topic_model.visualize_documents(titles, reduced_embeddings=reduced_embeddings,
                                hide_annotations=True, hide_document_hover=False,
                                topics=topics_to_visualize,
                                #custom_labels=True
                                )

## Topics per Class

In [30]:
classes = df_reviews.wine.tolist()
topics_per_class = topic_model.topics_per_class(docs, classes=classes)

11it [00:00, 15.52it/s]


In [31]:
custom_labels = True
normalize_frequency = False

In [32]:
topic_model.visualize_topics_per_class(topics_per_class, top_n_topics=10,
                                       normalize_frequency = normalize_frequency,
                                       width=1000, height=500,
                                       custom_labels=custom_labels)

**Share of reviews**

In [33]:
total_freq = df_reviews.groupby('wine').count().id.to_dict() # num of documents per class

df = topics_per_class.assign(Frequency=topics_per_class.apply(lambda x: x.Frequency/total_freq[x.Class], axis=1))

sum([v for k,v in total_freq.items()]), len(docs)

(2760, 2760)

In [34]:
total_freq

{'Casillero del Diablo Cabernet Sauvignon (Reserva)': 472,
 'Casillero del Diablo Chardonnay (Reserva)': 122,
 'Cono Sur Bicicleta Reserva Unoaked Chardonnay': 109,
 'Frontera Cabernet Sauvignon': 207,
 "Kendall-Jackson Vintner's Reserve Cabernet Sauvignon": 121,
 'Long Barn Chardonnay': 142,
 'Montes Montes Alpha Cabernet Sauvignon': 220,
 'San Pedro Gato Negro Cabernet Sauvignon': 879,
 'San Pedro Gato Negro Chardonnay': 230,
 'Yellow Tail Cabernet Sauvignon': 136,
 'Yellow Tail Chardonnay': 122}

In [35]:
topic_model.visualize_topics_per_class(df, top_n_topics=10,
                                       normalize_frequency = normalize_frequency,
                                       width=1000, height=500,
                                       custom_labels=custom_labels)

# 🎚️ **Parameter Study**

In [None]:
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic.representation import KeyBERTInspired
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic


def bertopic_param_search(docs,
                   ### hyperparams
                   min_df=1, # CountVectorizer
                   n_components=15,
                   n_neighbors=10,
                   min_dist=0.1, # UMAP
                   min_cluster_size=10,
                   min_samples=None, # HDBSCAN
                   ####
                   embedding_model=None,
                   embeddings=None,
                   ngram_range=(1, 3),
                   prediction_data=True,
                   gen_min_span_tree=True,
                   top_n_words=5,
                   calculate_probabilities=False,
                   random_state=42,
                   verbose=False
                   ):

    #-- sub-models
    vectorizer_model = CountVectorizer(stop_words="english", ngram_range=ngram_range, min_df=min_df)

    umap_model = UMAP(
        n_components=n_components,
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        metric='cosine', random_state=random_state)

    hdbscan_model = HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        metric='euclidean',
        cluster_selection_method='eom',
        prediction_data=prediction_data,
        gen_min_span_tree=gen_min_span_tree
        )

    keybert = KeyBERTInspired()
    representation_model = {
        "KeyBERT": keybert
    }

    #-- train bertopic
    topic_model = BERTopic(
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        representation_model=representation_model,

        top_n_words=top_n_words,
        calculate_probabilities=calculate_probabilities,
        verbose=verbose
    )

    # Train model
    topics, probs = topic_model.fit_transform(docs, embeddings)

    # Show topics
    return topic_model.get_topic_info()


def get_topics(df, index, num_topics=10, cols = ['Topic', 'KeyBERT']):
    """
    get a row from df, the result of bertopic_param_search
    """
    return df.iloc[1:num_topics+1].loc[:, cols].rename(columns=dict(zip(cols, ['index', index]))).set_index('index').transpose()

## Set params

In [None]:
params_study = {
    # When CountVectorizer building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold.
    # If float, the parameter represents a proportion of documents, integer absolute counts
    'min_df': [0.001, 0.01],

    # If you are interested in (density based) clustering, or other machine learning techniques,
    # it can be beneficial to pick a larger embedding dimension (say 10, or 50) closer to the the dimension
    # of the underlying manifold on which your data lies.
    'n_components': [15, 30],

    # how UMAP balances local versus global structure in the data.
    # low values of n_neighbors will force UMAP to concentrate on very local structure (potentially to the detriment of the big picture)
    'n_neighbors': [10, 20],

    # the minimum distance apart that points are allowed to be in the low dimensional representation.
    # This means that low values of min_dist will result in clumpier embeddings.
    # This can be useful if you are interested in clustering, or in finer topological structure.
    # Larger values of min_dist will prevent UMAP from packing points together and will focus on the preservation of the broad topological structure instead.
    'min_dist': [0, 0.1],

    # a lower min_cluster_size will generate more topics
    'min_cluster_size': [20, 50],

    # The implementation defaults this value (if it is unspecified) to whatever min_cluster_size is set to.
    # The larger the value of min_samples you provide, the more conservative the clustering –
    # more points will be declared as noise, and clusters will be restricted to progressively more dense area
    # UPDATE as the values are ratio to min_cluster_size
    'min_samples': [0.5, 1.]
}

In [None]:
# Embedding model: 12-layer, 384-hidden
st_id = 'all-MiniLM-L12-v2'

params_base = {
    'ngram_range': (1, 3),
    'top_n_words': 5,
    'random_state': 42,
}

Product param sets

In [None]:
from itertools import product

param_names = params_study.keys()

param_values = params_study.values()
param_values = list(product(*param_values))

df_params = pd.DataFrame(param_values, columns=param_names)

try:
    df_params.min_samples = df_params.min_samples.mul(df_params.min_cluster_size).astype(int)
except:
    print('No min_samples')

df_params = df_params.reset_index(drop=True)
df_params.head(5)

Unnamed: 0,min_df,n_components,n_neighbors,min_dist,min_cluster_size,min_samples
0,0.001,15,10,0.0,20,10
1,0.001,15,10,0.0,20,20
2,0.001,15,10,0.0,50,25
3,0.001,15,10,0.0,50,50
4,0.001,15,10,0.1,20,10


Update base params with embedding model

In [None]:
show_progress_bar=False,

embedding_model = SentenceTransformer(st_id)
embeddings = embedding_model.encode(docs, show_progress_bar=show_progress_bar)

Batches:   0%|          | 0/47 [00:00<?, ?it/s]

In [None]:
params_base.update({
    'embedding_model': embedding_model,
    'embeddings': embeddings
})

## Search params

In [None]:
df_result = pd.DataFrame()

total = len(df_params)
#total = 3 # testing

for rec in tqdm(df_params.iloc[:total].itertuples(), total=total):
    idx = rec[0]
    kwargs = rec._asdict()
    kwargs.pop('Index', None)
    kwargs.update(params_base)
    df = bertopic_param_search(docs, **kwargs)
    df = get_topics(df, idx)
    df_result = pd.concat([df_result, df])

df_result = df_params.join(df_result)
df_result.head(5)

100%|██████████| 64/64 [15:44<00:00, 14.76s/it]


Unnamed: 0,min_df,n_components,n_neighbors,min_dist,min_cluster_size,min_samples,0,1,2,3,4,5,6,7,8,9
0,0.001,15,10,0.0,20,10,"[oak black cherry, oak cherry black, oak cherr...","[wines, good wine, wine, nice wine, excellent ...","[good value money, money good value, good valu...","[fruity, light fruity, fruity light fruity, fr...","[good, good good, good really, just awful good...","[chilean cabernet sauvignon, cabernet sauvigno...","[light smooth dry, dry good value, light dry, ...","[good wine, good wine good, nice wine, wine go...","[pleasant flavor texture, bitter taste, rounde...","[great, fantastic, good great, great good, goo..."
1,0.001,15,10,0.0,20,20,"[good wine, excellent wine, great wine, nice w...","[oak black cherry, oak cherry black, oak cherr...","[good good, good really, good, excellent, just...","[chilean cabernet sauvignon, cabernet sauvigno...","[smooth dry, little dry smooth, dry good value...","[fruity, light fruity, fruity light fruity, fr...","[moderately bitter heavy, moderately bitter, s...","[good value money, money good value, good valu...","[ruby color aromas, medium ruby color, ruby co...","[price easy drink, easy drinking, easy drink, ..."
2,0.001,15,10,0.0,50,25,"[good wine, excellent wine, nice wine, wine go...","[oak black cherry, oak cherry black, chocolate...","[fruity, fruity sweet, fruity good, sweet frui...","[good, great good, good great, good good, grea...","[cabernet sauvignon, chilean cabernet sauvigno...","[smooth dry, little dry smooth, dry good value...","[good value money, money good value, good valu...",,,
3,0.001,15,10,0.0,50,50,"[good wine, excellent wine, great wine, nice w...","[fruity, light fruity, fruity light, fruity go...","[oak black cherry, oak cherry black, chocolate...","[cabernet sauvignon, chilean cabernet sauvigno...","[bad, just marvelous, liked really bad, just a...","[good value money, money good value, good valu...",,,,
4,0.001,15,10,0.1,20,10,"[good wine, wine good, nice wine, excellent wi...","[oak black cherry, oak cherry black, chocolate...","[easy drinking, easy drink, price easy drink, ...","[good value money, money good value, good valu...","[fruity, light fruity, fruity light fruity, fr...","[good good, good really, good, excellent, just...","[cabernet sauvignon, chilean cabernet sauvigno...","[light dry, light smooth dry, light dry amazin...","[tasty, tasty tasty, tasty good, tasty good ta...","[great, excellent, good great, fantastic, grea..."


In [None]:
cond = df_result[0].isna()
print('num of param sets to be studied:', cond.sum())
df_result = df_result.loc[~cond]
#df_result

num of param sets to be studied: 0


In [None]:
# compare topics
topic_id = 0
num_topics = 5

_ = [print(f'{i:>2}:', ', '.join(rec[topic_id][:num_topics])) for i, rec in df_result.iterrows()]

 0: oak black cherry, oak cherry black, oak cherry, cherry oak, chocolate oak
 1: good wine, excellent wine, great wine, nice wine, wine good wine
 2: good wine, excellent wine, nice wine, wine good wine, great wine
 3: good wine, excellent wine, great wine, nice wine, wine good wine
 4: good wine, wine good, nice wine, excellent wine, great wine
 5: good wine, excellent wine, nice wine, wine good wine, wine good
 6: good wine, wine good, good value, wine, wines
 7: wine, wines, wine good, good wine, alcohol
 8: good wine, excellent wine, nice wine, wine good, great wine
 9: good wine, excellent wine, wine good, wine good wine, nice wine
10: good wine, wine good, wine, wines, sauvignon
11: black cherry, cherry blackcurrant, oak cherry, chocolate oak, oak chocolate
12: good wine, excellent wine, nice wine, wine good wine, great wine
13: good wine, wine good, excellent wine, nice wine, wine good wine
14: good wine, excellent wine, great wine, nice wine, wine good wine
15: good value, dec

In [None]:
file = 'wr_cab6_params_01'

path = 'sample_data'
path_src = '/content/drive/MyDrive/Colab\ Notebooks/'

f = f'{path}/{file}.csv'
df_result.to_csv(f, index = True)

!cp {f} {path_src}

## Review result

In [None]:
df_result.head()

Unnamed: 0,min_df,n_components,n_neighbors,min_dist,min_cluster_size,min_samples,0,1,2,3,4,5,6,7,8,9
0,0.001,15,10,0.0,20,10,"[oak black cherry, oak cherry black, oak cherr...","[wines, good wine, wine, nice wine, excellent ...","[good value money, money good value, good valu...","[fruity, light fruity, fruity light fruity, fr...","[good, good good, good really, just awful good...","[chilean cabernet sauvignon, cabernet sauvigno...","[light smooth dry, dry good value, light dry, ...","[good wine, good wine good, nice wine, wine go...","[pleasant flavor texture, bitter taste, rounde...","[great, fantastic, good great, great good, goo..."
1,0.001,15,10,0.0,20,20,"[good wine, excellent wine, great wine, nice w...","[oak black cherry, oak cherry black, oak cherr...","[good good, good really, good, excellent, just...","[chilean cabernet sauvignon, cabernet sauvigno...","[smooth dry, little dry smooth, dry good value...","[fruity, light fruity, fruity light fruity, fr...","[moderately bitter heavy, moderately bitter, s...","[good value money, money good value, good valu...","[ruby color aromas, medium ruby color, ruby co...","[price easy drink, easy drinking, easy drink, ..."
2,0.001,15,10,0.0,50,25,"[good wine, excellent wine, nice wine, wine go...","[oak black cherry, oak cherry black, chocolate...","[fruity, fruity sweet, fruity good, sweet frui...","[good, great good, good great, good good, grea...","[cabernet sauvignon, chilean cabernet sauvigno...","[smooth dry, little dry smooth, dry good value...","[good value money, money good value, good valu...",,,
3,0.001,15,10,0.0,50,50,"[good wine, excellent wine, great wine, nice w...","[fruity, light fruity, fruity light, fruity go...","[oak black cherry, oak cherry black, chocolate...","[cabernet sauvignon, chilean cabernet sauvigno...","[bad, just marvelous, liked really bad, just a...","[good value money, money good value, good valu...",,,,
4,0.001,15,10,0.1,20,10,"[good wine, wine good, nice wine, excellent wi...","[oak black cherry, oak cherry black, chocolate...","[easy drinking, easy drink, price easy drink, ...","[good value money, money good value, good valu...","[fruity, light fruity, fruity light fruity, fr...","[good good, good really, good, excellent, just...","[cabernet sauvignon, chilean cabernet sauvigno...","[light dry, light smooth dry, light dry amazin...","[tasty, tasty tasty, tasty good, tasty good ta...","[great, excellent, good great, fantastic, grea..."


In [None]:
# case 1: compapre topic 0's
res_df = df_result[0].str.join(', ')
res_docs = res_df.to_list()

In [None]:
# caes 2: compare models with 10 topics
n = range(10)

res_df = df_result[n].apply(lambda x: x.str.join(', ')).stack()

res_docs = []
for idx, s in res_df.items():
    if s is None:
        break
    i = '_'.join([str(x) for x in idx])
    res_docs.append(f'{i}_{s}')
len(res_docs)

456

In [None]:
random_state = 42

umap_model = UMAP(min_dist=0.0, random_state=random_state)

topic_model = BERTopic(
    umap_model = umap_model,
    calculate_probabilities=True
    )

topics, probs = topic_model.fit_transform(res_docs)

topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,80,0_good_really_great_bom,"[good, really, great, bom, surprisingly, just,...","[20_6_good surprisingly good, good good, good,..."
1,1,72,1_wine_nice_wines_excellent,"[wine, nice, wines, excellent, pleasant, great...","[22_0_good wine, excellent wine, nice wine, wi..."
2,2,56,2_oak_cherry_black_chocolate,"[oak, cherry, black, chocolate, vanilla, black...","[23_0_oak cherry, chocolate oak, black cherry,..."
3,3,50,3_cabernet_sauvignon_chilean_australian,"[cabernet, sauvignon, chilean, australian, dia...","[24_5_chilean cabernet sauvignon, cabernet sau..."
4,4,48,4_value_money_excellent_great,"[value, money, excellent, great, price, good, ...","[40_2_good value money, money good value, good..."
5,5,48,5_fruity_light_sweet_slightly,"[fruity, light, sweet, slightly, fresh, taste,...","[12_5_fruity, light fruity, fruity light fruit..."
6,6,40,6_dry_smooth_light_little,"[dry, smooth, light, little, watery, pretty, b...","[26_6_smooth dry, little dry smooth, dry good ..."
7,7,28,7_tasty_sour_bitter_taste,"[tasty, sour, bitter, taste, flavor, delicious...","[29_9_sour bitter, pleasant flavor texture, mo..."
8,8,20,8_drink_easy_drinkable_drinking,"[drink, easy, drinkable, drinking, drinks, par...","[28_9_inexpensive easy drink, price easy drink..."
9,9,14,9_ruby_color_medium_red,"[ruby, color, medium, red, aromas, intense, br...","[5_7_ruby color aromas, medium ruby color, rub..."


In [None]:
topic_model.visualize_topics()

In [None]:
#topic_model.visualize_distribution(probs[0])

In [None]:
from sentence_transformers import SentenceTransformer

min_dist=0.9

embedding_model = SentenceTransformer(st_id)
res_embeddings = embedding_model.encode(res_docs, show_progress_bar=False)

res_reduced = UMAP(n_components=2, random_state=random_state, min_dist=min_dist).fit_transform(res_embeddings)

In [None]:
d = res_docs
#d = [f'{i}: {rec[0]}' for i, rec in pd.DataFrame(res_df).iterrows()]

topic_model.visualize_documents(d, reduced_embeddings=res_reduced)

In [None]:
# Extract hierarchical topics and their representations
hierarchical_topics = topic_model.hierarchical_topics(res_docs)

# Visualize these representations
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

100%|██████████| 9/9 [00:00<00:00, 65.59it/s]


In [None]:
topic_model.visualize_barchart()

### Topics per Class

In [None]:
len(classes), len(res_docs)

(456, 456)

In [None]:
classes = res_df.index.get_level_values(0)

In [None]:
topics_per_class = topic_model.topics_per_class(res_docs, classes=classes)

In [None]:
topic_model.visualize_topics_per_class(topics_per_class,
                                       #top_n_topics=10,
                                       normalize_frequency = False,
                                       #width=1000, height=500
                                       )

In [None]:
ps = [28, 60]
cond = (topics_per_class.Topic == 0) & (topics_per_class.Class.isin(ps))
topics_per_class.loc[cond]

Unnamed: 0,Topic,Words,Frequency,Class,Name
186,0,"bom, good, great, really, mt",3,28,0_good_really_great_bom
397,0,"bom, good, great, really, mt",3,60,0_good_really_great_bom


In [None]:
ps = [28, 60]
df_result.loc[ps]

Unnamed: 0,min_df,n_components,n_neighbors,min_dist,min_cluster_size,min_samples,0,1,2,3,4,5,6,7,8,9
28,0.001,30,20,0.1,20,10,"[good wine, excellent wine, nice wine, wine go...","[oak black cherry, oak cherry black, chocolate...","[good value money, money good value, good valu...","[quite good, quite good good, good really like...","[chilean cabernet sauvignon, cabernet sauvigno...","[fruity, light fruity, fruity light fruity, fr...","[light smooth dry, smooth dry, light dry, over...","[bom, bom right, meh bom, bom love, bom bom, b...","[great, good great, fantastic, great good, goo...","[inexpensive easy drink, price easy drink, eas..."
60,0.01,30,20,0.1,20,10,"[good wine, excellent wine, nice wine, wine go...","[oak black cherry, oak cherry black, chocolate...","[good value money, money good value, good valu...","[quite good, quite good good, good really like...","[chilean cabernet sauvignon, cabernet sauvigno...","[fruity, light fruity, fruity light fruity, fr...","[light smooth dry, smooth dry, light dry, over...","[bom, bom right, meh bom, bom love, bom bom, b...","[great, good great, fantastic, great good, goo...","[inexpensive easy drink, price easy drink, eas..."


In [None]:
i = 28
topic_model.visualize_distribution(probs[i])