<a href="https://colab.research.google.com/github/lbk209/topic_modeling/blob/main/tm_wine_reviews_params.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setting

In [1]:
%%capture
!pip install bertopic accelerate adjustText

In [2]:
import os
import pandas as pd
import plotly.express as px
import numpy as np

from tqdm import tqdm

In [3]:
# Replace locale.getpreferredencoding with a function that always reports "UTF-8".
# Original note: to work with path name having blank.
# NOTE(review): presumably required by the `!` shell commands used further down
# (e.g. the unzip of a path containing a blank) - confirm.
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [4]:
import os
import pandas as pd

def read_csv(file, path_data, **kwargs):
    """
    Read every file in `path_data` whose name starts with `file` and stack
    the results into one DataFrame with a fresh 0..n-1 index.

    file: file-name prefix used to select the files
    path_data: directory that is listed (non-recursively) for matching files
    kwargs: keyword args for pd.read_csv

    Returns an empty DataFrame when no file matches the prefix.
    """
    # os.listdir returns names in arbitrary order; sort them so that the row
    # order (and any id derived from the row position) is reproducible.
    files = sorted(x for x in os.listdir(path_data) if x.startswith(file))
    if not files:
        return pd.DataFrame()

    # read all files first and concatenate once instead of growing a frame in the loop
    frames = [pd.read_csv(os.path.join(path_data, f), **kwargs) for f in files]
    return pd.concat(frames).reset_index(drop=True)


def print_topic_info(topic_model):
    """
    Print the number of topics and the share of outlier documents.

    topic_model: object whose get_topic_info() returns a DataFrame with the
        columns 'Topic' and 'Count'; the row with Topic == -1 (if present)
        is treated as the outlier row.

    The outlier row is excluded from the topic count, so the count is also
    correct when the model produced no outlier row at all.
    """
    df = topic_model.get_topic_info()

    is_outlier = df.Topic == -1
    n_topics = int((~is_outlier).sum())
    print(f'num of topics: {n_topics}')

    total = df['Count'].sum()
    n_outliers = df.loc[is_outlier, 'Count'].sum()
    # guard the division for an empty topic table
    ratio = n_outliers / total if total > 0 else 0
    print(f'outliers: {ratio:.3f}')


def count_wine(x):
    """
    Count the reviews of every wine.

    x: DataFrame with the columns 'wid', 'wine' and 'id'

    Returns a DataFrame indexed by 'wid' with the columns 'wine' and
    'count', where 'count' is the number of non-null 'id' values per
    ('wid', 'wine') pair.
    """
    per_wine = x.groupby(by=['wid', 'wine'])['id'].count()
    per_wine = per_wine.rename('count')
    # move 'wine' (second index level) back into a regular column
    return per_wine.reset_index(level=1)

In [5]:
def print_with_line_feed(input_string, line_length=50):
    """
    Print `input_string` word by word, starting a new line whenever the next
    word would push the current line beyond `line_length` characters.

    input_string: text to print; it is split on whitespace
    line_length: maximum number of characters counted per line (the
        two-space indent of continuation lines is not counted)

    Continuation lines are indented by two spaces and every word is followed
    by a single space. The first word always stays on the first line, even
    if it is longer than `line_length`, so no empty leading line is printed.
    """
    words = input_string.split()
    current_line_length = 0

    for position, word in enumerate(words):
        fits = current_line_length + len(word) <= line_length
        if fits or position == 0:
            print(word, end=" ")
            current_line_length += len(word) + 1  # +1 for the space
        else:
            print()  # Start a new line
            print(f'  {word}', end=" ")
            current_line_length = len(word) + 1

    print()  # Ensure the last line is printed

Run the next cell to mount Google Drive so that files can be copied from it.

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Alternatively, run the next cell to upload files from the local machine.

In [None]:
from google.colab import files
uploaded = files.upload()

# 🗂️ Data

In [6]:
# prefix of the review files (also the name of the zip archive without extension)
file = 'wine_reviews'
# directory the archive is extracted to and the CSV files are read from
path_data = 'sample_data'
# Raw string: the backslash is kept literally so that the shell (`!unzip`) sees
# an escaped blank; in a normal string '\ ' is an invalid escape sequence.
path_src = r'/content/drive/MyDrive/Colab\ Notebooks/'

In [7]:
!unzip {path_src}/{file}.zip -d {path_data}

Archive:  /content/drive/MyDrive/Colab Notebooks//wine_reviews.zip
  inflating: sample_data/wine_reviews_240124.csv  
  inflating: sample_data/wine_reviews_240207.csv  
  inflating: sample_data/wine_reviews_240212.csv  
  inflating: sample_data/wine_reviews_240213a.csv  
  inflating: sample_data/wine_reviews_240213b.csv  


In [8]:
df_reviews = read_csv(file, path_data, parse_dates=['date'])
df_reviews = df_reviews.rename_axis('id').reset_index()
df_reviews.head()

Unnamed: 0,id,wid,wine,date,review,source,lang,review_transl
0,0,0,Casillero del Diablo Cabernet Sauvignon (Reserva),2024-01-24,Little too cherry on the front end for me,vivino,en,Little too cherry on the front end for me
1,1,0,Casillero del Diablo Cabernet Sauvignon (Reserva),2024-01-20,"En liten skarp knekk i smaken. Ok fredagsvin,m...",vivino,no,A small sharp crack in the taste. Ok Friday wi...
2,2,0,Casillero del Diablo Cabernet Sauvignon (Reserva),2024-01-19,Aight,vivino,en,Aight
3,3,0,Casillero del Diablo Cabernet Sauvignon (Reserva),2024-01-18,무난한 맛 가성비 좋은듯,vivino,ko,Good taste and good value for money
4,4,0,Casillero del Diablo Cabernet Sauvignon (Reserva),2024-01-17,oak cherry black cherry chocolate blackcurrant...,vivino,en,oak cherry black cherry chocolate blackcurrant...


## Review data

**reviews not translated**

In [9]:
#df_reviews.loc[df_reviews.lang.str.contains('ERROR')]

cond = df_reviews.review_transl.isna()
df_reviews.loc[cond]

Unnamed: 0,id,wid,wine,date,review,source,lang,review_transl
5145,5145,7,Kendall-Jackson Vintner's Reserve Cabernet Sau...,2023-03-18,:-),vivino,<-- ERROR -->,
5419,5419,9,Frontera Cabernet Sauvignon,2022-05-21,😡,vivino,<-- ERROR -->,
5465,5465,9,Frontera Cabernet Sauvignon,2022-01-24,🤌🏻,vivino,<-- ERROR -->,
5752,5752,12,The 7th Generation - G7 Chardonnay,2022-01-31,🌵🍏🥝🧈,vivino,<-- ERROR -->,
5956,5956,13,San Pedro Gato Negro Chardonnay,2022-04-27,… …. ….,vivino,<-- ERROR -->,
6341,6341,18,Montes Montes Alpha Cabernet Sauvignon,2022-12-19,♡♡♡,vivino,<-- ERROR -->,
6457,6457,18,Montes Montes Alpha Cabernet Sauvignon,2021-12-24,.,vivino,<-- ERROR -->,


In [10]:
df_reviews = df_reviews.loc[~cond]

In [11]:
#df_reviews.groupby(['wid', 'wine']).review_transl.count()
df_reviews.groupby(['wid', 'wine']).id.count().tail()

wid  wine                                       
45   Cono Sur Single Vineyard Syrah                  31
46   Cono Sur Single Vineyard Cabernet Sauvignon     32
47   Cono Sur Single Vineyard Pinot Noir             40
48   Cono Sur Single Vineyard Carmenere              13
49   Cono Sur Single Vineyard Chardonnay            238
Name: id, dtype: int64

In [12]:
#df_reviews.loc[df_reviews.wid < 19].groupby(['wid', 'wine']).id.count().sum()
df_reviews.id.count()

7132

### Wine group

Check which wines have only a small number of reviews: the cell below keeps the wines with more than `cnt` reviews.

In [None]:
cnt = 50
df = count_wine(df_reviews)
list_wid = df.loc[df['count']>cnt].index

df_reviews_tmp = df_reviews.loc[df_reviews.wid.isin(list_wid)]
count_wine(df_reviews_tmp)

Unnamed: 0_level_0,wine,count
wid,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Casillero del Diablo Cabernet Sauvignon (Reserva),472
1,Yellow Tail Cabernet Sauvignon,136
2,Roche Mazet Cuvée Spéciale Cabernet Sauvignon,61
3,San Pedro Gato Negro Cabernet Sauvignon,879
6,Casillero del Diablo Chardonnay (Reserva),122
7,Kendall-Jackson Vintner's Reserve Cabernet Sau...,121
8,Viña Santa Helena Reservado Cabernet Sauvignon',51
9,Frontera Cabernet Sauvignon,207
10,Long Barn Chardonnay,142
11,Cono Sur Bicicleta Reserva Unoaked Chardonnay,109


the favourite red wine group

In [13]:
list_wid = [7, 18, 19, 20, 24, 25, 28, 42, 43, 44]

df_reviews_tmp = df_reviews.loc[df_reviews.wid.isin(list_wid)]
count_wine(df_reviews_tmp)

Unnamed: 0_level_0,wine,count
wid,Unnamed: 1_level_1,Unnamed: 2_level_1
7,Kendall-Jackson Vintner's Reserve Cabernet Sau...,121
18,Montes Montes Alpha Cabernet Sauvignon,220
19,Montes Montes Alpha Merlot,60
20,Montes Montes Alpha Syrah,81
24,Mollydooker The Boxer Shiraz,276
25,William Hill North Coast Cabernet Sauvignon,51
28,Bread & Butter Pinot Noir,1236
42,Two Hands Angels' Share Shiraz,82
43,Two Hands Gnarly Dudes Shiraz,102
44,Two Hands Sexy Beast Cabernet Sauvignon,82


moscato d'asti group

In [None]:
df = count_wine(df_reviews)
list_wid = df.loc[df.wine.str.lower().str.contains('moscato')].index

df_reviews_tmp = df_reviews.loc[df_reviews.wid.isin(list_wid)]
count_wine(df_reviews_tmp)

Unnamed: 0_level_0,wine,count
wid,Unnamed: 1_level_1,Unnamed: 2_level_1
30,Vietti Moscato d'Asti,41
32,Michele Chiarlo Moscato d'Asti Nivole,482
33,Canti Moscato d'Asti,71
34,Vallebelbo Moscato d'Asti,32
35,Balbi Soprani Moscato d'Asti,31
36,Gancia Moscato d'Asti N.V.,160
37,Tosti Moscato d'Asti,31
38,G.D. Vajra Moscato d'Asti,52
39,Castello del Poggio Moscato d'Asti,80
40,Arione Moscato d'Asti,32


#### Confirm the group

In [14]:
df_reviews = df_reviews_tmp
df_reviews.id.count()

2311

In [15]:
# DO NOT CHANGE the (document) id as it is index to topics of topic model (topic_model.topics_)
docs = df_reviews.review_transl.tolist()
docs[:3]

['Enjoyed it with chicken tikka masala as its soft tannins and fruit flavor complimented perfectly.',
 'Vintage 2021 very smooth, Well balanced with vanilla, blackberry',
 'oak plum tobacco Slightly bitter Worth of price Better and soft after air exposure']

# 🗨️ **BERTopic**

## Params

### Embedding

In [None]:
# 12-layer, 384-hidden
st_id = 'all-MiniLM-L12-v2'

### Dimensionality Reduction

In [None]:
n_components = 30
n_neighbors = 10

# the minimum distance apart that points are allowed to be in the low dimensional representation.
# This means that low values of min_dist will result in clumpier embeddings.
# This can be useful if you are interested in clustering, or in finer topological structure.
# Larger values of min_dist will prevent UMAP from packing points together and will focus on the preservation of the broad topological structure instead.
min_dist = 0 #0.1

random_state = 42

### Clustering

In [None]:
# a lower min_cluster_size will generate more topics
min_cluster_size = 20

# The implementation defaults this value (if it is unspecified) to whatever min_cluster_size is set to.
# The larger the value of min_samples you provide, the more conservative the clustering – more points will be declared as noise,
# and clusters will be restricted to progressively more dense area
#min_samples = None
r = 1
min_samples = round(min_cluster_size*r)

# We need this to avoid an AttributeError when integrating our custom HDBSCAN step with BERTopic
prediction_data=True

# can improve the resultant clusters
gen_min_span_tree=True

### BERTopic

In [None]:
# top n words in combined documents in a cluster
top_n_words = 10 #5

# Calculate the probabilities of all topics per document instead of the probability of the assigned topic per document.
# This could slow down the extraction of topics if you have many documents (> 100_000).
calculate_probabilities = True

## **Sub-models**

In [None]:
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer(st_id)
embeddings = embedding_model.encode(docs, show_progress_bar=True)

Batches:   0%|          | 0/73 [00:00<?, ?it/s]

In [None]:
from umap import UMAP

umap_model = UMAP(
    n_components=n_components,
    n_neighbors=n_neighbors,
    min_dist=min_dist,
    metric='cosine', random_state=random_state)

In [None]:
# Pre-reduce embeddings for visualization purposes
reduced_embeddings = UMAP(n_neighbors=n_neighbors, n_components=2, min_dist=min_dist, metric='cosine', random_state=random_state).fit_transform(embeddings)

In [None]:
from hdbscan import HDBSCAN

hdbscan_model = HDBSCAN(
    min_cluster_size=min_cluster_size,
    min_samples=min_samples,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=prediction_data,
    gen_min_span_tree=gen_min_span_tree
    )

In [None]:
from bertopic.representation import KeyBERTInspired

keybert = KeyBERTInspired()

representation_model = {
    "KeyBERT": keybert
}

Pass a CountVectorizer before training the topic model to minimize the size of the resulting c-TF-IDF matrix:

In [None]:
min_df = 0.001
max_df = .8
len(docs), min_df * len(docs), max_df * len(docs)

(2311, 2.311, 1848.8000000000002)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

#vectorizer_model = None
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 3),
                                   min_df=min_df, max_df=max_df)

## 🔥 **Training**

Now that we have our models prepared, we can start training our topic model! We supply BERTopic with the sub-models of interest, run `.fit_transform`, and see what kind of topics we get.

In [None]:
from bertopic import BERTopic

topic_model = BERTopic(
  # Sub-models
  embedding_model=embedding_model,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
  representation_model=representation_model,
  vectorizer_model=vectorizer_model,

  top_n_words=top_n_words,
  calculate_probabilities=calculate_probabilities,
  verbose=True
)

# Train model
topics, probs = topic_model.fit_transform(docs, embeddings)

2024-02-14 01:00:21,897 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-02-14 01:00:38,391 - BERTopic - Dimensionality - Completed ✓
2024-02-14 01:00:38,395 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-02-14 01:00:38,724 - BERTopic - Cluster - Completed ✓
2024-02-14 01:00:38,732 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-02-14 01:00:45,549 - BERTopic - Representation - Completed ✓


In [None]:
print_topic_info(topic_model)

num of topics: 5
outliers: 0.009


In [None]:
# Show topics
n = 20
topic_model.get_topic_info().head(n)

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,Representative_Docs
0,-1,21,-1_jammy delicious_haha_profile_nose flavors,"[jammy delicious, haha, profile, nose flavors,...","[wine pinot, pinot noir classic, terrific affo...",[Pale ruby color. Really nice young pinot noir...
1,0,1545,0_blackberry_butter_buttery_aroma,"[blackberry, butter, buttery, aroma, bold, ric...","[oak blackberry vanilla, vanilla oak cherry, o...","[vanilla oak blackberry chocolate pepper, vani..."
2,1,479,1_cabernet_syrah_cab_red wine,"[cabernet, syrah, cab, red wine, blackberry, w...","[cabernet sauvignon, wines, sauvignon, nice wi...",[Wines from non-traditional regions Romanee Mo...
3,2,120,2_noir_pinot noir_bread_californian pinot noir,"[noir, pinot noir, bread, californian pinot no...","[excellent pinot noir, good pinot noir, delici...","[Most delicious Pinot Noir, A smooth Pinot Noi..."
4,3,87,3_pinots_pinot good_nice pinot_good pinot,"[pinots, pinot good, nice pinot, good pinot, l...","[flavorful pinot, tasting pinot, pinot strawbe...",[Pale garnet in the glass with watery rim. On ...
5,4,59,4_shiraz_blackberry_australian shiraz_australian,"[shiraz, blackberry, australian shiraz, austra...","[shiraz vanilla, variety shiraz, shiraz good, ...","[Fabulous Shiraz, 2024.01.25 Chuncheon World L..."


## 💦 **Post-processing**

### Custom labels

In [None]:
name = 'KeyBERT'   # topic aspect the labels are built from
length = 40        # maximum number of label characters kept
end = ' ...'       # marker appended to a shortened label

# one label per topic: the first element of every entry of the aspect, joined by '; '
labels = {topic: '; '.join(list(zip(*values))[0]) for topic, values in topic_model.topic_aspects_[name].items()}
# shorten long labels; the marker is appended only when something was actually cut off
labels = {k: v if len(v) <= length else v[:length] + end for k, v in labels.items()}

topic_model.set_topic_labels(labels)
topic_model.custom_labels_[:3]

['wine pinot; pinot noir classic; terrific ...',
 'oak blackberry vanilla; vanilla oak cher ...',
 'cabernet sauvignon; wines; sauvignon; ni ...']

### Vectorizer
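Pass the CountVectorizer after training (via `update_topics`) when the full, unfiltered text was needed during training, e.g. when an LLM-based representation used the full context. **TODO:** confirm whether this is needed here.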

In [None]:
# Report whether a vectorizer_model already exists in the notebook namespace.
try:
    vectorizer_model
    print('vectorizer_model assigned before!')
except NameError:
    # only an undefined name means "not assigned yet"; other errors should surface
    print('passing vectorizer_model after training')

vectorizer_model assigned before!


In [None]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
#stop_additional = ['cabernet', 'sauvignon', 'cab', 'wine']
stop_additional = []

ngram_range = (1, 3)

# When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold.
# This value is also called cut-off in the literature.
# If float, the parameter represents a proportion of documents, integer absolute counts
min_df = 10

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

# Fine-tune topic representations after training BERTopic
# you can pass the CountVectorizer before and after training your topic model.
# Passing it before training allows you to minimize the size of the resulting c-TF-IDF matrix

# NLTK English stop words plus the notebook-specific additions
stopwords = list(stopwords.words('english')) + stop_additional

vectorizer_model = CountVectorizer(stop_words=stopwords, ngram_range=ngram_range, min_df=min_df)

# NOTE(review): update_topics() only applies the sub-models it is given; pass the
# representation model again so the KeyBERT aspect is recomputed with the new
# vocabulary instead of being left as it was - confirm against the BERTopic docs.
topic_model.update_topics(docs, vectorizer_model=vectorizer_model, top_n_words=top_n_words,
                          representation_model=representation_model)

In [None]:
topic_model.get_topic_info().head(7)

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,Representative_Docs
0,-1,580,-1_chardonnay_good_citrus_acidity,"[chardonnay, good, citrus, acidity, wine]","[good, great, best, nice, better, good cost, g...","[Good, Very good, Not my Chardonnay]"
1,0,1166,0_oak_vanilla_good_taste,"[oak, vanilla, good, taste, cherry]","[chocolate oak, oak blackberry, oak vanilla, o...",[oak cherry black cherry chocolate blackcurran...
2,1,781,1_wine_good_cabernet_red,"[wine, good, cabernet, red, price]","[good wine, wine good, nice wine, great wine, ...","[Very good wine, Good wine., Good wine!!]"
3,2,233,2_good_value_money_value money,"[good, value, money, value money, bad]","[good value money, good value, excellent value...","[Good value for money, Good value for money, V..."


### Outlier reduction

In [None]:
# Use the "c-TF-IDF" strategy with a threshold
# threshold is the minimum similarity.
threshold = 0.5

new_topics = topic_model.reduce_outliers(docs, topics , strategy="c-tf-idf", threshold=threshold)

# count outliers
len([x for x in new_topics if x < 0])

1875

In [None]:
threshold = 0.5

t = topics
#t = new_topics # uncomment if following prv reduction

new_topics = topic_model.reduce_outliers(docs, t , strategy="distributions", threshold=threshold)

# count outliers
len([x for x in new_topics if x < 0])

100%|██████████| 2/2 [00:00<00:00,  2.67it/s]


1809

In [None]:
threshold = 0.8

t = topics
#t = new_topics

new_topics = topic_model.reduce_outliers(docs, t, strategy="embeddings", threshold=threshold)

# count outliers
len([x for x in new_topics if x < 0])

1770

In [None]:
# the threshold is minimum probability when strategy="probabilities"
threshold = 0.5 #0.05

t = topics
#t = new_topics

new_topics = topic_model.reduce_outliers(docs, t, strategy="probabilities", probabilities=probs, threshold=threshold)

# count outliers
len([x for x in new_topics if x < 0])

1869

#### update with new_topics

In [None]:
# NOTE(review): update_topics() rebuilds the topic representations from the new
# assignments; sub-models and settings that are not passed fall back to its
# defaults, so hand over the ones used so far - confirm against the BERTopic docs.
topic_model.update_topics(docs, topics=new_topics,
                          vectorizer_model=vectorizer_model,
                          representation_model=representation_model,
                          top_n_words=top_n_words)



## 📊 Visualization

In [None]:
titles = [x[:100] for x in docs]
topics_to_visualize = range(20)

topic_model.visualize_documents(titles, reduced_embeddings=reduced_embeddings,
                                hide_annotations=True, hide_document_hover=False,
                                topics=topics_to_visualize,
                                #custom_labels=True
                                )

### Topics per Class

In [None]:
classes = df_reviews.wine.tolist()
topics_per_class = topic_model.topics_per_class(docs, classes=classes)

10it [00:00, 12.37it/s]


In [None]:
top_n_topics = 20
custom_labels = True
normalize_frequency = False

In [None]:
topic_model.visualize_topics_per_class(topics_per_class, top_n_topics=top_n_topics,
                                       normalize_frequency = normalize_frequency,
                                       #width=1000, height=500,
                                       custom_labels=custom_labels)

**Share of reviews**

In [None]:
total_freq = df_reviews.groupby('wine').count().id.to_dict() # num of documents per class

df = topics_per_class.assign(Frequency=topics_per_class.apply(lambda x: x.Frequency/total_freq[x.Class], axis=1))

sum([v for k,v in total_freq.items()]), len(docs)

(2311, 2311)

In [None]:
total_freq

{'Bread & Butter Pinot Noir': 1236,
 "Kendall-Jackson Vintner's Reserve Cabernet Sauvignon": 121,
 'Mollydooker The Boxer Shiraz': 276,
 'Montes Montes Alpha Cabernet Sauvignon': 220,
 'Montes Montes Alpha Merlot': 60,
 'Montes Montes Alpha Syrah': 81,
 "Two Hands Angels' Share Shiraz": 82,
 'Two Hands Gnarly Dudes Shiraz': 102,
 'Two Hands Sexy Beast Cabernet Sauvignon': 82,
 'William Hill North Coast Cabernet Sauvignon': 51}

In [None]:
topic_model.visualize_topics_per_class(df, top_n_topics=10,
                                       normalize_frequency = normalize_frequency,
                                       width=1000, height=500,
                                       custom_labels=custom_labels)

# 🎚️ **Parameter Study**

## Utils

In [16]:
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic.representation import KeyBERTInspired
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
from plotly.subplots import make_subplots

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


def bertopic_batch(docs,
                   ### hyperparams
                   min_df=0.001, # CountVectorizer
                   max_df=1.0,
                   n_components=15,
                   n_neighbors=10,
                   min_dist=0.1, # UMAP
                   min_cluster_size=10,
                   min_samples=None, # HDBSCAN
                   ####
                   embedding_model=None,
                   embeddings=None,
                   ngram_range=(1, 3),
                   prediction_data=True,
                   gen_min_span_tree=True,
                   top_n_words=5,
                   calculate_probabilities=False,
                   random_state=42,
                   verbose=False,
                   return_model=False,
                   hdbscan_model=None,
                   custom_label='keybert'
                   ):
    """
    Build the sub-models, train one BERTopic model on `docs` and return
    either the topic table or the model itself.

    docs: documents handed to BERTopic.fit_transform
    min_df, max_df, ngram_range: CountVectorizer settings (English stop words)
    n_components, n_neighbors, min_dist, random_state: UMAP settings (cosine metric)
    min_cluster_size, min_samples, prediction_data, gen_min_span_tree:
        HDBSCAN settings (euclidean metric, 'eom' selection); ignored when
        `hdbscan_model` is given
    embedding_model, embeddings: passed on to BERTopic / fit_transform
    top_n_words, calculate_probabilities, verbose: passed on to BERTopic
    hdbscan_model: ready-made clustering model used instead of a new HDBSCAN
    custom_label: 'keybert' sets custom labels from the KeyBERT aspect
    return_model: return the model instead of the topic table

    Returns topic_model if `return_model` is set, otherwise the DataFrame of
    topic_model.get_topic_info(). Training errors do not propagate: a warning
    is issued and the topic table is None (with `return_model` the model is
    returned as it is after the failed training).
    """
    import warnings

    #-- sub-models
    vectorizer_model = CountVectorizer(stop_words="english", ngram_range=ngram_range,
                                       min_df=min_df, max_df=max_df)

    umap_model = UMAP(
        n_components=n_components,
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        metric='cosine', random_state=random_state)

    if hdbscan_model is None:
        hdbscan_model = HDBSCAN(
            min_cluster_size=min_cluster_size,
            min_samples=min_samples,
            metric='euclidean',
            cluster_selection_method='eom',
            prediction_data=prediction_data,
            gen_min_span_tree=gen_min_span_tree
            )

    keybert = KeyBERTInspired()
    representation_model = {
        "KeyBERT": keybert
    }

    #-- train bertopic
    topic_model = BERTopic(
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        representation_model=representation_model,

        top_n_words=top_n_words,
        calculate_probabilities=calculate_probabilities,
        verbose=verbose
    )

    # Train model
    df = None
    try:
        topic_model.fit_transform(docs, embeddings)

        # set custom label
        if custom_label == 'keybert':
            topic_model = set_custom_labels(topic_model, name='KeyBERT')

        df = topic_model.get_topic_info()
    except Exception as error:
        # best effort: a failing parameter set must not stop the caller, but the
        # reason is reported instead of being swallowed silently
        warnings.warn(f'bertopic_batch failed: {type(error).__name__}: {error}', stacklevel=2)

    if return_model:
        return topic_model
    else:
        return df


def get_topics(df, index, num_topics=10, cols = ['Topic', 'KeyBERT']):
    """
    get a row from df, the result of bertopic_batch

    df: topic table containing at least the columns in `cols` and 'Topic'
    index: index of a param set; used as the label of the returned row
    num_topics: maximum number of topics taken, starting at topic 0
    cols: columns to keep; the first is renamed to 'index' and becomes the
        column labels of the result, the second is renamed to `index`

    Returns a one-row DataFrame (row label `index`) with one column per topic.
    Raises IndexError when df has no row with Topic == 0.
    """
    # position (not index label) of topic 0, which might be 0 if no outlier;
    # a position keeps the iloc slice correct for any index of df
    start = np.flatnonzero((df.Topic == 0).to_numpy())[0]
    selected = df.iloc[start:start + num_topics].loc[:, cols]
    renamed = selected.rename(columns=dict(zip(cols, ['index', index])))
    return renamed.set_index('index').transpose()


def set_custom_labels(topic_model, name='KeyBERT'):
    """
    Register one custom label per topic on `topic_model`.

    topic_model: object providing topic_aspects_ and set_topic_labels()
    name: key of the aspect in topic_model.topic_aspects_ to build labels from

    A label has the form "<topic> <item>; <item>; ...", where the items are
    the first elements of the entries stored for that topic.
    Returns the same topic_model.
    """
    aspect = topic_model.topic_aspects_[name]

    custom = {}
    for topic_id, entries in aspect.items():
        # keep the first element of every entry only
        first_elements = list(zip(*entries))[0]
        custom[topic_id] = f'{topic_id} ' + '; '.join(first_elements)

    topic_model.set_topic_labels(custom)

    return topic_model


def visualize_topics_per_param(df_result, topic_model, res_docs,
                               ncols=4, top_n_topics=None,
                               horizontal_spacing=.05,
                               vertical_spacing=.3,
                               width = 350, height = 350
                               ):
    """
    grid plot of classes (class is param)

    df_result: DataFrame; every column whose label is not an int is plotted
        as one subplot, with its values (as strings) used as classes
    topic_model: fitted model providing topics_per_class() and
        visualize_topics_per_class()
    res_docs: documents passed to topic_model.topics_per_class
    ncols: number of subplot columns
    top_n_topics: passed to topic_model.visualize_topics_per_class
    horizontal_spacing, vertical_spacing: subplot spacing; the vertical
        spacing is divided by the number of rows
    width, height: size of one subplot

    Returns the plotly figure holding all subplots.
    """
    subplot_titles = [x for x in df_result.columns if not isinstance(x, int)]
    # ceiling division: no empty extra row when the titles fill the last row exactly
    nrows = max(1, -(-len(subplot_titles) // ncols))

    fig = make_subplots(rows=nrows, cols=ncols,
                        shared_xaxes=False,
                        horizontal_spacing=horizontal_spacing,
                        vertical_spacing=vertical_spacing / nrows if nrows > 1 else 0,
                        subplot_titles=subplot_titles)

    for position, subplot_title in enumerate(subplot_titles):
        row, col = divmod(position, ncols)

        # select the column by its label so that int-labelled columns placed
        # before it cannot shift the selection
        classes = df_result[subplot_title].apply(str)
        topics_per_class = topic_model.topics_per_class(res_docs, classes=classes)

        f = topic_model.visualize_topics_per_class(topics_per_class,
                                            top_n_topics=top_n_topics,
                                            #width=1000, height=500,
                                            normalize_frequency = False)

        # update visible to show all topics
        for trace in f.data:
            fig.add_trace(trace.update({'visible':True}), row=row + 1, col=col + 1)

    fig.update_layout(
        template="plotly_white",
        showlegend=False,
        width=width*ncols,
        height=height*nrows if nrows > 1 else height * 1.3,
        hoverlabel=dict(
            bgcolor="white",
            font_size=14,
            font_family="Rockwell"
        ),
    )

    fig.update_xaxes(showgrid=True)
    fig.update_yaxes(showgrid=True)
    return fig


def check_similarity(topic_model, custom_labels=False,
                     embedding_model=None, min_distance=0.8):
    """
    List topic pairs sorted by the cosine similarity of their topic embeddings.

    topic_model: object providing topic_embeddings_, topic_labels_ and
        custom_labels_
    custom_labels: name the topics by custom_labels_ instead of topic_labels_
    embedding_model: optional model with encode(); when given, the similarity
        of the two label texts is added as column 'c/label sim' and only
        pairs with distance >= min_distance are returned
    min_distance: threshold on the 'distance' column used with embedding_model

    Returns a DataFrame with the columns 'topic1', 'topic2' and 'distance'
    (the cosine similarity, highest first). Pairs involving a label that
    starts with '-1' are dropped and every pair is listed once.
    """
    distance_matrix = cosine_similarity(np.array(topic_model.topic_embeddings_))

    if custom_labels:
        list_labels = topic_model.custom_labels_
    else:
        list_labels = topic_model.topic_labels_.values()

    dist_df = pd.DataFrame(distance_matrix, columns=list_labels, index=list_labels)

    # long format: one record per (row label, column label) combination
    tmp = [
        {'topic1': rec['index'], 'topic2': t2, 'distance': value}
        for rec in dist_df.reset_index().to_dict('records')
        for t2, value in rec.items()
        if t2 != 'index'
    ]

    pair_dist_df = pd.DataFrame(tmp)

    pair_dist_df = pair_dist_df[(pair_dist_df.topic1.map(lambda x: not x.startswith('-1'))) &
                (pair_dist_df.topic2.map(lambda x: not x.startswith('-1')))]

    # topic1 < topic2 keeps one of the two mirrored pairs and drops self-pairs
    pair_dist_df = (pair_dist_df[pair_dist_df.topic1 < pair_dist_df.topic2]
            .sort_values('distance', ascending = False)
            .reset_index(drop=True))

    if embedding_model is not None:
        # imported here because the module-level imports do not provide `util`
        from sentence_transformers import util

        print(f'Calculating the similarity of custom label pairs for which the topic similarity exceeds {min_distance}...')
        encode = lambda x: embedding_model.encode(x, convert_to_tensor=True)

        similar = pair_dist_df.loc[pair_dist_df.distance >= min_distance]
        # built explicitly (instead of DataFrame.apply) so that an empty
        # selection still yields an empty, correctly named Series
        label_sim = pd.Series(
            [util.pytorch_cos_sim(encode(t1), encode(t2))[0][0].item()
             for t1, t2 in zip(similar.topic1, similar.topic2)],
            index=similar.index, name='c/label sim', dtype=float)
        pair_dist_df = pair_dist_df.join(label_sim, how='right')
    return pair_dist_df


def print_custom_labels(topic_model, list_tid=None, length=120):
    """
    Print the custom labels of the selected topics and return them.

    topic_model: object providing topic_labels_ (its keys are the topic ids)
        and custom_labels_ (labels in the same order as those keys)
    list_tid: topic id or collection of topic ids to print; None prints all
    length: line length passed to print_with_line_feed

    Returns a dict mapping topic id to custom label for the printed topics.
    """
    tid_all = topic_model.topic_labels_.keys()
    dict_label = dict(zip(tid_all, topic_model.custom_labels_))

    # None means "no filter"; it must be checked before a single id is wrapped
    if list_tid is not None:
        if isinstance(list_tid, str) or not hasattr(list_tid, '__contains__'):
            list_tid = [list_tid]
        dict_label = {k: v for k, v in dict_label.items() if k in list_tid}

    for label in dict_label.values():
        print_with_line_feed(label, length)

    return dict_label


def get_representative_docs(topic_model, tid=None, length=120, max_topics=5):
    """
    Print the representative documents of the selected topics and return them.

    topic_model: object whose get_representative_docs() returns a dict
        mapping topic id to a list of documents
    tid: topic id or collection of topic ids to keep; None keeps all
    length: line length passed to print_with_line_feed
    max_topics: maximum number of topics whose documents are printed

    Every document is printed as "<topic>-<position>: <document>". A notice
    is printed when more than `max_topics` topics were selected. Returns the
    dict of all selected topics, including those that were not printed.
    """
    rep_docs = topic_model.get_representative_docs()

    if tid is not None:
        # accept a single topic id as well as a collection of ids
        if isinstance(tid, str) or not hasattr(tid, '__contains__'):
            tid = [tid]
        rep_docs = {k: v for k, v in rep_docs.items() if k in tid}

    for n_printed, (k, v) in enumerate(rep_docs.items()):
        # stop before printing topic number max_topics + 1
        if n_printed >= max_topics:
            print(f'the docs of {max_topics} topics printed.')
            break
        for x, y in enumerate(v):
            print_with_line_feed(f'{k}-{x}: {y}', length)

    return rep_docs

### customize visualize_topics_per_class

In [17]:
from typing import List, Union
import plotly.graph_objects as go
import collections
from sklearn.preprocessing import normalize


def visualize_topics_per_class(topic_model,
                               topics_per_class: pd.DataFrame,
                               group: List[str] = None,
                               docs: List[str] = None,
                               classes: List[str] = None,
                               top_n_topics: int = 10,
                               topics: List[int] = None,
                               normalize_frequency: bool = False,
                               relative_share = False,
                               custom_labels: Union[bool, str] = False,
                               title: str = "<b>Topics per Class</b>",
                               width: int = 1250,
                               height: int = 900) -> go.Figure:
    """
    customized BERTopic.visualize_topics_per_class:
     plot relative shares and display only the selected group

    topic_model: fitted model providing get_topic_freq(), topics_,
        topic_labels_, custom_labels_ and topic_aspects_
    topics_per_class: DataFrame with the columns 'Topic', 'Words',
        'Frequency' and 'Class'; it is not modified
    group: classes to display; needs `docs` and `classes` as well, otherwise
        a warning is printed and all classes are shown
    docs: all documents, in the same order as topic_model.topics_
    classes: class of every document in `docs`
    top_n_topics: number of most frequent topics to show (None shows all);
        ignored when `topics` is given
    topics: explicit list of topics to show
    normalize_frequency: L2-normalize the frequencies of every topic
    relative_share: divide every frequency by the number of documents of its
        class; needs `docs` and `classes`
    custom_labels: True uses topic_model.custom_labels_; a string builds the
        names from the topic aspect of that name
    title, width, height: layout of the figure

    Returns a plotly Figure with one horizontal bar trace per shown topic;
    only the first trace is visible initially. Selected topics without any
    row in `topics_per_class` are skipped.
    """

    colors = ["#E69F00", "#56B4E9", "#009E73", "#F0E442", "#D55E00", "#0072B2", "#CC79A7"]

    if group is not None and docs is not None and classes is not None:
        # redefine topics_per_class for the group
        topics_per_class = topics_per_class.loc[topics_per_class.Class.isin(group)]
        # redefine freq_df for the group: count the topics of the group's documents
        classes_g = [i for i, x in enumerate(classes) if x in group]
        documents = pd.DataFrame({"Document": docs, "ID": range(len(docs)), "Topic": topic_model.topics_})
        documents = documents.loc[classes_g]
        topic_sizes = collections.Counter(documents.Topic.values.tolist())
        freq_df = (pd
                    .DataFrame(topic_sizes.items(), columns=['Topic', 'Count'])
                    .sort_values("Count",ascending=False)
                    )
    else:
        if group is not None:
            print('WARNING: visualizing the selected group needs docs and classes as well\n')
        freq_df = topic_model.get_topic_freq()

    if relative_share:
        if docs is None or classes is None:
            print('WARNING: relative share plot needs docs and classes as well\n')
        else:
            # number of documents per class
            total_freq = pd.DataFrame({"Document": docs, "Class":classes}).groupby('Class')['Document'].count().to_dict()
            topics_per_class = topics_per_class.assign(
                Frequency=topics_per_class.Frequency / topics_per_class.Class.map(total_freq))

    # Select topics based on top_n and topics args
    freq_df = freq_df.loc[freq_df.Topic != -1, :]
    if topics is not None:
        selected_topics = list(topics)
    elif top_n_topics is not None:
        selected_topics = sorted(freq_df.Topic.to_list()[:top_n_topics])
    else:
        selected_topics = sorted(freq_df.Topic.to_list())

    # Prepare data
    if isinstance(custom_labels, str):
        # build the name of every known topic directly by its key, so the
        # mapping does not depend on `topics` (which may be None) or on order
        aspect = topic_model.topic_aspects_[custom_labels]
        topic_names = {}
        for key in topic_model.topic_labels_.keys():
            labels = [[str(key), None]] + list(aspect.get(key, []))
            label = "_".join([item[0] for item in labels[:4]])
            topic_names[key] = label if len(label) < 30 else label[:27] + "..."
    elif topic_model.custom_labels_ is not None and custom_labels:
        topic_names = {key: topic_model.custom_labels_[key + topic_model._outliers] for key, _ in topic_model.topic_labels_.items()}
    else:
        topic_names = {key: value[:40] + "..." if len(value) > 40 else value
                        for key, value in topic_model.topic_labels_.items()}
    # assign() returns a new frame, so the caller's DataFrame stays untouched
    topics_per_class = topics_per_class.assign(Name=topics_per_class.Topic.map(topic_names))
    data = topics_per_class.loc[topics_per_class.Topic.isin(selected_topics), :]

    # Add traces
    fig = go.Figure()
    for index, topic in enumerate(selected_topics):
        trace_data = data.loc[data.Topic == topic, :]
        if trace_data.empty:
            # nothing to draw for this topic (e.g. it does not occur in the group)
            continue
        # only the first trace that is actually drawn starts visible
        visible = True if len(fig.data) == 0 else "legendonly"
        topic_name = trace_data.Name.values[0]
        words = trace_data.Words.values
        if normalize_frequency:
            x = normalize(trace_data.Frequency.values.reshape(1, -1))[0]
        else:
            x = trace_data.Frequency
        fig.add_trace(go.Bar(y=trace_data.Class,
                                x=x,
                                visible=visible,
                                marker_color=colors[index % 7],
                                hoverinfo="text",
                                name=topic_name,
                                orientation="h",
                                hovertext=[f'<b>Topic {topic}</b><br>Words: {word}' for word in words]))

    # Styling of the visualization
    fig.update_xaxes(showgrid=True)
    fig.update_yaxes(showgrid=True)
    fig.update_layout(
        xaxis_title="Normalized Frequency" if normalize_frequency else "Frequency",
        yaxis_title="Class",
        title={
            'text': f"{title}",
            'y': .95,
            'x': 0.40,
            'xanchor': 'center',
            'yanchor': 'top',
            'font': dict(
                size=22,
                color="Black")
        },
        template="simple_white",
        width=width,
        height=height,
        hoverlabel=dict(
            bgcolor="white",
            font_size=16,
            font_family="Rockwell"
        ),
        legend=dict(
            title="<b>Global Topic Representation",
        )
    )
    return fig


def visualize_topics_per_class_all(topic_model,
                                   docs: List[str],
                                   classes: List[str],
                                   top_n_topics: int = 10,
                                   topics: List[int] = None,
                                   group: List[str] = None,
                                   custom_labels: Union[bool, str] = False,
                                   horizontal_spacing=.05,
                                   vertical_spacing=.3,
                                   width: int = 1200,
                                   height: int = 500) -> go.Figure:
    """
    Draw two side-by-side "topics per class" panels that share one y-axis.

    The left panel holds the traces returned by visualize_topics_per_class
    with its default settings; the right panel holds the traces returned by
    the same function called with relative_share=True.

    Arguments:
        topic_model: model providing `topics_per_class(docs, classes=...)`.
        docs: documents passed to `topic_model.topics_per_class`.
        classes: class of each document (same order as `docs`).
        top_n_topics, topics, group, custom_labels: forwarded unchanged to
            visualize_topics_per_class for both panels.
        horizontal_spacing: horizontal gap between the two subplots.
        vertical_spacing: forwarded to make_subplots (no visible effect for
            the single-row layout used here).
        width, height: figure size in pixels.

    Returns:
        The combined plotly figure.
    """
    subplot_titles = ['Topic per class', 'Topic per class (relative share)']

    fig = make_subplots(rows=1, cols=2,
                        shared_xaxes=False,
                        shared_yaxes=True,
                        horizontal_spacing=horizontal_spacing,
                        vertical_spacing=vertical_spacing,
                        subplot_titles=subplot_titles)

    topics_per_class = topic_model.topics_per_class(docs, classes=classes)

    # Arguments common to both panels
    shared_kwargs = dict(group=group, docs=docs, classes=classes,
                         top_n_topics=top_n_topics, topics=topics,
                         custom_labels=custom_labels)

    # (column, extra kwargs for visualize_topics_per_class, extra trace props)
    panels = [
        (1, {}, {}),
        # plot 2: relative share of reviews; its legend entries are hidden
        # because the left panel already lists every topic
        (2, {'relative_share': True}, {'showlegend': False}),
    ]

    for col, extra_kwargs, extra_props in panels:
        panel = visualize_topics_per_class(topic_model, topics_per_class,
                                           **shared_kwargs, **extra_kwargs)
        for i, trace in enumerate(panel.data):
            # Traces with the same position get the same legend group, so one
            # legend click toggles the topic in both panels. Trace visibility
            # is kept as produced by visualize_topics_per_class.
            trace.update({'legendgroup': f'g{i}', **extra_props})
            fig.add_trace(trace, row=1, col=col)

    fig.update_layout(
        template="plotly_white",
        showlegend=True,
        width=width,
        height=height,
        hoverlabel=dict(
            bgcolor="white",
            font_size=14,
            font_family="Rockwell"
        ),
    )

    fig.update_xaxes(showgrid=True)
    fig.update_yaxes(showgrid=True)
    return fig

## Set params

In [None]:
# Grid of candidate values per hyperparameter; the key order defines the
# column order of the parameter table built from this dict.
params_study = {
    # When CountVectorizer builds the vocabulary, it ignores terms that have a document frequency strictly lower than the given threshold.
    # If float, the parameter represents a proportion of documents; if integer, absolute counts.
    #'min_df': [1, 10], # error when 10 even if default max_df is 1.0
    'min_df': [0.001, 0.01],

    # NOTE(review): presumably the upper document-frequency cutoff of the same vectorizer - confirm in bertopic_batch
    'max_df': [0.5, 1.0],

    # If you are interested in (density based) clustering, or other machine learning techniques,
    # it can be beneficial to pick a larger embedding dimension (say 10, or 50) closer to the dimension
    # of the underlying manifold on which your data lies.
    'n_components': [10, 20, 50],

    # How UMAP balances local versus global structure in the data.
    # Low values of n_neighbors will force UMAP to concentrate on very local structure (potentially to the detriment of the big picture).
    'n_neighbors': [10, 20, 50],

    # The minimum distance apart that points are allowed to be in the low dimensional representation.
    # This means that low values of min_dist will result in clumpier embeddings.
    # This can be useful if you are interested in clustering, or in finer topological structure.
    # Larger values of min_dist will prevent UMAP from packing points together and will focus on the preservation of the broad topological structure instead.
    'min_dist': [0, 0.05, 0.1],

    # Set it to the smallest size grouping that you wish to consider a cluster.
    # It can have slightly non-obvious effects with min_samples.
    'min_cluster_size': [20, 50, 100],

    # The implementation defaults this value (if it is unspecified) to whatever min_cluster_size is set to.
    # The larger the value of min_samples you provide, the more conservative the clustering –
    # more points will be declared as noise, and clusters will be restricted to progressively more dense areas.
    # NOTE: the values here are ratios of min_cluster_size; they are converted to absolute counts
    # when the parameter table is built.
    'min_samples': [0.1, 0.5, 1.]
}

In [None]:
# Embedding model: 12-layer, 384-hidden
st_id = 'all-MiniLM-L12-v2'

params_base = {
    'ngram_range': (1, 3),
    'top_n_words': 5,
    'random_state': 42,
}

Build the Cartesian product of the parameter sets

In [None]:
from itertools import product

param_names = list(params_study.keys())

# One row per combination of the candidate values in params_study
param_values = list(product(*params_study.values()))

df_params = pd.DataFrame(param_values, columns=param_names)

# min_samples is specified as a ratio of min_cluster_size; convert it to an
# absolute count. The columns are checked explicitly instead of using a bare
# `except`, so unrelated errors are no longer silently swallowed.
if {'min_samples', 'min_cluster_size'}.issubset(df_params.columns):
    df_params['min_samples'] = (df_params['min_samples']
                                .mul(df_params['min_cluster_size'])
                                .astype(int))
else:
    print('No min_samples')

df_params = df_params.reset_index(drop=True)
df_params.head(5)

Unnamed: 0,min_df,max_df,n_components,n_neighbors,min_dist,min_cluster_size,min_samples
0,0.001,0.5,10,10,0.0,20,2
1,0.001,0.5,10,10,0.0,20,10
2,0.001,0.5,10,10,0.0,20,20
3,0.001,0.5,10,10,0.0,50,5
4,0.001,0.5,10,10,0.0,50,25


Import the results of the previous study

In [None]:
file = 'wr_param_study_01.csv'

!cp {path_src}/{file} {path_data}

In [None]:
#df_result_old = pd.read_csv(f'{path_data}/{file}')
#df_result_old.head(5)

Unnamed: 0,min_df,n_components,n_neighbors,min_dist,min_cluster_size,min_samples,0,1,2,3,4,5,6,7,8,9
0,1,10,10,0.0,20,2,"['fruity', 'delicious fruity', 'fruity good', ...","['good value money', 'money good value', 'good...","['chilean cabernet sauvignon', 'cabernet sauvi...","['citrus tropical', 'citrus pear tropical', 'p...","['chilean wine', 'wine chile', 'chilean wine w...","['oak', 'oak oak', 'vanilla oak', 'oak vanilla...","['fruity scent', 'citrus scent', 'aromatic fru...","['håkon rekdal', 'like håkon rekdal', 'rekdal'...","['medium tannins', 'medium tannin', 'soft tann...","['easy drink good', 'drink easy', 'easy drinki..."
1,1,10,10,0.0,20,10,"['nice wine', 'good wine', 'great wine', 'exce...","['vanilla oak blackberry', 'oak blackberry van...","['nice chardonnay', 'good chardonnay', 'chardo...","['fruity', 'fruity good', 'tasty fruity', 'swe...","['sour taste', 'bitter', 'retrospect add taste...","['good value money', 'money good value', 'good...","['drink easy', 'easy drinking', 'easy drink', ...","['citrus tropical', 'citrus pear tropical', 'p...","['smooth dry', 'flavor smooth', 'dry smooth', ...","['cabernet sauvignon', 'chilean cabernet sauvi..."
2,1,10,10,0.0,20,20,"['good', 'great', 'excellent', 'bad', 'good va...","['nice chardonnay', 'chardonnay', 'good chardo...",,,,,,,,
3,1,10,10,0.0,50,5,"['good wine', 'wine', 'sauvignon', 'cabernet s...","['nice chardonnay', 'chardonnay', 'good chardo...",,,,,,,,
4,1,10,10,0.0,50,25,"['good', 'great', 'excellent', 'good value', '...","['nice chardonnay', 'chardonnay', 'good chardo...",,,,,,,,


Drop the parameter sets that were already studied

In [None]:
# Parameter sets already covered by the previous study
param_cols = list(df_params.columns)
studied = df_result_old[param_cols].drop_duplicates()

# Anti-join: keep only the rows of df_params that have no match in the old
# study. Unlike concat + drop_duplicates(keep=False), this never pulls in
# rows that exist only in the old study, and it keeps df_params' own
# (unique) index labels, which the batching below relies on.
merged = df_params.merge(studied, on=param_cols, how='left', indicator=True)
df_params = df_params.loc[(merged['_merge'] == 'left_only').to_numpy()]
len(df_params)

255

**Reviews to use in param search**

In [None]:
# Only reviews of wines whose id is at most max_wid are used in the search
max_wid = 18
in_scope = df_reviews['wid'] <= max_wid
docs = df_reviews.loc[in_scope, 'review_transl'].tolist()
len(docs)

3068

Update base params with embedding model

In [None]:
# Must be a plain boolean: with a trailing comma this would be the tuple
# (False,), which is truthy and would turn the progress bar on.
show_progress_bar = False

embedding_model = SentenceTransformer(st_id)
embeddings = embedding_model.encode(docs, show_progress_bar=show_progress_bar)

In [None]:
params_base.update({
    'embedding_model': embedding_model,
    'embeddings': embeddings
})

## Search params

make batch

In [None]:
batch = 3 #0, 1, 2, 3
num_params = 250

# Ceiling division: no empty trailing batch when len(df_params) is an exact
# multiple of num_params
num_batches = -(-len(df_params) // num_params)

run_idx = [df_params.index[i*num_params:(i+1)*num_params] for i in range(num_batches)]
run_file = [f'a{i}' for i in range(1, len(run_idx)+1)]

# Select the batch by position so that duplicate index labels (if any) cannot
# duplicate rows; an out-of-range batch number raises IndexError below
df_params_b = df_params.iloc[batch*num_params:(batch+1)*num_params]
file = f'wr_param_study_{run_file[batch]}'

no batch

In [None]:
#df_params_b = df_params
#file = 'wr_param_study_01'

In [None]:
num_topics = 10 # num of topics to save in param search result

# Results are appended one parameter set at a time on purpose: if the run is
# interrupted, df_result still holds everything finished so far.
df_result = pd.DataFrame()

total = len(df_params_b)

for rec in tqdm(df_params_b.itertuples(), total=total):
    idx = rec[0]  # index label of the parameter set
    kwargs = rec._asdict()
    kwargs.pop('Index', None)
    kwargs.update(params_base)

    df = bertopic_batch(docs, **kwargs)

    if df is None:
        # A None result is treated as a failed run; stop and say where,
        # instead of ending the loop silently
        print(f'bertopic_batch returned None for param set {idx}; search stopped')
        break

    df = get_topics(df, idx, num_topics=num_topics)
    df_result = pd.concat([df_result, df])

# Attach the topics to their parameter sets (rows without a result keep NaN)
df_result = df_params_b.join(df_result)
df_result.head(5)

100%|██████████| 222/222 [2:02:42<00:00, 33.17s/it]


Unnamed: 0,min_df,max_df,n_components,n_neighbors,min_dist,min_cluster_size,min_samples,0,1,2,3,4,5,6,7,8,9
750,0.01,1.0,10,10,0.1,50,5,"[good wine, wine, good, great, excellent, sauv...","[nice chardonnay, chardonnay, good chardonnay,...",,,,,,,,
751,0.01,1.0,10,10,0.1,50,25,"[good wine, wine, wines, sauvignon, cabernet s...","[good value money, good value, excellent value...","[nice chardonnay, chardonnay, good chardonnay,...",,,,,,,
752,0.01,1.0,10,10,0.1,50,50,"[oak vanilla, oak blackberry, oak, vanilla, fl...","[good wine, nice wine, great wine, wine good, ...","[good value money, good value, excellent value...","[nice chardonnay, chardonnay, good chardonnay,...",,,,,,
753,0.01,1.0,10,10,0.1,100,10,"[good wine, excellent wine, nice wine, wine go...","[dry, dry dry, dry good, sweet dry, dry smooth...","[oak, oak oak, blackberry oak, vanilla oak, oa...","[good value money, excellent value money, good...","[fruity, fruity good, light fruity, fruity int...","[chilean cabernet sauvignon, cabernet sauvigno...","[drink easy, easy drinking, easy drink, drink ...",,,
754,0.01,1.0,10,10,0.1,100,50,"[oak vanilla, oak blackberry, oak, vanilla, fl...","[good wine, nice wine, wine good, wine, red wi...","[good value money, good value, excellent value...",,,,,,,


Join the parameter sets to the result if the parameter search was interrupted

In [None]:
#df_result = df_params_b.loc[df_result.index].join(df_result)
#len(df_result)

In [None]:
cond = df_result[0].isna()
print('num of param sets to be studied:', cond.sum())

#df_result = df_result.loc[~cond]

df_result.loc[cond].head()

num of param sets to be studied: 0


Unnamed: 0,min_df,max_df,n_components,n_neighbors,min_dist,min_cluster_size,min_samples,0,1,2,3,4,5,6,7,8,9


In [None]:
# compare topics
topic_id = 0
num = 5

_ = [print(f'{i:>2}:', ', '.join(rec[topic_id][:num])) for i, rec in df_result.iterrows()]

In [None]:
#file = f'wr_param_study_{}'

# Write the joined parameter/topic table to CSV; index=False leaves the row
# index out of the file
f = f'{path_data}/{file}.csv'
df_result.to_csv(f, index = False)

# Zip the CSV (-j stores the file without its directory path) and copy the
# archive to path_src
!zip -j {file}.zip {f}
!cp {file}.zip {path_src}

  adding: wr_param_study_a4.csv (deflated 92%)


## Review result

In [None]:
file = 'wr_param_study_a'
!unzip {path_src}/{file}.zip -d {path_data}

Archive:  /content/drive/MyDrive/Colab Notebooks//wr_param_study_a.zip
replace sample_data/wr_param_study_a1.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
import ast

df_result = read_csv(file, path_data)

# find topic names: the topic columns are the ones whose header is a number
cols = [x for x in df_result.columns if x.isdigit()]


def _parse_topic_words(value):
    """Turn the CSV text of a keyword list back into a list.

    ast.literal_eval only accepts Python literals, so nothing stored in the
    file can be executed (unlike eval). Non-string cells (NaN for missing
    topics) are returned unchanged.
    """
    return ast.literal_eval(value) if isinstance(value, str) else value


# convert values to list
df_result[cols] = df_result[cols].apply(lambda col: col.map(_parse_topic_words))

# convert topics cols to int
cols_topic = [int(x) for x in cols]
df_result = df_result.rename(columns=dict(zip(cols, cols_topic)))

df_result.head()

Unnamed: 0,min_df,max_df,n_components,n_neighbors,min_dist,min_cluster_size,min_samples,0,1,2,3,4,5,6,7,8,9
0,0.001,1.0,10,10,0.0,100,50,"[oak blackberry, oak vanilla, oak, black cherr...","[good wine, excellent wine, nice wine, wine go...","[good value money, good value, good value exce...","[chilean cabernet sauvignon, cabernet sauvigno...",,,,,,
1,0.001,1.0,10,10,0.0,100,100,"[oak cherry, vanilla oak, oak vanilla, oak bla...","[good wine, nice wine, wine good, great wine, ...","[good value money, excellent value money, good...",,,,,,,
2,0.001,1.0,10,10,0.05,20,2,"[good wine, great wine, nice wine, light wine,...","[good value money, good value, money excellent...","[spicy, fruity spicy, light spice, spice, litt...","[nice chardonnay, good chardonnay, chardonnay,...","[bad, decent, quite good average, really bad e...","[smooth dry, flavor smooth, dry smooth, delici...","[quite sour honestly, quite sour, sour taste, ...","[drink easy, easy drinking, easy drink, drink ...","[mart 2022 08, 2021, 2023, 2022, money good dr...","[dark ruby red, medium ruby color, intense rub..."
3,0.001,1.0,10,10,0.05,20,10,"[good wine, nice wine, excellent wine, wine go...","[vanilla oak blackberry, oak blackberry vanill...","[good value money, money good value, good valu...","[nice chardonnay, good chardonnay, chardonnay,...","[smooth dry, dry smooth, sour dry, flavor smoo...","[sour taste, bitter, retrospect add tasted, sw...","[bad, really good worse, just bad, decent, qui...","[fruity, fruity good, tasty fruity, sweet frui...","[oak, oak oak, vanilla oak, oak vanilla, oak o...","[citrus tropical, citrus pear tropical, pear c..."
4,0.001,1.0,10,10,0.05,20,20,"[vanilla oak, oak vanilla, oak blackberry, oak...","[good wine, wine good, nice wine, excellent wi...","[cabernet sauvignon, chilean cabernet sauvigno...","[nice chardonnay, chardonnay, good chardonnay,...","[good value money, money good value, good valu...","[lotte mart 2022, good emart 4980won, mids 201...","[chilean wine, wine chile, chilean wines, chil...","[bad, really good worse, just bad, bad just aw...","[quite good, quite good good, good quite good,...","[quite acidic overly, good acidic really, high..."


In [None]:
# Build one text "document" per parameter set by flattening its topic keywords.
res_df = (df_result[cols_topic]
          # each topic cell (a list of keywords) -> "kw1, kw2, ..."
          .apply(lambda x: x.str.join(', '))
          # missing topics become empty strings so they can be filtered out below
          .fillna('')
          # collapse each row into a list holding one string per topic
          .apply(lambda x: x.tolist(), axis=1)
          # join the non-empty topic strings with '; '
          .apply(lambda row: '; '.join(filter(None, map(str, row))))
)

# Plain list of the documents and the matching df_result index labels
res_docs = res_df.tolist()
res_docs_id = res_df.index

len(res_docs)

972

In [None]:
res_docs[0]

'oak blackberry, oak vanilla, oak, black cherry, blackberry, strawberry, cherry, citrus, ruby, peach; good wine, excellent wine, nice wine, wine good, great wine, wine great, wine nice, wine, wines, light wine; good value money, good value, good value excellent, price good value, good great value, excellent value, excellent value money, value money good, great value money, value excellent; chilean cabernet sauvignon, cabernet sauvignon, sauvignon, chilean wine, chilean cabernet, good cabernet, wines, cabernet, wine, cab sauv'

In [None]:
from sklearn.cluster import KMeans

#cluster_model = KMeans(n_clusters=min_cluster_size)
cluster_model = None

In [None]:
min_dist = 0.2
min_cluster_size = 10
random_state = 42

st_id = 'all-MiniLM-L12-v2'
embedding_model = SentenceTransformer(st_id)

topic_model = bertopic_batch(res_docs,
                            min_df=0,
                            max_df=1.0,
                            n_components=15,
                            n_neighbors=10,
                            min_dist=min_dist,
                            min_cluster_size=min_cluster_size,
                            embedding_model=embedding_model,
                            calculate_probabilities=True,
                            random_state=random_state,
                            hdbscan_model=cluster_model,
                            return_model=True)
topic_model.get_topic_info().head()

In [None]:
print_topic_info(topic_model)

num of topics: 5
outliers: 0.009


In [None]:
topic_model.visualize_topics()

In [None]:
#topic_model.visualize_distribution(probs[0])

In [None]:
# Larger values of min_dist will focus on the preservation of the broad topological structure.
#min_dist=0.1
#min_dist=0.9

embedding_model = SentenceTransformer(st_id)
res_embeddings = embedding_model.encode(res_docs, show_progress_bar=False)

res_reduced = UMAP(n_components=2, random_state=random_state, min_dist=min_dist).fit_transform(res_embeddings)

In [None]:
##d = res_docs
#d = [f'{i}: {rec[0]}' for i, rec in pd.DataFrame(res_df).iterrows()]
#d = [f'{x}:{y}' for x, y in zip(res_docs_id, res_docs)]

# Parameter columns = everything except the topic columns. Dropping
# cols_topic instead of slicing off a fixed 10 trailing columns keeps this
# correct when a different number of topics was saved.
df_param_cols = df_result.drop(columns=cols_topic)

# Hover text per point: "<row index>: [parameter values]"
d = [f'{x[0]}: {list(x)[1:]}' for x in df_param_cols.to_records()]
title = ', '.join(df_param_cols.columns)

topic_model.visualize_documents(d, reduced_embeddings=res_reduced,
                                title=title,
                                hide_annotations=True)

In [None]:
# Extract hierarchical topics and their representations
hierarchical_topics = topic_model.hierarchical_topics(res_docs)

# Visualize these representations
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

100%|██████████| 30/30 [00:00<00:00, 104.95it/s]


In [None]:
topic_model.visualize_barchart()

### Topics per Class
- min_df: makes no difference, which could be due to its small values

In [None]:
n = 12
visualize_topics_per_param(df_result, topic_model, res_docs, top_n_topics=n)

## Topic modeling with the search result

**Check docs defined in Data**

In [18]:
docs = df_reviews.review_transl.tolist()
count_wine(df_reviews)

Unnamed: 0_level_0,wine,count
wid,Unnamed: 1_level_1,Unnamed: 2_level_1
7,Kendall-Jackson Vintner's Reserve Cabernet Sau...,121
18,Montes Montes Alpha Cabernet Sauvignon,220
19,Montes Montes Alpha Merlot,60
20,Montes Montes Alpha Syrah,81
24,Mollydooker The Boxer Shiraz,276
25,William Hill North Coast Cabernet Sauvignon,51
28,Bread & Butter Pinot Noir,1236
42,Two Hands Angels' Share Shiraz,82
43,Two Hands Gnarly Dudes Shiraz,102
44,Two Hands Sexy Beast Cabernet Sauvignon,82


In [19]:
st_id = 'all-MiniLM-L12-v2'
embedding_model = SentenceTransformer(st_id)
embeddings = embedding_model.encode(docs, show_progress_bar=True)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/573 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/73 [00:00<?, ?it/s]

In [20]:
param_set = {
    'min_df': 0.0,
    'max_df': .6,
    'n_components': 100, #100, 50, 30
    'n_neighbors': 15, #5, 20, 30
    'min_dist': 0,
    'min_cluster_size': 20, #50, 10
    'min_samples': 10, #20, 10
}

param_set.update({
    'random_state': 42,
    'return_model': True,
})

custom_labels = False

In [21]:
topic_model = bertopic_batch(docs,
                             embedding_model=embedding_model,
                             embeddings=embeddings,
                             calculate_probabilities=True,
                             hdbscan_model=None,
                             **param_set)

print_topic_info(topic_model)
topic_model.get_topic_info().head(5)

num of topics: 23
outliers: 0.264


Unnamed: 0,Topic,Count,Name,CustomName,Representation,KeyBERT,Representative_Docs
0,-1,611,-1_vanilla_oak_red_little,-1 fruit flavor; flavors; flavours; oak vanill...,"[vanilla, oak, red, little, finish]","[fruit flavor, flavors, flavours, oak vanilla,...","[Oh my god, this is heaven in a glass! Light r..."
1,0,472,0_wine_red_vanilla_oak,0 good wine; nice wine; wine good; great wine;...,"[wine, red, vanilla, oak, cabernet]","[good wine, nice wine, wine good, great wine, ...","[Good wine @, Good wine, Very good go to wine]"
2,1,124,1_blackberry_plum_oak_pepper,1 pepper blackberry oak; plum blackberry oak; ...,"[blackberry, plum, oak, pepper, chocolate]","[pepper blackberry oak, plum blackberry oak, p...","[vanilla oak blackberry chocolate pepper plum,..."
3,2,117,2_noir_pinot noir_pinot_vanilla,2 good pinot noir; nice pinot noir; pinot noir...,"[noir, pinot noir, pinot, vanilla, red]","[good pinot noir, nice pinot noir, pinot noir ...","[Most delicious Pinot Noir, A smooth Pinot Noi..."
4,3,92,3_oak_butter_vanilla_vanilla oak,3 butter vanilla oak; vanilla butter oak; vani...,"[oak, butter, vanilla, vanilla oak, cherry]","[butter vanilla oak, vanilla butter oak, vanil...","[vanilla oak cherry butter, butter vanilla oak..."


In [None]:
th = 0.02

df = topic_model.get_topic_info()
tid = df.loc[(df.Topic>-1) & (df.Count>len(docs)*th)].Topic.to_list()

_ = print_custom_labels(topic_model, tid)

0 good wine; nice wine; wine good; great wine; wine nice; wine great; wines; wine; red wine; wine light 
1 pepper blackberry oak; plum blackberry oak; pepper oak blackberry; blackberry plum oak; blackberry oak pepper; vanilla 
  oak blackberry; chocolate plum blackberry; oak blackberry vanilla; oak blackberry chocolate; chocolate blackberry plum 
2 good pinot noir; nice pinot noir; pinot noir good; best pinot noir; pinot noir great; pinot noir; pinot noir best; 
  pinot noir really; pinot noirs; californian pinot noir 
3 butter vanilla oak; vanilla butter oak; vanilla oak butter; oak cherry butter; oak butter; butter oak; oak butter oak; 
  oak butter really; oak earthy butter; oak butter cherry 
4 flavorful pinot; tasting pinot; really good pinot; pinot strawberry; pinot vanilla; pinot good; pinot fantastic; pinot 
  beautiful; pinot nice; nice pinot 
5 red fruit; berries; dark fruit; plum blackberry; red fruits; ruby red; ruby color; intense ruby; dark chocolate; black 
  cherry 
6 e

Pre-reduce embeddings for visualization purposes

In [None]:
kw = ['n_neighbors', 'min_dist','random_state' ]
kwargs = {k:v for k, v in param_set.items() if k in kw}
reduced_embeddings = UMAP(n_components=2, metric='cosine', **kwargs).fit_transform(embeddings)

In [None]:
custom_labels = False

In [None]:
titles = [x[:100] for x in docs]
topics_to_visualize = range(20)

topic_model.visualize_documents(titles, reduced_embeddings=reduced_embeddings,
                                hide_annotations=True, hide_document_hover=False,
                                topics=topics_to_visualize,
                                custom_labels=custom_labels
                                )

In [None]:
_ = print_custom_labels(topic_model, [0,1,2,3])

0 good wine; great wine; wine good; nice wine; wine great; wine nice; wines; wine; light wine; daily wine 
1 plum blackberry oak; blackberry plum oak; chocolate plum blackberry; chocolate blackberry plum; fruit plum blackberry; 
  pepper blackberry oak; vanilla oak blackberry; oak blackberry vanilla; blackberry oak pepper; oak blackberry chocolate 
2 good pinot noir; nice pinot noir; pinot noir good; best pinot noir; pinot noir great; pinot noir; pinot noir best; 
  pinot noir really; pinot noirs; californian pinot noir 
3 flavorful pinot; tasting pinot; really good pinot; pinot strawberry; pinot vanilla; pinot good; pinot fantastic; pinot 
  beautiful; pinot nice; nice pinot 


### Topic similarity

In [None]:
df = check_similarity(topic_model)
df.head(10)

Unnamed: 0,topic1,topic2,distance
0,13_vanilla_red fruit_vanilla oak_strawberry,7_oak_butter_vanilla_vanilla oak,0.847151
1,0_wine_vanilla_oak_red wine,4_cabernet_syrah_cab_cabernet sauvignon,0.844911
2,13_vanilla_red fruit_vanilla oak_strawberry,1_blackberry_plum_oak_chocolate,0.835241
3,1_blackberry_plum_oak_chocolate,8_tannins_acidity_black_nose,0.823997
4,15_value_price_tasty_value light,5_easy drink_drinking_easy drinking_light easy,0.823143
5,22_disgusting_taste light_tastes_weak,9_scent_aroma_little_acidity,0.807289
6,13_vanilla_red fruit_vanilla oak_strawberry,8_tannins_acidity_black_nose,0.806825
7,15_value_price_tasty_value light,22_disgusting_taste light_tastes_weak,0.80037
8,8_tannins_acidity_black_nose,9_scent_aroma_little_acidity,0.799257
9,11_buttery_spicy_smooth buttery_buttery smooth,15_value_price_tasty_value light,0.785454


In [None]:
df = check_similarity(topic_model, custom_labels=True, embedding_model=embedding_model, min_distance=0.7)
df.sort_values('c/label sim', ascending=False).head(10)

Calculating the similarity of custom label pairs for which the topic similarity exceeds 0.7...


Unnamed: 0,topic1,topic2,distance,c/label sim
0,13 oak strawberry raspberry; vanilla oak straw...,7 butter vanilla oak; vanilla butter oak; vani...,0.847151,0.813654
3,1 plum blackberry oak; blackberry plum oak; ch...,8 plum blackberry; berries; dark fruit; red fr...,0.823997,0.789642
2,1 plum blackberry oak; blackberry plum oak; ch...,13 oak strawberry raspberry; vanilla oak straw...,0.835241,0.768997
7,15 light wonderful flavors; price point tastin...,22 flavors rich tastes; unnatural tasting flav...,0.80037,0.737828
21,1 plum blackberry oak; blackberry plum oak; ch...,7 butter vanilla oak; vanilla butter oak; vani...,0.734262,0.721489
6,13 oak strawberry raspberry; vanilla oak straw...,8 plum blackberry; berries; dark fruit; red fr...,0.806825,0.713746
28,15 light wonderful flavors; price point tastin...,16 light fruity dry; fruity really dry; dry fr...,0.701624,0.690039
14,11 quite buttery; good buttery slightly; butte...,5 easy drinking; easy drink; easy drink quite;...,0.75906,0.656577
12,10 smooth sweet finish; pleasant finish dry; n...,11 quite buttery; good buttery slightly; butte...,0.769584,0.653234
15,11 quite buttery; good buttery slightly; butte...,22 flavors rich tastes; unnatural tasting flav...,0.747706,0.63914


In [None]:
tid = [13,7]
ct = print_custom_labels(topic_model, tid)

7 butter vanilla oak; vanilla butter oak; vanilla oak butter; oak vanilla butter; butter oak vanilla; oak cherry butter; 
  oak butter; butter oak; oak earthy butter; oak lovely butter 
13 oak strawberry raspberry; vanilla oak strawberry; oak cherry strawberry; vanilla red fruit; cherry strawberry 
  raspberry; strawberry raspberry; strawberry raspberry red; butter strawberry raspberry; strawberry red fruit; cherry 
  butter strawberry 


In [None]:
df = topic_model.get_topic_info()
df.loc[df.Topic.isin(tid)]

Unnamed: 0,Topic,Count,Name,CustomName,Representation,KeyBERT,Representative_Docs
8,7,78,7_oak_butter_vanilla_vanilla oak,7 butter vanilla oak; vanilla butter oak; vani...,"[oak, butter, vanilla, vanilla oak, cherry]","[butter vanilla oak, vanilla butter oak, vanil...","[vanilla oak cherry butter, butter vanilla oak..."
14,13,48,13_vanilla_red fruit_vanilla oak_strawberry,13 oak strawberry raspberry; vanilla oak straw...,"[vanilla, red fruit, vanilla oak, strawberry, ...","[oak strawberry raspberry, vanilla oak strawbe...","[Vanilla oak strawberry raspberry red fruit, v..."


**merge topics**

In [None]:
topics_to_merge = [tid]
topic_model.merge_topics(docs, topics_to_merge)

# update custom labels
topic_model = set_custom_labels(topic_model)

In [None]:
df = topic_model.get_topic_info()
df.loc[df.Topic.isin(tid)]

Unnamed: 0,Topic,Count,Name,CustomName,Representation,KeyBERT,Representative_Docs
8,7,79,7_favorite_yum_house_don remember,7 favorite; personal favorite; okey favorite; ...,"[favorite, yum, house, don remember, mmmmmm]","[favorite, personal favorite, okey favorite, f...","[my favorite❤️, A favorite, My favorite]"
14,13,45,13_2019_2023_mart_repurchase,13 2020 mart; kashow 2020 mart; mart 32000 rep...,"[2019, 2023, mart, repurchase, costco]","[2020 mart, kashow 2020 mart, mart 32000 repur...",[2021 GS Grape Bubble Gum Simple but just deli...


### Topic per class

In [None]:
classes_all = df_reviews.wine.tolist()

Review of topics before merging topics

- Topic 0 is about good wine: Though bread & butter pinot garners more overall praise, kendall-jackson cab boasts a higher ratio of favorable comments relative to its total comments.
- Topic 1, focusing on oak, chocolate, plum, and blackberry flavors, is the least representative of bread & butter pinot, william hill cab and montes syrah.
- Topics 2 & 3 are about pinot noir: the majority of comments in the topics are from the only pinot wine, bread & butter.
- Topic 4 seems to be about the two Montes wines, but William Hill cab shares the topic.
- Topic 5 is about easy drinking, the most representative of bread & butter pinot and william hill cab.
- Topic 6, centered on favorite wines, is most emblematic of Gnarly Dudes and William Hill, despite the limited number of comments on the topic.
- Topic 7, featuring buttery vanilla oak notes, starkly contrasts with Topic 1, as it exclusively pertains to the Bread & Butter Pinot.
- Topic 8 discusses cherry and acidity, with Montes Syrah being the wine most associated with this characteristic, intriguingly surpassing even the Bread & Butter Pinot.
- Topic 9, centered on fruity scents, is most closely associated with Montes Merlot.

In [None]:
visualize_topics_per_class_all(topic_model, docs, classes_all)

In [None]:
tid = list(range(10))
ct = print_custom_labels(topic_model, tid)

0 good wine; great wine; wine good; nice wine; wine great; wine nice; wines; wine; light wine; daily wine 
1 plum blackberry oak; blackberry plum oak; chocolate plum blackberry; chocolate blackberry plum; fruit plum blackberry; 
  pepper blackberry oak; vanilla oak blackberry; oak blackberry vanilla; blackberry oak pepper; oak blackberry chocolate 
2 good pinot noir; nice pinot noir; pinot noir good; best pinot noir; pinot noir great; pinot noir; pinot noir best; 
  pinot noir really; pinot noirs; californian pinot noir 
3 flavorful pinot; tasting pinot; really good pinot; pinot strawberry; pinot vanilla; pinot good; pinot fantastic; pinot 
  beautiful; pinot nice; nice pinot 
4 cabernet sauvignon; sauvignon; chilean wine; wines; wine; great cabernet; cabernet; chilean cab; montes alpha; 
  cabernets 
5 easy drinking; easy drink; easy drink quite; light easy drinking; good easy drink; light easy drink; easy drink taste; 
  easy drink light; smooth easy drink; easy drinking light 
6 fav

In [None]:
visualize_topics_per_class_all(topic_model, docs, classes_all)

In [None]:
tid = list(range(10))
ct = print_custom_labels(topic_model, tid)

0 good wine; great wine; wine good; wine great; nice wine; wine nice; wines; wine; light wine; wine strong 
1 vanilla butter oak; butter vanilla oak; vanilla oak butter; butter oak vanilla; oak cherry butter; cherry vanilla oak; 
  oak vanilla cherry; vanilla oak cherry; oak butter cherry; fruit vanilla oak 
2 blackberry oak plum; plum blackberry oak; blackberry plum oak; chocolate plum blackberry; chocolate blackberry plum; 
  fruit plum blackberry; pepper blackberry oak; vanilla oak blackberry; oak blackberry vanilla; blackberry oak pepper 
3 good pinot noir; nice pinot noir; pinot noir good; best pinot noir; pinot noir great; pinot noir; pinot noir best; 
  pinot noir really; pinot noirs; californian pinot noir 
4 flavorful pinot; tasting pinot; really good pinot; pinot strawberry; pinot vanilla; pinot good; pinot fantastic; pinot 
  beautiful; pinot nice; nice pinot 
5 cabernet sauvignon; sauvignon; chilean wine; wines; wine; great cabernet; cabernet; chilean cab; montes alpha; 
  

In [None]:
a = count_wine(df_reviews).wine.to_dict()
a

{7: "Kendall-Jackson Vintner's Reserve Cabernet Sauvignon",
 18: 'Montes Montes Alpha Cabernet Sauvignon',
 19: 'Montes Montes Alpha Merlot',
 20: 'Montes Montes Alpha Syrah',
 24: 'Mollydooker The Boxer Shiraz',
 25: 'William Hill North Coast Cabernet Sauvignon',
 28: 'Bread & Butter Pinot Noir',
 42: "Two Hands Angels' Share Shiraz",
 43: 'Two Hands Gnarly Dudes Shiraz',
 44: 'Two Hands Sexy Beast Cabernet Sauvignon'}

In [None]:
group = [7,18,20,25,28,43]
group = [v for k,v in a.items() if k in group]

visualize_topics_per_class_all(topic_model, docs, classes_all, group=group)

In [None]:
tid = list(range(13))
ct = print_custom_labels(topic_model, tid)

0 good wine; great wine; wine good; wine great; nice wine; wine nice; wines; wine; light wine; wine strong 
1 vanilla butter oak; butter vanilla oak; vanilla oak butter; butter oak vanilla; oak cherry butter; cherry vanilla oak; 
  oak vanilla cherry; vanilla oak cherry; oak butter cherry; fruit vanilla oak 
2 blackberry oak plum; plum blackberry oak; blackberry plum oak; chocolate plum blackberry; chocolate blackberry plum; 
  fruit plum blackberry; pepper blackberry oak; vanilla oak blackberry; oak blackberry vanilla; blackberry oak pepper 
3 good pinot noir; nice pinot noir; pinot noir good; best pinot noir; pinot noir great; pinot noir; pinot noir best; 
  pinot noir really; pinot noirs; californian pinot noir 
4 flavorful pinot; tasting pinot; really good pinot; pinot strawberry; pinot vanilla; pinot good; pinot fantastic; pinot 
  beautiful; pinot nice; nice pinot 
5 cabernet sauvignon; sauvignon; chilean wine; wines; wine; great cabernet; cabernet; chilean cab; montes alpha; 
  

### develop metrics

In [32]:
# Topics to inspect; the commented-out line is an alternative selection.
#tid = [0,1,2]
tid = [3,4,5]
# NOTE(review): judging by the output below, this call also prints each document
# as "<topic>-<index>: <text>" — confirm against the helper's definition.
rep_docs = get_representative_docs(topic_model, tid)

3-0: vanilla oak cherry butter 
3-1: butter vanilla oak 
3-2: Butter, vanilla, oak 
4-0: Pale garnet in the glass with watery rim. On the nose forrest floor spicy red fruit and damp earth and well 
  integrated oak. On the palate light to med body strawberry red cherry earthy barest vanilla low tannin med minus acid 
  with a long finish. Expected it to be oaked out of its mind based on past reviews but very pleasantly suprised with a 
  well integrated and classic pinot. Best served slightly chilled 
4-1: Color Light rose leaf and grapefruit Smell Fermenting bread, mild buttery scent, hazelnut, vanilla, cherry wood, 
  oak, grass. String instruments and rosin. The scent of berries, light fruit, and a very light and strange jujube scent. 
  Depending on the glass, aromas of whiskey and grain may be present. Maple Syrup and Berry Taste Depending on the glass, 
  there is a taste between Ssanghwacha-like, moderately bitter, astringent and sweet, and whiskey. Tannin is very light. 
  Over

In [33]:
# Print the custom labels of the selected topics, then display them with every ';'
# separator turned into ', ' (the dict is the cell's displayed value).
ctopic = print_custom_labels(topic_model, tid)
{topic: label.replace(';', ', ') for topic, label in ctopic.items()}

3 butter vanilla oak; vanilla butter oak; vanilla oak butter; oak cherry butter; oak butter; butter oak; oak butter oak; 
  oak butter really; oak earthy butter; oak butter cherry 
4 flavorful pinot; tasting pinot; really good pinot; pinot strawberry; pinot vanilla; pinot good; pinot fantastic; pinot 
  beautiful; pinot nice; nice pinot 
5 red fruit; berries; dark fruit; plum blackberry; red fruits; ruby red; ruby color; intense ruby; dark chocolate; black 
  cherry 


{3: '3 butter vanilla oak,  vanilla butter oak,  vanilla oak butter,  oak cherry butter,  oak butter,  butter oak,  oak butter oak,  oak butter really,  oak earthy butter,  oak butter cherry',
 4: '4 flavorful pinot,  tasting pinot,  really good pinot,  pinot strawberry,  pinot vanilla,  pinot good,  pinot fantastic,  pinot beautiful,  pinot nice,  nice pinot',
 5: '5 red fruit,  berries,  dark fruit,  plum blackberry,  red fruits,  ruby red,  ruby color,  intense ruby,  dark chocolate,  black cherry'}

In [34]:
# Build a zero-shot classification pipeline backed by the facebook/bart-large-mnli checkpoint.
from transformers import pipeline
classifier = pipeline("zero-shot-classification",
                      model="facebook/bart-large-mnli")

config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

{'sequence': 'one day I will see the world',
 'labels': ['travel', 'dancing', 'cooking'],
 'scores': [0.9938650727272034, 0.0032738037407398224, 0.0028610494919121265]}

In [35]:
# Sanity-check the classifier on a toy sentence; the result dict is the cell's output.
# NOTE(review): per the transformers docs, multi_label=True scores each candidate label
# independently instead of normalising the scores across labels — confirm.
sequence_to_classify = "one day I will see the world"
candidate_labels = ['travel', 'cooking', 'dancing']
classifier(sequence_to_classify, candidate_labels, multi_label=True)

{'sequence': 'one day I will see the world',
 'labels': ['travel', 'dancing', 'cooking'],
 'scores': [0.994511067867279, 0.0057061826810240746, 0.0018193012801930308]}

In [42]:
# NOTE(review): this repeats the pipeline construction already done in an earlier cell of
# this section; running both cells builds the classifier twice.
from transformers import pipeline
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

In [62]:
def calc_score(topic_model, aspect='KeyBERT', tid=None):
    """
    Average the word scores of each topic for one topic aspect.

    topic_model: model exposing `topic_aspects_`, a dict shaped like
        {aspect name: {topic id: [(word, score), ...]}}
    aspect: name of the aspect to read from `topic_model.topic_aspects_`
    tid: container of topic ids to keep; None keeps every topic except
        the outlier topic (id -1)

    Returns a list with the mean score of each selected topic, in the
    order the topics appear in the aspect dict.

    Raises KeyError if `aspect` is not in `topic_model.topic_aspects_`.
    """
    aspects = topic_model.topic_aspects_
    if aspect not in aspects:
        # fail with a readable message instead of printing 'ERROR' and then
        # hitting a bare KeyError on the lookup below
        raise KeyError(f'aspect {aspect!r} not found; available aspects: {list(aspects)}')

    scores_all = aspects[aspect]
    if tid is None:
        # default selection: everything but the outlier topic
        scores = {k: v for k, v in scores_all.items() if k > -1}
    else:
        scores = {k: v for k, v in scores_all.items() if k in tid}

    # x[1] is the score of a (word, score) pair
    return [np.mean([x[1] for x in d]) for d in scores.values()]


# Mean KeyBERT score of every topic except the outlier topic (-1).
calc_score(topic_model)

[0.814644,
 0.8783926,
 0.9244801,
 0.9109027,
 0.7353877,
 0.50917304,
 0.77352536,
 0.8164369,
 0.6879089,
 0.6297637,
 0.7047819,
 0.54371613,
 0.8294821,
 0.40675402,
 0.699839,
 0.885255,
 0.7621645,
 0.79073876,
 0.8835963,
 0.5928229,
 0.5518284,
 0.7376727,
 0.70267785]

In [60]:
# Scratch version of `calc_score`: the same steps run at cell level so the
# intermediate values (`scores_all`, `scores`, `list_s`) can be inspected.
aspect='KeyBERT'
tid=None
top_n_words=10  # not read in this cell

if aspect not in topic_model.topic_aspects_:
    # stop with a readable message instead of printing 'ERROR' and then
    # failing on the lookup below with a bare KeyError
    raise KeyError(f'aspect {aspect!r} not found in topic_model.topic_aspects_')

scores_all = topic_model.topic_aspects_[aspect]
if tid is None:
    # keep every topic except the outlier topic (-1)
    scores = {k:v for k,v in scores_all.items() if k > -1}
else:
    scores = {k:v for k,v in scores_all.items() if k in tid}

# mean of the word scores of each selected topic
list_s = []
for t, d in scores.items():
    s = [x[1] for x in d]
    list_s.append(np.mean(s))

In [61]:
# Show the per-topic mean scores computed in the previous cell.
list_s

[0.814644,
 0.8783926,
 0.9244801,
 0.9109027,
 0.7353877,
 0.50917304,
 0.77352536,
 0.8164369,
 0.6879089,
 0.6297637,
 0.7047819,
 0.54371613,
 0.8294821,
 0.40675402,
 0.699839,
 0.885255,
 0.7621645,
 0.79073876,
 0.8835963,
 0.5928229,
 0.5518284,
 0.7376727,
 0.70267785]

In [47]:
#topic_model.topic_labels_.keys()
#topic_model.custom_labels_

# Look at the raw (word, score) pairs stored for topic 0 under the KeyBERT aspect.
name = 'KeyBERT'
score = topic_model.topic_aspects_[name]
score[0]

[('good wine', 0.95093286),
 ('nice wine', 0.9084027),
 ('wine good', 0.8879647),
 ('great wine', 0.85873663),
 ('wine nice', 0.84636873),
 ('wine great', 0.8308164),
 ('wines', 0.7667335),
 ('wine', 0.75390387),
 ('red wine', 0.68604016),
 ('wine light', 0.6565398)]

In [49]:
# Show the whole KeyBERT aspect dict, {topic id: [(word, score), ...]}, outlier topic -1 included.
score

{-1: [('fruit flavor', 0.5555409),
  ('flavors', 0.5322805),
  ('flavours', 0.53137136),
  ('oak vanilla', 0.5019547),
  ('vanilla oak', 0.4961951),
  ('oak cherry', 0.4858615),
  ('strawberry', 0.48004597),
  ('red fruit', 0.4707541),
  ('dark fruit', 0.466573),
  ('red berries', 0.46340665)],
 0: [('good wine', 0.95093286),
  ('nice wine', 0.9084027),
  ('wine good', 0.8879647),
  ('great wine', 0.85873663),
  ('wine nice', 0.84636873),
  ('wine great', 0.8308164),
  ('wines', 0.7667335),
  ('wine', 0.75390387),
  ('red wine', 0.68604016),
  ('wine light', 0.6565398)],
 1: [('pepper blackberry oak', 0.90628374),
  ('plum blackberry oak', 0.903329),
  ('pepper oak blackberry', 0.9032758),
  ('blackberry plum oak', 0.8970441),
  ('blackberry oak pepper', 0.8868091),
  ('vanilla oak blackberry', 0.8631151),
  ('chocolate plum blackberry', 0.8619579),
  ('oak blackberry vanilla', 0.8611591),
  ('oak blackberry chocolate', 0.85401165),
  ('chocolate blackberry plum', 0.8469409)],
 2: [('g

In [None]:
 # Pair every topic id with its custom label.
 # NOTE(review): assumes `custom_labels_` is ordered like the keys of `topic_labels_` — confirm.
 tid_all = topic_model.topic_labels_.keys()
 dict_label = dict(zip(tid_all, topic_model.custom_labels_))

In [43]:
# NOTE(review): `score_function` is not defined in this part of the notebook; judging by the
# output below it returns {topic id: [representative texts]} — confirm.
score_function(topic_model)

{0: ['Good wine @', 'Good wine', 'Very good go to wine'],
 1: ['vanilla oak blackberry chocolate pepper plum',
  'plum blackberry oak',
  'oak blackberry'],
 2: ['Most delicious Pinot Noir',
  'A smooth Pinot Noir',
  'good for pinot noir'],
 3: ['vanilla oak cherry butter',
  'butter vanilla oak',
  'Butter, vanilla, oak'],
 4: ['Pale garnet in the glass with watery rim. On the nose forrest floor spicy red fruit and damp earth and well integrated oak. On the palate light to med body strawberry red cherry earthy barest vanilla low tannin med minus acid with a long finish. Expected it to be oaked out of its mind based on past reviews but very pleasantly suprised with a well integrated and classic pinot. Best served slightly chilled',
  "Color Light rose leaf and grapefruit Smell Fermenting bread, mild buttery scent, hazelnut, vanilla, cherry wood, oak, grass. String instruments and rosin. The scent of berries, light fruit, and a very light and strange jujube scent. Depending on the glas

In [36]:
# Show the representative documents stored on the model for every topic (outlier topic -1 included).
topic_model.get_representative_docs()


{-1: ['Oh my god, this is heaven in a glass! Light red in color, wide rim and really slow curtain. The nose is packed with red fruit along with vanilla and oak. The palate is smack-your-face full of raspberry, cherry and strawberry. Coats the whole palate like butter. Quite long finish where earthy notes meet cream, caramel, leather and tobacco. Hints of clove and mushroom. Wow. Just wow. This is the reason I just love Californian PN. An absolute winner!',
  'Very good merlot Deep garnet Pronounced nose intensity Dark cherry, plum, tobacco, oak aromas Medium + acidity, medium tanins, high alc, full body, medium + finish blackcurrant plum tobacco chocolate leather flavours Very good balanced merlot, enjoy with friends. Maybe has some ageing potential.',
  'Very good merlot Deep garnet Pronounced nose intensity Dark cherry, plum, tobacco, oak aromas Medium + acidity, medium tanins, high alc, full body, medium + finish blackcurrant plum tobacco chocolate leather flavours Very good balance