<a href="https://colab.research.google.com/github/lbk209/topic_modeling/blob/main/tm_wine_reviews_params.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setting

In [1]:
%%capture
!pip install bertopic accelerate adjustText

In [2]:
import os
import pandas as pd
import plotly.express as px
import numpy as np

from tqdm import tqdm

In [3]:
# to work with path name having blank
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [4]:
import os
import pandas as pd

def read_csv(file, path_data, **kwargs):
    """
    Read every file in `path_data` whose name starts with `file` with
    pd.read_csv and stack the results into a single DataFrame.

    file: file-name prefix to match
    path_data: directory to scan
    kwargs: keyword args for pd.read_csv
    return: concatenated DataFrame with a fresh 0..n-1 index
            (an empty DataFrame if no file matches)
    """
    # os.listdir returns names in arbitrary order; sort them so that the row
    # order of the result is reproducible between runs
    files = sorted(x for x in os.listdir(path_data) if x.startswith(file))

    # read everything first and concatenate once, instead of growing a
    # DataFrame inside the loop (which copies all rows on every iteration)
    frames = [pd.read_csv(f'{path_data}/{f}', **kwargs) for f in files]
    if not frames:
        # pd.concat raises on an empty list
        return pd.DataFrame()

    return pd.concat(frames).reset_index(drop=True)


def print_topic_info(topic_model):
    """
    print number of topics and the fraction of documents that are outliers

    topic_model: object whose get_topic_info() returns a DataFrame with
                 'Topic' and 'Count' columns; outliers are the Topic == -1 row
    """
    df = topic_model.get_topic_info()

    # the outlier row only exists when some documents were left unassigned,
    # so count the real topics explicitly instead of using len(df) - 1
    is_outlier = df['Topic'] == -1
    print(f'num of topics: {(~is_outlier).sum()}')

    total = df['Count'].sum()
    n_outliers = df.loc[is_outlier, 'Count'].sum()
    # guard against an empty topic table
    ratio = n_outliers / total if total else 0
    print(f'outliers: {ratio:.3f}')

Run to mount Google Drive (the data files are copied from it below)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


run to upload files from local

In [None]:
from google.colab import files
uploaded = files.upload()

# 🗂️ Data

In [5]:
file = 'wine_reviews'
path_data = 'sample_data'
# raw string: the backslash is kept as a literal character so that it escapes
# the blank when the path is interpolated into shell commands; in a normal
# string literal '\ ' is an invalid escape sequence
path_src = r'/content/drive/MyDrive/Colab\ Notebooks/'

In [None]:
!unzip {path_src}/{file}.zip -d {path_data}

Archive:  /content/drive/MyDrive/Colab Notebooks//wine_reviews.zip
  inflating: sample_data/wine_reviews_240124.csv  
  inflating: sample_data/wine_reviews_240207.csv  
  inflating: sample_data/wine_reviews_240212.csv  


In [None]:
df_reviews = read_csv(file, path_data, parse_dates=['date'])
df_reviews = df_reviews.rename_axis('id').reset_index()
df_reviews.head()

Unnamed: 0,id,wid,wine,date,review,source,lang,review_transl
0,0,0,Casillero del Diablo Cabernet Sauvignon (Reserva),2024-01-24,Little too cherry on the front end for me,vivino,en,Little too cherry on the front end for me
1,1,0,Casillero del Diablo Cabernet Sauvignon (Reserva),2024-01-20,"En liten skarp knekk i smaken. Ok fredagsvin,m...",vivino,no,A small sharp crack in the taste. Ok Friday wi...
2,2,0,Casillero del Diablo Cabernet Sauvignon (Reserva),2024-01-19,Aight,vivino,en,Aight
3,3,0,Casillero del Diablo Cabernet Sauvignon (Reserva),2024-01-18,무난한 맛 가성비 좋은듯,vivino,ko,Good taste and good value for money
4,4,0,Casillero del Diablo Cabernet Sauvignon (Reserva),2024-01-17,oak cherry black cherry chocolate blackcurrant...,vivino,en,oak cherry black cherry chocolate blackcurrant...


## Review data

**reviews not translated**

In [None]:
#df_reviews.loc[df_reviews.lang.str.contains('ERROR')]

cond = df_reviews.review_transl.isna()
df_reviews.loc[cond]

Unnamed: 0,id,wid,wine,date,review,source,lang,review_transl
1745,1745,7,Kendall-Jackson Vintner's Reserve Cabernet Sau...,2023-03-18,:-),vivino,<-- ERROR -->,
2019,2019,9,Frontera Cabernet Sauvignon,2022-05-21,😡,vivino,<-- ERROR -->,
2065,2065,9,Frontera Cabernet Sauvignon,2022-01-24,🤌🏻,vivino,<-- ERROR -->,
2352,2352,12,The 7th Generation - G7 Chardonnay,2022-01-31,🌵🍏🥝🧈,vivino,<-- ERROR -->,
2556,2556,13,San Pedro Gato Negro Chardonnay,2022-04-27,… …. ….,vivino,<-- ERROR -->,
2941,2941,18,Montes Montes Alpha Cabernet Sauvignon,2022-12-19,♡♡♡,vivino,<-- ERROR -->,
3057,3057,18,Montes Montes Alpha Cabernet Sauvignon,2021-12-24,.,vivino,<-- ERROR -->,


In [None]:
df_reviews = df_reviews.loc[~cond]

In [None]:
#df_reviews.groupby(['wid', 'wine']).review_transl.count()
df_reviews.groupby(['wid', 'wine']).id.count()

wid  wine                                                
0    Casillero del Diablo Cabernet Sauvignon (Reserva)       472
1    Yellow Tail Cabernet Sauvignon                          136
2    Roche Mazet Cuvée Spéciale Cabernet Sauvignon            61
3    San Pedro Gato Negro Cabernet Sauvignon                 879
4    Aguirre Dos Copas Cabernet Sauvignon                      7
5    The 7th Generation - G7 Cabernet Sauvignon               26
6    Casillero del Diablo Chardonnay (Reserva)               122
7    Kendall-Jackson Vintner's Reserve Cabernet Sauvignon    121
8    Viña Santa Helena Reservado Cabernet Sauvignon'          51
9    Frontera Cabernet Sauvignon                             207
10   Long Barn Chardonnay                                    142
11   Cono Sur Bicicleta Reserva Unoaked Chardonnay           109
12   The 7th Generation - G7 Chardonnay                       42
13   San Pedro Gato Negro Chardonnay                         230
14   Roche Mazet Cuvée Spéciale 

In [None]:
df_reviews.id.count()

3061

Select the wines with more than 100 reviews (wines with only a few reviews can then be filtered out below)

In [None]:
df = df_reviews.groupby(by=['wid','wine']).id.count().rename('count').reset_index(1)
swid = df.loc[df['count']>100]
swid

Unnamed: 0_level_0,wine,count
wid,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Casillero del Diablo Cabernet Sauvignon (Reserva),472
1,Yellow Tail Cabernet Sauvignon,136
3,San Pedro Gato Negro Cabernet Sauvignon,879
6,Casillero del Diablo Chardonnay (Reserva),122
7,Kendall-Jackson Vintner's Reserve Cabernet Sau...,121
9,Frontera Cabernet Sauvignon,207
10,Long Barn Chardonnay,142
11,Cono Sur Bicicleta Reserva Unoaked Chardonnay,109
13,San Pedro Gato Negro Chardonnay,230
15,Yellow Tail Chardonnay,122


In [None]:
#df_reviews = df_reviews.loc[df_reviews.wid.isin(swid.index)]
df_reviews.id.count()

3061

In [None]:
# DO NOT CHANGE the (document) id as it is index to topics of topic model (topic_model.topics_)
docs = df_reviews.review_transl.tolist()

# 🗨️ **BERTopic**

## Params

### Embedding

In [None]:
# 12-layer, 384-hidden
st_id = 'all-MiniLM-L12-v2'

### Dimensionality Reduction

In [None]:
n_components = 15
n_neighbors = 10

# the minimum distance apart that points are allowed to be in the low dimensional representation.
# This means that low values of min_dist will result in clumpier embeddings.
# This can be useful if you are interested in clustering, or in finer topological structure.
# Larger values of min_dist will prevent UMAP from packing points together and will focus on the preservation of the broad topological structure instead.
min_dist = 0 #0.1

random_state=42

### Clustering

In [None]:
# a lower min_cluster_size will generate more topics
min_cluster_size = 50

# The implementation defaults this value (if it is unspecified) to whatever min_cluster_size is set to.
# The larger the value of min_samples you provide, the more conservative the clustering – more points will be declared as noise,
# and clusters will be restricted to progressively more dense area
#min_samples = None
r = 0.6
min_samples = round(min_cluster_size*r)

# We need this to avoid an AttributeError when integrating our custom HDBSCAN step with BERTopic
prediction_data=True

# can improve the resultant clusters
gen_min_span_tree=True

### BERTopic

In [None]:
# top n words in combined documents in a cluster
top_n_words = 5 #10

# Calculate the probabilities of all topics per document instead of the probability of the assigned topic per document.
# This could slow down the extraction of topics if you have many documents (> 100_000).
calculate_probabilities=False

## **Sub-models**

In [None]:
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer(st_id)
embeddings = embedding_model.encode(docs, show_progress_bar=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Batches:   0%|          | 0/87 [00:00<?, ?it/s]

In [None]:
from umap import UMAP

umap_model = UMAP(
    n_components=n_components,
    n_neighbors=n_neighbors,
    min_dist=min_dist,
    metric='cosine', random_state=random_state)

In [None]:
# Pre-reduce embeddings for visualization purposes
reduced_embeddings = UMAP(n_neighbors=n_neighbors, n_components=2, min_dist=min_dist, metric='cosine', random_state=random_state).fit_transform(embeddings)

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


In [None]:
from hdbscan import HDBSCAN

hdbscan_model = HDBSCAN(
    min_cluster_size=min_cluster_size,
    min_samples=min_samples,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=prediction_data,
    gen_min_span_tree=gen_min_span_tree
    )

In [None]:
from bertopic.representation import KeyBERTInspired

keybert = KeyBERTInspired()

representation_model = {
    "KeyBERT": keybert
}

Pass a CountVectorizer before training the topic model to minimize the size of the resulting c-TF-IDF matrix:

In [None]:
min_df = 0.001
len(docs), min_df * len(docs)

(2760, 2.7600000000000002)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

#vectorizer_model = None
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 3), min_df=min_df)

## 🔥 **Training**

Now that we have our models prepared, we can start training our topic model! We supply BERTopic with the sub-models of interest, run `.fit_transform`, and see what kind of topics we get.

In [None]:
from bertopic import BERTopic

topic_model = BERTopic(
  # Sub-models
  embedding_model=embedding_model,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
  representation_model=representation_model,
  vectorizer_model=vectorizer_model,

  top_n_words=top_n_words,
  calculate_probabilities=calculate_probabilities,
  verbose=True
)

# Train model
topics, probs = topic_model.fit_transform(docs, embeddings)

2024-02-07 10:16:28,985 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-02-07 10:16:47,455 - BERTopic - Dimensionality - Completed ✓
2024-02-07 10:16:47,457 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-02-07 10:16:47,671 - BERTopic - Cluster - Completed ✓
2024-02-07 10:16:47,678 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-02-07 10:16:50,375 - BERTopic - Representation - Completed ✓


In [None]:
# num of topics
len(topic_model.get_topics()) - 1

2

In [None]:
# Show topics
topic_model.get_topic_info().head(7)

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,Representative_Docs
0,-1,96,-1_good_good good_gato_gato negro,"[good, good good, gato, gato negro, negro]","[good, good good, good good good, best, good l...","[Very good, Very good, Good]"
1,0,2573,0_wine_good_oak_taste,"[wine, good, oak, taste, red]","[good wine, wine, good, alcohol, bad, good val...","[Good wine!!, Good wine., A good wine.]"
2,1,91,1_chardonnay_citrus_acidity_good,"[chardonnay, citrus, acidity, good, tropical]","[nice chardonnay, chardonnay, good chardonnay,...","[Fruity Chardonnay, Nice chardonnay, Not my Ch..."


## 💦 **Post-processing**

### Vectorizer
Pass the CountVectorizer after training where llm used full context in training => ???

In [None]:
try:
    vectorizer_model
    print('vectorizer_model assigned before!')
except NameError:
    # only an undefined name means that no vectorizer has been created yet
    print('passing vectorizer_model ater training')

vectorizer_model assigned before!


In [None]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
#stop_additional = ['cabernet', 'sauvignon', 'cab', 'wine']
stop_additional = []

ngram_range = (1, 3)

# When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold.
# This value is also called cut-off in the literature.
# If float, the parameter represents a proportion of documents, integer absolute counts
min_df = 10

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

# Fine-tune topic representations after training BERTopic
# you can pass the CountVectorizer before and after training your topic model.
# Passing it before training allows you to minimize the size of the resulting c-TF-IDF matrix

# keep the word list under its own name: rebinding `stopwords` would shadow the
# nltk corpus reader imported above
stop_words = list(stopwords.words('english')) + stop_additional

vectorizer_model = CountVectorizer(stop_words=stop_words, ngram_range=ngram_range, min_df=min_df)

topic_model.update_topics(docs, vectorizer_model=vectorizer_model, top_n_words=top_n_words)

In [None]:
topic_model.get_topic_info().head(7)

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,Representative_Docs
0,-1,580,-1_chardonnay_good_citrus_acidity,"[chardonnay, good, citrus, acidity, wine]","[good, great, best, nice, better, good cost, g...","[Good, Very good, Not my Chardonnay]"
1,0,1166,0_oak_vanilla_good_taste,"[oak, vanilla, good, taste, cherry]","[chocolate oak, oak blackberry, oak vanilla, o...",[oak cherry black cherry chocolate blackcurran...
2,1,781,1_wine_good_cabernet_red,"[wine, good, cabernet, red, price]","[good wine, wine good, nice wine, great wine, ...","[Very good wine, Good wine., Good wine!!]"
3,2,233,2_good_value_money_value money,"[good, value, money, value money, bad]","[good value money, good value, excellent value...","[Good value for money, Good value for money, V..."


### Outlier reduction

In [None]:
# Use the "c-TF-IDF" strategy with a threshold
# threshold is the minimum similarity.
new_topics = topic_model.reduce_outliers(docs, topics , strategy="c-tf-idf", threshold=0.1)

# count outliers
len([x for x in new_topics if x < 0])

6455

In [None]:
t = topics
#t = new_topics

new_topics = topic_model.reduce_outliers(docs, t , strategy="distributions", threshold=0.5)

# count outliers
len([x for x in new_topics if x < 0])

100%|██████████| 7/7 [00:22<00:00,  3.25s/it]


6231

In [None]:
t = topics
#t = new_topics
new_topics = topic_model.reduce_outliers(docs, t, strategy="embeddings", threshold=0.5)

# count outliers
len([x for x in new_topics if x < 0])

1988

In [None]:
t = topics
#t = new_topics

# the threshold is minimum probability when strategy="probabilities"
# NOTE(review): `probs` comes from fit_transform, and calculate_probabilities is
# set to False in the params section; this strategy presumably needs the full
# document-topic probability matrix (calculate_probabilities=True) - confirm
new_topics = topic_model.reduce_outliers(docs, t, strategy="probabilities", probabilities=probs, threshold=0.05)

# count outliers
len([x for x in new_topics if x < 0])

6678

#### update with new_topics

In [None]:
# update_topics falls back to its own defaults for every argument that is not
# passed (top_n_words defaults to 10, vectorizer_model/representation_model to
# None), so hand over the tuned settings again together with the new topics
topic_model.update_topics(docs, topics=new_topics,
                          vectorizer_model=vectorizer_model,
                          top_n_words=top_n_words,
                          representation_model=representation_model)



## 📊 Visualization

In [None]:
titles = [x[:100] for x in docs]
topics_to_visualize = range(20)

topic_model.visualize_documents(titles, reduced_embeddings=reduced_embeddings,
                                hide_annotations=True, hide_document_hover=False,
                                topics=topics_to_visualize,
                                #custom_labels=True
                                )

### Topics per Class

In [None]:
classes = df_reviews.wine.tolist()
topics_per_class = topic_model.topics_per_class(docs, classes=classes)

11it [00:00, 15.52it/s]


In [None]:
custom_labels = True
normalize_frequency = False

In [None]:
topic_model.visualize_topics_per_class(topics_per_class, top_n_topics=10,
                                       normalize_frequency = normalize_frequency,
                                       width=1000, height=500,
                                       custom_labels=custom_labels)

**Share of reviews**

In [None]:
total_freq = df_reviews.groupby('wine').count().id.to_dict() # num of documents per class

df = topics_per_class.assign(Frequency=topics_per_class.apply(lambda x: x.Frequency/total_freq[x.Class], axis=1))

sum([v for k,v in total_freq.items()]), len(docs)

(2760, 2760)

In [None]:
total_freq

{'Casillero del Diablo Cabernet Sauvignon (Reserva)': 472,
 'Casillero del Diablo Chardonnay (Reserva)': 122,
 'Cono Sur Bicicleta Reserva Unoaked Chardonnay': 109,
 'Frontera Cabernet Sauvignon': 207,
 "Kendall-Jackson Vintner's Reserve Cabernet Sauvignon": 121,
 'Long Barn Chardonnay': 142,
 'Montes Montes Alpha Cabernet Sauvignon': 220,
 'San Pedro Gato Negro Cabernet Sauvignon': 879,
 'San Pedro Gato Negro Chardonnay': 230,
 'Yellow Tail Cabernet Sauvignon': 136,
 'Yellow Tail Chardonnay': 122}

In [None]:
topic_model.visualize_topics_per_class(df, top_n_topics=10,
                                       normalize_frequency = normalize_frequency,
                                       width=1000, height=500,
                                       custom_labels=custom_labels)

# 🎚️ **Parameter Study**

In [6]:
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic.representation import KeyBERTInspired
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic


def bertopic_param_search(docs,
                   ### hyperparams
                   min_df=0.001, # CountVectorizer
                   max_df=1.0,
                   n_components=15,
                   n_neighbors=10,
                   min_dist=0.1, # UMAP
                   min_cluster_size=10,
                   min_samples=None, # HDBSCAN
                   ####
                   embedding_model=None,
                   embeddings=None,
                   ngram_range=(1, 3),
                   prediction_data=True,
                   gen_min_span_tree=True,
                   top_n_words=5,
                   calculate_probabilities=False,
                   random_state=42,
                   verbose=False,
                   return_model=False,
                   hdbscan_model=None
                   ):
    """
    Build the BERTopic sub-models from one parameter set and train on docs.

    docs: list of documents passed to BERTopic.fit_transform
    min_df, max_df, ngram_range: passed to CountVectorizer
                                 (stop_words is fixed to "english")
    n_components, n_neighbors, min_dist, random_state: passed to UMAP
                                 (metric is fixed to 'cosine')
    min_cluster_size, min_samples, prediction_data, gen_min_span_tree:
        passed to HDBSCAN; only used when hdbscan_model is None
    hdbscan_model: ready-made clustering model used instead of HDBSCAN
    embedding_model, top_n_words, calculate_probabilities, verbose:
        passed to BERTopic
    embeddings: precomputed document embeddings passed to fit_transform
    return_model: if True return the BERTopic instance, otherwise the
                  DataFrame of topic_model.get_topic_info()

    return: topic info DataFrame, or None if training raised an exception;
            with return_model=True the BERTopic instance is returned even
            when training failed (it is then not fitted)
    """
    import warnings

    #-- sub-models
    vectorizer_model = CountVectorizer(stop_words="english", ngram_range=ngram_range,
                                       min_df=min_df, max_df=max_df)

    umap_model = UMAP(
        n_components=n_components,
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        metric='cosine', random_state=random_state)

    if hdbscan_model is None:
        hdbscan_model = HDBSCAN(
            min_cluster_size=min_cluster_size,
            min_samples=min_samples,
            metric='euclidean',
            cluster_selection_method='eom',
            prediction_data=prediction_data,
            gen_min_span_tree=gen_min_span_tree
            )

    keybert = KeyBERTInspired()
    representation_model = {
        "KeyBERT": keybert
    }

    #-- train bertopic
    topic_model = BERTopic(
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        representation_model=representation_model,

        top_n_words=top_n_words,
        calculate_probabilities=calculate_probabilities,
        verbose=verbose
    )

    # Train model
    try:
        topic_model.fit_transform(docs, embeddings)
        df = topic_model.get_topic_info()
    except Exception as err:
        # catch Exception only, so that KeyboardInterrupt/SystemExit still stop
        # a long search, and report why this parameter set failed
        warnings.warn(f'BERTopic training failed: {err!r}')
        df = None

    if return_model:
        return topic_model
    else:
        return df


def get_topics(df, index, num_topics=10, cols = ['Topic', 'KeyBERT']):
    """
    get a row from df, the result of bertopic_param_search

    df: topic info DataFrame with a 'Topic' column and the columns in cols
    index: index of a param set, used as the row label of the result
    num_topics: max number of topics to keep, starting at topic 0
    cols: pair of column names; the first provides the column labels of the
          result and the second its values
    return: one-row DataFrame labelled `index` with one column per topic
    raise: IndexError if df has no topic 0
    """
    cols = list(cols)

    # get the position of topic 0 which might be 0 if no outlier.
    # A positional lookup is needed because the rows are sliced with iloc;
    # using the index label only works for a default RangeIndex
    is_topic0 = df['Topic'].eq(0).to_numpy()
    if not is_topic0.any():
        raise IndexError('topic 0 not found in topic info')
    i = int(is_topic0.argmax())

    picked = df.iloc[i:num_topics+i].loc[:, cols]
    picked = picked.rename(columns=dict(zip(cols, ['index', index])))
    return picked.set_index('index').transpose()

## Set params

In [7]:
params_study = {
    # When CountVectorizer building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold.
    # If float, the parameter represents a proportion of documents, integer absolute counts
    #'min_df': [1, 10], # error when 10 even if default max_df is 1.0
    'min_df': [0.001, 0.01],

    'max_df': [0.5, 1.0],

    # If you are interested in (density based) clustering, or other machine learning techniques,
    # it can be beneficial to pick a larger embedding dimension (say 10, or 50) closer to the dimension
    # of the underlying manifold on which your data lies.
    'n_components': [10, 20, 50],

    # how UMAP balances local versus global structure in the data.
    # low values of n_neighbors will force UMAP to concentrate on very local structure (potentially to the detriment of the big picture)
    'n_neighbors': [10, 20, 50],

    # the minimum distance apart that points are allowed to be in the low dimensional representation.
    # This means that low values of min_dist will result in clumpier embeddings.
    # This can be useful if you are interested in clustering, or in finer topological structure.
    # Larger values of min_dist will prevent UMAP from packing points together and will focus on the preservation of the broad topological structure instead.
    'min_dist': [0, 0.05, 0.1],

    # set it to the smallest size grouping that you wish to consider a cluster.
    # It can have slightly non-obvious effects with min_samples
    'min_cluster_size': [20, 50, 100],

    # The implementation defaults this value (if it is unspecified) to whatever min_cluster_size is set to.
    # The larger the value of min_samples you provide, the more conservative the clustering –
    # more points will be declared as noise, and clusters will be restricted to progressively more dense area
    # NOTE: the values given here are ratios of min_cluster_size; they are multiplied by
    # min_cluster_size and cast to int when the parameter grid (df_params) is built
    'min_samples': [0.1, 0.5, 1.]
}

In [8]:
# Embedding model: 12-layer, 384-hidden
st_id = 'all-MiniLM-L12-v2'

params_base = {
    'ngram_range': (1, 3),
    'top_n_words': 5,
    'random_state': 42,
}

Product param sets

In [9]:
from itertools import product

param_names = params_study.keys()

# full factorial grid: one row per combination of the studied values
param_values = list(product(*params_study.values()))

df_params = pd.DataFrame(param_values, columns=param_names)

if {'min_samples', 'min_cluster_size'} <= set(df_params.columns):
    # min_samples is given as a ratio of min_cluster_size; round before casting
    # because astype(int) alone truncates float products such as 28.999...
    df_params['min_samples'] = (df_params['min_samples']
                                .mul(df_params['min_cluster_size'])
                                .round()
                                .astype(int))
else:
    print('No min_samples')

df_params = df_params.reset_index(drop=True)
df_params.head(5)

Unnamed: 0,min_df,max_df,n_components,n_neighbors,min_dist,min_cluster_size,min_samples
0,0.001,0.5,10,10,0.0,20,2
1,0.001,0.5,10,10,0.0,20,10
2,0.001,0.5,10,10,0.0,20,20
3,0.001,0.5,10,10,0.0,50,5
4,0.001,0.5,10,10,0.0,50,25


import old study

In [None]:
file = 'wr_param_study_01.csv'

!cp {path_src}/{file} {path_data}

In [None]:
#df_result_old = pd.read_csv(f'{path_data}/{file}')
#df_result_old.head(5)

Unnamed: 0,min_df,n_components,n_neighbors,min_dist,min_cluster_size,min_samples,0,1,2,3,4,5,6,7,8,9
0,1,10,10,0.0,20,2,"['fruity', 'delicious fruity', 'fruity good', ...","['good value money', 'money good value', 'good...","['chilean cabernet sauvignon', 'cabernet sauvi...","['citrus tropical', 'citrus pear tropical', 'p...","['chilean wine', 'wine chile', 'chilean wine w...","['oak', 'oak oak', 'vanilla oak', 'oak vanilla...","['fruity scent', 'citrus scent', 'aromatic fru...","['håkon rekdal', 'like håkon rekdal', 'rekdal'...","['medium tannins', 'medium tannin', 'soft tann...","['easy drink good', 'drink easy', 'easy drinki..."
1,1,10,10,0.0,20,10,"['nice wine', 'good wine', 'great wine', 'exce...","['vanilla oak blackberry', 'oak blackberry van...","['nice chardonnay', 'good chardonnay', 'chardo...","['fruity', 'fruity good', 'tasty fruity', 'swe...","['sour taste', 'bitter', 'retrospect add taste...","['good value money', 'money good value', 'good...","['drink easy', 'easy drinking', 'easy drink', ...","['citrus tropical', 'citrus pear tropical', 'p...","['smooth dry', 'flavor smooth', 'dry smooth', ...","['cabernet sauvignon', 'chilean cabernet sauvi..."
2,1,10,10,0.0,20,20,"['good', 'great', 'excellent', 'bad', 'good va...","['nice chardonnay', 'chardonnay', 'good chardo...",,,,,,,,
3,1,10,10,0.0,50,5,"['good wine', 'wine', 'sauvignon', 'cabernet s...","['nice chardonnay', 'chardonnay', 'good chardo...",,,,,,,,
4,1,10,10,0.0,50,25,"['good', 'great', 'excellent', 'good value', '...","['nice chardonnay', 'chardonnay', 'good chardo...",,,,,,,,


drop param set studied before

In [None]:
# NOTE(review): df_result_old is only defined when the pd.read_csv lines in the
# cell above are uncommented; on a fresh kernel this cell raises NameError
df = df_result_old[df_params.columns]
# keep=False drops every row that occurs more than once, i.e. the parameter
# sets present in both frames
# NOTE(review): rows found only in the old study are kept as well - confirm
# that the old study never contains parameter sets outside the current grid
df_params = pd.concat([df_params, df]).drop_duplicates(keep=False)
len(df_params)

255

Update base params with embedding model

In [10]:
# no trailing comma here: `False,` would create the (truthy) tuple (False,)
show_progress_bar = False

embedding_model = SentenceTransformer(st_id)
embeddings = embedding_model.encode(docs, show_progress_bar=show_progress_bar)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/573 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

NameError: name 'docs' is not defined

In [None]:
params_base.update({
    'embedding_model': embedding_model,
    'embeddings': embeddings
})

## Search params

make batch

In [None]:
batch = 3 #0, 1, 2, 3
num_params = 250

# ceiling division, so that no empty trailing batch is generated when
# len(df_params) is an exact multiple of num_params
num_batches = -(-len(df_params) // num_params)

run_idx = [df_params.index[i*num_params:(i+1)*num_params] for i in range(num_batches)]
run_file = [f'a{i}' for i in range(1,len(run_idx)+1)]

df_params_b = df_params.loc[run_idx[batch]]
file = f'wr_param_study_{run_file[batch]}'

no batch

In [None]:
#df_params_b = df_params
#file = 'wr_param_study_01'

In [None]:
total = len(df_params_b)

# collect the one-row frames and concatenate them once after the loop instead
# of growing df_result on every iteration
results = []

for rec in tqdm(df_params_b.itertuples(), total=total):
    kwargs = rec._asdict()
    # itertuples stores the row label under 'Index'; it is not a search param
    idx = kwargs.pop('Index')
    kwargs.update(params_base)

    df = bertopic_param_search(docs, **kwargs)

    if df is None:
        # None means that training failed; stop here so that the failing
        # parameter set can be inspected
        print(f'param search failed for param set {idx}')
        break
    results.append(get_topics(df, idx))

df_result = pd.concat(results) if results else pd.DataFrame()

df_result = df_params_b.join(df_result)
df_result.head(5)

100%|██████████| 222/222 [2:02:42<00:00, 33.17s/it]


Unnamed: 0,min_df,max_df,n_components,n_neighbors,min_dist,min_cluster_size,min_samples,0,1,2,3,4,5,6,7,8,9
750,0.01,1.0,10,10,0.1,50,5,"[good wine, wine, good, great, excellent, sauv...","[nice chardonnay, chardonnay, good chardonnay,...",,,,,,,,
751,0.01,1.0,10,10,0.1,50,25,"[good wine, wine, wines, sauvignon, cabernet s...","[good value money, good value, excellent value...","[nice chardonnay, chardonnay, good chardonnay,...",,,,,,,
752,0.01,1.0,10,10,0.1,50,50,"[oak vanilla, oak blackberry, oak, vanilla, fl...","[good wine, nice wine, great wine, wine good, ...","[good value money, good value, excellent value...","[nice chardonnay, chardonnay, good chardonnay,...",,,,,,
753,0.01,1.0,10,10,0.1,100,10,"[good wine, excellent wine, nice wine, wine go...","[dry, dry dry, dry good, sweet dry, dry smooth...","[oak, oak oak, blackberry oak, vanilla oak, oa...","[good value money, excellent value money, good...","[fruity, fruity good, light fruity, fruity int...","[chilean cabernet sauvignon, cabernet sauvigno...","[drink easy, easy drinking, easy drink, drink ...",,,
754,0.01,1.0,10,10,0.1,100,50,"[oak vanilla, oak blackberry, oak, vanilla, fl...","[good wine, nice wine, wine good, wine, red wi...","[good value money, good value, excellent value...",,,,,,,


Join the param sets to the result if the param search was interrupted

In [None]:
#df_result = df_params_b.loc[df_result.index].join(df_result)
#len(df_result)

In [None]:
cond = df_result[0].isna()
print('num of param sets to be studied:', cond.sum())

#df_result = df_result.loc[~cond]

df_result.loc[cond].head()

num of param sets to be studied: 0


Unnamed: 0,min_df,max_df,n_components,n_neighbors,min_dist,min_cluster_size,min_samples,0,1,2,3,4,5,6,7,8,9


In [None]:
# compare topics
topic_id = 0
num_topics = 5

# print the leading keywords of the chosen topic for every parameter set
for i, rec in df_result.iterrows():
    print(f'{i:>2}:', ', '.join(rec[topic_id][:num_topics]))

In [None]:
#file = f'wr_param_study_{}'

f = f'{path_data}/{file}.csv'
df_result.to_csv(f, index = False)

!zip -j {file}.zip {f}
!cp {file}.zip {path_src}

  adding: wr_param_study_a4.csv (deflated 92%)


## Review result

In [9]:
file = 'wr_param_study_a'
!unzip {path_src}/{file}.zip -d {path_data}

Archive:  /content/drive/MyDrive/Colab Notebooks//wr_param_study_a.zip
  inflating: sample_data/wr_param_study_a1.csv  
  inflating: sample_data/wr_param_study_a2.csv  
  inflating: sample_data/wr_param_study_a3.csv  
  inflating: sample_data/wr_param_study_a4.csv  


In [10]:
import ast

df_result = read_csv(file, path_data)
n = range(10)

# the last 10 columns are the topic ids, read back from CSV as strings
df_result.columns = list(df_result.columns[:-10]) + list(df_result.columns[-10:].astype(int))


def _parse_keywords(cell):
    """Turn the text form of a keyword list back into a list; leave NaN as is."""
    # ast.literal_eval only accepts Python literals, so (unlike eval) content
    # of the CSV file can never be executed as code
    return ast.literal_eval(cell) if isinstance(cell, str) else cell


df_result.loc[:, n] = df_result.loc[:, n].apply(lambda col: col.map(_parse_keywords))

df_result.head()

Unnamed: 0,min_df,max_df,n_components,n_neighbors,min_dist,min_cluster_size,min_samples,0,1,2,3,4,5,6,7,8,9
0,0.001,0.5,10,10,0.0,20,2,"[fruit flavor, light fruity excellent, melon f...","[good value money, money good value, good valu...","[cabernet sauvignon, like cabernet sauvignon, ...","[citrus tropical, citrus pear tropical, pear c...","[chilean wine, wine chile, chilean wine wine, ...","[oak, vanilla oak, oak oak, oak vanilla, butte...","[fruity scent, olfactory aromatic fruity, citr...","[håkon rekdal, like håkon rekdal, rekdal like,...","[medium tannins, medium tannin, soft tannins, ...","[easy drink good, inexpensive easy drink, pric..."
1,0.001,0.5,10,10,0.0,20,10,"[nice wine, good wine, great wine, excellent w...","[oak blackberry oak, oak blackberry, blackberr...","[nice chardonnay, good chardonnay, chardonnay,...","[light fruity excellent, light fruity, good sm...","[retrospect add tasted, sour taste, add tasted...","[good value money, money good value, good valu...","[easy drinking, drink easy, easy drink, inexpe...","[citrus tropical, citrus pear tropical, pear c...","[delicious smooth, flavor smooth, smooth dry, ...","[ordinary cabernet sauvignon, cabernet sauvign..."
2,0.001,0.5,10,10,0.0,20,20,"[cherry blackberry, blackberry cherry, oak bla...","[chardonnay chile, chilean chardonnay, chardon...",,,,,,,,
3,0.001,0.5,10,10,0.0,50,5,"[chocolate oak, oak chocolate, cherry chocolat...","[chardonnay chile, chilean chardonnay, chardon...",,,,,,,,
4,0.001,0.5,10,10,0.0,50,25,"[plum blackberry, cherry blackberry, blackberr...","[chardonnay chile, chilean chardonnay, chardon...",,,,,,,,


In [None]:
# case 1: compare topic 0 of every param set
res_df = df_result[0].str.join(', ')
res_docs = res_df.to_list()

In [None]:
# case 2: compare models with 10 topics
n = range(10)

# join the keywords of every cell with ', ' and stack the topic columns into
# one Series indexed by (row label, topic column)
res_df = df_result[n].apply(lambda x: x.str.join(', ')).stack()

res_docs = []
res_docs_id = [] # save param set and topic id for res_docs

for idx, s in res_df.items():
    if s is None:
        break
    res_docs.append(s)
    # idx is the (row label, topic column) pair produced by stack()
    i = '_'.join([str(x) for x in idx])
    res_docs_id.append(i)
len(res_docs)

4629

In [11]:
# case 3: compare param sets
n = range(10)

# build one document per row of df_result: the keywords of each topic are
# joined with ', ' and the topics of a row are joined with '; '
# (missing topics become '' and are filtered out)
res_df = (df_result[n]
          .apply(lambda x: x.str.join(', '))
          .fillna('')
          .apply(lambda x: x.tolist(), axis=1)
          .apply(lambda row: '; '.join(filter(None, map(str, row))))
)

res_docs = res_df.tolist()
res_docs_id = res_df.index

len(res_docs)

972

In [12]:
res_docs[0]

'fruit flavor, light fruity excellent, melon flavors remarkable, light fruity, taste good, fruity good, fruity excellent, tasty fruity, delicious fruity, nice taste berries; good value money, money good value, good value, money excellent value, excellent value money, good great value, great value money, value money good, money good price, excellent value; cabernet sauvignon, like cabernet sauvignon, chilean cabernet sauvignon, diablo cabernet sauvignon, world cabernet sauvignon, red cabernet sauvignon, cabernet sauvignon rich, sauvignon, wines, chilean cabernet; citrus tropical, citrus pear tropical, pear citrus tropical, tropical citrus, thailand citrus pear, citrus tropical peach, citrus tropical lemon, thailand citrus, tropical lemon, apple pear citrus; chilean wine, wine chile, chilean wine wine, good chilean wine, chilean wines, classic chilean wine, chilean, chile, chilean red, american wine; oak, vanilla oak, oak oak, oak vanilla, butter oak, oak oaked, oak chocolate oak, oak oa

In [13]:
from sklearn.cluster import KMeans

# Clustering model handed to bertopic_param_search as `hdbscan_model`.
# NOTE(review): with None the search helper presumably falls back to its own default clusterer — confirm.
# KMeans alternative (needs min_cluster_size, which is only assigned in the following cell):
#cluster_model = KMeans(n_clusters=min_cluster_size)
cluster_model = None

In [14]:
# Settings reused by the 2-D UMAP projection cell (min_dist, random_state).
random_state = 42
min_cluster_size = 10
min_dist = 0.5

# Sentence-embedding model used to vectorise the keyword documents.
st_id = 'all-MiniLM-L12-v2'
embedding_model = SentenceTransformer(st_id)

# Fit a single topic model on the keyword documents and get the model back.
search_params = dict(
    min_df=0,
    max_df=1.0,
    n_components=15,
    n_neighbors=10,
    min_dist=min_dist,
    min_cluster_size=min_cluster_size,
    embedding_model=embedding_model,
    calculate_probabilities=True,
    random_state=random_state,
    hdbscan_model=cluster_model,
    return_model=True,
)
topic_model = bertopic_param_search(res_docs, **search_params)
topic_model.get_topic_info().head()

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/573 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,Representative_Docs
0,-1,145,-1_good_wine_oak_value,"[good, wine, oak, value, chardonnay]","[cabernet sauvignon, wine good wine, good wine...","[good wine, nice wine, excellent wine, wine go..."
1,0,325,0_wine_cabernet_sauvignon_cabernet sauvignon,"[wine, cabernet, sauvignon, cabernet sauvignon...","[like cabernet sauvignon, cabernet sauvignon g...","[chocolate oak, vanilla oak, oak vanilla, oak ..."
2,1,65,1_value_wine_value money_oak,"[value, wine, value money, oak, money]","[wine good value, wine wine good, wine good, g...","[chocolate oak, vanilla oak, oak vanilla, oak,..."
3,2,56,2_fruity_oak_citrus_tropical,"[fruity, oak, citrus, tropical, blackberry]","[wine nice wine, wine good wine, wine wine goo...","[nice wine, good wine, great wine, excellent w..."
4,3,55,3_chardonnay_blackberry_chile_cherry,"[chardonnay, blackberry, chile, cherry, chardo...","[blackberries chardonnay chile, chardonnay che...","[cherry blackberry, blackberry cherry, oak bla..."


In [15]:
# Print the number of topics and the share of documents assigned to the outlier topic (-1).
print_topic_info(topic_model)

num of topics: 21
outliers: 0.149


In [16]:
# Plot the topics of the fitted model (default arguments).
topic_model.visualize_topics()

In [None]:
#topic_model.visualize_distribution(probs[0])

In [19]:
from sentence_transformers import SentenceTransformer

# UMAP note: larger values of min_dist focus on preserving the broad topological structure.
# Values tried earlier: 0.1 and 0.9; the min_dist assigned in the model-fitting cell is used here.
embedding_model = SentenceTransformer(st_id)
res_embeddings = embedding_model.encode(res_docs, show_progress_bar=False)

# Project the embeddings to 2-D; the result is passed to visualize_documents.
umap_2d = UMAP(n_components=2, min_dist=min_dist, random_state=random_state)
res_reduced = umap_2d.fit_transform(res_embeddings)

In [20]:
# Text passed for each document: "<row id>: [values of the parameter columns]".
# Alternatives tried:
#   d = res_docs
#   d = [f'{i}: {rec[0]}' for i, rec in pd.DataFrame(res_df).iterrows()]
#   d = [f'{x}:{y}' for x, y in zip(res_docs_id, res_docs)]
param_cols = df_result.iloc[:, :-10]  # every column except the last 10
d = [f'{rec[0]}: {list(rec)[1:]}' for rec in param_cols.to_records()]
title = ', '.join(param_cols.columns)

topic_model.visualize_documents(
    d,
    reduced_embeddings=res_reduced,
    title=title,
    hide_annotations=True,
)

In [21]:
# Extract hierarchical topics and their representations, using the same
# res_docs that were passed to bertopic_param_search
hierarchical_topics = topic_model.hierarchical_topics(res_docs)

# Visualize these representations as a topic hierarchy
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

100%|██████████| 20/20 [00:00<00:00, 123.86it/s]


In [22]:
# Bar chart view of the topics (default arguments).
topic_model.visualize_barchart()

### Topics per Class

In [32]:
# Alternative: use the parameter-set id itself as the class
#classes = res_df.index.get_level_values(0)

# Use one search parameter (column position i of df_result) as the class of each document.
i = 2
title = df_result.columns[i]
classes = df_result.iloc[:, i].map(str)

# Sanity check: one class label per document, plus the number of distinct classes.
(len(res_docs), len(classes), classes.nunique())

(972, 972, 3)

In [47]:
# Topic frequencies split by the class labels selected in the previous cell.
topics_per_class = topic_model.topics_per_class(res_docs, classes=classes)

# Raw (non-normalized) frequencies; optional arguments not used here:
# top_n_topics=10, width=1000, height=500
fig = topic_model.visualize_topics_per_class(
    topics_per_class,
    normalize_frequency=False,
    title=title,
)
fig

In [107]:
from plotly.subplots import make_subplots

ncols = 3
# One subplot per column of df_result except the last 10
# (presumably the search parameters, the last 10 being topic columns — confirm).
subplot_titles = df_result.columns[:-10]

# Ceiling division: `len(...)//ncols+1` allocated an empty extra row whenever
# the number of subplots was an exact multiple of ncols.
nrows = max(1, -(-len(subplot_titles) // ncols))

fig = make_subplots(rows=nrows,
                    cols=ncols,
                    shared_xaxes=False,
                    horizontal_spacing=.1,
                    vertical_spacing=.4 / nrows if nrows > 1 else 0,
                    subplot_titles=subplot_titles)


row, col = 1, 1
for i in range(len(subplot_titles)):
    # Class label of every document = value of the i-th column, as a string.
    classes = df_result.iloc[:, i].apply(str)

    topics_per_class = topic_model.topics_per_class(res_docs, classes=classes)

    f = topic_model.visualize_topics_per_class(topics_per_class,
                                        #top_n_topics=10,
                                        normalize_frequency = False,
                                        #width=1000, height=500,
                                        )

    # Copy at most the first two topic traces into the grid. The built-in plot
    # marks the second trace 'legendonly', so force it visible. Slicing (instead
    # of f.data[0] / f.data[1]) avoids an IndexError with fewer than two topics.
    for trace in f.data[:2]:
        fig.add_trace(trace.update(visible=True), row=row, col=col)

    # Advance to the next grid cell: left to right, then top to bottom.
    if col == ncols:
        col = 1
        row += 1
    else:
        col += 1

In [106]:
# Inspect the second trace of `f`, the last per-class figure built in the loop.
f.data[1]

Bar({
    'hoverinfo': 'text',
    'hovertext': [<b>Topic 1</b><br>Words: value, wine, value money, money, oak,
                  <b>Topic 1</b><br>Words: value, wine, value money, oak, money,
                  <b>Topic 1</b><br>Words: value, oak, wine, money, value money,
                  <b>Topic 1</b><br>Words: bom, value, wine, good, good value],
    'marker': {'color': '#56B4E9'},
    'name': '1_value_wine_value money_oak',
    'orientation': 'h',
    'visible': 'legendonly',
    'x': array([20, 40,  3,  2]),
    'y': array(['50', '100', '2', '10'], dtype=object)
})

In [108]:
# Pixel size of one subplot; the figure is scaled by the grid dimensions.
width = height = 350

fig.update_layout(
    # template="plotly_white",
    showlegend=False,
    width=width * ncols,
    # a single-row grid gets 30% extra height
    height=height * (nrows if nrows > 1 else 1.3),
    hoverlabel={
        "bgcolor": "white",
        "font_size": 16,
        "font_family": "Rockwell",
    },
)

fig.update_xaxes(showgrid=True)
fig.update_yaxes(showgrid=True)

In [89]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

def custom_visualize_topics_per_class(topic_model, topics_per_class,
                                      top_n_topics=None, normalize_frequency=False):
    """
    Build a horizontal bar chart of topic frequency per class, one trace per topic.

    Every trace is created with visible=True and no layout (title, legend, size)
    is applied, so the traces can be copied into a subplot grid.

    topic_model: fitted model providing `topic_labels_` and `get_topic_freq()`
    topics_per_class: DataFrame with Topic, Words, Frequency and Class columns;
        a "Name" column is written into it in place
    top_n_topics: keep only the first n topics returned by `get_topic_freq()`
        (outlier topic -1 excluded); None keeps all of them
    normalize_frequency: if True, L2-normalize each topic's frequencies across classes
    return: plotly.graph_objects.Figure
    """
    colors = ["#E69F00", "#56B4E9", "#009E73", "#F0E442", "#D55E00", "#0072B2", "#CC79A7"]

    if normalize_frequency:
        # Imported locally: this notebook cell does not import `normalize`,
        # so the normalized branch would otherwise fail with a NameError.
        from sklearn.preprocessing import normalize

    # Topic labels, truncated to 40 characters for the legend.
    topic_names = {key: value[:40] + "..." if len(value) > 40 else value
                   for key, value in topic_model.topic_labels_.items()}
    # NOTE: mutates the caller's DataFrame (kept for backward compatibility).
    topics_per_class["Name"] = topics_per_class.Topic.map(topic_names)

    freq_df = topic_model.get_topic_freq()
    freq_df = freq_df.loc[freq_df.Topic != -1, :]  # drop the outlier topic

    topic_ids = freq_df.Topic.to_list()
    if top_n_topics is not None:
        topic_ids = topic_ids[:top_n_topics]
    selected_topics = sorted(topic_ids)

    data = topics_per_class.loc[topics_per_class.Topic.isin(selected_topics), :]

    fig = go.Figure()
    for index, topic in enumerate(selected_topics):
        trace_data = data.loc[data.Topic == topic, :]
        if trace_data.empty:
            # Topic has no rows in topics_per_class: nothing to draw
            # (previously raised IndexError on `.values[0]`).
            continue
        topic_name = trace_data.Name.values[0]
        words = trace_data.Words.values
        if normalize_frequency:
            x = normalize(trace_data.Frequency.values.reshape(1, -1))[0]
        else:
            x = trace_data.Frequency

        fig.add_trace(go.Bar(y=trace_data.Class,
                             x=x,
                             visible=True,
                             marker_color=colors[index % len(colors)],  # cycle through the palette
                             hoverinfo="text",
                             name=topic_name,
                             orientation="h",
                             hovertext=[f'<b>Topic {topic}</b><br>Words: {word}' for word in words]))

    return fig




ncols = 3
# One subplot per column of df_result except the last 10
# (presumably the search parameters, the last 10 being topic columns — confirm).
subplot_titles = df_result.columns[:-10]

# Ceiling division: `len(...)//ncols+1` allocated an empty extra row whenever
# the number of subplots was an exact multiple of ncols.
nrows = max(1, -(-len(subplot_titles) // ncols))

fig = make_subplots(rows=nrows,
                    cols=ncols,
                    shared_xaxes=False,
                    horizontal_spacing=.1,
                    vertical_spacing=.4 / nrows if nrows > 1 else 0,
                    subplot_titles=subplot_titles)

row, col = 1, 1
for i in range(len(subplot_titles)):
    # Class label of every document = value of the i-th column, as a string.
    classes = df_result.iloc[:, i].apply(str)
    topics_per_class = topic_model.topics_per_class(res_docs, classes=classes)

    f = custom_visualize_topics_per_class(topic_model, topics_per_class)

    # Copy at most the first two topic traces into the grid; slicing (instead of
    # f.data[0] / f.data[1]) avoids an IndexError with fewer than two topics.
    for trace in f.data[:2]:
        fig.add_trace(trace, row=row, col=col)

    # Advance to the next grid cell: left to right, then top to bottom.
    if col == ncols:
        col = 1
        row += 1
    else:
        col += 1



In [90]:
# Display the subplot grid assembled in the previous cell.
fig

In [85]:
# Ad-hoc check: add the first trace of the last per-class figure `f` at the grid
# position that `row`/`col` were left at when the loop finished.
# NOTE(review): relies on leftover loop state (f, row, col); re-running adds the trace again.
fig.add_trace(f.data[0], row=row, col=col)

In [74]:
# Pixel size of one subplot; the figure is scaled by the grid dimensions.
width = height = 350

fig.update_layout(
    # template="plotly_white",
    showlegend=True,
    width=width * ncols,
    # a single-row grid gets 30% extra height
    height=height * (nrows if nrows > 1 else 1.3),
    hoverlabel={
        "bgcolor": "white",
        "font_size": 16,
        "font_family": "Rockwell",
    },
)

fig.update_xaxes(showgrid=True)
fig.update_yaxes(showgrid=True)

fig

In [27]:
# Total frequency of topic `tid` in each class.
tid = 0
(topics_per_class[topics_per_class['Topic'] == tid]
 .groupby('Class')['Frequency']
 .sum())

Class
0.001    167
0.01     158
Name: Frequency, dtype: int64

In [None]:
# Rows of topic `tid` whose class is one of the parameter-set ids in `ps`.
tid = 0
ps = range(944, 948)
cond = topics_per_class['Topic'].eq(tid) & topics_per_class['Class'].isin(ps)
topics_per_class[cond]

Unnamed: 0,Topic,Words,Frequency,Class,Name
3294,0,"wine, sauvignon, cabernet sauvignon, cabernet,...",2,944,0_wine_sauvignon_cabernet_cabernet sauvi...
3296,0,"wine, sauvignon, cabernet, cabernet sauvignon,...",1,945,0_wine_sauvignon_cabernet_cabernet sauvi...
3298,0,"cabernet, wine, sauvignon, chilean, sauvignon ...",1,946,0_wine_sauvignon_cabernet_cabernet sauvi...
3302,0,"wine, sauvignon, cabernet, wine good, cabernet...",4,947,0_wine_sauvignon_cabernet_cabernet sauvi...


In [None]:
# Show the first 8 columns of df_result for the parameter sets in `ps`.
df_result.loc[ps].iloc[:,:8]

Unnamed: 0,min_df,max_df,n_components,n_neighbors,min_dist,min_cluster_size,min_samples,0
944,0.001,0.5,50,50,0.0,100,10,"[citrus peach, fruity good, citrus pear, pear ..."
945,0.001,0.5,50,50,0.0,100,50,"[fruity blackberry, blackberry vanilla cherry,..."
946,0.001,0.5,50,50,0.0,100,100,"[blackberry vanilla cherry, oak blackberry van..."
947,0.001,0.5,50,50,0.05,20,2,"[good value wine, wine good value, value wine,..."


In [None]:
# Positions in res_docs that belong to parameter set `tid`.
# Ids built per (param set, topic) look like "<param set>_<topic id>", so compare the
# whole first field: a bare startswith('94') would also match 940, 941, ...
# str() keeps this working when res_docs_id holds plain (non-string) row ids.
tid = 947
[i for i, x in enumerate(res_docs_id) if str(x).split('_')[0] == str(tid)]

[4533, 4534, 4535, 4536, 4537, 4538, 4539, 4540, 4541, 4542]

In [None]:
# Topic probability distribution of a single document, selected by its position in res_docs.
# NOTE(review): position 4542 presumably requires topic_model to have been fitted on the
# per-topic documents (4629 entries); with the 972 per-parameter-set documents it would be
# out of range — confirm which res_docs was used.
i = 4542
topic_model.visualize_distribution(topic_model.probabilities_[i])