<a href="https://colab.research.google.com/github/lbk209/topic_modeling/blob/main/tm_wine_reviews_params.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setting

In [1]:
%%capture
!pip install bertopic accelerate adjustText

In [2]:
import os
import pandas as pd
import plotly.express as px
from tqdm import tqdm

In [3]:
# to work with path names containing blanks:
# replace locale.getpreferredencoding for this session so that it always reports "UTF-8"
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [4]:
import os
import pandas as pd

def read_csv(file, path_data, **kwargs):
    """
    Load every file in `path_data` whose name starts with `file` into one DataFrame.

    file: file-name prefix to match (e.g. 'wine_reviews')
    path_data: directory that is scanned for matching files
    kwargs: keyword args for pd.read_csv

    Returns the row-wise concatenation of all matching files with a fresh
    0..n-1 index; an empty DataFrame if no file matches.
    """
    # os.listdir returns names in arbitrary order; sort them so that the row order
    # (and every id derived from the row position) is the same on each run.
    names = sorted(x for x in os.listdir(path_data) if x.startswith(file))
    if not names:
        return pd.DataFrame()

    # Read everything first and concatenate once instead of growing a frame in a loop.
    frames = [pd.read_csv(os.path.join(path_data, name), **kwargs) for name in names]
    return pd.concat(frames).reset_index(drop=True)

Run to mount Google Drive (the data files are copied from there below)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


run to upload files from local

In [None]:
from google.colab import files
uploaded = files.upload()

# 🗂️ Data

In [5]:
# Prefix shared by the review CSV files; also the name of the zip archive (without '.zip').
file = 'wine_reviews'
# Local directory the archive is extracted into and the CSV files are read from.
path_data = 'sample_data'
# Google Drive folder holding the archive. Raw string: the backslash is kept literally (without
# an invalid-escape warning) so the blank in "Colab Notebooks" stays escaped when the path is
# interpolated into `!` shell commands.
path_src = r'/content/drive/MyDrive/Colab\ Notebooks/'

In [6]:
!unzip {path_src}/{file}.zip -d {path_data}

Archive:  /content/drive/MyDrive/Colab Notebooks//wine_reviews.zip
  inflating: sample_data/wine_reviews_240124.csv  
  inflating: sample_data/wine_reviews_240207.csv  


In [7]:
# Load all review files and expose the row number as an explicit `id` column.
df_reviews = (
    read_csv(file, path_data, parse_dates=['date'])
    .rename_axis('id')
    .reset_index()
)
df_reviews.head()

Unnamed: 0,id,wid,wine,date,review,source,lang,review_transl
0,0,0,Casillero del Diablo Cabernet Sauvignon (Reserva),2024-01-24,Little too cherry on the front end for me,vivino,en,Little too cherry on the front end for me
1,1,0,Casillero del Diablo Cabernet Sauvignon (Reserva),2024-01-20,"En liten skarp knekk i smaken. Ok fredagsvin,m...",vivino,no,A small sharp crack in the taste. Ok Friday wi...
2,2,0,Casillero del Diablo Cabernet Sauvignon (Reserva),2024-01-19,Aight,vivino,en,Aight
3,3,0,Casillero del Diablo Cabernet Sauvignon (Reserva),2024-01-18,무난한 맛 가성비 좋은듯,vivino,ko,Good taste and good value for money
4,4,0,Casillero del Diablo Cabernet Sauvignon (Reserva),2024-01-17,oak cherry black cherry chocolate blackcurrant...,vivino,en,oak cherry black cherry chocolate blackcurrant...


## Review data

**reviews not translated**

In [8]:
import numpy as np

#df_reviews.loc[df_reviews.lang.str.contains('ERROR')]

cond = df_reviews.review_transl.isna()
df_reviews.loc[cond]

Unnamed: 0,id,wid,wine,date,review,source,lang,review_transl
1745,1745,7,Kendall-Jackson Vintner's Reserve Cabernet Sau...,2023-03-18,:-),vivino,<-- ERROR -->,
2019,2019,9,Frontera Cabernet Sauvignon,2022-05-21,😡,vivino,<-- ERROR -->,
2065,2065,9,Frontera Cabernet Sauvignon,2022-01-24,🤌🏻,vivino,<-- ERROR -->,
2352,2352,12,The 7th Generation - G7 Chardonnay,2022-01-31,🌵🍏🥝🧈,vivino,<-- ERROR -->,
2556,2556,13,San Pedro Gato Negro Chardonnay,2022-04-27,… …. ….,vivino,<-- ERROR -->,
2941,2941,18,Montes Montes Alpha Cabernet Sauvignon,2022-12-19,♡♡♡,vivino,<-- ERROR -->,
3057,3057,18,Montes Montes Alpha Cabernet Sauvignon,2021-12-24,.,vivino,<-- ERROR -->,


In [9]:
df_reviews = df_reviews.loc[~cond]

In [10]:
#df_reviews.groupby(['wid', 'wine']).review_transl.count()
df_reviews.groupby(['wid', 'wine']).id.count()

wid  wine                                                
0    Casillero del Diablo Cabernet Sauvignon (Reserva)       472
1    Yellow Tail Cabernet Sauvignon                          136
2    Roche Mazet Cuvée Spéciale Cabernet Sauvignon            61
3    San Pedro Gato Negro Cabernet Sauvignon                 879
4    Aguirre Dos Copas Cabernet Sauvignon                      7
5    The 7th Generation - G7 Cabernet Sauvignon               26
6    Casillero del Diablo Chardonnay (Reserva)               122
7    Kendall-Jackson Vintner's Reserve Cabernet Sauvignon    121
8    Viña Santa Helena Reservado Cabernet Sauvignon'          51
9    Frontera Cabernet Sauvignon                             207
10   Long Barn Chardonnay                                    142
11   Cono Sur Bicicleta Reserva Unoaked Chardonnay           109
12   The 7th Generation - G7 Chardonnay                       42
13   San Pedro Gato Negro Chardonnay                         230
14   Roche Mazet Cuvée Spéciale 

In [11]:
df_reviews.id.count()

3061

Select the wines with more than 100 reviews (`swid`); the remaining wines have only a small number of reviews

In [None]:
# Number of reviews per wine, indexed by wid (the wine name is kept as a column).
df = df_reviews.groupby(by=['wid','wine']).id.count().rename('count').reset_index(1)
# Keep only the wines that have more than 100 reviews.
swid = df.loc[df['count']>100]
swid

Unnamed: 0_level_0,wine,count
wid,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Casillero del Diablo Cabernet Sauvignon (Reserva),472
1,Yellow Tail Cabernet Sauvignon,136
3,San Pedro Gato Negro Cabernet Sauvignon,879
6,Casillero del Diablo Chardonnay (Reserva),122
7,Kendall-Jackson Vintner's Reserve Cabernet Sau...,121
9,Frontera Cabernet Sauvignon,207
10,Long Barn Chardonnay,142
11,Cono Sur Bicicleta Reserva Unoaked Chardonnay,109
13,San Pedro Gato Negro Chardonnay,230
15,Yellow Tail Chardonnay,122


In [None]:
#df_reviews = df_reviews.loc[df_reviews.wid.isin(swid.index)]
df_reviews.id.count()

3061

In [12]:
# DO NOT CHANGE the (document) id as it is index to topics of topic model (topic_model.topics_)
docs = df_reviews.review_transl.tolist()

# 🗨️ **BERTopic**

## Params

### Embedding

In [None]:
# 12-layer, 384-hidden
st_id = 'all-MiniLM-L12-v2'

### Dimensionality Reduction

In [None]:
n_components = 15
n_neighbors = 10

# the minimum distance apart that points are allowed to be in the low dimensional representation.
# This means that low values of min_dist will result in clumpier embeddings.
# This can be useful if you are interested in clustering, or in finer topological structure.
# Larger values of min_dist will prevent UMAP from packing points together and will focus on the preservation of the broad topological structure instead.
min_dist = 0 #0.1

random_state=42

### Clustering

In [None]:
# a lower min_cluster_size will generate more topics
min_cluster_size = 50

# The implementation defaults this value (if it is unspecified) to whatever min_cluster_size is set to.
# The larger the value of min_samples you provide, the more conservative the clustering – more points will be declared as noise,
# and clusters will be restricted to progressively more dense area
#min_samples = None
r = 0.6
min_samples = round(min_cluster_size*r)

# We need this to avoid an AttributeError when integrating our custom HDBSCAN step with BERTopic
prediction_data=True

# can improve the resultant clusters
gen_min_span_tree=True

### BERTopic

In [None]:
# top n words in combined documents in a cluster
top_n_words = 5 #10

# Calculate the probabilities of all topics per document instead of the probability of the assigned topic per document.
# This could slow down the extraction of topics if you have many documents (> 100_000).
calculate_probabilities=False

## **Sub-models**

In [None]:
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer(st_id)
embeddings = embedding_model.encode(docs, show_progress_bar=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Batches:   0%|          | 0/87 [00:00<?, ?it/s]

In [None]:
from umap import UMAP

umap_model = UMAP(
    n_components=n_components,
    n_neighbors=n_neighbors,
    min_dist=min_dist,
    metric='cosine', random_state=random_state)

In [None]:
# Pre-reduce embeddings for visualization purposes
reduced_embeddings = UMAP(n_neighbors=n_neighbors, n_components=2, min_dist=min_dist, metric='cosine', random_state=random_state).fit_transform(embeddings)

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


In [None]:
from hdbscan import HDBSCAN

hdbscan_model = HDBSCAN(
    min_cluster_size=min_cluster_size,
    min_samples=min_samples,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=prediction_data,
    gen_min_span_tree=gen_min_span_tree
    )

In [None]:
from bertopic.representation import KeyBERTInspired

keybert = KeyBERTInspired()

representation_model = {
    "KeyBERT": keybert
}

Pass a CountVectorizer before training the topic model to minimize the size of the resulting c-TF-IDF matrix:

In [None]:
min_df = 0.001
len(docs), min_df * len(docs)

(2760, 2.7600000000000002)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

#vectorizer_model = None
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 3), min_df=min_df)

## 🔥 **Training**

Now that we have our models prepared, we can start training our topic model! We supply BERTopic with the sub-models of interest, run `.fit_transform`, and see what kind of topics we get.

In [None]:
from bertopic import BERTopic

topic_model = BERTopic(
  # Sub-models
  embedding_model=embedding_model,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
  representation_model=representation_model,
  vectorizer_model=vectorizer_model,

  top_n_words=top_n_words,
  calculate_probabilities=calculate_probabilities,
  verbose=True
)

# Train model
topics, probs = topic_model.fit_transform(docs, embeddings)

2024-02-07 10:16:28,985 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-02-07 10:16:47,455 - BERTopic - Dimensionality - Completed ✓
2024-02-07 10:16:47,457 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-02-07 10:16:47,671 - BERTopic - Cluster - Completed ✓
2024-02-07 10:16:47,678 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-02-07 10:16:50,375 - BERTopic - Representation - Completed ✓


In [None]:
# num of topics
len(topic_model.get_topics()) - 1

2

In [None]:
# Show topics
topic_model.get_topic_info().head(7)

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,Representative_Docs
0,-1,96,-1_good_good good_gato_gato negro,"[good, good good, gato, gato negro, negro]","[good, good good, good good good, best, good l...","[Very good, Very good, Good]"
1,0,2573,0_wine_good_oak_taste,"[wine, good, oak, taste, red]","[good wine, wine, good, alcohol, bad, good val...","[Good wine!!, Good wine., A good wine.]"
2,1,91,1_chardonnay_citrus_acidity_good,"[chardonnay, citrus, acidity, good, tropical]","[nice chardonnay, chardonnay, good chardonnay,...","[Fruity Chardonnay, Nice chardonnay, Not my Ch..."


## 💦 **Post-processing**

### Vectorizer
Pass the CountVectorizer after training where llm used full context in training => ???

In [None]:
# Report whether a vectorizer was already supplied before training.
# Only a NameError (the variable was never defined) means "not assigned yet";
# any other exception should surface instead of being swallowed.
try:
    vectorizer_model
    print('vectorizer_model assigned before!')
except NameError:
    print('passing vectorizer_model ater training')

vectorizer_model assigned before!


In [None]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
#stop_additional = ['cabernet', 'sauvignon', 'cab', 'wine']
stop_additional = []

ngram_range = (1, 3)

# When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold.
# This value is also called cut-off in the literature.
# If float, the parameter represents a proportion of documents, integer absolute counts
min_df = 10

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

# Fine-tune topic representations after training BERTopic.
# You can pass the CountVectorizer before and after training your topic model.
# Passing it before training allows you to minimize the size of the resulting c-TF-IDF matrix.

# NLTK English stop words plus the project-specific additions.
# NOTE(review): this rebinds the name `stopwords` (the nltk corpus imported above) to a plain list;
# the cell only re-runs cleanly because the import is repeated in the same cell.
stopwords = list(stopwords.words('english')) + stop_additional

vectorizer_model = CountVectorizer(stop_words=stopwords, ngram_range=ngram_range, min_df=min_df)

# Recompute the topic representations of the fitted model with the new vectorizer.
topic_model.update_topics(docs, vectorizer_model=vectorizer_model, top_n_words=top_n_words)

In [None]:
topic_model.get_topic_info().head(7)

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,Representative_Docs
0,-1,580,-1_chardonnay_good_citrus_acidity,"[chardonnay, good, citrus, acidity, wine]","[good, great, best, nice, better, good cost, g...","[Good, Very good, Not my Chardonnay]"
1,0,1166,0_oak_vanilla_good_taste,"[oak, vanilla, good, taste, cherry]","[chocolate oak, oak blackberry, oak vanilla, o...",[oak cherry black cherry chocolate blackcurran...
2,1,781,1_wine_good_cabernet_red,"[wine, good, cabernet, red, price]","[good wine, wine good, nice wine, great wine, ...","[Very good wine, Good wine., Good wine!!]"
3,2,233,2_good_value_money_value money,"[good, value, money, value money, bad]","[good value money, good value, excellent value...","[Good value for money, Good value for money, V..."


### Outlier reduction

In [None]:
# Use the "c-TF-IDF" strategy with a threshold
# threshold is the minimum similarity.
new_topics = topic_model.reduce_outliers(docs, topics , strategy="c-tf-idf", threshold=0.1)

# count outliers
len([x for x in new_topics if x < 0])

6455

In [None]:
t = topics
#t = new_topics

new_topics = topic_model.reduce_outliers(docs, t , strategy="distributions", threshold=0.5)

# count outliers
len([x for x in new_topics if x < 0])

100%|██████████| 7/7 [00:22<00:00,  3.25s/it]


6231

In [None]:
t = topics
#t = new_topics
new_topics = topic_model.reduce_outliers(docs, t, strategy="embeddings", threshold=0.5)

# count outliers
len([x for x in new_topics if x < 0])

1988

In [None]:
t = topics
#t = new_topics

# the threshold is the minimum probability when strategy="probabilities"
# NOTE(review): `probs` comes from fit_transform of a model created with
# calculate_probabilities=False; BERTopic's documentation asks for
# calculate_probabilities=True for this strategy — confirm `probs` holds
# a probability per topic for every document before relying on this result.
new_topics = topic_model.reduce_outliers(docs, t, strategy="probabilities", probabilities=probs, threshold=0.05)

# count outliers (documents still assigned to a negative topic id)
len([x for x in new_topics if x < 0])

6678

#### update with new_topics

In [None]:
# Recompute the topic representations for the outlier-reduced assignments.
# update_topics falls back to its own defaults for every argument that is not passed,
# so hand over the vectorizer, representation model and top_n_words used so far
# to keep the topic descriptions comparable with the ones shown earlier.
topic_model.update_topics(docs, topics=new_topics,
                          vectorizer_model=vectorizer_model,
                          representation_model=representation_model,
                          top_n_words=top_n_words)



# 📊 Visualization

In [None]:
titles = [x[:100] for x in docs]
topics_to_visualize = range(20)

topic_model.visualize_documents(titles, reduced_embeddings=reduced_embeddings,
                                hide_annotations=True, hide_document_hover=False,
                                topics=topics_to_visualize,
                                #custom_labels=True
                                )

## Topics per Class

In [None]:
classes = df_reviews.wine.tolist()
topics_per_class = topic_model.topics_per_class(docs, classes=classes)

11it [00:00, 15.52it/s]


In [None]:
custom_labels = True
normalize_frequency = False

In [None]:
topic_model.visualize_topics_per_class(topics_per_class, top_n_topics=10,
                                       normalize_frequency = normalize_frequency,
                                       width=1000, height=500,
                                       custom_labels=custom_labels)

**Share of reviews**

In [None]:
# Number of documents per class (wine).
total_freq = df_reviews.groupby('wine').id.count().to_dict()

# Divide every topic frequency by the review count of its wine to get the share of reviews.
df = topics_per_class.assign(
    Frequency=lambda d: d.apply(lambda row: row.Frequency / total_freq[row.Class], axis=1)
)

# Sanity check: the per-class counts should add up to the number of documents.
sum(total_freq.values()), len(docs)

(2760, 2760)

In [None]:
total_freq

{'Casillero del Diablo Cabernet Sauvignon (Reserva)': 472,
 'Casillero del Diablo Chardonnay (Reserva)': 122,
 'Cono Sur Bicicleta Reserva Unoaked Chardonnay': 109,
 'Frontera Cabernet Sauvignon': 207,
 "Kendall-Jackson Vintner's Reserve Cabernet Sauvignon": 121,
 'Long Barn Chardonnay': 142,
 'Montes Montes Alpha Cabernet Sauvignon': 220,
 'San Pedro Gato Negro Cabernet Sauvignon': 879,
 'San Pedro Gato Negro Chardonnay': 230,
 'Yellow Tail Cabernet Sauvignon': 136,
 'Yellow Tail Chardonnay': 122}

In [None]:
topic_model.visualize_topics_per_class(df, top_n_topics=10,
                                       normalize_frequency = normalize_frequency,
                                       width=1000, height=500,
                                       custom_labels=custom_labels)

# 🎚️ **Parameter Study**

In [61]:
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic.representation import KeyBERTInspired
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic


def bertopic_param_search(docs,
                   ### hyperparams
                   min_df=0.001, # CountVectorizer
                   max_df=1.0,
                   n_components=15,
                   n_neighbors=10,
                   min_dist=0.1, # UMAP
                   min_cluster_size=10,
                   min_samples=None, # HDBSCAN
                   ####
                   embedding_model=None,
                   embeddings=None,
                   ngram_range=(1, 3),
                   prediction_data=True,
                   gen_min_span_tree=True,
                   top_n_words=5,
                   calculate_probabilities=False,
                   random_state=42,
                   verbose=False,
                   return_model=False,
                   hdbscan_model=None
                   ):
    """
    Fit one BERTopic model for a single hyper-parameter set.

    docs: list of documents to model
    min_df, max_df, ngram_range: passed to CountVectorizer (English stop words removed)
    n_components, n_neighbors, min_dist, random_state: passed to UMAP (cosine metric)
    min_cluster_size, min_samples, prediction_data, gen_min_span_tree: passed to HDBSCAN
        (euclidean metric, 'eom' cluster selection); ignored when `hdbscan_model` is given
    embedding_model, embeddings: embedding model and optional pre-computed embeddings
    top_n_words, calculate_probabilities, verbose: passed to BERTopic
    return_model: if True, return the BERTopic instance instead of the topic table
    hdbscan_model: optional ready-made clustering model used in place of HDBSCAN

    Returns topic_model.get_topic_info(), or None if fitting failed. With
    return_model=True the BERTopic instance is returned even when fitting failed;
    a warning is issued in that case.
    """
    import warnings

    #-- sub-models
    vectorizer_model = CountVectorizer(stop_words="english", ngram_range=ngram_range,
                                       min_df=min_df, max_df=max_df)

    umap_model = UMAP(
        n_components=n_components,
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        metric='cosine', random_state=random_state)

    if hdbscan_model is None:
        hdbscan_model = HDBSCAN(
            min_cluster_size=min_cluster_size,
            min_samples=min_samples,
            metric='euclidean',
            cluster_selection_method='eom',
            prediction_data=prediction_data,
            gen_min_span_tree=gen_min_span_tree
            )

    representation_model = {
        "KeyBERT": KeyBERTInspired()
    }

    #-- train bertopic
    topic_model = BERTopic(
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        representation_model=representation_model,

        top_n_words=top_n_words,
        calculate_probabilities=calculate_probabilities,
        verbose=verbose
    )

    # Train model
    try:
        topic_model.fit_transform(docs, embeddings)
        df = topic_model.get_topic_info()
    except Exception as exc:
        # Callers test for None, so keep that contract, but do not hide the reason.
        # KeyboardInterrupt is deliberately not caught so a long search can be stopped.
        warnings.warn(f'BERTopic fitting failed: {type(exc).__name__}: {exc}')
        df = None

    if return_model:
        return topic_model
    return df


def get_topics(df, index, num_topics=10, cols = ['Topic', 'KeyBERT']):
    """
    Reshape the topic table of one parameter set into a single row.

    df: topic table, the result of bertopic_param_search
    index: index of a param set; becomes the index label of the returned row
    num_topics: maximum number of topics to keep, starting at topic 0
    cols: [topic-id column, representation column] to extract

    Returns a one-row DataFrame whose columns are the topic ids and whose cells hold
    the representation of each topic. The row has no columns when df contains no
    topic 0 (e.g. every document ended up as an outlier).
    """
    cols = list(cols)
    # Find topic 0 by position (row 0 unless an outlier topic precedes it), so the
    # positional slice below is correct whatever the index labels of df are.
    is_topic0 = (df['Topic'] == 0).to_numpy()
    start = int(is_topic0.argmax()) if is_topic0.any() else len(df)

    picked = df.iloc[start:start + num_topics].loc[:, cols]
    renamed = picked.rename(columns=dict(zip(cols, ['index', index])))
    return renamed.set_index('index').transpose()

## Set params

In [14]:
# Values to try for each hyper-parameter; every combination of them is studied.
params_study = {
    # When building the vocabulary, CountVectorizer ignores terms that have a document frequency strictly lower than the given threshold.
    # If float, the parameter represents a proportion of documents; if integer, absolute counts.
    #'min_df': [1, 10], # error when 10 even if default max_df is 1.0
    'min_df': [0.001, 0.01],

    # upper document-frequency bound passed to CountVectorizer (max_df)
    'max_df': [0.5, 1.0],

    # If you are interested in (density based) clustering, or other machine learning techniques,
    # it can be beneficial to pick a larger embedding dimension (say 10, or 50) closer to the dimension
    # of the underlying manifold on which your data lies.
    'n_components': [10, 20, 50],

    # how UMAP balances local versus global structure in the data.
    # low values of n_neighbors will force UMAP to concentrate on very local structure (potentially to the detriment of the big picture)
    'n_neighbors': [10, 20, 50],

    # the minimum distance apart that points are allowed to be in the low dimensional representation.
    # This means that low values of min_dist will result in clumpier embeddings.
    # This can be useful if you are interested in clustering, or in finer topological structure.
    # Larger values of min_dist will prevent UMAP from packing points together and will focus on the preservation of the broad topological structure instead.
    'min_dist': [0, 0.05, 0.1],

    # set it to the smallest size grouping that you wish to consider a cluster.
    # It can have slightly non-obvious effects with min_samples
    'min_cluster_size': [20, 50, 100],

    # The implementation defaults this value (if it is unspecified) to whatever min_cluster_size is set to.
    # The larger the value of min_samples you provide, the more conservative the clustering –
    # more points will be declared as noise, and clusters will be restricted to progressively more dense area
    # NOTE: the values here are ratios of min_cluster_size; they are multiplied by
    # min_cluster_size to get absolute counts when the parameter grid is built.
    'min_samples': [0.1, 0.5, 1.]
}

In [15]:
# Embedding model: 12-layer, 384-hidden
st_id = 'all-MiniLM-L12-v2'

params_base = {
    'ngram_range': (1, 3),
    'top_n_words': 5,
    'random_state': 42,
}

Product param sets

In [16]:
from itertools import product

# One row per combination of the values listed in params_study.
param_names = list(params_study.keys())
param_values = list(product(*params_study.values()))

df_params = pd.DataFrame(param_values, columns=param_names)

# min_samples is given as a ratio of min_cluster_size; turn it into an absolute count.
# Round (as the single-run setting does) rather than truncate, so that float noise
# such as 0.29 * 100 = 28.999... does not lose a sample.
if {'min_samples', 'min_cluster_size'} <= set(df_params.columns):
    df_params['min_samples'] = (
        df_params['min_samples'].mul(df_params['min_cluster_size']).round().astype(int)
    )
else:
    print('No min_samples')

df_params = df_params.reset_index(drop=True)
df_params.head(5)

Unnamed: 0,min_df,max_df,n_components,n_neighbors,min_dist,min_cluster_size,min_samples
0,0.001,0.5,10,10,0.0,20,2
1,0.001,0.5,10,10,0.0,20,10
2,0.001,0.5,10,10,0.0,20,20
3,0.001,0.5,10,10,0.0,50,5
4,0.001,0.5,10,10,0.0,50,25


import old study

In [None]:
file = 'wr_param_study_01.csv'

!cp {path_src}/{file} {path_data}

In [None]:
#df_result_old = pd.read_csv(f'{path_data}/{file}')
#df_result_old.head(5)

Unnamed: 0,min_df,n_components,n_neighbors,min_dist,min_cluster_size,min_samples,0,1,2,3,4,5,6,7,8,9
0,1,10,10,0.0,20,2,"['fruity', 'delicious fruity', 'fruity good', ...","['good value money', 'money good value', 'good...","['chilean cabernet sauvignon', 'cabernet sauvi...","['citrus tropical', 'citrus pear tropical', 'p...","['chilean wine', 'wine chile', 'chilean wine w...","['oak', 'oak oak', 'vanilla oak', 'oak vanilla...","['fruity scent', 'citrus scent', 'aromatic fru...","['håkon rekdal', 'like håkon rekdal', 'rekdal'...","['medium tannins', 'medium tannin', 'soft tann...","['easy drink good', 'drink easy', 'easy drinki..."
1,1,10,10,0.0,20,10,"['nice wine', 'good wine', 'great wine', 'exce...","['vanilla oak blackberry', 'oak blackberry van...","['nice chardonnay', 'good chardonnay', 'chardo...","['fruity', 'fruity good', 'tasty fruity', 'swe...","['sour taste', 'bitter', 'retrospect add taste...","['good value money', 'money good value', 'good...","['drink easy', 'easy drinking', 'easy drink', ...","['citrus tropical', 'citrus pear tropical', 'p...","['smooth dry', 'flavor smooth', 'dry smooth', ...","['cabernet sauvignon', 'chilean cabernet sauvi..."
2,1,10,10,0.0,20,20,"['good', 'great', 'excellent', 'bad', 'good va...","['nice chardonnay', 'chardonnay', 'good chardo...",,,,,,,,
3,1,10,10,0.0,50,5,"['good wine', 'wine', 'sauvignon', 'cabernet s...","['nice chardonnay', 'chardonnay', 'good chardo...",,,,,,,,
4,1,10,10,0.0,50,25,"['good', 'great', 'excellent', 'good value', '...","['nice chardonnay', 'chardonnay', 'good chardo...",,,,,,,,


drop param set studied before

In [None]:
# NOTE(review): df_result_old is only created by the read_csv lines of the previous cell, which are
# commented out — on a fresh kernel this cell raises NameError. Re-enable them or skip this cell.
# Restrict the old result to the parameter columns of the current study.
df = df_result_old[df_params.columns]
# Stack both tables and drop every row that occurs more than once, i.e. parameter sets present in both.
# NOTE(review): rows that exist only in the old study stay in df_params as well — confirm this is intended.
df_params = pd.concat([df_params, df]).drop_duplicates(keep=False)
len(df_params)

255

Update base params with embedding model

In [None]:
# No trailing comma here: `False,` would create the tuple (False,), which is truthy
# and therefore switches the progress bar on instead of off.
show_progress_bar = False

embedding_model = SentenceTransformer(st_id)
embeddings = embedding_model.encode(docs, show_progress_bar=show_progress_bar)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/573 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/96 [00:00<?, ?it/s]

In [None]:
params_base.update({
    'embedding_model': embedding_model,
    'embeddings': embeddings
})

## Search params

make batch

In [None]:
batch = 3 #0, 1, 2, 3  (0-based number of the batch to run in this session)
num_params = 250  # parameter sets per batch

# Ceiling division: `len // num_params + 1` would append an empty batch whenever the
# number of parameter sets is an exact multiple of num_params. Keep at least one batch.
num_batches = max(1, -(-len(df_params) // num_params))
run_idx = [df_params.index[i*num_params:(i+1)*num_params] for i in range(num_batches)]
run_file = [f'a{i}' for i in range(1, num_batches + 1)]

# Parameter sets of the selected batch and the base name of its result file.
df_params_b = df_params.loc[run_idx[batch]]
file = f'wr_param_study_{run_file[batch]}'

no batch

In [None]:
#df_params_b = df_params
#file = 'wr_param_study_01'

In [None]:
# Run the search for every parameter set of the batch and collect one result row per set.
rows = []

total = len(df_params_b)

for rec in tqdm(df_params_b.itertuples(), total=total):
    idx = rec[0]
    kwargs = rec._asdict()
    kwargs.pop('Index', None)
    kwargs.update(params_base)

    df = bertopic_param_search(docs, **kwargs)

    if df is None:
        # Fitting failed: stop here. Parameter sets without a result
        # show up as NaN after the join below.
        break
    rows.append(get_topics(df, idx))

# Concatenate once instead of growing df_result inside the loop.
df_result = pd.concat(rows) if rows else pd.DataFrame()

df_result = df_params_b.join(df_result)
df_result.head(5)

100%|██████████| 222/222 [2:02:42<00:00, 33.17s/it]


Unnamed: 0,min_df,max_df,n_components,n_neighbors,min_dist,min_cluster_size,min_samples,0,1,2,3,4,5,6,7,8,9
750,0.01,1.0,10,10,0.1,50,5,"[good wine, wine, good, great, excellent, sauv...","[nice chardonnay, chardonnay, good chardonnay,...",,,,,,,,
751,0.01,1.0,10,10,0.1,50,25,"[good wine, wine, wines, sauvignon, cabernet s...","[good value money, good value, excellent value...","[nice chardonnay, chardonnay, good chardonnay,...",,,,,,,
752,0.01,1.0,10,10,0.1,50,50,"[oak vanilla, oak blackberry, oak, vanilla, fl...","[good wine, nice wine, great wine, wine good, ...","[good value money, good value, excellent value...","[nice chardonnay, chardonnay, good chardonnay,...",,,,,,
753,0.01,1.0,10,10,0.1,100,10,"[good wine, excellent wine, nice wine, wine go...","[dry, dry dry, dry good, sweet dry, dry smooth...","[oak, oak oak, blackberry oak, vanilla oak, oa...","[good value money, excellent value money, good...","[fruity, fruity good, light fruity, fruity int...","[chilean cabernet sauvignon, cabernet sauvigno...","[drink easy, easy drinking, easy drink, drink ...",,,
754,0.01,1.0,10,10,0.1,100,50,"[oak vanilla, oak blackberry, oak, vanilla, fl...","[good wine, nice wine, wine good, wine, red wi...","[good value money, good value, excellent value...",,,,,,,


Join the parameter sets to the result if the parameter search was interrupted

In [None]:
#df_result = df_params_b.loc[df_result.index].join(df_result)
#len(df_result)

In [None]:
cond = df_result[0].isna()
print('num of param sets to be studied:', cond.sum())

#df_result = df_result.loc[~cond]

df_result.loc[cond].head()

num of param sets to be studied: 0


Unnamed: 0,min_df,max_df,n_components,n_neighbors,min_dist,min_cluster_size,min_samples,0,1,2,3,4,5,6,7,8,9


In [None]:
# compare topics: print the first `num_topics` keywords of topic `topic_id` for every parameter set
topic_id = 0
num_topics = 5

# A plain loop instead of a list comprehension that is built only for its print side effect.
for i, words in df_result[topic_id].items():
    if not isinstance(words, (list, tuple)):
        continue  # parameter set without a result (NaN), e.g. after an interrupted search
    print(f'{i:>2}:', ', '.join(words[:num_topics]))

In [None]:
#file = f'wr_param_study_{}'

f = f'{path_data}/{file}.csv'
df_result.to_csv(f, index = False)

!zip -j {file}.zip {f}
!cp {file}.zip {path_src}

  adding: wr_param_study_a4.csv (deflated 92%)


## Review result

In [17]:
file = 'wr_param_study_a'
!unzip {path_src}/{file}.zip -d {path_data}

Archive:  /content/drive/MyDrive/Colab Notebooks//wr_param_study_a.zip
  inflating: sample_data/wr_param_study_a1.csv  
  inflating: sample_data/wr_param_study_a2.csv  
  inflating: sample_data/wr_param_study_a3.csv  
  inflating: sample_data/wr_param_study_a4.csv  


In [18]:
import ast

df_result = read_csv(file, path_data)
n = range(10)

# The last ten columns are the topic ids 0..9; the CSV header stores them as strings.
df_result.columns = list(df_result.columns[:-10]) + list(df_result.columns[-10:].astype(int))

# Each topic cell was written to the CSV as the text of a Python list. Parse it with
# ast.literal_eval, which only accepts literals, instead of eval, which would execute
# whatever the file contains. Cells that are not text (missing topics, NaN) are left as they are.
df_result.loc[:, n] = df_result.loc[:, n].applymap(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

df_result.head()

Unnamed: 0,min_df,max_df,n_components,n_neighbors,min_dist,min_cluster_size,min_samples,0,1,2,3,4,5,6,7,8,9
0,0.001,1.0,10,10,0.0,100,50,"[oak blackberry, oak vanilla, oak, black cherr...","[good wine, excellent wine, nice wine, wine go...","[good value money, good value, good value exce...","[chilean cabernet sauvignon, cabernet sauvigno...",,,,,,
1,0.001,1.0,10,10,0.0,100,100,"[oak cherry, vanilla oak, oak vanilla, oak bla...","[good wine, nice wine, wine good, great wine, ...","[good value money, excellent value money, good...",,,,,,,
2,0.001,1.0,10,10,0.05,20,2,"[good wine, great wine, nice wine, light wine,...","[good value money, good value, money excellent...","[spicy, fruity spicy, light spice, spice, litt...","[nice chardonnay, good chardonnay, chardonnay,...","[bad, decent, quite good average, really bad e...","[smooth dry, flavor smooth, dry smooth, delici...","[quite sour honestly, quite sour, sour taste, ...","[drink easy, easy drinking, easy drink, drink ...","[mart 2022 08, 2021, 2023, 2022, money good dr...","[dark ruby red, medium ruby color, intense rub..."
3,0.001,1.0,10,10,0.05,20,10,"[good wine, nice wine, excellent wine, wine go...","[vanilla oak blackberry, oak blackberry vanill...","[good value money, money good value, good valu...","[nice chardonnay, good chardonnay, chardonnay,...","[smooth dry, dry smooth, sour dry, flavor smoo...","[sour taste, bitter, retrospect add tasted, sw...","[bad, really good worse, just bad, decent, qui...","[fruity, fruity good, tasty fruity, sweet frui...","[oak, oak oak, vanilla oak, oak vanilla, oak o...","[citrus tropical, citrus pear tropical, pear c..."
4,0.001,1.0,10,10,0.05,20,20,"[vanilla oak, oak vanilla, oak blackberry, oak...","[good wine, wine good, nice wine, excellent wi...","[cabernet sauvignon, chilean cabernet sauvigno...","[nice chardonnay, chardonnay, good chardonnay,...","[good value money, money good value, good valu...","[lotte mart 2022, good emart 4980won, mids 201...","[chilean wine, wine chile, chilean wines, chil...","[bad, really good worse, just bad, bad just aw...","[quite good, quite good good, good quite good,...","[quite acidic overly, good acidic really, high..."


In [None]:
# case 1: compare the topic-0 keywords of all parameter sets
res_df = df_result[0].str.join(', ')
res_docs = res_df.to_list()

In [19]:
# case 2: compare the first 10 topics of every parameter set
n = range(10)

# One comma-joined keyword string per (parameter set, topic id). Missing topics are
# dropped explicitly, so no None check inside a loop is needed (the former
# `if s is None: break` would have silently cut off every remaining entry).
res_df = df_result[n].apply(lambda x: x.str.join(', ')).stack().dropna()

res_docs = res_df.tolist()
# save param set and topic id for res_docs, joined as '<param set>_<topic id>'
res_docs_id = ['_'.join(str(x) for x in idx) for idx in res_df.index]
len(res_docs)

4629

In [20]:
res_docs[0]

'oak blackberry, oak vanilla, oak, black cherry, blackberry, strawberry, cherry, citrus, ruby, peach'

In [41]:
from sentence_transformers import SentenceTransformer

# Fit a BERTopic model on the keyword strings in res_docs
# (presumably to group similar topics across parameter sets — confirm).
random_state = 42
min_dist = 0.9

st_id = 'all-MiniLM-L12-v2'
embedding_model = SentenceTransformer(st_id)
#embeddings = embedding_model.encode(res_docs, show_progress_bar=True)

umap_model = UMAP(min_dist=min_dist, random_state=random_state)
# NOTE(review): min_cluster_size is not set in this cell; it relies on a value left over from an earlier cell.
# NOTE(review): hdbscan_model is created here but not passed to BERTopic below, so it has no
# effect on this model — confirm whether `hdbscan_model=hdbscan_model` was intended.
hdbscan_model = HDBSCAN(min_cluster_size=min_cluster_size)

topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model = umap_model,
    calculate_probabilities=True
    )

# No pre-computed embeddings are passed, so the documents are embedded during fitting.
topics, probs = topic_model.fit_transform(res_docs)

topic_model.get_topic_info().head()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,604,-1_chocolate_vanilla_oak_cherry,"[chocolate, vanilla, oak, cherry, everyday, bl...","[oak cherry, chocolate oak, vanilla oak, oak v..."
1,0,208,0_sour_smooth_dry_bitter,"[sour, smooth, dry, bitter, add, little, taste...","[flavor smooth, bitter smooth, slightly sour, ..."
2,1,148,1_sauvignon_cabernet_australian_rich,"[sauvignon, cabernet, australian, rich, diablo...","[ordinary cabernet sauvignon, cabernet sauvign..."
3,2,99,2_2022_mart_2023_2021,"[2022, mart, 2023, 2021, lotte, 08, 21, mids, ...","[mart 2022 08, 2021, mart 2022, 2023, 2022, mo..."
4,3,89,3_price_quality_profitably_buy,"[price, quality, profitably, buy, best, perfor...","[price good quality, good value excellent, pri..."


In [45]:
# Number of real topics: every row of the topic table except the outlier
# topic (-1). Filtering on the id stays correct when the model produced no
# outlier row, unlike subtracting 1 from the row count.
int((topic_model.get_topic_info()['Topic'] != -1).sum())

129

In [79]:
from sklearn.cluster import KMeans

min_dist = 0.9
min_cluster_size = 50

st_id = 'all-MiniLM-L12-v2'
embedding_model = SentenceTransformer(st_id)

# Clustering model forwarded as `hdbscan_model`. None is passed for this run;
# the KMeans line is kept as the alternative to switch to.
#cluster_model = KMeans(n_clusters=min_cluster_size)
cluster_model = None

# All keyword arguments of this single run, collected in one place.
search_kwargs = dict(
    min_df=0,
    max_df=1.0,
    n_components=15,
    n_neighbors=10,
    min_dist=min_dist,
    min_cluster_size=min_cluster_size,
    embedding_model=embedding_model,
    calculate_probabilities=True,
    random_state=42,
    hdbscan_model=cluster_model,
    return_model=True,
)
topic_model = bertopic_param_search(res_docs, **search_kwargs)
topic_model.get_topic_info().head()

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,Representative_Docs
0,-1,1165,-1_good_chardonnay_citrus_tropical,"[good, chardonnay, citrus, tropical, blackberry]","[pear citrus tropical, citrus tropical citrus,...","[citrus tropical, citrus pear tropical, pear c..."
1,0,961,0_wine_sauvignon_cabernet_cabernet sauvignon,"[wine, sauvignon, cabernet, cabernet sauvignon...","[wine good wine, wine excellent wine, good win...","[good wine, wine good, wine good wine, excelle..."
2,1,889,1_oak_blackberry_cherry_vanilla,"[oak, blackberry, cherry, vanilla, fruity]","[oak blackberry vanilla, oak vanilla blackberr...","[oak blackberry vanilla, chocolate oak, oak ch..."
3,2,511,2_easy_drink_easy drink_sour,"[easy, drink, easy drink, sour, taste]","[drinker easy drink, easy drinker easy, drinke...","[drink easy, easy drinking, easy drink, drink ..."
4,3,302,3_value_money_value money_good,"[value, money, value money, good, excellent]","[value money excellent, value money good, mone...","[good value money, good value, good value exce..."


In [80]:
topic_info = topic_model.get_topic_info()
is_outlier = topic_info['Topic'] == -1

# Count topics by id rather than "rows - 1": the outlier row (-1) is not
# guaranteed to exist, which the ratio below already accounts for.
n_topics = int((~is_outlier).sum())
print(f'num of topics: {n_topics}')

# Share of documents assigned to the outlier topic; 0 when there is no outlier
# row (the filtered sum is then 0) or when the table holds no documents.
n_docs = topic_info['Count'].sum()
n_outlier_docs = topic_info.loc[is_outlier, 'Count'].sum()
outlier_ratio = n_outlier_docs / n_docs if n_docs else 0
print(f'outliers: {outlier_ratio:.3f}')

num of topics: 13
outliers: 0.252


In [81]:
# Render BERTopic's topic overview plot for the model fitted above.
topic_model.visualize_topics()

In [None]:
#topic_model.visualize_distribution(probs[0])

In [82]:
from sentence_transformers import SentenceTransformer

# min_dist: larger values favour preserving the broad topological structure
# (0.1 is the alternative that was tried here).
min_dist = 0.9

# Embed every document with the sentence-transformer named by st_id.
embedding_model = SentenceTransformer(st_id)
res_embeddings = embedding_model.encode(res_docs, show_progress_bar=False)

# Project the embeddings down to two dimensions for plotting.
reducer_2d = UMAP(n_components=2, random_state=random_state, min_dist=min_dist)
res_reduced = reducer_2d.fit_transform(res_embeddings)

In [83]:
# Prefix every document with its "<param set>_<topic>" id so each plotted
# document can be traced back to where it came from.
d = [f'{doc_id}:{doc}' for doc_id, doc in zip(res_docs_id, res_docs)]

# Plot the labelled documents on the precomputed 2-D embedding.
topic_model.visualize_documents(
    d,
    reduced_embeddings=res_reduced,
    hide_annotations=True,
)

In [84]:
# Extract hierarchical topics and their representations. res_docs is the same
# corpus that was passed to bertopic_param_search when this model was built.
hierarchical_topics = topic_model.hierarchical_topics(res_docs)

# Visualize the hierarchy computed above.
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

100%|██████████| 12/12 [00:00<00:00, 122.29it/s]


In [85]:
# Bar chart view of the fitted topics, with BERTopic's default settings.
topic_model.visualize_barchart()

### Topics per Class

In [86]:
# param set as class
classes = res_df.index.get_level_values(0)

In [87]:
# Sanity check: documents and class labels must have the same length; the
# last value is the number of distinct parameter sets among them.
len(res_docs), len(classes), len(set(classes))

(4629, 4629, 972)

In [88]:
# Topic words and frequency per class, i.e. per parameter set.
topics_per_class = topic_model.topics_per_class(res_docs, classes=classes)

In [89]:
# Plot the per-class topic frequencies without normalisation.
# The commented-out keyword arguments are optional display tweaks.
topic_model.visualize_topics_per_class(topics_per_class,
                                       #top_n_topics=10,
                                       normalize_frequency = False,
                                       #width=1000, height=500
                                       )

In [90]:
# Topic id to inspect and the parameter sets (classes) to compare it across.
tid = 0
ps = range(944, 948)

# Keep the rows of that topic that belong to one of the chosen classes.
is_topic = topics_per_class['Topic'] == tid
in_param_sets = topics_per_class['Class'].isin(ps)
topics_per_class[is_topic & in_param_sets]

Unnamed: 0,Topic,Words,Frequency,Class,Name
3294,0,"wine, sauvignon, cabernet sauvignon, cabernet,...",2,944,0_wine_sauvignon_cabernet_cabernet sauvi...
3296,0,"wine, sauvignon, cabernet, cabernet sauvignon,...",1,945,0_wine_sauvignon_cabernet_cabernet sauvi...
3298,0,"cabernet, wine, sauvignon, chilean, sauvignon ...",1,946,0_wine_sauvignon_cabernet_cabernet sauvi...
3302,0,"wine, sauvignon, cabernet, wine good, cabernet...",4,947,0_wine_sauvignon_cabernet_cabernet sauvi...


In [94]:
# Settings of the same parameter sets: the first 8 columns are the seven
# search parameters plus the keyword list of topic 0.
df_result.loc[ps].iloc[:,:8]

Unnamed: 0,min_df,max_df,n_components,n_neighbors,min_dist,min_cluster_size,min_samples,0
944,0.001,0.5,50,50,0.0,100,10,"[citrus peach, fruity good, citrus pear, pear ..."
945,0.001,0.5,50,50,0.0,100,50,"[fruity blackberry, blackberry vanilla cherry,..."
946,0.001,0.5,50,50,0.0,100,100,"[blackberry vanilla cherry, oak blackberry van..."
947,0.001,0.5,50,50,0.05,20,2,"[good value wine, wine good value, value wine,..."


In [138]:
# NOTE: tid holds a parameter-set id here (it was a topic id further up).
tid = 947
# Positions in res_docs of every topic of that parameter set. Ids look like
# "<param set>_<topic>", so the separator has to be part of the prefix;
# without it a shorter id such as 94 would also match 940-949.
[pos for pos, doc_id in enumerate(res_docs_id) if doc_id.startswith(f'{tid}_')]

[4533, 4534, 4535, 4536, 4537, 4538, 4539, 4540, 4541, 4542]

In [142]:
# Position in res_docs of the document to inspect; 4542 is the last entry of
# the list printed by the previous cell (a topic of parameter set 947).
i = 4542
# Plot the topic probabilities stored by the model for that document.
topic_model.visualize_distribution(topic_model.probabilities_[i])