<a href="https://colab.research.google.com/github/lbk209/topic_modeling/blob/main/tm_wine_reviews_cab6_02_emb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 📦 Install Packages

In [None]:
# for gpu runtime
%%capture
!pip install bertopic datasets accelerate bitsandbytes xformers adjustText

In [1]:
# for cpu runtime
%%capture
!pip install bertopic accelerate adjustText

Restart session if error occurs. you might need to copy data again

In [2]:
# test import
from bertopic import BERTopic

import accelerate

from torch import cuda
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
print(device)

cpu


In [None]:
# test import
import bitsandbytes

ModuleNotFoundError: No module named 'bitsandbytes'

In [3]:
import os
import pandas as pd
import plotly.express as px
from tqdm import tqdm

In [4]:
# to work with path name having blank
import locale
locale.getpreferredencoding = lambda: "UTF-8"

run if no connection to google drive yet

# 📄 **Data**

## Import review data

from local

In [None]:
from google.colab import files
uploaded = files.upload()

from google drive

In [None]:
# use colab menu if possible
from google.colab import drive
drive.mount('/content/drive')

In [None]:
path_wr = 'sample_data'
file_wr_zip = 'wine_reviews_cab6.zip'
file_wr = 'wine_%s.csv'

file = f'/content/drive/MyDrive/Colab\ Notebooks/{file_wr_zip}'
!cp {file} .

In [None]:
!unzip {file_wr_zip} -d {path_wr}

Archive:  wine_reviews_cab6.zip
  inflating: sample_data/wine_0.csv  
  inflating: sample_data/wine_1.csv  
  inflating: sample_data/wine_2.csv  
  inflating: sample_data/wine_3.csv  
  inflating: sample_data/wine_4.csv  
  inflating: sample_data/wine_5.csv  


## Merge wines

In [None]:
def find_files(directory, pattern):
    files = []
    for root, _, filenames in os.walk(directory):
        for filename in filenames:
            if filename.startswith(pattern.split('%s')[0]) and filename.endswith(pattern.split('%s')[1]):
                files.append(os.path.join(root, filename))
    return sorted(files)


df_reviews = pd.DataFrame()

for f in find_files(path_wr, file_wr):
    df = pd.read_csv(f, parse_dates=['date'])
    df_reviews = pd.concat([df_reviews if not df_reviews.empty else None, df])
    #break

df_reviews = df_reviews.reset_index(drop=True)

df_reviews.head(5)

Unnamed: 0,id,wine,date,review
0,0,Casillero del Diablo Cabernet Sauvignon (Reserva),2024-01-24,Little too cherry on the front end for me
1,0,Casillero del Diablo Cabernet Sauvignon (Reserva),2024-01-20,"En liten skarp knekk i smaken. Ok fredagsvin,m..."
2,0,Casillero del Diablo Cabernet Sauvignon (Reserva),2024-01-19,Aight
3,0,Casillero del Diablo Cabernet Sauvignon (Reserva),2024-01-18,무난한 맛 가성비 좋은듯
4,0,Casillero del Diablo Cabernet Sauvignon (Reserva),2024-01-17,oak cherry black cherry chocolate blackcurrant...


In [None]:
df_reviews = df_reviews.rename(columns={"id": "wid"}).reset_index(names=['id'])
df_reviews.head(5)

Unnamed: 0,id,wid,wine,date,review
0,0,0,Casillero del Diablo Cabernet Sauvignon (Reserva),2024-01-24,Little too cherry on the front end for me
1,1,0,Casillero del Diablo Cabernet Sauvignon (Reserva),2024-01-20,"En liten skarp knekk i smaken. Ok fredagsvin,m..."
2,2,0,Casillero del Diablo Cabernet Sauvignon (Reserva),2024-01-19,Aight
3,3,0,Casillero del Diablo Cabernet Sauvignon (Reserva),2024-01-18,무난한 맛 가성비 좋은듯
4,4,0,Casillero del Diablo Cabernet Sauvignon (Reserva),2024-01-17,oak cherry black cherry chocolate blackcurrant...


In [None]:
df_reviews.date.dt.year.value_counts().sort_index(ascending=False)

2024.0     29
2023.0    627
2022.0    954
2021.0     59
2020.0      2
2019.0      1
2017.0      1
Name: date, dtype: int64

In [None]:
df_reviews = df_reviews.loc[df_reviews.date.dt.year >= 2022]

In [None]:
def plot_reviews_by_wine(df_reviews, col1='wine', col2='id',
                         title='Reviews for Wines',
                         ylabel='number of reviews'):

    colormap = px.colors.sequential.YlGnBu

    fig = px.bar(
        df_reviews.groupby(col1)[[col2]].count()\
            .sort_values(col2, ascending = False),
        text_auto = 'd',
        title = title,
        labels = {'value': ylabel},
        width=800
    )

    fig.update_traces(marker_color=colormap[6], marker_line_color=colormap[6],
                    marker_line_width=1.5, opacity=0.9)

    fig.update_traces(textfont_size=12, textangle=0, textposition="outside", cliponaxis=False)

    fig.update_layout(showlegend = False)

    return fig


plot_reviews_by_wine(df_reviews)

## Translating comments

In [None]:
!pip install langdetect deep-translator

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting deep-translator
  Downloading deep_translator-1.11.4-py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993225 sha256=4f98b8bde4a21728d01cd251f9537cb6f3a51e057dcfbe804e87be883f32c1c9
  Stored in directory: /root/.cache/pip/wheels/95/03/7d/59ea870c70ce4e5a370638b5462a7711ab78fba2f655d05106
Successfully built langdetect
Installing collected packages: langdetect, deep-translator
Successfully installed deep-translator-1.11.4 langdetect-1.0.9


In [None]:
import langdetect
import json
from deep_translator import GoogleTranslator

def detect_language(text):
    try:
        return langdetect.detect(text)
    except KeyboardInterrupt as e:
        raise(e)
    except:
        return '<-- ERROR -->'

def get_translation(text):
    try:
        return GoogleTranslator(source='auto', target='en').translate(str(text))
    except KeyboardInterrupt as e:
        raise(e)
    except:
        return '<-- ERROR -->'

In [None]:
tmp_data = []

for rec in tqdm(df_reviews.to_dict('records')):
    tmp_data.append(
        {'id': rec['id'],
        'lang': detect_language(rec['review']),
        'review_transl': get_translation(rec['review'])}
    )

100%|██████████| 1610/1610 [15:56<00:00,  1.68it/s]


In [None]:
df_reviews = df_reviews.merge(pd.DataFrame(tmp_data))
df_reviews.head()

Unnamed: 0,id,wid,wine,date,review,lang,review_transl
0,0,0,Casillero del Diablo Cabernet Sauvignon (Reserva),2024-01-24,Little too cherry on the front end for me,en,Little too cherry on the front end for me
1,1,0,Casillero del Diablo Cabernet Sauvignon (Reserva),2024-01-20,"En liten skarp knekk i smaken. Ok fredagsvin,m...",no,A small sharp crack in the taste. Ok Friday wi...
2,2,0,Casillero del Diablo Cabernet Sauvignon (Reserva),2024-01-19,Aight,en,Aight
3,3,0,Casillero del Diablo Cabernet Sauvignon (Reserva),2024-01-18,무난한 맛 가성비 좋은듯,ko,Good taste and good value for money
4,4,0,Casillero del Diablo Cabernet Sauvignon (Reserva),2024-01-17,oak cherry black cherry chocolate blackcurrant...,en,oak cherry black cherry chocolate blackcurrant...


In [None]:
path = 'sample_data'
file = 'wine_reviews_cab6_transl'

df_reviews.to_csv(f'{path}/{file}.csv', index = False)

In [None]:
!zip -j {file}.zip {path}/{file}.csv

  adding: wine_reviews_cab6_transl.csv (deflated 69%)


In [None]:
path = '/content/drive/MyDrive/Colab\ Notebooks/'
!cp {file}.zip {path}

#### Import saved

In [None]:
path = 'sample_data'
file = 'wine_reviews_cab6_transl'

In [None]:
!unzip {file}.zip -d {path}

Archive:  wine_reviews_cab6_transl.zip
replace sample_data/wine_reviews_cab6_transl.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
import pandas as pd
f = f'{path}/{file}.csv'
df_reviews = pd.read_csv(f, parse_dates=['date'])
df_reviews.head()

Unnamed: 0,id,wid,wine,date,review,lang,review_transl
0,0,0,Casillero del Diablo Cabernet Sauvignon (Reserva),2024-01-24,Little too cherry on the front end for me,en,Little too cherry on the front end for me
1,1,0,Casillero del Diablo Cabernet Sauvignon (Reserva),2024-01-20,"En liten skarp knekk i smaken. Ok fredagsvin,m...",no,A small sharp crack in the taste. Ok Friday wi...
2,2,0,Casillero del Diablo Cabernet Sauvignon (Reserva),2024-01-19,Aight,en,Aight
3,3,0,Casillero del Diablo Cabernet Sauvignon (Reserva),2024-01-18,무난한 맛 가성비 좋은듯,ko,Good taste and good value for money
4,4,0,Casillero del Diablo Cabernet Sauvignon (Reserva),2024-01-17,oak cherry black cherry chocolate blackcurrant...,en,oak cherry black cherry chocolate blackcurrant...


## Clearing not meaningful comments

### Languages

In [None]:
fig = px.bar(100*df_reviews.lang.value_counts(normalize = True).head(10), text_auto = '.2f',
    labels = {'value': 'share of reviews, %', 'index': 'language'},
    title = 'Top reviews languages',
            width=800)
colormap = px.colors.sequential.YlGnBu

fig.update_layout(showlegend = False)

fig.update_traces(marker_color=colormap[6], marker_line_color=colormap[6],
                  marker_line_width=1.5, opacity=0.9)

fig.update_traces(textfont_size=12, textangle=0, textposition="outside", cliponaxis=False)

In [None]:
#df_reviews.lang.value_counts()

cond = df_reviews.lang.str.contains('ERROR')
df_reviews.loc[cond].review.value_counts().head(10)

2020                      2
…… …. …… ……. …… …… …….    1
7,900                     1
👌🏻                        1
👍                         1
🥰                         1
🔝                         1
🥴                         1
😌                         1
@                         1
Name: review, dtype: int64

drop reviews of translation error

In [None]:
df_reviews = df_reviews.loc[~cond]

### Length of review

In [None]:
# strips any leading or trailing whitespace characters
df_reviews['review_transl'] = df_reviews['review_transl'].map(lambda x: str(x).strip())

df_reviews['review_len'] = df_reviews.review_transl.map(lambda x: len(x))
df_reviews.head()

Unnamed: 0,id,wid,wine,date,review,lang,review_transl,review_len
0,0,0,Casillero del Diablo Cabernet Sauvignon (Reserva),2024-01-24,Little too cherry on the front end for me,en,Little too cherry on the front end for me,41
1,1,0,Casillero del Diablo Cabernet Sauvignon (Reserva),2024-01-20,"En liten skarp knekk i smaken. Ok fredagsvin,m...",no,A small sharp crack in the taste. Ok Friday wi...,62
2,2,0,Casillero del Diablo Cabernet Sauvignon (Reserva),2024-01-19,Aight,en,Aight,5
3,3,0,Casillero del Diablo Cabernet Sauvignon (Reserva),2024-01-18,무난한 맛 가성비 좋은듯,ko,Good taste and good value for money,35
4,4,0,Casillero del Diablo Cabernet Sauvignon (Reserva),2024-01-17,oak cherry black cherry chocolate blackcurrant...,en,oak cherry black cherry chocolate blackcurrant...,77


In [None]:
fig = px.histogram(df_reviews, x="review_len",
                  nbins=250, range_x = [0, 500],
                  histnorm = 'percent',
                  labels = {'review_len': 'number of characters', 'percent': 'share of reviews, %'},
                  title = 'Number of characters in review',
                  width=800)

fig.update_traces(marker_color=colormap[6],
                  opacity=0.9)
fig

In [None]:
fig = px.histogram(df_reviews, x="review_len", color = 'wine',
                  nbins=500,
                   #range_x = [0, 1000],
                  labels = {'review_len': 'number of characters', 'percent': 'share of reviews, %'},
                  title = 'Number of characters in review',
                  #width=800
                   )

fig

*We can look at the most common examples to see whether there’s not much information in such comments or not.*

In [None]:
df_reviews.review_transl.map(lambda x: x.lower().strip()).value_counts().head(10)

very good     11
good           9
bom            7
tasty          5
great          5
bueno          4
good value     3
chocolate      3
delicious      3
ok             3
Name: review_transl, dtype: int64

In [None]:
min_len = 10
cond = df_reviews.review_len < min_len

x = df_reviews[cond].shape[0],
y = df_reviews.shape[0]
z = df_reviews[cond].shape[0]/df_reviews.shape[0]

print(f'{x} / {y} = {z:.4f}')

(156,) / 1581 = 0.0987


we can filter out all comments shorter than x symbols — y out of z reviews (w%). Then, we will analyse only long statements with more context. It’s an arbitrary threshold based on examples, you can try a couple of levels and see what texts are filtered out.


In [None]:
df_reviews['length_group'] = df_reviews.review_len.map(
    lambda x: f'<= {min_len}' if x < min_len else f'> {min_len}')

It’s worth checking whether this filter disproportionally affects some hotels. Shares of short comments are pretty close for different categories. So, the data looks OK.

In [None]:
len_stats_df = df_reviews.pivot_table(index = 'wine', values = 'id',
              columns = 'length_group', aggfunc = 'count')

len_stats_df['total'] = len_stats_df.sum(axis = 1)
len_stats_df = len_stats_df.sort_values('total', ascending = False)
px.bar(
    len_stats_df.apply(lambda x: 100.*x/len_stats_df.total).drop('total', axis = 1),
    text_auto = '.2f',
    color_discrete_map = {
          '<= 20': colormap[2],
          '> 20': colormap[5]
      }, title = "Reviews' length by hotel",
    labels = {'value': 'share of reviews, %', 'course_id': 'course',
             'length_group': 'review length'},
    width=800
)

In [None]:
#df_reviews2 = df_reviews[df_reviews.review_len > min_len].drop('length_group', axis = 1)
#df_reviews2.head()

In [None]:
path = 'sample_data'
file = 'wine_reviews_cab6_transl2'

df_reviews.to_csv(f'{path}/{file}.csv', index = False)

In [None]:
!zip -j {file}.zip {path}/{file}.csv

  adding: wine_reviews_cab6_transl2.csv (deflated 69%)


In [None]:
path = '/content/drive/MyDrive/Colab\ Notebooks/'
!cp {file}.zip {path}

## Create Docs

### Import saved

In [None]:
from google.colab import files
uploaded = files.upload()

In [5]:
file = 'wine_reviews_cab6_transl2'
path = 'sample_data'
path_src = '/content/drive/MyDrive/Colab\ Notebooks/'

In [6]:
!unzip {path_src}/{file}.zip -d {path}

Archive:  /content/drive/MyDrive/Colab Notebooks//wine_reviews_cab6_transl2.zip
  inflating: sample_data/wine_reviews_cab6_transl2.csv  


In [7]:
import pandas as pd
f = f'{path}/{file}.csv'
df_reviews = pd.read_csv(f, parse_dates=['date'])
df_reviews.head()

Unnamed: 0,id,wid,wine,date,review,lang,review_transl,review_len,length_group
0,0,0,Casillero del Diablo Cabernet Sauvignon (Reserva),2024-01-24,Little too cherry on the front end for me,en,Little too cherry on the front end for me,41,> 10
1,1,0,Casillero del Diablo Cabernet Sauvignon (Reserva),2024-01-20,"En liten skarp knekk i smaken. Ok fredagsvin,m...",no,A small sharp crack in the taste. Ok Friday wi...,62,> 10
2,2,0,Casillero del Diablo Cabernet Sauvignon (Reserva),2024-01-19,Aight,en,Aight,5,<= 10
3,3,0,Casillero del Diablo Cabernet Sauvignon (Reserva),2024-01-18,무난한 맛 가성비 좋은듯,ko,Good taste and good value for money,35,> 10
4,4,0,Casillero del Diablo Cabernet Sauvignon (Reserva),2024-01-17,oak cherry black cherry chocolate blackcurrant...,en,oak cherry black cherry chocolate blackcurrant...,77,> 10


In [8]:
# DO NOT CHANGE the (document) id as it is index to topics of topic model (topic_model.topics_)
docs = df_reviews.review_transl.tolist()

# 🗨️ **BERTopic**

Before we can start with topic modeling, we will first need to perform two steps:
* Pre-calculating Embeddings
* Defining Sub-models

## **Preparing Embeddings**

By pre-calculating the embeddings for each document, we can speed-up additional exploration steps and use the embeddings to quickly iterate over BERTopic's hyperparameters if needed.

🔥 **TIP**: You can find a great overview of good embeddings for clustering on the [MTEB Leaderboard](https://huggingface.co/spaces/mteb/leaderboard).

In [None]:
#docs = df_data.reviews_transl.tolist()

In [None]:
# Pre-calculate embeddings

# a 6 layer version of microsoft/MiniLM-L12-H384-uncased
# mapping sentences & paragraphs to a 384 dimensional dense vector space and can be used for tasks like clustering or semantic search.
st_id = 'all-MiniLM-L6-v2'


In [None]:
# 12-layer, 384-hidden
st_id = 'all-MiniLM-L12-v2'

In [None]:
#st_id = "BAAI/bge-small-en" # Recommend switching to newest BAAI/bge-small-en-v1.5
st_id = 'BAAI/bge-small-en-v1.5' # dim 384, seq len 512 (# of chars?)

In [None]:
# dim 1024, seq len 512
st_id = 'BAAI/bge-large-en-v1.5'

In [None]:
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer(st_id)

embeddings = embedding_model.encode(docs, show_progress_bar=True)

Batches:   0%|          | 0/50 [00:00<?, ?it/s]

## **Sub-models**

Next, we will define all sub-models in BERTopic and do some small tweaks to the number of clusters to be created, setting random states, etc.

In [None]:
n_components = 15 # dimensionality
n_neighbors = 10

min_cluster_size = 10

top_n_words = 10 # top n words in combined documents in a cluster

n_components/len(docs)

0.009487666034155597

In [None]:
from umap import UMAP
from hdbscan import HDBSCAN

# dimensionality reduction
umap_model = UMAP(
    n_components=n_components,
    n_neighbors=n_neighbors,
    min_dist=0.0,
    metric='cosine', random_state=42)

# clustering algorithm
hdbscan_model = HDBSCAN(
    # a lower min_cluster_size will generate more topics
    min_cluster_size=min_cluster_size,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True
    )

As a small bonus, we are going to reduce the embeddings we created before to 2-dimensions so that we can use them for visualization purposes when we have created our topics.

In [None]:
# Pre-reduce embeddings for visualization purposes
reduced_embeddings = UMAP(n_neighbors=n_neighbors, n_components=2, min_dist=0.0, metric='cosine', random_state=42).fit_transform(embeddings)

### **Representation Models**

One of the ways we are going to represent the topics is with Llama 2 which should give us a nice label. However, we might want to have additional representations to view a topic from multiple angles.

Here, we will be using c-TF-IDF as our main representation and [KeyBERT](https://maartengr.github.io/BERTopic/getting_started/representation/representation.html#keybertinspired), [MMR](https://maartengr.github.io/BERTopic/getting_started/representation/representation.html#maximalmarginalrelevance), and [Llama 2](https://maartengr.github.io/BERTopic/getting_started/representation/llm.html) as our additional representations.

In [None]:
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, TextGeneration

# KeyBERT
keybert = KeyBERTInspired()

# MMR
mmr = MaximalMarginalRelevance(diversity=0.3)

# Text generation with Llama 2
#llama2 = TextGeneration(generator, prompt=prompt)
#llama2desc = TextGeneration(generator, prompt=prompt_desc)

# All representation models
representation_model = {
    "KeyBERT": keybert,
    #"Llama2": llama2,
    #"Llama2Desc": llama2desc,
    "MMR": mmr,
}

In [None]:
# for fast training to assess the number of topics
from bertopic.representation import KeyBERTInspired


representation_model = KeyBERTInspired()

# 🔥 **Training**

Now that we have our models prepared, we can start training our topic model! We supply BERTopic with the sub-models of interest, run `.fit_transform`, and see what kind of topics we get.

## Generate BERTopic

In [None]:
from bertopic import BERTopic

topic_model = BERTopic(

  # Sub-models
  embedding_model=embedding_model,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
  representation_model=representation_model,

  # Hyperparameters
  top_n_words=top_n_words,
  verbose=True,

  # Calculate the probabilities of all topics per document instead of the probability of the assigned topic per document.
  # This could slow down the extraction of topics if you have many documents (> 100_000).
  calculate_probabilities=True
)

# Train model
topics, probs = topic_model.fit_transform(
    docs,
    embeddings # need embedding_model
    )

2024-01-29 09:59:37,882 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-01-29 09:59:47,421 - BERTopic - Dimensionality - Completed ✓
2024-01-29 09:59:47,424 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-01-29 09:59:47,761 - BERTopic - Cluster - Completed ✓
2024-01-29 09:59:47,769 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-01-29 10:02:17,082 - BERTopic - Representation - Completed ✓


In [None]:
len(topic_model.get_topics()) - 1

32

In [None]:
# Show topics
topic_model.get_topic_info().head(7)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,524,-1_cabernet_fruity_sauvignon_flavor,"[cabernet, fruity, sauvignon, flavor, taste, c...","[Great value for money! Intense, fruity, dry, ..."
1,0,139,0_blackcurrant_chocolate_blackberries_cherry,"[blackcurrant, chocolate, blackberries, cherry...",[vanilla oak cherry chocolate blackberry red f...
2,1,92,1_fruity_fruit_fruits_strawberry,"[fruity, fruit, fruits, strawberry, raspberry,...","[Fruity, Fruity and light, Fruity and light]"
3,2,86,2_sourness_sour_bitter_bitterness,"[sourness, sour, bitter, bitterness, unpleasan...","[Fruity, sour, light taste, Very sour and bitt..."
4,3,62,3_wine_wines_pleasant_drink,"[wine, wines, pleasant, drink, inexpensive, ni...","[Pleasant wine for the price, Nice wine with t..."
5,4,57,4_wine_winery_good_delicious,"[wine, winery, good, delicious, tasty, drink, ...","[A good wine!, Good wine, Good wine!!]"
6,5,51,5_cherry_fruity_tannins_flavor,"[cherry, fruity, tannins, flavor, sauvignon, a...","[Notes of red fruits, cassis, vanilla and choc..."


In [None]:
titles = [x[:100] for x in docs]
topics_to_visualize = range(20)

topic_model.visualize_documents(titles, reduced_embeddings=reduced_embeddings,
                                hide_annotations=True, hide_document_hover=False,
                                topics=topics_to_visualize,
                                #custom_labels=True
                                )

In [None]:
# set `calculate_probabilities` to True as it uses the topic probabilities of all topics
topic_model.visualize_distribution(probs[0])

## Save model

In [None]:
#name = 'tm_wine_reviews_cab6_emb1'

In [None]:
#name = 'tm_wine_reviews_cab6_emb2'

In [None]:
name = 'tm_wine_reviews_cab6_emb3'

In [None]:
name = 'tm_wine_reviews_cab6_emb4'

In [None]:
save_embedding_model = f"sentence-transformers/{st_id}" # works with 'BAAI/bge-small-en-v1.5'
#save_embedding_model = f"{st_id}"

save_embedding_model

'sentence-transformers/BAAI/bge-large-en-v1.5'

In [None]:
path = 'sample_data'
file_model = f'{path}/{name}'

# save_embedding_model: If serialization safetensors or pytorch,
# this variable can be used as a string pointing towards a huggingface model.
topic_model.save(file_model, serialization="safetensors", save_embedding_model=x, save_ctfidf=True)

In [None]:
import pickle

#Store sentences & embeddings on disc
file_attrs = f'{path}/{name}_attrs.pkl'

with open(file_attrs, "wb") as fOut:
    obj = {'reduced_embeddings': reduced_embeddings,
           'representative_docs_': topic_model.representative_docs_,
           'embedding model': f'{st_id}'
           }
    pickle.dump(obj, fOut, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
zfile = f'/content/drive/MyDrive/Colab\ Notebooks/{name}.zip'
!zip -r {zfile} {file_model} {file_attrs}

  adding: sample_data/tm_wine_reviews_cab6_emb4/ (stored 0%)
  adding: sample_data/tm_wine_reviews_cab6_emb4/ctfidf.safetensors (deflated 61%)
  adding: sample_data/tm_wine_reviews_cab6_emb4/config.json (deflated 41%)
  adding: sample_data/tm_wine_reviews_cab6_emb4/topic_embeddings.safetensors (deflated 7%)
  adding: sample_data/tm_wine_reviews_cab6_emb4/ctfidf_config.json (deflated 68%)
  adding: sample_data/tm_wine_reviews_cab6_emb4/topics.json (deflated 79%)
  adding: sample_data/tm_wine_reviews_cab6_emb4_attrs.pkl (deflated 27%)


## Load model

In [9]:
names = [
    'tm_wine_reviews_cab6_emb1',
    'tm_wine_reviews_cab6_emb2',
    'tm_wine_reviews_cab6_emb3',
    'tm_wine_reviews_cab6_emb4'
]
files_zip = []
files_src = []

for name in names:
    f1 = f'{name}.zip'
    f2 = f'/content/drive/MyDrive/Colab\ Notebooks/{f1}'
    files_zip.append(f1)
    files_src.append(f2)

In [10]:
f = ' '.join(files_src)
!cp {f} .

In [11]:
for x in files_zip:
    !unzip {x}

Archive:  tm_wine_reviews_cab6_emb1.zip
   creating: sample_data/tm_wine_reviews_cab6_emb1/
  inflating: sample_data/tm_wine_reviews_cab6_emb1/ctfidf.safetensors  
  inflating: sample_data/tm_wine_reviews_cab6_emb1/config.json  
  inflating: sample_data/tm_wine_reviews_cab6_emb1/topic_embeddings.safetensors  
  inflating: sample_data/tm_wine_reviews_cab6_emb1/ctfidf_config.json  
  inflating: sample_data/tm_wine_reviews_cab6_emb1/topics.json  
  inflating: sample_data/tm_wine_reviews_cab6_emb1_attrs.pkl  
Archive:  tm_wine_reviews_cab6_emb2.zip
   creating: sample_data/tm_wine_reviews_cab6_emb2/
  inflating: sample_data/tm_wine_reviews_cab6_emb2/ctfidf.safetensors  
  inflating: sample_data/tm_wine_reviews_cab6_emb2/config.json  
  inflating: sample_data/tm_wine_reviews_cab6_emb2/topic_embeddings.safetensors  
  inflating: sample_data/tm_wine_reviews_cab6_emb2/ctfidf_config.json  
  inflating: sample_data/tm_wine_reviews_cab6_emb2/topics.json  
  inflating: sample_data/tm_wine_reviews_

In [12]:
from bertopic import BERTopic
import pickle

def load_topic_model(name, path = 'sample_data', embedding_model=None):
    d = f'{path}/{name}'
    topic_model = BERTopic.load(d, embedding_model=embedding_model)

    #Load sentences & embeddings from disc
    f = f'{path}/{name}_attrs.pkl'

    with open(f, "rb") as fIn:
        obj = pickle.load(fIn)
        reduced_embeddings = obj['reduced_embeddings']
        topic_model.representative_docs_ = obj['representative_docs_']

    x = 'embedding model'
    print(f'{x}: {obj[x]}')

    return topic_model

In [13]:
name = names[0]
topic_model1 = load_topic_model(name)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

embedding model: all-MiniLM-L6-v2


In [14]:
name = names[1]
topic_model2 = load_topic_model(name)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/573 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

embedding model: all-MiniLM-L12-v2


In [15]:
name = names[2]

# why embedding warning? topic model saved with the embedding model
#topic_model3 = load_topic_model(name)

from sentence_transformers import SentenceTransformer

st_id = 'BAAI/bge-small-en-v1.5'
embedding_model = SentenceTransformer(st_id)

topic_model3 = load_topic_model(name, embedding_model=embedding_model)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/90.3k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]



embedding model: BAAI/bge-small-en-v1.5


In [None]:
topic_model3.topic_embeddings_

array([[-0.022467  , -0.01543293,  0.00584742, ...,  0.00524229,
         0.04450756,  0.00461061],
       [-0.00442707,  0.00759982, -0.00710111, ...,  0.02391023,
         0.07891507, -0.01256425],
       [-0.03369546, -0.02369783, -0.00794035, ...,  0.01522504,
         0.07557745, -0.00433885],
       ...,
       [ 0.00045844,  0.0172861 ,  0.00282718, ...,  0.00917665,
         0.05080281,  0.01437105],
       [-0.01263376, -0.02172365, -0.00599569, ..., -0.01279639,
         0.06703938, -0.00919181],
       [-0.0395014 ,  0.01956275,  0.0108313 , ...,  0.02969778,
         0.06021645, -0.01490256]], dtype=float32)

In [16]:
from sentence_transformers import SentenceTransformer

name = names[3]

st_id = 'BAAI/bge-large-en-v1.5'
embedding_model = SentenceTransformer(st_id)

topic_model4 = load_topic_model(name, embedding_model=embedding_model)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/90.3k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/779 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]



embedding model: BAAI/bge-large-en-v1.5


In [None]:
topic_model4.topic_embeddings_

array([[-0.01172725,  0.00986264,  0.01061895, ...,  0.0085314 ,
         0.00921902, -0.00166267],
       [-0.01109359, -0.00841544,  0.01118329, ...,  0.00483022,
        -0.00058128,  0.00060993],
       [-0.01999697,  0.00924525,  0.00422723, ...,  0.01834681,
         0.01604713,  0.00184325],
       ...,
       [ 0.0191648 ,  0.0126826 , -0.0019955 , ..., -0.0113982 ,
        -0.01499381,  0.01351074],
       [-0.01736942, -0.00709608,  0.01878775, ...,  0.01964875,
         0.0190496 , -0.00068658],
       [-0.01334336,  0.02530118,  0.00581131, ...,  0.00516371,
         0.02897627,  0.00105919]], dtype=float32)

# 🔄 Compare Models

## Custom utils

In [17]:
def print_with_line_feed(input_string, line_length=50):
    words = input_string.split()
    current_line_length = 0

    for word in words:
        if current_line_length + len(word) <= line_length:
            print(word, end=" ")
            current_line_length += len(word) + 1  # +1 for the space
        else:
            print()  # Start a new line
            print(word, end=" ")
            current_line_length = len(word) + 1

    print()  # Ensure the last line is printed

# Example usage
#your_string = "This is a sample string that you want to print with line feed every 50 characters. This string is just for demonstration purposes."
#print_with_line_feed(your_string)


def get_topic_stats(topic_model, extra_cols = []):
    topics_info_df = topic_model.get_topic_info().sort_values('Count', ascending = False)
    topics_info_df['Share'] = 100.*topics_info_df['Count']/topics_info_df['Count'].sum()
    topics_info_df['CumulativeShare'] = 100.*topics_info_df['Count'].cumsum()/topics_info_df['Count'].sum()
    return topics_info_df[['Topic', 'Count', 'Share', 'CumulativeShare', 'Name', 'Representation'] + extra_cols]

#extra_cols = ['Llama2', 'Llama2Desc']
#get_topic_stats(topic_model, extra_cols).head(10).set_index('Topic')

In [18]:
def print_topic(tm, tid=0, length=100, representative=True):
    """
    print Name, Count, Representation and Representative_Docs of a topic
    """
    df = tm.get_topic_info()
    x = df.loc[df.Topic==tid].to_dict('records')[0]
    print(f"{x['Name']} ({x['Count']})")
    print(', '.join(x['Representation']))

    if representative:
        #[print_with_line_feed(f'{i+1}) {s}', length) for i, s in enumerate(x['Representative_Docs'])]
        [print_with_line_feed(f'-. {s}', length) for s in x['Representative_Docs']]


def print_topics(tm, start=0, end=4, length=100, representative=True):
    for i in range(start, end+1):
        print_topic(tm, i, length=length, representative=representative)
        print()


def print_topic_docs(tid, docs, tm, num=5, length=100, return_docs=False, is_print_topic=True):
    if is_print_topic:
        print_topic(tm, tid, representative=False)
    docs_tid = [docs[i] for i, x in enumerate(tm.topics_) if x==tid]
    [print_with_line_feed(f'-. {s}', length) for s in docs_tid[:num]]

    if return_docs:
        return docs_tid


def find_topics(search, num_docs=10, models=None, names=None, highest_similarity = True):
    """
    Find topics most similar to a 'search' for each topic model in 'models'
    names: names of models
    """
    for n, tm in zip(names, models):
        t, s = tm.find_topics(search)

        if highest_similarity: # get topid id of highest similarity
            tid = t[s.index(max(s))]
        else: # get smallest topic id
            tid = t[t.index(min(t))]

        print(f'{n}:')
        print_topic_docs(tid, docs, tm, num=num_docs, length=100)
        print()

#### Utils for model comparison

In [19]:
import random

def sample_docs_similar_topic(models_docs, num_docs=5, seed=12):
    random.seed(seed)
    return [random.sample(x, num_docs) for x in models_docs]


def print_classification_result(models_docs, models_answers, length=80):
    for i in range(len(models_answers)):
        print(f'topic model {i+1}')
        for a, d in zip(models_answers[i], models_docs_[i]):
            print_with_line_feed(f'{a}: {d}', length)
        print()


def get_classification_result(models_docs_, func, *args, **kwargs):

    models_answers = list()

    for mid, tdocs in tqdm(enumerate(models_docs_), position=0, leave=False):
        tmp_list = list()
        for d in tqdm(tdocs, position=0, leave=True):
            out = func(d, *args, **kwargs)
            tmp_list.append(out)
        models_answers.append(tmp_list)

    return models_answers


#models_answers = get_classification_result(models_docs_, zs_classifier, ['sour'], print_result=False)

## Check models

### Model 1

In [None]:
tm = topic_model1
print_topics(tm, end=10, representative=False)
#print_topics(tm)

0_fruity_fruit_fruits_strawberry (80)
fruity, fruit, fruits, strawberry, berries, sweetness, flavor, berry, delicious, tasty

1_wines_wine_flavors_berries (79)
wines, wine, flavors, berries, cherry, blackberries, oak, flavor, alcohol, fruit

2_fruits_fruit_strawberry_blueberries (52)
fruits, fruit, strawberry, blueberries, sweetness, fruity, red, chocolate, aromas, ripe

3_blackberry_blackcurrant_blackcurrants_cherry (51)
blackberry, blackcurrant, blackcurrants, cherry, flavors, vanilla, oak, pepper, lime, ripe

4_chilean_chile_cabernet_wines (50)
chilean, chile, cabernet, wines, wine, grape, mendoza, vivino, eucalyptus, brasília

5_wines_wine_value_quality (45)
wines, wine, value, quality, inexpensive, pleasant, drink, taste, money, price

6_wine_booze_everyday_cola (45)
wine, booze, everyday, cola, daily, cheap, tomato, tasty, blood, health

7_sauvignon_wines_cabernet_wine (44)
sauvignon, wines, cabernet, wine, grape, grapes, noir, cab, currants, tannins

8_sour_sourness_tasted_bitte

In [None]:
tid = 0
num_docs = 10

print_topic_docs(tid, docs, tm, num_docs)

0_fruity_fruit_fruits_strawberry (80)
fruity, fruit, fruits, strawberry, berries, sweetness, flavor, berry, delicious, tasty
-. Very good, goes very well with red meat and pasta... 
-. Globe score 88; crowd pleasing style enjoyable fruity character soft creamy texture $17 
-. Full-bodied and has acidity. Meaty enough to chew! 😀 
-. Cruise to Alaska, my son brought this one. Good but a bit too spicy for my taste 
-. Always a winner, smooth and nice taste of berries - good price! 
-. Acceptable, medium body, soft with wood tones and red fruits 
-. Light, woody 
-. Surprisingly good with spicy food. 
-. Delicious in pasta sauce/stew meat! Mild in taste. 
-. Fruity. Very good price for performance 


### Model 2

In [None]:
tm = topic_model2
print_topics(tm, end=10, representative=False)
#print_topics(tm)

In [None]:
#tid = 1 # makes sense even thou some positive reviews exists
tid = 2 # contradictary desctiption but most of reviews seem positive

num_docs = 20

print_topic_docs(tid, docs, tm, num_docs)

2_bad_very_good_best (51)
bad, very, good, best, awful, nice, marvelous, excellent, amazing, lousy
-. Aight 
-. So damn cute 
-. very bad 
-. Same review 
-. Strong at first but then good. 
-. I really like 
-. Strong but ok 
-. It's good 
-. It’s in my blood - I love it 
-. Very strong 
-. I adore!!!! 
-. Super always liked it 
-. Stable 
-. I honestly think it's amazing 
-. I think it's my favorite 
-. Normal 
-. I love 
-. Not bad 
-. average 
-. Love it 


In [None]:
# avf length of doc of a topic
import numpy as np

for tid in range(10):
    x = print_topic_docs(tid, docs, tm, 0, return_docs=True, is_print_topic=False)
    z = np.mean([len(y) for y in x])
    w = np.median([len(y) for y in x])
    print(f'Topic_{tid}: {z:.0f}, {w:.0f}')


Topic_0: 106, 80
Topic_1: 65, 58
Topic_2: 19, 12
Topic_3: 64, 50
Topic_4: 32, 25
Topic_5: 30, 23
Topic_6: 40, 35
Topic_7: 163, 125
Topic_8: 7, 5
Topic_9: 78, 69


In [None]:
tid = 8
num_docs = 20

print_topic_docs(tid, docs, tm, num_docs)

8_great_excellent_fantastic_good (30)
great, excellent, fantastic, good, perfect, nice, sweet, pleasant, so, shit
-. Great ! 
-. good 
-. great to get through a Monday evening. 
-. Great 
-. Nice 
-. Perfect. Grades 
-. I feel good 
-. Good 
-. Good shit 
-. Great 
-. Sweet 
-. good 
-. So so 
-. Excellent 
-. Great 
-. Good 
-. Great 
-. Excellent 
-. Good for the day 
-. Good 


### Model 3

In [None]:
tm = topic_model3
print_topics(tm, end=10, representative=False)
#print_topics(tm)

In [None]:
#tid = 1
tid = 3
num_docs = 10

print_topic_docs(tid, docs, tm, num_docs)

3_sour_sourness_tastes_tasted (73)
sour, sourness, tastes, tasted, bitter, taste, unpleasant, flavour, flavor, bitterness
-. didn’t enjoy, unpleasant in the mouth. tart and a hint of spice, yet somehow boring. 
-. Getting drunk and drinking red wine❤️‍🔥 
-. I bought it cheaply at a convenience store, but the taste is not good.. You shouldn't buy just 
anything. 
-. Did not like. Bitter, tart, watery. I don't recommend it. Well, I don’t really want to throw it 
away, I can drink it. But the pleasure is about zero. 
-. Medium body, slightly spicy flavor, not sweet, but a sweet flavor comes up when swallowed, a 
bitter taste lingers after swallowing. 
-. Quite sour honestly. Ngl not even gonna cap frfr 💀💀💀💯💯🥶🥶 @ 
-. High acids, without the presence of tannins. Bitterness Impossible to taste It seemed spoiled. I 
will review it in another bottle 
-. Very sour and bitter, not delicious 
-. Not my fave taste! 
-. overpriced, despite being tasty, I expected more. 


### Model 4

In [None]:
tm = topic_model4
print_topics(tm, end=10, representative=False)
#print_topics(tm)

In [None]:
tid = 2
num_docs = 10

print_topic_docs(tid, docs, tm, num_docs)

2_sourness_sour_bitter_bitterness (86)
sourness, sour, bitter, bitterness, unpleasant, taste, flavor, bland, fruity, tastes
-. didn’t enjoy, unpleasant in the mouth. tart and a hint of spice, yet somehow boring. 
-. I bought it cheaply at a convenience store, but the taste is not good.. You shouldn't buy just 
anything. 
-. Cruise to Alaska, my son brought this one. Good but a bit too spicy for my taste 
-. It was my first time drinking Casillero del diablo. It was a little sour at first, but after a 
few days the flavor opened up and got a little better. 
-. Let it breathe for 25 minutes so that it is not so alcoholic in taste. 
-. Did not like. Bitter, tart, watery. I don't recommend it. Well, I don’t really want to throw it 
away, I can drink it. But the pleasure is about zero. 
-. Too strong alcohol scent, astringent, but good scent 
-. Medium body, slightly spicy flavor, not sweet, but a sweet flavor comes up when swallowed, a 
bitter taste lingers after swallowing. 
-. Cat urine 

## Topic Comparison

##### Search simliar topics

In [None]:
models = [topic_model1, topic_model2, topic_model3, topic_model4]
models_name = ['topic_model1', 'topic_model2', 'topic_model3', 'topic_model4']

def find_topics2(search, num_docs=10, models=models, names=models_name, highest_similarity = True):
    return find_topics(search, num_docs=num_docs, models=models, names=models_name, highest_similarity = highest_similarity)

##### Case: sour

In [None]:
search = 'sour'
find_topics2(search, 5)

topic_model1:
8_sour_sourness_tasted_bitter (38)
sour, sourness, tasted, bitter, spicy, flavor, bitterness, taste, delicious, unpleasant
-. didn’t enjoy, unpleasant in the mouth. tart and a hint of spice, yet somehow boring. 
-. Did not like. Bitter, tart, watery. I don't recommend it. Well, I don’t really want to throw it 
away, I can drink it. But the pleasure is about zero. 
-. Medium body, slightly spicy flavor, not sweet, but a sweet flavor comes up when swallowed, a 
bitter taste lingers after swallowing. 
-. Quite sour honestly. Ngl not even gonna cap frfr 💀💀💀💯💯🥶🥶 @ 
-. Tasted extra good from a social security block on camping 

topic_model2:
1_sour_flavour_bitter_taste (58)
sour, flavour, bitter, taste, flavor, tasted, fruit, bitterness, cherries, spicy
-. didn’t enjoy, unpleasant in the mouth. tart and a hint of spice, yet somehow boring. 
-. Salty yet savory. Plum, light red cherries. Good everyday drinker for the price Layne Yancey 
-. Light, smooth and soft flavor 
-. Did n

In [None]:
models_topic = [8,1,3,2]
models_docs = []

for i, tid in enumerate(models_topic):
    tm = models[i]
    d = print_topic_docs(tid, docs, tm, num=0, length=100, return_docs=True)
    models_docs.append(d)

8_sour_sourness_tasted_bitter (38)
sour, sourness, tasted, bitter, spicy, flavor, bitterness, taste, delicious, unpleasant
1_sour_flavour_bitter_taste (58)
sour, flavour, bitter, taste, flavor, tasted, fruit, bitterness, cherries, spicy
3_sour_sourness_tastes_tasted (73)
sour, sourness, tastes, tasted, bitter, taste, unpleasant, flavour, flavor, bitterness
2_sourness_sour_bitter_bitterness (86)
sourness, sour, bitter, bitterness, unpleasant, taste, flavor, bland, fruity, tastes


###### Save

In [None]:
file = f'models_docs_{search}'
file_pkl = f'{file}.pkl'
file_zip = f'{file}.zip'

path = 'sample_data'
path_src = '/content/drive/MyDrive/Colab\ Notebooks/'

In [None]:
import pickle

f = f'{path}/{file_pkl}'
with open(f, "wb") as fOut:
    obj = {
        'models_name': models_name,
        'models_topic': models_topic,
        'search term': search,
        'models_docs': models_docs,
    }
    pickle.dump(obj, fOut, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
!zip -j {file_zip} {path}/{file_pkl}

updating: models_docs_sour.pkl (deflated 50%)


In [None]:
!cp {file_zip} {path_src}

###### Load

In [None]:
search = 'sour'
file = f'models_docs_{search}'

file_pkl = f'{file}.pkl'
file_zip = f'{file}.zip'

path = 'sample_data'
path_src = '/content/drive/MyDrive/Colab\ Notebooks/'

In [None]:
!cp {path_src}/{file_zip} .

In [None]:
!unzip {file_zip} -d {path}

Archive:  models_docs_sour.zip
  inflating: sample_data/models_docs_sour.pkl  


In [None]:
import pickle

f = f'{path}/{file_pkl}'

with open(f, "rb") as fIn:
    obj = pickle.load(fIn)
    models_topic = obj['models_topic']
    models_docs = obj['models_docs']
    search = obj['search term']

search

'sour'

In [None]:
mid = 0
n = 10 #9999
#_ = [print(x) for x in models_docs[mid][:n]]
_ = [print_with_line_feed(f'-. {x}', 80) for x in models_docs[mid][:n]]

-. didn’t enjoy, unpleasant in the mouth. tart and a hint of spice, yet somehow 
boring. 
-. Did not like. Bitter, tart, watery. I don't recommend it. Well, I don’t 
really want to throw it away, I can drink it. But the pleasure is about zero. 
-. Medium body, slightly spicy flavor, not sweet, but a sweet flavor comes up 
when swallowed, a bitter taste lingers after swallowing. 
-. Quite sour honestly. Ngl not even gonna cap frfr 💀💀💀💯💯🥶🥶 @ 
-. Tasted extra good from a social security block on camping 
-. Very sour and bitter, not delicious 
-. Sour, but not too bitter or dry. Would want to try it again some time 
-. Masitttang, moderately bitter and heavy 
-. Tart, dense, without bitterness and without acid 
-. Tasty, without bitterness 


In [None]:
models_topic

[8, 1, 3, 2]

##### Case: value

In [None]:
search = 'value'
find_topics2(search, 5)

topic_model1:
15_value_money_profitably_worths (24)
value, money, profitably, worths, dollar, investment, spent, store, very, good
-. Good taste and good value for money 
-. Excellent value for money 
-. Good value for money 
-. Good value for money, quite powerful 
-. It has a reputation, but it could be better 

topic_model2:
4_value_worths_price_money (46)
value, worths, price, money, dollar, decent, sale, quality, profitably, pay
-. Good taste and good value for money 
-. Excellent value for money 
-. Good value for money 
-. Well, somewhat okay in this price range. But I doubt to get this one again 
-. Good value for money, quite powerful 

topic_model3:
15_normal_basic_sweat_old (17)
normal, basic, sweat, old, average, simple, medium, basicão, day, pts
-. SPEEDBLIC<3 : 10 4 4 4 4 
-. in balans 
-. Basic 
-. Normal 
-. Sweat and normal 

topic_model4:
7_bueno_good_happy_deliciaaaa (38)
bueno, good, happy, deliciaaaa, back, today, rip, ze, oui, live
-. SPEEDBLIC<3 : 10 4 4 4 4 
-. 

In [None]:
search = 'value'
find_topics2(search, 5, highest_similarity = False)

topic_model1:
5_wines_wine_value_quality (45)
wines, wine, value, quality, inexpensive, pleasant, drink, taste, money, price
-. Definitely one of the best wines I tried in the lower price range. Rarely saw such a well 
balanced combination of intensity and fruit for such an amount of money. 
-. Good value for money! The first wine of the East Sea Caravan. The sugar content was low and there 
was a bit of acidity at the end. 
-. Nice good value tesco wine 
-. Really edible for this price. Taste like young wines like Crianzas of Rioja. 
-. Great wine, balanced and great value for money 

topic_model2:
-1_flavor_strawberry_fruity_tannins (378)
flavor, strawberry, fruity, tannins, taste, berries, tannin, fruit, astringent, cherry
-. Little too cherry on the front end for me 
-. Globe score 88; crowd pleasing style enjoyable fruity character soft creamy texture $17 
-. Notes of oak, chocolate, black fruit and cranberries. Empty on the palate; a complete lack of 
concentration, complexity, s

[['reliable', 0.6666172742843628],
 ['good', 0.6356230974197388],
 ['value', 0.6239932775497437],
 ['nice', 0.6180570125579834],
 ['excellent', 0.6147314310073853],
 ['convenience', 0.6016929149627686],
 ['bought', 0.5844535827636719],
 ['performance', 0.5840137004852295],
 ['quality', 0.5834070444107056],
 ['great', 0.5729191899299622]]

### 🦙 **Llama 2**

Now comes one of the more interesting components of this tutorial, how to load in a Llama 2 model on a T4-GPU!

We will be focusing on the `'meta-llama/Llama-2-7b-chat-hf'` variant. It is large enough to give interesting and useful results whilst small enough that it can be run on our environment.


#### 🤗 HuggingFace Hub Credentials
Before we can load in Llama2 using a number of tricks, we will first need to accept the License for using Llama2. The steps are as follows:


* Create a HuggingFace account [here](https://huggingface.co)
* Apply for Llama 2 access [here](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf)
* Get your HuggingFace token [here](https://huggingface.co/settings/tokens)

After doing so, we can login with our HuggingFace credentials so that this environment knows we have permission to download the Llama 2 model that we are interested in.

In [None]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…


We start by defining our model and identifying if our GPU is correctly selected. We expect the output of `device` to show a cuda device:

In [None]:
from torch import cuda

device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

print(device)

cpu


#### **Optimization & Quantizatio (generator)**

In order to load our 13 billion parameter model, we will need to perform some optimization tricks. Since we have limited VRAM and not an A100 GPU, we will need to "condense" the model a bit so that we can run it.

There are a number of tricks that we can use but the main principle is going to be 4-bit quantization.

This process reduces the 64-bit representation to only 4-bits which reduces the GPU memory that we will need. It is a recent technique and quite an elegant at that for efficient LLM loading and usage. You can find more about that method [here](https://arxiv.org/pdf/2305.14314.pdf) in the QLoRA paper and on the amazing HuggingFace blog [here](https://huggingface.co/blog/4bit-transformers-bitsandbytes).

In [None]:
import transformers
#model_id = 'meta-llama/Llama-2-7b-chat-hf'
model_id = 'meta-llama/Llama-2-13b-chat-hf'

In [None]:
from torch import bfloat16

# set quantization configuration to load large model with less GPU memory
# this requires the `bitsandbytes` library

bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,  # 4-bit quantization
    bnb_4bit_quant_type='nf4',  # Normalized float 4
    bnb_4bit_use_double_quant=True,  # Second quantization after the first
    bnb_4bit_compute_dtype=bfloat16  # Computation type
)

quantization_config = bnb_config

In [None]:
# for cpu runtime
#quantization_config = None

These four parameters that we just run are incredibly important and bring many LLM applications to consumers:
* `load_in_4bit`
  * Allows us to load the model in 4-bit precision compared to the original 32-bit precision
  * This gives us an incredibly speed up and reduces memory!
* `bnb_4bit_quant_type`
  * This is the type of 4-bit precision. The paper recommends normalized float 4-bit, so that is what we are going to use!
* `bnb_4bit_use_double_quant`
  * This is a neat trick as it perform a second quantization after the first which further reduces the necessary bits
* `bnb_4bit_compute_dtype`
  * The compute type used during computation, which further speeds up the model.



Using this configuration, we can start loading in the model as well as the tokenizer:

In [None]:
# restart session if ImportError

# Llama 2 Tokenizer
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)

# Llama 2 Model
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    quantization_config=quantization_config,
    device_map='auto',
)
model.eval()

ImportError: Using `load_in_8bit=True` requires Accelerate: `pip install accelerate` and the latest version of bitsandbytes `pip install -i https://test.pypi.org/simple/ bitsandbytes` or pip install bitsandbytes` 

Using the model and tokenizer, we will generate a HuggingFace transformers pipeline that allows us to easily generate new text:

In [None]:
# Our text generator
generator = transformers.pipeline(
    model=model, tokenizer=tokenizer,
    task='text-generation',
    return_full_text=False,
    temperature=0.1,
    max_new_tokens=500, # the number of tokens the model shall generate
    #repetition_penalty=1.1
    repetition_penalty=1.2
)

#### **Prompt Engineering**

To check whether our model is correctly loaded, let's try it out with a few prompts.

In [None]:
#_ = [print(f'{i}: {x}') for i, x in enumerate(models_docs[0])]

i = 22
models_docs[0][i]

'Very sweet really loved how sweet it was with just hints of sour.Very delicious with snacks and red meat'

In [None]:
# A: , would definitely recommend!
#prompt_q = "Would you describe the wine in the following review as having a sour taste or a taste similar to sour?\n"
#prompt_r = "Very sweet really loved how sweet it was with just hints of sour.Very delicious with snacks and red meat"

# A: . (Tasting Notes)
#prompt_q = "Would you describe the wine in the following review as having a sour taste or a taste similar to sour?\n"
#prompt_r = "Review: Very sweet really loved how sweet it was with just hints of sour.Very delicious with snacks and red meat"

# A: .
#prompt_q = "The following review is about a wine. Would you describe the wine as having a sour taste or a taste similar to sour?\n"
#prompt_r = "Review: Very sweet really loved how sweet it was with just hints of sour.Very delicious with snacks and red meat"

# A: Based on the review, I would describe the wine as having a slightly sour taste, but not overly so.
# The reviewer mentions that the wine is "very sweet" and has only "hints of sour," indicating that the sourness is relatively mild.
#prompt_q = "[INST]The following review is about a wine. Would you describe the wine as having a sour taste or a taste similar to sour?\n"
#prompt_r = "Review: Very sweet really loved how sweet it was with just hints of sour.Very delicious with snacks and red meat[/INST]"

# A: Based on the given review, I would describe the wine as having a neutral taste between sour and not sour.
# While the reviewer mentions that the wine has "hints of sour," they also use language like "very sweet" and "delicious."
# This suggests that the wine's overall flavor profile leans towards being sweet rather than sour.
# Therefore, I would classify this wine as having a neutral taste between sour and not sour.
prompt_q = """<<SYS>>
    The following review is about a wine. Would you describe the wine as having a sour taste or a taste similar to sour?.\n
    Return your answer as : Positive, Negative, or Neutral
<</SYS>>"""
prompt_r = """[INST]
    Review: Very sweet really loved how sweet it was with just hints of sour.Very delicious with snacks and red meat
[/INST]"""

# Positive. The reviewer describes the wine as "very sweet" with only "hints of sour,"
# indicating that the sourness is subtle and balanced by the overall sweetness of the wine.
# This suggests that the wine has a pleasant flavor profile with a balance of sweet and tart notes.
prompt_q = """<s>[INST]<<SYS>>
    You are a respectful assistant analyzing a wine review. Your response should be positive, negative, or neutral.\n
<</SYS>>"""
prompt_r = """
    Would you describe the wine as having a sour taste or a taste similar to sour?.\n
    Review: Very sour and bitter, not delicious\n
[/INST] positive"""
prompt_r = prompt_r + """[INST]
    Would you describe the wine as having a sour taste or a taste similar to sour?.\n
    Review: Very sweet really loved how sweet it was with just hints of sour.Very delicious with snacks and red meat
[/INST]"""


# positive
# => just took 26 sec
prompt_q = """<s>[INST]<<SYS>>
    You are a respectful assistant analyzing a wine review. Your response must be either positive, negative, or neutral.
    No explanations necessary.
<</SYS>>"""
prompt_r = """
    Would you describe the wine as having a sour taste or a taste similar to sour?.
    Review: Very sour and bitter, not delicious\n
[/INST] positive"""
prompt_r = prompt_r + """[INST]
    Would you describe the wine as having a sour taste or a taste similar to sour?.
    Review: Very sweet really loved how sweet it was with just hints of sour.Very delicious with snacks and red meat
[/INST]"""

prompt = prompt_q + prompt_r

res = generator(prompt)
print(res[0]["generated_text"])

  Positive


#### Test

In [None]:
models_docs_ = sample_docs_similar_topic(models_docs)
models_docs_

[['Bitter. Light body',
  'Light sour',
  "Not rich, light sourness. I wouldn't take it a second time",
  'Very sweet really loved how sweet it was with just hints of sour.Very delicious with snacks and red meat',
  'Tasty, without bitterness'],
 ['Pleasant flavor and texture. A little simple but suitable for a night of tapas.',
  'didn’t enjoy, unpleasant in the mouth. tart and a hint of spice, yet somehow boring.',
  'Light sour',
  'Slightly sour, good with salty things. Less good with cheeses.',
  "Thin, breezy, a taste correspondent with it's price"],
 ['Smells disgusting. Tastes like that.',
  'Not to my taste at all. Experienced it as bitter and with too much alcohol taste',
  'Astringent and not very good',
  'didn’t enjoy, unpleasant in the mouth. tart and a hint of spice, yet somehow boring.',
  'Good ALC of 13.5%, but really not a fan of the flavor. Will do in a pinch, but would pick something else to enjoy.'],
 ['Doesn’t smell good but ok to drink',
  'Slightly sour, good w

##### Prompt 1

In [None]:
prompt_q = """<s>[INST]<<SYS>>
    You are a respectful assistant analyzing a wine review. Your response must be either positive, negative, or neutral.
    No explanations necessary.
<</SYS>>"""
prompt_r = """
    Would you describe the wine as having a sour taste or a taste similar to sour?.
    Review: Very sour and bitter, not delicious\n
[/INST] positive"""
prompt_r = prompt_r + """[INST]
    Would you describe the wine as having a sour taste or a taste similar to sour?.
    Review: %s
[/INST]"""

prompt_q = prompt_q + prompt_r

In [None]:
models_answers = list()

for mid, tdocs in tqdm(enumerate(models_docs_), position=0, leave=False):
    tmp_list = list()
    for d in tqdm(tdocs, position=1, leave=True):
    #for d in tdocs:
        prompt = prompt_q % d
        res = generator(prompt)
        a = res[0]["generated_text"]
        tmp_list.append(a)
    models_answers.append(tmp_list)

In [None]:
# use util: print_classification_result(models_docs_, models_answers)

for i in range(len(models_answers)):
    print(f'topic model {i+1}')
    for a, d in zip(models_answers[i], models_docs_[i]):
        print_with_line_feed(f'{a}: {d}', 80)
    print()

topic model 1
negative: Bitter. Light body 
neutral: Light sour 
negative: Not rich, light sourness. I wouldn't take it a second time 

topic model 2
Positive: Guarantee of a good experience Light acidity Pleasant blend of flavor 
Positive: Balanced intensity. Rich aroma. Notes of fruit and chocolate. Pairs 
well with red meats and cheeses. 
positive: Sour, but not too bitter or dry. Would want to try it again some time 

topic model 3
neutral: Not my taste 
neutral: Getting drunk and drinking red wine❤️‍🔥 
negative: Lots of flavour, a bit sour, little aftertaste 

topic model 4
negative: The taste is too spicy 
negative: Stuffy aroma. Järnlukt. Surt 
neutral: Good aroma. But a rather thin body. Also slightly sweet taste. 



##### Prompt 2

In [None]:
prompt_q = """<s>[INST]<<SYS>>
    You are a respectful assistant analyzing a wine review. Your response must be either yes, no, or not sure.
    No explanations necessary.
<</SYS>>"""
prompt_r = """
    Would you describe the wine as having a sour taste or a taste similar to sour?.
    Review: Very sour and bitter, not delicious\n
[/INST] yes"""
prompt_r = prompt_r + """[INST]
    Would you describe the wine as having a sour taste or a taste similar to sour?.
    Review: %s
[/INST]"""

prompt_q = prompt_q + prompt_r


models_answers = list()
for mid, tdocs in tqdm(enumerate(models_docs_), position=0, leave=False):
    tmp_list = list()
    for d in tqdm(tdocs, position=0, leave=True):
    #for d in tdocs:
        prompt = prompt_q % d
        res = generator(prompt)
        a = res[0]["generated_text"]
        tmp_list.append(a)
    models_answers.append(tmp_list)


for i in range(len(models_answers)):
    print(f'topic model {i+1}')
    for a, d in zip(models_answers[i], models_docs_[i]):
        print_with_line_feed(f'{a}: {d}', 80)
    print()

100%|██████████| 3/3 [02:58<00:00, 59.53s/it]
100%|██████████| 3/3 [03:23<00:00, 67.71s/it]
100%|██████████| 3/3 [03:29<00:00, 69.82s/it]
100%|██████████| 3/3 [03:36<00:00, 72.20s/it]
                       

topic model 1
no: Bitter. Light body 
yes: Light sour 
no: Not rich, light sourness. I wouldn't take it a second time 

topic model 2
Not sure: Guarantee of a good experience Light acidity Pleasant blend of flavor 
no: Balanced intensity. Rich aroma. Notes of fruit and chocolate. Pairs well 
with red meats and cheeses. 
yes: Sour, but not too bitter or dry. Would want to try it again some time 

topic model 3
no: Not my taste 
Not sure: Getting drunk and drinking red wine❤️‍🔥 
no: Lots of flavour, a bit sour, little aftertaste 

topic model 4
no: The taste is too spicy 
Not sure: Stuffy aroma. Järnlukt. Surt 
not sure: Good aroma. But a rather thin body. Also slightly sweet taste. 





##### Prompt 3: Sentiment question

In [None]:
prompt_q = """<s>[INST]<<SYS>>
    You are a respectful assistant for sentiment analysis of wine reviews. Your response must be either positive, negative, or neutral.
    No explanations necessary.
<</SYS>>"""
prompt_r = """
    What is the sentiment conveyed in the review?
    Review: Very sour and bitter, not delicious\n
[/INST] negative"""
prompt_r = prompt_r + """[INST]
    What is the sentiment conveyed in the review?
    Review: %s
[/INST]"""

prompt_q = prompt_q + prompt_r


models_answers = list()
for mid, tdocs in tqdm(enumerate(models_docs_), position=0, leave=False):
    tmp_list = list()
    for d in tqdm(tdocs, position=0, leave=True):
    #for d in tdocs:
        prompt = prompt_q % d
        res = generator(prompt)
        a = res[0]["generated_text"]
        tmp_list.append(a)
    models_answers.append(tmp_list)


for i in range(len(models_answers)):
    print(f'topic model {i+1}')
    for a, d in zip(models_answers[i], models_docs_[i]):
        print_with_line_feed(f'{a}: {d}', 80)
    print()

100%|██████████| 3/3 [03:09<00:00, 63.06s/it]
100%|██████████| 3/3 [02:22<00:00, 47.64s/it]
100%|██████████| 3/3 [03:21<00:00, 67.11s/it]
100%|██████████| 3/3 [02:21<00:00, 47.23s/it]
                       

topic model 1
negative: Bitter. Light body 
neutral: Light sour 
negative: Not rich, light sourness. I wouldn't take it a second time 

topic model 2
positive: Guarantee of a good experience Light acidity Pleasant blend of flavor 
positive: Balanced intensity. Rich aroma. Notes of fruit and chocolate. Pairs 
well with red meats and cheeses. 
neutral: Sour, but not too bitter or dry. Would want to try it again some time 

topic model 3
neutral: Not my taste 
Positive: Getting drunk and drinking red wine❤️‍🔥 
neutral: Lots of flavour, a bit sour, little aftertaste 

topic model 4
negative: The taste is too spicy 
negative: Stuffy aroma. Järnlukt. Surt 
neutral: Good aroma. But a rather thin body. Also slightly sweet taste. 





### 🎯 Zero-shot classifincation

In [20]:
# see: https://huggingface.co/facebook/bart-large-mnli
model_id = "facebook/bart-large-mnli"

In [None]:
from transformers import pipeline

def zs_classifier(sequence, labels, print_result=True, multi_label=False):
    pipe = pipeline(model=model_id)
    res = pipe(sequence, candidate_labels=labels, multi_label=multi_label)
    sco = res['scores']

    out = [f'{l}: {sco[i]:.3f}' for i, l in enumerate(labels)]
    out = ', '.join(out)
    if print_result:
        print(out)
    return out

#_ = zs_classifier('Bitter. Light body', ['sour', 'bitter'])

In [25]:
import re

reg = r':\s*(\d+(?:\.\d+)?)'

def get_scores(tm, tid, docs=docs):
    """
    get score list of each word in a topic
    by zero-shot classification throu all the documents of the topic
    """
    tdocs = print_topic_docs(tid, docs, tm, 0, return_docs=True)
    labels = [x[0] for x in tm.get_topic(tid)]

    # calc socres of topic words for each doc
    scores = []
    for d in tqdm(tdocs):
        a = zs_classifier(d, labels, print_result=False, multi_label=True)
        scores.append(a)

    scores_f = [] # take number and convert to float
    for ss in scores:
        m = re.findall(reg, ss)
        m = [float(x) for x in m]
        scores_f.append(m)

    scores_topic = dict() # scores for each word in a topic
    for i, l in enumerate(labels):
        s = [ds[i] for ds in scores_f]
        scores_topic[l] = s

    return scores_topic

In [None]:
# testing
_ = zs_classifier('Bitter. Light body', ['sour', 'bitter'], multi_label=True)

sour: 0.979, bitter: 0.806


#### Test 1 with small data

In [None]:
search

'sour'

In [None]:
models_docs_ = sample_docs_similar_topic(models_docs, 3)
models_docs_

[['Bitter. Light body',
  'Light sour',
  "Not rich, light sourness. I wouldn't take it a second time"],
 ['Guarantee of a good experience Light acidity Pleasant blend of flavor',
  'Balanced intensity. Rich aroma. Notes of fruit and chocolate. Pairs well with red meats and cheeses.',
  'Sour, but not too bitter or dry. Would want to try it again some time'],
 ['Not my taste',
  'Getting drunk and drinking red wine❤️\u200d🔥',
  'Lots of flavour, a bit sour, little aftertaste'],
 ['The taste is too spicy',
  'Stuffy aroma. Järnlukt. Surt',
  'Good aroma. But a rather thin body. Also slightly sweet taste.']]

In [None]:
models_answers = get_classification_result(models_docs_, zs_classifier, ['sour'], print_result=False)

100%|██████████| 3/3 [00:45<00:00, 15.12s/it]
100%|██████████| 3/3 [00:48<00:00, 16.19s/it]
100%|██████████| 3/3 [00:22<00:00,  7.39s/it]
100%|██████████| 3/3 [00:20<00:00,  6.76s/it]


In [None]:
print_classification_result(models_docs_, models_answers, length=80)

topic model 1
sour: 0.806: Bitter. Light body 
sour: 0.967: Light sour 
sour: 0.996: Not rich, light sourness. I wouldn't take it a second time 

topic model 2
sour: 0.044: Guarantee of a good experience Light acidity Pleasant blend of 
flavor 
sour: 0.000: Balanced intensity. Rich aroma. Notes of fruit and chocolate. Pairs 
well with red meats and cheeses. 
sour: 0.995: Sour, but not too bitter or dry. Would want to try it again some 
time 

topic model 3
sour: 0.989: Not my taste 
sour: 0.001: Getting drunk and drinking red wine❤️‍🔥 
sour: 0.978: Lots of flavour, a bit sour, little aftertaste 

topic model 4
sour: 0.000: The taste is too spicy 
sour: 0.858: Stuffy aroma. Järnlukt. Surt 
sour: 0.000: Good aroma. But a rather thin body. Also slightly sweet taste. 



#### Test 2 with topic on sourness

In [None]:
models = [topic_model1, topic_model2, topic_model3, topic_model4]
models_name = ['topic_model1', 'topic_model2', 'topic_model3', 'topic_model4']

def find_topics2(search, num_docs=10, models=models, names=models_name, highest_similarity = True):
    return find_topics(search, num_docs=num_docs, models=models, names=models_name, highest_similarity = highest_similarity)

In [None]:
search = 'sour'
#find_topics2(search, 5)
find_topics2(search, 0)

topic_model1:
8_sour_sourness_tasted_bitter (38)
sour, sourness, tasted, bitter, spicy, flavor, bitterness, taste, delicious, unpleasant

topic_model2:
1_sour_flavour_bitter_taste (58)
sour, flavour, bitter, taste, flavor, tasted, fruit, bitterness, cherries, spicy

topic_model3:
3_sour_sourness_tastes_tasted (73)
sour, sourness, tastes, tasted, bitter, taste, unpleasant, flavour, flavor, bitterness

topic_model4:
2_sourness_sour_bitter_bitterness (86)
sourness, sour, bitter, bitterness, unpleasant, taste, flavor, bland, fruity, tastes



In [None]:
models_topic = [8, 1, 3, 2]

# to check models_topic correct
for i, tid in enumerate(models_topic):
    tm = models[i]
    print_topic(tm, tid, representative=False)

8_sour_sourness_tasted_bitter (38)
sour, sourness, tasted, bitter, spicy, flavor, bitterness, taste, delicious, unpleasant
1_sour_flavour_bitter_taste (58)
sour, flavour, bitter, taste, flavor, tasted, fruit, bitterness, cherries, spicy
3_sour_sourness_tastes_tasted (73)
sour, sourness, tastes, tasted, bitter, taste, unpleasant, flavour, flavor, bitterness
2_sourness_sour_bitter_bitterness (86)
sourness, sour, bitter, bitterness, unpleasant, taste, flavor, bland, fruity, tastes


In [None]:
scores = dict()
for i, tid in enumerate(models_topic):
    tm = models[i]
    s = get_scores(tm, tid, docs=docs)
    scores[i] = s

8_sour_sourness_tasted_bitter (38)
sour, sourness, tasted, bitter, spicy, flavor, bitterness, taste, delicious, unpleasant


100%|██████████| 38/38 [11:06<00:00, 17.55s/it]


1_sour_flavour_bitter_taste (58)
sour, flavour, bitter, taste, flavor, tasted, fruit, bitterness, cherries, spicy


100%|██████████| 58/58 [13:28<00:00, 13.94s/it]


3_sour_sourness_tastes_tasted (73)
sour, sourness, tastes, tasted, bitter, taste, unpleasant, flavour, flavor, bitterness


100%|██████████| 73/73 [16:20<00:00, 13.43s/it]


2_sourness_sour_bitter_bitterness (86)
sourness, sour, bitter, bitterness, unpleasant, taste, flavor, bland, fruity, tastes


100%|██████████| 86/86 [19:30<00:00, 13.61s/it]


In [None]:
import pandas as pd

x_range = 10

df_scores = pd.DataFrame()
for i, s in scores.items():
    df = (pd.DataFrame().from_dict(s)
            .stack().reset_index())
    df.columns = ['x', 'word', 'score']

    tm = models[i]
    tid = models_topic[i]
    w = tm.get_topic(tid) # rep. words in the topic
    # assign numbers for plotting and adjust the range to compare models in a fig
    m = {x[0]: i * x_range / len(w) for i, x in enumerate(w)}
    df['x'] = df.word.map(m)

    df['tm'] = i
    df_scores = pd.concat([df_scores, df])

In [None]:
#df_scores.loc[(df_scores.word=='flavor') & (df_scores.tm==3)].x.unique()
df_scores

Unnamed: 0,x,word,score,tm
0,0.0,sour,0.999,0
1,1.0,sourness,0.991,0
2,2.0,tasted,0.989,0
3,3.0,bitter,0.984,0
4,4.0,spicy,0.982,0
...,...,...,...,...
855,5.0,taste,0.994,3
856,6.0,flavor,0.988,3
857,7.0,bland,0.837,3
858,8.0,fruity,0.000,3


In [None]:
import plotly.express as px

fig = px.line(df_scores,
              x='x',
              y="score",
              color="tm",
              #line_group="word",
              hover_name="word",
              line_shape="spline",
              render_mode="svg",
              width=1000
              )
fig.show()

In [None]:
import plotly.express as px

fig = px.box(df_scores,
              x='x',
              y="score",
              color="tm",
              hover_name="word",
              #render_mode="svg",
              width=1000
              )
fig.show()

In [None]:
df = df_scores.groupby(by=['tm','x']).score.median().reset_index()

import plotly.express as px

fig = px.line(df,
              x='x',
              y="score",
              color="tm",
              line_shape="spline",
              render_mode="svg",
              width=1000
              )
fig.show()

#### Test 3 with top 10 topics with top 5 words
use same util in test 2

In [22]:
models = [topic_model1, topic_model2, topic_model3, topic_model4]
models_name = ['topic_model1', 'topic_model2', 'topic_model3', 'topic_model4']

def find_topics2(search, num_docs=10, models=models, names=models_name, highest_similarity = True):
    return find_topics(search, num_docs=num_docs, models=models, names=models_name, highest_similarity = highest_similarity)

In [26]:
num_topics = 10
tids = list(range(num_topics))

scores_tm = []
for i, tm in enumerate(models):
    print(f'Evaluating model {i} ...')
    scores_tid = dict()
    for tid in tids:
        s = get_scores(tm, tid, docs=docs)
        scores_tid[tid] = s
    scores_tm.append(scores_tid)
    print()

Evaluating model 0 ...
0_fruity_fruit_fruits_strawberry (80)
fruity, fruit, fruits, strawberry, berries, sweetness, flavor, berry, delicious, tasty


100%|██████████| 80/80 [17:03<00:00, 12.80s/it]


1_wines_wine_flavors_berries (79)
wines, wine, flavors, berries, cherry, blackberries, oak, flavor, alcohol, fruit


100%|██████████| 79/79 [20:02<00:00, 15.23s/it]


2_fruits_fruit_strawberry_blueberries (52)
fruits, fruit, strawberry, blueberries, sweetness, fruity, red, chocolate, aromas, ripe


100%|██████████| 52/52 [14:24<00:00, 16.62s/it]


3_blackberry_blackcurrant_blackcurrants_cherry (51)
blackberry, blackcurrant, blackcurrants, cherry, flavors, vanilla, oak, pepper, lime, ripe


100%|██████████| 51/51 [10:19<00:00, 12.14s/it]


4_chilean_chile_cabernet_wines (50)
chilean, chile, cabernet, wines, wine, grape, mendoza, vivino, eucalyptus, brasília


100%|██████████| 50/50 [12:44<00:00, 15.29s/it]


5_wines_wine_value_quality (45)
wines, wine, value, quality, inexpensive, pleasant, drink, taste, money, price


100%|██████████| 45/45 [10:26<00:00, 13.92s/it]


6_wine_booze_everyday_cola (45)
wine, booze, everyday, cola, daily, cheap, tomato, tasty, blood, health


100%|██████████| 45/45 [09:31<00:00, 12.69s/it]


7_sauvignon_wines_cabernet_wine (44)
sauvignon, wines, cabernet, wine, grape, grapes, noir, cab, currants, tannins


100%|██████████| 44/44 [12:45<00:00, 17.39s/it]


8_sour_sourness_tasted_bitter (38)
sour, sourness, tasted, bitter, spicy, flavor, bitterness, taste, delicious, unpleasant


100%|██████████| 38/38 [08:19<00:00, 13.14s/it]


9_bom_bolognes_ba_bueno (35)
bom, bolognes, ba, bueno, brie, buchko, , , , 


100%|██████████| 35/35 [06:48<00:00, 11.67s/it]



Evaluating model 1 ...
0_wines_wine_sauvignon_bottle (508)
wines, wine, sauvignon, bottle, aromas, alcohol, taste, acidity, flavor, cabernet


100%|██████████| 508/508 [2:03:56<00:00, 14.64s/it]


1_sour_flavour_bitter_taste (58)
sour, flavour, bitter, taste, flavor, tasted, fruit, bitterness, cherries, spicy


100%|██████████| 58/58 [12:52<00:00, 13.32s/it]


2_bad_very_good_best (51)
bad, very, good, best, awful, nice, marvelous, excellent, amazing, lousy


100%|██████████| 51/51 [09:53<00:00, 11.63s/it]


3_drink_drinkable_drinks_drinking (47)
drink, drinkable, drinks, drinking, drinker, alcoholic, cheap, easy, cup, simple


100%|██████████| 47/47 [10:09<00:00, 12.96s/it]


4_value_worths_price_money (46)
value, worths, price, money, dollar, decent, sale, quality, profitably, pay


100%|██████████| 46/46 [09:15<00:00, 12.08s/it]


5_oak_cherry_chocolate_blackcurrant (45)
oak, cherry, chocolate, blackcurrant, oaky, black, flavors, strawberry, vanilla, flavor


100%|██████████| 45/45 [08:55<00:00, 11.89s/it]


6_fruity_meaty_delicious_juicyyyyy (43)
fruity, meaty, delicious, juicyyyyy, flavor, punchy, catchy, light, salad, soft


100%|██████████| 43/43 [08:55<00:00, 12.45s/it]


7_ruby_strawberry_sweetness_fruit (37)
ruby, strawberry, sweetness, fruit, blueberries, fruity, flavor, fruits, strawberries, taste


100%|██████████| 37/37 [10:12<00:00, 16.57s/it]


8_great_excellent_fantastic_good (30)
great, excellent, fantastic, good, perfect, nice, sweet, pleasant, so, shit


100%|██████████| 30/30 [05:37<00:00, 11.24s/it]


9_blackberries_flavors_berries_strawberry (29)
blackberries, flavors, berries, strawberry, fruit, strawberries, chocolate, fruits, cherries, blueberry


100%|██████████| 29/29 [06:25<00:00, 13.30s/it]



Evaluating model 2 ...
0_sauvignon_cabernet_wine_wines (513)
sauvignon, cabernet, wine, wines, flavor, tannins, taste, drink, cherry, aromas


100%|██████████| 513/513 [2:07:16<00:00, 14.89s/it]


1_fruity_fruits_fruit_strawberry (94)
fruity, fruits, fruit, strawberry, delicious, berries, sweetness, cherries, sweet, tasty


  0%|          | 0/94 [00:05<?, ?it/s]


KeyboardInterrupt: 

In [32]:
len(scores_tm[1].keys())

10

In [36]:
file = 'scores_tm_tmp'
#file = 'scores_tm'

In [37]:

file_pkl = f'{file}.pkl'
file_zip = f'{file}.zip'

path = 'sample_data'
path_src = '/content/drive/MyDrive/Colab\ Notebooks/'

In [38]:
import pickle

f = f'{path}/{file_pkl}'
with open(f, "wb") as fOut:
    pickle.dump(scores_tm, fOut, protocol=pickle.HIGHEST_PROTOCOL)

In [39]:
!zip -j {file_zip} {path}/{file_pkl}

  adding: scores_tm_tmp.pkl (deflated 79%)


In [40]:
!cp {file_zip} {path_src}

###### Load

In [None]:
file = 'scores_tm'
file_pkl = f'{file}.pkl'
file_zip = f'{file}.zip'

path = 'sample_data'
path_src = '/content/drive/MyDrive/Colab\ Notebooks/'

In [None]:
!cp {path_src}/{file_zip} .

In [None]:
!unzip {file_zip} -d {path}

Archive:  models_docs_sour.zip
  inflating: sample_data/models_docs_sour.pkl  


In [None]:
import pickle

f = f'{path}/{file_pkl}'

with open(f, "rb") as fIn:
    scores_tm = pickle.load(fIn)

'sour'

In [33]:
#scores_tm_ = scores_tm
scores_tm_ = scores_tm

In [60]:
import pandas as pd

x_range = 10
df_scores = pd.DataFrame()

for mid, scores_tid in enumerate(scores_tm_):
    tm = models[mid]
    df_scores_tid = pd.DataFrame()
    for tid, scores in scores_tid.items():

        df = (pd.DataFrame().from_dict(scores)
                .stack().reset_index())
        df.columns = ['x', 'word', 'score']

        w = tm.get_topic(tid) # rep. words in the topic
        # assign numbers for plotting and adjust the range to compare models in a fig
        map_x = {x[0]: i * x_range / len(w) for i, x in enumerate(w)}
        df['x'] = df.word.map(map_x)

        df['tid'] = tid
        df_scores_tid = pd.concat([df_scores_tid, df])

        #break # testing

    df_scores_tid['tm'] = mid
    df_scores = pd.concat([df_scores, df_scores_tid])

    #break # testing

In [63]:
df_scores.head()

Unnamed: 0,x,word,score,tid,tm
0,0.0,fruity,0.999,0,0
1,1.0,fruit,0.998,0,0
2,2.0,fruits,0.985,0,0
3,3.0,strawberry,0.49,0,0
4,4.0,berries,0.017,0,0


In [66]:
import plotly.express as px

fig = px.box(df_scores,
              x='x',
              y="score",
              color="tm",
              #facet_col="tm",
              hover_name="word",
              #render_mode="svg",
              width=1000
              )
fig.show()

# Topics per Class
use topics_per_class method

In [None]:
classes = df_reviews.wine.tolist()

In [None]:
topics_per_class = topic_model.topics_per_class(docs, classes=classes)

6it [00:00, 23.67it/s]


In [None]:
normalize_frequency = False
custom_labels = True

In [None]:
topic_model.visualize_topics_per_class(topics_per_class, top_n_topics=10,
                                       normalize_frequency = normalize_frequency,
                                       width=1000, height=500,
                                       custom_labels=custom_labels)

## Share of reviews

In [None]:
total_freq = df_reviews.groupby('wine').count().id.to_dict() # num of documents per class

df = topics_per_class.assign(Frequency=topics_per_class.apply(lambda x: x.Frequency/total_freq[x.Class], axis=1))

sum([v for k,v in total_freq.items()]), len(docs)

(1581, 1581)

In [None]:
normalize_frequency = False
custom_labels = False

In [None]:
topic_model.visualize_topics_per_class(df, top_n_topics=10,
                                       normalize_frequency = normalize_frequency,
                                       width=1000, height=500,
                                       custom_labels=custom_labels)

In [None]:
for tid in range(10):

    l1 = get_llm_labels(topic_model, 'Llama2', tid)
    l2 = get_llm_labels(topic_model, 'Llama2Desc', tid)
    print_with_line_feed(f'{tid}. {l1}: {l2}', 80)


0. Wine Review: A rich and complex red wine with notes of dark fruit, oak, and 
spice, displaying a medium body and moderate tannins, showing a subtle imbalance 
towards dryness on the palate. 
1. Flavor profiles: Flavor profiles and pairings involving various fruits, 
spices, and other ingredients. 
2. Wine Reviews: Cabernet Sauvignon wine review 
3. Value for Money: The topic is about finding good value for money or getting 
the best bang for your buck. 
4. Fruit flavors: Flavor profile of fruity drinks 
5. Wine Label: "Fruit-Forward Red": Wine descriptions 
6. Chilean Wine: Chilean wine varieties and blind taste testing 
7. Taste profile: The topic of sour and bitter tastes in food and drinks, 
including descriptions of different levels of intensity and individual 
preferences. 
8. Everyday Wine: Everyday simplicity in affordable wine 
9. Affordable and enjoyable beverages: Affordable and convenient drink option 


In [None]:
#get_llm_labels(topic_model, None)

tid = 2

l = get_llm_labels(topic_model, 'Llama2', tid)
print_with_line_feed(f'{l}:', 80)

l = get_llm_labels(topic_model, 'Llama2Desc', tid)
print_with_line_feed(l, 80)

Wine Reviews: 
Cabernet Sauvignon wine review 


In [None]:
# for checking

tid = 1
hotel = 'Park Inn'

topics_per_class.loc[(topics_per_class.Topic==tid) & (topics_per_class.Class==hotel)].Frequency / total_freq[hotel]

34    0.023483
Name: Frequency, dtype: float64