<a href="https://colab.research.google.com/github/lbk209/topic_modeling/blob/main/tm_wine_reviews_params_reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setting

In [1]:
import ast

import numpy as np
import plotly.express as px

In [None]:
# Force the preferred encoding reported by `locale` to UTF-8
# (original note: needed to work with path names containing blanks).
# The replacement accepts the same optional `do_setlocale` argument as the real
# function, so callers that use locale.getpreferredencoding(False) keep working.
import locale
locale.getpreferredencoding = lambda do_setlocale=True: "UTF-8"

In [2]:
import os
import pandas as pd

def read_csv(file, path_data, **kwargs):
    """
    Load every file in `path_data` whose name starts with `file` and stack them.

    Args:
        file: file-name prefix used to select files (matched with str.startswith).
        path_data: directory that is listed and read from.
        kwargs: keyword args for pd.read_csv

    Returns:
        One DataFrame holding the rows of all matching files, taken in sorted
        file-name order and re-indexed from 0. An empty DataFrame when no
        file name matches.
    """
    # os.listdir gives names in arbitrary order; sort so the row order is reproducible
    files = sorted(x for x in os.listdir(path_data) if x.startswith(file))
    if not files:
        return pd.DataFrame()

    # read everything first and concatenate once instead of growing a frame in the loop
    frames = [pd.read_csv(os.path.join(path_data, f), **kwargs) for f in files]
    return pd.concat(frames, ignore_index=True)

In [3]:
# directory the archive is extracted into and the CSV files are read from
path_data = 'sample_data'
# directory holding the zip archive; the backslash escapes the blank for the shell
# command this path is interpolated into (raw string: '\ ' is not a valid Python escape)
path_src = r'/content/drive/MyDrive/Colab\ Notebooks/'

# Review result

In [4]:
# base name of the zip archive; the result files read afterwards are selected by this prefix
file = 'wr_param_study_240218'
# extract {path_src}/{file}.zip into the path_data directory
!unzip {path_src}/{file}.zip -d {path_data}

Archive:  /content/drive/MyDrive/Colab Notebooks//wr_param_study_240218.zip
  inflating: sample_data/wr_param_study_240218_a1.csv  
  inflating: sample_data/wr_param_study_240218_score_a1.csv  
  inflating: sample_data/wr_param_study_240218_a2.csv  
  inflating: sample_data/wr_param_study_240218_score_a2.csv  


In [5]:
# load all result files whose name starts with '<file>_a'
f = f'{file}_a'
df_result = read_csv(f, path_data)

# find topic names: topic columns are the ones whose label is made of digits only
cols = [x for x in df_result.columns if x.isdigit()]

# convert values to list
# Populated cells are read as text and parsed as Python literals; anything that is not
# a string (i.e. a missing value) is left untouched. ast.literal_eval only accepts
# literals, so nothing stored in the file can be executed as code.
df_result.loc[:, cols] = df_result.loc[:, cols].apply(
    lambda col: col.map(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
)

# convert topics cols to int
cols_topic = [int(x) for x in cols]
df_result = df_result.rename(columns=dict(zip(cols, cols_topic)))

df_result.head()

Unnamed: 0,min_df,max_df,n_components,n_neighbors,min_dist,min_cluster_size,min_samples,0,1,2,...,85,86,87,88,89,90,91,92,93,94
0,0.001,0.5,10,50,0.1,100,10,"[blackberry chocolate, blackberry pepper, vani...","[good wine, excellent wine, nice wine, wine go...","[chardonnay fruity, decent chardonnay, tasty c...",...,,,,,,,,,,
1,0.001,1.0,10,10,0.05,100,100,"[sweetness, vanilla oak, oak vanilla, great, g...","[good wine, nice wine, wine good, great wine, ...","[good chardonnay, nice chardonnay, chardonnay,...",...,,,,,,,,,,
2,0.001,0.5,50,20,0.05,50,5,"[peach pear, honey pear, apple pear, honey pea...","[good wine, wine good, wine good wine, excelle...","[chardonnay, chardonnay good, nice chardonnay,...",...,,,,,,,,,,
3,0.01,1.0,50,50,0.0,20,2,"[good wine, wine good, excellent wine, nice wi...","[nice chardonnay, delicious chardonnay, good c...","[peach honey pear, peach pear honey, honey pea...",...,,,,,,,,,,
4,0.001,1.0,50,50,0.05,100,100,"[great, excellent, good, nice, oak vanilla, va...","[good wine, wine good, nice wine, great wine, ...","[chardonnay, nice chardonnay, good chardonnay,...",...,,,,,,,,,,


In [6]:
# load all score files whose name starts with '<file>_score'
f = f'{file}_score'
df_score = read_csv(f, path_data)

# labels made of digits only are the per-topic columns
cols = [label for label in df_score.columns if label.isdigit()]

# relabel the topic columns with integers
cols_topic = list(map(int, cols))
df_score = df_score.rename(columns={old: new for old, new in zip(cols, cols_topic)})

df_score.head()

Unnamed: 0,min_df,max_df,n_components,n_neighbors,min_dist,min_cluster_size,min_samples,0,1,2,...,85,86,87,88,89,90,91,92,93,94
0,0.01,1.0,10,20,0.0,50,25,0.625278,0.802663,0.909255,...,,,,,,,,,,
1,0.001,0.5,20,20,0.05,100,100,0.851484,0.777453,0.673803,...,,,,,,,,,,
2,0.01,0.5,20,50,0.1,20,20,0.688655,0.856638,0.671677,...,,,,,,,,,,
3,0.01,1.0,10,20,0.1,50,50,0.488196,0.813407,0.909836,...,,,,,,,,,,
4,0.001,0.5,50,50,0.1,20,2,0.856688,0.927628,0.885631,...,,,,,,,,,,


In [21]:
# frame to check; switch to df_result to run the same check on the topic results
#df = df_result
df = df_score

# the first 7 columns hold the parameter values; the remaining ones are per-topic columns
cols_param = list(df_score.columns)[:7]
# number of rows whose parameter values repeat an earlier row
# (duplicates are judged on the parameter columns only)
a = df.loc[:,cols_param].duplicated().sum()
b = len(df)

print(f'{a} duplicated in {b} param sets')

0 duplicated in 400 param sets


## Metric: c-TF-IDF of KeyBERT
The plots below are built with Plotly Express; see https://plotly.com/python/plotly-express/ for its documentation.

In [23]:
# Reshape from wide (one column per topic) to long: the parameter columns become the
# index, the topic labels are stacked into a 'Topic' level, the values are named
# 'score', and everything is turned back into ordinary columns.
df_score2 = (
    df_score
    .set_index(cols_param)
    .rename_axis(columns='Topic')
    .stack()
    .rename('score')
    .reset_index()
)
df_score2.head()

Unnamed: 0,min_df,max_df,n_components,n_neighbors,min_dist,min_cluster_size,min_samples,Topic,score
0,0.01,1.0,10,20,0.0,50,25,0,0.625278
1,0.01,1.0,10,20,0.0,50,25,1,0.802663
2,0.01,1.0,10,20,0.0,50,25,2,0.909255
3,0.01,1.0,10,20,0.0,50,25,3,0.924828
4,0.01,1.0,10,20,0.0,50,25,4,0.798456


In [24]:
# column whose values are tallied
a = 'min_samples'
# number of rows per distinct value, largest group first
rows_per_value = df_score2.groupby(a)[a].count()
rows_per_value.sort_values(ascending=False)

min_samples
2      3780
10     2384
20     1460
5       612
50      597
25      293
100     205
Name: min_samples, dtype: int64

In [77]:
# Row selector applied with df_score2.loc[cond] when plotting.
# Using the full index selects every row.
cond = df_score2.index # default
# alternative: boolean mask that keeps only rows with min_samples == 2
#cond = (df_score2.min_samples==2)

In [72]:
# px.scatter arguments to fill, in the same order as the fields of a spec line
a = ['x', 'y', 'color', 'facet_col', 'facet_row']
# One spec per line (comma-separated column names); lines starting with '#' are
# disabled and the first enabled line is the one that gets plotted.
b = """
#Topic, score, min_df, n_components, min_cluster_size
#Topic, score, min_cluster_size, n_neighbors, n_components,
#Topic, score, n_neighbors, min_cluster_size, n_components,
Topic, score, n_components, min_cluster_size, n_neighbors,
#min_samples, score, n_components, min_cluster_size, n_neighbors,
#n_components, score, min_samples, min_cluster_size, n_neighbors
"""

# keep only enabled, non-blank spec lines
b = [x.strip() for x in b.splitlines()]
b = [x for x in b if x and not x.startswith('#')]
# split the first enabled spec into column names; a trailing comma would otherwise
# leave an empty name at the end
b = [x.strip() for x in b[0].split(',') if x.strip()]
kw = dict(zip(a,b))

# same mapping plus the marginal plots drawn along each axis
kwm = kw.copy()
kwm.update({
    'marginal_y': "violin",
    'marginal_x': "box"
})

# The color column is cast to str (presumably so it is colored as discrete categories),
# rows are restricted by `cond`, then sorted by the color and facet columns.
fig = px.scatter(df_score2.astype({kw['color']: str}).loc[cond].sort_values(list(kw.values())[2:]),
                 width=1200, height=1000,
                 **kwm)
fig.show()

In [73]:
# select the rows of topic 0 that come from runs with n_components == 50
is_50_components = df_score2.n_components == 50
is_topic_0 = df_score2.Topic == 0
cond = is_50_components & is_topic_0
# leave out the last column, then list every remaining distinct row once
df_score2.loc[cond].iloc[:,:-1].drop_duplicates()

Unnamed: 0,min_df,max_df,n_components,n_neighbors,min_dist,min_cluster_size,min_samples,Topic
30,0.001,0.5,50,50,0.10,20,2,0
95,0.001,0.5,50,50,0.00,20,20,0
103,0.010,1.0,50,20,0.05,100,10,0
109,0.010,0.5,50,50,0.00,100,100,0
198,0.001,1.0,50,50,0.10,20,20,0
...,...,...,...,...,...,...,...,...
8998,0.010,1.0,50,10,0.05,100,10,0
9011,0.010,1.0,50,20,0.00,20,10,0
9142,0.001,0.5,50,20,0.05,50,25,0
9213,0.001,1.0,50,50,0.05,50,5,0


In [78]:
import itertools

# position in `bs` of the combination to plot next
i = 0

# every choice of 3 out of the first 7 columns, each prefixed with the fixed
# 'Topic' and 'score' entries
bs = [['Topic', 'score', *combo] for combo in itertools.combinations(df_score2.columns[:7], 3)]

# row selector: the full index, i.e. every row
cond = df_score2.index

In [88]:
# Shows one plot per run: every execution of this cell plots bs[i] and then advances i.
# Wrap around so that running the cell again after the last combination starts over
# instead of raising IndexError.
i %= len(bs)
print(f'{i+1} from {len(bs)} plots\n')

b = bs[i]

# pair the px.scatter argument names with the selected column names
a = ['x', 'y', 'color', 'facet_col', 'facet_row']
kw = dict(zip(a,b))

# same mapping plus the marginal plots drawn along each axis
kwm = kw.copy()
kwm.update({
    'marginal_y': "violin",
    'marginal_x': "box"
})

# color column cast to str, rows restricted by `cond`, sorted by the color and facet columns
fig = px.scatter(df_score2.astype({kw['color']: str}).loc[cond].sort_values(list(kw.values())[2:]),
                 #width=1200, height=1000,
                 width=1000, height=600,
                 **kwm)
# fixed y range so that consecutive plots can be compared
fig.update_layout(yaxis=dict(range=[0,1]))
fig.show()

i += 1

10 from 35 plots



In [91]:
import time, itertools

# pause between two consecutive figures, passed to time.sleep (seconds)
sleep_time = 3

def visualize_result(df_score2, b, a=('x', 'y', 'color', 'facet_col', 'facet_row'),
                     width=1000, height=600):
    """
    Build a Plotly Express scatter figure with a violin marginal on y and a box marginal on x.

    Args:
        df_score2: DataFrame containing the columns named in `b`.
        b: column names, paired positionally with the argument names in `a`.
        a: names of the px.scatter keyword arguments to fill. Must contain 'color';
           the entries from the third position on are also used as sort keys.
           The default is an immutable tuple so it cannot be altered between calls.
        width: figure width passed to px.scatter.
        height: figure height passed to px.scatter.

    Returns:
        The figure; it is not shown here, the caller decides when to display it.
    """
    kw = dict(zip(a, b))

    # same mapping plus the marginal plots drawn along each axis
    kwm = {**kw, 'marginal_y': "violin", 'marginal_x': "box"}

    # cast the color column to str, then sort by the columns mapped from position 3 on
    sort_keys = list(kw.values())[2:]
    data = df_score2.astype({kw['color']: str}).sort_values(sort_keys)

    fig = px.scatter(data, width=width, height=height, **kwm)
    # fixed y range so that figures from different calls can be compared
    fig.update_layout(yaxis=dict(range=[0,1]))
    return fig


# one figure per choice of 3 out of the first 7 columns
for combo in itertools.combinations(df_score2.columns[:7], 3):
    # 'Topic' and 'score' always come first, followed by the chosen columns
    b = ['Topic', 'score', *combo]
    fig = visualize_result(df_score2, b)
    fig.show()
    # fixed pause before moving on to the next figure
    time.sleep(sleep_time)

KeyboardInterrupt: 