# Run the model

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np


# Load the data
data_path = 'attachment/attach_processed_length10.csv'

# Set embedding save path
embeddings_path = 'attachment/models/doc/attach_doc_embeddings.npy'

df = pd.read_csv(data_path, usecols=['text'], low_memory=False)
docs = df['text'].tolist()

# Prepare sub-models
embedding_model = SentenceTransformer('thenlper/gte-large')
embeddings = embedding_model.encode(docs, show_progress_bar=True)

np.save(embeddings_path, embeddings)

In [None]:
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from sentence_transformers import SentenceTransformer
import collections
from tqdm import tqdm
from scipy.cluster import hierarchy as sch

# Define file paths
data_path = 'attachment/attach_processed_length10.csv'
embeddings_path = 'attachment/models/doc/attach_doc_embeddings.npy'
model_save_path = 'attachment/models/doc/attach_doc3_model_dir/'

print("Step 1: Loading the data {data_path}")
# Load the data
df = pd.read_csv(data_path, usecols=['text'], low_memory=False)

print("Step 2: Preparing the documents...")
# Specify what the 'docs' are
docs = df['text'].tolist()

# Load the embeddings
embeddings = np.load(embeddings_path)

###### Extract vocab to be used in BERTopic
vocab = collections.Counter()
tokenizer = CountVectorizer(ngram_range=(1, 4)).build_tokenizer()
for doc in tqdm(docs):
    vocab.update(tokenizer(doc))
vocab = [word for word, frequency in vocab.items() if frequency >= 30]; len(vocab)


umap_model = UMAP(
        n_components=3,  # has a wild impact hard to predict
        n_neighbors=20,  # Higher is a more gloabl strcture
        min_dist=0.01,   # Lower value means more dense packing
        random_state=42, # Reproducability
        metric="cosine", # have to pick something
        n_jobs=-1        # speed
        )

hdbscan_model = HDBSCAN(
            min_cluster_size=100,           # smallest size group considered
            min_samples=20,               # larger is more conservative - more noise
            leaf_size=40,                   # number of points per leaf node in the tree - default 40
            gen_min_span_tree=True,        # True creates minimum spanning trees - increasing RAM
            prediction_data=True,           # generates extra cached data of prediction labels for new data or reuse
            cluster_selection_method='eom', # eom is normal - leaf might get more homogeneous clusters
            cluster_selection_epsilon=0.0,  # default - merges clusters below threshold
            core_dist_n_jobs=-1,            # For speed
            )

topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    verbose=True)
topics, probs = topic_model.fit_transform(docs, embeddings)

topic_model.save(model_save_path, serialization="safetensors", save_ctfidf=True, save_embedding_model=embedding_model)



# Load and Create Hierarchies

In [None]:
# Hierarchical topics
# linkage_function options:
# single - min distance; long, chain like clusters; sensitive to noise
# complete - maximum distance; compact, round clusters; less sensitive
# avarge - average distane between all points in 2 clusters; balance of single/complete; good compromoise
# ward - minimizes increase in squared distance wihtin cluster; tends toward balanced clusters, suitable for varying sizes

from bertopic import BERTopic
from scipy.cluster import hierarchy as sch
from bertopic import BERTopic
import pandas as pd
from scipy.cluster import hierarchy as sch

# Define file paths
data_path = 'attachment/attach_processed_length10.csv'
embeddings_path = 'attachment/models/doc/attach_doc_embeddings.npy'
local_model = 'attachment/models/doc/attach_doc3_model_dir/'

# Load the data
df = pd.read_csv(data_path, usecols=['text'], low_memory=False)

# Specify what the 'docs' are
docs = df['text'].tolist()

# Load the embeddings
embeddings = np.load(embeddings_path)

# Load 'em up
topic_model = BERTopic.load(local_model, embedding_model=embedding_model)

linkage_function = lambda x: sch.linkage(x, 'ward', optimal_ordering=True)
hierarchical_topics = topic_model.hierarchical_topics(docs, linkage_function=linkage_function) 

tree = topic_model.get_topic_tree(hierarchical_topics)
print(tree)

topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

# Visualizations and Evaluations

### Visualize - in 2 parts

In [None]:
import itertools
import pandas as pd
from bertopic import BERTopic
import numpy as np 
from umap import UMAP
import seaborn as sns
from matplotlib import pyplot as plt
from adjustText import adjust_text
import matplotlib.patheffects as pe

# Load the embeddings
embeddings_path = 'attachment/doc/models/attach_doc_embeddings.npy'
embeddings = np.load(embeddings_path)

# Reduce the embeddings
umap_model = UMAP(
        n_components=2,  # has a wild impact hard to predict
        n_neighbors=30,  # Higher is a more gloabl strcture
        min_dist=0.01,   # Lower value means more dense packing
        random_state=42, # Reproducability
        metric="cosine", # have to pick something
        n_jobs=-1        # speed
        )

reduced_embeddings = umap_model.fit_transform(embeddings)

# Load the model
local_model = 'attachment/doc/models/attach_doc1_model_dir/'
topic_model = BERTopic.load(local_model, embedding_model='thenlper/gte-large')

# Get the Topics
topic_info_df = topic_model.get_topic_info()
# print(topic_info_df)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from adjustText import adjust_text
import matplotlib.patheffects as pe

# Set the 'df'
df = reduced_embeddings

# Assuming 'topic_model.topics_' contains all the unique topics including '-1'
unique_topics = set(topic_model.topics_) - {-1}  # Remove the '-1' topic if present

# Define the number of unique topics
n_topics = len(unique_topics)

colormap_name = 'tab10' if n_topics <= 10 else 'tab20'
cmap = plt.get_cmap(colormap_name)

# Generate colors from the colormap
colors = cmap.colors

# Create a color key mapping each topic to a color
color_key = {str(topic): colors[i % len(colors)] for i, topic in enumerate(sorted(unique_topics))}

# Prepare dataframe and ignore outliers
df = pd.DataFrame({
    "x": reduced_embeddings[:, 0],  # x-coordinates from reduced embeddings
    "y": reduced_embeddings[:, 1],  # y-coordinates from reduced embeddings
    "Topic": topic_model.topics_    # topics from the BERTopic model
})

# Optionally, you can filter the DataFrame to remove outliers or unwanted topics
# For example, if you want to exclude topic '-1' and filter based on x and y values:
df = df[df['Topic'] != -1]
df = df[(df['x'] > -10) & (df['x'] < 10) & (df['y'] > -10) & (df['y'] < 10)]

# Convert 'Topic' to a string if it's not already, as the color_key expects string keys
df['Topic'] = df['Topic'].astype(str)

# Create a dummy column for size (all values are 1)
df['Length'] = 1

# Create the scatter plot
fig = plt.figure(figsize=(16, 16))
sns.scatterplot(data=df, x='x', y='y', hue='Topic', palette=color_key, alpha=0.4, sizes=(0.4, 10), size="Length")

# Create 'mean_df' to store the centroids of each topic
mean_df = df.groupby('Topic').mean().reset_index()

# Annotate top 50 topics
texts, xs, ys = [], [], []
for row in mean_df.iterrows():
    topic = row[1]["Topic"]
    name = " - ".join(list(zip(*topic_model.get_topic(int(topic))))[0][:3])

    if int(topic) <= 50:
        xs.append(row[1]["x"])
        ys.append(row[1]["y"])
        texts.append(plt.text(row[1]["x"], row[1]["y"], name, size=10, ha="center", 
                              color=color_key[str(int(topic))], path_effects=[pe.withStroke(linewidth=0.5, foreground="black")]))

# Adjust annotations to prevent overlap
adjust_text(texts, x=xs, y=ys, time_lim=1, force_text=(0.01, 0.02), force_static=(0.01, 0.02), force_pull=(0.5, 0.5))
plt.show() 


### Visualize from BERTopic

In [None]:
# Run the visualization with the original embeddings
topic_model.visualize_documents(docs, embeddings=embeddings)

# Reduce dimensionality of embeddings, this step is optional but much faster to perform iteratively:
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)
topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)

### Visualize attempt 2

In [12]:
import pandas as pd
from bertopic import BERTopic
import numpy as np
from umap import UMAP
import plotly.express as px

data_path = 'attachment/attach_short.csv'
df = pd.read_csv(data_path, usecols=['text'], low_memory=False)
docs = df['text'].tolist()

# Load the embeddings
embeddings_path = 'attachment/short/attach_short_embeddings.npy'
embeddings = np.load(embeddings_path)

# Load the model
local_model = 'attachment/short/attach_short_model_dir/'
topic_model = BERTopic.load(local_model, embedding_model='thenlper/gte-large')

# Define UMAP hyperparameter combinations
umap_params_list = [
    {'n_neighbors': 10, 'min_dist': 0.0, 'metric': 'cosine'},
    {'n_neighbors': 10, 'min_dist': 0.1, 'metric': 'cosine'},
    {'n_neighbors': 10, 'min_dist': 0.5, 'metric': 'cosine'},
    {'n_neighbors': 20, 'min_dist': 0.0, 'metric': 'cosine'},
    {'n_neighbors': 20, 'min_dist': 0.1, 'metric': 'cosine'},
    {'n_neighbors': 50, 'min_dist': 0.0, 'metric': 'cosine'},
    {'n_neighbors': 50, 'min_dist': 0.1, 'metric': 'cosine'},
    # Add more parameter combinations as needed
]

# Define color scheme
color_scheme = px.colors.qualitative.Plotly

# Iterate over UMAP parameter combinations
for i, umap_params in enumerate(umap_params_list):
    # Reduce dimensionality of embeddings
    umap_model = UMAP(**umap_params)
    reduced_embeddings = umap_model.fit_transform(embeddings)

    # Create a DataFrame with the reduced embeddings and topic information
    topic_labels = topic_model.topics_
    topic_info_df = topic_model.get_topic_info()
    topic_names = {topic_id: " - ".join(list(zip(*topic_model.get_topic(topic_id)))[0][:3]) for topic_id in topic_info_df.Topic.unique()}

    df_vis = pd.DataFrame({
        'x': reduced_embeddings[:, 0],
        'y': reduced_embeddings[:, 1],
        'Topic': [topic_names[topic_id] if topic_id in topic_names else "No Topic" for topic_id in topic_labels]
    })

    # Create the interactive scatter plot using Plotly
    fig = px.scatter(df_vis, x='x', y='y', color='Topic', color_discrete_sequence=color_scheme, hover_data=['Topic'], opacity=0.7)

    # Update layout for better visibility and consistent plot size
    fig.update_layout(plot_bgcolor='white', width=800, height=600,
                      showlegend=True, legend_title="Topics",
                      title=f'Interactive 2D Visualization of Topics<br>UMAP Parameters: n_neighbors={umap_params["n_neighbors"]}, min_dist={umap_params["min_dist"]}, metric={umap_params["metric"]}',
                      xaxis=dict(range=[df_vis['x'].min() - 1, df_vis['x'].max() + 1]),
                      yaxis=dict(range=[df_vis['y'].min() - 1, df_vis['y'].max() + 1]))

    # Save the interactive plot as an individual HTML file
    file_name = f"doc1_2d_n_neighbors_{umap_params['n_neighbors']}_min_dist_{umap_params['min_dist']}_metric_{umap_params['metric']}"
    fig.write_html(f"attachment/doc/visualizations/doc1/{file_name}.html")
    print(f"Visualization saved as 'attachment/doc/visualizations/doc1/{file_name}.html'")

Visualization saved as 'attachment/doc/visualizations/doc1/doc1_2d_n_neighbors_10_min_dist_0.0_metric_cosine.html'
Visualization saved as 'attachment/doc/visualizations/doc1/doc1_2d_n_neighbors_10_min_dist_0.1_metric_cosine.html'
Visualization saved as 'attachment/doc/visualizations/doc1/doc1_2d_n_neighbors_10_min_dist_0.5_metric_cosine.html'
Visualization saved as 'attachment/doc/visualizations/doc1/doc1_2d_n_neighbors_20_min_dist_0.0_metric_cosine.html'
Visualization saved as 'attachment/doc/visualizations/doc1/doc1_2d_n_neighbors_20_min_dist_0.1_metric_cosine.html'
Visualization saved as 'attachment/doc/visualizations/doc1/doc1_2d_n_neighbors_50_min_dist_0.0_metric_cosine.html'
Visualization saved as 'attachment/doc/visualizations/doc1/doc1_2d_n_neighbors_50_min_dist_0.1_metric_cosine.html'


### Synthetic Data

In [11]:
import pandas as pd
import numpy as np
from umap import UMAP
import plotly.express as px
from sklearn.datasets import make_blobs

# Generate synthetic data
n_samples = 1000
n_topics = 20
docs, topic_labels = make_blobs(n_samples=n_samples, centers=n_topics, n_features=768, random_state=42)

# Define UMAP hyperparameter combinations
umap_params_list = [
    {'n_neighbors': 10, 'min_dist': 0.0, 'metric': 'cosine'},
    {'n_neighbors': 30, 'min_dist': 0.1, 'metric': 'euclidean'},
    {'n_neighbors': 50, 'min_dist': 0.5, 'metric': 'cosine'},
    # Add more parameter combinations as needed
]

# Define color scheme
color_scheme = px.colors.qualitative.Plotly

# Iterate over UMAP parameter combinations
for i, umap_params in enumerate(umap_params_list):
    # Reduce dimensionality of embeddings
    umap_model = UMAP(**umap_params)
    reduced_embeddings = umap_model.fit_transform(docs)

    # Create a DataFrame with the reduced embeddings and topic information
    topic_names = {topic_id: f"Topic {topic_id}" for topic_id in range(n_topics)}

    df_vis = pd.DataFrame({
        'x': reduced_embeddings[:, 0],
        'y': reduced_embeddings[:, 1],
        'Topic': [topic_names[topic_id] for topic_id in topic_labels]
    })

    # Create the interactive scatter plot using Plotly
    fig = px.scatter(df_vis, x='x', y='y', color='Topic', color_discrete_sequence=color_scheme, hover_data=['Topic'], opacity=0.7)

    # Update layout for better visibility and consistent plot size
    fig.update_layout(plot_bgcolor='white', width=800, height=600,
                      showlegend=True, legend_title="Topics",
                      title=f'Interactive 2D Visualization of Topics<br>UMAP Parameters: n_neighbors={umap_params["n_neighbors"]}, min_dist={umap_params["min_dist"]}, metric={umap_params["metric"]}',
                      xaxis=dict(range=[df_vis['x'].min() - 1, df_vis['x'].max() + 1]),
                      yaxis=dict(range=[df_vis['y'].min() - 1, df_vis['y'].max() + 1]))

    # Save the interactive plot as an individual HTML file
    file_name = f"doc1_2d_n_neighbors_{umap_params['n_neighbors']}_min_dist_{umap_params['min_dist']}_metric_{umap_params['metric']}"
    fig.write_html(f"attachment/doc/visualizations/doc1/{file_name}.html")
    print(f"Visualization saved as 'attachment/doc/visualizations/doc1/{file_name}.html'")

Visualization saved as 'attachment/doc/visualizations/doc1/doc1_2d_n_neighbors_10_min_dist_0.0_metric_cosine.html'
Visualization saved as 'attachment/doc/visualizations/doc1/doc1_2d_n_neighbors_30_min_dist_0.1_metric_euclidean.html'
Visualization saved as 'attachment/doc/visualizations/doc1/doc1_2d_n_neighbors_50_min_dist_0.5_metric_cosine.html'


### Beautiful UMAP

In [None]:
import pandas as pd
from bertopic import BERTopic
import numpy as np
from umap import UMAP
import plotly.express as px
from sklearn.preprocessing import PowerTransformer
from sklearn.pipeline import make_pipeline

# Load your dataset
data_path = 'attachment/attach_processed_length10.csv'
df = pd.read_csv(data_path, usecols=['text'], low_memory=False)
docs = df['text'].tolist()

# Load the embeddings
embeddings_path = 'attachment/doc/models/attach_doc_embeddings.npy'
embeddings = np.load(embeddings_path)

# Load the model
local_model = 'attachment/doc/models/attach_doc1_model_dir/'
topic_model = BERTopic.load(local_model, embedding_model='thenlper/gte-large')

# Scale the embeddings using PowerTransformer
pipe = make_pipeline(PowerTransformer())
scaled_embeddings = pipe.fit_transform(embeddings)

# Define UMAP hyperparameters
umap_model = UMAP(
    n_components=2,
    n_neighbors=15,
    min_dist=0.0,
    metric='euclidean',
    random_state=42,
    n_jobs=-1
)

# Fit UMAP and transform the scaled embeddings
umap_embeddings = umap_model.fit_transform(scaled_embeddings)

# Create a DataFrame for the UMAP output
umap_df = pd.DataFrame(umap_embeddings, columns=['x', 'y'])
umap_df['topic'] = topics

# Create a dictionary to map topic IDs to topic names
topic_info_df = topic_model.get_topic_info()
topic_names = {topic_id: " - ".join(list(zip(*topic_model.get_topic(topic_id)))[0][:3]) for topic_id in topic_info_df.Topic.unique()}
umap_df['topic_name'] = umap_df['topic'].map(topic_names)

# Visualize with Plotly using the "fire" color scheme
color_scheme = px.colors.sequential.YlOrRd
fig = px.scatter(umap_df, x='x', y='y', color='topic_name', color_discrete_sequence=color_scheme,
                 hover_data=['topic_name'], opacity=0.7, title='UMAP Visualization of Topics')

# Update layout for better visibility and consistent plot size
fig.update_layout(plot_bgcolor='black', width=800, height=600,
                  showlegend=True, legend_title="Topics",
                  xaxis=dict(range=[umap_df['x'].min() - 1, umap_df['x'].max() + 1]),
                  yaxis=dict(range=[umap_df['y'].min() - 1, umap_df['y'].max() + 1]))

# Save the interactive plot as an HTML file
file_name = "short_2d_umap_plot"
fig.write_html(f"attachment/doc/visualizations/{file_name}.html")
print(f"Visualization saved as 'attachment/doc/visualizations/{file_name}.html'")

### Visualize from Claude

In [28]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,1982,0_yes_me_this_it,"[yes, me, this, it, fa, lol, same, you, no, oh]",
1,1,10,1_bot_good_bad_fkn,"[bot, good, bad, fkn, bots, sorry, , , , ]",


In [34]:
import pandas as pd
from bertopic import BERTopic
import numpy as np
from umap import UMAP
import plotly.express as px

data_path = 'attachment/attach_processed_length10.csv'
df = pd.read_csv(data_path, usecols=['text'], low_memory=False)
docs = df['text'].tolist()

# Load the embeddings
embeddings_path = 'attachment/doc/models/attach_doc_embeddings.npy'
embeddings = np.load(embeddings_path)

# Load the model
local_model = 'attachment/doc/models/attach_doc1_model_dir/'
topic_model = BERTopic.load(local_model, embedding_model='thenlper/gte-large')

# Generate topics for the documents
topics, probs = topic_model.transform(docs)

# Check if there are any outlier documents
outlier_mask = topics == -1
outlier_docs = [doc for doc, is_outlier in zip(docs, outlier_mask) if is_outlier]

if len(outlier_docs) > 0:
    # Reduce outliers
    new_topics = topic_model.reduce_outliers(docs, topics, strategy="c-tf-idf", threshold=0.5)
else:
    print("No outlier documents found. Skipping outlier reduction.")
    new_topics = topics

# Update the original topic model with the new topics
topic_model.topics_ = new_topics

# Define UMAP hyperparameter combinations
umap_params_list = [
    {'n_neighbors': 10, 'min_dist': 0.0, 'metric': 'cosine'},
    {'n_neighbors': 10, 'min_dist': 0.1, 'metric': 'cosine'},
    {'n_neighbors': 20, 'min_dist': 0.0, 'metric': 'cosine'},
    {'n_neighbors': 20, 'min_dist': 0.1, 'metric': 'cosine'},
    {'n_neighbors': 50, 'min_dist': 0.0, 'metric': 'cosine'},
    {'n_neighbors': 50, 'min_dist': 0.1, 'metric': 'cosine'},
    # Add more parameter combinations as needed
]

# Define color scheme
color_scheme = px.colors.qualitative.Plotly

# Iterate over UMAP parameter combinations
for i, umap_params in enumerate(umap_params_list):
    # Reduce dimensionality of embeddings
    umap_model = UMAP(**umap_params)
    reduced_embeddings = umap_model.fit_transform(embeddings)

    # Create a DataFrame with the reduced embeddings and topic information
    topic_labels = topic_model.topics_
    topic_info_df = topic_model.get_topic_info()
    topic_names = {topic_id: " - ".join(list(zip(*topic_model.get_topic(topic_id)))[0][:3]) for topic_id in topic_info_df.Topic.unique()}

    df_vis = pd.DataFrame({
        'x': reduced_embeddings[:, 0],
        'y': reduced_embeddings[:, 1],
        'Topic': [topic_names[topic_id] if topic_id in topic_names else "No Topic" for topic_id in topic_labels]
    })

    # Create the interactive scatter plot using Plotly
    fig = px.scatter(df_vis, x='x', y='y', color='Topic', color_discrete_sequence=color_scheme, hover_data=['Topic'], opacity=0.7)

    # Update layout for better visibility and consistent plot size
    fig.update_layout(plot_bgcolor='white', width=800, height=600,
                      showlegend=True, legend_title="Topics",
                      title=f'Interactive 2D Visualization of Topics<br>UMAP Parameters: n_neighbors={umap_params["n_neighbors"]}, min_dist={umap_params["min_dist"]}, metric={umap_params["metric"]}',
                      xaxis=dict(range=[df_vis['x'].min() - 1, df_vis['x'].max() + 1]),
                      yaxis=dict(range=[df_vis['y'].min() - 1, df_vis['y'].max() + 1]))

    # Save the interactive plot as an individual HTML file
    file_name = f"doc1_2d_n_neighbors_{umap_params['n_neighbors']}_min_dist_{umap_params['min_dist']}_metric_{umap_params['metric']}"
    fig.write_html(f"attachment/doc/visualizations/doc1/{file_name}.html")
    print(f"Visualization saved as 'attachment/doc/visualizations/doc1/{file_name}.html'")

Batches: 100%|██████████| 11895/11895 [34:26<00:00,  5.76it/s] 
2024-05-07 08:02:18,378 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


Visualization saved as 'attachment/doc/visualizations/doc1/doc1_2d_n_neighbors_10_min_dist_0.0_metric_cosine.html'
Visualization saved as 'attachment/doc/visualizations/doc1/doc1_2d_n_neighbors_10_min_dist_0.1_metric_cosine.html'
Visualization saved as 'attachment/doc/visualizations/doc1/doc1_2d_n_neighbors_20_min_dist_0.0_metric_cosine.html'
Visualization saved as 'attachment/doc/visualizations/doc1/doc1_2d_n_neighbors_20_min_dist_0.1_metric_cosine.html'
Visualization saved as 'attachment/doc/visualizations/doc1/doc1_2d_n_neighbors_50_min_dist_0.0_metric_cosine.html'
Visualization saved as 'attachment/doc/visualizations/doc1/doc1_2d_n_neighbors_50_min_dist_0.1_metric_cosine.html'
