<a href="https://colab.research.google.com/github/imusicmash/wandb_workshop/blob/main/Copy_of_LLM_Clustering_and_Structured_Outputs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LLM Clustering and Structured Outputs

In this demo, we will use unstructured data from [The 2024 MAD (ML, AI & Data) Landscape](https://mad.firstmark.com/) and automatically cluster it into meaningful categories, so that we can get better insights from the data. In the process, we will use Instructor to facilitate getting structured outputs from LLMs.


# Load data

In [None]:
import pandas as pd
from clustering_utils import cluster_texts, visualize, name_clusters, dedup_cluster_names, assign_clusters, write_cluster_names_to_file
import warnings

warnings.filterwarnings('ignore')

Logged in as W&B user darek-kleczek.
View Weave data at https://wandb.ai/darek-kleczek/llm-clustering/weave


In [None]:
# Load the company table from a CSV expected in the working directory.
# NOTE(review): the preview of this frame shows an "Unnamed: 0" column, which looks like a
# saved index; it is kept as-is because downstream helpers receive the whole frame.
DATA_PATH = 'aicompanies_clean.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows to check the columns that were loaded.
df.head(n=5)

Unnamed: 0,category,company_name,website,founded_year,raised_amount,description
0,Storage,Cohesity,https://www.cohesity.com/,2013.0,$660M,Cohesity is a late-stage technology firm that ...
1,Storage,Qumulo,https://qumulo.com/,2012.0,$345.5M,"Qumulo, headquartered in Seattle, has develope..."
2,Storage,NetApp,https://www.netapp.com/,1992.0,,NetApp (NASDAQ: NTAP) provides data storage an...
3,Storage,HPE Nimble Storage,https://www.hpe.com/us/en/services/nimble-stor...,2015.0,,Hewlett Packard Enterprise (NYSE: HPE) provide...
4,Storage,MinIO,https://min.io/,2014.0,$126.3M,Minio provides open source object storage for ...


# Embed and visualize

In [None]:
# Cluster the company descriptions; cluster_texts returns the per-row cluster
# assignments together with the embeddings that the visualizations reuse.
descriptions = df['description'].tolist()
clusters, embeddings = cluster_texts(descriptions)
df['cluster_id'] = clusters

# First look: visualize the companies using the categories that ship with the source data.
company_labels = df['company_name'].tolist()
source_categories = df['category'].tolist()
visualize(company_labels, source_categories, embeddings)

In [None]:
# Repeat the visualization, this time passing the BERTopic cluster assignments
# instead of the categories from the source data.
company_labels = df['company_name'].tolist()
visualize(company_labels, clusters, embeddings)

# LLM Clustering Workflow

In [None]:
# Make sure to set the OPENAI_API_KEY environment variable before running this cell.
import os
# Fail fast, before the clustering helpers below run, if the key is missing or lacks the "sk-" prefix.
assert os.getenv("OPENAI_API_KEY", "").startswith("sk-"), "This doesn't look like a valid OpenAI API key"
print("OpenAI API key configured")

# Stage 1: name the clusters and save the names to a text file for inspection.
# NOTE(review): the helpers marked `await` are coroutines, so this cell relies on the
# notebook's support for top-level await.
cluster_names = await name_clusters(df)
write_cluster_names_to_file('stage1.txt', cluster_names)

# Stage 2: deduplicate the cluster names and save the result for inspection.
cluster_names = dedup_cluster_names(cluster_names)
write_cluster_names_to_file('stage2.txt', cluster_names)

# Stage 3: assign a cluster name to each description in the dataframe; create=True lets the
# helper create new clusters if needed. `df` is rebound to the helper's return value, and the
# `new_cluster` column read on the next line comes from that result.
df = await assign_clusters(df, cluster_names, create=True)
cluster_names = df.new_cluster.unique().tolist()
write_cluster_names_to_file('stage3.txt', cluster_names)

# Stage 4: deduplicate again, since stage 3 may have introduced new names.
cluster_names = dedup_cluster_names(cluster_names)
write_cluster_names_to_file('stage4.txt', cluster_names)

# Final pass: assign every description to one of the deduplicated names without creating
# new clusters, then persist the labelled dataframe (without the index) to CSV.
# NOTE(review): `df` is rebound again here, so re-running this cell starts from the
# already-labelled frame rather than the one loaded from disk.
df = await assign_clusters(df, cluster_names, create=False)
df.to_csv('aicompanies_with_clusters.csv', index=False)

OpenAI API key configured


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


🍩 https://wandb.ai/darek-kleczek/llm-clustering/r/call/f9c5bb57-e136-4f77-a63f-694891685630
🍩 https://wandb.ai/darek-kleczek/llm-clustering/r/call/eeba2350-5ef0-496e-8315-7ce3561c6351
🍩 https://wandb.ai/darek-kleczek/llm-clustering/r/call/2f62f8da-9bec-46e5-81b1-df082900764e
🍩 https://wandb.ai/darek-kleczek/llm-clustering/r/call/f59d6e6c-da6f-4d37-81ce-564c46f3fec2
🍩 https://wandb.ai/darek-kleczek/llm-clustering/r/call/5d12832e-9904-486a-bfbb-e3b7a8aa263f


# Visualize the result

In [None]:
# Visualize once more, now passing the `new_cluster` column produced by the LLM workflow.
company_labels = df['company_name'].tolist()
llm_clusters = df['new_cluster'].tolist()
visualize(company_labels, llm_clusters, embeddings)

In [None]:
# Number of distinct source categories vs. distinct LLM-assigned clusters, as a tuple.
(df['category'].nunique(), df['new_cluster'].nunique())

(99, 157)

# See the trends

In [None]:
# Focus on companies founded from 2013 through 2023 (both bounds below are exclusive, so this
# is an 11-year window) and on the 10 most common clusters within that window.
# Rows with a missing founded_year fail both comparisons and are therefore excluded.
in_window = (df.founded_year > 2012) & (df.founded_year < 2024)
focus_df = df[in_window]

# value_counts() sorts by frequency, so the first 10 index entries are the most popular clusters.
focus_cats = focus_df.new_cluster.value_counts().head(10).index
focus_df = focus_df[focus_df.new_cluster.isin(focus_cats)]

In [None]:
import pandas as pd
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.palettes import Category10
from bokeh.transform import dodge

output_notebook()

# Count companies per (founding year, cluster), then reshape to one row per year and
# one column per cluster.
grouped_df = focus_df.groupby(['founded_year', 'new_cluster']).size().reset_index(name='count')
pivot_df = (
    grouped_df
    .pivot(index='founded_year', columns='new_cluster', values='count')
    .fillna(0)    # a (year, cluster) pair with no rows means zero companies, not "unknown"
    .astype(int)
)

# founded_year is stored as float (e.g. 2013.0). Plot it as integers so the points sit on the
# figure's numeric x-axis and read as 2013 rather than the string '2013.0'.
years = pivot_df.index.astype(int).tolist()

p = figure(width=800, height=400, title='Interactive Line Chart of Rows by Cluster across Years',
           x_axis_label='Year', y_axis_label='Count')
p.xaxis.ticker = years  # one tick per founding year instead of automatically chosen ticks

# Category10 is a dict keyed by palette size (3..10), so indexing it with the number of
# clusters raises KeyError when fewer than 3 are present. Take the 10-colour palette and let
# zip() below stop at the last cluster.
colors = Category10[10]

hover = HoverTool(tooltips=[
    ('Year', '@year'),
    ('Count', '@count'),
    ('Cluster', '@cluster')
])
p.add_tools(hover)

# One line plus markers per cluster; each series gets its own data source so the hover
# tooltip can report the cluster name.
for (col, color) in zip(pivot_df.columns, colors):
    source = ColumnDataSource(data={
        'year': years,
        'count': pivot_df[col].tolist(),
        'cluster': [col] * len(pivot_df)
    })
    p.line('year', 'count', source=source, line_width=2, color=color)
    # scatter() draws circle markers by default; circle() with a screen-unit `size` is deprecated.
    p.scatter('year', 'count', source=source, fill_color=color, size=8)

show(p)
