In [None]:
# Colab-only setup: switch on Colab's custom widget manager before any other
# cell runs.
# NOTE(review): presumably required so the interactive figures produced by
# `bunka.visualize_topics` further down can render in Colab — confirm.
from google.colab import output
output.enable_custom_widget_manager()

In [None]:
!pip -q install bunkatopics -upgrade

In [None]:
from bunkatopics import Bunka

In [None]:
from datasets import load_dataset
import random

# Load the Medium technology sample from the Hugging Face Hub and keep only
# the article titles, which are the documents fed to Bunka below.
# - `split="train"` returns the train Dataset directly instead of a
#   DatasetDict that then has to be indexed.
# - `list(...)` materialises the column as a plain Python list of strings;
#   recent `datasets` releases return a lazy Column object for `ds["title"]`,
#   and downstream code expects an ordinary list.
docs = list(
    load_dataset("bunkalab/medium-sample-technology", split="train")["title"]
)

In [None]:
import umap
from sentence_transformers import SentenceTransformer

# Embedding model: turns each document into a dense sentence vector.
embedding_model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")

# Projection model: UMAP reduces the embeddings to 2 components for the map;
# the seed is pinned so repeated runs use the same random state.
projection_model = umap.UMAP(n_components=2, random_state=42)

In [None]:
# Build the Bunka pipeline from the embedding model and the projection model
# configured in the previous cell.
bunka = Bunka(
    embedding_model=embedding_model,
    projection_model=projection_model,
)

# Fit Bunka on the documents; per the logs this embeds them, reduces the
# embedding dimensions and extracts terms.
bunka.fit(docs)

[32m2024-04-11 09:05:44 - [94mBunka[0m - INFO - [1mProcessing 33998 tokens[0m
INFO:Bunka:Processing 33998 tokens
[32m2024-04-11 09:05:46 - [94mBunka[0m - INFO - [1mDetected language: English[0m
INFO:Bunka:Detected language: English
[32m2024-04-11 09:05:46 - [94mBunka[0m - INFO - [1mEmbedding documents... (can take varying amounts of time depending on their size)[0m
INFO:Bunka:Embedding documents... (can take varying amounts of time depending on their size)


Batches:   0%|          | 0/94 [00:00<?, ?it/s]

[32m2024-04-11 09:05:56 - [94mBunka[0m - INFO - [1mReducing the dimensions of embeddings...[0m
INFO:Bunka:Reducing the dimensions of embeddings...
[32m2024-04-11 09:06:09 - [94mBunka[0m - INFO - [1mExtracting meaningful terms from documents...[0m
INFO:Bunka:Extracting meaningful terms from documents...
[32m2024-04-11 09:06:09 - [94mBunka[0m - INFO - [1mSampling 2000 documents for term extraction[0m
INFO:Bunka:Sampling 2000 documents for term extraction
100%|██████████| 2000/2000 [00:14<00:00, 138.93it/s]


In [None]:
# Compute the topics on the fitted documents and show the summary table.
df_topics = bunka.get_topics(
    n_clusters=15,
    name_length=5,  # number of terms used to describe each topic
    min_count_terms=2,
)
df_topics

[32m2024-04-11 09:06:25 - [94mBunka[0m - INFO - [1mComputing the topics[0m
INFO:Bunka:Computing the topics


Unnamed: 0,topic_id,topic_name,size,percent
0,bt-1,Women | product | leaders | Team | stream,293,9.77
1,bt-11,Sales | hackathon | Product | Marketing | thanks,272,9.07
2,bt-14,Kubernetes | Internet | API | Tips | instagram,263,8.77
3,bt-3,routine | discovery | Design | moments | ILLUM...,262,8.73
4,bt-6,gadgets | Device | phone | IoT | Delivers,250,8.33
5,bt-13,technology | Predictions | Trends | today | Op...,241,8.03
6,bt-7,Course | visualization | Coders | Projects | g...,218,7.27
7,bt-10,Robots | education | knowledge | Robotics | Words,203,6.77
8,bt-2,discount | Cryptocurrency | NordVPN | mining |...,197,6.57
9,bt-0,battery | Ago | satellite | Car | storage,192,6.4


In [None]:
# Check the top documents for every topic: one row per document, carrying its
# rank within the topic (`ranking_per_topic`) plus the topic id and name.
bunka.df_top_docs_per_topic_

Unnamed: 0,doc_id,content,ranking_per_topic,topic_id,topic_name
0,6346e8bc-1a78-47ac-b,The First Electric Car Was Invented Almost 200...,1,bt-0,battery | Ago | satellite | Car | storage
1,b4e1ccb0-0001-4b08-9,The First Electric Car Was Invented Almost 200...,2,bt-0,battery | Ago | satellite | Car | storage
2,0b8f10d3-f74d-44bd-a,How to organize a RoadShow to present Tutellus...,3,bt-0,battery | Ago | satellite | Car | storage
3,46aaaf47-2a2a-436d-a,Efficient energy storage and future PVS by Sol...,4,bt-0,battery | Ago | satellite | Car | storage
4,87666ac8-3b7d-4b23-8,This extremely power thick battery might almos...,5,bt-0,battery | Ago | satellite | Car | storage
...,...,...,...,...,...
282,694df34e-128f-4fbc-8,[History] — The Curse of Oak Island Season 8 E...,12,bt-9,Episode | printing | HD | Printer | types
283,785ffd44-1e94-435a-b,Episode 1: STEM Role Models,13,bt-9,Episode | printing | HD | Printer | types
284,8982e193-44c1-4bca-b,New England’s Series A Deals — Part III,14,bt-9,Episode | printing | HD | Printer | types
285,96b58a89-261c-4388-8,"S1,E2 || The Stand (Series 1, Episode 2)",15,bt-9,Episode | printing | HD | Printer | types


In [None]:
# Draw the Bunka map of the topics (blue density shading, convex hulls around
# each topic, text shown).
bunka.visualize_topics(
    width=800,
    height=800,
    colorscale='Blues',
    density=True,
    label_size_ratio=60,
    convex_hull=True,
    show_text=True,
)

[32m2024-04-11 09:06:25 - [94mBunka[0m - INFO - [1mCreating the Bunka Map[0m
INFO:Bunka:Creating the Bunka Map


In [None]:
# NOTE(review): `os` is not used anywhere in this cell — confirm whether the
# import is still needed.
import os
from google.colab import userdata
# NOTE(review): `langchain.llms` is a legacy import path in recent LangChain
# releases — confirm it still resolves with the installed version.
from langchain.llms import OpenAI

# Defined in the secrets tab in Google Colab
# (the secret must be stored under the name 'OPEN_AI'; reading it from there
# keeps the API key out of the notebook source).
OPEN_AI_KEY = userdata.get('OPEN_AI')
llm = OpenAI(openai_api_key = OPEN_AI_KEY)

# Use the LLM to produce cleaner topic names; the resulting table is kept in
# `df_topics_clean` and displayed in the next cell.
df_topics_clean = bunka.get_clean_topic_name(llm=llm)

[32m2024-04-11 09:06:28 - [94mBunka[0m - INFO - [1mUsing LLM to make topic names cleaner[0m
INFO:Bunka:Using LLM to make topic names cleaner
Creating new labels for clusters:  47%|████▋     | 7/15 [00:02<00:03,  2.41it/s]

In [None]:
# Display the topic table returned by `bunka.get_clean_topic_name` above.
df_topics_clean

In [None]:
# Redraw the Bunka map after the LLM renaming step, this time with the
# 'Portland' colour scale.
bunka.visualize_topics(
    width=800,
    height=800,
    colorscale='Portland',
    density=True,
    label_size_ratio=60,
    convex_hull=True,
)