In [None]:
!pip install datasets
!pip install sentence_transformers
!pip install umap-learn
!pip install hdbscan
!pip install bertopic

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [None]:
from datasets import load_dataset

# Number of papers used for topic modelling; kept small so the embedding step
# and the hyperparameter experiments below stay fast.
N_DOCS = 1000

dataset = load_dataset("CShorten/ML-ArXiv-Papers")["train"]

# Slice the rows first and pick the columns afterwards: this reads only the
# first N_DOCS rows instead of materialising each full column of the train
# split just to keep a prefix of it.
subset = dataset[:N_DOCS]

# Abstracts are the documents to train on; titles are the matching labels
abstracts = subset["abstract"]
titles = subset["title"]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/986 [00:00<?, ?B/s]

ML-Arxiv-Papers.csv:   0%|          | 0.00/147M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/117592 [00:00<?, ? examples/s]

In [None]:
from sentence_transformers import SentenceTransformer

# Encode every abstract once up front; the cells below pass these vectors to
# BERTopic so the corpus is not re-embedded for each experiment.
embedding_model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")
embeddings = embedding_model.encode(
    sentences=abstracts,
    show_progress_bar=True,
)

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

In [None]:
# Sanity check on the embedding matrix: expect one row per encoded abstract.
embeddings.shape

(1000, 384)

In [None]:
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, PartOfSpeech

# One instance per topic-representation strategy:
#   - KeyBERT-inspired keywords
#   - part-of-speech filtering backed by the "en_core_web_sm" spaCy pipeline
#   - maximal marginal relevance with diversity 0.3
keybert_model = KeyBERTInspired()
pos_model = PartOfSpeech("en_core_web_sm")
mmr_model = MaximalMarginalRelevance(diversity=0.3)

# Bundle them under descriptive keys; this mapping is what the cells below
# hand to BERTopic as `representation_model`.
representation_model = dict(
    KeyBERT=keybert_model,
    MMR=mmr_model,
    POS=pos_model,
)

In [None]:
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic

# Base case
# n_neighbors=15
# n_components=5
# min_cluster_size=150
# top_n_words=5
# min_dist=0.0

umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = HDBSCAN(min_cluster_size=150, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
vectorizer_model = CountVectorizer(stop_words="english", min_df=1, ngram_range=(1, 2))

topic_model = BERTopic(

  # Pipeline models
  embedding_model=embedding_model,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
  vectorizer_model=vectorizer_model,
  representation_model=representation_model,

  # Hyperparameters
  top_n_words=5,
  verbose=True
)

# Train model, passing the pre-computed embeddings alongside the abstracts
topics, probs = topic_model.fit_transform(abstracts, embeddings)

# Show topics. Only the last expression of a cell is rendered automatically,
# so the table needs an explicit display() (IPython builtin) to appear.
display(topic_model.get_topic_info())

# Reduce dimensionality of embeddings for plotting; this step is optional but much faster to perform iteratively.
# Seeded like the modelling UMAP above so the figure is reproducible across runs.
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine', random_state=42).fit_transform(embeddings)

# Visualize the documents in 2-dimensional space and show the titles on hover instead of the abstracts
topic_model.visualize_documents(titles, reduced_embeddings=reduced_embeddings, custom_labels=True)

2024-11-28 15:38:08,630 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-11-28 15:38:18,055 - BERTopic - Dimensionality - Completed ✓
2024-11-28 15:38:18,057 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-11-28 15:38:18,127 - BERTopic - Cluster - Completed ✓
2024-11-28 15:38:18,136 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-11-28 15:38:19,798 - BERTopic - Representation - Completed ✓


In [None]:
# Case 1: fine-grained clustering
# n_neighbors=5
# n_components=5
# min_cluster_size=15
# top_n_words=5
# min_dist=0.0

umap_model = UMAP(n_neighbors=5, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
vectorizer_model = CountVectorizer(stop_words="english", min_df=1, ngram_range=(1, 2))

topic_model = BERTopic(

  # Pipeline models
  embedding_model=embedding_model,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
  vectorizer_model=vectorizer_model,
  representation_model=representation_model,

  # Hyperparameters
  top_n_words=5,
  verbose=True
)

# Train model, passing the pre-computed embeddings alongside the abstracts
topics, probs = topic_model.fit_transform(abstracts, embeddings)

# Show topics. Only the last expression of a cell is rendered automatically,
# so the table needs an explicit display() (IPython builtin) to appear.
display(topic_model.get_topic_info())

# Reduce dimensionality of embeddings for plotting; this step is optional but much faster to perform iteratively.
# Seeded like the modelling UMAP above so the figure is reproducible across runs.
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine', random_state=42).fit_transform(embeddings)

# Visualize the documents in 2-dimensional space and show the titles on hover instead of the abstracts
topic_model.visualize_documents(titles, reduced_embeddings=reduced_embeddings, custom_labels=True)

2024-11-28 15:38:25,858 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-11-28 15:38:27,834 - BERTopic - Dimensionality - Completed ✓
2024-11-28 15:38:27,836 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-11-28 15:38:27,871 - BERTopic - Cluster - Completed ✓
2024-11-28 15:38:27,877 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-11-28 15:38:35,091 - BERTopic - Representation - Completed ✓


In [None]:
# Case 2: detect only somewhat larger clusters + richer topic descriptions
# n_neighbors=5
# min_cluster_size=30
# top_n_words=15
# min_dist=0.0

umap_model = UMAP(n_neighbors=5, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = HDBSCAN(min_cluster_size=30, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
vectorizer_model = CountVectorizer(stop_words="english", min_df=1, ngram_range=(1, 2))

topic_model = BERTopic(

  # Pipeline models
  embedding_model=embedding_model,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
  vectorizer_model=vectorizer_model,
  representation_model=representation_model,

  # Hyperparameters
  top_n_words=15,
  verbose=True
)

# Train model, passing the pre-computed embeddings alongside the abstracts
topics, probs = topic_model.fit_transform(abstracts, embeddings)

# Show topics. Only the last expression of a cell is rendered automatically,
# so the table needs an explicit display() (IPython builtin) to appear.
display(topic_model.get_topic_info())

# Reduce dimensionality of embeddings for plotting; this step is optional but much faster to perform iteratively.
# Seeded like the modelling UMAP above so the figure is reproducible across runs.
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine', random_state=42).fit_transform(embeddings)

# Visualize the documents in 2-dimensional space and show the titles on hover instead of the abstracts
topic_model.visualize_documents(titles, reduced_embeddings=reduced_embeddings, custom_labels=True)

2024-11-28 15:38:37,305 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-11-28 15:38:39,279 - BERTopic - Dimensionality - Completed ✓
2024-11-28 15:38:39,280 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-11-28 15:38:39,320 - BERTopic - Cluster - Completed ✓
2024-11-28 15:38:39,325 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-11-28 15:38:41,847 - BERTopic - Representation - Completed ✓


In [None]:
# Case 3: richer topic descriptions
# n_neighbors=5
# min_cluster_size=30
# top_n_words=20
# min_dist=0.0

umap_model = UMAP(n_neighbors=5, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = HDBSCAN(min_cluster_size=30, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
vectorizer_model = CountVectorizer(stop_words="english", min_df=1, ngram_range=(1, 2))

topic_model = BERTopic(

  # Pipeline models
  embedding_model=embedding_model,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
  vectorizer_model=vectorizer_model,
  representation_model=representation_model,

  # Hyperparameters
  top_n_words=20,
  verbose=True
)

# Train model, passing the pre-computed embeddings alongside the abstracts
topics, probs = topic_model.fit_transform(abstracts, embeddings)

# Show topics. Only the last expression of a cell is rendered automatically,
# so the table needs an explicit display() (IPython builtin) to appear.
display(topic_model.get_topic_info())

# Reduce dimensionality of embeddings for plotting; this step is optional but much faster to perform iteratively.
# Seeded like the modelling UMAP above so the figure is reproducible across runs.
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine', random_state=42).fit_transform(embeddings)

# Visualize the documents in 2-dimensional space and show the titles on hover instead of the abstracts
topic_model.visualize_documents(titles, reduced_embeddings=reduced_embeddings, custom_labels=True)

2024-11-28 15:38:43,852 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-11-28 15:38:45,986 - BERTopic - Dimensionality - Completed ✓
2024-11-28 15:38:45,988 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-11-28 15:38:46,045 - BERTopic - Cluster - Completed ✓
2024-11-28 15:38:46,050 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-11-28 15:38:49,470 - BERTopic - Representation - Completed ✓


In [None]:
# Case 4: reflect more of the global structure
# n_neighbors=10
# min_cluster_size=30
# top_n_words=20
# min_dist=0.0

umap_model = UMAP(n_neighbors=10, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = HDBSCAN(min_cluster_size=30, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
vectorizer_model = CountVectorizer(stop_words="english", min_df=1, ngram_range=(1, 2))

topic_model = BERTopic(

  # Pipeline models
  embedding_model=embedding_model,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
  vectorizer_model=vectorizer_model,
  representation_model=representation_model,

  # Hyperparameters
  top_n_words=20,
  verbose=True
)

# Train model, passing the pre-computed embeddings alongside the abstracts
topics, probs = topic_model.fit_transform(abstracts, embeddings)

# Show topics. Only the last expression of a cell is rendered automatically,
# so the table needs an explicit display() (IPython builtin) to appear.
display(topic_model.get_topic_info())

# Reduce dimensionality of embeddings for plotting; this step is optional but much faster to perform iteratively.
# Seeded like the modelling UMAP above so the figure is reproducible across runs.
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine', random_state=42).fit_transform(embeddings)

# Visualize the documents in 2-dimensional space and show the titles on hover instead of the abstracts
topic_model.visualize_documents(titles, reduced_embeddings=reduced_embeddings, custom_labels=True)

2024-11-28 15:38:51,444 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-11-28 15:38:53,945 - BERTopic - Dimensionality - Completed ✓
2024-11-28 15:38:53,947 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-11-28 15:38:53,995 - BERTopic - Cluster - Completed ✓
2024-11-28 15:38:54,000 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-11-28 15:38:56,506 - BERTopic - Representation - Completed ✓


In [None]:
from itertools import product
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.metrics import silhouette_score
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np


def _silhouette_without_noise(vectors, labels):
    """Mean silhouette score over the documents that were assigned to a cluster.

    HDBSCAN marks outliers with the label -1. Those points do not form a
    cluster, so they are dropped before scoring instead of being treated as
    one more group.

    NOTE(review): because outliers are ignored, a configuration that discards
    many documents as noise can still score well; check the topic sizes before
    trusting the ranking on its own.

    Args:
        vectors: array-like with one row per document.
        labels: cluster label per document, aligned with `vectors`.

    Returns:
        The silhouette score of the clustered documents, or -1 when fewer than
        two real clusters remain.
    """
    labels = np.asarray(labels)
    clustered = labels != -1
    if np.unique(labels[clustered]).size < 2:
        return -1
    return silhouette_score(np.asarray(vectors)[clustered], labels[clustered])


param_grid = {
    "n_neighbors": [5, 10, 15, 20, 25, 30, 50],
    "min_cluster_size": [5, 10, 20, 50, 100, 200],
    "top_n_words": [5, 10, 15, 20, 25, 30],
    "min_dist": [0.0, 0.1, 0.2, 0.3, 0.4, 0.5],
}

# All parameter combinations
param_combinations = list(product(*param_grid.values()))
vectorizer_model = CountVectorizer(stop_words="english", min_df=1, ngram_range=(1, 2))

results = []

# The score depends only on the HDBSCAN labels, which are determined by the
# UMAP/HDBSCAN settings; top_n_words does not enter the clustering. Each
# clustering configuration is therefore fitted once and its score reused for
# the remaining top_n_words values instead of refitting the same model.
# As a consequence, after the loop `topic_model`/`topics`/`probs` hold the last
# model that was actually fitted, not one per listed combination.
score_cache = {}

# Iteration
for combination in param_combinations:
    params = dict(zip(param_grid.keys(), combination))
    clustering_key = (params["n_neighbors"], params["min_cluster_size"], params["min_dist"])

    try:
        if clustering_key not in score_cache:
            umap_model = UMAP(
                n_neighbors=params["n_neighbors"],
                n_components=5,
                min_dist=params["min_dist"],
                metric="cosine",
                random_state=42
            )
            hdbscan_model = HDBSCAN(
                min_cluster_size=params["min_cluster_size"],
                metric="euclidean",
                cluster_selection_method="eom",
                prediction_data=True
            )

            topic_model = BERTopic(
                embedding_model=embedding_model,
                umap_model=umap_model,
                hdbscan_model=hdbscan_model,
                vectorizer_model=vectorizer_model,
                representation_model=representation_model,
                top_n_words=params["top_n_words"],
                verbose=False
            )

            topics, probs = topic_model.fit_transform(abstracts, embeddings)

            # Silhouette score of the clustered (non-outlier) documents.
            # A failed fit is not cached, so it is retried on the next combination.
            score_cache[clustering_key] = _silhouette_without_noise(embeddings, hdbscan_model.labels_)

        silhouette_avg = score_cache[clustering_key]

        # Save the result
        results.append((params, silhouette_avg))
        print(f"Params: {params}, Silhouette Score: {silhouette_avg}")

    except Exception as e:  # Best effort: report the failing combination and keep searching
        print(f"Params: {params} caused an error: {e}")
        continue

# Best parameters (None when every combination failed, instead of max() raising on an empty list)
best_params = max(results, key=lambda x: x[1], default=None)
if best_params is None:
    print("\nNo parameter combination completed successfully.")
else:
    print("\nBest Hyperparameters:")
    print(f"Params: {best_params[0]}, Silhouette Score: {best_params[1]}")

# Review all results, best score first
sorted_results = sorted(results, key=lambda x: x[1], reverse=True)
print("\nAll Results:")
for res in sorted_results:
    print(f"Params: {res[0]}, Silhouette Score: {res[1]}")

Params: {'n_neighbors': 5, 'min_cluster_size': 5, 'top_n_words': 5, 'min_dist': 0.0}, Silhouette Score: -0.0012753187911584973
Params: {'n_neighbors': 5, 'min_cluster_size': 5, 'top_n_words': 5, 'min_dist': 0.1}, Silhouette Score: -0.005854322575032711
Params: {'n_neighbors': 5, 'min_cluster_size': 5, 'top_n_words': 5, 'min_dist': 0.2}, Silhouette Score: 0.015978312119841576
Params: {'n_neighbors': 5, 'min_cluster_size': 5, 'top_n_words': 5, 'min_dist': 0.3}, Silhouette Score: 0.01635102741420269
Params: {'n_neighbors': 5, 'min_cluster_size': 5, 'top_n_words': 5, 'min_dist': 0.4}, Silhouette Score: 0.01635102741420269
Params: {'n_neighbors': 5, 'min_cluster_size': 5, 'top_n_words': 5, 'min_dist': 0.5}, Silhouette Score: 0.01585463434457779
Params: {'n_neighbors': 5, 'min_cluster_size': 5, 'top_n_words': 10, 'min_dist': 0.0}, Silhouette Score: -0.0012753187911584973
Params: {'n_neighbors': 5, 'min_cluster_size': 5, 'top_n_words': 10, 'min_dist': 0.1}, Silhouette Score: -0.00585432257503

In [None]:
# Case 5: configuration recorded as having the highest silhouette score
# n_neighbors=15
# min_cluster_size=50
# top_n_words=5
# min_dist=0.0

from itertools import product
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
from sklearn.metrics import silhouette_score

umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = HDBSCAN(min_cluster_size=50, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
vectorizer_model = CountVectorizer(stop_words="english", min_df=1, ngram_range=(1, 2))

topic_model = BERTopic(

  # Pipeline models
  embedding_model=embedding_model,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
  vectorizer_model=vectorizer_model,
  representation_model=representation_model,

  # Hyperparameters
  top_n_words=5,
  verbose=True
)

# Train model, passing the pre-computed embeddings alongside the abstracts
topics, probs = topic_model.fit_transform(abstracts, embeddings)

# Show topics. Only the last expression of a cell is rendered automatically,
# so the table needs an explicit display() (IPython builtin) to appear.
display(topic_model.get_topic_info())

# Reduce dimensionality of embeddings for plotting; this step is optional but much faster to perform iteratively.
# Seeded like the modelling UMAP above so the figure is reproducible across runs.
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine', random_state=42).fit_transform(embeddings)

# Visualize the documents in 2-dimensional space and show the titles on hover instead of the abstracts
topic_model.visualize_documents(titles, reduced_embeddings=reduced_embeddings, custom_labels=True)

2024-11-28 18:40:53,865 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-11-28 18:40:57,800 - BERTopic - Dimensionality - Completed ✓
2024-11-28 18:40:57,802 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-11-28 18:40:57,847 - BERTopic - Cluster - Completed ✓
2024-11-28 18:40:57,851 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-11-28 18:40:59,768 - BERTopic - Representation - Completed ✓


In [None]:
# Case 6: (starting from the best-silhouette configuration) more fine-grained topic detection
# n_neighbors=15
# min_cluster_size=30
# top_n_words=10
# min_dist=0.0

from itertools import product
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
from sklearn.metrics import silhouette_score

umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = HDBSCAN(min_cluster_size=30, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
vectorizer_model = CountVectorizer(stop_words="english", min_df=1, ngram_range=(1, 2))

topic_model = BERTopic(

  # Pipeline models
  embedding_model=embedding_model,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
  vectorizer_model=vectorizer_model,
  representation_model=representation_model,

  # Hyperparameters
  top_n_words=10,
  verbose=True
)

# Train model, passing the pre-computed embeddings alongside the abstracts
topics, probs = topic_model.fit_transform(abstracts, embeddings)

# Show topics. Only the last expression of a cell is rendered automatically,
# so the table needs an explicit display() (IPython builtin) to appear.
display(topic_model.get_topic_info())

# Reduce dimensionality of embeddings for plotting; this step is optional but much faster to perform iteratively.
# Seeded like the modelling UMAP above so the figure is reproducible across runs.
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine', random_state=42).fit_transform(embeddings)

# Visualize the documents in 2-dimensional space and show the titles on hover instead of the abstracts
topic_model.visualize_documents(titles, reduced_embeddings=reduced_embeddings, custom_labels=True)

2024-11-28 18:41:01,823 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-11-28 18:41:04,592 - BERTopic - Dimensionality - Completed ✓
2024-11-28 18:41:04,593 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-11-28 18:41:04,630 - BERTopic - Cluster - Completed ✓
2024-11-28 18:41:04,635 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-11-28 18:41:07,623 - BERTopic - Representation - Completed ✓


In [None]:
# Case 7: (starting from the best-silhouette configuration) richer topic descriptions
# n_neighbors=10
# min_cluster_size=30
# top_n_words=10
# min_dist=0.0

from itertools import product
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
from sklearn.metrics import silhouette_score

umap_model = UMAP(n_neighbors=10, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = HDBSCAN(min_cluster_size=30, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
vectorizer_model = CountVectorizer(stop_words="english", min_df=1, ngram_range=(1, 2))

topic_model = BERTopic(

  # Pipeline models
  embedding_model=embedding_model,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
  vectorizer_model=vectorizer_model,
  representation_model=representation_model,

  # Hyperparameters
  top_n_words=10,
  verbose=True
)

# Train model, passing the pre-computed embeddings alongside the abstracts
topics, probs = topic_model.fit_transform(abstracts, embeddings)

# Show topics. Only the last expression of a cell is rendered automatically,
# so the table needs an explicit display() (IPython builtin) to appear.
display(topic_model.get_topic_info())

# Reduce dimensionality of embeddings for plotting; this step is optional but much faster to perform iteratively.
# Seeded like the modelling UMAP above so the figure is reproducible across runs.
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine', random_state=42).fit_transform(embeddings)

# Visualize the documents in 2-dimensional space and show the titles on hover instead of the abstracts
topic_model.visualize_documents(titles, reduced_embeddings=reduced_embeddings, custom_labels=True)

2024-11-28 18:41:10,435 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-11-28 18:41:12,931 - BERTopic - Dimensionality - Completed ✓
2024-11-28 18:41:12,932 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-11-28 18:41:12,971 - BERTopic - Cluster - Completed ✓
2024-11-28 18:41:12,976 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-11-28 18:41:15,519 - BERTopic - Representation - Completed ✓
