# 텍스트 클러스터링

## 1. 데이터 로드

In [2]:
from datasets import load_dataset

# Download the arXiv NLP dataset from the Hugging Face Hub and keep only
# its "train" split.
arxiv_splits = load_dataset("maartengr/arxiv_nlp")
dataset = arxiv_splits["train"]

README.md:   0%|          | 0.00/617 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


data.csv:   0%|          | 0.00/53.2M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [3]:
# Display the dataset summary (feature names and number of rows).
dataset

Dataset({
    features: ['Titles', 'Abstracts', 'Years', 'Categories'],
    num_rows: 44949
})

In [4]:
# Pull the abstract and title columns out of the dataset.
abstracts, titles = dataset["Abstracts"], dataset["Titles"]

## 2. 임베딩 생성

In [None]:
from sentence_transformers import SentenceTransformer

# Load the gte-small sentence-embedding model.
embedding_model = SentenceTransformer("thenlper/gte-small")

# Encode every abstract into an embedding vector, showing a progress bar.
embeddings = embedding_model.encode(
    abstracts,
    show_progress_bar=True,
)

# Display the shape of the resulting embedding matrix.
embeddings.shape

modules.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/66.7M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/394 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1405 [00:00<?, ?it/s]

## 3. 임베딩 차원 축소

In [None]:
from umap import UMAP

# Reduce the input embeddings from 384 to 5 dimensions; the seed is fixed
# via random_state.
umap_model = UMAP(n_components=5, min_dist=0.0, metric="cosine", random_state=42)
reduced_embeddings = umap_model.fit_transform(embeddings)

# Display the shape of the reduced embeddings.
reduced_embeddings.shape

## 4. 임베딩 클러스터링 
- 밀도기반(density-based) 클러스터링 모델, HDBSCAN 사용

    - 장점 
        1) 클러스터의 개수를 미리 지정할 필요가 없음 (데이터의 밀도에 따라 자동으로 결정)
        2) 모든 데이터 포인트를 클러스터에 할당하지 않음 > 이상치 감지 가능

In [None]:
from hdbscan import HDBSCAN

# Fit the density-based clustering model on the reduced embeddings, then
# read back one cluster label per document.
hdbscan_model = HDBSCAN(min_cluster_size=50)
hdbscan_model.fit(reduced_embeddings)
clusters = hdbscan_model.labels_

# Count the distinct labels.
# NOTE(review): this count also includes the -1 label (outliers), if any
# points were left unassigned.
len(set(clusters))

## 5. 클러스터 확인

In [None]:
import numpy as np

# Print the first `show_doc_cnt` documents assigned to cluster `cluster_idx`.
cluster_idx = 0
show_doc_cnt = 3

# Build a boolean mask with `==`. The previous `np.where(clusters=cluster_idx)`
# passed an unexpected keyword argument and raised TypeError.
# np.where on the mask returns a tuple whose first element holds the
# matching row indices.
matching_rows = np.where(np.asarray(clusters) == cluster_idx)[0]
for index in matching_rows[:show_doc_cnt]:
    # Convert the NumPy integer to a plain int for indexing, and show only
    # the first 300 characters of each abstract.
    print(abstracts[int(index)][:300] + "... \n")

## 6. 클러스터 시각화

- 시각화를 위해 2차원으로 차원축소 필요

In [None]:
import pandas as pd

# Reduce the original embeddings (384 dimensions) to 2 dimensions for plotting.
# The seed keyword must be spelled `random_state`; the previous misspelling
# (`ransom_state`) is not a UMAP parameter and raised TypeError.
reduced_embeddings = UMAP(
    n_components=2,
    min_dist=0.0,
    metric="cosine",
    random_state=42,
).fit_transform(embeddings)

# Build a DataFrame with the 2-D coordinates, the title, and the cluster
# label (stored as a string) of every document.
df = pd.DataFrame(reduced_embeddings, columns=["x", "y"])
df["title"] = titles
df["cluster"] = [str(c) for c in clusters]

# Separate clustered points from outliers (label "-1").
clusters_df = df.loc[df.cluster != "-1", :]
outliers_df = df.loc[df.cluster == "-1", :]



In [None]:
import matplotlib.pyplot as plt

# Draw the outliers first as faint grey dots.
plt.scatter(outliers_df.x, outliers_df.y, s=2, c="grey", alpha=0.05)

# Draw the clustered points, colored by their integer cluster label.
plt.scatter(
    clusters_df.x,
    clusters_df.y,
    s=2,
    c=clusters_df.cluster.astype(int),
    cmap="tab20b",
    alpha=0.6,
)

# Hide the axes.
plt.axis("off")