DBSCAN 사용 <br/>
GridSearch를 사용한 파라미터 최적화 <br/>
빈도수를 이용한 불용어 250개를 추출하여 사용 <br/>
문서의 최빈 어휘 100개를 추출하여 벡터화한 것을 클러스터링 <br/>
ver1과 비교하여 클러스터의 갯수와 노이즈 포인트의 갯수 모두 증가, 실루엣 스코어 미세하게 하락 <br/><br/>

Best DBSCAN params: eps=0.1, min_samples=2 with Silhouette Score=-0.19123197729214664<br/>
Number of clusters: 32<br/>
Number of noise points: 1242

In [1]:
import os
import glob
import string
from tqdm import tqdm
from collections import Counter
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [2]:
# nltk 데이터 다운로드 (최초 실행 시 필요)
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Google Drive 마운트
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# NLTK 초기화
lemmatizer = WordNetLemmatizer()

In [5]:
# txt 파일 경로 설정 (Google Drive의 경로로 변경)
path = '/content/drive/MyDrive/bitcoin_nlp'
output_path = '/content/drive/MyDrive/bitcoin_nlp/preprocessed'

In [6]:
# 불용어 리스트 생성 (빈도 기반)
def create_stopwords(documents, top_n=250):
    all_words = []
    for doc in documents:
        all_words.extend(doc[1].split())
    word_counts = Counter(all_words)
    stopwords = [word for word, count in word_counts.most_common(top_n)]
    return stopwords

In [7]:
# 구두점 제거 함수
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

In [8]:
# 불용어 제거 함수
def remove_stopwords(text, stopwords):
    words = text.split()
    words = [word for word in words if word not in stopwords]
    return ' '.join(words)

In [9]:
# 표제어 추출 함수
def lemmatize_words(text):
    words = text.split()
    lemmatized_words = [lemmatizer.lemmatize(word, pos='v') for word in words]
    return ' '.join(lemmatized_words)

In [10]:
# 각 문서에서 빈도가 가장 높은 단어 100개 추출
def get_top_n_words(documents, n=100):
    top_words = []
    for filename, content in documents:
        words = content.split()
        word_counts = Counter(words)
        top_n = [word for word, count in word_counts.most_common(n)]
        top_words.append((filename, top_n))
    return top_words

In [11]:
# 최빈 단어 기반 문서 벡터 생성
def create_document_vectors(top_words, all_words_set):
    vectors = []
    for filename, words in top_words:
        vector = np.zeros(len(all_words_set))
        for word in words:
            if word in all_words_set:
                vector[list(all_words_set).index(word)] = 1
        vectors.append((filename, vector))
    return vectors

In [12]:
# 텍스트 파일 읽기 및 전처리 (구두점 제거, 불용어 제거, 표제어 추출)
def read_and_preprocess_files(path, stopwords):
    files = glob.glob(os.path.join(path, '*.txt'))
    documents = []
    for file in tqdm(files, desc="Reading and preprocessing files"):
        with open(file, 'r', encoding='utf-8') as f:
            content = f.read()
            content = remove_punctuation(content)  # 구두점 제거
            content = remove_stopwords(content, stopwords)  # 불용어 제거
            content = lemmatize_words(content)  # 표제어 추출
            documents.append((os.path.basename(file), content))
    return documents

In [13]:
# 전처리된 파일 저장
def save_preprocessed_files(documents, output_path):
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    for filename, content in documents:
        # 기존 파일 이름 뒤에 '_preprocessed' 추가
        preprocessed_filename = os.path.splitext(filename)[0] + '_preprocessed.txt'
        with open(os.path.join(output_path, preprocessed_filename), 'w', encoding='utf-8') as f:
            f.write(content)

In [14]:
# 문서 읽기
documents = read_and_preprocess_files(path, [])

Reading and preprocessing files: 100%|██████████| 1373/1373 [00:43<00:00, 31.46it/s]


In [15]:
# 불용어 리스트 생성
stopwords = create_stopwords(documents)
stopwords

['the',
 'be',
 'of',
 'and',
 'to',
 'a',
 'in',
 'for',
 '\u200b',
 'that',
 'will',
 'The',
 'on',
 'as',
 'or',
 'with',
 'by',
 'have',
 'can',
 'use',
 'an',
 'from',
 'this',
 'not',
 '\u200b\u200b',
 'it',
 'their',
 'any',
 'blockchain',
 'which',
 'data',
 'network',
 'at',
 'all',
 'market',
 'other',
 'This',
 'service',
 'contract',
 'tokens',
 '•',
 'In',
 'provide',
 'users',
 'we',
 'platform',
 '1',
 'more',
 'token',
 'such',
 'its',
 'time',
 'block',
 'make',
 'system',
 'transaction',
 'A',
 'also',
 'value',
 'new',
 'do',
 'exchange',
 '2',
 'may',
 'our',
 'they',
 'create',
 'transactions',
 'Token',
 'include',
 'information',
 'only',
 'one',
 'process',
 'need',
 'allow',
 'user',
 'into',
 'smart',
 'base',
 'each',
 'through',
 'trade',
 'project',
 'but',
 'fee',
 'technology',
 'We',
 '3',
 'you',
 'reward',
 'price',
 'number',
 'these',
 'chain',
 'no',
 '●',
 'development',
 'It',
 'nod',
 'if',
 'account',
 'order',
 'Ethereum',
 'than',
 'require',


In [16]:
# 문서 읽기
documents = read_and_preprocess_files(path, stopwords)

Reading and preprocessing files: 100%|██████████| 1373/1373 [00:38<00:00, 35.29it/s]


In [17]:
# 전처리된 파일 저장
save_preprocessed_files(documents, output_path)

In [18]:
# 각 문서에서 빈도가 가장 높은 단어 100개 추출
top_words = get_top_n_words(documents)

In [19]:
# 모든 문서에서 등장하는 모든 단어의 집합 생성
all_words_set = set()
for filename, words in top_words:
    all_words_set.update(words)

In [20]:
# 최빈 단어 기반 문서 벡터 생성
document_vectors = create_document_vectors(top_words, all_words_set)

In [21]:
# 벡터를 numpy 배열로 변환 및 스케일링
filenames, vectors = zip(*document_vectors)
X = np.array(vectors)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [22]:
# Grid Search를 사용한 DBSCAN 파라미터 최적화
eps_values = np.arange(0.1, 2.1, 0.1)
min_samples_values = list(range(2, 31, 2))
best_dbscan_score = -1
best_dbscan_params = None

In [23]:
for eps in eps_values:
    for min_samples in min_samples_values:
        print(f"Running DBSCAN with eps={eps}, min_samples={min_samples}")
        dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric='euclidean')
        clusters = dbscan.fit_predict(X_scaled)

        # 클러스터링 평가 (Silhouette Score 사용)
        if len(set(clusters)) > 1:
            score = silhouette_score(X_scaled, clusters)
            print(f"Silhouette Score: {score}")
            if score > best_dbscan_score:
                best_dbscan_score = score
                best_dbscan_params = (eps, min_samples)

Running DBSCAN with eps=0.1, min_samples=2
Silhouette Score: -0.19123197729214664
Running DBSCAN with eps=0.1, min_samples=4
Silhouette Score: -0.24913481488600214
Running DBSCAN with eps=0.1, min_samples=6
Silhouette Score: -0.24913481488600214
Running DBSCAN with eps=0.1, min_samples=8
Silhouette Score: -0.2621043250725149
Running DBSCAN with eps=0.1, min_samples=10
Silhouette Score: -0.2621043250725149
Running DBSCAN with eps=0.1, min_samples=12
Silhouette Score: -0.2621043250725149
Running DBSCAN with eps=0.1, min_samples=14
Silhouette Score: -0.2621043250725149
Running DBSCAN with eps=0.1, min_samples=16
Silhouette Score: -0.2621043250725149
Running DBSCAN with eps=0.1, min_samples=18
Silhouette Score: -0.2621043250725149
Running DBSCAN with eps=0.1, min_samples=20
Silhouette Score: -0.2621043250725149
Running DBSCAN with eps=0.1, min_samples=22
Silhouette Score: -0.2621043250725149
Running DBSCAN with eps=0.1, min_samples=24
Silhouette Score: -0.2621043250725149
Running DBSCAN wi

In [25]:
# 최적의 파라미터로 DBSCAN 클러스터링 수행
if best_dbscan_params:
    eps, min_samples = best_dbscan_params
    print(f"Best DBSCAN params: eps={eps}, min_samples={min_samples} with Silhouette Score={best_dbscan_score}")
    dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric='euclidean')
    clusters = dbscan.fit_predict(X_scaled)

    # 각 클러스터에 속한 파일들 출력 및 CSV 저장
    clustered_files = {}
    for file, cluster in zip(filenames, clusters):
        if cluster not in clustered_files:
            clustered_files[cluster] = []
        clustered_files[cluster].append(file)

    # 클러스터 결과 출력
    n_clusters = len(set(clusters)) - (1 if -1 in clusters else 0)
    n_noise = list(clusters).count(-1)
    print(f"Number of clusters: {n_clusters}")
    print(f"Number of noise points: {n_noise}")

    # 각 클러스터의 파일 리스트 출력
    for cluster, file_list in clustered_files.items():
        print(f"Cluster {cluster}:")
        for file in file_list:
            print(f"  {file}")

    # 결과를 CSV 파일로 저장
    results = {'File': filenames, 'Cluster': clusters}
    results_df = pd.DataFrame(results)
    results_df.to_csv('/content/drive/MyDrive/bitcoin_nlp/dbscan_results.csv', index=False)
    print("DBSCAN 결과가 CSV 파일로 저장되었습니다.")

else:
    print("No suitable parameters found for DBSCAN.")

Best DBSCAN params: eps=0.1, min_samples=2 with Silhouette Score=-0.19123197729214664
Number of clusters: 32
Number of noise points: 1242
Cluster -1:
  WaBi WABI whitepapers - whitepaper.io.txt
  BLOCKv VEE whitepapers - whitepaper.io.txt
  Electric Vehicle Zone EVZ whitepapers - whitepaper.io.txt
  ScPrime SCP whitepapers - whitepaper.io.txt
  FOAM FOAM whitepapers - whitepaper.io.txt
  ShareToken SHR whitepapers - whitepaper.io.txt
  PEAKDEFI PEAK whitepapers - whitepaper.io.txt
  Mysterium MYST whitepapers - whitepaper.io.txt
  Hegic HEGIC whitepapers - whitepaper.io.txt
  Zano ZANO whitepapers - whitepaper.io.txt
  Kylin KYL whitepapers - whitepaper.io.txt
  POA POA whitepapers - whitepaper.io.txt
  ProximaX XPX whitepapers - whitepaper.io.txt
  Ignis IGNIS whitepapers - whitepaper.io.txt
  N.Exchange NEX whitepapers - whitepaper.io.txt
  BioPassport Token BIOT whitepapers - whitepaper.io.txt
  Edgeless EDG whitepapers - whitepaper.io.txt
  GamerCoin GHX whitepapers - whitepaper.io