Doc2Vec, DBSCAN 사용 <br/>
GridSearch를 사용한 파라미터 최적화 <br/>
빈도수를 이용한 불용어 250개를 추출하여 사용 <br/>
ver1과 동일하지만 epoch를 20에서 30으로 증가시킴 <br/>
ver1과 비교하여 클러스터의 갯수와 노이즈 포인트의 갯수 모두 감소 <br/><br/>

Best DBSCAN params: eps=2.0, min_samples=4 with Silhouette Score=-0.12577107405301016<br/>
Number of clusters: 3<br/>
Number of noise points: 1207

In [1]:
import os
import glob
import string
from tqdm import tqdm
from collections import Counter
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [2]:
# nltk 데이터 다운로드 (최초 실행 시 필요)
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Google Drive 마운트
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# NLTK 초기화
lemmatizer = WordNetLemmatizer()

In [5]:
# txt 파일 경로 설정 (Google Drive의 경로로 변경)
path = '/content/drive/MyDrive/bitcoin_nlp'
output_path = '/content/drive/MyDrive/bitcoin_nlp/preprocessed'

In [6]:
# 불용어 리스트 생성 (빈도 기반)
def create_stopwords(documents, top_n=250):
    all_words = []
    for doc in documents:
        all_words.extend(doc[1].split())
    word_counts = Counter(all_words)
    stopwords = [word for word, count in word_counts.most_common(top_n)]
    return stopwords

In [7]:
# 구두점 제거 함수
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

In [8]:
# 불용어 제거 함수
def remove_stopwords(text, stopwords):
    words = text.split()
    words = [word for word in words if word not in stopwords]
    return ' '.join(words)

In [9]:
# 표제어 추출 함수
def lemmatize_words(text):
    words = text.split()
    lemmatized_words = [lemmatizer.lemmatize(word, pos='v') for word in words]
    return ' '.join(lemmatized_words)

In [10]:
# 텍스트 파일 읽기 및 전처리 (구두점 제거, 불용어 제거, 표제어 추출)
def read_and_preprocess_files(path, stopwords):
    files = glob.glob(os.path.join(path, '*.txt'))
    documents = []
    for file in tqdm(files, desc="Reading and preprocessing files"):
        with open(file, 'r', encoding='utf-8') as f:
            content = f.read()
            content = remove_punctuation(content)  # 구두점 제거
            content = remove_stopwords(content, stopwords)  # 불용어 제거
            content = lemmatize_words(content)  # 표제어 추출
            documents.append((os.path.basename(file), content))
    return documents

In [11]:
# 전처리된 파일 저장
def save_preprocessed_files(documents, output_path):
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    for filename, content in documents:
        # 기존 파일 이름 뒤에 '_preprocessed' 추가
        preprocessed_filename = os.path.splitext(filename)[0] + '_preprocessed.txt'
        with open(os.path.join(output_path, preprocessed_filename), 'w', encoding='utf-8') as f:
            f.write(content)

In [12]:
# 문서 태그 및 형태소 분석
def tag_documents(documents):
    tagged_documents = [TaggedDocument(doc.split(), [i]) for i, (name, doc) in enumerate(tqdm(documents, desc="Tagging documents"))]
    return tagged_documents

In [13]:
# Doc2Vec 모델 파라미터 최적화
def optimize_doc2vec(tagged_corpus_list, documents):
    vector_size = 25
    windows = [3, 5]
    alphas = [0.03, 0.05]
    min_alphas = [0.0001, 0.0002, 0.0003]

    best_doc2vec_params = None
    best_doc2vec_score = -1

    for window in windows:
        for alpha in alphas:
            for min_alpha in min_alphas:
                print(f"Training Doc2Vec with vector_size={vector_size}, window={window}, alpha={alpha}, min_alpha={min_alpha}")
                model = Doc2Vec(vector_size=vector_size, window=window, alpha=alpha, min_alpha=min_alpha, workers=8)
                model.build_vocab(tagged_corpus_list)
                model.train(tagged_corpus_list, total_examples=model.corpus_count, epochs=30)

                # 벡터 추출
                vectors = [model.dv[i] for i in range(len(documents))]
                X = np.array(vectors)

                # 벡터 스케일링
                scaler = StandardScaler()
                X_scaled = scaler.fit_transform(X)

                # DBSCAN 클러스터링 및 평가
                dbscan = DBSCAN(eps=2.0, min_samples=8, metric='euclidean')
                clusters = dbscan.fit_predict(X_scaled)

                if len(set(clusters)) > 1:
                    score = silhouette_score(X_scaled, clusters)
                    print(f"Silhouette Score: {score:.2f}")
                    if score > best_doc2vec_score:
                        best_doc2vec_score = score
                        best_doc2vec_params = (vector_size, window, alpha, min_alpha)

    if best_doc2vec_params:
        print(f"Best Doc2Vec params: vector_size={best_doc2vec_params[0]}, window={best_doc2vec_params[1]}, alpha={best_doc2vec_params[2]}, min_alpha={best_doc2vec_params[3]} with Silhouette Score={best_doc2vec_score:.2f}")
    else:
        print("No optimal parameters found. Silhouette scores were not positive for any parameter combination.")

    return best_doc2vec_params

In [14]:
# 문서 읽기
documents = read_and_preprocess_files(path, [])

Reading and preprocessing files: 100%|██████████| 1373/1373 [00:42<00:00, 32.24it/s]


In [15]:
# 불용어 리스트 생성
stopwords = create_stopwords(documents)
stopwords

['the',
 'be',
 'of',
 'and',
 'to',
 'a',
 'in',
 'for',
 '\u200b',
 'that',
 'will',
 'The',
 'on',
 'as',
 'or',
 'with',
 'by',
 'have',
 'can',
 'use',
 'an',
 'from',
 'this',
 'not',
 '\u200b\u200b',
 'it',
 'their',
 'any',
 'blockchain',
 'which',
 'data',
 'network',
 'at',
 'all',
 'market',
 'other',
 'This',
 'service',
 'contract',
 'tokens',
 '•',
 'In',
 'provide',
 'users',
 'we',
 'platform',
 '1',
 'more',
 'token',
 'such',
 'its',
 'time',
 'block',
 'make',
 'system',
 'transaction',
 'A',
 'also',
 'value',
 'new',
 'do',
 'exchange',
 '2',
 'may',
 'our',
 'they',
 'create',
 'transactions',
 'Token',
 'include',
 'information',
 'only',
 'one',
 'process',
 'need',
 'allow',
 'user',
 'into',
 'smart',
 'base',
 'each',
 'through',
 'trade',
 'project',
 'but',
 'fee',
 'technology',
 'We',
 '3',
 'you',
 'reward',
 'price',
 'number',
 'these',
 'chain',
 'no',
 '●',
 'development',
 'It',
 'nod',
 'if',
 'account',
 'order',
 'Ethereum',
 'than',
 'require',


In [16]:
# 문서 읽기
documents = read_and_preprocess_files(path, stopwords)

Reading and preprocessing files: 100%|██████████| 1373/1373 [00:39<00:00, 34.82it/s]


In [17]:
# 전처리된 파일 저장
save_preprocessed_files(documents, output_path)

In [18]:
# 태그된 문서 리스트 생성
tagged_corpus_list = tag_documents(documents)

Tagging documents: 100%|██████████| 1373/1373 [00:00<00:00, 3111.90it/s]


In [19]:
best_params = optimize_doc2vec(tagged_corpus_list, documents)

Training Doc2Vec with vector_size=25, window=3, alpha=0.03, min_alpha=0.0001
Silhouette Score: -0.16
Training Doc2Vec with vector_size=25, window=3, alpha=0.03, min_alpha=0.0002
Silhouette Score: -0.16
Training Doc2Vec with vector_size=25, window=3, alpha=0.03, min_alpha=0.0003
Silhouette Score: -0.15
Training Doc2Vec with vector_size=25, window=3, alpha=0.05, min_alpha=0.0001
Silhouette Score: -0.17
Training Doc2Vec with vector_size=25, window=3, alpha=0.05, min_alpha=0.0002
Silhouette Score: -0.17
Training Doc2Vec with vector_size=25, window=3, alpha=0.05, min_alpha=0.0003
Silhouette Score: -0.17
Training Doc2Vec with vector_size=25, window=5, alpha=0.03, min_alpha=0.0001
Silhouette Score: -0.16
Training Doc2Vec with vector_size=25, window=5, alpha=0.03, min_alpha=0.0002
Silhouette Score: -0.17
Training Doc2Vec with vector_size=25, window=5, alpha=0.03, min_alpha=0.0003
Silhouette Score: -0.16
Training Doc2Vec with vector_size=25, window=5, alpha=0.05, min_alpha=0.0001
Silhouette Sco

In [20]:
# 최적의 Doc2Vec 파라미터로 모델 학습
vector_size, window, alpha, min_alpha = best_params
model = Doc2Vec(vector_size=vector_size, window=window, alpha=alpha, min_alpha=min_alpha, workers=8)
model.build_vocab(tagged_corpus_list)
model.train(tagged_corpus_list, total_examples=model.corpus_count, epochs=10)

In [21]:
# 최적화된 모델로 벡터 추출 및 저장
vectors = [model.dv[i] for i in range(len(documents))]
output_dir = '/content/drive/MyDrive/bitcoin_nlp/vectors'

if not os.path.exists(output_dir):
    os.makedirs(output_dir)

for i, vector in enumerate(vectors):
    vector_path = os.path.join(output_dir, f'vector_{i}.txt')
    np.savetxt(vector_path, vector)

In [22]:
# 벡터 파일 읽기
def load_vectors(vector_dir):
    vector_files = sorted(glob.glob(os.path.join(vector_dir, 'vector_*.txt')))
    vectors = []
    for file in tqdm(vector_files, desc="Loading vectors"):
        vector = np.loadtxt(file)
        vectors.append(vector)
    return vectors, vector_files

In [23]:
vectors, vector_files = load_vectors(output_dir)

# 벡터를 numpy 배열로 변환
X = np.array(vectors)

# 데이터를 스케일링
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

Loading vectors: 100%|██████████| 1373/1373 [00:10<00:00, 128.39it/s]


In [24]:
# Grid Search를 사용한 DBSCAN 파라미터 최적화
eps_values = np.arange(0.1, 2.1, 0.1)
min_samples_values = list(range(2, 31, 2))
best_dbscan_score = -1
best_dbscan_params = None

In [25]:
for eps in eps_values:
    for min_samples in min_samples_values:
        print(f"Running DBSCAN with eps={eps}, min_samples={min_samples}")
        dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric='euclidean')
        clusters = dbscan.fit_predict(X_scaled)

        # 클러스터링 평가 (Silhouette Score 사용)
        if len(set(clusters)) > 1:
            score = silhouette_score(X_scaled, clusters)
            print(f"Silhouette Score: {score}")
            if score > best_dbscan_score:
                best_dbscan_score = score
                best_dbscan_params = (eps, min_samples)

Running DBSCAN with eps=0.1, min_samples=2
Silhouette Score: -0.15293458565434953
Running DBSCAN with eps=0.1, min_samples=4
Silhouette Score: -0.15293458565434953
Running DBSCAN with eps=0.1, min_samples=6
Silhouette Score: -0.15293458565434953
Running DBSCAN with eps=0.1, min_samples=8
Silhouette Score: -0.15293458565434953
Running DBSCAN with eps=0.1, min_samples=10
Silhouette Score: -0.15293458565434953
Running DBSCAN with eps=0.1, min_samples=12
Silhouette Score: -0.15293458565434953
Running DBSCAN with eps=0.1, min_samples=14
Silhouette Score: -0.15293458565434953
Running DBSCAN with eps=0.1, min_samples=16
Silhouette Score: -0.15293458565434953
Running DBSCAN with eps=0.1, min_samples=18
Silhouette Score: -0.15293458565434953
Running DBSCAN with eps=0.1, min_samples=20
Silhouette Score: -0.15293458565434953
Running DBSCAN with eps=0.1, min_samples=22
Silhouette Score: -0.15293458565434953
Running DBSCAN with eps=0.1, min_samples=24
Silhouette Score: -0.15293458565434953
Running 

In [26]:
# 최적의 파라미터로 DBSCAN 클러스터링 수행
if best_dbscan_params:
    eps, min_samples = best_dbscan_params
    print(f"Best DBSCAN params: eps={eps}, min_samples={min_samples} with Silhouette Score={best_dbscan_score}")
    dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric='euclidean')
    clusters = dbscan.fit_predict(X_scaled)

    # 각 클러스터에 속한 파일들 출력 및 CSV 저장
    clustered_files = {}
    for file, cluster in zip(vector_files, clusters):
        if cluster not in clustered_files:
            clustered_files[cluster] = []
        clustered_files[cluster].append(file)

    # 클러스터 결과 출력
    n_clusters = len(set(clusters)) - (1 if -1 in clusters else 0)
    n_noise = list(clusters).count(-1)
    print(f"Number of clusters: {n_clusters}")
    print(f"Number of noise points: {n_noise}")

    # 각 클러스터의 파일 리스트 출력
    for cluster, file_list in clustered_files.items():
        print(f"Cluster {cluster}:")
        for file in file_list:
            print(f"  {file}")

    # 결과를 CSV 파일로 저장
    results = {'File': vector_files, 'Cluster': clusters}
    results_df = pd.DataFrame(results)
    results_df.to_csv('/content/drive/MyDrive/bitcoin_nlp/dbscan_results.csv', index=False)
    print("DBSCAN 결과가 CSV 파일로 저장되었습니다.")

else:
    print("No suitable parameters found for DBSCAN.")

Best DBSCAN params: eps=2.0, min_samples=4 with Silhouette Score=-0.12577107405301016
Number of clusters: 3
Number of noise points: 1207
Cluster -1:
  /content/drive/MyDrive/bitcoin_nlp/vectors/vector_0.txt
  /content/drive/MyDrive/bitcoin_nlp/vectors/vector_1.txt
  /content/drive/MyDrive/bitcoin_nlp/vectors/vector_10.txt
  /content/drive/MyDrive/bitcoin_nlp/vectors/vector_100.txt
  /content/drive/MyDrive/bitcoin_nlp/vectors/vector_1000.txt
  /content/drive/MyDrive/bitcoin_nlp/vectors/vector_1001.txt
  /content/drive/MyDrive/bitcoin_nlp/vectors/vector_1002.txt
  /content/drive/MyDrive/bitcoin_nlp/vectors/vector_1003.txt
  /content/drive/MyDrive/bitcoin_nlp/vectors/vector_1004.txt
  /content/drive/MyDrive/bitcoin_nlp/vectors/vector_1005.txt
  /content/drive/MyDrive/bitcoin_nlp/vectors/vector_1006.txt
  /content/drive/MyDrive/bitcoin_nlp/vectors/vector_1007.txt
  /content/drive/MyDrive/bitcoin_nlp/vectors/vector_1008.txt
  /content/drive/MyDrive/bitcoin_nlp/vectors/vector_1009.txt
  /con