# PJ3 Part2 - Clustering

In [None]:
#If you dont have any packages run this cell

!pip install sklearn
!pip install numpy
!pip install pandas
!pip install yellowbrick
!pip install matplotlib

In [None]:
#Package import

from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn import metrics
from sklearn.cluster import KMeans
import numpy as np
from yellowbrick.cluster import KElbowVisualizer
import matplotlib.pyplot as plt
import os

In [None]:
# Load the text files under "text_all", restricted to the six categories below.
# Undecodable bytes are replaced instead of raising (decode_error='replace').
categories = [
    'algebraic geometry',
    'computer vision',
    'general economics',
    'quantitative biology',
    'quantum physics',
    'statistics theory',
]
data = load_files(
    container_path=r"text_all",
    categories=categories,
    shuffle=True,
    encoding='utf-8',
    decode_error='replace',
)

In [None]:
# TODO - Data preprocessing and clustering
# After appropriate data preprocessing, proceed with K-means clustering.
# Calculate the V-measure score.

# Step 1: term counts with English stop words removed; the vocabulary is
# capped at 2000 terms and filtered by document frequency (min_df / max_df).
count_vect = CountVectorizer(
    stop_words='english',
    max_features=2000,
    min_df=3,
    max_df=0.4,
)
data_counts = count_vect.fit_transform(data.data)

# Step 2: re-weight the raw counts as TF-IDF features.
tfidf_transformer = TfidfTransformer()
data_trans = tfidf_transformer.fit_transform(data_counts)

# Step 3: K-means with 6 clusters (one per entry in `categories`), seeded for
# reproducibility. fit() returns the estimator itself, so it can be chained.
clst = KMeans(
    n_clusters=6,
    n_init=10,
    init='k-means++',
    random_state=42,
).fit(data_trans)

# Step 4: V-measure compares the cluster assignment with the true categories.
v_measure = metrics.v_measure_score(
    labels_true=data.target,
    labels_pred=clst.labels_,
)

print(f"V-measure score: {v_measure}")

In [None]:
# TODO - Find the appropriate number of clusters
# Elbow method over a range of k values (k=(2, 12); per the yellowbrick docs a
# 2-tuple is expanded like range(2, 12) - confirm against the installed version).
# n_init=10 is set explicitly so every candidate k is fitted the same way as
# the K-means model used for the V-measure; otherwise the number of restarts
# depends on the default of the installed scikit-learn version.
model = KMeans(init='k-means++', n_init=10, random_state=42)
visualizer = KElbowVisualizer(model, k=(2, 12), timings=False)

visualizer.fit(data_trans.toarray())  # Fit the data to the visualizer
visualizer.show()

# Visualization example

- data_trans를 올바르게 작성했을 경우 시각화 코드가 실행되게끔 작성 하였음
- 보고서 및 발표자료 작성 시 활용할 수 있음 

PCA is defined as an orthogonal linear transformation that transforms the data to a new coordinate system such that the greatest variance by some scalar projection of the data comes to lie on the first coordinate (called the first principal component), the second greatest variance on the second coordinate, and so on.[12]

reference : 

- [12] : Jolliffe, I. T. (2002). Principal Component Analysis. Springer Series in Statistics. New York: Springer-Verlag. doi:10.1007/b98835. ISBN 978-0-387-95442-4.
- [13] : https://en.wikipedia.org/wiki/Principal_component_analysis
- https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html


In [None]:
from sklearn.decomposition import PCA

# Dimensionality reduction with PCA: keep the first 3 principal components,
# which are used by the 2D and 3D scatter plots.
# random_state is fixed because PCA may select a randomized SVD solver for
# this input size, which would make the projection differ between runs.
pca = PCA(n_components=3, random_state=42)
X_pca = pca.fit_transform(data_trans.toarray())

In [None]:
# Clustering visualization
labels = clst.labels_

# Plot the clustering result: first two principal components,
# one color per K-means cluster label.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(X_pca[:, 0], X_pca[:, 1], c=labels, cmap='viridis', marker='o')
ax.set(
    title='K-Means Clustering with PCA',
    xlabel='Principal Component 1',
    ylabel='Principal Component 2',
)
plt.show()

In [None]:
# Axes3D is not referenced directly below; presumably the import is kept so
# that the '3d' projection is registered on older matplotlib - confirm.
from mpl_toolkits.mplot3d import Axes3D
labels = clst.labels_

# 3D scatter plot of the three principal components, colored by cluster label
fig = plt.figure(figsize=(10,6))
ax = fig.add_subplot(111, projection='3d')

# Transposing the first three columns yields one coordinate array per axis
ax.scatter(*X_pca[:, :3].T, c=labels, s=15, cmap='viridis', marker='o')

# Axis labels and title in a single call
ax.set(xlabel='PC1', ylabel='PC2', zlabel='PC3', title='3D PCA Plot')

# Showing the plot
plt.show()

T-SNE : 

Idea 
- 원본 차원의 데이터를 가우시안 분포를 이용해 유사도 계산
- 2차원 평면에 random mapping 후 원 데이터의 유사도와 같아지도록 학습(Minimize KL-Divergence)

Definition
- 입력 객체(고차원)들의 쌍으로 이루어진 유사성을 측정하는 분포
- 저차원 점들의 쌍으로 유사성을 측정하는 분포

reference
- https://en.wikipedia.org/wiki/T-distributed_stochastic_neighbor_embedding
- https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html

In [None]:
from sklearn.manifold import TSNE

# 2D t-SNE embedding of the TF-IDF features.
# init='random' makes the result depend on the random generator, so
# random_state is fixed to keep the figure reproducible across runs.
X_embedded = TSNE(n_components=2, learning_rate='auto',
                  init='random', perplexity=50,
                  random_state=42).fit_transform(data_trans.toarray())
# Clustering visualization
labels = clst.labels_

# Plot the clustering result: one color per K-means cluster label
plt.figure(figsize=(8, 6))
plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=labels, cmap='viridis', marker='o')
plt.title('K-Means Clustering with T-SNE')
plt.show()

In [None]:
# Axes3D is not referenced directly below; presumably the import is kept so
# that the '3d' projection is registered on older matplotlib - confirm.
from mpl_toolkits.mplot3d import Axes3D

# 3D t-SNE embedding of the TF-IDF features (overwrites the 2D X_embedded).
# init='random' makes the result depend on the random generator, so
# random_state is fixed to keep the figure reproducible across runs.
X_embedded = TSNE(n_components=3, learning_rate='auto',
                  init='random', perplexity=50,
                  random_state=42).fit_transform(data_trans.toarray())
# Clustering visualization
labels = clst.labels_

# Creating a 3D scatter plot
fig = plt.figure(figsize=(10,6))
ax = fig.add_subplot(111, projection='3d')

# Plotting the data points, one color per K-means cluster label
ax.scatter(X_embedded[:, 0], X_embedded[:, 1], X_embedded[:, 2], c=labels, s=10, cmap='viridis', marker='o')


# Title
ax.set_title('3D T-SNE Plot')

# Showing the plot
plt.show()