In [1]:
###' ################################################################################
###'
###' IMPORT LIBRARIES
###'
###'

import numpy as np
import pandas as pd
from pathlib import Path

import ast
import umap.umap_ as umap
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from sklearn.cluster import KMeans
from sklearn.cluster import SpectralClustering
from sklearn.preprocessing import StandardScaler
from mpl_toolkits.mplot3d import Axes3D

## 0. Clustering Settings

In [2]:
# Load dataset

# NOTE(review): absolute, machine-specific location; confirm before running elsewhere.
data_dir = Path(r"C:\Users\Hyemi\Python\TopicModeling\Data")
data_dir.mkdir(parents=True, exist_ok=True)

embedding_files = [
    "articles_embedding_1_full.csv",
    "articles_embedding_2_full.csv",
    "articles_embedding_3_full.csv",
    "articles_embedding_4_full.csv"
]

embedding_file_paths = [data_dir / file for file in embedding_files]

# Fail fast when an input file is missing. Silently skipping it would shift the
# positions inside `embedding_dfs`, so a later group would be bound to the wrong
# name (or the unpacking below would fail with an uninformative error).
missing_files = [str(path) for path in embedding_file_paths if not path.exists()]
if missing_files:
    raise FileNotFoundError(
        "Missing embedding file(s): " + ", ".join(missing_files)
    )

# One DataFrame per group, in the same order as `embedding_files`.
embedding_dfs = [pd.read_csv(path) for path in embedding_file_paths]

df1a, df2a, df3a, df4a = embedding_dfs

# Drop every column whose name contains "cluster" (case-insensitive).
# `.copy()` makes each result an independent frame, because new columns
# ("Embeddings_S", "Cluster") are assigned to these frames later in the notebook.
df1 = df1a.loc[:, ~df1a.columns.str.contains("Cluster", case=False)].copy()
df2 = df2a.loc[:, ~df2a.columns.str.contains("Cluster", case=False)].copy()
df3 = df3a.loc[:, ~df3a.columns.str.contains("Cluster", case=False)].copy()
df4 = df4a.loc[:, ~df4a.columns.str.contains("Cluster", case=False)].copy()


# Function to extract embeddings and normalize them
def normalize_embeddings(df):
    """Standardize the vectors stored in ``df["Embeddings"]``.

    Every cell of the "Embeddings" column is turned into a NumPy array; cells
    that are strings are first parsed with ``ast.literal_eval``. The arrays are
    stacked into one matrix, standardized with ``StandardScaler``, and the
    scaled rows are written back (as plain lists) to a new "Embeddings_S"
    column, so ``df`` is modified in place.

    Returns the scaled matrix produced by ``StandardScaler.fit_transform``.
    """
    def _as_vector(cell):
        # String cells hold a Python-literal representation of the vector.
        parsed = ast.literal_eval(cell) if isinstance(cell, str) else cell
        return np.array(parsed)

    stacked = np.vstack([_as_vector(cell) for cell in df["Embeddings"]])
    standardized = StandardScaler().fit_transform(stacked)
    df["Embeddings_S"] = standardized.tolist()

    return standardized

# Standardize each group's embeddings; the original "Embeddings" column is left
# untouched and the scaled vectors are stored in "Embeddings_S".
embeddings_scaled1, embeddings_scaled2, embeddings_scaled3, embeddings_scaled4 = (
    normalize_embeddings(group_df) for group_df in (df1, df2, df3, df4)
)

# UMAP reducer projecting to three dimensions (cosine metric, fixed seed)
umap_3d = umap.UMAP(
    n_components=3,
    n_neighbors=30,
    min_dist=0.1,
    metric='cosine',
    random_state=42,
)

# Fit and transform each group separately, in group order
embedding1_3d, embedding2_3d, embedding3_3d, embedding4_3d = (
    umap_3d.fit_transform(scaled_matrix)
    for scaled_matrix in (
        embeddings_scaled1,
        embeddings_scaled2,
        embeddings_scaled3,
        embeddings_scaled4,
    )
)

In [None]:
###' ################################################################################
###'
###' Elbow Function
###'
###'

# Function to calculate the Within-Cluster Sum of Squares (WCSS) for different cluster sizes
def plot_elbow_method(embeddings_matrix, max_clusters=9):
    """Plot the KMeans elbow curve (WCSS against the number of clusters).

    One KMeans model is fitted for every k from 2 to ``max_clusters``
    (inclusive) and its inertia is plotted against k.

    Parameters
    ----------
    embeddings_matrix : array-like
        Data passed unchanged to ``KMeans.fit``.
    max_clusters : int, default 9
        Largest number of clusters to evaluate; must be at least 2.

    Raises
    ------
    ValueError
        If ``max_clusters`` is smaller than 2, since no k would be evaluated.
    """
    if max_clusters < 2:
        raise ValueError(f"max_clusters must be at least 2, got {max_clusters}")

    # Single source of truth for the evaluated k values: used for fitting,
    # for the x coordinates, and for the tick positions.
    cluster_counts = range(2, max_clusters + 1)
    wcss = []  # WCSS (inertia) for each evaluated number of clusters

    for k in cluster_counts:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=30)
        kmeans.fit(embeddings_matrix)
        wcss.append(kmeans.inertia_)  # Inertia is the sum of squared distances to the closest centroid

    # Plot Elbow Graph
    plt.figure(figsize=(8, 5))
    plt.plot(cluster_counts, wcss, marker='o', linestyle='-', color='b')
    plt.xlabel("Number of Clusters")
    plt.ylabel("WCSS (Within-Cluster Sum of Squares)")
    plt.title("Elbow Method for Optimal k")
    # Tick only the k values that were actually evaluated (k starts at 2).
    plt.xticks(cluster_counts)
    plt.grid()
    plt.show()

## Group1. Spectral Clustering

In [None]:
# Plot the elbow curve (KMeans WCSS for k = 2..9) on Group 1's 3D UMAP projection
# to help choose the number of clusters.
plot_elbow_method(embedding1_3d)

## Group2. Spectral Clustering

In [None]:
# Plot the elbow curve (KMeans WCSS for k = 2..9) on Group 2's 3D UMAP projection
# to help choose the number of clusters.
plot_elbow_method(embedding2_3d)

In [None]:
###' ################################################################################
###'
###' Special Trial3
###' : Gaussian Mixture Model (GMM) clustering
###'
###'

from sklearn.mixture import GaussianMixture

num_clusters = 4
# Fixed seed so the cluster labels are reproducible across runs, matching the
# seed used for UMAP and KMeans elsewhere in this notebook.
gmm = GaussianMixture(n_components=num_clusters, random_state=42)
labels_3_2 = gmm.fit_predict(embedding2_3d)


# Create 3D scatter plot
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111, projection='3d')

# Scatter plot of Group 2's UMAP components, colored by GMM label
scatter = ax.scatter(
    embedding2_3d[:, 0], embedding2_3d[:, 1], embedding2_3d[:, 2],
    c=labels_3_2, cmap="tab10", alpha=0.6
)

# Labels and Title
ax.set_title("Word Clusters Based on Abstract Embeddings (3D)")
ax.set_xlabel("UMAP Dimension 1")
ax.set_ylabel("UMAP Dimension 2")
ax.set_zlabel("UMAP Dimension 3")

# Add legend. `ax.legend` already attaches the legend to the axes, so it is
# not added a second time with `add_artist`.
legend1 = ax.legend(*scatter.legend_elements(), title="Cluster")

# Show plot
plt.show()

## Group3. Spectral Clustering

In [None]:
# Plot the elbow curve (KMeans WCSS for k = 2..9) on Group 3's 3D UMAP projection
# to help choose the number of clusters.
plot_elbow_method(embedding3_3d)

## Group4. Spectral Clustering

In [None]:
# Plot the elbow curve (KMeans WCSS for k = 2..9) on Group 4's 3D UMAP projection
# to help choose the number of clusters.
plot_elbow_method(embedding4_3d)

In [None]:
###' ################################################################################
###'
###' Special Trial2
###' : AgglomerativeClustering
###'
###'

from sklearn.cluster import AgglomerativeClustering

num_clusters = 5
agg_clustering = AgglomerativeClustering(n_clusters=num_clusters)
labels_1_4 = agg_clustering.fit_predict(embedding4_3d)

# Store the cluster label of every Group 4 row alongside its data
df4["Cluster"] = labels_1_4

# Define custom colors for each cluster
cmap = plt.get_cmap("tab10")

# Create 3D scatter plot
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111, projection='3d')

# Scatter plot. The UMAP components are drawn in reverse order
# (x = component 3, y = component 2, z = component 1).
scatter = ax.scatter(
    embedding4_3d[:, 2], embedding4_3d[:, 1], embedding4_3d[:, 0],
    c=labels_1_4, cmap=cmap, alpha=0.6
)

# Labels and Title (axis labels follow the component actually plotted on each axis)
ax.set_title("GROUP4: Agglomerative Clustering Based on Abstract Embeddings")
ax.set_xlabel("UMAP Dimension 3")
ax.set_ylabel("UMAP Dimension 2")
ax.set_zlabel("UMAP Dimension 1")

# Add legend. `ax.legend` already attaches the legend to the axes, so it is
# not added a second time with `add_artist`.
legend1 = ax.legend(*scatter.legend_elements(), title="Cluster")

# Show plot
plt.show()

In [None]:
###' ################################################################################
###'
###' Saving
###'
###'

# Output locations, one CSV per group, inside the data directory
file_path1 = data_dir / "articles_4_clustering1.csv"
file_path2 = data_dir / "articles_4_clustering2.csv"
file_path3 = data_dir / "articles_4_clustering3.csv"
file_path4 = data_dir / "articles_4_clustering4.csv"

# Write each group's frame to its CSV without the index column
for group_df, destination in (
    (df1, file_path1),
    (df2, file_path2),
    (df3, file_path3),
    (df4, file_path4),
):
    group_df.to_csv(destination, index=False)