In [None]:
# Cluster the self-trained Wikipedia discussions model.
from gensim.models import Word2Vec

# Only the `.wv` attribute of the loaded Word2Vec object is kept; the rest of the
# notebook indexes `model` by word to obtain vectors.
model = Word2Vec.load("wue15_word2vec.model").wv

In [None]:
# Collect every word known to the model and look up the vector of each one
words = model.index_to_key
word_vectors = model[words]
print(f"Number of vectors to be clustered: {len(word_vectors)}")


In [None]:
from sklearn import cluster
# We will use the KMeans clustering algorithm
# We need to choose a number of clusters (k)
# We will not use too many clusters so it goes fast
k = 500

# An alternative would be to take a certain percentage, e.g. 2.4 % of the number of types
# k = int(len(word_vectors) * 0.024)

# KMeans initialisation is random. Fixing the seed keeps the cluster IDs (and
# therefore the pickled model and every table below) identical across re-runs.
RANDOM_STATE = 42

print("Number of clusters:", k)
kmeans = cluster.KMeans(n_clusters=k, random_state=RANDOM_STATE)
kmeans.fit(word_vectors)

In [None]:
import pickle

# Serialize the fitted KMeans estimator to disk
with open("kmeans_wue15_word2vec.pkl", mode="wb") as model_file:
    pickle.dump(kmeans, model_file)

In [None]:
import pickle

# Restore the KMeans estimator from disk.
# NOTE: unpickling can execute arbitrary code, so only load a file that was
# written by a trusted source (e.g. the save cell of this notebook).
with open("kmeans_wue15_word2vec.pkl", mode="rb") as model_file:
    kmeans = pickle.load(model_file)

In [None]:
# The labels are the cluster IDs for each word.
# A later cell zips them with the vocabulary to build the word -> cluster mapping.
kmeans.labels_

In [None]:
# Let's create a dictionary to map words to their cluster IDs.
# The vocabulary is read straight from the model, so re-running this cell does not
# depend on whatever the notebook-level `words` variable currently holds.
vocabulary = model.index_to_key

# zip() silently truncates to the shorter input; fail loudly if the (possibly
# unpickled) KMeans model was fitted on a different number of vectors.
assert len(vocabulary) == len(kmeans.labels_), (
    "kmeans.labels_ does not match the model vocabulary size"
)

word_cluster = dict(zip(vocabulary, kmeans.labels_))
# Show only a handful of entries; printing the whole mapping floods the output
print(list(word_cluster.items())[:10])

# Invert the mapping: cluster ID -> list of member words, in vocabulary order
cluster_words = {}
for member, cluster_id in word_cluster.items():
    cluster_words.setdefault(cluster_id, []).append(member)

In [None]:
import numpy as np

# Centroid of a cluster = element-wise mean of the vectors of its member words.
# Built with a comprehension so the iteration variables stay local: a plain
# `for cluster, words in ...` loop would rebind the notebook-level `words`
# (the vocabulary) and `cluster` (the sklearn module) as a side effect.
cluster_centroid = {
    cluster_id: np.mean([model[member] for member in members], axis=0)
    for cluster_id, members in cluster_words.items()
}


In [None]:
from scipy.spatial import distance
import pandas as pd

# One row per word: its cluster ID and the cosine similarity between the word
# vector and the centroid of that cluster. Rows are grouped by ascending cluster
# ID and, within a cluster, ordered from most to least similar.
df = (
    pd.DataFrame(word_cluster.items(), columns=["word", "cluster"])
    .assign(
        sim=[
            1 - distance.cosine(model[token], cluster_centroid[cluster_id])
            for token, cluster_id in word_cluster.items()
        ]
    )
    .sort_values(by=["cluster", "sim"], ascending=[True, False])
)


In [None]:
# Display the word / cluster / similarity table (sorted by cluster, then by descending similarity)
df

In [None]:
# Look up the cluster ID assigned to the word "attack"
word_cluster["attack"]

In [None]:
# Find the cluster that contains a given word and list every word assigned to it
word = "post"
cluster = word_cluster[word]
cluster_words[cluster]

In [None]:
# Label every cluster with its first three member words (in the order they were
# added to cluster_words), joined by "|"
cluster_label = {
    cluster_id: "|".join(members[:3])
    for cluster_id, members in cluster_words.items()
}


In [None]:
# Display the cluster ID -> label mapping
cluster_label

In [None]:
def sort_dict(dic, reverse=True):
    """
    Return a new dictionary whose entries are ordered by value.

    Entries with equal values keep their original relative order.

    Args:
        dic (Dict): Dictionary to sort; it is not modified.
        reverse (bool): If True (default) the largest value comes first,
            otherwise the smallest.

    Returns:
        Dict: New dictionary with the same items in sorted order.
    """
    # Order the keys by their associated value, then rebuild the mapping
    ordered_keys = sorted(dic, key=dic.get, reverse=reverse)
    return {key: dic[key] for key in ordered_keys}


def get_similar_clusters(
    word_or_vec, topn=10, least=False
):
    """
    Rank clusters by the cosine similarity of their centroid to a query.

    Args:
        word_or_vec (Union[str, np.ndarray]): Query word (looked up in ``model``)
            or an already computed vector.
        topn (int): Maximum number of clusters to return.
        least (bool): If True, return the least similar clusters instead of
            the most similar ones.

    Returns:
        List[Tuple[int, float]]: ``(cluster, similarity)`` pairs, ordered from
        most similar to least similar (or the reverse when ``least`` is True).

    Raises:
        ValueError: If ``word_or_vec`` is neither a string nor a NumPy array.
    """
    # Reject unsupported input types before doing any work
    if not isinstance(word_or_vec, (str, np.ndarray)):
        raise ValueError("Input must be a string (word) or a NumPy array (vector).")

    # A string is resolved to its vector; an array is used as-is
    query = model[word_or_vec] if isinstance(word_or_vec, str) else word_or_vec

    # Score the query against every cluster centroid
    scored = [
        (cluster_id, 1 - distance.cosine(query, centroid))
        for cluster_id, centroid in cluster_centroid.items()
    ]
    # Stable sort on the similarity: descending unless the least similar are requested
    scored.sort(key=lambda pair: pair[1], reverse=not least)
    return scored[:topn]

def print_similar_clusters(
    word_or_vec, topn=10, least=False
):
    """
    Print one line per cluster returned by ``get_similar_clusters``.

    Each line shows the cluster ID, its label from ``cluster_label`` and the
    similarity rounded to three decimals. Clusters without a label are skipped.

    Args:
        word_or_vec (Union[str, np.ndarray]): Word or vector to compare.
        topn (int): Number of clusters to request.
        least (bool): Whether to list the least similar clusters instead.
    """
    for cluster_id, score in get_similar_clusters(word_or_vec, topn=topn, least=least):
        # Nothing to print for a cluster that has no label
        if cluster_id not in cluster_label:
            continue
        print(f"Cluster {cluster_id}: {cluster_label[cluster_id]} (Similarity: {score:.3f})")


In [None]:
# Show up to 10 clusters whose centroids are most similar to the vector of "write"
print_similar_clusters("write", topn=10)