In [1]:
# Let's now cluster our self-trained Wikipedia discussions model
from gensim.models import Word2Vec

# Load the trained model and keep only its word-vector store (`.wv`);
# the rest of the notebook only ever looks up vectors by word.
model = Word2Vec.load("data/wue15_word2vec.model").wv

In [2]:
# Vocabulary of the model, plus the vector of every vocabulary entry
# (row i of word_vectors is the vector of words[i])
words = model.index_to_key
word_vectors = model[words]
print(f"Number of vectors to be clustered: {len(word_vectors)}")


Number of vectors to be clustered: 115621


In [3]:
from sklearn import cluster
# We will use the KMeans clustering algorithm
# We need to choose a number of clusters (k)
# We will not use too many clusters so it goes fast
k = 500

# An alternative would be to take a certain percentage, e.g. 2.4 % of the number of types
# k = int(len(word_vectors) * 0.024)

print("Number of clusters:", k)
# KMeans starts from randomly chosen centroids, so without a fixed seed the
# cluster IDs (and every output below that mentions them) change on each run.
# A fixed random_state makes "Restart & Run All" reproducible.
kmeans = cluster.KMeans(n_clusters=k, random_state=42)
kmeans.fit(word_vectors)

Number of clusters: 500


In [4]:
import pickle
# Persist the fitted KMeans model so the clustering does not have to be repeated
with open("data/kmeans_wue15_word2vec.pkl", mode="wb") as pickle_file:
    pickle.dump(kmeans, pickle_file)

In [5]:
import pickle
# Restore the fitted KMeans model from disk.
# NOTE: unpickling can execute arbitrary code -- only load files you created yourself.
with open("data/kmeans_wue15_word2vec.pkl", mode="rb") as pickle_file:
    kmeans = pickle.load(pickle_file)

In [6]:
# The labels are the cluster IDs for each word:
# entry i is the cluster assigned to row i of the array KMeans was fitted on
# (word_vectors), i.e. to words[i]
kmeans.labels_

array([189, 189, 244, ..., 215, 215, 369], dtype=int32)

In [7]:
# Let's create a dictionary to map words to their cluster IDs.
# zip() would silently truncate if `words` no longer matched the fitted model
# (e.g. because the name was reassigned), so check the alignment explicitly.
assert len(words) == len(kmeans.labels_), (
    "`words` does not match the clustered vectors; re-run the cell that defines it"
)
word_cluster = dict(zip(words, kmeans.labels_))
# Show only a small sample -- the full mapping has one entry per vocabulary word
print(len(word_cluster), "words, e.g.", list(word_cluster.items())[:5])

# Invert the mapping: cluster ID -> member words (in vocabulary order).
# The loop variable is not called `cluster` on purpose: that name is the
# sklearn module imported with `from sklearn import cluster`.
cluster_words = {}
for word, cluster_id in word_cluster.items():
    cluster_words.setdefault(cluster_id, []).append(word)



In [8]:
import numpy as np

# Centroid of each cluster = element-wise mean of its members' word vectors.
# Written as a comprehension so that its variables stay local: a plain
# `for cluster, words in ...` loop would overwrite the notebook-level `words`
# (the vocabulary list used to build `word_cluster`) with the members of the
# last cluster.
cluster_centroid = {
    cluster_id: np.mean([model[member] for member in members], axis=0)
    for cluster_id, members in cluster_words.items()
}


In [9]:
from scipy.spatial import distance
import pandas as pd

# Cosine similarity of every word to the centroid of its own cluster
# (same iteration order as the rows of the frame built below)
centroid_similarity = [
    1 - distance.cosine(model[token], cluster_centroid[cluster_id])
    for token, cluster_id in word_cluster.items()
]

# One row per word: its cluster and its similarity to that cluster's centroid,
# ordered by cluster ID and, within a cluster, from most to least similar
df = (
    pd.DataFrame(list(word_cluster.items()), columns=["word", "cluster"])
    .assign(sim=centroid_similarity)
    .sort_values(by=["cluster", "sim"], ascending=[True, False])
)


In [10]:
# Words grouped by cluster; within each cluster the word closest to the centroid comes first
df

Unnamed: 0,word,cluster,sim
20821,5pm,0,0.853182
28842,Eventbrite,0,0.795898
18383,11am,0,0.794542
44720,2pm–5pm,0,0.793528
53748,Guapo's,0,0.789034
...,...,...,...
731,An,496,1.000000
1043,wikis,497,0.949709
3736,Wikipedias,497,0.881928
46,any,498,1.000000


In [11]:
# Look up the cluster ID assigned to a single word
word_cluster["attack"]

60

In [12]:
# List all words that ended up in the same cluster as a given word
word = "post"
cluster = word_cluster[word]
cluster_words[cluster]

['answer', 'post', 'reply', 'response', 'respond']

In [13]:
# Label each cluster with its first (up to) three member words, joined by "|".
# Members are stored in vocabulary order, so these are the earliest vocabulary
# entries of the cluster.
cluster_label = {
    cluster_id: "|".join(members[:3])
    for cluster_id, members in cluster_words.items()
}


In [14]:
# Inspect the generated labels (cluster ID -> "word1|word2|word3")
cluster_label

{189: ',|.|to',
 244: 'the|a|this',
 170: 'you|I|we',
 457: '(|)|Disambiguation',
 470: ':|:C|:P',
 472: 'UTC|--',
 213: "is|it's|It's",
 473: 'on',
 13: '"|original|fact',
 495: 'it|The|which',
 152: 'page|pages',
 164: 'Wikipedia|talk|WP',
 171: 'be|show|include',
 196: 'your|my|Your',
 188: 'have|has|had',
 486: 'article|way|subject',
 321: 'If|if',
 463: 'deletion|discussion|request',
 88: 'are|was|were',
 417: '!|...|again',
 489: 'at|At',
 22: 'will|can|may',
 53: 'Image|File',
 313: 'use',
 359: '|',
 233: 'by',
 498: 'any',
 264: 'image',
 72: '2007|2008|2006',
 347: 'please|free|going',
 124: 'Please',
 69: 'do|see|help',
 41: 'speedy',
 376: 'been|happened|appeared',
 139: 'copyright|external|web',
 419: 'deleted',
 203: "would|I'd|you'd",
 96: '/|Archive|Users',
 287: 'like|need|want',
 130: 'more|better|rather',
 432: 'articles',
 426: 'You|they|We',
 323: '-|No|Unfortunately',
 458: 'Thanks|thanks',
 414: 'me|them|it.',
 340: 'edit|reason|addition',
 280: 'other|all|these'

In [15]:
def sort_dict(dic, reverse=True):
    """
    Return a new dictionary with the same items, ordered by value.

    Args:
        dic (Dict): Dictionary to sort; it is not modified.
        reverse (bool): If True (default), largest values come first;
            if False, smallest values come first.

    Returns:
        Dict: New dictionary whose insertion order follows the sorted values.
    """
    def value_of(entry):
        # entry is a (key, value) pair
        return entry[1]

    ordered_entries = sorted(dic.items(), key=value_of, reverse=reverse)
    return dict(ordered_entries)


def get_similar_clusters(
    word_or_vec, topn=10, least=False
):
    """
    Rank the clusters by cosine similarity between their centroid and a query.

    Args:
        word_or_vec (Union[str, np.ndarray]): Query word (looked up in `model`)
            or query vector.
        topn (int): Number of clusters to return.
        least (bool): If True, return the least similar clusters instead of
            the most similar ones.

    Returns:
        List[Tuple[int, float]]: (cluster ID, similarity) pairs, best match
        first (or worst match first when `least` is True).

    Raises:
        ValueError: If the input is neither a string nor a NumPy array.
    """
    # Reject unsupported input up front, then resolve the query to a vector
    if not isinstance(word_or_vec, (str, np.ndarray)):
        raise ValueError("Input must be a string (word) or a NumPy array (vector).")
    vector = model[word_or_vec] if isinstance(word_or_vec, str) else word_or_vec

    # Score every cluster centroid against the query vector
    scored = [
        (cluster_id, 1 - distance.cosine(vector, centroid))
        for cluster_id, centroid in cluster_centroid.items()
    ]
    # Highest similarity first, unless the least similar clusters are requested
    scored.sort(key=lambda pair: pair[1], reverse=not least)
    return scored[:topn]

def print_similar_clusters(
    word_or_vec, topn=10, least=False
):
    """
    Print one line per cluster that is most (or least) similar to a query.

    Each line shows the cluster ID, its label from `cluster_label` and the
    similarity rounded to three decimals. Clusters without a label are skipped.

    Args:
        word_or_vec (Union[str, np.ndarray]): Word or vector to compare.
        topn (int): Number of clusters to request.
        least (bool): Whether to print the least similar clusters instead.
    """
    for cluster_id, score in get_similar_clusters(word_or_vec, topn=topn, least=least):
        if cluster_id not in cluster_label:
            continue  # nothing to show for a cluster without a label
        label = cluster_label[cluster_id]
        print(f"Cluster {cluster_id}: {label} (Similarity: {score:.3f})")


In [16]:
# Example: the ten clusters whose centroids are closest to the vector of "write"
print_similar_clusters("write", topn=10)

Cluster 69: do|see|help (Similarity: 0.623)
Cluster 129: questions|make|put (Similarity: 0.602)
Cluster 167: start|play|present (Similarity: 0.537)
Cluster 456: find|reach|figure (Similarity: 0.535)
Cluster 433: change|delete|revert (Similarity: 0.495)
Cluster 381: add (Similarity: 0.474)
Cluster 4: Make|spot|Consider (Similarity: 0.470)
Cluster 46: take|address|discuss (Similarity: 0.468)
Cluster 481: give|provide (Similarity: 0.463)
Cluster 27: get|come|pass (Similarity: 0.458)
