# <center>Critical AI</center>
<center>ENGL 54.41</center>
<center>Dartmouth College</center>
<center>Winter 2026</center>
<pre>Created: 01/15/2026</pre>

In [None]:
import gensim
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

from gensim import matutils
import numpy as np
from numpy import dot
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.decomposition import PCA
from sklearn.manifold import MDS

from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
# Load the Google News 200 model (the smaller one). The vectors file is stored
# in the binary word2vec format, which is why it is read with binary=True.
google_model = KeyedVectors.load_word2vec_format(
    "../models/google-vectors.w2v",
    binary=True,
)

In [None]:
# "Interview" the model: the shape of its vector matrix tells us how many
# vocabulary entries it holds (rows) and how deep each embedding is (columns).
vocab_size = google_model.vectors.shape[0]
dim = google_model.vectors.shape[1]
print("vocab:", vocab_size)
print("depth:", dim)

## Clustering

We can automatically group embeddings together into clusters using an algorithm called k-Means. It finds clusters by partitioning the data into _k_ groups, and we have to specify the number of clusters that we want before running the algorithm. How do we pick a value for _k_? Good question. We have to make a guess or know something about the data. It's just another sign of the subjectivity and choice involved in machine learning.

In [None]:
from sklearn.cluster import KMeans

# We decide here that we want fifty clusters of our vectors. That works out to
# roughly vocab_size / 50 tokens/words per cluster on average: potentially a
# lot of words in each cluster, but perhaps more meaningful clusters.
# random_state is fixed so that repeated runs produce the same clustering.
kmeans = KMeans(
    n_clusters=50,
    n_init="auto",
    random_state=42,
)
kmeans.fit(google_model.vectors)

In [None]:
# The centers of the clusters. A centroid usually does not correspond to any
# specific word vector: it is the center of the region of vector space made up
# of the vectors that were assigned to that cluster.
centroids = kmeans.cluster_centers_

# The cluster label assigned to every vector, i.e. to the whole vocabulary.
cluster_labels = kmeans.labels_

In [None]:
# Visualize the centroids as small green circles, each labelled with its
# cluster id. Only the first two coordinates of every centroid are used as x
# and y, so the figure is a projection onto the first two embedding dimensions
# rather than a view of the whole vector space.
xs = centroids[:, 0]
ys = centroids[:, 1]
plt.scatter(xs, ys, marker="o", s=6, linewidths=3, color="g")

# Write the cluster id next to each point, offset by 3 points from the marker.
for cluster_id, (x, y) in enumerate(zip(xs, ys)):
    plt.annotate(
        cluster_id,
        xy=(x, y),
        xytext=(3, 3),
        textcoords="offset points",
        ha="left",
        va="top",
    )

plt.title("k-Means Clusters of Vector Data")
# Hide the ticks on both axes.
plt.xticks([])
plt.yticks([])
plt.show()

## Interrogating a Cluster

We can explore centroids that are close together: these might have similar
semantic meanings even though their vocabulary falls into distinct clusters.

In [None]:
# This one looks promising. As mentioned above, each centroid holds vector
# values, so a single centroid has the same size as our embedding space:
centroids[27].shape

In [None]:
# A centroid is an imaginary location in vector space, but we can still query
# the model for the ten vocabulary vectors most similar to it:
google_model.most_similar(
    positive=[centroids[27]],
    topn=10,
)

In [None]:
# Now pick another centroid, one located close to the previous one, and look
# at its ten most similar vocabulary vectors:
google_model.most_similar(
    positive=[centroids[16]],
    topn=10,
)

In [None]:
# And one more centroid, queried the same way:
google_model.most_similar(
    positive=[centroids[46]],
    topn=10,
)

In [None]:
# This centroid looks like a real outlier -- why? Its ten most similar
# vocabulary vectors may give a hint:
google_model.most_similar(
    positive=[centroids[26]],
    topn=10,
)

In [None]:
# Try some other clusters!