# <center>Critical AI</center>
<center>ENGL 54.41</center>
<center>Dartmouth College</center>
<center>Winter 2026</center>
<pre>Created: 01/02/2026</pre>

## Vectorization and Document Distances in the DTM

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import nltk
import re

In [None]:
# Open a text file (HathiTrust exported text) -- this is plain text that was
# manually exported; such exports are only available for texts not in copyright.
# The context manager closes the file handle as soon as the read finishes, and
# the explicit encoding keeps the result independent of the platform default
# (assumes the export is UTF-8 -- TODO confirm for this volume).
with open("../data/uiug-30112039344814-1767014814.txt", "rt", encoding="utf-8") as source_file:
    text = source_file.read()

# display the first 130 characters of this file
print(text[:130])

# Preprocessing to remove HathiTrust mark-up (header & page breaks):
# 1) drop everything from the start of the file up to (but not including) the
#    first line that begins with "##"; if no such line exists nothing matches
#    and the text is left unchanged;
# 2) drop every line that begins with "##", together with its trailing newline.
text = re.sub(r'\A.*?(?=^##)', '', text, flags=re.S | re.M)
text = re.sub(r'^##.*\n?', '', text, flags=re.M)

In [None]:
# Split into lines (this works well for volumes of poetry, not so much for
# other kinds of text). From here on `text` is a list of strings, one per
# line, rather than a single string.
text = text.splitlines()

In [None]:
# Drop very short lines: only lines longer than five characters are kept.
# `text` is rebound to the filtered list, so the indices used in later cells
# are positions in this filtered list.
text = [line for line in text if len(line) > 5]

In [None]:
# How many lines are left? We'll count each line as a document (a row) in our
# document-term matrix.
print(len(text))

In [None]:
# Display a sample line (position 433 in the filtered list; this raises
# IndexError if the volume has fewer lines than that):
text[433]

In [None]:
# CountVectorizer "vectorizes" inputs by calculating word (token) frequencies.
# We create an instance of the vectorizer by calling it here, without
# arguments, so every setting keeps its default value. There are several
# possible arguments (and defaults!), and these have important consequences
# for what is counted -- see the scikit-learn documentation for the full list.
vec = CountVectorizer()

In [None]:
# The following "fits" the vectorizer to our input documents (a list of
# strings) and transforms them in a single step. We'll call the result "dtm"
# for document-term matrix. Scikit-Learn does things in this way to enable
# multiple "fittings" for different purposes (in predictive modeling we
# typically have a "training" and a "testing" dataset, and each needs to be
# constructed in the same manner with the same parameters).
dtm = vec.fit_transform(text)

In [None]:
# Shape of dtm = (number of documents, number of vocabulary items/tokens);
# the first value should equal the line count printed earlier.
dtm.shape

In [None]:
# Let's pick a line (position 323 in the filtered list) and view it; the
# following cells inspect the same row of the document-term matrix.
print(text[323])

In [None]:
# Column indices (token ids) that have a non-zero count in row 323 of the
# DTM. The single row is expanded to a dense array and flatnonzero reports
# the positions of its non-zero entries in ascending column order.
np.flatnonzero(dtm[323, :].toarray())

In [None]:
# Which features (tokens) do those non-zero columns of row 323 correspond to?
# The feature-name array is requested once and indexed with all column ids at
# the same time, instead of calling get_feature_names_out() again for every
# single token. tolist() turns the selection back into a plain Python list.
vec.get_feature_names_out()[dtm[323,:].todense().nonzero()[1]].tolist()

In [None]:
# Now we'll create a cosine similarity matrix of all the rows. Entry [i, j]
# is the pair-wise similarity of text i and text j, so LARGER values mean the
# two texts are more alike -- this is a similarity, not a distance, even
# though the variable is named `dist`. The diagonal of this matrix contains
# the values for each text compared with itself. The upper and lower
# triangles of the matrix are the same.
# NOTE(review): the matrix has one row and one column per line, so its size
# grows with the square of the number of lines -- check memory for big texts.
dist = cosine_similarity(dtm)
dist.shape

In [None]:
# Cosine similarity of line 323 to the first 10 texts (columns 0-9 of its row):
dist[323,:10]

In [None]:
# Display the 25 texts most similar to line 323. argsort ranks the row in
# ascending order, so the ranking is flipped to put the highest scores first.
# Line 323 is itself part of the ranking (presumably at or near the top,
# since it is compared with itself).
[text[idx] for idx in np.flip(np.argsort(dist[323]))[:25]]

In [None]:
# We can easily find the position of a line. Note that the index refers to
# the filtered list of lines held in `text`, not to the raw file, and that
# list.index() raises ValueError when no line matches the string exactly.
text.index("That catch the wind's moan in the dead of winter.")

In [None]:
# Change this variable to the line number found with text.index(...).
target = 315
print("Original line:",text[target],"\n")

# Fetch the feature names once rather than on every inner-loop iteration.
feature_names = vec.get_feature_names_out()
# Tokens present in the target line; this mask is the same for every neighbour.
target_mask = (dtm[target] != 0).toarray()

# Rank every line from most to least similar, then remove the target by its
# index. Simply skipping the first entry of the ranking would assume the
# target always sorts first, which fails when another line ties with it
# (e.g. an exact duplicate) or when the target has no counted tokens and its
# self-similarity is therefore 0.
ranked = np.argsort(dist[target])[::-1]
neighbours = ranked[ranked != target][:24]

for d in neighbours:
    # the neighbouring line and its similarity to the target
    print(text[d],np.round(dist[target][d],5))
    # columns (tokens) that occur in both the target (a) and this line (b)
    shared_columns = np.where(target_mask & (dtm[d] != 0).toarray())[1]
    for v in shared_columns:
        print(f' {feature_names[v]}: {dtm[target,v]} (a) {dtm[d,v]} (b)',end=" ")
    print("\n")