# <center>Critical AI</center>
<center>ENGL 54.41</center>
<center>Dartmouth College</center>
<center>Winter 2026</center>
<pre>Created: 01/8/2026</pre>

## Creating Document-Term Matrix from HathiTrust Data

In [None]:
from htrc_features import FeatureReader
import numpy as np
import pandas as pd

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA

from matplotlib import pyplot as plt

In [None]:
# The following is a list of HathiTrust IDs for books. These identify
# the HTRC extracted features dataset for each text. You can find the
# ID by visiting https://www.hathitrust.org/ and searching for a book.
# You will need to click on the link for a specific volume from a 
# specific library. If you want a book that is under copyright 
# protection, you can change "Item Availability" from "Full View" to 
# "All Items" after you have searched for a book or author. The same
# process applies for finding the IDs (click on "Limited (search-only)"
# to find the ID from the URL).

# HathiTrust volume IDs, one per book, in the order they are requested
texts = [
    "inu.30000114418225",
    "mdp.39015063955069",
    "uva.x001172111",
    "mdp.39015053616556",
    "uc1.b3340190",
    "uc1.$b803019",
    "mdp.39015004998749",
    "mdp.39015048713369",
    "uc1.b4451810",
    "mdp.39015005611895",
    "mdp.39015047442747",
    "mdp.39015053574953",
    "uc1.b3340141",
    "mdp.39015016446554",
    "mdp.39015031222196",
    "uc1.$b114956",
    "pst.000005961382",
    "hvd.hxdink",
    "wu.89016088155",
    "hvd.hnmhl4",
]

In [None]:
# Build a document-term matrix by page.
#
# Every volume in `texts` contributes a long table with one row per
# (page, token, part-of-speech) and a count. The tables are stacked,
# filtered, and pivoted so that each page becomes a row and each token a
# column of `dtm_counts`.
fr = FeatureReader(ids = texts)
rows = []
for vol in fr:
    print(vol)
    tl = vol.tokenlist(section='body', case=False, pos=True, drop_section=True)
    # flatten the index into ordinary columns and normalise the column names
    # (rename silently ignores any key that is not present)
    tl = tl.reset_index().rename(columns={"token": "lowercase", 0: "count"})
    tl["volume"] = vol.id
    rows.append(tl[["volume", "page", "lowercase", "pos", "count"]])

df = pd.concat(rows, ignore_index=True)

# keep only alphabetical tokens that are longer than one character ...
is_word = df["lowercase"].str.isalpha() & (df["lowercase"].str.len() > 1)

# ... and that are tagged as nouns, adjectives, or verbs
keep_pos = {"NN", "NNS", "NNP", "NNPS", "VB", "VBD", "VBG", 
            "VBN", "VBP", "VBZ", "JJ", "JJR", "JJS"}
has_kept_pos = df["pos"].isin(keep_pos)

# take an explicit copy of the filtered rows: adding a column to a filtered
# selection otherwise triggers pandas' SettingWithCopyWarning
df = df.loc[is_word & has_kept_pos].copy()

# create page_ids of the form "<volume id>:<page number>"
df["page_id"] = df["volume"].astype(str) + ":" + df["page"].astype(str)

# one row per page_id, one column per token; counts of the same token that
# appear under different part-of-speech tags on a page are summed, and
# tokens absent from a page get 0. Rows are ordered by page_id as strings.
dtm_counts = (
    df.pivot_table(index="page_id",
                   columns="lowercase",
                   values="count",
                   aggfunc="sum",
                   fill_value=0)
    .sort_index()
)

In [None]:
# display the page-by-term count matrix (rows: page_id, columns: token)
dtm_counts

In [None]:
# strip stopwords (scikit-learn's built-in English list)
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
stop_words = ENGLISH_STOP_WORDS

# collect every vocabulary column that is a stopword, then drop them in place
cols_to_drop = [column for column in dtm_counts.columns if column in stop_words]
dtm_counts.drop(columns=cols_to_drop, inplace=True)

# remove low frequency terms: keep a column only if its total count over
# all pages reaches the threshold
a = 3 # threshold to remove rare words
term_totals = dtm_counts.sum()
frequent_enough = term_totals >= a
dtm_counts = dtm_counts.loc[:, frequent_enough]
print(f'reduced term count to {dtm_counts.shape[1]}')

In [None]:
# handy positional lookups into the matrix:
#   feature_names[j] -> term of column j, pages[i] -> page_id of row i
feature_names = list(dtm_counts.columns)
pages = list(dtm_counts.index)

In [None]:
# which pages are most similar to this page?
sample_page = 'mdp.39015005611895:45'

# get the row number of the sample page in the document-term matrix
rn = pages.index(sample_page)
print(f'row number: {rn}')

# show me the vocab: pv holds the column positions of every term with a
# nonzero count on the sample page. It is looked up through `sample_page`
# so that changing the page above changes the vocabulary as well.
pv = dtm_counts.loc[sample_page].to_numpy().nonzero()[0]
print([feature_names[f] for f in pv])

In [None]:
# create page similarity matrix: cosine similarity between every pair of
# page rows of the document-term matrix
page_vectors = dtm_counts.to_numpy()
page_similarity = cosine_similarity(page_vectors)
page_similarity.shape

In [None]:
# Now we are going to print a table of the pages most similar to our target:
# page id, cosine similarity to the target, and the vocabulary shared with
# the target (ignoring frequency).
target = rn  # row number of sample_page, so the table always matches it

target_scores = page_similarity[target]
# column positions of the terms that occur on the target page
target_vocab = dtm_counts.iloc[target].to_numpy().nonzero()[0]

# rank every page from most to least similar and remove the target itself
# explicitly; relying on it being ranked first fails when another page ties
ranked = np.argsort(target_scores)[::-1]
ranked = ranked[ranked != target][:24]

for d in ranked:
    # get intersection of vocabulary between our target page and others
    page_vocab = dtm_counts.iloc[d].to_numpy().nonzero()[0]
    shared_vocab = np.intersect1d(target_vocab, page_vocab)
    print(f'{pages[d]:25} {target_scores[d]:10f} {[feature_names[f] for f in shared_vocab]}')

## Term-Document Matrix: Representing Vocabulary from the DTM

In [None]:
# create term similarity matrix: transposing the document-term matrix makes
# every term a row, so cosine similarity now compares terms instead of pages
term_vectors = dtm_counts.transpose()
term_similarity = cosine_similarity(term_vectors)

In [None]:
# reduce to two dimensions (x,y) with principal component analysis (PCA);
# each row of term_similarity (one per term) is projected to a 2-D point,
# so pca_data can be indexed with the same positions as feature_names
pca = PCA(n_components=2)
pca_data = pca.fit_transform(term_similarity)

In [None]:
def get_term_similarity(term, top_n=24):
    """Return the terms whose page distributions are closest to *term*.

    Similarity is read from ``term_similarity``, the cosine similarity
    between rows of the transposed document-term matrix. This is going to
    have relatively poor performance on a small amount of data.

    Args:
        term: Vocabulary word to look up in ``feature_names``.
        top_n: Maximum number of neighbouring terms to return.

    Returns:
        A list of ``[neighbour, similarity]`` pairs ordered from most to
        least similar, never containing *term* itself. If *term* is not in
        the vocabulary the integer ``255`` is returned instead of a list, so
        check the result before iterating over it.
    """
    try:
        term_idx = feature_names.index(term)
    except ValueError:
        # unknown term: keep the sentinel value existing callers expect
        return 255

    # simple similar terms based on context
    similar_terms = term_similarity[term_idx]

    # rank once, from most to least similar, and remove the query term by
    # position; assuming it is ranked first breaks when a neighbour ties it
    ranked = np.argsort(similar_terms)[::-1]
    ranked = ranked[ranked != term_idx][:top_n]
    return [[feature_names[idx], similar_terms[idx]] for idx in ranked]

In [None]:
# neighbouring terms for "ebbing" (the result is 255 if the word is not in feature_names)
get_term_similarity("ebbing")

In [None]:
# neighbouring terms for "dream" (the result is 255 if the word is not in feature_names)
get_term_similarity("dream")

In [None]:
term = "dream"

# reduce data to just the closest neighboring terms
neighbors = get_term_similarity(term)
if isinstance(neighbors, int):
    # get_term_similarity signals an unknown term with an integer code
    raise ValueError(f"{term!r} is not in the vocabulary, nothing to plot")

words = [r[0] for r in neighbors]
# rows of pca_data line up with feature_names, so look each word up there
plot_data = pca_data[[feature_names.index(w) for w in words]]
xs, ys = plot_data[:, 0], plot_data[:, 1]

# select the style before the figure and axes exist; a style only affects
# artists that are created after it is applied
plt.style.use('ggplot')
fig = plt.figure(figsize=(20, 15))
plt.title("PCA of Most Similar to: " + term)
plt.scatter(xs, ys, marker = '^')
# label every point with its word, nudged a few points away from the marker
for w, x, y in zip(words, xs, ys):
    plt.annotate(w, xy = (x, y), xytext = (3, 3),
        textcoords = 'offset points', ha = 'left', va = 'top')
plt.show()