# <center>Critical AI</center>
<center>ENGL 54.41</center>
<center>Dartmouth College</center>
<center>Winter 2026</center>
<pre>Created: 01/24/2026; Updated: 01/29/2026</pre>

In [None]:
import torch
import pandas as pd

from transformers import CLIPTokenizer, CLIPTextModel
import plotly.express as px
import plotly.io as pio
import matplotlib as mpl
from sklearn.manifold import TSNE

In [None]:
# Pick the fastest backend available for running our neural networks,
# checked in this order:
#   mps  -> Apple Silicon GPU (M-series MacBooks)
#   cuda -> Nvidia GPU via the Compute Unified Device Architecture toolkit
#   cpu  -> fallback: just the general-purpose CPU

mps_backend = getattr(torch.backends, "mps", None)
if mps_backend is not None and mps_backend.is_available():
    device_name = 'mps'
elif torch.cuda.is_available():
    device_name = 'cuda'
else:
    device_name = 'cpu'

device = torch.device(device_name)
print('Using device: {0}'.format(device))

In [None]:
# Load the CLIP tokenizer and text encoder from the same pretrained
# checkpoint. The weights are loaded in half precision (float16) and placed
# on the `device` selected earlier in this notebook, so the model lives on
# the same device that the tokenized inputs are later moved to.
CLIP_CHECKPOINT = "openai/clip-vit-large-patch14"

tokenizer = CLIPTokenizer.from_pretrained(CLIP_CHECKPOINT)
text_encoder = CLIPTextModel.from_pretrained(CLIP_CHECKPOINT,
                                             dtype=torch.float16,
                                             device_map=device)

In [None]:
# let's view the architecture of this network. It has a vocabulary equal to the size
# of the token_embedding. The position_embedding dimensions tell us how many tokens can be 
# supplied as input to the network.
# Calling eval() switches the model to evaluation (inference) mode; because it is
# the last expression in the cell, the notebook also prints the module's layers.
text_encoder.eval()

In [None]:
# pandas can read a CSV straight from a URL, so there is no need to download
# the sample file first.
WIT_SAMPLE_URL = 'https://raw.githubusercontent.com/jeddobson/ENGL54.41-26W/refs/heads/main/data/wit_v1.train.all-1percent_sample-5k.csv'
df = pd.read_csv(WIT_SAMPLE_URL)

In [None]:
# let's look at a few sample rows from this dataset
# (head() shows the first rows of the dataframe along with its column names):
df.head()

In [None]:
# How are the languages distributed in this sample? Count the rows per
# language code, then draw the counts as a wide bar chart.
language_counts = df['language'].value_counts()
language_counts.plot(kind='bar',
                     figsize=(20, 5),
                     title='Languages in 5k Sample of WIT 1%')

In [None]:
# Keep only the rows that are in English AND actually have alt-text, then
# pull out the alt-text caption column for those rows.
is_english = df['language'] == "en"
has_caption = df['caption_alt_text_description'].notna()
en_captions = df.loc[is_english & has_caption, 'caption_alt_text_description']

In [None]:
# The 'is_main_image' column records whether an image is the main image of
# its article. Count each value and show the proportions as a pie chart.
main_image_counts = df['is_main_image'].value_counts()
main_image_counts.plot(kind='pie',
                       autopct='%1.1f%%',
                       title="Main Article Image")

In [None]:
# Some classic NLP first: a bag-of-words count of the English captions.
from sklearn.feature_extraction.text import CountVectorizer

# Build the document-term matrix (one row per caption, one column per word).
vec = CountVectorizer()
dtm = vec.fit_transform(en_captions)

# Summing down the rows gives the total count of each word across all captions.
vocab_sums = dtm.sum(axis=0)

# Map every vocabulary term to its total count. vec.vocabulary_ maps each
# term to its column index in the document-term matrix.
freq_dict = {}
for term, column in vec.vocabulary_.items():
    freq_dict[term] = vocab_sums[0, column]

In [None]:
# Turn the term -> count dictionary into a one-column dataframe (terms become
# the index), then chart the 25 highest counts as a bar chart.
v_dv = pd.DataFrame.from_dict(freq_dict, orient='index', columns=["frequency"])
top_terms = v_dv.sort_values(by="frequency", ascending=False).head(25)
top_terms.plot(kind='bar', title="Captions: Word Frequency")

In [None]:
# Now get the actual encoding used by CLIP. This will return vectors that are
# 768 values in length. This is the neural representation of the inputs. We'll
# be discussing Transformers in more detail next week, but for now it will be
# helpful to know that we are taking only the first 77 tokens (these are subword
# units) and encoding these into 768 floating point numbers.

# Tokenize every caption. Shorter captions are padded so the batch is
# rectangular; captions longer than 77 tokens are truncated.
inputs = tokenizer(en_captions.tolist(),
                   padding=True,
                   truncation=True,
                   max_length=77,
                   return_tensors="pt")

# Move the inputs to the device the model's weights are on, so the two can
# never end up on different devices.
inputs = inputs.to(text_encoder.device)

# Inference only: no gradients are needed.
with torch.no_grad():
    outputs = text_encoder(**inputs)

# Average the per-token vectors into one vector per caption. The attention
# mask is 1 for real tokens and 0 for padding; weighting by it keeps the
# padding positions out of the average, so a caption's vector does not depend
# on how long the other captions in the batch are. The arithmetic is done in
# float32 to avoid accumulating half-precision rounding error.
token_mask = inputs["attention_mask"].unsqueeze(-1).to(torch.float32)
token_vectors = outputs.last_hidden_state.to(torch.float32)
summed = (token_vectors * token_mask).sum(dim=1)
token_counts = token_mask.sum(dim=1).clamp(min=1)  # guard against divide-by-zero
pooled_output = (summed / token_counts).to('cpu')
print(pooled_output.shape)

In [None]:
import sys

# Project the caption embeddings down to two dimensions with t-SNE so they
# can be drawn as a scatter plot. random_state is fixed so the layout is
# reproducible from run to run.
tsne = TSNE(n_components=2, perplexity=2, max_iter=1000, random_state=42)

# scikit-learn works on NumPy arrays, so hand it an explicit float32 array
# on the CPU rather than a torch tensor.
embeddings_2d = tsne.fit_transform(pooled_output.detach().cpu().float().numpy())

# Only switch plotly to the Colab renderer when actually running in Google
# Colab; elsewhere (e.g. a local Jupyter session) plotly's own default
# renderer is left in place so the figure still displays.
if "google.colab" in sys.modules:
    pio.renderers.default = "colab"

# One row per caption: its 2-D coordinates plus the caption text for hovering.
vis = pd.DataFrame({
    'TSNE Component 1': embeddings_2d[:, 0],
    'TSNE Component 2': embeddings_2d[:, 1],
    'Captions': en_captions,
})
fig = px.scatter(vis, x='TSNE Component 1',
                 y='TSNE Component 2',
                 hover_name='Captions',
                 hover_data=['Captions'],
                 title="t-SNE Projection of Text Embeddings from CLIP")
fig.update_traces(mode="markers")
fig.show()