In [1]:
# CSC396 - Intro to Deep Learning w/ NLP
# Jose Santiago Campa Morales
# November 23, 2025
# Assignment #4 -- Transformer

import random
import torch
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, AutoModel

# Hook tqdm into pandas so progress bars are available on pandas operations.
tqdm.pandas()

# Flip to False to force CPU execution even when a GPU is present.
use_gpu = True

# Pick CUDA only when it is both requested and actually available.
if use_gpu and torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'device: {device.type}')

# Seed shared by every RNG below; set to None to skip seeding entirely.
seed = 1234

if seed is not None:
    print(f'random seed: {seed}')
    # Seed Python's, NumPy's and PyTorch's generators in turn.
    for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
        seed_rng(seed)

device: cuda
random seed: 1234


In [2]:
# Load the corpus: one sentence per line, whitespace-trimmed, blank lines dropped.
with open("assignment4-dataset.txt", "rt", encoding="utf-8") as f:
    sentences = [text for text in (raw.strip() for raw in f) if text]

print("Loaded", len(sentences), "sentences.")

# Keep only the leading SAMPLE_SIZE sentences to shorten the run.
SAMPLE_SIZE = 1_000_000  # 1M of the ~4M sentences in the file
sentences = sentences[:SAMPLE_SIZE]
print(f"Using {len(sentences)} sentences for testing")

Loaded 3980290 sentences.
Using 1000000 sentences for testing


In [3]:
# Pretrained checkpoint: DistilBERT (uncased), loaded through the Auto*
# classes as shown on the model's official page.
MODEL_NAME = "distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Load the encoder weights, then move the module onto the selected device.
model = AutoModel.from_pretrained(MODEL_NAME)
model = model.to(device)

In [4]:
# Switch the model to evaluation mode (turns off the Dropout layers listed in
# the repr below) before extracting embeddings; the call returns the model,
# which is why the notebook displays its structure.
model.eval()

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): DistilBertSdpaAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): L

In [5]:
# Id of the padding token; used later to drop padded positions.
pad_id = tokenizer.pad_token_id

# Lookup list: position i holds the token string of vocabulary id i.
vocab_ids = list(range(tokenizer.vocab_size))
id_to_tok = tokenizer.convert_ids_to_tokens(vocab_ids)

# Running per-token accumulators, keyed by token id.
token_sums = dict()    # token id -> sum of hidden vectors
token_counts = dict()  # token id -> number of occurrences

In [6]:
# Accumulate, for every token id seen in `sentences`, the sum of its
# contextual hidden vectors and the number of times it occurred.
# Results are written into the global dicts `token_sums` / `token_counts`.
batch_size = 128  # sentences per forward pass; raised for better GPU utilization
max_len = 64      # longer sentences are truncated to this many tokens

# Start from empty accumulators so that re-running this cell does not stack a
# second pass on top of the first one (which would double every count).
token_sums.clear()
token_counts.clear()

for i in tqdm(range(0, len(sentences), batch_size), desc="Embedding"):
    batch = sentences[i:i+batch_size]

    # Tokenize the batch, padding to the longest sentence in it.
    enc = tokenizer(
        batch,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=max_len
    ).to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        hidden = model(**enc).last_hidden_state  # [B, T, D]

    input_ids = enc["input_ids"]
    B, T, D = hidden.shape

    # Flatten batch: one row per token position.
    flat_ids = input_ids.reshape(-1)
    flat_vecs = hidden.reshape(-1, D)

    # Drop padded positions so they do not contribute to any average.
    mask = flat_ids != pad_id
    flat_ids = flat_ids[mask]
    flat_vecs = flat_vecs[mask]

    # -----------------------
    # VECTORIZED AGGREGATION
    # -----------------------
    # `inverse` maps every position to the row of its token id in `unique_ids`,
    # so index_add_ sums all vectors that share a token id in one call.
    unique_ids, inverse = torch.unique(flat_ids, return_inverse=True)
    # Match the hidden-state dtype: index_add_ requires source and destination
    # to have the same dtype.
    sums = torch.zeros(
        (len(unique_ids), D),
        device=flat_vecs.device,
        dtype=flat_vecs.dtype,
    )
    sums.index_add_(0, inverse, flat_vecs)
    counts = torch.bincount(inverse)

    # Move the per-batch results to CPU before merging into the global dicts.
    unique_ids = unique_ids.cpu()
    sums = sums.cpu()
    counts = counts.cpu()

    for tok_id, vec_sum, count in zip(unique_ids.tolist(), sums, counts.tolist()):
        if tok_id in token_sums:
            token_sums[tok_id] += vec_sum
            token_counts[tok_id] += count
        else:
            # Rows of `sums` are views into the batch tensor; clone so the
            # stored accumulator owns its memory.
            token_sums[tok_id] = vec_sum.clone()
            token_counts[tok_id] = count

Embedding:   0%|          | 0/7813 [00:00<?, ?it/s]

In [7]:
# Turn the accumulated sums into one mean vector per token, keyed by the
# token's string form and stored as a NumPy array.
token_static_embeddings = {
    id_to_tok[tid]: (total / token_counts[tid]).numpy()
    for tid, total in token_sums.items()
}

print("Number of tokens:", len(token_static_embeddings))

Number of tokens: 28591


In [8]:
# The embeddings are a plain dict, so np.save stores them as a pickled 0-d
# object array; read them back with
#   np.load(path, allow_pickle=True).item()
# NOTE(review): the file name says "roberta" although the vectors come from
# MODEL_NAME ("distilbert-base-uncased") — confirm the name is intentional.
embeddings_path = "static_embeddings_roberta.npy"
np.save(embeddings_path, token_static_embeddings)
print(f"Saved to {embeddings_path}")

Saved to static_embeddings_roberta.npy


In [9]:
# Spot-check the averaged vectors for a handful of tokens.
sample_tokens = ['the', 'hello', 'world', 'computer', 'science']

for token in sample_tokens:
    # Tokens without a stored embedding are skipped silently.
    if token not in token_static_embeddings:
        continue

    embedding = token_static_embeddings[token]
    print(f"Token: '{token}'")
    print(f"  Embedding shape: {embedding.shape}")
    print(f"  First 10 dimensions: {embedding[:10]}")
    # Counts are keyed by token id, so map the string back to its id first.
    seen = token_counts[tokenizer.convert_tokens_to_ids(token)]
    print(f"  Count (how many times seen): {seen}")
    print()

Token: 'the'
  Embedding shape: (768,)
  First 10 dimensions: [-0.5182941  -0.2055869  -0.23243168 -0.04096534  0.138746   -0.04299089
  0.14637561  0.48362467 -0.13441496  0.02707357]
  Count (how many times seen): 767752

Token: 'hello'
  Embedding shape: (768,)
  First 10 dimensions: [-0.33900633  0.05078349  0.5843242  -0.2721656   0.17722043 -0.2888496
  0.21743204  0.2900156  -0.5144144  -0.25633046]
  Count (how many times seen): 86

Token: 'world'
  Embedding shape: (768,)
  First 10 dimensions: [ 0.10564563 -0.01599088  0.22037983 -0.2593901   0.29943195 -0.24501027
  0.5150999   0.6359968  -0.3580607   0.06160359]
  Count (how many times seen): 12616

Token: 'computer'
  Embedding shape: (768,)
  First 10 dimensions: [-0.5075014   0.40926838  0.06141642  0.08055346  1.0685854  -0.23621342
 -0.45720547  0.42538086 -0.2385968   0.04511508]
  Count (how many times seen): 1153

Token: 'science'
  Embedding shape: (768,)
  First 10 dimensions: [-0.3540344   0.4305366  -0.25918427 

In [10]:
# gensim is only needed from this cell on, so it is imported at first use.
from gensim.models import KeyedVectors

# Read the GloVe file through gensim's word2vec text loader; no_header=True
# is passed because the file has no leading "count dim" header line
# (presumably — verify against the gensim documentation).
fname = "glove.6B.300d-vocabulary.txt"
glove = KeyedVectors.load_word2vec_format(fname, no_header=True)
# Display the shape of the loaded matrix. The recorded output is (400000, 0):
# 400000 entries with zero-width vectors, consistent with the next cell's
# description of this file as "just words, one per line".
# NOTE(review): `glove` therefore carries no usable vectors, and the next cell
# reads the same file directly — confirm whether this cell is still needed.
glove.vectors.shape

(400000, 0)

In [11]:
# Read the GloVe vocabulary file: one word per line, blank lines skipped.
with open("glove.6B.300d-vocabulary.txt", "r", encoding="utf-8") as f:
    glove_words = [entry for entry in map(str.strip, f) if entry]

print(f"Loaded {len(glove_words)} words from GloVe vocabulary")

# word -> mean of the static embeddings of its sub-word tokens
word_embeddings = {}
# words for which none of the tokens has a static embedding
missing_words = []

for word in tqdm(glove_words, desc="Computing word embeddings"):
    # Split the word into the tokenizer's sub-word pieces and keep only the
    # pieces for which an averaged embedding exists.
    pieces = tokenizer.tokenize(word)
    piece_vecs = [
        token_static_embeddings[piece]
        for piece in pieces
        if piece in token_static_embeddings
    ]

    # No usable piece at all: record the word as missing and move on.
    if not piece_vecs:
        missing_words.append(word)
        continue

    # The word vector is the element-wise mean over its available pieces.
    word_embeddings[word] = np.mean(piece_vecs, axis=0)

print(f"Created embeddings for {len(word_embeddings)} words")
print(f"Missing {len(missing_words)} words (tokens not in vocabulary)")

Loaded 400000 words from GloVe vocabulary


Computing word embeddings:   0%|          | 0/400000 [00:00<?, ?it/s]

Created embeddings for 399890 words
Missing 110 words (tokens not in vocabulary)


In [12]:
# Function to find most similar words
from numpy.linalg import norm

def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between two 1-D vectors.

    Args:
        vec1: first vector (array-like of numbers).
        vec2: second vector, same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (||vec1|| * ||vec2||)``. If either vector has
        zero norm the cosine is undefined; ``0.0`` is returned in that case
        instead of dividing by zero and producing NaN.
    """
    denom = norm(vec1) * norm(vec2)
    # Guard the degenerate case: a NaN score would also corrupt any later
    # ranking that sorts on the similarity value.
    if denom == 0:
        return 0.0
    return np.dot(vec1, vec2) / denom

def most_similar_word(target_word, topn=10):
    """Return the ``topn`` words whose embeddings are closest to ``target_word``.

    Closeness is measured with ``cosine_similarity`` against every other entry
    of the global ``word_embeddings`` dict.

    Args:
        target_word: word to look up in ``word_embeddings``.
        topn: maximum number of neighbours to return; values <= 0 yield an
            empty list.

    Returns:
        A list of ``(word, similarity)`` tuples ordered from most to least
        similar. If ``target_word`` has no embedding, a message is printed
        and an empty list is returned.
    """
    # Local import keeps the cell self-contained.
    import heapq

    if target_word not in word_embeddings:
        print(f"'{target_word}' not in vocabulary")
        return []

    target_vec = word_embeddings[target_word]

    # Lazily score every other word; nothing is materialised up front.
    scored = (
        (word, cosine_similarity(target_vec, embedding))
        for word, embedding in word_embeddings.items()
        if word != target_word
    )

    # Selecting the top-n with a heap avoids sorting the whole vocabulary
    # while giving the same order as sorted(..., reverse=True)[:topn].
    return heapq.nlargest(topn, scored, key=lambda pair: pair[1])

# Probe words for a quick qualitative check of the word embeddings.
test_words = ['cactus', 'cake', 'angry', 'quickly', 'between', 'the']

for test_word in test_words:
    print(f"\nMost similar to '{test_word}':")
    # Print each neighbour with its cosine similarity to four decimals.
    for word, sim in most_similar_word(test_word, topn=10):
        print(f"  {word}: {sim:.4f}")



Most similar to 'cactus':
  cactuses: 0.8830
  coneflower: 0.7930
  spineflower: 0.7802
  flowerheads: 0.7751
  monkeyflower: 0.7714
  coneflowers: 0.7633
  lemongrass: 0.7622
  flowerbed: 0.7604
  flowery: 0.7591
  crabgrass: 0.7590

Most similar to 'cake':
  cakebread: 0.9212
  cakey: 0.9050
  cakelike: 0.9020
  caked: 0.8939
  cakes: 0.8891
  cakewalk: 0.8803
  cheesecake: 0.8345
  pastry: 0.8221
  cheesecakes: 0.8093
  shortcake: 0.8080

Most similar to 'angry':
  furious: 0.9081
  enraged: 0.9028
  anger: 0.9015
  angered: 0.8845
  angering: 0