## NOTE
CyVerse kept freezing during processing, to the point where it became unusable, so I decided to work on a subset of the original dataset for now. 

Please comment out the line `df = df[0:2500]` to run on the full dataset.

2. Read the texts provided with this assignment (link TBD). Tokenize them using the tokenizer corresponding to the transformer chosen in the previous step. Note that the resulting tokens are sub-word tokens that may not correspond to a full word.
3. Generate the contextualized embeddings for all the tokens in the dataset and compute the average embedding for each token in the vocabulary by averaging all its contextualized embeddings.

In [1]:
import random
import torch
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

# enable tqdm in pandas
tqdm.pandas()

# toggle: True requests the GPU, which is only used when CUDA is actually available
use_gpu = True

# resolve the compute device once; later cells move the model and tensors onto it
if use_gpu and torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'device: {device.type}')

# fixed seed for reproducibility; set to None to leave the RNGs unseeded
seed = 2024

# seed the Python, NumPy and PyTorch generators in that order
if seed is not None:
    print(f'random seed: {seed}')
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

device: cuda
random seed: 2024


In [2]:
file = 'assignment4-dataset.txt'

# Read the whole file into memory as one string.
# The encoding is pinned so decoding does not depend on the platform's default
# locale (assumes the dataset is UTF-8 encoded — TODO confirm).
with open(file, 'r', encoding='utf-8') as f:
    text = f.read()

In [3]:
len(text)

300935040

In [4]:
# a blank line (two consecutive newlines) separates the chunks of the raw text
split_text = text.split(sep='\n\n')

In [5]:
len(split_text)

449919

In [6]:
# one row per chunk of text, held in a single 'text' column
df = pd.DataFrame({'text': split_text})

In [7]:
df['text']

0         The White Monkey is a 1925 American silent dra...
1         Preservation\nAn incomplete print of The White...
2                                            External links
3         Films based on works by John Galsworthy\nFilms...
4          the montane grasslands and shrublands biome\n...
                                ...                        
449914                                           References
449915                                    External links\n 
449916    1929 births\n2015 deaths\nPeople from Naseby, ...
449917    Scottish female golfers\nGolfers from Edinburg...
449918    1868 births\n1947 deaths\nGerman Assyriologist...
Name: text, Length: 449919, dtype: object

In [8]:
# Keep only the first 2500 rows so the rest of the notebook runs on a subset.
# Comment out the next line to use the entire dataset.
df = df[0:2500]

In [9]:
# Load model directly
from transformers import AutoTokenizer, AutoModel, AutoConfig

# Hugging Face Hub identifier shared by the tokenizer, the model and (later) the config
transformer_name = "Tejas3/distillbert_base_uncased_80_equal"

# fast tokenizer requested explicitly via use_fast=True
tokenizer = AutoTokenizer.from_pretrained(transformer_name, use_fast=True)

# load the weights and place the model on the selected device in one step
model = AutoModel.from_pretrained(transformer_name).to(device)

In [10]:
from collections import defaultdict
from datasets import Dataset, DatasetDict

# wrap the DataFrame in a Dataset (stored under the 'text' key) so it can be mapped over
ds = DatasetDict({'text': Dataset.from_pandas(df)})
ds

DatasetDict({
    text: Dataset({
        features: ['text'],
        num_rows: 2500
    })
})

In [11]:
def tokenize(batch):
    """Tokenize the 'text' entries of `batch` with the notebook's tokenizer.

    Padding and truncation are both enabled and PyTorch tensors are requested.
    Returns whatever the tokenizer call returns.
    """
    encoded = tokenizer(
        batch['text'],
        padding=True,
        truncation=True,
        return_tensors='pt',
    )
    return encoded

In [13]:
# tokenize the 'text' split batch-wise; the tokenizer outputs become new columns
tokens_ds = ds['text'].map(function=tokenize, batched=True)

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

In [156]:
type(tokens_ds)

datasets.arrow_dataset.Dataset

In [14]:
tokens_ds.to_pandas()

Unnamed: 0,text,input_ids,attention_mask
0,The White Monkey is a 1925 American silent dra...,"[101, 1996, 2317, 10608, 2003, 1037, 4849, 213...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,Preservation\nAn incomplete print of The White...,"[101, 8347, 2019, 12958, 6140, 1997, 1996, 231...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,External links,"[101, 6327, 6971, 102, 0, 0, 0, 0, 0, 0, 0, 0,...","[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,Films based on works by John Galsworthy\nFilms...,"[101, 3152, 2241, 2006, 2573, 2011, 2198, 1489...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,the montane grasslands and shrublands biome\n...,"[101, 1996, 21704, 26183, 1998, 15751, 8653, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
...,...,...,...
2495,First implemented in Chicago and New York City...,"[101, 2034, 7528, 1999, 3190, 1998, 2047, 2259...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2496,Smart zoning is a broad term that consists of ...,"[101, 6047, 27462, 2003, 1037, 5041, 2744, 200...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2497,"Planned unit development is cluster zoning, bu...","[101, 3740, 3131, 2458, 2003, 9324, 27462, 101...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2498,Amendments to zoning regulations may be subjec...,"[101, 16051, 2000, 27462, 7040, 2089, 2022, 33...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [38]:
# tokenize one short sample and line up tokens, word ids and input ids for inspection
output = tokenizer("Scottish female golfers\nGolfers from Edinburg")
rows = {
    'tokens': output.tokens(),
    'word_ids': output.word_ids(),
    'input_ids': output.input_ids,
}
pd.DataFrame(list(rows.values()), index=list(rows.keys()))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
tokens,[CLS],scottish,female,golfer,##s,golfer,##s,from,ed,##in,##burg,[SEP]
word_ids,,0,1,2,2,3,3,4,5,5,5,
input_ids,101,4104,2931,20601,2015,20601,2015,2013,3968,2378,4645,102


The resulting tokens are sub-word tokens, so a token may not correspond to a full word.

Generate the contextualized embeddings for all the tokens, then
compute the average embedding for EACH token in the vocabulary by averaging ALL of its contextualized embeddings.  

In [98]:
# single-row sample (presumably for quick experiments; no later visible cell uses it)
df_test = df.iloc[:1]

# wrap the sample in a Dataset under the 'text' key, mirroring `ds`
ds_test = DatasetDict({'text': Dataset.from_pandas(df_test)})
ds_test

DatasetDict({
    text: Dataset({
        features: ['text'],
        num_rows: 1
    })
})

In [131]:
# the model config supplies hidden_size, the length of each embedding vector
config = AutoConfig.from_pretrained(transformer_name)

# accumulators to populate now and use later for averaging:
#   emb_dict   maps token -> running sum of its embeddings (starts as zeros on `device`)
#   count_dict maps token -> number of embeddings added to that sum
emb_dict = defaultdict(lambda: torch.zeros(config.hidden_size, device=device))
count_dict = defaultdict(int)


def process(batch):
    """Accumulate contextualized embeddings for every token in `batch`.

    Tokenizes ``batch['text']``, runs the model without gradient tracking and,
    for each non-padding token, adds its last-layer hidden state to the
    module-level ``emb_dict`` and increments ``count_dict``.

    Every row of the encoded batch is handled, so the function works both when
    ``batch['text']`` is a single string (non-batched ``map``) and when it is a
    list of strings (``batched=True``). Positions whose attention mask is 0
    (padding) are skipped so they do not pollute the averages.

    Args:
        batch: mapping with a 'text' entry holding the text(s) to embed.

    Returns:
        None. Results are communicated only through ``emb_dict`` and
        ``count_dict``.

    Note:
        ``truncation=True`` is passed to the tokenizer, so any tokens cut off
        by truncation are never counted.
    """
    tokenized_batch = tokenizer(
        batch['text'], padding=True, truncation=True, return_tensors='pt'
    ).to(device)

    with torch.no_grad():
        output = model(**tokenized_batch)

    # one hidden-state vector per (row, position) of the encoded batch
    hidden_state = output.last_hidden_state

    # True for real tokens, False for padding added to equalize row lengths
    keep_mask = tokenized_batch['attention_mask'].bool()

    for row_ids, row_mask, row_states in zip(tokenized_batch['input_ids'], keep_mask, hidden_state):
        kept_states = row_states[row_mask]
        # map ids to their vocabulary strings in one call instead of decoding one id at a time
        tokens = tokenizer.convert_ids_to_tokens(row_ids[row_mask].tolist())

        for token, emb in zip(tokens, kept_states):
            emb_dict[token] += emb
            count_dict[token] += 1
        

In [134]:
# Run `process` over the dataset. No `batched` argument is passed, so `process`
# is applied with map's default (per-example) behaviour.
# The call is made for its side effects: `process` fills `emb_dict` and `count_dict`.
ds.map(process)

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

DatasetDict({
    text: Dataset({
        features: ['text'],
        num_rows: 2500
    })
})

In [141]:
# average embedding per token: accumulated sum divided by the number of occurrences
averaged_embeddings = {
    token: emb_sum / count_dict[token]
    for token, emb_sum in emb_dict.items()
}


In [142]:
len(averaged_embeddings)

18984

In [150]:
averaged_embeddings['bank'][:20]

tensor([-0.4808, -0.0273,  0.1218, -0.0136,  1.7032,  0.4736, -0.7008,  0.6053,
         0.3015,  0.3568,  0.4151, -0.4070, -0.0592, -0.2390, -0.8317,  0.0172,
         0.2599,  0.3683,  0.6968,  0.0497], device='cuda:0')

In [151]:
averaged_embeddings['##bank'][:20]

tensor([-0.4516, -0.0497,  0.4426, -0.0465,  1.9454,  0.3642, -0.7740,  0.1438,
         0.0791,  0.2405,  0.4439, -0.5630, -0.0471, -0.3022, -0.7502,  0.0564,
         0.2107,  0.1833,  0.2419, -0.0805], device='cuda:0')

In [152]:
averaged_embeddings['a'][:20]

tensor([-1.1211,  0.4314,  0.2021,  0.0670,  0.8984,  0.0598,  0.1892,  0.2609,
        -0.0384,  0.1633, -0.0507, -0.6512, -0.5906, -0.3929, -0.1859,  0.5465,
         0.4533, -0.3638,  0.1692,  0.2941], device='cuda:0')

In [153]:
averaged_embeddings['an'][:20]

tensor([-0.9624,  0.5725,  0.1105, -0.0946,  0.7890,  0.0748, -0.0712,  0.1090,
         0.2990, -0.1849,  0.1319, -0.5593, -0.6327, -0.6839, -0.0825,  0.7745,
         0.3140, -0.3574,  0.1573,  0.3212], device='cuda:0')

In [154]:
averaged_embeddings['what'][:20]

tensor([-0.8990,  0.6184,  0.4240, -0.5523,  0.6293, -0.0699,  0.2768,  0.7017,
         0.1377,  0.0261,  0.2182, -0.6507, -0.7590, -0.3453, -0.7088,  0.6377,
         0.5229, -0.5063,  0.0275,  0.3943], device='cuda:0')