In [36]:
import numpy as np
from tokenizers import Tokenizer, models
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordLevelTrainer
import pandas as pd
from IPython.display import display, Markdown

# Toy input: one sentence that serves as both the training corpus and the text to encode.
text_example = "Alice had no idea what to do"

# Word-level model with a Whitespace pre-tokenizer; "[UNK]" is registered both as the
# model's unknown token and as a special token for the trainer.
trainer = WordLevelTrainer(special_tokens=["[UNK]"])
tokenizer = Tokenizer(models.WordLevel(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()
tokenizer.train_from_iterator([text_example], trainer=trainer)

# Encode the same sentence to get its tokens and their vocabulary ids.
output = tokenizer.encode(text_example)

# Learned vocabulary (token -> id mapping) as a two-column table.
vocab = tokenizer.get_vocab()
vocab_df = pd.DataFrame({"Token": list(vocab.keys()), "ID": list(vocab.values())})

# Tokens of the encoded sentence paired with their ids, in sentence order.
sentence_df = pd.DataFrame(
    list(zip(output.tokens, output.ids)),
    columns=["Token", "ID"],
)

# Render the heading, the step description and the table in that order.
display(
    Markdown("## Sentence with token IDs"),
    Markdown("Step 1 - Construct a word based vocab from the sentence"),
    sentence_df,
)


## Sentence with token IDs

Step 1 - Construct a word based vocab from the sentence

Unnamed: 0,Token,ID
0,Alice,1
1,had,3
2,no,5
3,idea,4
4,what,7
5,to,6
6,do,2


In [81]:
import pandas as pd
import torch.nn as nn
import torch
embedding_dim = 3

# Seed torch's global RNG before creating the embedding so the randomly initialised
# table (and everything derived from it) is identical after Restart Kernel -> Run All.
torch.manual_seed(0)

# Generate a simple random 3 dim embedding for the vocab
embedding = nn.Embedding(tokenizer.get_vocab_size(), embedding_dim)

# Look up one embedding vector per token id of the encoded sentence.
sequence_ids = torch.tensor(output.ids)
sequence_embedding_vectors = embedding(sequence_ids)

# For pandas display purposes: detach from autograd and convert to NumPy once,
# then reuse that array below instead of converting a second time.
seq_ev_np = sequence_embedding_vectors.detach().numpy()

display(Markdown("## Sentence with random embedding vectors"))
# One row per token of the sentence, one column per embedding dimension.
pd.DataFrame(seq_ev_np, index=output.tokens)

## Sentence with random embedding vectors

Unnamed: 0,0,1,2
Alice,1.614764,0.19099,-0.733677
had,-1.580093,-0.060611,0.430378
no,1.849383,0.404539,-1.158705
idea,0.617203,-0.03333,-0.222188
what,-0.780946,0.171008,0.555985
to,-0.824286,-0.299747,-0.251405
do,1.48474,-0.981164,0.959198


In [84]:
import numpy as np

# Max span length, i.e. 1 and 2 word spans
L = 2

num_tokens = len(output.ids)

# Enumerate every contiguous span [i, j] (both ends inclusive) of at most L tokens.
span_embeddings = []
span_word_indices = []
for i in range(num_tokens):
    # The bound uses L: the name `span_length` used here previously is not defined
    # in this notebook, so the cell raised NameError on a fresh kernel.
    for j in range(i, min(i + L, num_tokens)):
        span_embeddings.append(sequence_embedding_vectors[i:j+1])
        span_word_indices.append([i, j])

# Fixed-size buffers: every span is padded with zero rows up to L tokens, and
# `mask` holds 1 for real token rows and 0 for padding rows.
padded = torch.zeros(len(span_word_indices), L, embedding_dim)
mask = torch.zeros(len(span_word_indices), L, 1)

for i_seq, (i, j) in enumerate(span_word_indices):
    p = j - i + 1  # number of real tokens in this span
    padded[i_seq, :p] = sequence_embedding_vectors[i:j+1]
    mask[i_seq, :p] = 1

# Masked mean over the token axis: sum the real rows and divide by how many there
# are. Every span contains at least one token, so `counts` is never zero.
sums = (padded * mask).sum(dim=1)
counts = mask.sum(dim=1)
means = sums / counts

# Show each span's padded matrix (padding rows labelled 'Masked') and its mean vector.
for i_seq, (i, j) in enumerate(span_word_indices):
    display(Markdown(f"## Span: {i_seq}"))
    p = j - i + 1

    tokens = ['Masked'] * L
    tokens[:p] = output.tokens[i:j+1]
    # Convert once and reuse for the table below.
    sequence_vectors = padded[i_seq, :L].detach().numpy()
    display(pd.DataFrame(sequence_vectors, index=tokens))
    print(f"Span: '{output.tokens[i:j+1]}' - Mean Embedding: {means[i_seq].detach().numpy()}")

## Span: 0

Unnamed: 0,0,1,2
Alice,1.614764,0.19099,-0.733677
Masked,0.0,0.0,0.0


Span: '['Alice']' - Mean Embedding: [ 1.6147645   0.19099018 -0.733677  ]


## Span: 1

Unnamed: 0,0,1,2
Alice,1.614764,0.19099,-0.733677
had,-1.580093,-0.060611,0.430378


Span: '['Alice', 'had']' - Mean Embedding: [ 0.01733559  0.06518935 -0.15164974]


## Span: 2

Unnamed: 0,0,1,2
had,-1.580093,-0.060611,0.430378
Masked,0.0,0.0,0.0


Span: '['had']' - Mean Embedding: [-1.5800933  -0.06061148  0.43037754]


## Span: 3

Unnamed: 0,0,1,2
had,-1.580093,-0.060611,0.430378
no,1.849383,0.404539,-1.158705


Span: '['had', 'no']' - Mean Embedding: [ 0.13464504  0.17196378 -0.3641638 ]


## Span: 4

Unnamed: 0,0,1,2
no,1.849383,0.404539,-1.158705
Masked,0.0,0.0,0.0


Span: '['no']' - Mean Embedding: [ 1.8493834   0.40453905 -1.1587051 ]


## Span: 5

Unnamed: 0,0,1,2
no,1.849383,0.404539,-1.158705
idea,0.617203,-0.03333,-0.222188


Span: '['no', 'idea']' - Mean Embedding: [ 1.233293    0.18560454 -0.6904463 ]


## Span: 6

Unnamed: 0,0,1,2
idea,0.617203,-0.03333,-0.222188
Masked,0.0,0.0,0.0


Span: '['idea']' - Mean Embedding: [ 0.6172028  -0.03332995 -0.22218752]


## Span: 7

Unnamed: 0,0,1,2
idea,0.617203,-0.03333,-0.222188
what,-0.780946,0.171008,0.555985


Span: '['idea', 'what']' - Mean Embedding: [-0.08187145  0.06883895  0.16689888]


## Span: 8

Unnamed: 0,0,1,2
what,-0.780946,0.171008,0.555985
Masked,0.0,0.0,0.0


Span: '['what']' - Mean Embedding: [-0.7809457   0.17100784  0.5559853 ]


## Span: 9

Unnamed: 0,0,1,2
what,-0.780946,0.171008,0.555985
to,-0.824286,-0.299747,-0.251405


Span: '['what', 'to']' - Mean Embedding: [-0.8026159  -0.0643694   0.15229025]


## Span: 10

Unnamed: 0,0,1,2
to,-0.824286,-0.299747,-0.251405
Masked,0.0,0.0,0.0


Span: '['to']' - Mean Embedding: [-0.824286   -0.29974663 -0.25140476]


## Span: 11

Unnamed: 0,0,1,2
to,-0.824286,-0.299747,-0.251405
do,1.48474,-0.981164,0.959198


Span: '['to', 'do']' - Mean Embedding: [ 0.33022678 -0.6404553   0.35389674]


## Span: 12

Unnamed: 0,0,1,2
do,1.48474,-0.981164,0.959198
Masked,0.0,0.0,0.0


Span: '['do']' - Mean Embedding: [ 1.4847395  -0.981164    0.95919824]
