In [1]:
from tokenizers import Tokenizer, models
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordLevelTrainer
import pandas as pd
from IPython.display import display, Markdown

# Toy corpus: one sentence that serves both as the training data for the
# vocabulary and as the example that gets encoded.
text_example = "alice had no idea what to do"

# Word-level tokenizer with "[UNK]" as the unknown-token fallback; input is
# split by the Whitespace pre-tokenizer before vocabulary lookup.
tokenizer = Tokenizer(models.WordLevel(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()

# Train the vocabulary on the single example sentence, registering "[UNK]"
# as a special token.
trainer = WordLevelTrainer(special_tokens=["[UNK]"])
tokenizer.train_from_iterator([text_example], trainer=trainer)

# Encode the sentence; `output.tokens` / `output.ids` are reused by later cells.
output = tokenizer.encode(text_example)

# Full learned vocabulary as a (Token, ID) table.
vocab = tokenizer.get_vocab()
vocab_df = pd.DataFrame({"Token": list(vocab.keys()), "ID": list(vocab.values())})

# One row per token of the encoded sentence, paired with its vocabulary ID.
sentence_df = pd.DataFrame(
    list(zip(output.tokens, output.ids)),
    columns=["Token", "ID"],
)

display(
    Markdown("## Sentence with token IDs"),
    Markdown("Step 1 - Construct a word based vocab from the sentence"),
    sentence_df,
)


## Sentence with token IDs

Step 1 - Construct a word based vocab from the sentence

Unnamed: 0,Token,ID
0,alice,1
1,had,3
2,no,5
3,idea,4
4,what,7
5,to,6
6,do,2


In [2]:
import pandas as pd
import torch.nn as nn
import torch
# Width of each token's embedding vector.
embedding_dim = 3

# nn.Embedding initialises its weights randomly. Fix the seed so that a
# Restart & Run All reproduces the same vectors here and in every later cell
# that reads `sequence_embedding_vectors`.
torch.manual_seed(0)

# Generate a simple random 3 dim embedding for the vocab
embedding = nn.Embedding(tokenizer.get_vocab_size(), embedding_dim)

# Look up one embedding row per token ID of the encoded sentence.
sequence_ids = torch.tensor(output.ids)
sequence_embedding_vectors = embedding(sequence_ids)

# For pandas display purposes: detach from autograd and convert to NumPy once.
seq_ev_np = sequence_embedding_vectors.detach().numpy()

display(Markdown("## Sentence with each token's random embedding vectors"))
pd.DataFrame(seq_ev_np, index=output.tokens)

## Sentence with each token's random embedding vectors

Unnamed: 0,0,1,2
alice,-0.310908,-0.362517,-0.058297
had,1.526424,-1.880766,-1.497947
no,0.08537,-0.575056,1.928669
idea,0.087615,-0.407877,-0.146862
what,-0.509899,-1.756818,-1.441058
to,0.46954,1.74016,0.301302
do,-0.077508,-0.4811,0.699782


In [21]:
# Upper bound on span length in tokens: with L = 2 we enumerate every
# single-token span and every pair of adjacent tokens.
L = 2

num_tokens = len(output.ids)

# Inclusive [start, end] token positions of each candidate span, ordered by
# start position and then by length. Spans are clipped at the sentence end.
span_word_indices = [
    [start, end]
    for start in range(num_tokens)
    for end in range(start, min(start + L, num_tokens))
]

# The slice of per-token embedding vectors that belongs to each span.
span_embeddings = [
    sequence_embedding_vectors[start:end + 1]
    for start, end in span_word_indices
]

display(Markdown("## Candidate span indices"))
display(Markdown("Produce indices covering 1 and 2 word groupings"))
span_word_indices_df = pd.DataFrame(
    {"Range": [f"{start} -> {end}" for start, end in span_word_indices]}
).rename_axis('Span #')
display(span_word_indices_df)

## Candidate span indices

Produce indices covering 1 and 2 word groupings

Unnamed: 0_level_0,Range
Span #,Unnamed: 1_level_1
0,0 -> 0
1,0 -> 1
2,1 -> 1
3,1 -> 2
4,2 -> 2
5,2 -> 3
6,3 -> 3
7,3 -> 4
8,4 -> 4
9,4 -> 5


In [23]:
# Stack every candidate span into one fixed-size tensor so the mean can be
# computed for all spans at once:
#   padded: (num_spans, L, embedding_dim) - token vectors, zero-filled past the span
#   mask:   (num_spans, L, 1)             - 1 for real token slots, 0 for padding
padded = torch.zeros(len(span_word_indices), L, embedding_dim)
mask = torch.zeros(len(span_word_indices), L, 1)

for i_seq, (i, j) in enumerate(span_word_indices):
    p = j - i + 1  # number of real tokens in this span (indices are inclusive)
    padded[i_seq, :p] = sequence_embedding_vectors[i:j + 1]
    mask[i_seq, :p] = 1

# Masked mean over the span dimension: sum only the real slots, then divide
# by how many real slots each span has (always >= 1, so no division by zero).
sums = (padded * mask).sum(dim=1)
counts = mask.sum(dim=1)
means = sums / counts


display(Markdown("## Candidate spans, with mean embeddings"))
# The text is kept flush-left on purpose: Markdown renders lines indented by
# four spaces as a preformatted code block instead of prose.
display(Markdown("""
Each span is made up of the embedding vectors of 1 or 2 tokens.
And needs to produce a mean vector of that span.

So each span is expanded to a (max_span_length, embedding_dim) matrix.
With a mask matrix used to ensure that the mean only considers unmasked embedding values

So the resulting padded and mask matrices are of shape (num_spans, max_span_length, embedding_dim) and (num_spans, max_span_length, 1) respectively.
"""))
for i_seq, (i, j) in enumerate(span_word_indices):
    display(Markdown(f"#### Span: {i_seq}"))
    p = j - i + 1

    # Row labels: the span's tokens, with any padded slot labelled 'Masked'.
    tokens = ['Masked'] * L
    tokens[:p] = output.tokens[i:j+1]
    display(pd.DataFrame(padded[i_seq].detach().numpy(), index=tokens))
    print(f"Span: '{output.tokens[i:j+1]}' - Mean Embedding: {means[i_seq].detach().numpy()}")

## Candidate spans, with mean embeddings


    Each span is made up of the embedding vectors of 1 or 2 tokens.
    And needs to produce a mean vector of that span.

    So each span is expanded to a (max_span_length, embedding_dim) matrix.
    With a mask matrix used to ensure that the mean only considers unmasked embedding values

    So the resulting padded and mask matrices are of shape (num_spans, max_span_length, embedding_dim) and (num_spans, max_span_length, 1) respectively.


#### Span: 0

Unnamed: 0,0,1,2
alice,0.35186,2.270647,0.728004
Masked,0.0,0.0,0.0


Span: '['alice']' - Mean Embedding: [0.35185966 2.270647   0.7280041 ]


#### Span: 1

Unnamed: 0,0,1,2
alice,0.35186,2.270647,0.728004
had,0.801416,0.295078,0.090017


Span: '['alice', 'had']' - Mean Embedding: [0.57663804 1.2828624  0.40901077]


#### Span: 2

Unnamed: 0,0,1,2
had,0.801416,0.295078,0.090017
Masked,0.0,0.0,0.0


Span: '['had']' - Mean Embedding: [0.8014164  0.29507786 0.09001744]


#### Span: 3

Unnamed: 0,0,1,2
had,0.801416,0.295078,0.090017
no,1.67873,-2.044677,-0.991493


Span: '['had', 'no']' - Mean Embedding: [ 1.2400734  -0.87479985 -0.4507376 ]


#### Span: 4

Unnamed: 0,0,1,2
no,1.67873,-2.044677,-0.991493
Masked,0.0,0.0,0.0


Span: '['no']' - Mean Embedding: [ 1.6787304 -2.0446775 -0.9914926]


#### Span: 5

Unnamed: 0,0,1,2
no,1.67873,-2.044677,-0.991493
idea,1.095387,-1.347433,-1.204646


Span: '['no', 'idea']' - Mean Embedding: [ 1.3870585 -1.6960552 -1.0980694]


#### Span: 6

Unnamed: 0,0,1,2
idea,1.095387,-1.347433,-1.204646
Masked,0.0,0.0,0.0


Span: '['idea']' - Mean Embedding: [ 1.0953866 -1.3474327 -1.2046462]


#### Span: 7

Unnamed: 0,0,1,2
idea,1.095387,-1.347433,-1.204646
what,-0.585366,0.170341,0.928564


Span: '['idea', 'what']' - Mean Embedding: [ 0.25501022 -0.588546   -0.13804111]


#### Span: 8

Unnamed: 0,0,1,2
what,-0.585366,0.170341,0.928564
Masked,0.0,0.0,0.0


Span: '['what']' - Mean Embedding: [-0.5853662   0.17034076  0.928564  ]


#### Span: 9

Unnamed: 0,0,1,2
what,-0.585366,0.170341,0.928564
to,-1.222529,2.873668,0.678571


Span: '['what', 'to']' - Mean Embedding: [-0.9039475  1.5220045  0.8035674]


#### Span: 10

Unnamed: 0,0,1,2
to,-1.222529,2.873668,0.678571
Masked,0.0,0.0,0.0


Span: '['to']' - Mean Embedding: [-1.2225288  2.8736682  0.6785708]


#### Span: 11

Unnamed: 0,0,1,2
to,-1.222529,2.873668,0.678571
do,1.961216,0.540735,-1.182663


Span: '['to', 'do']' - Mean Embedding: [ 0.36934376  1.7072015  -0.25204602]


#### Span: 12

Unnamed: 0,0,1,2
do,1.961216,0.540735,-1.182663
Masked,0.0,0.0,0.0


Span: '['do']' - Mean Embedding: [ 1.9612163  0.5407349 -1.1826628]
