In [1]:
#installation
# !conda install pytorch torchvision cudatoolkit=10.2 -c pytorch
# !pip install transformers
# !pip install pytorch_transformers

In [2]:
# Tutorial copied from https://github.com/huggingface/transformers#usage

In [3]:
import pdb
import copy

import torch
from torch import nn
from transformers import BertModel, BertTokenizer
CUDA_ENABLED  = 0
from torch.utils.data import Dataset
from itertools import islice
from torch.utils.data import DataLoader
import numpy as np

def to_cuda(tensor):
    if CUDA_ENABLED:
        tensor = tensor.cuda()
    return tensor
def build_sentence_list(start_token, sentences):
    text = [start_token]
    for sentence in sentences:
        text += sentence + ['SEP']
    return text

In [4]:

class CustomDataset(Dataset):
    def __init__(self, filename, num_bunches = 100):
        self.num_bunches = num_bunches
        self.num_lines = 114180969
        self.bunch_width = self.num_lines // num_bunches
        self.filename = filename
        self.set_bunch(0)
    def set_bunch(self, bunch_idx):
        start = bunch_idx * self.bunch_width
        end = (bunch_idx + 1) * self.bunch_width
        with open(self.filename, encoding='iso-8859-1') as f:
            lines = [line[:-1] for line in islice(f, start, end)]
        self.X = lines
    def preprocess(self, text):
        return text
    def __len__(self):
        return len(self.X)
    def __getitem__(self, index):
        return self.X[index]

In [5]:
dataset = CustomDataset('enwiki-latest-pages-articles_preprocessed.txt')

#Wrap it around a dataloader
dataloader = DataLoader(dataset, batch_size = 2, num_workers = 0)

class MaskLMDataset:
    def __init__(self, dataset, dataloader):
        self.dataset = dataset
        self.dataloader = dataloader

In [6]:
itr = 0
for text, lengths_and_masks in dataloader:
    print(len(text))
    itr += 1
    if itr > 2:
        break

10
99
174


In [7]:

class PretrainedModel(nn.Module):
    def __init__(self):
        super(PretrainedModel, self).__init__()
        self.model = to_cuda(BertModel.from_pretrained(
            'bert-base-uncased',
            output_hidden_states=True,
            output_attentions=True
        ))
        self.model.eval()
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def forward(self, text = None, tokenized_text = None, attention_mask = None):
        if text is not None:
            tokenized_text = to_cuda(torch.tensor([self.tokenizer.encode(text, add_special_tokens=True)]))
        if attention_mask is None:
            attention_mask = to_cuda(torch.tensor([[1]*len(tokenized_text)]))
        all_hidden_states, all_attentions = self.model(tokenized_text, attention_mask = attention_mask)[-2:]
        return all_hidden_states
        
model = PretrainedModel()
hidden_states = model("Here is some text to encode")

len(hidden_states)


13

In [8]:
class RealLinear(nn.Module):
    def __init__(self, input_size, output_size):
        super(RealLinear, self).__init__()
        self.input_size = input_size
        self.output_size = output_size
        self.linear = nn.Linear(input_size, output_size)
        
    def forward(self, vector):
        shape = vector.shape
        vector = vector.view(-1, self.input_size)
        output = self.linear(vector)
        output = output.view(list(shape[:-1]) + [self.output_size])
        return output

In [68]:
class Attention(nn.Module):
    def __init__(self, input_size):
        super(Attention, self).__init__()
        self.input_size = input_size
        self.k, self.q, self.v = [nn.Linear(input_size, input_size) for _ in range(3)]
        self.softmax_2 = nn.Softmax(dim=2)
        
    def forward(self, vector, attention_mask = None):
        if attention_mask is None:
            print('no attn mask')
            self.attention_mask = attention_mask = to_cuda(torch.tensor(np.ones(vector.shape)))
        #vector is batch_size, words, emb
        #unsqueezed vector is batch_size, w_sent, w_att, emb
#         vector = vector * attention_mask
        self.bs, self.wds, self.emb = bs, wds, emb = vector.shape
        self.vector = vector
        self.key, self.query, self.value = key, query, value = [l(vector) for l in [self.k, self.q, self.v]]
        self.key_raw = key
        self.query = query = query.unsqueeze(2)
        self.key, self.value = key, value = key.unsqueeze(1), value.unsqueeze(1)
        self.key_raw_2 = key
        self.mask_keys = mask_keys = attention_mask.unsqueeze(1)
        mask_keys = mask_keys.repeat([1, wds, 1, 1])
        self.mask_set_to_neg_inf = mask_set_to_neg_inf = ((1-mask_keys) * -1e10) + 1
        self.query = query = query.repeat([1, 1, wds, 1])
        self.key, self.value = key, value = [v.repeat([1, wds, 1, 1]) for v in [key, value]]
        self.key_raw_3 = key
        return key
        #shape: batch_size, w_sent, w_att, emb
#         pdb.set_trace()
        self.attn_weights_raw = attn_weights_raw = key * query * mask_set_to_neg_inf
        #softmax over w_att
        #shape: batch_size, w_sent, w_att, emb
        self.attn_weights = attn_weights = self.softmax_2(attn_weights_raw)
        #multiply with value
        #shape: batch_size, w_sent, w_att, emb
        self.sampled_values = sampled_values = attn_weights * value
        #sum to output: batch_size, w_sent, emb
        self.output = output = torch.sum(sampled_values, 2)
        return output

class MultiHeadAttn(nn.Module):
    def __init__(self, num_heads = 2, input_size = 2):
        super(MultiHeadAttn, self).__init__()
        self.attn_layers = attn_layers = nn.ModuleList([Attention(input_size) for _ in range(num_heads)])
        self.linear = linear = nn.Linear(input_size*num_heads, input_size)
    
    def forward(self, vector, attention_mask = None):
        if attention_mask is None:
            print('no attn mask')
            self.attention_mask = attention_mask = to_cuda(torch.tensor(np.ones(vector.shape)))
        self.vector = vector
        self.attn_outputs = attn_outputs = [l(vector, attention_mask) for l in self.attn_layers]
        self.attn_outputs_stacked = attn_outputs_stacked = torch.cat(attn_outputs, -1)
        self.output = output = self.linear(attn_outputs_stacked)
        return output
        
class Transformer(nn.Module):
    def __init__(self, num_heads = 2, input_size = 2, norm_layer = nn.LayerNorm):
        super(Transformer, self).__init__()
        self.MHA = MultiHeadAttn(num_heads, input_size)
        self.linear = nn.Linear(input_size, input_size)
        self.norm_layer = norm_layer(input_size)
    
    def forward(self, vector, attention_mask = None):
        if attention_mask is None:
            print('no attn mask')
            self.attention_mask = attention_mask = to_cuda(torch.tensor(np.ones(vector.shape)))

        self.mha_output = mha_output = self.MHA(vector, attention_mask)
        self.mha_with_shortcut = mha_with_shortcut = mha_output + vector
        self.mha_normed = mha_normed = self.norm_layer(mha_with_shortcut * attention_mask)
        self.linear_output = linear_output = self.linear(mha_normed)
        self.linear_with_shortcut = linear_with_shortcut = linear_output + mha_normed
        self.linear_normed = linear_normed = self.norm_layer(linear_with_shortcut * attention_mask)
        return linear_normed

class TransformerChain(nn.Module):
    def __init__(self, num_heads=2, input_size = 2, num_transformers = 2):
        super(TransformerChain, self).__init__()
        self.transformers = nn.ModuleList(
            [Transformer(num_heads, input_size) for _ in range(num_transformers)]
        )
        
    def forward(self, vector, attention_mask = None):
        if attention_mask is None:
            print('no attn mask')
            self.attention_mask = attention_mask = to_cuda(torch.tensor(np.ones(vector.shape)))
        self.vectors = []
        for trf in self.transformers:
            self.vector = vector = trf(vector, attention_mask)
            self.vectors.append(vector)
        return vector
    

In [69]:
mdl = Attention(sizes)
tensor = torch.Tensor([np.arange(sizes*sizes).reshape(sizes, sizes)])
mask = torch.ones_like(tensor)
mask[0,1,:] = 0
mask3 =mdl(tensor, mask)
tensor[0,1,:] = 0
mask4 =mdl2(tensor, mask)

RuntimeError: size mismatch, m1: [3 x 3], m2: [10 x 10] at ..\aten\src\TH/generic/THTensorMath.cpp:41

In [63]:
for model in [Attention]:
    sizes = 3
    mdl = model(sizes)
    mdl2 = copy.deepcopy(mdl)
    tensor = torch.Tensor([np.arange(sizes*sizes).reshape(sizes, sizes)])
    base = mdl(tensor)
#     mask = torch.ones(list(tensor.shape[:-1]) + [1])
    mask = torch.ones_like(tensor)
    mask[0,1,:] = 0
    mask1 =mdl(tensor, mask)
#     mask = torch.ones(list(tensor.shape[:-1]) + [1])
    mask = torch.ones_like(tensor)
    mask[0,0,:] = 0
    mask2 =mdl(tensor, mask)
    
    assert not np.array_equal(base.detach().numpy(),mask1.detach().numpy()) and \
        not np.array_equal(base.detach().numpy(),mask2.detach().numpy()) and \
        not np.array_equal(mask1.detach().numpy(),mask2.detach().numpy())
    
    tensor = torch.Tensor([np.arange(sizes*sizes).reshape(sizes, sizes)])
#     mask = torch.ones(list(tensor.shape[:-1]) + [1])
    mask = torch.ones_like(tensor)
    mask[0,1,:] = 0
    mask3 =mdl(tensor, mask)
    tensor[0,1,:] = 0
    mask4 =mdl2(tensor, mask)
    mask3[0,1,:] = 0
    mask4[0,1,:] = 0
    assert np.allclose(mask3.detach().numpy(),mask4.detach().numpy())

no attn mask


AssertionError: 

In [57]:
for model in [TransformerChain]:
    sizes = 10
    mdl = model(sizes, sizes)
    mdl2 = copy.deepcopy(mdl)
    tensor = torch.Tensor([np.arange(sizes*sizes).reshape(sizes, sizes)])
    base = mdl(tensor)
    mask = torch.ones(list(tensor.shape[:-1]) + [1])
    mask[0,1] = 0
    mask1 =mdl(tensor, mask)
    mask = torch.ones(list(tensor.shape[:-1]) + [1])
    mask[0,0] = 0
    mask2 =mdl(tensor, mask)
    
    assert not np.array_equal(base.detach().numpy(),mask1.detach().numpy()) and \
        not np.array_equal(base.detach().numpy(),mask2.detach().numpy()) and \
        not np.array_equal(mask1.detach().numpy(),mask2.detach().numpy())
    
    tensor = torch.Tensor([np.arange(sizes*sizes).reshape(sizes, sizes)])
    mask = torch.ones(list(tensor.shape[:-1]) + [1])
    mask[0,1] = 0
    mask3 =mdl(tensor, mask)
    tensor[0,1] = 0
    mask4 =mdl2(tensor, mask)
    mask3[0,1] = 0
    mask4[0,1] = 0
    assert np.allclose(mask3.detach().numpy(),mask4.detach().numpy())

no attn mask


RuntimeError: Expected object of scalar type Double but got scalar type Float for argument #3 'mat2' in call to _th_addmm_out

In [26]:
mask3[0,0,0], mask4[0,0,0]

(tensor(-1.8673, grad_fn=<SelectBackward>),
 tensor(-1.8817, grad_fn=<SelectBackward>))

In [17]:
for model in [TransformerChain, MultiHeadAttn, Transformer, Attention]:
    mdl = model()
    mdl2 = copy.deepcopy(mdl)
    tensor = torch.Tensor([[[1, 2], [4, 5]]]*4)
    base = mdl(tensor)
    mask = torch.ones(list(tensor.shape[:-1]) + [1])
    mask[0,1] = 0
    mask1 =mdl(tensor, mask)
    mask = torch.ones(list(tensor.shape[:-1]) + [1])
    mask[0,0] = 0
    mask2 =mdl(tensor, mask)
    
    assert not np.array_equal(base.detach().numpy(),mask1.detach().numpy()) and \
        not np.array_equal(base.detach().numpy(),mask2.detach().numpy()) and \
        not np.array_equal(mask1.detach().numpy(),mask2.detach().numpy())
    
    tensor = torch.Tensor([[[1, 2], [4, 5]]]*4)
    mask = torch.ones(list(tensor.shape[:-1]) + [1])
    mask[0,1] = 0
    mask3 =mdl(tensor, mask)
    tensor[0,1] = 0
    mask4 =mdl2(tensor, mask)
    mask3[0,1] = 0
    mask4[0,1] = 0
    assert np.allclose(mask3.detach().numpy(),mask4.detach().numpy())

no attn mask
no attn mask


AssertionError: 

In [18]:
model

__main__.MultiHeadAttn

In [19]:
mask3.detach().numpy(),mask4.detach().numpy()

(array([[[ 0.90996057, -0.9055404 ],
         [ 0.        ,  0.        ]],
 
        [[ 0.90996057, -0.9055404 ],
         [ 0.87828135, -1.4151497 ]],
 
        [[ 0.90996057, -0.9055404 ],
         [ 0.87828135, -1.4151497 ]],
 
        [[ 0.90996057, -0.9055404 ],
         [ 0.87828135, -1.4151497 ]]], dtype=float32),
 array([[[ 0.35096622,  0.30438673],
         [ 0.        ,  0.        ]],
 
        [[ 0.90996057, -0.9055404 ],
         [ 0.87828135, -1.4151497 ]],
 
        [[ 0.90996057, -0.9055404 ],
         [ 0.87828135, -1.4151497 ]],
 
        [[ 0.90996057, -0.9055404 ],
         [ 0.87828135, -1.4151497 ]]], dtype=float32))

In [20]:
mdl2.vector.detach().numpy()[0,0],\
 mdl.vector.detach().numpy()[0,0]

(array([1., 2.], dtype=float32), array([1., 2.], dtype=float32))

In [21]:
mdl2.attn_outputs[0].detach().numpy()[0,0],\
 mdl.attn_outputs[0].detach().numpy()[0,0]

(array([0.24015349, 1.2389604 ], dtype=float32),
 array([-0.35128397,  3.6968734 ], dtype=float32))

In [22]:
mask3.detach().numpy()[0,0],\
mask4.detach().numpy()[0,0]

(array([ 0.90996057, -0.9055404 ], dtype=float32),
 array([0.35096622, 0.30438673], dtype=float32))

In [None]:
lin = nn.Linear(3,4)

In [None]:
lin(torch.Tensor([[[1, 2, 3], [4, 5, 6]]] * 2)).shape

In [None]:
torch.Tensor([[[1, 2, 3], [4, 5, 6]]] * 2).shape

In [None]:

class TinyBert(nn.Module):
    def __init__(self, num_heads=3, input_size = 3, num_transformers = 2):
        super(TinyBert, self).__init__()
        self.Transformers = TransformerChain(num_heads, input_size, num_transformers)
    def forward(self, tokenized_text, attention_mask = None):
        if attention_mask is None:
            print('no attn mask')
            attention_mask = to_cuda(torch.tensor(np.ones_like(tokenized_text)))


In [None]:

[p for p in mdl.transformers[0].MHA.attn_layers[0].k.parameters()]


In [None]:

[p for p in mdl2.transformers[0].MHA.attn_layers[0].k.parameters()]


In [None]:

mdl2.transformers[0].MHA.attn_layers[0].output[1], \
 mdl.transformers[0].MHA.attn_layers[0].output[1]

In [None]:

mdl2.transformers[0].MHA.attn_layers[0].key[1,:,1], \
 mdl.transformers[0].MHA.attn_layers[0].key[1,:,1]

In [None]:

mdl2.transformers[0].MHA.attn_layers[0].key_raw[1,1]

In [None]:

mdl.transformers[0].MHA.attn_layers[0].key_raw[1,1]

In [None]:
# mdl

# mdl2.vectors[-1]

# mdl

# [p for p in mdl.transformers[0].MHA.attn_layers[0].k.parameters()]

# [p for p in mdl2.transformers[0].MHA.attn_layers[0].k.parameters()]

# mdl2.transformers[0].MHA.attn_layers[0].k

# mdl.transformers[0].MHA.attn_layers[0].vector

# mdl2.transformers[0].MHA.attn_layers[0].vector

# mdl.transformers[0].MHA.attn_layers[0].key

# mdl2.transformers[0].MHA.attn_layers[0].key

# mdl2.transformers[0].linear_normed


#         self.mha_output = mha_output = self.MHA(vector, attention_mask)
#         self.mha_with_shortcut = mha_with_shortcut = mha_output + vector
#         self.mha_normed = mha_normed = self.norm_layer(mha_with_shortcut * attention_mask)
#         self.linear_output = linear_output = self.linear(mha_normed)
#         self.linear_with_shortcut = linear_with_shortcut = linear_output + mha_normed
#         self.linear_normed = 

In [None]:
mask3

In [None]:
mask4

In [None]:
trfChain = TransformerChain(input_size = 3)
trfChain(torch.Tensor([[[1, 2, 3], [4, 5, 6]]] * 4)).shape

In [None]:
MHA = MultiHeadAttn(2, 3)
MHA(torch.Tensor([[[1, 2, 3], [4, 5, 6]]] * 4)).shape

In [None]:
trf = Transformer(2, 3)
trf(torch.Tensor([[[1, 2, 3], [4, 5, 6]]] * 4)).shape

In [None]:
attn = Attention(3)
attn(torch.Tensor([[[1, 2, 3], [4, 5, 6]]] * 4))

In [None]:
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.pretrained_model = PretrainedModel()
        self.tinybert = TinyBert()
        self.tokenizer = self.pretrained_model.tokenizer
        self.y = []

    def forward(self, text):
        if isinstance(text[0], list):
            return self.forward_sentence(text)
        elif isinstance(text[0], str):
            return self.forward_maskLM(text)
        else:
            raise ValueError('wtf is this text?' + text + type(text[0]))
    def forward_maskLM(self, text):
        self.y = []
        sentences = [build_sentence_list(
            'CLS', [self.tokenizer.tokenize(line)]) for line in text]
        
        lengths = [len(sentence) - 2 for sentence in sentences]
        mask_idxes = [np.random.randint(0, length) for length in lengths]
        
        masks = [np.ones(length + 2) for length in lengths]
        for mask_idx, mask, sentence in zip(mask_idxes, masks, sentences):
            mask[mask_idx + 1] = 0
            self.y.append(sentence[mask_idx + 1])
            sentence[mask_idx + 1] = '[MASK]'
        attention_mask = to_cuda(torch.tensor(pad_sequences(masks, padding='post')))
        tokenized_text = to_cuda(torch.tensor(pad_sequences([
            self.tokenizer.convert_tokens_to_ids(sentence) for sentence in sentences]).tolist()))
        pretrained_hidden = self.pretrained_model(
            tokenized_text = tokenized_text, attention_mask = attention_mask)
        
        return pretrained_hidden
        

In [None]:
mdl = Model()

pretrained_hidden = mdl(['hi there', 'how are you my. name is lee and i am writing code'])


In [None]:
len('how are you my . name is lee and i am writing code'.split(' '))

In [None]:
pretrained_hidden[0].shape

In [None]:
k, q, v = [torch.Tensor([1, 2, 3]) for _ in range(3)]

In [None]:
input: 'hey, wanna get some bubbles?'
    
numbers = tinybert(input)
food_prediction = model(numbers)
serve ads for boba

In [None]:
-> train tinybert model
-> train model from embeddings for <task>
-> evaluate performance at <task> (train/validation)
loop back to beggining until performance good

In [None]:
len(pretrained_hidden)

In [None]:
pretrained_hidden[4].shape

In [None]:
import matplotlib.pyplot as plt
plt.hist(pretrained_hidden[0].reshape(-1).detach().numpy(), bins = 100)

In [None]:
mdl(['hi there'])[0][0][:3,0]