In [22]:
import os

from data import *
from utils import *
from pretrained_model import *
from transformer import *
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from torch.nn.functional import leaky_relu
import torch.nn.functional as F


In [26]:

import download_glue_data
# Only invoke the download script when the 'glue_data' directory does not
# exist yet, so re-running this cell does not call it again.
if not os.path.isdir('glue_data'):
    download_glue_data.main('')

In [None]:
# Build the project's CustomDataset from the sample text file and iterate it
# in shuffled batches of two, loading in the main process (num_workers=0).
dataset = CustomDataset('sample100000.txt')
dataloader = DataLoader(dataset, batch_size = 2, num_workers = 0, shuffle=True)

In [None]:
# Small instance of the project-local Transformer, used by the shape checks
# in the following cells.
trf =  Transformer(d_model = 100, nhead = 2, num_encoder_layers = 3, 
                   dim_feedforward = 100, dropout = .1, activation = 'lrelu')

In [None]:
# Smoke call with a random (10, 32, 100) input and a (32, 10) all-ones mask.
# Presumably the layouts are (seq, batch, d_model) and (batch, seq) — confirm
# against the project-local Transformer.
trf_output, trf_attn = trf(torch.rand((10, 32, 100)), src_key_padding_mask=torch.ones((32, 10)))

In [None]:
[l.shape for l in trf_output]

In [None]:
[l.shape for l in trf_attn]

In [None]:
# Sanity check: summing the first attention tensor over its last axis gives 1
# everywhere, i.e. it is normalised along that axis.
assert np.allclose(trf_attn[0].sum(-1).detach().numpy(),1)

In [None]:
pretrained = PretrainedModel()

In [None]:
torch.transpose(torch.rand((10, 32, 100)), 1, 0).shape

In [None]:
class Embedder(nn.Module):
    """Token-embedding lookup table mapping integer ids to ``d_model``-sized vectors."""

    def __init__(self, vocab_size, d_model):
        """Create an ``nn.Embedding`` with ``vocab_size`` rows of width ``d_model``."""
        super(Embedder, self).__init__()
        # Stored under the attribute name ``embed`` (this is also the state-dict prefix).
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)

    def forward(self, x):
        """Return the embedding vectors for the ids in ``x``."""
        looked_up = self.embed(x)
        return looked_up

In [None]:
import math
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a tensor.

    The table is stored as a buffer ``pe`` of shape ``(max_len, 1, d_model)``
    and ``forward`` slices it with ``x.size(0)``, so positions are taken along
    dimension 0 of the input (the singleton middle axis broadcasts).

    Args:
        d_model: width of the encoding (size of the last axis of the input).
        dropout: accepted for interface compatibility; no dropout is applied.
        max_len: number of positions pre-computed in the table.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        # Even columns hold the sine terms, odd columns the cosine terms.
        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one fewer odd column than even column,
        # so trim div_term to the number of odd columns to avoid a shape error.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # (max_len, d_model) -> (max_len, 1, d_model)
        pe = pe.unsqueeze(0).transpose(0, 1)
        # A buffer follows the module across devices but is not a trainable parameter.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(0)`` positions."""
        x = x + self.pe[:x.size(0), :]
        return x

In [None]:
emb = Embedder(5, 4)
# PE(emb(torch.tensor([[1, 2, 3]]))).shape

In [None]:
# Load the uncased BERT tokenizer and record its vocabulary size.
tok = BertTokenizer.from_pretrained('bert-base-uncased')

# VOCAB_SIZE is the default `vocab_size` argument of TinyBert below.
VOCAB_SIZE = vocab_size = tok.vocab_size

In [None]:
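# Gradient through a stack of residual layers (scalar sketch):
# h1 = W1 * x
# h2 = W2 * h1 + h1
# h3 = W3 * h2 + h2
# y  = W4 * h3 + h3

# Expanded: dL/dW1 = dL/dy * W4*W3*W2 * x + <lower-order cross terms> + ...

# Factored: dL/dW1 = dL/dy * (1 + W4) * (1 + W3) * (1 + W2) * x

# Keeping only the terms up to first order in the weights:
# dL/dW1 ~= dL/dy * x * (1 + W4 + W3 + W2)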

In [None]:
class TinyBert(nn.Module):
    """Small transformer encoder (student) with per-layer projections to the teacher width.

    Components built in ``__init__``:
      * ``embedder`` / ``PE``: token embedding and sinusoidal position encoding.
      * ``model``: the project-local ``Transformer`` (``d_model == dim_feedforward == emb_size``).
      * ``linear_layers``: ``num_encoder_layers + 1`` linear maps ``emb_size -> teacher_size``,
        one for the embedding output and one per element of the hidden list.
      * ``linear_output``: linear map ``emb_size -> vocab_size`` producing token logits.

    ``forward`` also stores every intermediate tensor on ``self`` for inspection.
    """

    def __init__(self, vocab_size = VOCAB_SIZE, emb_size=144, nhead = 12, num_encoder_layers = 6, teacher_size=768):
        super(TinyBert, self).__init__()
        self.emb_size = emb_size
        self.model = Transformer(
            d_model = emb_size, nhead = nhead, num_encoder_layers = num_encoder_layers,
            dim_feedforward = emb_size, dropout = .1, activation = 'lrelu')
        self.embedder = Embedder(vocab_size, emb_size)
        self.PE = PositionalEncoding(emb_size)
        self.teacher_size = teacher_size
        self.linear_layers = nn.ModuleList([nn.Linear(emb_size, teacher_size) for _ in range(num_encoder_layers + 1)])
        self.linear_output = nn.Linear(emb_size, vocab_size)

    def forward(self, src, mask=None):
        """Encode token ids and return predictions plus intermediate states.

        Args:
            src: integer token ids; dims 0 and 1 are swapped before the encoder,
                and the default mask has the same shape as ``src``.
                Presumably ``(batch, seq)`` — confirm against callers.
            mask: forwarded as ``src_key_padding_mask``; defaults to all ones.
                NOTE(review): the meaning of 1 vs 0 depends on the project-local
                Transformer — confirm.

        Returns:
            ``(output_probs, output_logits, projections, emb_and_hidden, attn)``
            where ``projections`` and ``emb_and_hidden`` are lists starting with
            the embedding output followed by the encoder's hidden states.
        """
        if mask is None:
            mask = torch.ones_like(src, dtype = float)
        self.mask = mask
        self.emb_raw = emb_raw = self.embedder(src)
        # PositionalEncoding indexes positions along dim 0, so it must be applied
        # after swapping dims 0 and 1. Applying it to the un-transposed embedding
        # would add the encoding of the *batch index* to every token of a sample.
        self.emb_transposed = emb_transposed = self.PE(torch.transpose(emb_raw, 1, 0))
        # Keep `self.emb` in the same layout as `emb_raw`, now correctly encoded.
        self.emb = torch.transpose(emb_transposed, 1, 0)
        # The encoder returns a list of hidden states and the attention maps.
        self.hidden, self.attn = hidden, attn = self.model(emb_transposed, src_key_padding_mask=mask)
        self.emb_hidden = [emb_transposed] + hidden
        # Swap dims 0 and 1 back for everything handed to the caller.
        emb_and_hidden = [torch.transpose(l, 1, 0) for l in self.emb_hidden]
        self.projections = projections = [l(embedding) for l, embedding in zip(self.linear_layers, emb_and_hidden)]
        self.output_logits = output_logits = torch.transpose(self.linear_output(hidden[-1]), 1, 0)
        self.output_probs = output_probs = F.softmax(output_logits, -1)
        return output_probs, output_logits, projections, emb_and_hidden, attn

In [None]:
tb = TinyBert()

In [None]:
# from tensorflow.python.keras.preprocessing.sequence import pad_sequences
import torch.optim as optim

class Model(nn.Module):
    """Teacher/student pair: a ``PretrainedModel`` and a ``TinyBert`` run on the same batch.

    ``forward`` dispatches on the type of the first batch element; only the
    masked-language-model path (a list of strings) is implemented here.
    """

    def __init__(self, num_encoder_layers = 7):
        super(Model, self).__init__()
        self.pretrained_model = PretrainedModel()
        self.tinybert = TinyBert(num_encoder_layers=num_encoder_layers)
        self.num_encoder_layers = num_encoder_layers
        # Stride used to subsample the teacher's per-layer outputs.
        # assuming 13 layers
        self.step = int((13-1)/(self.num_encoder_layers-1))
        self.tokenizer = self.pretrained_model.tokenizer
        # Filled by preprocess_LM: per sentence, the tokens that were replaced by '[MASK]'.
        self.y = []
        # NOTE(review): self.parameters() also covers any trainable parameters
        # registered by PretrainedModel — confirm the teacher is meant to be updated.
        self.optimizer = optim.RMSprop(self.parameters(), lr=0.01)

    def forward(self, text):
        """Dispatch a batch: list of token lists -> sentence path, list of str -> masked-LM path.

        Raises:
            ValueError: if the first element is neither a list nor a str.
        """
        first = text[0]
        if isinstance(first, list):
            return self.forward_sentence(text)
        elif isinstance(first, str):
            return self.forward_maskLM(text)
        else:
            # Format explicitly: concatenating str + list + type raised TypeError
            # and hid the intended ValueError.
            raise ValueError('wtf is this text? {!r} {!r}'.format(text, type(first)))

    def forward_sentence(self, text):
        """Placeholder for the sentence-level path, which this class never defined."""
        raise NotImplementedError('Model.forward_sentence is not implemented')

    def preprocess_LM(self, text):
        """Tokenize, randomly mask, and pad a batch of raw strings.

        For each sentence ``ceil(length / 7)`` distinct inner positions are chosen
        at random (``length`` excludes the first and last token). The chosen tokens
        are recorded in ``self.y``, replaced by ``'[MASK]'``, and the matching mask
        entries are set to 0.

        Returns:
            ``(tokenized_text, attention_mask)``, both post-padded with zeros so
            that every mask entry lines up with its token id.
        """
        self.y = []
        sentences = [build_sentence_list(
            'CLS', [self.tokenizer.tokenize(line)]) for line in text]

        # Presumably build_sentence_list adds one leading and one trailing special
        # token, hence the "- 2" — confirm against utils.
        lengths = [len(sentence) - 2 for sentence in sentences]
        all_mask_idxes = [np.random.choice(length, size=math.ceil(length/7), replace=False) for length in lengths]

        masks = [np.ones(length + 2) for length in lengths]
        for mask_idxes, mask, sentence in zip(all_mask_idxes, masks, sentences):
            targets = []
            for mask_idx in mask_idxes:
                # "+ 1" skips the leading special token.
                mask[mask_idx + 1] = 0
                targets.append(sentence[mask_idx + 1])
                sentence[mask_idx + 1] = '[MASK]'
            self.y.append(targets)
        self.attention_mask = attention_mask = to_cuda(torch.tensor(pad_sequences(masks, padding='post')))
        token_ids = [self.tokenizer.convert_tokens_to_ids(sentence) for sentence in sentences]
        # Pad on the same side as the mask: pad_sequences defaults to 'pre', which
        # shifted the ids of shorter sentences away from their mask entries.
        self.tokenized_text = tokenized_text = to_cuda(torch.tensor(
            pad_sequences(token_ids, padding='post').tolist()))
        return tokenized_text, attention_mask

    def forward_maskLM(self, text):
        """Run teacher and student on a masked batch and return all their outputs.

        Returns:
            ``(tokenized_text, attention_mask, pretrained_loss, pretrained_output,
            pretrained_hidden, pretrained_attn, tb_output, tb_logits, tb_projection,
            tb_hidden, tb_attn)``; the teacher's hidden states and attentions are
            subsampled with stride ``self.step``.
        """
        tokenized_text, attention_mask = self.preprocess_LM(text)
        # Results are also kept on `self` for interactive inspection.
        self.pretrained_loss, self.pretrained_output, self.pretrained_hidden, self.pretrained_attn = \
            pretrained_loss, pretrained_output, pretrained_hidden, pretrained_attn = self.pretrained_model(
            tokenized_text = tokenized_text, attention_mask = attention_mask)
        self.tb_output, self.tb_logits, self.tb_projection, self.tb_hidden, self.tb_attn = \
            tb_output, tb_logits, tb_projection, tb_hidden, tb_attn = \
            self.tinybert(tokenized_text, mask=attention_mask)
        # NOTE(review): both lists are sliced from index 0 with the same stride —
        # confirm this pairs teacher layers with student layers as intended.
        pretrained_hidden = pretrained_hidden[::self.step]
        pretrained_attn = pretrained_attn[::self.step]
        return (tokenized_text, attention_mask,
                pretrained_loss, pretrained_output, pretrained_hidden, pretrained_attn,
                tb_output, tb_logits, tb_projection, tb_hidden, tb_attn)
        

In [None]:
mdl = Model()

In [None]:
def loss_hidden(tb_projection, pretrained_hidden):
    """Sum of mean-squared errors over paired student projections and teacher hidden states.

    Pairs are formed with ``zip``, so extra entries in the longer list are ignored.
    """
    mse = nn.MSELoss()
    total = 0
    for student_layer, teacher_layer in zip(tb_projection, pretrained_hidden):
        total = total + mse(student_layer, teacher_layer)
    return total

def loss_attn(tb_attn, pretrained_attn):
    """Sum of mean-squared errors over paired student and teacher attention tensors.

    Pairing stops at the shorter of the two sequences.
    """
    mse = nn.MSELoss()
    # map() over two iterables stops at the shorter one, like zip().
    return sum(map(mse, tb_attn, pretrained_attn))
    
def loss_pred(pt_output, tb_logits):
    """Element-wise soft cross-entropy term ``-pt_output * log_softmax(tb_logits)``.

    The log-softmax is taken over the last axis of ``tb_logits``. The result is
    not reduced: it has the broadcast shape of the two inputs.

    Args:
        pt_output: teacher-side weights, broadcastable against ``tb_logits``.
        tb_logits: student logits.
    """
    # Pass dim explicitly: without it LogSoftmax falls back to the deprecated
    # implicit-dim rule, which normalises 3-D input over dim 0.
    log_softmax = nn.LogSoftmax(dim=-1)
    return -pt_output * log_softmax(tb_logits)
    
def loss(pt_loss, pt_output, pt_hidden, pt_attn, tb_output, tb_logits, tb_projection, tb_hidden, tb_attn):
    """Combined loss: hidden-state term plus attention term.

    ``pt_loss``, ``tb_output`` and ``tb_hidden`` are accepted but not read.
    """
    hidden_term = loss_hidden(tb_projection, pt_hidden)
    attn_term = loss_attn(tb_attn, pt_attn)
    # NOTE(review): the prediction term is evaluated but is not part of the
    # returned value — confirm whether it was meant to be added.
    pred_term = loss_pred(pt_output, tb_logits)
    combined = hidden_term + attn_term
    return combined

In [None]:
def step(mdl, text, i):
    """Run one optimisation step of ``mdl`` on ``text``.

    Clears gradients, runs the forward pass, evaluates ``loss``, back-propagates
    and applies ``mdl.optimizer``. The loss is printed when ``i`` is a multiple of 3.
    """
    mdl.zero_grad()
    (tok_txt, msk,
     pt_loss, pt_output, pt_hidden, pt_attn,
     tb_output, tb_logits, tb_projection, tb_hidden, tb_attn) = mdl(text)
    loss_val = loss(
        pt_loss, pt_output, pt_hidden, pt_attn,
        tb_output, tb_logits, tb_projection, tb_hidden, tb_attn,
    )
    loss_val.backward()
    mdl.optimizer.step()
    if not i % 3:
        print(loss_val)
    

In [None]:
# One optimizer update per DataLoader batch. The unpacked names are left at
# module scope on purpose: later cells inspect loss_val, tb_attn, pt_attn, etc.
itr = 0
for itr, text in enumerate(dataloader, start=1):
    mdl.zero_grad()
    (tok_txt, msk,
     pt_loss, pt_output, pt_hidden, pt_attn,
     tb_output, tb_logits, tb_projection, tb_hidden, tb_attn) = mdl(text)
    loss_val = loss(
        pt_loss, pt_output, pt_hidden, pt_attn,
        tb_output, tb_logits, tb_projection, tb_hidden, tb_attn,
    )
    loss_val.backward()
    mdl.optimizer.step()
    # Report every third batch.
    if not itr % 3:
        print(loss_val)

In [None]:
lossfcn = nn.MSELoss()

In [None]:
lossfcn(tb_attn[-1], pt_attn[-1])

In [None]:
tb_attn[0].shape, pt_attn[0].shape

In [None]:
# Run 10000 optimisation steps on the same `text`, i.e. whatever batch is still
# bound at module scope from an earlier cell.
for i in range(10000):
    step(mdl, text, i)

In [None]:
loss_val

In [None]:
[p.grad for p in mdl.parameters()]

In [None]:
mdl.tinybert.model.encoder.layers[0].linear1.weight.grad

In [None]:
loss_val

In [None]:
pretrained_hidden[0].shape

In [None]:
[t.shape for t in tb_projection]

In [None]:
# forward_maskLM returns an 11-tuple; unpack every element, in the order it is
# returned, so this cell no longer fails with "too many values to unpack".
(tokenized_text, attention_mask,
 pretrained_loss, pretrained_output, pretrained_hidden, pretrained_attn,
 tb_output, tb_logits, tb_projection, tb_hidden, tb_attn) = mdl(['hi there how are you'])

In [None]:
pretrained.model.embeddings(tokenized_text).shape

In [None]:
len(pretrained_attn), [h.shape for h in pretrained_attn]

In [None]:
len(pretrained_hidden), [h.shape for h in pretrained_hidden]

In [None]:
from utils import *

In [None]:
from transformers import BertTokenizer, BertForMaskedLM
import torch
# Reference run of BertForMaskedLM with hidden states and attentions enabled,
# used by the following cells to inspect the structure of `outputs`.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased',
            output_hidden_states=True,
            output_attentions=True,)
input_ids = tokenizer("Hello, my dog is cute", return_tensors="pt")["input_ids"]
# The input ids double as the labels here.
outputs = model(input_ids, labels=input_ids)

In [None]:
m2 = PretrainedModel()

In [None]:
outputs[1].shape

In [None]:
len(outputs[2])

In [None]:
len(outputs[3])

In [None]:
len(outputs)

In [None]:
# Same checkpoint as above, loaded with return_dict=True in addition to the
# hidden-state and attention outputs.
b = BertForMaskedLM.from_pretrained(
            'bert-base-uncased', return_dict=True,
            output_hidden_states=True,
            output_attentions=True,
        )

In [None]:
# Call the model on three arbitrary token ids (no labels) and show how many
# entries the result holds.
ret = b(torch.tensor([[10, 20, 30]]))
len(ret)

In [None]:
ret[0].shape

In [None]:
ret[1].shape

In [None]:
len(ret[2])

In [None]:
len(ret[3])

In [None]:
from transformers import BertTokenizer, BertForPreTraining

In [None]:
from transformers import BertModel, BertTokenizer, BertForPreTraining, BertForMaskedLM


In [None]:
# NOTE(review): `b` is the BertForMaskedLM instance created above; this looks up
# an attribute named BertForPreTrainingOutput on it. Presumably the class was
# meant to be reached through the library's modules instead — confirm or drop.
b.BertForPreTrainingOutput

In [88]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Reference run of an MRPC-finetuned BertForSequenceClassification.
# NOTE(review): the tokenizer is the *uncased* vocabulary while the checkpoint
# is a *cased* model — confirm that this pairing is intended.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-cased-finetuned-mrpc',
            output_hidden_states=True,
            output_attentions=True)

inputs = tokenizer(["hi there.[SEP] How are you?"], return_tensors="pt")
labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
outputs = model(**inputs, labels=labels)
# Do not bind the name `loss` here: it is the distillation loss function used by
# `step` and the training loop, and rebinding it breaks them on re-run.
# The first element of `outputs` is the loss tensor (see the recorded output).
seq_cls_loss = outputs[0]
# logits = outputs.logits

In [89]:
inputs

{'input_ids': tensor([[ 101, 7632, 2045, 1012,  102, 2129, 2024, 2017, 1029,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [96]:
from transformer.modeling import TinyBertForSequenceClassification


ModuleNotFoundError: No module named 'transformer.modeling'; 'transformer' is not a package

In [101]:
from transformers import BertTokenizer, BertForMultipleChoice
import torch
# Reference run of BertForMultipleChoice: one prompt paired with two candidate
# continuations, encoded together and given a leading batch axis of size 1.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMultipleChoice.from_pretrained('bert-base-uncased')
prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
choice0 = "It is eaten with a fork and a knife."
choice1 = "It is pood on three times and then thrown out the window"
labels = torch.tensor(0).unsqueeze(0)  # choice0 is correct (according to Wikipedia ;)), batch size 1
encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors='pt', padding=True)
outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels)  # batch size is 1
# the linear classifier still needs to be trained

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMultipleChoice: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForMultipleChoice from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMultipleChoice from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMultipleChoice were not initialized from the model checkpoint at bert-base-uncased and are newly

In [103]:
from transformers import BertTokenizer, TFBertForMultipleChoice
import tensorflow as tf
# TensorFlow variant of the multiple-choice demo, using the cased checkpoint
# and no labels; only the logits are kept.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = TFBertForMultipleChoice.from_pretrained('bert-base-cased')
prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
choice0 = "It is eaten with a fork and a knife."
choice1 = "It is eaten while held in the hand."
encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors='tf', padding=True)
# Add a leading batch axis of size 1 to every encoded tensor.
inputs = {k: tf.expand_dims(v, 0) for k, v in encoding.items()}
outputs = model(inputs)  # batch size is 1
# the linear classifier still needs to be trained
logits = outputs[0]

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=526681800.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-cased were not used when initializing TFBertForMultipleChoice: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertForMultipleChoice from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertForMultipleChoice from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of TFBertForMultipleChoice were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['dropout_37', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [102]:
outputs

(tensor(0.6946, grad_fn=<NllLossBackward>),
 tensor([[-0.5508, -0.5479]], grad_fn=<ViewBackward>))

In [90]:
outputs

(tensor(1.8309, grad_fn=<NllLossBackward>),
 tensor([[ 1.0453, -0.6110]], grad_fn=<AddmmBackward>),
 (tensor([[[ 0.4920,  0.1265, -0.2230,  ...,  0.0460,  0.0441, -0.1251],
           [-1.2375, -0.2857, -0.5368,  ..., -0.3587, -0.5758,  0.3240],
           [ 0.3236,  0.1171,  0.4229,  ..., -0.5042,  0.9461,  0.6759],
           ...,
           [ 0.7489, -0.9659, -0.0346,  ...,  0.6375, -0.7057, -0.6425],
           [-0.2366, -0.0164, -0.2193,  ..., -0.2918,  0.6380, -0.6107],
           [ 0.0089,  0.0821,  0.3388,  ...,  0.5492, -0.6099,  0.4879]]],
         grad_fn=<NativeLayerNormBackward>),
  tensor([[[ 0.3712, -0.0675, -0.1078,  ...,  0.0038,  0.0427, -0.1037],
           [-1.6868, -0.7524, -0.8203,  ..., -0.5361, -0.8800,  0.7087],
           [ 0.4765, -0.0767,  0.5656,  ..., -1.1559,  1.7221,  0.7663],
           ...,
           [ 1.1163, -1.3334,  0.1978,  ...,  0.2072, -0.9557, -1.1962],
           [-0.2739,  0.2377, -0.4555,  ..., -0.8546,  0.6964, -0.9835],
           [ 0.403