In [43]:
import os

import numpy as np
import torch.nn as nn
from utils import CUDA_ENABLED, to_cuda, build_sentence_list
from data import CustomDataset
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
#from pretrained_model import *
from transformers import BertModel, BertTokenizer, BertForMaskedLM
from transformer import Transformer
import math
#from transformer import *
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
#from torch.nn.functional import leaky_relu
import torch.nn.functional as F
import torch


In [2]:
#What is this? Why do we even use glue?  We will use this later but it's not in use yet

# import download_glue_data
# if not os.path.isdir('glue_data'):
#     download_glue_data.main('')

In [18]:
#Wikipedia sample data
dataset = CustomDataset('wikisample100000.txt')
dataloader = DataLoader(dataset, batch_size = 1, num_workers = 0, shuffle=True)
for x in dataloader: #Test that our dataloader worked fine
    print(x)
    break

['The set of those translates partitions the circle into a countable collection of disjoint sets, which are all pairwise congruent.']


In [19]:
pretrained = to_cuda(BertForMaskedLM.from_pretrained(
            'bert-base-uncased',
            output_hidden_states=True,
            output_attentions=True))
#PretrainedModel() 



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMaskedLM were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
class PositionalEncoding(nn.Module): #Keep this, removed dropout cause didn't think he needed it
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        #self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return x

In [21]:
tok = BertTokenizer.from_pretrained('bert-base-uncased')
VOCAB_SIZE = vocab_size = tok.vocab_size

In [33]:

class TinyBert(nn.Module):
    def __init__(self, vocab_size = VOCAB_SIZE, emb_size=144, nhead = 12, num_encoder_layers = 6, teacher_size=768):
        super().__init__()
        self.emb_size = emb_size
        self.model = Transformer(
            d_model = emb_size, nhead = nhead, num_encoder_layers = num_encoder_layers, 
            dim_feedforward = emb_size, dropout = .1, activation = 'lrelu')
        self.embedder = nn.Embedding(vocab_size, emb_size)
        self.PE = PositionalEncoding(emb_size)
        self.teacher_size = teacher_size
        self.linear_layers = nn.ModuleList([nn.Linear(emb_size, teacher_size) for _ in range(num_encoder_layers + 1)])
        self.linear_output = nn.Linear(emb_size, vocab_size)
    def forward(self, src, mask=None):
        if mask is None:
            mask = torch.ones_like(src, dtype = float)
        #reshaping cus trf module is stupid
        self.mask = mask
        self.emb_raw = emb_raw = self.embedder(src)
        self.emb = emb = self.PE(emb_raw)
        self.emb_transposed = emb_transposed = torch.transpose(emb, 1, 0)
        self.hidden, self.attn = hidden, attn = self.model(emb_transposed, src_key_padding_mask=mask)
        self.emb_hidden = [emb_transposed] + hidden
        emb_and_hidden = [torch.transpose(l, 1, 0) for l in self.emb_hidden]
        self.projections = projections = [l(embedding) for l, embedding in zip(self.linear_layers, emb_and_hidden)]
        self.output_logits = output_logits = torch.transpose(self.linear_output(hidden[-1]), 1, 0)
        self.output_probs = output_probs = F.softmax(output_logits, -1)
        return output_probs, output_logits, projections, emb_and_hidden, attn

In [38]:
class Bert_Distiller(nn.Module): #Bert Distiller we will use to train tiny bert
    def __init__(self, num_encoder_layers=7):

        super().__init__()
        self.pretrained_model = to_cuda(BertForMaskedLM.from_pretrained('bert-base-uncased',output_hidden_states=True,output_attentions=True))
        self.tinybert = TinyBert(num_encoder_layers=num_encoder_layers)
        self.num_encoder_layers = num_encoder_layers
        # assuming 13 layers
        self.step = int((13-1)/(self.num_encoder_layers-1))
#         self.tinybert = Transformer(d_model = 100, nhead = 2, num_encoder_layers = 3, 
#                    dim_feedforward = 100, dropout = .1, activation = 'lrelu')
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.y = []
        self.optimizer = optim.RMSprop(self.parameters(), lr=0.01)

    def forward(self, text):
        if isinstance(text[0], list):
            return self.forward_sentence(text)
        elif isinstance(text[0], str):
            return self.forward_maskLM(text)
        else:
            raise ValueError('Invalid Text: ' + text + type(text[0]))
            
    def preprocess_LM(self, text):
        self.y = []
        sentences = [build_sentence_list(
            'CLS', [self.tokenizer.tokenize(line)]) for line in text]
        
        lengths = [len(sentence) - 2 for sentence in sentences]
        mask_idxes = [np.random.choice(length, size=math.ceil(length/7), replace=False) for length in lengths]
        
        masks = [np.ones(length + 2) for length in lengths]
        for mask_idxes, mask, sentence in zip(mask_idxes, masks, sentences):
            self.y.append([])
            for mask_idx in mask_idxes:
                mask[mask_idx + 1] = 0
                self.y[-1].append(sentence[mask_idx + 1])
                sentence[mask_idx + 1] = '[MASK]'
        self.attention_mask = attention_mask = to_cuda(torch.tensor(pad_sequences(masks, padding='post')))
        self.tokenized_text = tokenized_text = to_cuda(torch.tensor(pad_sequences([
            self.tokenizer.convert_tokens_to_ids(sentence) for sentence in sentences]).tolist()))
        return tokenized_text, attention_mask

    def forward_maskLM(self, text):
        tokenized_text, attention_mask = self.preprocess_LM(text)
        self.pretrained_loss, self.pretrained_output, self.pretrained_hidden, self.pretrained_attn = \
            pretrained_loss, pretrained_output, pretrained_hidden, pretrained_attn = self.pretrained_model(
            tokenized_text = tokenized_text, attention_mask = attention_mask)
        self.tb_output, self.tb_logits, self.tb_projection, self.tb_hidden, self.tb_attn = \
            tb_output, tb_logits, tb_projection, tb_hidden, tb_attn = \
            self.tinybert(tokenized_text, mask=attention_mask)
        # self.tb_out_masked = tb_out_masked = tb_out * attention_mask.transpose(1, 0).unsqueeze(-1)
        pretrained_hidden = pretrained_hidden[::self.step]
        pretrained_attn = pretrained_attn[::self.step]
        return (tokenized_text, attention_mask,
                pretrained_loss, pretrained_output, pretrained_hidden, pretrained_attn, 
                tb_output, tb_logits, tb_projection, tb_hidden, tb_attn)
        

In [36]:
tb = TinyBert()


In [39]:
mdl = Bert_Distiller()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMaskedLM were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [40]:
#Put these in a model somewhere make them self. 
#Self.mseloss = nn.mseloss
def loss_hidden(tb_projection, pretrained_hidden):
    lossfcn = nn.MSELoss()
    return sum([lossfcn(t, p) for t, p in zip(tb_projection, pretrained_hidden)])

def loss_attn(tb_attn, pretrained_attn):
    lossfcn = nn.MSELoss()
    return sum([lossfcn(t, p) for t, p in zip(tb_attn, pretrained_attn)])
    
def loss_pred(pt_output, tb_logits):
    m = nn.LogSoftmax()
    return -pt_output * m(tb_logits)
    
def loss(pt_loss, pt_output, pt_hidden, pt_attn, tb_output, tb_logits, tb_projection, tb_hidden, tb_attn):
    L_hid = loss_hidden(tb_projection, pt_hidden)
    L_attn = loss_attn(tb_attn, pt_attn)
    L_pred = loss_pred(pt_output, tb_logits)
    return L_hid + L_attn

In [41]:
def step(mdl, text, i): #Plug in model?  #Maybe keep this outside the model
    mdl.zero_grad()
    tok_txt, msk, pt_loss, pt_output, pt_hidden, pt_attn, tb_output, tb_logits, tb_projection, tb_hidden, tb_attn = mdl(text)
    loss_val = loss(pt_loss, pt_output, pt_hidden, pt_attn, tb_output, tb_logits, tb_projection, tb_hidden, tb_attn)
    loss_val.backward()
    mdl.optimizer.step()
    if i % 3 == 0:
        print(loss_val)
    

In [48]:
##Run the model a couple times and view loss every three times

itr = 0
for text in dataloader:
    itr += 1
    mdl.zero_grad()
    tok_txt, msk, pt_loss, pt_output, pt_hidden, pt_attn, tb_output, tb_logits, tb_projection, tb_hidden, tb_attn = mdl(text)
    loss_val = loss(pt_loss, pt_output, pt_hidden, pt_attn, tb_output, tb_logits, tb_projection, tb_hidden, tb_attn)
    loss_val.backward()
    mdl.optimizer.step()
    if itr % 3 == 0:
        print(loss_val)

AssertionError: Unexpected keyword arguments: ['tokenized_text'].