In [1]:
import os
# List the Python modules in the working directory. Matching on the suffix
# (rather than the substring '.py') keeps out files such as 'x.pyc' or
# 'x.py.bak' that merely contain '.py' somewhere in their name.
[f for f in os.listdir() if f.endswith('.py')]

['data.py', 'pretrained_model.py', 'transformer.py', 'utils.py']

In [2]:
from data import *
from utils import *
from pretrained_model import *
from transformer import *
from tensorflow.python.keras.preprocessing.sequence import pad_sequences


In [3]:
# Build the project's CustomDataset from the preprocessed text file
# (file name suggests an English Wikipedia dump -- TODO confirm).
dataset = CustomDataset('enwiki-latest-pages-articles_preprocessed.txt')

# Wrap the dataset in a DataLoader: batches of 2, loaded in the main process
# (num_workers = 0). A later cell re-creates both names with batch_size = 8.
dataloader = DataLoader(dataset, batch_size = 2, num_workers = 0)

In [4]:
from torch.nn.functional import leaky_relu


In [5]:
# Small stand-alone instance of the project's Transformer (presumably the one
# star-imported from transformer.py) used for the shape check in the next cell.
trf =  Transformer(d_model = 100, nhead = 2, num_encoder_layers = 3, 
                   dim_feedforward = 100, dropout = .1, activation = 'lrelu')

In [6]:
# Shape smoke test: a (10, 32, 100) input together with a (32, 10) key-padding
# mask comes back with the input's shape (see the recorded output below).
trf(torch.rand((10, 32, 100)), src_key_padding_mask=torch.ones((32, 10))).shape

torch.Size([10, 32, 100])

In [7]:
pretrained = PretrainedModel()

In [8]:
torch.transpose(torch.rand((10, 32, 100)), 1, 0).shape

torch.Size([32, 10, 100])

In [9]:
class Embedder(nn.Module):
    """Token-id to vector lookup: a thin wrapper around one ``nn.Embedding``."""

    def __init__(self, vocab_size, d_model):
        """Create a table of ``vocab_size`` learnable rows, each ``d_model`` wide."""
        super(Embedder, self).__init__()
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)

    def forward(self, x):
        """Return the embedding rows selected by the integer ids in ``x``."""
        token_vectors = self.embed(x)
        return token_vectors

In [10]:
import math
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a sequence-first tensor.

    A table ``pe`` of shape ``(max_len, 1, d_model)`` is precomputed and stored
    as a buffer (not a parameter). Even feature columns hold sines and odd
    columns hold cosines of ``position * div_term``.

    Args:
        d_model: Size of the feature dimension. May be even or odd.
        dropout: Accepted for signature compatibility only; no dropout is
            applied by this module.
        max_len: Number of positions to precompute.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # For odd d_model there is one fewer odd column than even columns, so
        # trim div_term to match; for even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # Insert a broadcastable middle axis: (max_len, d_model) -> (max_len, 1, d_model).
        self.register_buffer('pe', pe.unsqueeze(1))

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(0)`` positions.

        The table is sliced along dim 0 of ``x``, so dim 0 must be the
        sequence axis; the encoding broadcasts over dim 1.
        """
        return x + self.pe[:x.size(0), :]

In [11]:
emb = Embedder(5, 4)
# PE(emb(torch.tensor([[1, 2, 3]]))).shape

In [12]:
# Load the tokenizer published under the name 'bert-base-uncased'.
tok = BertTokenizer.from_pretrained('bert-base-uncased')

# The tokenizer's vocabulary size; VOCAB_SIZE is the default `vocab_size`
# of TinyBert defined in a later cell.
VOCAB_SIZE = vocab_size = tok.vocab_size

In [67]:
class TinyBert(nn.Module):
    """Small Transformer encoder over token ids.

    Pipeline: ``Embedder`` -> transpose to sequence-first -> ``PositionalEncoding``
    -> the project's ``Transformer``.

    Args:
        vocab_size: Number of rows in the embedding table.
        emb_size: Embedding width; also used as the Transformer's ``d_model``
            and ``dim_feedforward``.
        nhead: Number of attention heads passed to the Transformer.
        num_encoder_layers: Number of encoder layers passed to the Transformer.
    """

    def __init__(self, vocab_size = VOCAB_SIZE, emb_size=144, nhead = 12, num_encoder_layers = 6):
        super(TinyBert, self).__init__()
        self.emb_size = emb_size
        self.model = Transformer(
            d_model = emb_size, nhead = nhead, num_encoder_layers = num_encoder_layers,
            dim_feedforward = emb_size, dropout = .1, activation = 'lrelu')
        self.embedder = Embedder(vocab_size, emb_size)
        self.PE = PositionalEncoding(emb_size)

    def forward(self, src, mask=None):
        """Encode a batch of token ids.

        Args:
            src: Integer token ids, batch-first (the caller in this notebook
                passes a ``(batch, seq)`` tensor).
            mask: Forwarded unchanged as ``src_key_padding_mask``; defaults to
                an all-ones tensor shaped like ``src``.
                NOTE(review): how the project's Transformer interprets this
                mask (1 = keep vs. 1 = pad) is not visible here -- confirm.

        Returns:
            The Transformer output for the sequence-first input.

        Intermediate tensors are kept on ``self`` for inspection.
        """
        if mask is None:
            mask = torch.ones_like(src, dtype = float)
        self.mask = mask
        self.emb_raw = emb_raw = self.embedder(src)
        # PositionalEncoding slices its table along dim 0, so it must see the
        # sequence axis first. Adding positions to the batch-first tensor would
        # give every token of batch row b the encoding of "position b".
        self.emb_transposed = emb_transposed = self.PE(torch.transpose(emb_raw, 1, 0))
        # Batch-first view of the position-encoded embeddings (same layout as emb_raw).
        self.emb = torch.transpose(emb_transposed, 1, 0)
        self.trf_output = trf_output = self.model(emb_transposed, src_key_padding_mask=mask)
        return trf_output

In [68]:
tb = TinyBert()

In [69]:
# tb(torch.tensor([[1, 2, 3]]))

In [70]:
??np.random.choice

In [48]:
# from tensorflow.python.keras.preprocessing.sequence import pad_sequences
import torch.optim as optim

class Model(nn.Module):
    """Runs the pretrained model and TinyBert side by side on masked text.

    Attributes set in ``__init__``:
        pretrained_model: ``PretrainedModel()`` instance; also supplies the tokenizer.
        tinybert: ``TinyBert()`` instance with default arguments.
        tokenizer: ``pretrained_model.tokenizer``.
        y: Per-sentence lists of the original tokens that were replaced by
            ``'[MASK]'`` in the most recent ``preprocess_LM`` call.
        optimizer: RMSprop over ``self.parameters()``.
    """

    def __init__(self):
        super(Model, self).__init__()
        self.pretrained_model = PretrainedModel()
        self.tinybert = TinyBert()
        self.tokenizer = self.pretrained_model.tokenizer
        self.y = []
        # NOTE(review): self.parameters() covers every registered submodule, so
        # any parameters exposed by pretrained_model are optimized too -- confirm
        # that this is intended.
        self.optimizer = optim.RMSprop(self.parameters(), lr=0.01)

    def forward(self, text):
        """Dispatch on the type of the first element of ``text``.

        Args:
            text: A batch whose items are either lists or strings.

        Returns:
            Whatever the selected ``forward_*`` method returns.

        Raises:
            ValueError: If ``text[0]`` is neither a list nor a str.
        """
        if isinstance(text[0], list):
            # NOTE(review): forward_sentence is not defined in this class as
            # shown; this branch raises AttributeError unless it is added elsewhere.
            return self.forward_sentence(text)
        elif isinstance(text[0], str):
            return self.forward_maskLM(text)
        else:
            # Use formatting rather than `+`: concatenating str with a list and a
            # type object raised TypeError and hid the intended ValueError.
            raise ValueError(f'wtf is this text? {text!r} {type(text[0])}')

    def preprocess_LM(self, text):
        """Tokenize ``text``, randomly mask tokens, and build padded tensors.

        For each sentence, ``ceil(length / 7)`` distinct inner positions are
        replaced by ``'[MASK]'``; the replaced tokens are recorded in ``self.y``
        and those positions are set to 0 in the attention mask.

        Args:
            text: Iterable of strings.

        Returns:
            ``(tokenized_text, attention_mask)``: two tensors padded at the end
            to the longest sentence in the batch, both passed through ``to_cuda``.
        """
        self.y = []
        # Assumes build_sentence_list returns a mutable token list with exactly
        # one extra token at each end (hence the -2 / +1 offsets below) -- TODO confirm.
        sentences = [build_sentence_list(
            'CLS', [self.tokenizer.tokenize(line)]) for line in text]

        lengths = [len(sentence) - 2 for sentence in sentences]
        all_mask_idxes = [np.random.choice(length, size=math.ceil(length/7), replace=False) for length in lengths]

        masks = [np.ones(length + 2) for length in lengths]
        for mask_idxes, mask, sentence in zip(all_mask_idxes, masks, sentences):
            targets = []
            for mask_idx in mask_idxes:
                # +1 skips the leading special token.
                mask[mask_idx + 1] = 0
                targets.append(sentence[mask_idx + 1])
                sentence[mask_idx + 1] = '[MASK]'
            self.y.append(targets)

        token_ids = [self.tokenizer.convert_tokens_to_ids(sentence) for sentence in sentences]
        # Both tensors must be padded on the same side. pad_sequences pads at the
        # front by default, which misaligned the ids with the post-padded mask.
        self.attention_mask = attention_mask = to_cuda(torch.tensor(pad_sequences(masks, padding='post')))
        self.tokenized_text = tokenized_text = to_cuda(torch.tensor(
            pad_sequences(token_ids, padding='post').tolist()))
        return tokenized_text, attention_mask

    def forward_maskLM(self, text):
        """Run both models on a masked version of ``text``.

        Returns:
            ``(tokenized_text, attention_mask, tb_out_masked, pretrained_hidden,
            pretrained_attn)``, where ``tb_out_masked`` is the TinyBert output
            multiplied by the transposed attention mask.
        """
        tokenized_text, attention_mask = self.preprocess_LM(text)
        self.pretrained_hidden, self.pretrained_attn = pretrained_hidden, pretrained_attn = self.pretrained_model(
            tokenized_text = tokenized_text, attention_mask = attention_mask)
        self.tb_out = tb_out = self.tinybert(tokenized_text, mask=attention_mask)
        # The mask is transposed and given a trailing axis so it broadcasts
        # against tb_out. NOTE(review): this zeroes the outputs at the [MASK]
        # and padding positions -- confirm that is the intended selection.
        self.tb_out_masked = tb_out_masked = tb_out * attention_mask.transpose(1, 0).unsqueeze(-1)
        return tokenized_text, attention_mask, tb_out_masked, pretrained_hidden, pretrained_attn
        

In [49]:

mdl = Model()

In [20]:
# Re-create the dataset from the same preprocessed text file; this rebinds the
# `dataset` / `dataloader` names defined in an earlier cell.
dataset = CustomDataset('enwiki-latest-pages-articles_preprocessed.txt')

# Wrap it in a DataLoader: shuffled batches of 8, loaded in the main process.
dataloader = DataLoader(dataset, batch_size = 8, num_workers = 0, shuffle=True)


In [50]:
# Peek at a single batch: print its size and contents, then stop.
# `itr` ends up as the number of batches consumed (1, or 0 if the loader is empty).
itr = 0
for itr, text in enumerate(dataloader, start=1):
    print(len(text), text)
    break

8 ['To axiomatize a system of knowledge is to show that its claims can be derived from a small, well-understood set of sentences (the axioms), and there may be multiple ways to axiomatize a given mathematical domain.', 'The subsidized air company Air Tahiti Nui brings tourists from France, Los Angeles, Japan and China.', 'Uzzayanâ\x80\x99s cult in particular was widespread in south Arabia, and in Qataban, she was invoked as a guardian of the final royal palace.', 'Recommended dosing interval is 4â\x80\x936 hours.', 'In the aftermath of the incident, Steve and Sam plan to keep what happened at Pleasant Hill under wraps for the time being.', 'This area is the traditional homeland of the Tlingit, and home of a historic settling of Haida as well as a modern settlement of Tsimshian.', 'This combination of lifelong military experience and monetary incentives resulted in a cohesive, well-disciplined military.', 'Many dynasties have their own specific adaptation of Nusach Sefard; some, such as

In [51]:

len([v.shape for v in mdl(['hi', 'strawberries'])[0]])

2

In [52]:
# The cells below inspect `hid` and `attn`, so they must be defined here rather
# than left over from an earlier kernel state. The model's forward pass returns
# five values; the last two are the pretrained model's hidden states and attentions.
hid, attn = mdl(['hi', 'strawberries'])[-2:]

In [53]:
len(hid), len(attn)

(13, 12)

In [54]:
[a.shape for a in attn]

[torch.Size([2, 12, 4, 4]),
 torch.Size([2, 12, 4, 4]),
 torch.Size([2, 12, 4, 4]),
 torch.Size([2, 12, 4, 4]),
 torch.Size([2, 12, 4, 4]),
 torch.Size([2, 12, 4, 4]),
 torch.Size([2, 12, 4, 4]),
 torch.Size([2, 12, 4, 4]),
 torch.Size([2, 12, 4, 4]),
 torch.Size([2, 12, 4, 4]),
 torch.Size([2, 12, 4, 4]),
 torch.Size([2, 12, 4, 4])]

In [55]:
[h.shape for h in hid]

[torch.Size([2, 4, 768]),
 torch.Size([2, 4, 768]),
 torch.Size([2, 4, 768]),
 torch.Size([2, 4, 768]),
 torch.Size([2, 4, 768]),
 torch.Size([2, 4, 768]),
 torch.Size([2, 4, 768]),
 torch.Size([2, 4, 768]),
 torch.Size([2, 4, 768]),
 torch.Size([2, 4, 768]),
 torch.Size([2, 4, 768]),
 torch.Size([2, 4, 768]),
 torch.Size([2, 4, 768])]

In [61]:
tokenized_text, attention_mask, tb_out_masked, pretrained_hidden, pretrained_attn = mdl(['hi there how are you'])

In [62]:
pretrained.model.embeddings(tokenized_text).shape

torch.Size([1, 7, 768])

In [66]:
len(pretrained_attn), [h.shape for h in pretrained_attn]

(12,
 [torch.Size([1, 12, 7, 7]),
  torch.Size([1, 12, 7, 7]),
  torch.Size([1, 12, 7, 7]),
  torch.Size([1, 12, 7, 7]),
  torch.Size([1, 12, 7, 7]),
  torch.Size([1, 12, 7, 7]),
  torch.Size([1, 12, 7, 7]),
  torch.Size([1, 12, 7, 7]),
  torch.Size([1, 12, 7, 7]),
  torch.Size([1, 12, 7, 7]),
  torch.Size([1, 12, 7, 7]),
  torch.Size([1, 12, 7, 7])])

In [65]:
len(pretrained_hidden), [h.shape for h in pretrained_hidden]

(13,
 [torch.Size([1, 7, 768]),
  torch.Size([1, 7, 768]),
  torch.Size([1, 7, 768]),
  torch.Size([1, 7, 768]),
  torch.Size([1, 7, 768]),
  torch.Size([1, 7, 768]),
  torch.Size([1, 7, 768]),
  torch.Size([1, 7, 768]),
  torch.Size([1, 7, 768]),
  torch.Size([1, 7, 768]),
  torch.Size([1, 7, 768]),
  torch.Size([1, 7, 768]),
  torch.Size([1, 7, 768])])

In [39]:
dir(pretrained)

['T_destination',
 '__annotations__',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_apply',
 '_backward_hooks',
 '_buffers',
 '_call_impl',
 '_forward_hooks',
 '_forward_pre_hooks',
 '_forward_unimplemented',
 '_get_name',
 '_load_from_state_dict',
 '_load_state_dict_pre_hooks',
 '_modules',
 '_named_members',
 '_non_persistent_buffers_set',
 '_parameters',
 '_register_load_state_dict_pre_hook',
 '_register_state_dict_hook',
 '_replicate_for_data_parallel',
 '_save_to_state_dict',
 '_slow_forward',
 '_state_dict_hooks',
 '_version',
 'add_module',
 'apply',
 'bfloat16',
 'buffers',
 'children',
 'cpu',
 'cuda',
 'double