In [1]:
from transformers import BertTokenizer, BertForPreTraining
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForPreTraining.from_pretrained('bert-base-uncased')

Some weights of BertForPreTraining were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
with open('./corpus.txt', 'r', encoding='utf8') as fp:
    text = fp.read().split('\n')

text[:3]

['The fact that somehow more than a decade after the crash major banks like Deutsche Bank are still insolvent aside (zombie bank walking on bailout funds), Buterin probably has a point.',
 'Only NII tokens are accepted for staking into nahmii’s patent-pending Data Availability Oracle, which monitors the security of the protocol. Token holders have the opportunity to receive significant rewards for participating in the Oracle, which involves voting to reach consensus to maintain network integrity.',
 'Token holders receive access to the best products from Forty Seven Bank, including before they are available to the general public. Additionally, you must hold tokens to get full access to the hackathons, developer meetups, and White Label API and to put an application in the Forty Seven Bank App Store. Tokens also entitle you to vote on important community development projects. Your number of tokens corresponds with the weight of your vote.']

### Preparing for NSP

In [3]:
bag = [item for sentence in text for item in sentence.split('.') if item != '']
bag_size = len(bag)

In [4]:
text[14]

'Meanwhile, DeFi has organically blossomed into a $65 billion industry. It’s a testament to the idea that complex financial systems can run without centralized, corporate oversight. I wouldn’t bemoan an ETF application that tracked MKR, AAVE or COMP, but do we need the validation?'

In [5]:
bag[14:19]

['Until the adoption of the pilot program, banks had restricted money transfers to crypto exchanges because of financial regulations, although people have been making them without specifying that they were crypto operations, Beltrán said',
 'Properties include LEO Tower, a proposed construction in Dubai; LEO Village, a development in Portugal said to have started development in 2013; and LEO Haciendas, “heavily discounted luxury properties” in Menorca sold at “30% below bank valuations”',
 '“The digital collateral registry layer enables delivery-versus-delivery (DvD) ownership transfers of baskets of securities via digital collateral records (DCRs)',
 ' This eliminates the current, operationally onerous, requirement to move securities from one custody location to another',
 ' We make this possible via integration with the leading Eurex Repo F7-trading system where Hqlax platform transactions are executed, and a Deutsche Börse-owned Trusted Third Party (TTP) entity, which connects the c

In [6]:
import random

sentence_a = []
sentence_b = []
label = []

for paragraph in text:
    sentences = [
        sentence for sentence in paragraph.split('.') if sentence != ''
    ]
    num_sentences = len(sentences)
    if num_sentences > 1:
        start = random.randint(0, num_sentences-2)
        # 50/50 whether is IsNextSentence or NotNextSentence
        if random.random() >= 0.5:
            # this is IsNextSentence
            sentence_a.append(sentences[start])
            sentence_b.append(sentences[start+1])
            label.append(0)
        else:
            index = random.randint(0, bag_size-1)
            # this is NotNextSentence
            sentence_a.append(sentences[start])
            sentence_b.append(bag[index])
            label.append(1)

In [7]:
for i in range(3):
    print(label[i])
    print(sentence_a[i] + '\n---')
    print(sentence_b[i] + '\n')

0
Only NII tokens are accepted for staking into nahmii’s patent-pending Data Availability Oracle, which monitors the security of the protocol
---
 Token holders have the opportunity to receive significant rewards for participating in the Oracle, which involves voting to reach consensus to maintain network integrity

0
Token holders receive access to the best products from Forty Seven Bank, including before they are available to the general public
---
 Additionally, you must hold tokens to get full access to the hackathons, developer meetups, and White Label API and to put an application in the Forty Seven Bank App Store

0
 Yes, it is
---
 But let’s remember how Uber went to the market



### Tokenization

In [8]:
inputs = tokenizer(sentence_a, sentence_b, return_tensors='pt',
                   max_length=512, truncation=True, padding='max_length')

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


In [9]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [10]:
inputs

{'input_ids': tensor([[  101,  2069,  9152,  ...,     0,     0,     0],
        [  101, 19204, 13304,  ...,     0,     0,     0],
        [  101,  2748,  1010,  ...,     0,     0,     0],
        ...,
        [  101,  2642,  2978,  ...,     0,     0,     0],
        [  101,  2007,  1996,  ...,     0,     0,     0],
        [  101,  1523,  1037,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [11]:
inputs['next_sentence_label'] = torch.LongTensor([label]).T

### MLM

In [12]:
inputs['labels'] = inputs.input_ids.detach().clone()

In [13]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'next_sentence_label', 'labels'])

In [14]:
# create random array of floats with equal dimensions to input_ids tensor
rand = torch.rand(inputs.input_ids.shape)
# create mask array
mask_arr = (rand < 0.15) * (inputs.input_ids != 101) * (inputs.input_ids != 102) * (inputs.input_ids != 0)

In [15]:
selection = []

for i in range(inputs.input_ids.shape[0]):
    selection.append(
        torch.flatten(mask_arr[i].nonzero()).tolist()
    )

In [16]:
for i in range(inputs.input_ids.shape[0]):
    inputs.input_ids[i, selection[i]] = 103

## Dataloader

In [17]:
class CryptoDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

In [18]:
dataset = CryptoDataset(inputs)

In [19]:
loader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True)

## Training

In [20]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [21]:
model.to(device)

BertForPreTraining(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine

In [22]:
model.train()

BertForPreTraining(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine

In [23]:
from transformers import AdamW

optim = AdamW(model.parameters(), lr=5e-5)

In [24]:
from tqdm import tqdm  # for our progress bar

epochs = 4

for epoch in range(epochs):
    # setup loop with TQDM and dataloader
    loop = tqdm(loader, leave=True)
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        next_sentence_label = batch['next_sentence_label'].to(device)
        labels = batch['labels'].to(device)
        # process
        outputs = model(input_ids, attention_mask=attention_mask,
                        token_type_ids=token_type_ids,
                        next_sentence_label=next_sentence_label,
                        labels=labels)
        # extract loss
        loss = outputs.loss
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  0%|          | 0/55236 [00:05<?, ?it/s]


RuntimeError: CUDA out of memory. Tried to allocate 24.00 MiB (GPU 0; 6.00 GiB total capacity; 4.54 GiB already allocated; 0 bytes free; 4.58 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF