In [10]:
# Import locally: this cell sits before the notebook's import cell, so on a
# fresh kernel `os` would otherwise be undefined here.
import os

# List the entries of the current working directory.
os.listdir()

['nix',
 'core',
 'boot',
 'swapfile',
 'initrd.img',
 'home',
 'cdrom',
 'mnt',
 'usr',
 'media',
 'dev',
 'proc',
 '.VolumeIcon.icns',
 'srv',
 'run',
 'vmlinuz',
 'root',
 'tmp',
 'lib',
 'bin',
 'lost+found',
 'snap',
 'initrd.img.old',
 'sys',
 'var',
 '.VolumeIcon.png',
 'sbin',
 'etc',
 'lib64',
 'opt']

In [1]:
import os, sys

# Remember the directory the kernel started in, exactly once. Anchoring the
# chdir on it makes this cell idempotent: re-running it always lands in the
# same directory instead of climbing two more levels on every execution.
if 'NOTEBOOK_DIR' not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_DIR, '..', '..'))

# '' lets imports resolve against the current working directory; add it only
# once so re-runs do not pile up duplicate entries.
if '' not in sys.path:
    sys.path.append('')
    
from random import randint, shuffle
from random import random as rand

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import argparse
from tensorboardX import SummaryWriter
import os
import src.model.models as models
from tqdm import tqdm
import src.model.optim as optim
import src.model.train as train
from torch.utils.data import Dataset, DataLoader

from src.utils import set_seeds, get_device, truncate_tokens_pair, _sample_mask
import src.data.tokenization as tokenization
from src.data.data import Preprocess4Pretrain, SentPairDataset, seq_collate

In [2]:
# Tokenizer backed by the vocabulary file, created with do_lower_case=True.
tokenizer = tokenization.FullTokenizer(
    vocab_file='./data/vocab.txt',
    do_lower_case=True,
)


def tokenize(x):
    """Convert `x` to unicode, then split it into tokens with `tokenizer`."""
    return tokenizer.tokenize(tokenizer.convert_to_unicode(x))

In [3]:
# Split a plain sentence into a list of tokens.
tokenizer.tokenize('the quick brown fox jumped over the lazy dog')

['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog']

In [4]:
# Map each token to its integer id.
tokenizer.convert_tokens_to_ids(['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog'])

[1996, 4248, 2829, 4419, 5598, 2058, 1996, 13971, 3899]

In [5]:
# Inverse of the previous step: map the integer ids back to tokens.
tokenizer.convert_ids_to_tokens([1996, 4248, 2829, 4419, 5598, 2058, 1996, 13971, 3899])

['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog']

In [6]:
# Pass a plain str through the unicode-conversion helper that `tokenize` applies first.
tokenizer.convert_to_unicode('the quick brown fox jumped over the lazy dog')

'the quick brown fox jumped over the lazy dog'

### 1. A high-level view

In [111]:
# Tokenizer plus the text -> tokens helper handed to the dataset.
tokenizer = tokenization.FullTokenizer(
    vocab_file='./data/vocab.txt',
    do_lower_case=True,
)


def tokenize(x):
    """Convert `x` to unicode, then split it into tokens with `tokenizer`."""
    return tokenizer.tokenize(tokenizer.convert_to_unicode(x))


# Single-step preprocessing pipeline. The arguments are positional; the labels
# below are assumptions based on the attribute names the preprocessing
# snippets read from the object — TODO confirm against the
# Preprocess4Pretrain signature.
pipeline = [
    Preprocess4Pretrain(
        400,                              # presumably max_pred
        0.15,                             # presumably mask_prob
        list(tokenizer.vocab.keys()),     # vocabulary words
        tokenizer.convert_tokens_to_ids,  # presumably the token -> id indexer
        400,                              # presumably max_len
        1,                                # presumably mask_alpha
        1,                                # presumably mask_beta
        3,                                # presumably max_gram
    )
]

# Dataset wrapped in a DataLoader that collates batches with `seq_collate`.
data_iter = DataLoader(
    SentPairDataset(
        './data/wiki.test.tokens',
        16,
        tokenize,
        400,
        pipeline=pipeline,
    ),
    batch_size=16,
    collate_fn=seq_collate,
    num_workers=8,
)

# Walk the loader once, unpacking every batch into its eight fields.
for batch in tqdm(data_iter):
    (input_ids, segment_ids, input_mask,
     masked_ids, masked_pos, masked_weights,
     is_next, original_ids) = batch



  0%|          | 0/273 [00:00<?, ?it/s][A
  0%|          | 1/273 [00:00<02:05,  2.17it/s][A
  3%|▎         | 7/273 [00:00<01:27,  3.04it/s][A
  3%|▎         | 9/273 [00:00<01:09,  3.78it/s][A
  6%|▌         | 16/273 [00:00<00:49,  5.23it/s][A
  7%|▋         | 19/273 [00:01<00:40,  6.23it/s][A
  8%|▊         | 22/273 [00:01<00:31,  7.89it/s][A
 10%|▉         | 27/273 [00:01<00:25,  9.80it/s][A
 11%|█         | 30/273 [00:01<00:19, 12.18it/s][A
 13%|█▎        | 35/273 [00:02<00:18, 12.98it/s][A
 15%|█▍        | 40/273 [00:02<00:14, 15.74it/s][A
 16%|█▌        | 43/273 [00:02<00:15, 14.82it/s][A
 18%|█▊        | 48/273 [00:02<00:12, 17.33it/s][A
 19%|█▊        | 51/273 [00:02<00:11, 19.20it/s][A
 20%|█▉        | 54/273 [00:02<00:13, 15.75it/s][A
 21%|██        | 56/273 [00:03<00:15, 14.35it/s][A
 22%|██▏       | 60/273 [00:03<00:12, 17.46it/s][A
 23%|██▎       | 64/273 [00:03<00:11, 18.21it/s][A
 25%|██▍       | 68/273 [00:03<00:10, 20.49it/s][A
 26%|██▌       | 71/273

### Looking at it deeply
We see that the text has lots of empty lines

In [17]:
# Print the first N lines of the corpus file to inspect its raw layout.
filename = './data/wiki.test.tokens'
N = 5
with open(filename) as myfile:
    # zip() stops at whichever input runs out first, so a file with fewer than
    # N lines simply prints what it has instead of raising StopIteration the
    # way a bare next(myfile) would.
    for i, line in zip(range(N), myfile):
        print(line)

 

 = Robert <unk> = 

 

 Robert <unk> is an English film , television and theatre actor . He had a guest @-@ starring role on the television series The Bill in 2000 . This was followed by a starring role in the play Herons written by Simon Stephens , which was performed in 2001 at the Royal Court Theatre . He had a guest role in the television series Judge John <unk> in 2002 . In 2004 <unk> landed a role as " Craig " in the episode " Teddy 's Story " of the television series The Long Firm ; he starred alongside actors Mark Strong and Derek Jacobi . He was cast in the 2005 theatre productions of the Philip Ridley play Mercury Fur , which was performed at the Drum Theatre in Plymouth and the <unk> <unk> Factory in London . He was directed by John <unk> and starred alongside Ben <unk> , Shane <unk> , Harry Kent , Fraser <unk> , Sophie Stanton and Dominic Hall . 

 In 2006 , <unk> starred alongside <unk> in the play <unk> written by Mark <unk> . He appeared on a 2006 episode of the telev

## `SentPairDataset`

The key preprocessing object is the `SentPairDataset` object. As it pulls the data, it tokenizes and runs it through the pipeline.  
An important note is the expected format of the input file. Each line should consist of a single sentence (though this is not a deal breaker). Importantly, documents need to be line-separated (`'\n'`).

In [20]:
# The dataset is deliberately bound to the name `self`, so that snippets
# written as method bodies (`self.max_len`, `self.read_tokens(...)`, ...) can
# be executed as-is in the cells that follow.
self = SentPairDataset(
    './data/wiki.test.tokens',  # corpus file
    16,                         # presumably the batch size — TODO confirm
    tokenize,                   # text -> tokens callable
    400,                        # presumably max_len — TODO confirm
    pipeline=pipeline,
)



### 1. Pull and tokenize the sentence pairs
The `tokenize` function can take in any text. There is no need to apply any special preprocessing to the original corpus. 

In [56]:
# Raw, untokenized sample text (adjacent literals are joined into one string).
orig_text = (
    "Robert Boulter is an English film, television and theatre actor. "
    "He had a guest-starring role on the television series The Bill in 2000. "
    "This was followed by a starring role in the play Herons written by Simon Stephens, "
    "which was performed in 2001 at the Royal Court Theatre."
)

# Show only the first ten tokens.
tokenize(orig_text)[:10]

['robert',
 'bo',
 '##ult',
 '##er',
 'is',
 'an',
 'english',
 'film',
 ',',
 'television']

We are pulling __pairs__ of segments (not sentences). Thus, the generated int `len_tokens` is at most half of `max_len`; 10% of the time it is instead drawn at random between 1 and half of `max_len`. `len_tokens` is the target length of each of the two segments. 

In [None]:
# Target length of each segment: normally half of max_len, but with
# probability `short_sampling_prob` a random length between 1 and half of
# max_len (inclusive) is drawn instead.
if rand() < self.short_sampling_prob:
    len_tokens = randint(1, int(self.max_len / 2))
else:
    len_tokens = int(self.max_len / 2)

When pulling the two segments, `read_tokens` tokenizes and returns each line.  
Importantly, it handles the logic such that it _tries_ to achieve the expected length of half `max_len`.   
  
`discard_last_and_restart` is a vital argument for the first segment. If the first segment already hits the end of the document, `discard_last_and_restart` will ditch the collected tokens and attempt to tokenize the first segment of the next document.
  
  A small note is that the sum of the two segments may well be larger than `max_len`.

In [92]:
# Read two consecutive segments through the same file handle.
tokens_a = self.read_tokens(self.f_pos, len_tokens, True)
f_next = self.f_pos  # the second read continues from the same handle
tokens_b = self.read_tokens(f_next, len_tokens, False)

# SOP (sentence-order prediction): keep the original order half of the time,
# otherwise swap the two segments. `is_next` records which case was drawn.
is_next = rand() < 0.5
if is_next:
    instance = (is_next, tokens_a, tokens_b)
else:
    instance = (is_next, tokens_b, tokens_a)
print(f"{len(tokens_a)} / {len(tokens_b)}")

248 / 0


Handling possible exceptions:  
1. End of document: When arriving at the end of the document, the second segment may end up being an empty list. In that case, I added extra logic that splits the first segment into two halves. 
2. End of file: it will reset the pointer of the file object, going back to the start of the file.

In [None]:
# End of file: per the check below, a segment comes back as None when there
# are no more tokens in the entire text file. This must be handled first,
# because the length checks further down would raise TypeError on None.
if tokens_a is None or tokens_b is None:
    self.f_pos.seek(0, 0)  # reset file pointer to the start of the file

    # re-read both segments from the beginning
    tokens_a = self.read_tokens(self.f_pos, len_tokens, True)
    f_next = self.f_pos  # `f_next` should be next point
    tokens_b = self.read_tokens(f_next, len_tokens, False)

# End of document: the second segment is empty. If the first segment is long
# enough (> 10 tokens), split it halfway between tokens_a and tokens_b.
# Running this after the re-read also covers a freshly re-read pair.
if tokens_a is not None and tokens_b is not None:
    if len(tokens_b) == 0 and len(tokens_a) > 10:
        half_split = int(len(tokens_a) / 2)
        tokens_b = tokens_a[half_split:]
        tokens_a = tokens_a[:half_split]

# NOTE(review): an `instance` tuple built before this cell is not refreshed
# by the changes above — confirm whether it should be rebuilt afterwards.

## `Preprocess4Pretrain` 
After pulling the data in the form of an `instance` which contains `is_next, tokens_a, tokens_b`, the next step is to preprocess the `instance` into the 8 `torch.tensor` objects that our transformer may ingest. 

In [95]:
# Rebind `self` to the preprocessing step so its method-body snippets can be
# run cell by cell. The arguments are positional; the labels are assumptions
# based on the attributes those snippets read — TODO confirm against the
# Preprocess4Pretrain signature.
self = Preprocess4Pretrain(
    400,                              # presumably max_pred
    0.15,                             # presumably mask_prob
    list(tokenizer.vocab.keys()),     # vocabulary words
    tokenizer.convert_tokens_to_ids,  # presumably the token -> id indexer
    400,                              # presumably max_len
    1,                                # presumably mask_alpha
    1,                                # presumably mask_beta
    3,                                # presumably max_gram
)

# Unpack the instance produced by the dataset step.
(is_next, tokens_a, tokens_b) = instance

`truncate_tokens_pair` as it clearly states, will truncate the two segments to a size accommodating `max_len` and the special tokens.  
`segment_ids` indicate which tokens belong to which segment.  
`input_mask` indicates the useful tokens and the paddings.  
Some reference for BERT: https://medium.com/@aieeshashafique/feature-extraction-from-bert-25887ed2152a

In [100]:
# Reserve three slots for the special tokens [CLS], [SEP], [SEP]. The return
# value is not used, so the truncation is expected to act on the lists in place.
truncate_tokens_pair(tokens_a, tokens_b, self.max_len - 3)

# Assemble the sequence: [CLS] A [SEP] B [SEP]
tokens = ['[CLS]', *tokens_a, '[SEP]', *tokens_b, '[SEP]']

# Segment 0 covers [CLS] + A + the first [SEP]; segment 1 covers B + the last [SEP].
n_first = len(tokens_a) + 2
n_second = len(tokens_b) + 1
segment_ids = [0] * n_first + [1] * n_second

# Every position so far holds a real token.
input_mask = [1 for _ in tokens]

Next we sample the mask over which our generator will replace words.  
`masked_pos` is just a positional index.  
The naming is a bit strange. `masked_tokens` actually refers to the untouched tokens, whereas `tokens` refers to the token sequence with the masked tokens.  
`masked_weights` simply weights all of them equally. We assume that they're all the same.  

In [108]:
# Number of positions to predict: a `mask_prob` share of the sequence, at
# least 1 and capped at `max_pred` (the cap only bites on long sequences).
n_pred = int(round(len(tokens) * self.mask_prob))
n_pred = max(1, n_pred)
n_pred = min(self.max_pred, n_pred)

# Ids of the sequence before any masking is applied.
original_ids = self.indexer(tokens)

# Sample the mask for the masked language model. `tokens` is rebound to the
# third element of the result — presumably the sequence after masking;
# confirm against `_sample_mask` in src.utils.
masked_tokens, masked_pos, tokens = _sample_mask(
    tokens,
    self.mask_alpha,
    self.mask_beta,
    self.max_gram,
    goal_num_predict=n_pred,
)

# One unit weight per masked token.
masked_weights = [1 for _ in masked_tokens]

# Token indexing
input_ids = self.indexer(tokens)
masked_ids = self.indexer(masked_tokens)

Finally, we pad!  
Originally, the author set them to be `max_pred` long but errors were encountered. It seems that the correct one should be in fact `max_len`.  

In [89]:
# Zero padding: bring every per-token sequence up to `max_len`.
n_pad = self.max_len - len(input_ids)
original_ids.extend([0]*n_pad)
input_ids.extend([0]*n_pad)
segment_ids.extend([0]*n_pad)
input_mask.extend([0]*n_pad)

# Zero padding for masked target
# originally the author constrained masked_ids, masked_pos and masked_weights to be length 75
# but they should be of the same length
# so I replace max_pred w max_len
if self.max_len > len(masked_ids):
    masked_ids.extend([0] * (self.max_len - len(masked_ids)))
elif self.max_len < len(masked_ids):
    # This branch used to repeat the `>` test above and could never fire;
    # it now catches targets that are longer than the padding length.
    raise ValueError("Strangely, the masked_ids is more than max_len")
if self.max_len > len(masked_pos):
    masked_pos.extend([0] * (self.max_len - len(masked_pos)))
if self.max_len > len(masked_weights):
    masked_weights.extend([0] * (self.max_len - len(masked_weights)))


(False,
 ['"',
  'kiss',
  'you',
  '"',
  'was',
  'well',
  'received',
  'by',
  'contemporary',
  'music',
  'critics',
  ',',
  'who',
  'centred',
  'on',
  'its',
  'quality',
  'of',
  'production',
  '.',
  'both',
  'rolling',
  'stone',
  "'",
  's',
  'jon',
  'do',
  '##lan',
  ',',
  'who',
  'praised',
  'its',
  'effectiveness',
  ',',
  'and',
  'chris',
  'payne',
  'of',
  'billboard',
  ',',
  'who',
  'appreciated',
  'the',
  'melody',
  ',',
  'described',
  '"',
  'kiss',
  'you',
  '"',
  'as',
  'one',
  'of',
  'the',
  'album',
  "'",
  's',
  'highlights',
  '.',
  'alexis',
  '<',
  'un',
  '##k',
  '>',
  'for',
  'the',
  'guardian',
  'commended',
  'the',
  'track',
  "'",
  's',
  'chorus',
  'as',
  '"',
  'hard',
  'to',
  '<',
  'un',
  '##k',
  '>',
  'from',
  'your',
  'brain',
  '"',
  '.',
  'robert',
  'cops',
  '##ey',
  'of',
  'digital',
  'spy',
  'noted',
  'the',
  'song',
  "'",
  's',
  'possibility',
  'to',
  'become',
  'an',
  'in