Generating the books3 corpus of 256-length token chunks for BERT training

In [1]:
!pip install transformers --quiet

[K     |████████████████████████████████| 5.8 MB 7.4 MB/s 
[K     |████████████████████████████████| 7.6 MB 60.7 MB/s 
[K     |████████████████████████████████| 182 kB 53.1 MB/s 
[K     |████████████████████████████████| 1.9 MB 6.9 MB/s 
[K     |████████████████████████████████| 182 kB 62.3 MB/s 
[K     |████████████████████████████████| 174 kB 53.9 MB/s 
[K     |████████████████████████████████| 62 kB 1.3 MB/s 
[K     |████████████████████████████████| 173 kB 53.2 MB/s 
[K     |████████████████████████████████| 168 kB 76.2 MB/s 
[K     |████████████████████████████████| 168 kB 60.6 MB/s 
[K     |████████████████████████████████| 166 kB 59.7 MB/s 
[K     |████████████████████████████████| 166 kB 51.2 MB/s 
[K     |████████████████████████████████| 162 kB 60.2 MB/s 
[K     |████████████████████████████████| 162 kB 52.5 MB/s 
[K     |████████████████████████████████| 158 kB 75.3 MB/s 
[K     |████████████████████████████████| 157 kB 53.6 MB/s 
[K     |███████████████████

In [11]:
import pandas as pd
import numpy as np
from google.colab import drive
import os
import random
import time
import random
from transformers import set_seed, AutoTokenizer

# Mount Google Drive at /content/drive without forcing a remount; the paths
# configured further down all live under this mount point.
drive.mount('/content/drive', force_remount=False)
# Seed Python's `random` module, which drives the subsequence sampling below.
random.seed(42)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Root folders on the mounted Drive (both end with a trailing slash, so the
# relative parts below are appended directly).
DATA_DIR = '/content/drive/MyDrive/AI_Data/'
MODEL_DIR = '/content/drive/MyDrive/AI_Models/storygpt/'

# Gzipped corpus files (train / validation).
bibliotik_train = DATA_DIR + 'bibliotik_corpus/biblitik_22500_full.gz'
bibliotik_val = DATA_DIR + 'bibliotik_corpus/biblitik_7500_val.gz'

# Directory the tokenizer is loaded from.
TOKENIZER_DIR = MODEL_DIR + 'storygpt2tokenizer_ft2/'

## Load Tokenizer

In [4]:
# Special-token strings registered on the tokenizer below.
BOS_TOKEN, EOS_TOKEN, PAD_TOKEN = '<BOS>', '<EOS>', '<PAD>'
CLS_TOKEN, MASK_TOKEN = '[CLS]', '[MASK]'

# Load the tokenizer from TOKENIZER_DIR and register the special tokens.
# The key order is kept exactly as before (cls, bos, eos, pad, mask).
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR)
special_tokens_dict = dict(
    cls_token=CLS_TOKEN,
    bos_token=BOS_TOKEN,
    eos_token=EOS_TOKEN,
    pad_token=PAD_TOKEN,
    mask_token=MASK_TOKEN,
)
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

# Tokenizer length measured after the special tokens were registered.
VOCAB_SIZE = len(tokenizer)
print(VOCAB_SIZE)

52005


## Load Data from Google Drive

In [5]:
# Open the training corpus lazily. Note that `df` is a chunked reader, not a
# DataFrame: rows are pulled on demand with df.get_chunk(), one row per chunk.
df_filename = bibliotik_train
df = pd.read_csv(
    df_filename,
    index_col=0,
    compression=dict(method='gzip', compresslevel=2),
    iterator=True,
    chunksize=1,
)

## Get n random token sequences from each story

In [14]:
TOK_SEQS_PER_STORY = 100  # number of token sequences drawn from each story
SEQ_LEN = 256             # intended length (in tokens) of each sampled sequence
NUM_PAD_TOKENS = 128      # pad ids appended after the EOS token of a story
all_token_seqs = list()   # accumulates the sampled token sequences across stories

In [15]:
def get_random_subsequence(seq, subsequence_len):
    """Return a random contiguous slice of `seq` that is `subsequence_len` long.

    The start offset is drawn uniformly (via `random.randint`) from every
    position at which a full-length slice still fits inside `seq`.
    """
    last_valid_start = len(seq) - subsequence_len
    offset = random.randint(0, last_valid_start)
    end = offset + subsequence_len
    return seq[offset:end]

In [16]:
# Pull the next story from the chunked reader (each chunk is a one-row frame).
# .iloc[0] picks the row by position, so the lookup does not depend on the
# index labels read from the CSV. str() mirrors DfDataLoader.get_tokens so a
# non-string cell is encoded rather than passed to the tokenizer as-is.
story = str(df.get_chunk()['full_text'].iloc[0])
all_tokens = tokenizer.encode(story)
all_tokens = [tokenizer.bos_token_id] + all_tokens + [tokenizer.eos_token_id] + NUM_PAD_TOKENS * [tokenizer.pad_token_id]

# Sample SEQ_LEN-token windows. Asking for len(all_tokens) tokens (as this cell
# previously did) forces the start index to 0 and returns the entire padded
# story every time instead of a random fixed-length chunk.
if len(all_tokens) < SEQ_LEN:
  # random.randint would raise on an empty range; skip stories that cannot
  # supply a full window.
  print(f"Story too short ({len(all_tokens)} tokens), skipping")
else:
  for _ in range(TOK_SEQS_PER_STORY):
    token_seq = get_random_subsequence(all_tokens, SEQ_LEN)
    all_token_seqs.append(token_seq)
print(len(all_token_seqs))

100


In [None]:
class DfDataLoader():
  """Serves batches of random fixed-length token windows from a gzipped CSV of stories.

  The CSV is read one row at a time. Each story (column ``full_text``) is
  tokenized, wrapped in BOS/EOS, padded with ``context_len`` pad ids, and
  ``num_seqs_per_story`` random windows of ``context_len - 1`` tokens are
  drawn from it. Every window is prefixed with the CLS id, so each returned
  sequence is ``context_len`` ids long.

  Args:
    df_filename: path of the gzip-compressed CSV to read.
    tokenizer: object providing ``encode`` plus ``bos_token_id``,
      ``eos_token_id``, ``pad_token_id`` and ``cls_token_id``.
    batch_size: number of sequences returned by ``get_next_batch``.
    df_file_len: number of rows the caller expects in the file; only used for
      the remaining-rows counter behind ``has_next_batch``.
    num_seqs_per_story: windows sampled from each story.
    context_len: length of each returned sequence, CLS id included.
  """

  def __init__(self, df_filename, tokenizer, batch_size, df_file_len, num_seqs_per_story, context_len):
    self.df_filename = df_filename
    self.tokenizer = tokenizer
    self.batch_size = batch_size
    self.total_df_len = df_file_len
    self.remaining_df_items = df_file_len
    self.num_seqs_per_story = num_seqs_per_story
    self.context_len = context_len
    # has_next_batch() reports False once this many rows or fewer remain.
    self.DF_BUFFER = 100
    # Windows sampled from the current story that have not been handed out yet.
    self.all_subsequences_from_story = []
    self.df = self.init_df()

  def init_df(self):
    """Reset the remaining-rows counter and open a fresh one-row-per-chunk reader."""
    self.remaining_df_items = self.total_df_len
    return pd.read_csv(self.df_filename,
                       index_col=0,
                       compression={'method': 'gzip', 'compresslevel': 2},
                       chunksize=1,
                       iterator=True)

  def _get_random_subsequence(self, seq, subsequence_len):
    """Return a random contiguous slice of ``seq`` with ``subsequence_len`` items."""
    start_index = random.randint(0, len(seq) - subsequence_len)
    return seq[start_index:start_index + subsequence_len]

  def has_next_batch(self):
    """Return True while more than ``DF_BUFFER`` rows are still counted as unread."""
    return self.remaining_df_items > self.DF_BUFFER

  def get_tokens(self):
    """Read the next story and return its padded token ids.

    Returns:
      ``[bos] + encoded story + [eos] + context_len * [pad]``. Because of the
      trailing pads the result is always longer than ``context_len``, so even
      an empty story can be sampled from.

    Raises:
      Exception: with message "Next batch" when the reader cannot supply
        another row (typically because the file is exhausted). The counter is
        set to 0 first and the underlying error is chained as the cause.
    """
    try:
      chunk = self.df.get_chunk()
    except Exception as err:
      # Catch Exception rather than using a bare `except:` so Ctrl-C /
      # SystemExit are not converted into a "Next batch" signal.
      self.remaining_df_items = 0
      raise Exception("Next batch") from err
    self.remaining_df_items -= 1

    # Positional access: a label lookup with [0] depends on the index values
    # read from the CSV and can fail for chunks after the first.
    text = str(chunk['full_text'].iloc[0])
    # Use the tokenizer handed to the constructor, not a notebook global.
    tok = self.tokenizer
    return [tok.bos_token_id] + tok.encode(text) + [tok.eos_token_id] + self.context_len * [tok.pad_token_id]

  def get_next_batch(self):
    """Return ``batch_size`` sequences, each ``context_len`` ids long and starting with CLS.

    Windows left over from the current story are used first; a new story is
    read only when that buffer is empty. If the reader runs dry part-way
    through, the exception from ``get_tokens`` propagates and the partially
    built batch is discarded.
    """
    batch = []
    while len(batch) < self.batch_size:
      if not self.all_subsequences_from_story:
        tokens = self.get_tokens()
        for _ in range(self.num_seqs_per_story):
          # context_len - 1 leaves room for the CLS id in front.
          window = self._get_random_subsequence(tokens, self.context_len - 1)
          self.all_subsequences_from_story.append([self.tokenizer.cls_token_id] + window)
      batch.append(self.all_subsequences_from_story.pop())
    return batch


# NOTE(review): SingleStoryBertAssistDataLoader, atlas_shrugged_filename, config
# and bert_model are not defined anywhere in the visible part of this notebook;
# confirm they come from another cell, otherwise this raises NameError on a
# fresh kernel.
train_dataloader = SingleStoryBertAssistDataLoader(
    text_filename=atlas_shrugged_filename,
    tokenizer=tokenizer,
    batch_size=config['batch_size'],
    context_len=config['context_len'],
    bert_model=bert_model,
    mask_percentage=config['bert_mask_percentage'],
)