Generating the books3 corpus of 256-length token chunks for BERT training

In [1]:
!pip install transformers --quiet

[K     |████████████████████████████████| 5.8 MB 7.9 MB/s 
[K     |████████████████████████████████| 182 kB 58.3 MB/s 
[K     |████████████████████████████████| 7.6 MB 56.9 MB/s 
[?25h

In [2]:
import pandas as pd
import numpy as np
from google.colab import drive
import os
import random
import time
import random
from transformers import set_seed, AutoTokenizer

# Seed Python's RNG so the random windows drawn later in the notebook are repeatable.
random.seed(42)
# Attach Google Drive; the data and model paths configured below live under this mount point.
drive.mount('/content/drive', force_remount=False)

Mounted at /content/drive


In [3]:
# Root folders on the mounted Drive; every other path is derived from these two.
DATA_DIR = '/content/drive/MyDrive/AI_Data/'
MODEL_DIR = '/content/drive/MyDrive/AI_Models/storygpt/'

# Gzip-compressed corpus splits (train / val, going by the variable names).
bibliotik_train = DATA_DIR + 'bibliotik_corpus/biblitik_22500_full.gz'
bibliotik_val = DATA_DIR + 'bibliotik_corpus/biblitik_7500_val.gz'

# Directory passed to AutoTokenizer.from_pretrained in the tokenizer section.
TOKENIZER_DIR = MODEL_DIR + 'storygpt2tokenizer_ft2/'

## Load Tokenizer

In [4]:
# Literal strings for the special tokens registered on the tokenizer below.
CLS_TOKEN = '[CLS]'
MASK_TOKEN = '[MASK]'
BOS_TOKEN = '<BOS>'
EOS_TOKEN = '<EOS>'
PAD_TOKEN = '<PAD>'

# Load the saved tokenizer, then register the special tokens on it.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR)
special_tokens_dict = {
    'cls_token': CLS_TOKEN,
    'bos_token': BOS_TOKEN,
    'eos_token': EOS_TOKEN,
    'pad_token': PAD_TOKEN,
    'mask_token': MASK_TOKEN,
}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

# Vocabulary size after the special tokens have been registered.
VOCAB_SIZE = len(tokenizer)
print(VOCAB_SIZE)

52005


## Get n random token sequences from each story

### Load Data from Google Drive

In [69]:
# Sampling configuration read by the corpus-chunking cells below.
SEQ_LEN = 256             # tokens in every sampled sequence
TOK_SEQS_PER_STORY = 100  # random windows drawn from each story
NUM_PAD_TOKENS = 128      # pad tokens appended after the end-of-story token before sampling
READ_CHUNKSIZE = 1        # rows returned by each get_chunk() call on the CSV reader
all_token_seqs = list()   # placeholder; reassigned with the result of get_tokens_from_corpus

In [70]:
# Open the training split as a chunked reader instead of loading the whole file;
# rows are pulled on demand with df.get_chunk().
df_filename = bibliotik_train
df = pd.read_csv(
    df_filename,
    iterator=True,
    chunksize=READ_CHUNKSIZE,
    index_col=0,
    compression={'method': 'gzip', 'compresslevel': 2},
)

In [71]:
def get_random_subsequence(seq, subsequence_len):
    """Return a random contiguous slice of ``seq`` that is ``subsequence_len`` items long.

    The start offset is drawn uniformly (via ``random.randint``) from every
    position that still leaves room for a full-length slice.

    Raises:
        ValueError: propagated from ``random.randint`` when ``seq`` is shorter
            than ``subsequence_len``.
    """
    last_valid_start = len(seq) - subsequence_len
    begin = random.randint(0, last_valid_start)
    end = begin + subsequence_len
    return seq[begin:end]

In [72]:
def get_tokens_from_corpus(df, df_len, max_chunks=1001):
  """Tokenise stories from a chunked reader and sample fixed-length token windows.

  For each chunk the ``full_text`` column is read, non-string values and
  stories of 1000 characters or fewer are skipped, and the rest are tokenised
  in one ``tokenizer.batch_encode_plus`` call. Each tokenised story is wrapped
  as ``BOS + tokens + EOS + NUM_PAD_TOKENS * PAD`` and ``TOK_SEQS_PER_STORY``
  random windows of ``SEQ_LEN`` tokens are drawn from it.

  Relies on the notebook-level names ``tokenizer``, ``NUM_PAD_TOKENS``,
  ``TOK_SEQS_PER_STORY``, ``SEQ_LEN`` and ``get_random_subsequence``.

  Args:
    df: reader whose ``get_chunk()`` returns a frame with a ``full_text`` column.
    df_len: number of chunks to request from the reader.
    max_chunks: upper bound on the chunks processed in one call. The default
      of 1001 matches the previously hard-coded cut-off; pass ``None`` to
      process all ``df_len`` chunks.

  Returns:
    A list of one-element lists, each wrapping one window of ``SEQ_LEN``
    token ids (the shape expected by a single-column DataFrame).
  """
  all_token_seqs = []
  chunk_budget = df_len if max_chunks is None else min(df_len, max_chunks)

  for i in range(chunk_budget):
    try:
      stories = df.get_chunk()['full_text'].to_list()
    except StopIteration:
      # The reader ran out of rows early; keep what has been collected so far.
      break
    print(f"processings chunk {i}")

    filtered_stories = []
    for story in stories:
      if not isinstance(story, str):
        print("Unexpected value: ", story)
      elif len(story) <= 1000: # Roughly the len of 256 tokens?
        print("skipping small story of len ", len(story))
      else:
        filtered_stories.append(story)

    if not filtered_stories:
      # Nothing usable in this chunk: skip tokenising and the timing report.
      continue

    # Characters across ALL kept stories, so the timing below is per character
    # of the whole chunk and the divisor can never be zero.
    total_story_len = sum(len(story) for story in filtered_stories)

    start_time = time.time()
    tokenized_stories = tokenizer.batch_encode_plus(filtered_stories)['input_ids']
    print(((time.time() - start_time) / total_story_len)*100000)

    for tokenized_story in tokenized_stories:
      story_tokens = [tokenizer.bos_token_id] + tokenized_story + [tokenizer.eos_token_id] + NUM_PAD_TOKENS * [tokenizer.pad_token_id]
      if len(story_tokens) < SEQ_LEN:
        # A full window cannot be cut from this story; sampling would raise.
        print("skipping story with too few tokens: ", len(story_tokens))
        continue

      for _ in range(TOK_SEQS_PER_STORY):
        token_seq = get_random_subsequence(story_tokens, SEQ_LEN)
        # Wrapped in a list so each window becomes one single-column row.
        all_token_seqs.append([token_seq])

  return all_token_seqs

# Sample token windows from the training reader, requesting up to 22500 chunks.
all_token_seqs = get_tokens_from_corpus(df, df_len=22500)
print(len(all_token_seqs))
# Timing note: with Chunksize = 100, the first 1000 took 5m 50 s.

processings chunk 0
11.042702335616369
processings chunk 1
3.423735142514364
processings chunk 2
4.442656449266655
processings chunk 3
9.985095768871078
processings chunk 4
Unexpected value:  nan
1.1405851572444685
processings chunk 5
6.065896224993468
processings chunk 6
Unexpected value:  nan
Unexpected value:  nan
22.704990867133347
processings chunk 7
Unexpected value:  nan
Unexpected value:  nan
3.1412876102321112
processings chunk 8
skipping small story of len  137
skipping small story of len  98
skipping small story of len  449
6.396416273581881
processings chunk 9
16.683347403765513
processings chunk 10


KeyboardInterrupt: ignored

In [34]:
# Shuffle in place so windows taken from the same story are no longer adjacent.
random.shuffle(all_token_seqs)

# Every entry of all_token_seqs is a one-element list, so this gives one column.
seqs_df = pd.DataFrame(data=all_token_seqs, columns=['token_seqs'])
seqs_df.head()

Unnamed: 0,token_seqs
0,"[797, 424, 658, 4090, 486, 479, 288, 261, 2883..."
1,"[16369, 288, 3850, 287, 538, 946, 12, 452, 299..."
2,"[362, 825, 287, 261, 3321, 710, 14, 1583, 303,..."
3,"[12, 480, 2523, 1935, 259, 900, 26, 5432, 290,..."
4,"[259, 2198, 14, 7480, 370, 539, 2168, 12, 299,..."


In [35]:
# Write the sampled windows next to the source corpus as a gzip-compressed CSV.
seqs_df.to_csv(
    f'{DATA_DIR}bibliotik_corpus/biblitik_22500_tokenized.gz',
    compression={'method': 'gzip', 'compresslevel': 2},
)

In [None]:
class DfDataLoader():
  """Serve batches of fixed-length token windows sampled from a gzipped CSV of stories.

  The file is read one row at a time. Each story is tokenised with the
  tokenizer passed to the constructor, wrapped as
  ``BOS + tokens + EOS + context_len * PAD``, and ``num_seqs_per_story``
  random windows are drawn from it. Every window is prefixed with the CLS
  token id, so each sequence in a batch holds ``context_len`` ids.

  Args:
    df_filename: path of the gzip-compressed CSV; it must have a ``full_text`` column.
    tokenizer: object providing ``encode`` plus ``bos_token_id``,
      ``eos_token_id``, ``pad_token_id`` and ``cls_token_id``.
    batch_size: number of sequences returned by ``get_next_batch``.
    df_file_len: number of rows the caller expects the file to contain.
    num_seqs_per_story: random windows drawn from each story.
    context_len: length of every returned sequence, CLS token included.
  """

  def __init__(self, df_filename, tokenizer, batch_size, df_file_len, num_seqs_per_story, context_len):
    self.df_filename = df_filename
    self.tokenizer = tokenizer
    self.batch_size = batch_size
    self.total_df_len = df_file_len
    self.remaining_df_items = df_file_len
    self.num_seqs_per_story = num_seqs_per_story
    self.context_len = context_len
    # has_next_batch() reports False once this many rows or fewer remain.
    self.DF_BUFFER = 100
    # Windows already cut from the current story and not yet handed out.
    self.all_subsequences_from_story = []
    self.df = self.init_df()

  def init_df(self):
    """Reset the remaining-row counter and return a fresh one-row-per-chunk reader."""
    self.remaining_df_items = self.total_df_len
    return pd.read_csv(self.df_filename,
                       index_col=0,
                       compression={'method': 'gzip', 'compresslevel': 2},
                       chunksize=1,
                       iterator=True)

  def _get_random_subsequence(self, seq, subsequence_len):
    """Return a random contiguous slice of ``seq`` with ``subsequence_len`` items."""
    start_index = random.randint(0, len(seq) - subsequence_len)
    return seq[start_index:start_index + subsequence_len]

  def has_next_batch(self):
    """Return True while more than ``DF_BUFFER`` rows are expected to remain."""
    return self.remaining_df_items > self.DF_BUFFER

  def get_tokens(self):
    """Read rows until one holds text and return its padded token list.

    Rows whose ``full_text`` is missing are skipped rather than tokenised as
    the literal string "nan". The appended ``context_len`` pad tokens mean the
    result always has at least ``context_len + 2`` ids, so a window of
    ``context_len - 1`` always fits.

    Raises:
      Exception: with message "Next batch" when the reader cannot supply
        another row; the remaining-row counter is set to 0 first.
    """
    tok = self.tokenizer
    while True:
      try:
        chunk = self.df.get_chunk()
      except Exception as err:
        # `Exception` (not a bare except) lets KeyboardInterrupt through.
        self.remaining_df_items = 0
        raise Exception("Next batch") from err
      self.remaining_df_items -= 1

      # Positional access: with index_col=0 the row label is usually not 0.
      raw_text = chunk['full_text'].iloc[0]
      if pd.isna(raw_text):
        print("Unexpected value: ", raw_text)
        continue

      tokens = tok.encode(str(raw_text))
      return [tok.bos_token_id] + tokens + [tok.eos_token_id] + self.context_len * [tok.pad_token_id]

  def get_next_batch(self):
    """Return ``batch_size`` sequences, reading a new story whenever the pool is empty.

    Raises:
      Exception: propagated from ``get_tokens`` when the file is exhausted
        before the batch is full.
    """
    batch = []
    while len(batch) < self.batch_size:
      if len(self.all_subsequences_from_story) == 0:
        tokens = self.get_tokens()
        for _ in range(self.num_seqs_per_story):
          # One slot is reserved for the CLS token placed in front.
          subseq = self._get_random_subsequence(tokens, self.context_len-1)
          self.all_subsequences_from_story.append([self.tokenizer.cls_token_id] + subseq)
      batch.append(self.all_subsequences_from_story.pop())
    return batch


# NOTE(review): SingleStoryBertAssistDataLoader, atlas_shrugged_filename, config and
# bert_model are not defined anywhere in the visible part of this notebook -
# confirm they are provided elsewhere before running this cell.
train_dataloader = SingleStoryBertAssistDataLoader(
    text_filename=atlas_shrugged_filename,
    tokenizer=tokenizer,
    batch_size=config['batch_size'],
    context_len=config['context_len'],
    bert_model=bert_model,
    mask_percentage=config['bert_mask_percentage'],
)