Generating the books3 corpus of 256-length token chunks for BERT training

In [1]:
!pip install transformers --quiet

In [2]:
import pandas as pd
import numpy as np
from google.colab import drive
import os
import random
import time
import random
from transformers import set_seed, AutoTokenizer

drive.mount('/content/drive', force_remount=False)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
DATA_DIR = '/content/drive/MyDrive/AI_Data/'
bibliotik_train = f'{DATA_DIR}bibliotik_corpus/biblitik_22500_full.gz'
bibliotik_val = f'{DATA_DIR}bibliotik_corpus/biblitik_7500_val.gz'

MODEL_DIR = '/content/drive/MyDrive/AI_Models/storygpt/'
TOKENIZER_DIR = f'{MODEL_DIR}storygpt2tokenizer_ft2/'

##Load Tokenizer

In [4]:
BOS_TOKEN = '<BOS>'
EOS_TOKEN = '<EOS>'
PAD_TOKEN = '<PAD>'
CLS_TOKEN = '[CLS]'
MASK_TOKEN = '[MASK]'

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR)
special_tokens_dict = {'cls_token':CLS_TOKEN, 'bos_token': BOS_TOKEN, 'eos_token': EOS_TOKEN, 'pad_token': PAD_TOKEN, 'mask_token': MASK_TOKEN}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
VOCAB_SIZE = len(tokenizer)
print(VOCAB_SIZE)

52005


##Get n random token sequences from each story

###Load Data from Google Drive

In [10]:
TOK_SEQS_PER_STORY = 100
SEQ_LEN = 256
NUM_PAD_TOKENS = 128
READ_CHUNKSIZE = 100
FILTER_OUT_SHORTER_THAN = 1200
all_token_seqs = []
random.seed(42)

In [11]:
df_filename = bibliotik_train
df = pd.read_csv(df_filename, 
            index_col=0,
            compression={'method': 'gzip', 'compresslevel': 2}, 
            chunksize=READ_CHUNKSIZE, 
            iterator=True)

In [12]:
def get_random_subsequence(seq, subsequence_len):
    start_index = random.randint(0, len(seq) - subsequence_len)
    return seq[start_index:start_index + subsequence_len]

In [None]:
def get_tokens_from_corpus(df, df_len):
  all_token_seqs = []
  _avg_time = 0
  for i in range(df_len // READ_CHUNKSIZE):
    print(f"processings chunk {i}")
    stories = df.get_chunk()['full_text'].to_list()
    # if len(story) <= 100:
    #   
    # stories = stories.
    # print(stories)
    filtered_stories = []
    for story in stories:
      # length = min(len(story) //10, 100)
      # print("STORY_LEN: ",len(story))
      _total_story_len = 0
      if type(story) != type('hello world'):
        print("Unexpected value: ", story)
        continue
      elif len(story) <= FILTER_OUT_SHORTER_THAN: # Roughly the len of 256 tokens?
        print("skipping small story of len ", len(story))
        continue
      else:
        filtered_stories.append(story)
        _total_story_len += len(story)
    # print(len(filtered_stories))
    if len(filtered_stories) == 0:
      continue
    start_time = time.time()
    tokenized_stories = tokenizer.batch_encode_plus(filtered_stories)
    tokenized_stories = tokenized_stories['input_ids']
    _total_story_len = max(1, _total_story_len)
    print(((time.time() - start_time) / _total_story_len)*100000)
    for tokenized_story in tokenized_stories:
      # print()
      story_tokens = [tokenizer.bos_token_id] + tokenized_story + [tokenizer.eos_token_id] + NUM_PAD_TOKENS * [tokenizer.pad_token_id]

      for _ in range(TOK_SEQS_PER_STORY):
        token_seq = get_random_subsequence(story_tokens, SEQ_LEN)
        all_token_seqs.append([token_seq])
      
    # if i >= 50:
    #   break
  return all_token_seqs

all_token_seqs = get_tokens_from_corpus(df, 22500)
print(len(all_token_seqs))
# Chunksize = 100 : 5m 50 s to do first 1000
# Chunksize = 1 : 3m 19 s to do first 453
# Chunksize = 10 : 3m 5 s to do first 500

processings chunk 0
12.102200015629673
processings chunk 1
3.4839452919707017
processings chunk 2
4.8139285614823715
processings chunk 3


In [None]:
# print(all_token_seqs[0:10])
random.shuffle(all_token_seqs)
seqs_df = pd.DataFrame(all_token_seqs, columns=['token_seqs'])
seqs_df.head()

In [None]:
seqs_df.to_csv(f'{DATA_DIR}bibliotik_corpus/biblitik_22500_256_tokenized.gz', compression={'method': 'gzip', 'compresslevel': 2},)

In [None]:
import time
time.sleep(100)
from google.colab import runtime
runtime.unassign()

In [None]:
;;