Generating the books3 corpus of 256-length token chunks for BERT training

In [1]:
!pip install transformers --quiet

In [2]:
import pandas as pd
import numpy as np
from google.colab import drive
import os
import random
import time
import random
from transformers import set_seed, AutoTokenizer

drive.mount('/content/drive', force_remount=False)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
DATA_DIR = '/content/drive/MyDrive/AI_Data/'
bibliotik_train = f'{DATA_DIR}bibliotik_corpus/biblitik_22500_full.gz'
bibliotik_val = f'{DATA_DIR}bibliotik_corpus/biblitik_7500_val.gz'

MODEL_DIR = '/content/drive/MyDrive/AI_Models/storygpt/'
TOKENIZER_DIR = f'{MODEL_DIR}storygpt2tokenizer_ft2/'

##Load Tokenizer

In [4]:
BOS_TOKEN = '<BOS>'
EOS_TOKEN = '<EOS>'
PAD_TOKEN = '<PAD>'
CLS_TOKEN = '[CLS]'
MASK_TOKEN = '[MASK]'

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR)
special_tokens_dict = {'cls_token':CLS_TOKEN, 'bos_token': BOS_TOKEN, 'eos_token': EOS_TOKEN, 'pad_token': PAD_TOKEN, 'mask_token': MASK_TOKEN}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
VOCAB_SIZE = len(tokenizer)
print(VOCAB_SIZE)

52005


##Get n random token sequences from each story

###Load Data from Google Drive

In [5]:
TOK_SEQS_PER_STORY = 100
SEQ_LEN = 256
NUM_PAD_TOKENS = 128
READ_CHUNKSIZE = 100
FILTER_OUT_SHORTER_THAN = 1200
all_token_seqs = []
random.seed(42)

In [7]:
def get_random_subsequence(seq, subsequence_len):
    start_index = random.randint(0, len(seq) - subsequence_len)
    return seq[start_index:start_index + subsequence_len]

In [8]:
def get_tokens_from_corpus(df, df_len):
  all_token_seqs = []
  _avg_time = 0
  for i in range(df_len // READ_CHUNKSIZE):
    print(f"processings chunk {i}")
    stories = df.get_chunk()['full_text'].to_list()
    # if len(story) <= 100:
    #   
    # stories = stories.
    # print(stories)
    filtered_stories = []
    for story in stories:
      # length = min(len(story) //10, 100)
      # print("STORY_LEN: ",len(story))
      _total_story_len = 0
      if type(story) != type('hello world'):
        print("Unexpected value: ", story)
        continue
      elif len(story) <= FILTER_OUT_SHORTER_THAN: # Roughly the len of 256 tokens?
        print("skipping small story of len ", len(story))
        continue
      else:
        filtered_stories.append(story)
        _total_story_len += len(story)
    # print(len(filtered_stories))
    if len(filtered_stories) == 0:
      continue
    start_time = time.time()
    tokenized_stories = tokenizer.batch_encode_plus(filtered_stories)
    tokenized_stories = tokenized_stories['input_ids']
    _total_story_len = max(1, _total_story_len)
    print(((time.time() - start_time) / _total_story_len)*100000)
    for tokenized_story in tokenized_stories:
      # print()
      story_tokens = [tokenizer.bos_token_id] + tokenized_story + [tokenizer.eos_token_id] + NUM_PAD_TOKENS * [tokenizer.pad_token_id]

      for _ in range(TOK_SEQS_PER_STORY):
        token_seq = get_random_subsequence(story_tokens, SEQ_LEN)
        all_token_seqs.append([token_seq])
      
    # if i >= 50:
    #   break
  return all_token_seqs

# Chunksize = 100 : 5m 50 s to do first 1000
# Chunksize = 1 : 3m 19 s to do first 453
# Chunksize = 10 : 3m 5 s to do first 500

processings chunk 0


Token indices sequence length is longer than the specified maximum sequence length for this model (17379 > 1024). Running this sequence through the model will result in indexing errors


6.629597433212126
processings chunk 1
1.370330009709133
processings chunk 2
2.0199161285907143
processings chunk 3
4.710222500577237
processings chunk 4
Unexpected value:  nan
0.5949286279810837
processings chunk 5
2.851257197624746
processings chunk 6
Unexpected value:  nan
Unexpected value:  nan
10.168851006694773
processings chunk 7
Unexpected value:  nan
Unexpected value:  nan
1.4869306956339592
processings chunk 8
skipping small story of len  137
skipping small story of len  98
skipping small story of len  449
2.6989226617214976
processings chunk 9
6.813844334877966
processings chunk 10
2.6858672451223478
processings chunk 11
5.1501487167285065
processings chunk 12
skipping small story of len  188
32.389130340612894
processings chunk 13
1.2356877392844636
processings chunk 14
skipping small story of len  1194
4.813658448261086
processings chunk 15
Unexpected value:  nan
skipping small story of len  835
2.8545560175228153
processings chunk 16
skipping small story of len  371
4.6551

In [6]:
df_filename = bibliotik_train
df = pd.read_csv(df_filename, 
            index_col=0,
            compression={'method': 'gzip', 'compresslevel': 2}, 
            chunksize=READ_CHUNKSIZE, 
            iterator=True)

In [None]:
all_token_seqs = get_tokens_from_corpus(df, 22500)
print(len(all_token_seqs))

In [9]:
random.shuffle(all_token_seqs)
seqs_df = pd.DataFrame(all_token_seqs, columns=['token_seqs'])
seqs_df.head()

Unnamed: 0,token_seqs
0,"[818, 12, 261, 8305, 73, 4253, 355, 361, 31490..."
1,"[9424, 12990, 1983, 7146, 439, 199, 11586, 415..."
2,"[259, 37875, 646, 323, 362, 1109, 14, 5330, 18..."
3,"[824, 12, 15738, 291, 821, 38989, 432, 433, 13..."
4,"[1267, 5119, 12, 14091, 12, 2368, 309, 9974, 4..."


In [10]:
seqs_df.to_csv(f'{DATA_DIR}bibliotik_corpus/biblitik_22500_train_256_tokenized.gz', compression={'method': 'gzip', 'compresslevel': 2},)

In [None]:
df_filename = bibliotik_val
df = pd.read_csv(df_filename, 
            index_col=0,
            compression={'method': 'gzip', 'compresslevel': 2}, 
            chunksize=READ_CHUNKSIZE, 
            iterator=True)
all_token_seqs = get_tokens_from_corpus(df, 7500)
print(len(all_token_seqs))

In [None]:
seqs_df.to_csv(f'{DATA_DIR}bibliotik_corpus/biblitik_7500_val_256_tokenized.gz', compression={'method': 'gzip', 'compresslevel': 2},)

In [None]:
import time
time.sleep(100)
from google.colab import runtime
runtime.unassign()