<a href="https://colab.research.google.com/github/jstephencorey/bert-experiments/blob/main/bert_testing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
on_colab = True
first_run = True # Change this when you disconnect and reconnect the runtime
if on_colab and first_run:
    !git clone https://github.com/jstephencorey/bert-experiments.git  bert_git # Only need to run this on colab
    from pathlib import Path
    import sys
    sys.path.append(str('/content/bert_git')) #Only need on the colab
    print(sys.path)
    !pip install transformers --quiet
    !pip install wandb --quiet

Cloning into 'bert_git'...
remote: Enumerating objects: 50, done.[K
remote: Counting objects: 100% (50/50), done.[K
remote: Compressing objects: 100% (48/48), done.[K
remote: Total 50 (delta 13), reused 16 (delta 1), pack-reused 0[K
Unpacking objects: 100% (50/50), done.
['/content', '/env/python', '/usr/lib/python38.zip', '/usr/lib/python3.8', '/usr/lib/python3.8/lib-dynload', '', '/usr/local/lib/python3.8/dist-packages', '/usr/lib/python3/dist-packages', '/usr/local/lib/python3.8/dist-packages/IPython/extensions', '/root/.ipython', '/content/bert_git']
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m61.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4/182.4 KB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m45.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[

In [30]:
import torch
from torch import nn
import random
from torch import optim
import bert
import wandb
import pandas as pd
import numpy as np
from torch import autocast
wandb.login()



True

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#Import the tokenizers

In [4]:
from transformers import AutoTokenizer

In [5]:
DATA_DIR = '/content/drive/MyDrive/AI_Data/'
MODEL_DIR = '/content/drive/MyDrive/AI_Models/storygpt/'
TOKENIZER_DIR = f'{MODEL_DIR}storygpt2tokenizer_ft2/'

bibliotik_train = f'{DATA_DIR}bibliotik_corpus/biblitik_22500_full.gz'
bibliotik_val = f'{DATA_DIR}bibliotik_corpus/biblitik_7500_val.gz'

bibliotik_train_tokenized = f'{DATA_DIR}bibliotik_corpus/biblitik_22500_train_256_tokenized.gz'
bibliotik_val_tokenized = f'{DATA_DIR}bibliotik_corpus/biblitik_7500_val_256_tokenized.gz'
BIBLIOTIK_TRAIN_TOKENIZED_LEN = 2227700
BIBLIOTIK_VAL_TOKENIZED_LEN = 741200

atlas_shrugged_filename = f'{DATA_DIR}atlas-shrugged.txt'
anthem_filename = f'{DATA_DIR}anthem.txt'

In [6]:
BOS_TOKEN = '<BOS>'
EOS_TOKEN = '<EOS>'
PAD_TOKEN = '<PAD>'
CLS_TOKEN = '[CLS]'
MASK_TOKEN = '[MASK]'

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR)

special_tokens_dict = {'cls_token':CLS_TOKEN, 'bos_token': BOS_TOKEN, 'eos_token': EOS_TOKEN, 'pad_token': PAD_TOKEN, 'mask_token': MASK_TOKEN}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
VOCAB_SIZE = len(tokenizer)
print(VOCAB_SIZE)

52005


##Setup Wandb config

In [7]:
test_parts = True
device= "cuda" if torch.cuda.is_available() else "cpu"

In [8]:
run_number = 0
previous_run_numbers = []
if run_number in previous_run_numbers:
  raise(Exception("Pick a new run number!"))

run_name = f"bert_no_masking_{run_number}"

wandb.init(project="bert_experiments", 
           name=run_name,
           )

wandb.config = {
  "seed":134,
  # Dataloader config
  "batch_size": 2,
  "val_batch_size": 2,
  "load_chunk_size": 6,
  # Model config
  "d_model":512,
  "vocab_length": VOCAB_SIZE,
  "context_len": 256,
  "num_layers":6,
  "feed_forward_dimensions":1024,
  "attention_heads":8,
  "attention_qkv_dims":128,
  "dropout":.1,
  "device":device,
  #Ablation config
  "save_model": None,
  "bert_mask_percentage": None,
  #Training config
  "lr": 1e-4,
  "epochs": 1,
  "batches_per_epoch":2,
  "batches_per_val":2,
}

config = wandb.config

random.seed(config['seed'])
torch.random.manual_seed(config['seed'])

print(config)

[34m[1mwandb[0m: Currently logged in as: [33mjstephencorey[0m ([33msintez[0m). Use [1m`wandb login --relogin`[0m to force relogin


{'seed': 134, 'batch_size': 2, 'val_batch_size': 2, 'load_chunk_size': 6, 'd_model': 512, 'vocab_length': 52005, 'context_len': 256, 'num_layers': 6, 'feed_forward_dimensions': 1024, 'attention_heads': 8, 'attention_qkv_dims': 128, 'dropout': 0.1, 'device': 'cuda', 'save_model': None, 'bert_mask_percentage': None, 'lr': 0.0001, 'epochs': 1, 'batches_per_epoch': 2, 'batches_per_val': 2}


##Get a dataloader

In [9]:
#@title BertChunkDataLoader
class BertChunkDataLoader():
 
  def __init__(self, df_filename, batch_size, df_file_len, load_chunk_size):
    self.df_filename = df_filename
    self.batch_size = batch_size
    self.total_df_len = df_file_len
    self.remaining_df_items = df_file_len
    self.load_chunk_size = load_chunk_size
    self.df = self.init_df()
    self.sequence_buffer = []


  def init_df(self):
     self.remaining_df_items = self.total_df_len
     df = pd.read_csv(self.df_filename, 
                 index_col=0,
                 compression={'method': 'gzip', 'compresslevel': 2}, 
                 chunksize=self.load_chunk_size, 
                 iterator=True)
     return df

  def has_next_batch(self):
    return self.remaining_df_items > self.DF_BUFFER

  def get_next_seq(self):
    if len(self.sequence_buffer) <= 0:
      if self.remaining_df_items < self.load_chunk_size:
        self.df = self.init_df()
      chunk = self.df.get_chunk()
      self.remaining_df_items -= self.load_chunk_size
      for item in chunk['token_seqs']:
        item_arr = self.str_to_arr(item)
        self.sequence_buffer.append(item_arr)
    return self.sequence_buffer.pop()
  
  def str_to_arr(self, text):
    text = text[1:-1]
    arr = np.fromstring(text,sep=',', dtype=int)
    return arr
    # return torch.LongTensor(arr)

  def get_next_batch(self):
    batch = []
    while len(batch) < self.batch_size:
      batch.append(self.get_next_seq())
      # batch.append(torch.LongTensor(self.get_next_seq()))
    if len(batch) > self.batch_size:
      batch = batch[0:self.batch_size]
    # return torch.LongTensor(np.array(batch))
    return np.array(batch)


In [10]:
#@title StoryDataLoader
class StoryDataLoader():
 
  def __init__(self, df_filename, tokenizer, batch_size, df_file_len, num_seqs_per_story, context_len, load_stories_at_a_time):
    self.df_filename = df_filename
    self.tokenizer = tokenizer
    self.batch_size = batch_size
    self.total_df_len = df_file_len
    self.remaining_df_items = df_file_len
    self.num_seqs_per_story = num_seqs_per_story
    self.context_len = context_len
    self.all_subsequences_from_story = []
    self.load_stories_at_a_time = load_stories_at_a_time
    self.DF_BUFFER = 100 + self.load_stories_at_a_time
    self.df = self.init_df()
    self.story_buffer = []


  def init_df(self):
     self.remaining_df_items = self.total_df_len
     df = pd.read_csv(self.df_filename, 
                 index_col=0,
                 compression={'method': 'gzip', 'compresslevel': 2}, 
                 chunksize=self.load_stories_at_a_time, 
                 iterator=True)
     return df

  def _get_random_subsequence(self, seq, subsequence_len):
    start_index = random.randint(0, len(seq) - subsequence_len)
    return seq[start_index:start_index + subsequence_len]

  def has_next_batch(self):
    return self.remaining_df_items > self.DF_BUFFER

  def get_next_story(self):
    if len(self.story_buffer) <= 0:
      chunk = self.df.get_chunk()
      for item in chunk['full_text']:
        self.story_buffer.append(str(item))
    return self.story_buffer.pop()
  
  def get_tokens(self):
    # tokens = []
    while True:
      try:
        story = self.df.get_next_story()
        self.remaining_df_items -= 1 #self.load_stories_at_a_time
      except:
        self.remaining_df_items = 0
        raise Exception("Next batch")
          
      text = story
      tokens = tokenizer.encode(text)
      # tokens = torch.Tensor(tokens).to(torch.long).to(device)
      tokens = [tokenizer.bos_token_id] + tokens + [tokenizer.eos_token_id] + self.context_len * [tokenizer.pad_token_id]
      # print(len(tokens))
      if len(tokens) <= self.context_len:
        print("Story too short, retrying now")
        continue
      else:
        return tokens

  def get_next_batch(self):
    batch = []
    while len(batch) < self.batch_size:
      if len(self.all_subsequences_from_story) == 0:
        tokens = self.get_tokens()
        # tokens = tokens + [tokenizer.pad_token_id] * self.context_len
        # print(tokens)

        for _ in range(self.num_seqs_per_story):
          subseq = self._get_random_subsequence(tokens, self.context_len-1)
          subseq = [tokenizer.cls_token_id] + subseq
          self.all_subsequences_from_story.append(subseq)
      subseq = self.all_subsequences_from_story.pop()
      batch.append(subseq)
    if len(batch) > self.batch_size:
      batch = batch[0:self.batch_size]
    return batch


In [11]:
#@title SingleStoryDataLoader
class SingleStoryDataLoader():
 
  def __init__(self, text_filename, tokenizer, batch_size, context_len):
    self.text_filename = text_filename
    self.tokenizer = tokenizer
    self.batch_size = batch_size
    self.context_len = context_len
    self.all_text_tokens = self.get_text_tokens()


  def get_text_tokens(self):
    with open(self.text_filename) as text_file:
      uncleaned_text = text_file.read()
    tokens = tokenizer.encode(uncleaned_text)
    tokens = [tokenizer.bos_token_id] + tokens + [tokenizer.eos_token_id] + self.context_len * [tokenizer.pad_token_id]
    return tokens


  def _get_random_subsequence(self, seq, subsequence_len):
    start_index = random.randint(0, len(seq) - subsequence_len)
    return seq[start_index:start_index + subsequence_len]

  def has_next_batch(self):
    return True

  def get_next_batch(self):
    batch = []
    while len(batch) < self.batch_size:
      subseq = self._get_random_subsequence(self.all_text_tokens, self.context_len-1)
      subseq = [tokenizer.cls_token_id] + subseq
      batch.append(subseq)
    if len(batch) > self.batch_size:
      batch = batch[0:self.batch_size]
    return batch


In [12]:
#@title SingleStoryBertAssistDataLoader
class SingleStoryBertAssistDataLoader():
 
  def __init__(self, text_filename, tokenizer, batch_size, context_len, bert_model, mask_percentage=.15):
    self.text_filename = text_filename
    self.tokenizer = tokenizer
    self.batch_size = batch_size
    self.context_len = context_len
    self.all_text_tokens = self.get_text_tokens()
    self.bert_model = bert_model
    self.mask_percentage = mask_percentage
    self.mask_token_id = self.tokenizer.mask_token_id # bert_model.tokenizer.mask_token_id

  def get_text_tokens(self):
    with open(self.text_filename) as text_file:
      uncleaned_text = text_file.read()
    tokens = self.tokenizer.encode(uncleaned_text)
    tokens = tokens + self.context_len * [self.tokenizer.pad_token_id]
    return tokens

  def _get_random_subsequence(self, seq, subsequence_len):
    start_index = random.randint(0, len(seq) - subsequence_len)
    return seq[start_index:start_index + subsequence_len]

  def _augment_subsequence(self, seq):
    rand = torch.rand(len(seq))
    mask_selection = rand < self.mask_percentage
    mask_selection = torch.flatten(mask_selection.nonzero()).tolist()
    # print(mask_selection)
    # mask_selection = [2,4,6,8]
    # old_ids = []
    # print("Pre-change text: ",self.tokenizer.decode(seq) )
    for idx in mask_selection:
      # old_ids.append(seq[idx])
      seq[idx] = self.mask_token_id
    # text = self.tokenizer.decode(seq)
    processed_seq = torch.Tensor([seq]).to(torch.long).to(device)
    # print(seq)
    # print("Conversion text: ",self.tokenizer.decode(seq) )
    new_words = self.bert_model(processed_seq)
    # print(unmasked)
    new_words = new_words['logits'][0]
    # print(new_words)
    # print(seq)
    for idx, new_word in enumerate(new_words):
      if idx not in mask_selection:
        continue
      else:
        new_word_id = torch.argmax(new_word)
        # print(seq, new_word_id)
        seq[idx] = int(new_word_id)
    return seq

  def has_next_batch(self):
    return True

  def get_next_batch(self):
    batch = []
    while len(batch) < self.batch_size:
      subseq = self._get_random_subsequence(self.all_text_tokens, self.context_len-1)
      subseq = self._augment_subsequence(subseq)
      subseq = [tokenizer.cls_token_id] + subseq + 50 * [tokenizer.pad_token_id]
      subseq = subseq[0:self.context_len]
      batch.append(subseq)
    if len(batch) > self.batch_size:
      batch = batch[0:self.batch_size]
    return batch

In [13]:
train_dataloader = BertChunkDataLoader(\
                                       df_filename=bibliotik_train_tokenized,
                                       batch_size=config["batch_size"],
                                       df_file_len=BIBLIOTIK_TRAIN_TOKENIZED_LEN,
                                       load_chunk_size=config["load_chunk_size"],
                                      )
val_dataloader = BertChunkDataLoader(\
                                       df_filename=bibliotik_val_tokenized,
                                       batch_size=config["val_batch_size"],
                                       df_file_len=BIBLIOTIK_VAL_TOKENIZED_LEN,
                                       load_chunk_size=config["load_chunk_size"],
                                      )

In [14]:
if test_parts:
  for i in range(4):
    batch = train_dataloader.get_next_batch()
    print(len(batch), type(batch[0][0:10]), batch[0][0:10])
  for i in range(4):
    batch = val_dataloader.get_next_batch()
    print(len(batch), batch[0][0:10])

2 <class 'numpy.ndarray'> [3011 1228  395  199  199   41 3043   14  371  989]
2 <class 'numpy.ndarray'> [  824    12 15738   291   821 38989   432   433    13    19]
2 <class 'numpy.ndarray'> [ 9424 12990  1983  7146   439   199 11586  4151    12  2339]
2 <class 'numpy.ndarray'> [ 3492   199   199    22   287  1619 35792   335  1645 19144]
2 [24407    12  1100   288   261  1332  5123 25650   376   658]
2 [  14 1178  744   12  348  894 4429  261 9866 3279]
2 [  434 11962 10251   866   291   261  8146    14   199   199]
2 [  854   309   261  5903   338   261 47735 15506   348  9095]


##Setup masking

In [15]:
def mask_batch_do_nothing(batch):
  masked_batch = []
  for row in batch:
    # print(row)
    masked_row = row.copy()
    # masked_row[2] = 9
    # masked_batch = np.concatenate((masked_batch,masked_row))
    masked_batch.append(masked_row)
  masked_batch = np.array(masked_batch)
  return format_batches(masked_batch, batch)

def format_batches(masked_batch, batch):
  masked_batch = torch.LongTensor(masked_batch).to(config["device"])
  batch = torch.LongTensor(batch).to(config["device"])
  return masked_batch, batch

In [16]:
if test_parts:
  batch = train_dataloader.get_next_batch()
  masked_batch, batch = mask_batch_do_nothing(batch)
  print(np.shape(masked_batch))
  print(masked_batch[0][0:20])
  print(batch[0][0:20])

torch.Size([2, 256])
tensor([ 5126,    12,  2013,    12,   261,  4440, 13137, 27544,   443, 11707,
          381,  3407,   288, 17283,   961,   381,  7414,    14,   199,   199],
       device='cuda:0')
tensor([ 5126,    12,  2013,    12,   261,  4440, 13137, 27544,   443, 11707,
          381,  3407,   288, 17283,   961,   381,  7414,    14,   199,   199],
       device='cuda:0')


#Set up the model

In [17]:
model = bert.BertModel(d_model = config["d_model"], 
                        vocab_length = config["vocab_length"], 
                        sequence_length = config['context_len'],
                        num_layers = config["num_layers"], 
                        feed_forward_dimensions = config["feed_forward_dimensions"], 
                        attention_heads = config["attention_heads"],
                        attention_qkv_dims =  config["attention_qkv_dims"], 
                        dropout = config["dropout"], 
                        pad_idx = tokenizer.pad_token_id, 
                        device = config["device"])

In [18]:
if test_parts:
    inp = torch.LongTensor([[0,1,2,3,4,5,6,7,8,29],[1,3,4,5,6,8,0,22,1,1]])
    out = model(inp)
    print(out.size())

torch.Size([2, 10, 52005])


In [37]:
model = model.to(config["device"])
model = model.cuda(0)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [41]:
# batch = torch.LongTensor(train_dataloader.get_next_batch())
# print(batch.size())
# input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(*batch))


for epoch in range(config["epochs"]):
    model.train()
    print(f"Beginning Epoch {epoch}")
    for i in range(config["batches_per_epoch"]):
      with autocast('cuda'):
        macked_batch, batch = mask_batch_do_nothing(train_dataloader.get_next_batch())
        optimizer.zero_grad()
        print(batch.device)
        # print(model.type())
        logits_lm = model.to(config["device"])(batch.to(config["device"]))
        print(logits_lm.size(), batch.size())
        loss_lm = criterion(logits_lm.transpose(1, 2), batch) # for masked LM
        loss_lm = (loss_lm.float()).mean()
        # loss_clsf = criterion(logits_clsf, isNext) # for sentence classification
        wandb.log({"Loss": loss_lm})
        if (epoch + 1) % 10 == 0:
            print('Epoch:', '%04d' % (epoch + 1), 'loss =', '{:.6f}'.format(loss_lm))
        loss_lm.backward()
        optimizer.step()
    for j in range(config["batches_per_val"]):
      macked_batch, batch = mask_batch_do_nothing(val_dataloader.get_next_batch())
      optimizer.zero_grad()
      logits_lm = model(batch)
      loss_lm = criterion(logits_lm.transpose(1, 2), batch) # for masked LM
      loss_lm = (loss_lm.float()).mean()
      wandb.log({"Validation Loss": loss_lm})

Beginning Epoch 0
cuda:0


RuntimeError: ignored