<a href="https://colab.research.google.com/github/hsong-77/transformer-practice/blob/main/transformers-from-scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers==4.11.3
!pip install datasets
!pip install psutil
!pip install accelerate==0.5.1

In [None]:
from transformers import pipeline, set_seed

# One text-generation pipeline per checkpoint ("openai-gpt" and "gpt2"); both are
# compared in the cells that follow.
generation_gpt = pipeline("text-generation", model="openai-gpt")
generation_gpt2 = pipeline("text-generation", model="gpt2")

In [None]:
def model_size(model):
  """Return the total element count over everything `model.parameters()` yields."""
  total = 0
  for param in model.parameters():
    total += param.numel()
  return total

# Report each pipeline's parameter count in millions (raw count / 1000**2).
print(f"GPT  size: {model_size(generation_gpt.model)/1000**2:.1f}M parameters")
print(f"GPT2 size: {model_size(generation_gpt2.model)/1000**2:.1f}M parameters")

In [None]:
def enum_pipeline_outputs(pipe, prompt, num_return_sequences):
  """Run `pipe` on `prompt` and return its generations as a numbered, newline-separated string.

  Each line has the form "<n>.<generated_text>", with n starting at 1.
  """
  generations = pipe(prompt, num_return_sequences=num_return_sequences, clean_up_tokenization_spaces=True)
  numbered = []
  for i, generation in enumerate(generations):
    numbered.append(f"{i+1}." + generation["generated_text"])
  return "\n".join(numbered)

# Sample three continuations of the same prompt from each pipeline and print them
# as numbered lists, one list per model.
prompt = "\nWhen they came back"
print("GPT completions:\n" + enum_pipeline_outputs(generation_gpt, prompt, 3))
print("")
print("GPT-2 completions:\n" + enum_pipeline_outputs(generation_gpt2, prompt, 3))

In [None]:
from tqdm.auto import tqdm
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode

# Start from the pretrained "gpt2" tokenizer; it is retrained on a new corpus at the end of this cell.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Invert the mapping returned by bytes_to_unicode() and keep its unicode characters
# as the alphabet handed to the tokenizer trainer.
byte_to_unicode_map = bytes_to_unicode()
unicode_to_byte_map = dict((v, k) for k, v in byte_to_unicode_map.items())
base_vocab = list(unicode_to_byte_map.keys())

# `length` bounds how many examples batch_iterator() draws from the streamed training split.
length = 100000
dataset = load_dataset("transformersbook/codeparrot-train", split="train", streaming=True)
# One shared iterator, so every batch continues where the previous one stopped.
iter_dataset = iter(dataset)

def batch_iterator(batch_size=10):
  """Yield lists of up to `batch_size` "content" strings drawn from the shared `iter_dataset`.

  About `length` examples are requested in total (one batch per step of
  range(0, length, batch_size)). If the stream runs dry first, the final batch is
  shorter and iteration ends cleanly.
  """
  from itertools import islice

  for _ in tqdm(range(0, length, batch_size)):
    # islice stops quietly at the end of the stream. Calling next() inside a
    # comprehension here would leak StopIteration into this generator, which
    # Python turns into a RuntimeError (PEP 479).
    batch = [example["content"] for example in islice(iter_dataset, batch_size)]
    if not batch:
      return
    yield batch

# Train a new tokenizer on the streamed batches with a target vocabulary of 12,500 entries,
# passing base_vocab as the initial alphabet.
new_tokenizer = tokenizer.train_new_from_iterator(batch_iterator(), vocab_size=12500, initial_alphabet=base_vocab)

In [None]:
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM

# Build a GPT-2 model from configuration only, sized for the code tokenizer.
# `vocab_size` must be spelled exactly: a misspelled keyword is just stored as an
# unused attribute, leaving the model with GPT-2's own vocabulary size instead of
# len(tokenizer).
tokenizer = AutoTokenizer.from_pretrained("transformersbook/codeparrot-small-vocabulary")
config = AutoConfig.from_pretrained("gpt2", vocab_size=len(tokenizer))
model = AutoModelForCausalLM.from_config(config)

In [None]:
# Parameter count, in millions, of the `model` built in the previous cell.
print(f'GPT-2 size: {model_size(model)/1000**2:.1f}M parameters')

In [None]:
# Estimate the average number of characters per token over the first `examples`
# documents of the streamed training split.
examples, total_characters, total_tokens = 500, 0, 0
dataset = load_dataset("transformersbook/codeparrot-train", split="train", streaming=True)

# zip() with range(examples) cuts the stream off after `examples` items.
for _, example in tqdm(zip(range(examples), iter(dataset)), total=examples):
  total_characters += len(example["content"])
  total_tokens += len(tokenizer(example["content"]).tokens())

# Bare last expression so the notebook displays the ratio.
characters_per_token = total_characters / total_tokens
characters_per_token

In [None]:
import torch
from torch.utils.data import IterableDataset

class ConstantLengthDataset(IterableDataset):
  """Stream fixed-length token sequences built from a dataset of text examples.

  Examples (items exposing a "content" string) are collected into a character
  buffer of roughly seq_length * chars_per_token * num_of_sequences characters.
  The buffer is tokenized in one call, the token lists are joined with the
  tokenizer's EOS id, and the result is cut into chunks of exactly `seq_length`
  tokens. A trailing chunk shorter than `seq_length` is dropped.

  Args:
    tokenizer: callable returning a mapping with "input_ids"; must expose `eos_token_id`.
    dataset: iterable of items with a "content" entry.
    seq_length: number of tokens in every yielded tensor.
    num_of_sequences: sizing factor for the character buffer.
    chars_per_token: estimated characters per token, used to size the buffer.
    infinite: when True (the default, matching the original behaviour) the dataset
      is restarted every time it is exhausted, so iteration never ends by itself.
      Pass False to stop after a single pass, e.g. for a validation stream that is
      consumed to the end.
  """

  def __init__(self, tokenizer, dataset, seq_length=1024, num_of_sequences=1024, chars_per_token=3.4, infinite=True):
    self.tokenizer = tokenizer
    self.concat_token_id = tokenizer.eos_token_id
    self.dataset = dataset
    self.seq_length = seq_length
    self.input_characters = seq_length * chars_per_token * num_of_sequences
    self.infinite = infinite

  def __iter__(self):
    iterator = iter(self.dataset)
    more_examples = True
    # Characters contributed by the current pass over the dataset. A pass that
    # contributes nothing (empty dataset, or only empty strings) would otherwise
    # be restarted forever without the buffer ever filling.
    chars_this_pass = 0
    while more_examples:
      buffer, buffer_len = [], 0
      while True:
        if buffer_len >= self.input_characters:
          print(f"Buffer full: {buffer_len}>={self.input_characters:.0f}")
          break
        try:
          print(f"Fill buffer: {buffer_len}<{self.input_characters:.0f}")
          buffer.append(next(iterator)["content"])
          buffer_len += len(buffer[-1])
          chars_this_pass += len(buffer[-1])
        except StopIteration:
          if self.infinite and chars_this_pass > 0:
            iterator = iter(self.dataset)
            chars_this_pass = 0
          else:
            # Finite mode or an unproductive pass: flush what is buffered and stop.
            more_examples = False
            break

      if not buffer:
        break

      all_token_ids = []
      tokenized_inputs = self.tokenizer(buffer, truncation=False)
      for tokenized_input in tokenized_inputs["input_ids"]:
        all_token_ids.extend(tokenized_input + [self.concat_token_id])

      for i in range(0, len(all_token_ids), self.seq_length):
        input_ids = all_token_ids[i:i+self.seq_length]
        # Only full-length chunks are emitted.
        if len(input_ids) == self.seq_length:
          yield torch.tensor(input_ids)

In [None]:
# Smoke test: wrap a shuffled stream in ConstantLengthDataset, draw five sequences
# and print their lengths.
shuffled_dataset = dataset.shuffle(buffer_size=100)
constant_length_dataset = ConstantLengthDataset(tokenizer, shuffled_dataset, num_of_sequences=10)
dataset_iterator = iter(constant_length_dataset)

# zip() with range(5) limits the draw to five items.
lengths = [len(b) for _, b in zip(range(5), dataset_iterator)]
print(f"Lengths of the sequences: {lengths}")

In [None]:
from argparse import Namespace

# Training hyperparameters. They are exposed below as attributes of `args`
# (an argparse.Namespace), which the training helpers read as globals.
# NOTE(review): this rebinds `config`, which earlier held the model configuration.
config = {"train_batch_size": 12,
          "valid_batch_size": 12,
          "weight_decay": 0.1,
          "shuffle_buffer": 1000,
          "learning_rate": 5e-4,
          "lr_scheduler_type": "cosine",
          "num_warmup_steps": 2000,
          "gradient_accumulation_steps": 1,
          "max_train_steps": 150000,
          "max_eval_steps": -1,  # evaluate() only applies this cap when it is > 0
          "seq_length": 1024,
          "seed": 1,
          "save_checkpoint_steps": 15000}

args = Namespace(**config)

In [None]:
!pip install wandb

In [None]:
from accelerate import Accelerator
from torch.utils.tensorboard import SummaryWriter
import logging
import wandb

# Module-level Accelerator; setup_logging() and log_metrics() read its is_main_process flag.
accelerator = Accelerator()

def setup_logging(project_name):
  """Configure logging, Weights & Biases and TensorBoard for the current process.

  On the main process a wandb run and a TensorBoard SummaryWriter are created and
  the hyperparameters in `args` are recorded. All other processes get no writer,
  an empty run name, and error-only logging so they stay quiet.

  Args:
    project_name: wandb project to log to.

  Returns:
    (logger, tb_writer, run_name); tb_writer is None and run_name is "" off the
    main process.
  """
  # Imported locally because the notebook only uses `from datasets import ...` and
  # `from transformers import ...`, which do not bind the package names used below.
  import datasets
  import transformers

  logger = logging.getLogger(__name__)
  # Only a stream handler is installed; nothing is written to a log file.
  logging.basicConfig(format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
                      datefmt="%m/%d/%Y %H:%M:%S",
                      level=logging.INFO,
                      handlers=[logging.StreamHandler()])
  if accelerator.is_main_process:
    wandb.init(project=project_name, config=args)
    run_name = wandb.run.name
    tb_writer = SummaryWriter()
    tb_writer.add_hparams(vars(args), {'0': 0})
    logger.setLevel(logging.INFO)
    datasets.utils.logging.set_verbosity_debug()
    transformers.utils.logging.set_verbosity_debug()
  else:
    tb_writer = None
    run_name = ""
    logger.setLevel(logging.ERROR)
    # Match the ERROR level chosen above instead of enabling debug output.
    datasets.utils.logging.set_verbosity_error()
    transformers.utils.logging.set_verbosity_error()

  return logger, tb_writer, run_name

def log_metrics(step, metrics):
  """Log `metrics` for `step`; on the main process also send them to wandb and TensorBoard.

  Args:
    step: step number attached to the log line and to each TensorBoard scalar.
    metrics: mapping of metric name to value.
  """
  logger.info(f"Step {step}: {metrics}")
  if not accelerator.is_main_process:
    return
  wandb.log(metrics)
  for name, value in metrics.items():
    tb_writer.add_scalar(name, value, step)

In [None]:
from torch.utils.data.dataloader import DataLoader

def create_dataloaders(dataset_name):
  """Build the training and validation DataLoaders for `dataset_name`.

  The training split is read from dataset_name + '-train' and shuffled with the
  buffer size and seed in `args`; the validation split is read from
  dataset_name + '-valid' and left unshuffled. Both are streamed and wrapped in
  ConstantLengthDataset.

  Returns:
    (train_dataloader, eval_dataloader)
  """
  def as_constant_length(stream):
    # Shared wrapping step for both splits.
    return ConstantLengthDataset(tokenizer, stream, seq_length=args.seq_length)

  train_stream = load_dataset(dataset_name+'-train', split="train", streaming=True).shuffle(
      buffer_size=args.shuffle_buffer, seed=args.seed)
  valid_stream = load_dataset(dataset_name+'-valid', split="validation", streaming=True)

  return (DataLoader(as_constant_length(train_stream), batch_size=args.train_batch_size),
          DataLoader(as_constant_length(valid_stream), batch_size=args.valid_batch_size))

In [None]:
def get_grouped_params(model, no_decay=("bias", "LayerNorm.weight", "ln_1.weight", "ln_2.weight", "ln_f.weight")):
  """Split the model's parameters into weight-decay and no-weight-decay optimizer groups.

  A parameter goes into the no-decay group when any entry of `no_decay` occurs as
  a substring of its name. The default covers biases, "LayerNorm.weight", and the
  GPT-2 layer-norm names (ln_1, ln_2, ln_f); the "LayerNorm.weight" pattern alone
  does not match those GPT-2 names.

  Args:
    model: object exposing `named_parameters()`.
    no_decay: iterable of name fragments that exempt a parameter from weight decay.

  Returns:
    Two parameter-group dicts: the first uses args.weight_decay, the second 0.0.
  """
  params_with_wd, params_without_wd = [], []
  for n, p in model.named_parameters():
    if any(nd in n for nd in no_decay):
      params_without_wd.append(p)
    else:
      params_with_wd.append(p)

  return [{"params": params_with_wd, 'weight_decay': args.weight_decay},
          {'params': params_without_wd, 'weight_decay': 0.0}]

In [None]:
def evaluate():
  """Evaluate the global `model` on `eval_dataloader` and return (mean loss, perplexity).

  Each batch loss is repeated valid_batch_size times and gathered through the
  accelerator before averaging. When args.max_eval_steps is positive, at most that
  many batches are evaluated.

  NOTE(review): apart from the max_eval_steps cap, the loop only ends when
  eval_dataloader is exhausted. If the loader streams indefinitely,
  max_eval_steps must be > 0 for this function to return.

  Returns:
    (loss, perplexity) as Python floats; perplexity is inf when exp(loss) overflows.
  """
  model.eval()
  losses = []
  # Count from 1 so the cap means "number of batches evaluated".
  for step, batch in enumerate(eval_dataloader, start=1):
    with torch.no_grad():
      outputs = model(batch, labels=batch)
    loss = outputs.loss.repeat(args.valid_batch_size)
    losses.append(accelerator.gather(loss))
    if args.max_eval_steps > 0 and step >= args.max_eval_steps:
      break

  loss = torch.mean(torch.cat(losses))
  # torch.exp saturates to inf on overflow rather than raising, so no
  # OverflowError handling is needed.
  perplexity = torch.exp(loss)

  return loss.item(), perplexity.item()

In [None]:
# The optimizer must come from torch.optim: `transformers.utils.dummy_pt_objects`
# only holds placeholder classes, not a working AdamW.
from torch.optim import AdamW
from transformers import get_scheduler, set_seed

# `project_name` supplies the wandb project (the part after the "/").
# `dataset_name` is the dataset prefix; create_dataloaders() appends "-train" and
# "-valid" itself, so the suffix must not be included here.
project_name = "transformersbook/codeparrot"
dataset_name = "transformersbook/codeparrot"

set_seed(args.seed)

accelerator = Accelerator()
samples_per_step = accelerator.state.num_processes * args.train_batch_size

logger, tb_writer, run_name = setup_logging(project_name.split("/")[1])
logger.info(accelerator.state)

train_dataloader, eval_dataloader = create_dataloaders(dataset_name)

optimizer = AdamW(get_grouped_params(model), lr=args.learning_rate)
lr_scheduler = get_scheduler(name=args.lr_scheduler_type, optimizer=optimizer,
                             num_warmup_steps=args.num_warmup_steps,
                             num_training_steps=args.max_train_steps)
def get_lr():
  """Return the current learning rate of the first optimizer parameter group."""
  return optimizer.param_groups[0]["lr"]

# Prepare everything with our `accelerator`
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(model, optimizer, train_dataloader, eval_dataloader)

# train
model.train()
completed_steps = 0
for step, batch in enumerate(train_dataloader, start=1):
  loss = model(batch, labels=batch).loss
  log_metrics(step, {"lr": get_lr(), "samples": step * samples_per_step,
                     "steps": completed_steps, "loss/train": loss.item()})
  # Scale the loss so gradients accumulated over several batches average out.
  loss = loss / args.gradient_accumulation_steps
  accelerator.backward(loss)

  if step % args.gradient_accumulation_steps == 0:
    optimizer.step()
    lr_scheduler.step()
    optimizer.zero_grad()
    completed_steps += 1
  if step % args.save_checkpoint_steps == 0:
    logger.info("Evaluating model checkpoint")
    eval_loss, perplexity = evaluate()
    log_metrics(step, {"loss/eval": eval_loss, "perplexity": perplexity})
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    if accelerator.is_main_process:
      unwrapped_model.save_pretrained("./")
    # evaluate() switched the model to eval mode; switch back before continuing.
    model.train()

  if completed_steps >= args.max_train_steps:
    break

# the last checkpoint
logger.info("Evaluating model checkpoint")
eval_loss, perplexity = evaluate()
log_metrics(step, {"loss/eval": eval_loss, "perplexity": perplexity})
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
if accelerator.is_main_process:
  unwrapped_model.save_pretrained("./")

In [None]:
from transformers import pipeline, set_seed

# Load a text-generation pipeline from the "transformersbook/codeparrot-small" checkpoint.
ckpt = "transformersbook/codeparrot-small"
generation = pipeline("text-generation", model=ckpt)

In [None]:
import re

def first_block(string):
  """Return the text before the first line that starts a new top-level construct.

  The cut happens at the first newline followed by `class`, `def`, `#`, `@`,
  `print` or `if`; trailing whitespace is stripped from the result.
  """
  head, *_ = re.split('\nclass|\ndef|\n#|\n@|\nprint|\nif', string, maxsplit=1)
  return head.rstrip()

def complete_code(pipe, prompt, max_length=64, num_completions=4, seed=1):
  """Sample completions of `prompt` from `pipe` and print their first code blocks.

  Args:
    pipe: text-generation pipeline to sample from.
    prompt: code prefix; it is stripped from each generation before printing.
    max_length: passed to the pipeline as `max_length`.
    num_completions: number of sequences to sample.
    seed: passed to set_seed() before sampling.

  Prints the completions separated by a line of 80 '=' characters; returns None.
  """
  set_seed(seed)
  gen_kwargs = {"temperature":0.4, "top_p":0.95, "top_k":0, "num_beams":1, "do_sample":True}
  # Use the pipeline that was passed in, not a module-level one.
  code_gens = pipe(prompt, num_return_sequences=num_completions, max_length=max_length, **gen_kwargs)
  code_strings = [first_block(code_gen['generated_text'][len(prompt):]) for code_gen in code_gens]
  print(('\n'+'='*80 + '\n').join(code_strings))

In [None]:
# Prompt made of a function signature and its docstring; complete_code() prints
# the sampled continuations.
prompt = '''def area_of_rectangle(a: float, b: float):
         """Return the area of the rectangle."""'''
complete_code(generation, prompt)