In [1]:
# from utils import *
from transformers import pipeline, set_seed
from datasets import load_dataset, DownloadConfig
from transformers import AutoTokenizer
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm

In [2]:
# download_config = DownloadConfig(delete_extracted=True)
# dataset = load_dataset("json", data_dir="C:/Users/takat/Documents/Programming/Training/data/train.json", streaming=True,)
# dataset = load_dataset("hakatiki/guttenberg-books-corpus",split="train",streaming=True,)

dataset = load_dataset("data/validation",split="validation",streaming=True,)

Using custom data configuration validation-bc27a78d63377863


In [3]:
dataset
iter_dataset = iter(dataset)

In [4]:
example = next(iter_dataset)
example

{'text': '> '}

In [5]:
tokenizer = AutoTokenizer.from_pretrained("hakatiki/hu-gpt")

OSError: Can't load tokenizer for 'hakatiki/hu-gpt'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'hakatiki/hu-gpt' is the correct path to a directory containing all relevant files for a GPT2TokenizerFast tokenizer.

In [9]:
tokenizer = AutoTokenizer.from_pretrained("hakatiki/hu-gpt")
config = AutoConfig.from_pretrained("gpt2", vocab_size=len(tokenizer))
model = AutoModelForCausalLM.from_config(config)

In [10]:
def model_size(model):
    return sum(t.numel() for t in model.parameters())
print(f'GPT-2 size: {model_size(model)/1000**2:.1f}M parameters')

GPT-2 size: 105.5M parameters


In [12]:
# TODO change it back to True
model.save_pretrained("models/hu-gpt", push_to_hub=True,
                      repo_id="hakatiki/hu-gpt")

In [34]:
examples, total_characters, total_tokens = 500, 0, 0


for _, example in tqdm(zip(range(examples), iter(dataset)), total=examples):
    total_characters += len(example['text'])
    total_tokens += len(tokenizer(example['text']).tokens())

characters_per_token = total_characters / total_tokens

100%|██████████| 500/500 [00:09<00:00, 51.89it/s]


In [35]:
import torch
from torch.utils.data import IterableDataset

class ConstantLengthDataset(IterableDataset):
    
    def __init__(self, tokenizer, dataset, seq_length=1024,
                 num_of_sequences=1024, chars_per_token=3.6):
        self.tokenizer = tokenizer
        self.concat_token_id = tokenizer.eos_token_id
        self.dataset = dataset
        self.seq_length = seq_length
        self.input_characters = seq_length * chars_per_token * num_of_sequences
    
    def __iter__(self):
        iterator = iter(self.dataset)
        more_examples = True
        while more_examples:
            buffer, buffer_len = [], 0
            while True:
                if buffer_len >= self.input_characters:
                    m=f"Buffer full: {buffer_len}>={self.input_characters:.0f}"
                    print(m)
                    break
                try:
                    m=f"Fill buffer: {buffer_len}<{self.input_characters:.0f}"
                    print(m)
                    buffer.append(next(iterator)["text"])
                    buffer_len += len(buffer[-1])
                except StopIteration:
                    iterator = iter(self.dataset)

            all_token_ids = []
            tokenized_inputs = self.tokenizer(buffer, truncation=False)
            for tokenized_input in tokenized_inputs['input_ids']:
                all_token_ids.extend(tokenized_input + [self.concat_token_id])
            
            for i in range(0, len(all_token_ids), self.seq_length):
                input_ids = all_token_ids[i : i + self.seq_length]
                if len(input_ids) == self.seq_length:
                    yield torch.tensor(input_ids)

In [36]:
# TODO not sure about shuffling
# shuffled_dataset = dataset.shuffle(buffer_size=100)
constant_length_dataset = ConstantLengthDataset(tokenizer, dataset,
                                                num_of_sequences=10)
dataset_iterator = iter(constant_length_dataset)

lengths = [len(b) for _, b in zip(range(5), dataset_iterator)]
print(f"Lengths of the sequences: {lengths}")

Fill buffer: 0<36864
Fill buffer: 71<36864
Fill buffer: 71<36864
Fill buffer: 135<36864
Fill buffer: 203<36864
Fill buffer: 270<36864
Fill buffer: 316<36864
Fill buffer: 316<36864
Fill buffer: 316<36864
Fill buffer: 343<36864
Fill buffer: 343<36864
Fill buffer: 365<36864
Fill buffer: 365<36864
Fill buffer: 406<36864
Fill buffer: 406<36864
Fill buffer: 425<36864
Fill buffer: 425<36864
Fill buffer: 454<36864
Fill buffer: 454<36864
Fill buffer: 520<36864
Fill buffer: 520<36864
Fill buffer: 520<36864
Fill buffer: 520<36864
Fill buffer: 520<36864
Fill buffer: 576<36864
Fill buffer: 633<36864
Fill buffer: 657<36864
Fill buffer: 657<36864
Fill buffer: 657<36864
Fill buffer: 657<36864
Fill buffer: 657<36864
Fill buffer: 657<36864
Fill buffer: 657<36864
Fill buffer: 672<36864
Fill buffer: 672<36864
Fill buffer: 692<36864
Fill buffer: 692<36864
Fill buffer: 692<36864
Fill buffer: 692<36864
Fill buffer: 692<36864
Fill buffer: 723<36864
Fill buffer: 723<36864
Fill buffer: 723<36864
Fill buffer: 72

In [37]:
from argparse import Namespace

# Commented parameters correspond to the small model
config = {"train_batch_size": 2, # 12
          "valid_batch_size": 2, # 12
          "weight_decay": 0.1,
          "shuffle_buffer": 1000,
          "learning_rate": 2e-4, # 5e-4
          "lr_scheduler_type": "cosine",
          "num_warmup_steps": 750, # 2000
          "gradient_accumulation_steps": 16, # 1
          "max_train_steps": 50000, # 150000
          "max_eval_steps": -1,
          "seq_length": 1024,
          "seed": 1,
          "save_checkpoint_steps": 50000} # 15000

args = Namespace(**config)

In [38]:
from torch.utils.tensorboard import SummaryWriter
import logging
import accelerate
import wandb

def setup_logging(project_name):
    logger = logging.getLogger(__name__)
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, handlers=[
        logging.FileHandler(f"log/debug_{accelerator.process_index}.log"),
        logging.StreamHandler()])
    if accelerator.is_main_process: # We only want to set up logging once
        wandb.init(project=project_name, config=args)
        run_name = wandb.run.name
        tb_writer = SummaryWriter()
        tb_writer.add_hparams(vars(args), {'0': 0})
        logger.setLevel(logging.INFO)
        datasets.utils.logging.set_verbosity_debug()
        transformers.utils.logging.set_verbosity_info()
    else:
        tb_writer = None
        run_name = ''
        logger.setLevel(logging.ERROR)
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()
    return logger, tb_writer, run_name

In [39]:
def log_metrics(step, metrics):
    logger.info(f"Step {step}: {metrics}")
    if accelerator.is_main_process:
        wandb.log(metrics)
        # [tb_writer.add_scalar(k, v, step) for k, v in metrics.items()]

In [44]:

#hide_output
from torch.utils.data.dataloader import DataLoader

def create_dataloaders(dataset_name):
    # TODO change it to sreaming=True and remove the shuffle and add validation dataset
    train_data = load_dataset("hakatiki/guttenberg-books-corpus",split="train",streaming=True,)
    # TODO might need this. Not sure
    # train_data = train_data.shuffle(buffer_size=args.shuffle_buffer,
    #                                 seed=args.seed)
    valid_data = load_dataset("hakatiki/guttenberg-books-corpus",split="train",streaming=True,)
    # valid_data = load_dataset(dataset_name+'-valid', split="validation",
    #                           streaming=True)
    
    train_dataset = ConstantLengthDataset(tokenizer, train_data,
                                          seq_length=args.seq_length)
    valid_dataset = ConstantLengthDataset(tokenizer, valid_data,
                                          seq_length=args.seq_length)
    
    train_dataloader=DataLoader(train_dataset, batch_size=args.train_batch_size)
    eval_dataloader=DataLoader(valid_dataset, batch_size=args.valid_batch_size)
    return train_dataloader, eval_dataloader

In [45]:
def get_grouped_params(model, no_decay=["bias", "LayerNorm.weight"]):
    params_with_wd, params_without_wd = [], []
    for n, p in model.named_parameters():
        if any(nd in n for nd in no_decay):
            params_without_wd.append(p)
        else:
            params_with_wd.append(p)
    return [{'params': params_with_wd, 'weight_decay': args.weight_decay},
            {'params': params_without_wd, 'weight_decay': 0.0}]

In [46]:

def evaluate():
    model.eval()
    losses = []
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(batch, labels=batch)
        loss = outputs.loss.repeat(args.valid_batch_size)
        losses.append(accelerator.gather(loss))
        if args.max_eval_steps > 0 and step >= args.max_eval_steps: break
    loss = torch.mean(torch.cat(losses))
    try:
        perplexity = torch.exp(loss)
    except OverflowError:
            perplexity = torch.tensor(float("inf"))
    return loss.item(), perplexity.item()
     

In [47]:
set_seed(args.seed)

# Accelerator
accelerator = Accelerator()
samples_per_step = accelerator.state.num_processes * args.train_batch_size

# Logging
logger, tb_writer, run_name = setup_logging(project_name.split("/")[1])
logger.info(accelerator.state)

# Load model and tokenizer
if accelerator.is_main_process:
    hf_repo = Repository("./", clone_from=project_name, revision=run_name)
model = AutoModelForCausalLM.from_pretrained("./", gradient_checkpointing=True)
tokenizer = AutoTokenizer.from_pretrained("./")

# Load dataset and dataloader
train_dataloader, eval_dataloader = create_dataloaders(dataset_name)

# Prepare the optimizer and learning rate scheduler
optimizer = AdamW(get_grouped_params(model), lr=args.learning_rate)
lr_scheduler = get_scheduler(name=args.lr_scheduler_type, optimizer=optimizer,
                             num_warmup_steps=args.num_warmup_steps,
                             num_training_steps=args.max_train_steps,)
def get_lr():
    return optimizer.param_groups[0]['lr']

# Prepare everything with our `accelerator` (order of args is not important)
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader)

# Train model
model.train()
completed_steps = 0
for step, batch in enumerate(train_dataloader, start=1):
    loss = model(batch, labels=batch).loss
    log_metrics(step, {'lr': get_lr(), 'samples': step*samples_per_step,
                       'steps': completed_steps, 'loss/train': loss.item()})
    loss = loss / args.gradient_accumulation_steps
    accelerator.backward(loss)
    if step % args.gradient_accumulation_steps == 0:
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        completed_steps += 1
    if step % args.save_checkpoint_steps == 0:
        logger.info('Evaluating and saving model checkpoint')
        eval_loss, perplexity = evaluate()
        log_metrics(step, {'loss/eval': eval_loss, 'perplexity': perplexity})
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        if accelerator.is_main_process:
            unwrapped_model.save_pretrained("./")
            hf_repo.push_to_hub(commit_message=f'step {step}')
        model.train()
    if completed_steps >= args.max_train_steps:
        break

# Evaluate and save the last checkpoint
logger.info('Evaluating and saving model after training')
eval_loss, perplexity = evaluate()
log_metrics(step, {'loss/eval': eval_loss, 'perplexity': perplexity})
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
if accelerator.is_main_process:
    unwrapped_model.save_pretrained("./")
    hf_repo.push_to_hub(commit_message=f'final model')

NameError: name 'Accelerator' is not defined