In [77]:
from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig, 
    AutoModelForCausalLM,
    DataCollatorWithPadding,
    TrainingArguments,
    GenerationConfig,
    Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
from torch.nn import CrossEntropyLoss
import torch.nn as nn
import numpy as np

device = 'mps'

In [78]:
def compute_loss(inputs, logits):
  # Drop the first token in inputs
  shifted_labels = inputs[:, 1:]
  # Drop the last token of the output
  shifted_logits = logits[:, :-1, :]
  loss_fn = CrossEntropyLoss(reduction="mean")
  # Reshape to concatenate batches. The overall loss is calculated
  loss = loss_fn(shifted_logits.reshape(-1, shifted_logits.size(-1)), 
                 shifted_labels.reshape(-1)
  )
  return loss


def preprocess_data(dataset_path, min_length, tokenizer) -> str:
    """Prepare dataset for training from the jsonl file.

    Args:
        dataset_path (Path): Extracted text from the book
        min_length (int): Filter pages without text
        tokenizer (PreTrainedTokenizer): HuggingFace tokenizer

    Yields:
        str: text of the pages
    """
    with open(dataset_path, 'r') as f:
        grouped_text = f.read()
        # End of paragraphs defined by ".\n is transformed into EOS token"
        grouped_text = grouped_text.replace('\n\n\"', tokenizer.eos_token + "\"")
        return grouped_text.replace('\n', ' ')
    
def tokenize(element, tokenizer, context_length: int) -> str:
    inputs = tokenizer(element['text'], truncation=True, return_overflowing_tokens=True, 
                       return_length=True, max_length=context_length)
    inputs_batch = []
    for length, input_ids in zip(inputs['length'], inputs['input_ids']):
        if length == context_length: # We drop the last input_ids that are shorter than max_length
            inputs_batch.append(input_ids)
    return {"input_ids": inputs_batch}

def prepare_dataset(dataset_path, min_length, context_length, 
                    test_size, shuffle, hf_repo=None) -> None:
    """Prepare dataset for training and push it to the hub.
    """
    tokenizer =  AutoTokenizer.from_pretrained("distilgpt2")
    tokenizer.pad_token = tokenizer.eos_token
    print(f'Start preparing dataset from {dataset_path}')
    text = preprocess_data(dataset_path=dataset_path, min_length=min_length, tokenizer=tokenizer)
    dataset = Dataset.from_dict({'text': [text]})
    tokenized_dataset = dataset.map(tokenize, batched=True, fn_kwargs={'tokenizer': tokenizer, 'context_length': context_length},
                                         remove_columns=dataset.column_names)
    print(f'The tokenized dataset is composed of {tokenized_dataset.num_rows} elements, each one composed of {context_length} tokens.')
    tokenized_dataset_dict = tokenized_dataset.train_test_split(test_size=test_size, shuffle=shuffle)
    print(f'The training dataset is composed of {tokenized_dataset_dict["train"].num_rows} elements, the test dataset is composed of {tokenized_dataset_dict["test"].num_rows} elements.')
    print(f'Preparing dataset finished.')
    return tokenized_dataset_dict


In [79]:
context_length = 2048
data_filename = "./data/all_lyrics.txt"
min_length = 1
test_size = 0.2
shuffle = True
tokenized_dataset_dict = prepare_dataset(data_filename, min_length, context_length, test_size, shuffle)

Start preparing dataset from ./data/all_lyrics.txt


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

The tokenized dataset is composed of 98 elements, each one composed of 2048 tokens.
The training dataset is composed of 78 elements, the test dataset is composed of 20 elements.
Preparing dataset finished.


In [80]:
tokenized_dataset_dict

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 78
    })
    test: Dataset({
        features: ['input_ids'],
        num_rows: 20
    })
})

In [81]:
model_checkpoint = 'bigscience/bloom-3b'
model = AutoModelForCausalLM.from_pretrained(model_checkpoint, device_map='auto')
# pfet 
for param in model.parameters():
    param.requires_grad = False  # freeze the model - train adapters later
    if param.ndim == 1:
    # cast the small parameters (e.g. layernorm) to fp32 for stability
        param.data = param.data.to(torch.float32)

class CastOutputToFloat(nn.Sequential):
    def forward(self, x): 
        return super().forward(x).to(torch.float32)
model.lm_head = CastOutputToFloat(model.lm_head)
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM" # set this for CLM or Seq2Seq
)

model = get_peft_model(model, lora_config)


Downloading (…)lve/main/config.json:   0%|          | 0.00/693 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/6.01G [00:00<?, ?B/s]

In [82]:
def print_trainable_parameters(model):
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    return f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
print_trainable_parameters(model)


'trainable params: 4915200 || all params: 3007472640 || trainable%: 0.1634329082375293'

In [86]:
dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 78
    })
    test: Dataset({
        features: ['input_ids'],
        num_rows: 20
    })
})

In [85]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset


dataset = tokenized_dataset_dict

tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-3b")
tokenizer.pad_token = tokenizer.eos_token
trainer = Trainer(
    model=model,
    train_dataset=dataset['train'],
    args=TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=1,
        warmup_steps=100,
        weight_decay=0.1,
        num_train_epochs=3,
        learning_rate=2e-4, 
        fp16=False,
        logging_steps=1, 
        output_dir="outputs"
    ),
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/222 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

NotImplementedError: Cannot copy out of meta tensor; no data!

In [84]:
model.config.use_cache = False  # silence warnings
trainer.train()

  0%|          | 0/468 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [67]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftConfig, PeftModel

prompt = "Help me understand why \nThere are not stars in the sky\n"

inputs = tokenizer(prompt, return_tensors="pt").to(device)
tokens = model.generate(
    **inputs,
    max_new_tokens=100,
    temperature=1,
    eos_token_id=tokenizer.eos_token_id,
    early_stopping=True
)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [68]:
tokenizer.batch_decode(tokens)

['Help me understand why \nThere are not stars in the sky\nI am a star\nI am a star\nI am a star\nI am a star\nI am a star\nI am a star\nI am a star\nI am a star\nI am a star\nI am a star\nI am a star\nI am a star\nI am a star\nI am a star\nI am a star\nI am a star\nI am a star\nI am a star\nI am a star\nI am a star\n']