In [None]:
# %%capture
# !pip install accelerate peft bitsandbytes transformers trl

In [2]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig
from trl import SFTTrainer

In [5]:
# Model from Hugging Face hub (repository id of the base checkpoint)
base_model = "ibm-granite/granite-3.0-2b-instruct"

# Dataset repository id on the Hugging Face hub; passed to load_dataset when the splits are loaded
noot_dataset = "UWV/Leesplank_NL_wikipedia_simplifications_preprocessed"

# Fine-tuned model name (currently disabled; nothing in the visible notebook reads it)
# new_model = "llama-2-7b-chat-guanaco"

In [8]:
import timeit

# Measure the wall-clock time needed to load the three dataset splits.
start_time = timeit.default_timer()

# Load the splits one after another (train, test, val) and bind each to its own name.
dataset_train, dataset_test, dataset_val = (
    load_dataset(noot_dataset, split=split_name)
    for split_name in ("train", "test", "val")
)

dataset_loadtime = timeit.default_timer() - start_time

In [None]:
start_time = timeit.default_timer()
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, BitsAndBytesConfig
from peft import LoraConfig
from trl import SFTTrainer

# Reuse the model id from the configuration cell so the checkpoint name is
# defined in exactly one place.
model_checkpoint = base_model
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Quantization settings: 4-bit NF4 weights with double quantization and
# float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16 # if not set will throw a warning about slow speeds when training
)

# Load the quantized base model; device_map="auto" leaves device placement
# to the library.
model = AutoModelForCausalLM.from_pretrained(
    model_checkpoint,
    quantization_config=bnb_config,
    device_map="auto",
)

# Elapsed time for this cell (imports, tokenizer and model loading).
model_loadtime = timeit.default_timer() - start_time

In [None]:
from transformers import pipeline
import datasets

def pirateify(batch):
  """Rewrite a batch of responses in pirate style using the loaded model.

  Reads the module-level `tokenizer` and `model`.

  Args:
    batch: mapping with list-valued 'prompt' and 'response' columns
      (the shape `Dataset.map(..., batched=True)` passes in).

  Returns:
    dict with the original 'prompt' list and a 'response' list holding the
    generated pirate-style text, one entry per input row.
  """
  prompts = [
      f"make it sound like a pirate said this, do not include any preamble or explanation only piratify the following: {response}"
      for response in batch['response']
  ]
  # Tokenize the whole batch and place it on the same device as the model
  # instead of assuming a CUDA device.
  inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True).to(model.device)
  # Generate the pirate-like responses in batch
  outputs = model.generate(**inputs, max_new_tokens=256, do_sample=True, top_p=0.95, temperature=0.7)

  # A causal LM returns prompt + continuation. Drop the prompt columns so the
  # instruction text can never leak into the stored response.
  prompt_length = inputs["input_ids"].shape[1]
  generated = outputs[:, prompt_length:]
  decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)

  # Keep only the last paragraph of each completion (same heuristic as before:
  # anything ahead of the final blank line is treated as preamble).
  pirate_responses = [text.strip().split('\n\n')[-1] for text in decoded]

  # Drop the tensor references before clearing the allocator cache; copying
  # them to CPU first would only allocate host memory for values about to be
  # discarded.
  del inputs, outputs, generated
  if torch.cuda.is_available():
    torch.cuda.empty_cache()
  return {
      'prompt': batch['prompt'],  # The original prompts (already a batch)
      'response': pirate_responses  # The pirate responses, generated in batch
  }


def filter_long_examples(example, max_prompt_tokens=50, max_response_tokens=200):
    """Return True when an example is short enough to keep.

    Token counts come from the module-level `tokenizer`.

    Args:
        example: mapping with string 'prompt' and 'response' fields.
        max_prompt_tokens: largest allowed prompt length in tokens (inclusive).
        max_response_tokens: largest allowed response length in tokens (inclusive).

    Returns:
        bool: True if both lengths are within their limits.
    """
    # Check the prompt first so the response is only tokenized when needed.
    if len(tokenizer.tokenize(example['prompt'])) > max_prompt_tokens:
        return False
    return len(tokenizer.tokenize(example['response'])) <= max_response_tokens

# Apply the filter to both train and test splits.
# The splits are the `dataset_train` / `dataset_test` objects loaded earlier in
# the notebook. Every `select` is capped at the number of rows actually
# available, so a short split or an aggressive filter cannot trigger an
# out-of-range index error.
# NOTE(review): filter_long_examples and pirateify read the 'prompt' and
# 'response' columns — confirm the loaded dataset exposes those names.
train_filtered = dataset_train.select(range(min(6000, len(dataset_train)))).filter(filter_long_examples)
test_filtered = dataset_test.select(range(min(500, len(dataset_test)))).filter(filter_long_examples)

print(f"train_filtered: {len(train_filtered)} observations\ntest_filtered: {len(test_filtered)} observations")

# Generate pirate-style responses for at most 1500 training and 250 test rows.
pirate_train = train_filtered.select(range(min(1500, len(train_filtered)))).map(pirateify, batched=True, batch_size=64)
pirate_test = test_filtered.select(range(min(250, len(test_filtered)))).map(pirateify, batched=True, batch_size=64)

# Bundle the generated splits into a single DatasetDict
pirate_dataset = datasets.DatasetDict({
    'train': pirate_train,
    'test': pirate_test
})

In [None]:
# Preview the first rows of the generated training split as a pandas DataFrame
pirate_dataset['train'].to_pandas().head()

In [None]:
import torch
# Clear the GPU cache to release any unused memory before the next steps
torch.cuda.empty_cache()

# Model sanity check

In [None]:
start_time = timeit.default_timer()
# Use the same "<|user|>\n...\n<|assistant|>\n" role-tag layout as the training
# text built in formatting_prompts_func (the user tag was previously malformed).
input_text = "<|user|>\nWhat does 'inheritance' mean?\n<|assistant|>\n"

# Place the encoded prompt on the model's device so generate() does not mix devices.
inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

outputs = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

# Elapsed time for the sanity-check generation.
model_check_loadtime = timeit.default_timer() - start_time

# Training Setup

In [None]:
start_time = timeit.default_timer()
def formatting_prompts_func(example):
    """Build one training string per row of a batched example.

    Args:
        example: mapping with list-valued 'prompt' and 'response' columns.

    Returns:
        list[str]: system/user/assistant formatted texts, in row order.
    """
    # NOTE(review): the role tags and the end-of-text marker are plain text
    # here — confirm they match what the loaded tokenizer/model expects.
    return [
        f"<|system|>\nYou are a helpful assistant\n<|user|>\n{prompt}\n<|assistant|>\n{example['response'][idx]}<|endoftext|>"
        for idx, prompt in enumerate(example['prompt'])
    ]

# Marker that precedes the assistant's answer in the text produced by
# formatting_prompts_func; the collator is given its token ids.
response_template = "\n<|assistant|>\n"

from trl import DataCollatorForCompletionOnlyLM

# Encode the marker without special tokens and drop the first two token ids.
# NOTE(review): presumably the slice skips tokens whose encoding depends on the
# preceding context — confirm that [2:] is correct for this tokenizer.
response_template_ids = tokenizer.encode(response_template, add_special_tokens=False)[2:]
collator = DataCollatorForCompletionOnlyLM(response_template_ids, tokenizer=tokenizer)


# Apply qLoRA
# LoRA adapter settings; the trainer receives them through `peft_config`.
qlora_config = LoraConfig(
    r=16,  # The rank of the Low-Rank Adaptation
    lora_alpha=32,  # Scaling factor for the adapted layers
    target_modules=["q_proj", "v_proj"],  # Layer names to apply LoRA to
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",  # Declare the task so PEFT builds the causal-LM adapter wrapper
)

# Training hyperparameters; passed to the SFTTrainer as `args`
training_args = TrainingArguments(
    output_dir="./results",  # where training outputs are written
    learning_rate=2e-4,
    per_device_train_batch_size=6,
    per_device_eval_batch_size=6,
    num_train_epochs=3,
    logging_steps=100,  # log every 100 steps
    fp16=True,  # float16 mixed precision, in line with the float16 compute dtype used when loading the model
    report_to="none"  # do not report to any external logging integration
)

# Maximum sequence length handed to the trainer.
# NOTE(review): the formatted text adds a system prompt and role tags on top of
# the prompt and response — confirm 250 tokens leaves enough room.
max_seq_length = 250

# Assemble the supervised fine-tuning trainer from the pieces built above.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=pirate_dataset['train'],
    eval_dataset=pirate_dataset['test'],
    tokenizer=tokenizer,
    peft_config = qlora_config,  # LoRA adapter settings
    formatting_func=formatting_prompts_func,  # turns a batch of rows into training strings
    data_collator=collator,  # collator built from the assistant response template
    max_seq_length=max_seq_length,
)

# Elapsed time for the whole training-setup cell.
training_setup_loadtime = timeit.default_timer() - start_time