In [None]:
!pip install -U transformers datasets accelerate peft bitsandbytes trl
from transformers import AutoTokenizer
from datasets import load_dataset
import os
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# Tokenizer of the chat-tuned checkpoint; format_prompt uses its chat template
# to render conversations into plain text.
chat_template_checkpoint = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
template_tokenizer = AutoTokenizer.from_pretrained(chat_template_checkpoint)

In [None]:
def format_prompt(example):
  """Render one conversation with the <|user|> chat template TinyLlama uses.

  Args:
    example: A mapping whose "messages" entry is handed unchanged to
      `template_tokenizer.apply_chat_template`.

  Returns:
    A dict with the single key "text" holding the rendered prompt
    (the template is applied with `tokenize=False`).
  """
  rendered = template_tokenizer.apply_chat_template(
    example["messages"], tokenize=False
  )
  return {"text": rendered}

In [None]:
# Take a reproducible 3,000-conversation sample (shuffle seed 42) of the
# "test_sft" split, then add a "text" column rendered by format_prompt.
ultrachat_split = load_dataset("HuggingFaceH4/ultrachat_200k", split="test_sft")
ultrachat_sample = ultrachat_split.shuffle(seed=42).select(range(3_000))
dataset = ultrachat_sample.map(format_prompt)

In [None]:
# Show one formatted example. Indexing the row first avoids materialising the
# whole "text" column just to read a single entry; the printed string is the same.
print(dataset[2576]["text"])

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

In [None]:
# Checkpoint identifier passed to from_pretrained for both the model and its
# tokenizer in the cells below.
model_name = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"

In [None]:
# 4-bit quantization settings handed to from_pretrained when loading the model.
bnb_config = BitsAndBytesConfig(
  load_in_4bit=True,  # Load the weights in 4-bit precision
  bnb_4bit_use_double_quant=True,  # Nested (double) quantization
  bnb_4bit_quant_type="nf4",  # Quantization data type
  bnb_4bit_compute_dtype="float16",  # Dtype used for computation
)

In [None]:
# Load the checkpoint named by model_name with the quantization settings in
# bnb_config; device placement is delegated to device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
  model_name,
  # Leave this out for regular SFT
  quantization_config=bnb_config,
  device_map="auto",
)

In [None]:
# Adjust two settings on the loaded model's config object before training.
model_config = model.config
model_config.use_cache = False  # presumably because generation caching is not needed while training
model_config.pretraining_tp = 1

In [None]:
# Tokenizer belonging to the checkpoint that is being fine-tuned.
# NOTE(review): trust_remote_code=True is passed here but not for the earlier
# template_tokenizer load -- confirm it is actually required.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Set an explicit padding token and pad on the left.
tokenizer.pad_token = "<PAD>"
tokenizer.padding_side = "left"
# Custom Jinja chat template: for every message it emits the BOS token, wraps
# user turns in [INST] ... [/INST], and appends the EOS token to assistant turns.
# NOTE(review): this format differs from the <|user|> template that
# format_prompt applies (via template_tokenizer) to build the "text" column --
# confirm which format is intended for the fine-tuned model.
# NOTE(review): the line breaks inside the triple-quoted string are part of
# the template text; check the rendered output below for unwanted newlines.
tokenizer.chat_template = """
{% for message in messages %}
{{ bos_token }}
{% if message['role'] == 'user' %}
{{ '[INST] ' + message['content'] + ' [/INST]' }}
{% elif message['role'] == 'assistant' %}
{{ message['content'] + eos_token }}
{% endif %}
{% endfor %}
"""

# Test the template
chat = [
    {"role": "user", "content": "Hello!"},
    {"role": "assistant", "content": "Hi! How can I help?"},
]

# Last expression of the cell: shows the rendered prompt as text (tokenize=False).
tokenizer.apply_chat_template(chat, tokenize=False)

In [None]:
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

In [None]:
# LoRA adapter settings for causal language modelling.
# NOTE(review): "q_proj" is not in target_modules although k_proj, v_proj and
# o_proj are -- confirm the omission is intentional.
peft_config = LoraConfig(
  task_type="CAUSAL_LM",
  r=64,  # Rank
  lora_alpha=32,  # LoRA scaling
  lora_dropout=0.1,  # Dropout for the LoRA layers
  bias="none",
  # Layers to target
  target_modules=[
    "k_proj", "gate_proj", "v_proj", "up_proj", "o_proj", "down_proj",
  ],
)

In [None]:
# Prepare the quantized model for k-bit training and wrap it with the LoRA
# configuration in a single step.
# NOTE: `model` is rebound here, so re-running this cell operates on the
# already-wrapped model; restart from the model-loading cell instead.
model = get_peft_model(prepare_model_for_kbit_training(model), peft_config)

In [None]:
from transformers import TrainingArguments

In [None]:
# Directory handed to TrainingArguments as its output_dir.
output_dir = "./results"

In [None]:
# Hyperparameters for the fine-tuning run, grouped by purpose.
training_arguments = TrainingArguments(
  output_dir=output_dir,
  # Duration and logging
  num_train_epochs=1,
  logging_steps=10,
  # Batching: 2 samples per device, gradients accumulated over 4 steps
  per_device_train_batch_size=2,
  gradient_accumulation_steps=4,
  # Optimizer and learning-rate schedule
  optim="paged_adamw_32bit",
  learning_rate=2e-4,
  lr_scheduler_type="cosine",
  # Precision and memory options
  fp16=True,
  gradient_checkpointing=True,
)

In [None]:
from trl import SFTTrainer

In [None]:
# Supervised fine-tuning on the pre-rendered "text" column, with sequences
# limited to 512 tokens.
# `model` has already been wrapped by get_peft_model(model, peft_config) in an
# earlier cell, so the LoRA config is not passed again here: for a model that
# is already a PeftModel the extra argument is redundant, and newer TRL
# releases appear to reject that combination (verify against the installed
# version).
trainer = SFTTrainer(
  model=model,
  args=training_arguments,
  train_dataset=dataset,
  dataset_text_field="text",
  max_seq_length=512,
  tokenizer=tokenizer,
)

In [None]:
# Run the fine-tuning loop; the call's return value is shown as the cell output.
trainer.train()

In [None]:
# Write the trained model held by the trainer to a local directory.
adapter_dir = "TinyLlama-1.1B-qlora"
trainer.model.save_pretrained(adapter_dir)