<a href="https://colab.research.google.com/github/harikris001/FineTuning-LLMs/blob/main/Finetuning_Tinyllama_1B_LLM(openai_gsm8k).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# INSTALLING DEPENDENCIES

In [None]:
# Install or upgrade (-U) bitsandbytes quietly (-q); it is needed for the 4-bit BitsAndBytesConfig model load in this notebook.
!pip install -U bitsandbytes -q

In [None]:
# Install wandb quietly (-q); the notebook imports it before training to switch its reporting off.
!pip install wandb -q

# IMPORTING PACKAGES

In [None]:
import torch

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, TaskType

# MODEL IMPORTING AND CONFIGURATION

The model is available on this [Hugging Face Card](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0).
We use a 4-bit quantized model to reduce memory usage and speed up training.

Techniques used for optimization:
- 4-bit quantized model
- LoRA - Low-Rank Adaptation.
- QLoRA - Quantized LoRA.


In [None]:
# Hugging Face Hub id of the checkpoint that gets fine-tuned.
model_name = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'

# 4-bit loading settings: NF4 quantization with double quantization enabled
# and float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
    load_in_4bit=True,
)

# Quantized base model; device_map="auto" leaves device placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
    trust_remote_code=True,
)

# Tokenizer that belongs to the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

In [None]:
# LoRA adapter settings: rank-4 adapters (alpha 16, dropout 0.05, no bias
# training) on the q/k/v/o projection modules, for a causal-LM task.
Lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    r=4,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the loaded base model with adapters built from the config above.
lora_model = get_peft_model(model, Lora_config)

In [None]:
# Fine-tuning set: the first 200 rows of the train split of GSM8K ('main' configuration).
data = load_dataset('openai/gsm8k', 'main', split='train[:200]')

The expected prompt format and tokenization differ for each model. Refer to the model's Hugging Face card to understand the format.

In [None]:
def tokenize(batch):
    """Turn a batch of question/answer rows into fixed-length training features.

    Each pair is rendered into the instruction/response prompt template and
    tokenized to exactly 256 tokens (padded or truncated).

    Args:
        batch: Mapping with parallel ``'question'`` and ``'answer'`` lists.

    Returns:
        The tokenizer output with an added ``'labels'`` entry: a copy of
        ``input_ids`` in which every padded position is replaced by -100.
    """
    texts = [
        f"### Instruction:\n{question}\n ### Response:\n{answer}"
        for question, answer in zip(batch['question'], batch['answer'])
    ]
    inputs = tokenizer(texts, max_length=256, return_tensors="pt", padding='max_length', truncation=True)

    labels = inputs['input_ids'].clone()
    # Every row is padded to max_length, so without masking the loss (and any
    # perplexity derived from it) would be dominated by padding predictions.
    # -100 is the label value the causal-LM loss ignores. The attention mask
    # is used instead of the pad token id because the pad token may share its
    # id with a real token.
    labels[inputs['attention_mask'] == 0] = -100
    inputs['labels'] = labels
    return inputs

In [None]:
# Apply tokenize in batches of 100 rows and drop the original columns so only the tokenizer outputs and labels remain.
tokenised_data = data.map(tokenize, batched=True, batch_size=100, remove_columns=data.column_names)

In [None]:
training_args = TrainingArguments(
    # Checkpoints go to this directory, one per epoch.
    output_dir="./tinyLlama-results",
    save_strategy='epoch',
    # Optimisation schedule: 4 samples per device per step, gradients
    # accumulated over 4 steps, 50 epochs, fp16 mixed precision.
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=50,
    learning_rate=1e-3,
    fp16=True,
    # Log every 25 steps.
    logging_steps=25,
    # NOTE(review): this is Python None, not the string "none" — confirm
    # against the TrainingArguments docs whether it really turns reporters off.
    report_to=None,
    # Keep every dataset column and name 'labels' as the label field.
    remove_unused_columns=False,
    label_names=['labels'],
)

In [None]:
# Bundle the LoRA-wrapped model, the tokenized training set, the tokenizer
# and the training arguments into a Trainer.
trainer = Trainer(
    args=training_args,
    model=lora_model,
    processing_class=tokenizer,
    train_dataset=tokenised_data,
)

To disable automatic reporting to Wandb, run the following script. You can skip this if you want Wandb reports.

In [None]:
import wandb
# Initialise wandb in 'disabled' mode before training starts, so the run is not reported.
wandb.init(mode='disabled') # Optional

In [None]:
# Run fine-tuning with the Trainer configured in the earlier cells.
trainer.train()

In [None]:
# Save the fine-tuned PEFT model and the tokenizer side by side in one directory
# so both can be reloaded from the same path.
lora_model.save_pretrained('./tinyLlama-tuned-math')
tokenizer.save_pretrained('./tinyLlama-tuned-math')

In [None]:
from torch.utils.data import DataLoader
from transformers import default_data_collator


from peft import PeftModelForCausalLM
import numpy as np

In [None]:
# Base checkpoint id and the local directory the fine-tuned adapter was saved to.
model_name = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'
adapter_path = './tinyLlama-tuned-math'

In [None]:
# Load the base checkpoint again, reusing the 4-bit quantization config.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
    trust_remote_code=True,
)

# Attach the saved fine-tuned adapter to the base model for evaluation.
tuned_model = PeftModelForCausalLM.from_pretrained(
    base_model,
    adapter_path,
    trust_remote_code=True,
    device_map="auto",
)
# Put the combined model into evaluation mode.
tuned_model = tuned_model.eval()

In [None]:
# Evaluation slice: rows 200-399 of the GSM8K train split.
eval_data = load_dataset('openai/gsm8k', 'main', split='train[200:400]')
# Drop the evaluation set's own raw columns; this cell no longer depends on
# the training dataset variable being defined.
eval_ds = eval_data.map(tokenize, batched=True, remove_columns=eval_data.column_names)
# Return torch tensors when the dataset is indexed.
eval_ds = eval_ds.with_format('torch')

In [None]:
# Batch the tokenized evaluation set 8 examples at a time, using the default collator to build each batch.
eval_dataloader = DataLoader(eval_ds, batch_size=8, collate_fn=default_data_collator)

In [None]:
@torch.no_grad()
def compute_perplexity(model, dataloader=None):
    """Return the perplexity of ``model`` over a dataloader.

    Perplexity is computed as ``exp`` of the mean of the per-batch losses
    reported by the model.

    Args:
        model: Module whose forward pass accepts a batch as keyword arguments
            and returns an object with a scalar ``loss`` attribute.
        dataloader: Iterable of dict batches mapping names to tensors.
            Defaults to the notebook-level ``eval_dataloader``.

    Returns:
        The perplexity as a NumPy float.

    Raises:
        ValueError: If the dataloader yields no batches.
    """
    if dataloader is None:
        dataloader = eval_dataloader

    # Send inputs to wherever the model's first parameter lives instead of
    # assuming a CUDA device.
    device = next(model.parameters()).device

    losses = [
        model(**{key: tensor.to(device) for key, tensor in batch.items()}).loss.item()
        for batch in dataloader
    ]
    if not losses:
        # np.mean([]) would silently give NaN; fail loudly instead.
        raise ValueError("Cannot compute perplexity: the dataloader yielded no batches.")

    return np.exp(np.mean(losses))

**Perplexity is a measure of how confused the model is when predicting the next token. It indicates whether the model has learned from the training.**


_An untrained model has an average perplexity of 200 - 400_

In [None]:
# Report the perplexity of the fine-tuned model on the evaluation dataloader.
print(f"Perplexity of Model: {compute_perplexity(tuned_model)}")

In [None]:
def generate_response(model, prompt, max_new_tokens=256):
    """Generate the model's answer to a single instruction.

    Args:
        model: Causal language model exposing ``generate``.
        prompt: The instruction/question text to answer.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded output sequence (prompt plus continuation) with special
        tokens removed.
    """
    # Use exactly the template that tokenize() builds for training; different
    # whitespace around the markers would give the model a prompt it was not
    # fine-tuned on.
    text = f"### Instruction:\n{prompt}\n ### Response:\n"

    # Follow the model's own device rather than assuming CUDA.
    device = next(model.parameters()).device
    inputs = tokenizer(text, return_tensors='pt').to(device)

    with torch.no_grad():
        # Unpack the whole encoding so the attention mask is passed along
        # with the input ids.
        output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

In [None]:
# Display the first raw (untokenized) evaluation example.
eval_data[0]

In [None]:
# Print the fine-tuned model's generated answer for the first evaluation question.
print(generate_response(tuned_model, eval_data[0]['question']))