# Tuning Gemma 2B with ORPO and QLoRA on your laptop

## Overview

This tutorial runs on a laptop with an NVIDIA GPU.
You should install CUDA 12.3, PyCharm or VS Code, and PyTorch 2.2.1 beforehand.

Dataset: argilla/dpo-mix-7k

## Setup
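### Download the gemma-2b model from Hugging Face
[https://huggingface.co/google/gemma-2b/tree/main](https://huggingface.co/google/gemma-2b/tree/main)

Note: I don't like the model cache mechanism of Hugging Face, so download the model files manually into a local folder (the code below loads them from `c:/ai/models/gemma`).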

### Configure your wandb key

To monitor training with wandb, you must provide a wandb API key. You can get an API key from [https://wandb.ai](https://wandb.ai).

### Configure your Hugging Face access token

If you want to upload your tuned LLM to Hugging Face, you can create an access token (with write permission) at [https://huggingface.co/settings/tokens](https://huggingface.co/settings/tokens).

### Set environment variables

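Set the environment variables `wandb` (your wandb API key) and `huggingface` (your Hugging Face access token), for example in a `.env` file. The code below loads them with `load_dotenv` and reads them with `os.getenv`.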

In [None]:
# Make sure that PyTorch can use CUDA before doing anything else.
import torch

# Cell output: (whether CUDA is available, the CUDA version reported by torch.version)
torch.cuda.is_available(), torch.version.cuda

In [None]:
# Load the train split of the argilla/dpo-mix-7k dataset and look at one record.

from datasets import load_dataset


# Alternative datasets (commented out, kept for reference):
#dataset = load_dataset("argilla/distilabel-capybara-dpo-7k-binarized",split="train")
#dataset2 = load_dataset("allenai/ultrafeedback_binarized_cleaned",split="train")

dataset = load_dataset("argilla/dpo-mix-7k",split="train")


# Content of the first message in the first record's "chosen" conversation
dataset[0]["chosen"][0]["content"]

# Optional (commented out): export the dataset to a CSV file for inspection
#dataset.to_csv("a.csv")

In [None]:
# Convert the dataset into the prompt / chosen / rejected format

from datasets import load_dataset
from transformers import AutoTokenizer

def chatml_format(example):
    """Convert one preference record into ``prompt`` / ``chosen`` / ``rejected`` strings.

    ``example['chosen']`` and ``example['rejected']`` are each a list of
    ``{"role": ..., "content": ...}`` messages. Every message of ``chosen``
    before the final one is rendered as the prompt with the module-level
    ``tokenizer``'s chat template; the final message of each list is used as
    the completion.

    The last message is used instead of a fixed index 1 so that multi-turn
    records are handled: with a fixed index, any record holding more than one
    exchange would use the *first* reply of both lists, which is presumably
    the same text in ``chosen`` and ``rejected`` (verify against the dataset
    card). For a two-message record the result matches the fixed-index form,
    provided the first message has the ``user`` role.

    NOTE(review): the messages are passed to the chat template with the roles
    stored in the dataset; this assumes the roles are ones the template
    accepts — confirm for the dataset in use.

    Args:
        example: A dataset row with ``chosen`` and ``rejected`` message lists,
            each ending with the reply to compare.

    Returns:
        A dict with the keys ``prompt``, ``chosen`` and ``rejected``. Both
        completions end with ``tokenizer.eos_token``.
    """
    chosen_messages = example['chosen']
    rejected_messages = example['rejected']

    # Format instruction: the whole conversation up to (not including) the final reply
    prompt = tokenizer.apply_chat_template(
        chosen_messages[:-1], tokenize=False, add_generation_prompt=True
    )
    # Format chosen answer
    chosen = chosen_messages[-1]['content'] + tokenizer.eos_token
    # Format rejected answer
    rejected = rejected_messages[-1]['content'] + tokenizer.eos_token

    return {
        "prompt": prompt,
        "chosen": chosen,
        "rejected": rejected,
    }

# Tokenizer, loaded from the locally downloaded Gemma files.
# The end-of-sequence token doubles as the padding token, and padding goes on the left.
model_name = "c:/ai/models/gemma"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

# Raw preference data (train split)
dataset = load_dataset("argilla/dpo-mix-7k", split="train")

# Alternative source with a quality filter (disabled):
#dataset = load_dataset("argilla/distilabel-intel-orca-dpo-pairs", split="train")
#dataset = dataset.filter(
#   lambda r:
#       r["status"] != "tie" and
#       r["chosen_score"] >= 5
#       and not r["in_gsm8k_train"]
#)

# Remember the raw column names so map() can drop them once each row is reformatted
original_columns = dataset.column_names

# Rewrite every row as prompt / chosen / rejected
dataset = dataset.map(chatml_format, remove_columns=original_columns)

# Show one formatted sample as the cell output
dataset[1]

In [None]:
# Display the formatted dataset object as the cell output
dataset

In [None]:
#Using ORPOTrainer and QLora to tune Gemma 2B
import os
import gc
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, PeftModel
import wandb
from dotenv import load_dotenv, find_dotenv
from trl import ORPOTrainer
from trl import ORPOConfig

# Read the .env file located by find_dotenv(), pick up both tokens,
# and sign in to Weights & Biases with the wandb key.
env = load_dotenv(find_dotenv())
wb_token = os.getenv("wandb")
hf_token = os.getenv("huggingface")
wandb.login(key=wb_token)

# Folder holding the locally downloaded Gemma model files
local_model_path = "c:/ai/models/gemma"

# LoRA adapter settings for causal-LM fine-tuning; adapters are attached to
# the projection modules listed in target_modules.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        "k_proj",
        "gate_proj",
        "v_proj",
        "up_proj",
        "q_proj",
        "o_proj",
        "down_proj",
    ],
)

# Base model to fine-tune, loaded from the local folder as 4-bit NF4 with
# double quantization and bfloat16 compute (the "Q" in QLoRA).
model = AutoModelForCausalLM.from_pretrained(
    local_model_path,
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    ),
    torch_dtype=torch.bfloat16,
    # torch_dtype="auto",
    trust_remote_code=True,
)
# Turn the generation cache off for training
model.config.use_cache = False

# Name used for the output directory and, later, for the merged model
new_model = "lion-gemma-2b"

torch.cuda.empty_cache()


# ORPO training configuration
training_args = ORPOConfig(
    output_dir=new_model,
    # Schedule: a single epoch, cosine decay after 80 warm-up steps
    num_train_epochs=1,
    # max_steps=400,
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    warmup_steps=80,
    # Memory: one sample per step, gradients accumulated over 16 steps,
    # gradient checkpointing, 8-bit AdamW and bf16
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": True},
    optim="adamw_bnb_8bit",
    bf16=True,
    # Sequence length limits
    max_prompt_length=256,
    max_length=1024,
    # Data handling, logging and checkpointing
    remove_unused_columns=False,
    logging_steps=1,
    save_strategy="no",
    report_to="wandb",
)

# Build the ORPO trainer from the quantized model, the formatted dataset,
# the tokenizer and the LoRA settings.
orpo_trainer = ORPOTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    peft_config=peft_config,
    args=training_args,
)

# Run the ORPO fine-tuning
orpo_trainer.train()

In [None]:
# Save the tuned checkpoint: the trainer's model and the tokenizer go into one local folder

final_checkpoint = "gemma_final_checkpoint"
orpo_trainer.model.save_pretrained(final_checkpoint)
tokenizer.save_pretrained(final_checkpoint)


In [None]:
# Merge the saved checkpoint into the original LLM
env = load_dotenv(find_dotenv(), override=True)
hf_token = os.getenv("huggingface")

# Drop the training objects and release memory before loading the base model again
del orpo_trainer, model
gc.collect()
torch.cuda.empty_cache()

# Reload the tokenizer and the base model in FP16 (instead of NF4)
tokenizer = AutoTokenizer.from_pretrained(local_model_path)
base_model = AutoModelForCausalLM.from_pretrained(
    local_model_path,
    torch_dtype=torch.float16,
    return_dict=True,
)

# Attach the adapter from the checkpoint folder, then merge it into the base model
model = PeftModel.from_pretrained(base_model, final_checkpoint).merge_and_unload()

# Write the merged model and its tokenizer to the new_model folder
model.save_pretrained(new_model)
tokenizer.save_pretrained(new_model)


In [None]:
# Push the merged model and tokenizer to the HF Hub
# NOTE(review): the HTTPS proxy below is hard-coded to a local address (port 7890) and
# overwrites any HTTPS_PROXY already set; adjust or remove it for your own network setup.
os.environ["HTTPS_PROXY"] ="http://127.0.0.1:7890"
model.push_to_hub(new_model, use_temp_dir=False, token=hf_token)
tokenizer.push_to_hub(new_model, use_temp_dir=False, token=hf_token)

model-00001-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Try out the merged model with a single user question
tokenizer = AutoTokenizer.from_pretrained(new_model)
message = [{"role": "user", "content": "What is a Large Language Model?"}]
prompt = tokenizer.apply_chat_template(
    message,
    tokenize=False,
    add_generation_prompt=True,
)

# Text-generation pipeline built from the saved model folder
pipeline = transformers.pipeline(
    "text-generation",
    tokenizer=tokenizer,
    model=new_model,
)

# Sample one completion at low temperature and print it
sequences = pipeline(
    prompt,
    max_length=200,
    num_return_sequences=1,
    do_sample=True,
    top_p=0.9,
    temperature=0.1,
)
print(sequences[0]['generated_text'])

In [None]:
# Reload the merged model and tokenizer from the local new_model folder and upload both to the Hub.
env = load_dotenv(find_dotenv(), override=True)

hf_token = os.getenv("huggingface")


model = AutoModelForCausalLM.from_pretrained(new_model)
tokenizer = AutoTokenizer.from_pretrained(new_model)
# The first argument of push_to_hub is the repository name (a string), so pass
# new_model here rather than the model object itself.
model.push_to_hub(new_model, token=hf_token)
tokenizer.push_to_hub(new_model, token=hf_token)