In [1]:
import torch
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

In [2]:
!pip install bitsandbytes
!pip install trl



In [3]:
from trl import DPOTrainer, DPOConfig
from datasets import load_from_disk, Dataset, load_dataset

# Get base model

In [4]:
# Authenticate with the Hugging Face Hub before downloading the base model.
# NOTE(review): new_session=False is presumably meant to reuse an already
# stored token instead of prompting again -- confirm against huggingface_hub docs.
from huggingface_hub import login
login(new_session=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
# Pick the best available accelerator: CUDA first, then Apple MPS, else CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
print(device)

cuda


In [6]:
def get_model(model_name='google/gemma-2b-it'):
  """Load the trainable policy model, a frozen reference model and the tokenizer.

  Both models are loaded from `model_name` with 4-bit NF4 quantization and
  `device_map="auto"`. The policy model is wrapped with LoRA adapters on the
  attention projections; the reference model is left as the plain, frozen
  base model in eval mode.

  Args:
    model_name: Hugging Face Hub id (or local path) of the causal LM to load.
      Defaults to the model this notebook was written for.

  Returns:
    A tuple `(model, ref_model, tokenizer)`.

  Note:
    The module-level `device` is only used in the log message; actual
    placement is decided by `device_map="auto"`.
  """
  bnb_config = BitsAndBytesConfig(load_in_4bit=True,
                                  bnb_4bit_quant_type='nf4',
                                  bnb_4bit_compute_dtype=torch.float16,
                                  llm_int8_enable_fp32_cpu_offload=True)
  print(f'loading pretrained model on {device}')
  model = AutoModelForCausalLM.from_pretrained(model_name,
                                               quantization_config=bnb_config,
                                               device_map="auto")
  ref_model = AutoModelForCausalLM.from_pretrained(model_name,
                                                   quantization_config=bnb_config,
                                                   device_map="auto")
  print('loading tokenizer')
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  # Only fall back to EOS when the tokenizer has no pad token of its own, so
  # an existing dedicated pad token is not overwritten.
  if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

  # Setup LoRA
  peft_config = LoraConfig(
      r=16, # lower r to reduce memory
      lora_alpha=32,
      lora_dropout=0.05,
      # Attention projections only; add 'gate_proj', 'up_proj', 'down_proj'
      # to also adapt the MLP blocks (more trainable parameters).
      target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj'],
      bias='none',
      task_type='CAUSAL_LM')

  print('get peft model')
  model = get_peft_model(model, peft_config)
  model.gradient_checkpointing_enable()  # enable checkpointing to reduce memory cost
  # The base weights are frozen, so make the embedding output require grad;
  # otherwise checkpointed blocks may not propagate gradients to the adapters.
  model.enable_input_require_grads()
  model.config.use_cache = False  # disable KV cache to save memory
  model.print_trainable_parameters()

  print('ref model parameters')
  # The reference model only provides baseline log-probs, so it is kept as the
  # plain frozen base model: no LoRA adapters are attached to it.
  ref_model.requires_grad_(False)
  ref_model.eval()
  ref_model.config.use_cache = False # disable KV cache to save memory
  ref_trainable = sum(p.numel() for p in ref_model.parameters() if p.requires_grad)
  print(f'trainable params: {ref_trainable:,}')
  return model, ref_model, tokenizer

In [7]:
!pip install -U bitsandbytes



In [8]:
# Build the LoRA-wrapped policy model, the frozen reference model and the tokenizer.
model, ref_model, tokenizer = get_model()

loading pretrained model on cuda


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

loading tokenizer
get peft model
trainable params: 3,686,400 || all params: 2,509,858,816 || trainable%: 0.1469
ref model parameters
trainable params: 0 || all params: 2,509,858,816 || trainable%: 0.0000


# Prepare post-training data

In [9]:
def load_data_ultra_feedback(split: str = 'train_prefs', conversational: bool = False) -> Dataset:
    """Load UltraFeedback (binarized) and reduce it to prompt/chosen/rejected records.

    Each source row's `chosen` and `rejected` fields are message lists; the
    message at index 1 is taken as the response text.

    Args:
        split: Dataset split to load. Defaults to the preference training split.
        conversational: When False (default) every field is a plain string,
            exactly as before. When True, each field is emitted as a list of
            role-tagged messages (`user` for the prompt, `assistant` for the
            responses). NOTE(review): this is intended for trainers that apply
            the model's chat template to conversational records -- confirm
            against the installed TRL version.

    Returns:
        A `Dataset` with exactly the columns `prompt`, `chosen` and `rejected`.
    """
    ds = load_dataset("HuggingFaceH4/ultrafeedback_binarized", split=split)
    print(ds)

    def to_record(row):
        prompt = row['prompt']
        chosen = row['chosen'][1]['content']
        rejected = row['rejected'][1]['content']
        if conversational:
            return {'prompt': [{'role': 'user', 'content': prompt}],
                    'chosen': [{'role': 'assistant', 'content': chosen}],
                    'rejected': [{'role': 'assistant', 'content': rejected}]}
        return {'prompt': prompt, 'chosen': chosen, 'rejected': rejected}

    return Dataset.from_list([to_record(row) for row in ds])

In [10]:
# Materialize the preference dataset (prompt / chosen / rejected) used for DPO.
dataset = load_data_ultra_feedback()

Dataset({
    features: ['prompt', 'prompt_id', 'chosen', 'rejected', 'messages', 'score_chosen', 'score_rejected'],
    num_rows: 61135
})


# Train model

In [11]:
# DPO training configuration: effective batch size is
# per_device_train_batch_size * gradient_accumulation_steps = 2 per device.
training_args = DPOConfig(
    output_dir="checkpoints/dpo-gemma2b",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,
    learning_rate=1e-5,
    lr_scheduler_type="cosine",
    warmup_steps=100,
    logging_steps=10,
    save_steps=200,
    max_steps=2000,
    beta=0.1,       # Important: strength of preference alignment
    max_length=1024,
    max_prompt_length=512
)

# Pass the tokenizer returned by get_model() explicitly so the trainer
# tokenizes with that exact object instead of falling back to its own default.
# NOTE(review): `processing_class` is the keyword used by recent TRL releases;
# older releases name it `tokenizer` -- confirm against the installed version.
trainer = DPOTrainer(
    model=model,
    ref_model=ref_model,
    args=training_args,
    train_dataset=dataset,
    processing_class=tokenizer,
)

Extracting prompt in train dataset:   0%|          | 0/61135 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/61135 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/61135 [00:00<?, ? examples/s]

In [None]:
# Run DPO fine-tuning with the configuration defined in the previous cell.
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mljingbupt[0m ([33mljingbupt-none[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Detected [huggingface_hub.inference, openai] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


Step,Training Loss
10,0.6768
20,0.7048
30,0.6981
40,0.7011
50,0.6923
60,0.6772
70,0.7133
80,0.6337
90,0.8095
100,0.6678


Step,Training Loss
10,0.6768
20,0.7048
30,0.6981
40,0.7011
50,0.6923
60,0.6772
70,0.7133
80,0.6337
90,0.8095
100,0.6678
