In [1]:
%pip install -U "transformers>=4.56,<4.58" "datasets>=2.20,<3" \
  "accelerate>=0.34.2" "peft>=0.16,<0.18" trl sentencepiece


Collecting transformers<4.58,>=4.56
  Using cached transformers-4.57.1-py3-none-any.whl.metadata (43 kB)
  Using cached transformers-4.57.1-py3-none-any.whl.metadata (43 kB)
Collecting accelerate>=0.34.2
  Using cached accelerate-1.11.0-py3-none-any.whl.metadata (19 kB)
Collecting accelerate>=0.34.2
  Using cached accelerate-1.11.0-py3-none-any.whl.metadata (19 kB)
Collecting peft<0.18,>=0.16
  Using cached peft-0.17.1-py3-none-any.whl.metadata (14 kB)
Collecting peft<0.18,>=0.16
  Using cached peft-0.17.1-py3-none-any.whl.metadata (14 kB)
Collecting trl
  Using cached trl-0.25.0-py3-none-any.whl.metadata (11 kB)
Collecting trl
  Using cached trl-0.25.0-py3-none-any.whl.metadata (11 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers<4.58,>=4.56)
  Using cached tokenizers-0.22.1-cp39-abi3-macosx_11_0_arm64.whl.metadata (6.8 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers<4.58,>=4.56)
  Using cached tokenizers-0.22.1-cp39-abi3-macosx_11_0_arm64.whl.metadata (6.8 

In [2]:
import os, torch
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "0"  # avoid fast-downloader dependency

if torch.backends.mps.is_available():
    device_map = {"": "mps"}; dtype = torch.float16; dev = "mps"
else:
    device_map = {"": "cpu"}; dtype = torch.float32; dev = "cpu"

print("Device:", dev, "| dtype:", dtype)


Device: mps | dtype: torch.float16


In [3]:
# Disable HF progress bars to avoid traitlets/layout/contextvar errors
import os
os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "0"  # keep the fast-downloader off on this kernel

from huggingface_hub.utils import disable_progress_bars
disable_progress_bars()

from huggingface_hub import snapshot_download

MODEL_ID  = "HuggingFaceTB/SmolLM2-135M"
CACHE_DIR = "./_hf_cache_colab3"

local_model_path = snapshot_download(
    repo_id=MODEL_ID,
    local_dir=CACHE_DIR,
    allow_patterns=["*.safetensors","*.bin","*.json","*.model","tokenizer*","*merges*"],
    resume_download=True,
    max_workers=8,
)
print("Downloaded to:", local_model_path)


Downloaded to: /Users/keerthana/Keerthana/workspace/unsloth/_hf_cache_colab3




In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained(local_model_path, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Policy (trainable via LoRA)
policy = AutoModelForCausalLM.from_pretrained(
    local_model_path,
    low_cpu_mem_usage=True,
    device_map=device_map,
    attn_implementation="sdpa",
)
policy.resize_token_embeddings(len(tokenizer))

# Reference (frozen). Keep on CPU to save MPS memory.
ref = AutoModelForCausalLM.from_pretrained(
    local_model_path,
    torch_dtype=torch.float32,
    low_cpu_mem_usage=True,
    device_map={"": "cpu"},
    attn_implementation="sdpa",
)
ref.resize_token_embeddings(len(tokenizer))

print("policy on:", next(policy.parameters()).device, "| ref on: CPU")


`torch_dtype` is deprecated! Use `dtype` instead!


policy on: mps:0 | ref on: CPU


In [5]:
from datasets import load_dataset

pref = load_dataset("trl-lib/lm-human-preferences-sentiment", split="train[:1000]")

def to_dpo(row):
    prompt   = "Instruction: " + "Provide a helpful, harmless, and honest reply.\nAnswer:"  # simple neutral prompt
    chosen   = row["chosen"].strip()   + tokenizer.eos_token
    rejected = row["rejected"].strip() + tokenizer.eos_token
    return {"prompt": prompt, "chosen": chosen, "rejected": rejected}

dpo_ds = pref.map(to_dpo, remove_columns=pref.column_names)
print(dpo_ds[0])

{'prompt': 'Instruction: Provide a helpful, harmless, and honest reply.\nAnswer:', 'chosen': 'His eyes are narrowed in concentration and he seems to be looking through me.\n\n"What?" I whisper.<|endoftext|>', 'rejected': '"What the hell is going on with you?" He asks the empty room.<|endoftext|>'}


In [6]:
from trl import DPOTrainer, DPOConfig
from peft import LoraConfig

peft_cfg = LoraConfig(
    r=8, lora_alpha=16, lora_dropout=0.1,
    bias="none", task_type="CAUSAL_LM",
    target_modules=["q_proj","k_proj","v_proj","o_proj","up_proj","down_proj","gate_proj"],
)

MAX_LEN = 384  # shorten on tight memory; try 320/256 if OOM

dpo_cfg = DPOConfig(
    output_dir="./colab3_smollm2_dpo",
    beta=0.1,                              # preference strength
    per_device_train_batch_size=2,         # small batches
    gradient_accumulation_steps=16,        # effective batch 32
    learning_rate=1e-4,
    num_train_epochs=1,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    logging_steps=10,
    save_steps=200,
    save_total_limit=1,
    optim="adafactor",                     # memory-friendly
    bf16=False, fp16=False,                # MPS uses fp16 internally—leave flags False
    torch_compile=False,
    max_length=MAX_LEN,
    max_prompt_length=MAX_LEN//2,
)


In [7]:
import gc, torch
gc.collect()
if torch.backends.mps.is_available(): torch.mps.empty_cache()

trainer = DPOTrainer(
    model=policy,
    ref_model=None,                 # <-- let TRL clone a frozen ref internally
    args=dpo_cfg,
    tokenizer=tokenizer,
    train_dataset=dpo_ds,
    peft_config=peft_cfg,
)

# reduce activation memory
trainer.model.gradient_checkpointing_enable()
trainer.model.enable_input_require_grads()

trainer.train()


  trainer = DPOTrainer(
The model is already on multiple devices. Skipping the move to device specified in `args`.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 0}.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 0}.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
10,0.6929
20,0.6899
30,0.6909


TrainOutput(global_step=32, training_loss=0.6907549723982811, metrics={'train_runtime': 164.0677, 'train_samples_per_second': 6.095, 'train_steps_per_second': 0.195, 'total_flos': 0.0, 'train_loss': 0.6907549723982811, 'epoch': 1.0})

In [8]:
from transformers import pipeline

generator = pipeline("text-generation", model=ref, tokenizer=tokenizer)

prompt = "This movie is amazing and I really loved it because"
generated_text = generator(prompt, max_length=50, num_return_sequences=1)[0]['generated_text']
print("##Prompt 1## \n", generated_text)

prompt = "This movie is shit and I really disliked it because"
generated_text = generator(prompt, max_length=50, num_return_sequences=1)[0]['generated_text']
print("##Prompt 2## \n", generated_text)

Device set to use cpu
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Both `max_new_tokens` (=256) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy 

##Prompt 1## 
 This movie is amazing and I really loved it because it's such a beautiful, powerful, powerful film. I can't wait for more.

Sunday, July 16, 2010

How to Do a Good Job

I'm writing this post because I'm really looking forward to the new film. It's a must see so I need to post this one before I go on my first trip to the movies.

It's a movie about a man who's in a relationship with a woman who he's spending all his time with. It's a man who is the victim of a series of bad decisions. The woman he has an affair with is not only his wife, but a woman who he's spending all his time with. It's a man who's in a relationship with a woman who's a part of a far more dangerous relationship. What's worse is that this woman has been a victim of a man who was very nice to her.

I'm not saying this is the worst film ever made. It's not. I'm just saying it's a movie that doesn't have a good ending for a reason.

This movie is not as good as the movie I mentioned above, but it's better