<a href="https://colab.research.google.com/github/leotodisco/SmolLM-Fine-Tuning/blob/main/SmolLM_GRPO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -qqq datasets==3.2.0 transformers==4.47.1 trl==0.14.0 peft==0.14.0 accelerate==1.2.1 bitsandbytes==0.45.2 --progress-bar off
!pip install -qqq flash-attn --no-build-isolation --progress-bar off


In [None]:
import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import GRPOConfig, GRPOTrainer
from transformers import BitsAndBytesConfig

# Load The Dataset

## Task: Reddit post summarization

In [None]:
# Load the "mlabonne/smoltldr" dataset; later cells read its "train" split
# and its "prompt" / "completion" columns.
dataset = load_dataset("mlabonne/smoltldr")

In [None]:
# Show the first training prompt. Selecting the row before the column reads a
# single example instead of materialising the entire "prompt" column.
dataset["train"][0]["prompt"]



In [None]:
# Show the first training completion. Selecting the row before the column reads
# a single example instead of materialising the entire "completion" column.
dataset["train"][0]["completion"]

' Tried to pet a dog, foot got impaled by a demon stick, never even got to pet the dog.'

The dataset is composed of two columns:
- `prompt`: the prompts
- `completion`: the completions

There are 2,000 prompts and 2,000 completions.


In [None]:
model_name = "HuggingFaceTB/SmolLM-135M-Instruct"

# bitsandbytes quantization settings (NF4, bfloat16 compute, no double quantization).
bnb_config = BitsAndBytesConfig(
    # Explicitly request 4-bit loading: the bnb_4bit_* options below only
    # configure 4-bit quantization, they do not switch it on by themselves.
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
)

# Load the base causal LM with the quantization config applied.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
# Prepare the quantized model for training before the LoRA adapters are attached.
model = prepare_model_for_kbit_training(model)

# NOTE(review): `tokenizer` is not referenced by any later cell visible in this
# notebook — confirm whether it should be passed to the trainer or dropped.
tokenizer = AutoTokenizer.from_pretrained(model_name)


In [None]:
"""!sudo add-apt-repository ppa:ubuntu-toolchain-r/test
!sudo apt-get update
!sudo apt-get install gcc-4.9
!sudo apt-get upgrade libstdc++6"""

'!sudo add-apt-repository ppa:ubuntu-toolchain-r/test\n!sudo apt-get update\n!sudo apt-get install gcc-4.9\n!sudo apt-get upgrade libstdc++6'

In [None]:
# LoRA adapter configuration (rank 8, alpha 32, 5% dropout) for a causal LM.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules="all-linear",
)
model = get_peft_model(model, lora_config)

# print_trainable_parameters() writes the summary itself and returns None, so
# it is called directly: wrapping it in print() only adds a stray "None" line.
model.print_trainable_parameters()

trainable params: 2,442,240 || all params: 136,957,248 || trainable%: 1.7832
None


## Reward Function


In [None]:
# Reward function
ideal_length = 50  # target completion length, in characters


def _completion_text(completion):
    """Return the text of one completion, whichever format it arrives in."""
    if isinstance(completion, str):
        return completion
    # Conversational format: a list of message dicts, each with a "content" key.
    return "".join(message["content"] for message in completion)


def reward_len(completions, **kwargs):
    """Score completions by how close their length is to ``ideal_length``.

    Args:
        completions: The generated completions. Each item is either a plain
            string or a list of message dicts with a ``"content"`` key
            (conversational format).
        **kwargs: Accepted and ignored, so the trainer may pass extra columns.

    Returns:
        One reward per completion: ``-abs(ideal_length - length)``, where the
        length is counted in characters. The best possible reward is 0.
    """
    return [
        -abs(ideal_length - len(_completion_text(completion)))
        for completion in completions
    ]

In [None]:
# GRPO training configuration, grouped by concern.
training_args = GRPOConfig(
    # Output and logging
    output_dir="GRPO",
    logging_steps=1,
    report_to=["none"],
    # Optimisation
    optim="adamw_8bit",
    learning_rate=2e-5,
    num_train_epochs=1,
    bf16=True,
    # Batching
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    # Generation limits and group size
    num_generations=8,
    max_prompt_length=512,
    max_completion_length=96,
    # Keep every dataset column available to the trainer
    remove_unused_columns=False,
)

In [None]:
# Assemble the GRPO trainer from the model, the length-based reward, the
# training configuration and the "train" split.
trainer = GRPOTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset["train"],
    reward_funcs=[reward_len],
)

# Start training; as the last expression of the cell, the returned value is displayed.
trainer.train()

  return fn(*args, **kwargs)
