In [None]:
!pip install  -U -q trl peft bitsandbytes wandb
# Tested with transformers==4.47.1, trl==0.14.0, datasets==3.2.0, peft==0.14.0, accelerate==1.2.1

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m318.3/318.3 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.7/69.7 MB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m484.9/484.9 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[2K   [91m━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m131.5/363.4 MB[0m [31m40.8 MB/s[0m eta [36m0:00:06[0m

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from inside the notebook; later cells push the
# trained model to the Hub (push_to_hub), which requires an authenticated session.
notebook_login()

In [None]:
import wandb

# Authenticate this session with Weights & Biases.
wandb.login()

# Set the WANDB_PROJECT environment variable to the project name for this run.
%env WANDB_PROJECT=Cosmos-8B-Instruct-GRPO

In [None]:
from datasets import load_dataset

dataset_id = "alibayram/turkish_mmlu"

# Load the train split, drop the columns that are not used further on, then
# take a reproducible (seed=42) random subset of 6600 rows.
train_dataset = (
    load_dataset(dataset_id, split="train")
    .remove_columns(["bolum", "konu", "aciklama", "__index_level_0__"])
    .shuffle(seed=42)
    .select(range(6600))
)

In [None]:
# Sanity check: print the first row of the prepared training subset.
print(train_dataset[0])

In [None]:
# System message (in Turkish) placed at the start of every conversation. It tells
# the assistant to put its reasoning between <düşünce> </düşünce> tags and its
# answer between <cevap> </cevap> tags, and ends with a one-line example.
# The pieces are joined by implicit string concatenation, so each piece that ends
# a sentence must carry its own trailing space.
SYSTEM_PROMPT = (
    "Kullanıcı ve asistan arasında bir konuşma. Kullanıcı bir soru soruyor, asistan bu soruyu çözüyor ve cevap veriyor. Asistan "
    "soruyu öncelikle kendi zihininde düşünüyor ve sonra kullanıcıya yanıt veriyor. Akıl yürütme "
    "işlemi <düşünce> </düşünce>, cevaplar <cevap> </cevap> etiketleri arasında gösteriliyor. "
    "Örneğin: <düşünce>Akıl yürütme kısmı</düşünce><cevap>Cevap Kısmı</cevap>"
)


def make_conversation(example):
    """Turn one multiple-choice record into a chat-style prompt.

    Args:
        example: Mapping with a ``"soru"`` entry (question text) and a
            ``"secenekler"`` entry (sequence of option texts).

    Returns:
        A dict with a single ``"prompt"`` key holding two chat messages: the
        system message (``SYSTEM_PROMPT``) and a user message made of the
        question followed by the labelled options.
    """
    option_labels = ("A)", "B)", "C)", "D)", "E)")

    # zip stops at the shorter sequence: at most five options are labelled.
    labelled_options = []
    for label, option_text in zip(option_labels, example["secenekler"]):
        labelled_options.append(label + option_text)

    question_with_options = example["soru"] + " Seçenekler: " + " ".join(labelled_options)

    system_message = {"role": "system", "content": SYSTEM_PROMPT}
    user_message = {"role": "user", "content": question_with_options}
    return {"prompt": [system_message, user_message]}

# Add the chat-formatted "prompt" column to every row.
train_dataset = train_dataset.map(make_conversation)
# The raw question/option columns are now folded into "prompt", so drop them.
# The receiver must be `train_dataset`; no variable named `train` exists.
train_dataset = train_dataset.remove_columns(["secenekler", "soru"])
# NOTE(review): accuracy_reward reads kwargs["solution"], but no column named
# "solution" is created in this notebook — confirm the dataset provides one
# (or rename the ground-truth column) before training.

In [None]:
# Print the dataset object to inspect what remains after prompt construction.
print(train_dataset)

In [None]:
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# bitsandbytes settings: load weights in 4-bit NF4 with double quantization.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

model_id = "ytu-ce-cosmos/Turkish-Llama-8b-Instruct-v0.1"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    # Without this argument the config above is never applied and the model
    # is loaded unquantized.
    quantization_config=bnb_config,
    torch_dtype="auto",
    device_map="auto",
)

In [None]:
from peft import LoraConfig, get_peft_model

# LoRA adapter settings: rank-8 adapters on the q_proj and v_proj modules only.
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q_proj", "v_proj"],
    task_type="CAUSAL_LM",
)

# Wrap the loaded model with the adapters and report the trainable parameters.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

In [None]:
import re


def format_reward(completions, **kwargs):
    """Reward completions laid out as ``<düşünce>…</düşünce><cevap>…</cevap>``.

    Args:
        completions: One entry per sampled completion. Each entry is a list of
            chat messages; the ``"content"`` of its first message is checked.
        **kwargs: Extra keyword arguments; accepted and ignored.

    Returns:
        A list with one float per completion: 1.0 when the text starts with a
        ``<düşünce>`` block followed (optionally after whitespace) by a
        ``<cevap>`` block that ends the text, otherwise 0.0.
    """
    # The opening tag has to be spelled exactly like the tag requested in the
    # prompt (<düşünce>). re.DOTALL lets the reasoning and the answer span
    # several lines, since "." does not match newlines by default.
    pattern = re.compile(r"^<düşünce>.*?</düşünce>\s*<cevap>.*?</cevap>$", re.DOTALL)
    completion_contents = [completion[0]["content"] for completion in completions]
    return [1.0 if pattern.match(content) else 0.0 for content in completion_contents]

In [None]:
def accuracy_reward(completions, **kwargs):
    """Reward completions whose chosen option letter equals the ground truth.

    Args:
        completions: One entry per sampled completion. Each entry is a list of
            chat messages; the ``"content"`` of its first message is scored.
        **kwargs: Must contain ``"solution"``: one ground-truth option letter
            (``A``-``E``, any case) per completion.

    Returns:
        A list with one float per completion: 1.0 when the correct letter is
        the only option letter mentioned in the answer, otherwise 0.0.

    Raises:
        KeyError: If ``"solution"`` is not supplied.
    """
    solutions = kwargs["solution"]
    completion_contents = [completion[0]["content"] for completion in completions]

    # Text inside the first <cevap>…</cevap> pair, when the completion has one.
    answer_pattern = re.compile(r"<cevap>(.*?)</cevap>", re.DOTALL)
    # Standalone option letter (A-E), optionally followed by ")", and then
    # whitespace, ".", "," or end of text. Applied to upper-cased text.
    option_pattern = re.compile(r"\b([A-E])\)?(?=\s|\.|,|$)")

    rewards = []
    for content, solution in zip(completion_contents, solutions):
        correct_letter = solution.strip().upper()

        # Score only the tagged answer when present: a letter directly followed
        # by "</cevap>" would not satisfy the lookahead above, and letters that
        # appear in the reasoning should not count as the final choice.
        # Untagged completions are scanned as a whole.
        answer_match = answer_pattern.search(content)
        answer_text = answer_match.group(1) if answer_match else content
        letters_found = option_pattern.findall(answer_text.upper())

        # Full reward only if at least one letter was found and every letter
        # found is the correct one.
        is_correct = bool(letters_found) and all(
            letter == correct_letter for letter in letters_found
        )
        rewards.append(1.0 if is_correct else 0.0)
    return rewards

In [None]:
from trl import GRPOConfig

# GRPO training settings, grouped by concern.
training_args = GRPOConfig(
    # Optimisation
    learning_rate=1e-5,
    num_train_epochs=1,
    gradient_accumulation_steps=16,
    bf16=True,
    # Parameters that control the data preprocessing and generation
    remove_unused_columns=False,  # to access the solution column in accuracy_reward
    max_prompt_length=128,  # default: 512
    max_completion_length=64,  # default: 256
    num_generations=4,  # default: 8
    # Reporting, checkpointing and upload
    output_dir="Cosmos-8B-GRPO",
    report_to="wandb",
    logging_steps=10,
    save_strategy="steps",
    save_steps=10,
    push_to_hub=True,
)

In [None]:
from trl import GRPOTrainer

# Assemble the trainer from the LoRA-wrapped model, the GRPO settings, the
# prompt dataset and the two reward functions defined above.
trainer = GRPOTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    reward_funcs=[format_reward, accuracy_reward],
)

In [None]:
# Run GRPO training, then close the Weights & Biases run.
trainer.train()
wandb.finish()

In [None]:
# Write the trained model to the output directory configured in training_args.
trainer.save_model(training_args.output_dir)
# Upload to the Hugging Face Hub, passing the training dataset id as dataset_name.
trainer.push_to_hub(dataset_name=dataset_id)