# Thoughts

- trl (transformer reinforcement learning) package https://github.com/huggingface/trl/tree/main/examples
- LLM as judge
    - Can we ask the LLM a question, save the answer, and then have a closed LLM answer the same question, compare the responses, and prefer the closed LLM answer?



In [None]:
# Data format expected by the trl library
# for the DPO and CPO methods

# dpo_dataset_dict = {
#     "prompt": [
#         "hello",
#         "how are you",
#         "What is your name?",
#         "What is your name?",
#         "Which is the best programming language?",
#         "Which is the best programming language?",
#         "Which is the best programming language?",
#     ],
#     "chosen": [
#         "hi nice to meet you",
#         "I am fine",
#         "My name is Mary",
#         "My name is Mary",
#         "Python",
#         "Python",
#         "Java",
#     ],
#     "rejected": [
#         "leave me alone",
#         "I am not fine",
#         "Whats it to you?",
#         "I dont have a name",
#         "Javascript",
#         "C++",
#         "C++",
#     ],
# }

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Checkpoint identifier handed to from_pretrained() below. The same name is
# used for both the tokenizer and the model so the two stay matched.
model_name = "gpt2"  # You can choose any suitable model
# Tokenizer that turns text into the input ids the model consumes.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Causal language model whose parameters are fine-tuned in the cells below.
model = AutoModelForCausalLM.from_pretrained(model_name)

In [None]:
# Preference pairs used for fine-tuning. Every entry carries a "prompt", the
# preferred answer ("chosen") and the dispreferred answer ("rejected").
dataset = [
    dict(
        prompt="What is the capital of France?",
        chosen="Paris",
        rejected="London",
    ),
    # ... more examples
]

In [None]:
# DPO-style preference loss.
# For each pair we compare the log-probability the model assigns to the whole
# chosen sequence with the one it assigns to the whole rejected sequence:
#     margin = log p(chosen) - log p(rejected)
# and minimise -log(sigmoid(beta * margin)).
# If margin > 0, the chosen is more likely than the rejected (loss < log 2).
# If margin < 0, the rejected is more likely than the chosen (loss > log 2).
# If margin == 0, both are equally likely (loss == log 2).

def dpo_loss(model, chosen_ids, rejected_ids, ref_model=None, beta=0.1):
    """Return the mean preference loss for a batch of chosen/rejected pairs.

    Each sequence is scored by the sum of the log-probabilities of its own
    tokens (token t + 1 predicted from the logits at position t). Because
    every sequence is reduced to one number before the comparison, the chosen
    and rejected sequences may have different lengths.

    Args:
        model: Callable taking an id tensor of shape (batch, seq_len) and
            returning an object whose ``.logits`` has shape
            (batch, seq_len, vocab_size).
        chosen_ids: Integer token ids of the preferred sequences,
            shape (batch, chosen_len).
        rejected_ids: Integer token ids of the dispreferred sequences,
            shape (batch, rejected_len).
        ref_model: Optional reference model with the same call convention.
            When given, its chosen-vs-rejected margin is subtracted from the
            policy margin (standard DPO); it is evaluated without gradients.
            When ``None`` the loss is reference-free.
        beta: Scale applied to the margin before the sigmoid.

    Returns:
        A 0-dim tensor: ``-logsigmoid(beta * margin)`` averaged over the batch.

    Note:
        No attention mask is applied, so padded batches would count pad
        tokens towards the sequence score. A sequence of a single token has
        nothing to predict and scores 0.
    """

    def _sequence_log_prob(lm, input_ids):
        # Logits at position t score the token at position t + 1, so drop the
        # last logit and the first token to line predictions up with targets.
        logits = lm(input_ids).logits[:, :-1, :]
        targets = input_ids[:, 1:].unsqueeze(-1)
        log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
        # Pick the log-probability of each actual token, then sum per sequence.
        return log_probs.gather(-1, targets).squeeze(-1).sum(dim=-1)

    margin = _sequence_log_prob(model, chosen_ids) - _sequence_log_prob(
        model, rejected_ids
    )

    if ref_model is not None:
        # The reference only anchors the margin; it must not receive gradients.
        with torch.no_grad():
            ref_margin = _sequence_log_prob(
                ref_model, chosen_ids
            ) - _sequence_log_prob(ref_model, rejected_ids)
        margin = margin - ref_margin

    return -torch.nn.functional.logsigmoid(beta * margin).mean()

In [None]:
# Preference fine-tuning loop: one optimizer step per preference pair.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

# from_pretrained() hands the model back in evaluation mode; switch to
# training mode so dropout is active while fine-tuning.
model.train()

for epoch in range(5):
    # Each "batch" is a single {"prompt", "chosen", "rejected"} example.
    for batch in dataset:
        # Score each answer in the context of its prompt; without the prompt
        # the model would compare the answers with no question to condition on.
        chosen_ids = tokenizer(
            f'{batch["prompt"]} {batch["chosen"]}', return_tensors="pt"
        )["input_ids"]
        rejected_ids = tokenizer(
            f'{batch["prompt"]} {batch["rejected"]}', return_tensors="pt"
        )["input_ids"]

        optimizer.zero_grad()
        loss = dpo_loss(model, chosen_ids, rejected_ids)
        loss.backward()
        optimizer.step()

In [None]:
# Write the fine-tuned weights and model config to ./fine-tuned-model.
model.save_pretrained("fine-tuned-model")
# Save the tokenizer files into the same directory so that both the model and
# the tokenizer can be reloaded from it with from_pretrained().
tokenizer.save_pretrained("fine-tuned-model")