##  Load a pretrained model and tokenizer

In [1]:
import itertools
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import DPOTrainer, DPOConfig

# Model and tokenizer


model_name_or_path = "Qwen/Qwen2-0.5B-Instruct"
ignore_bias_buffers = False

model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
if ignore_bias_buffers:
    # torch distributed hack
    model._ddp_params_and_buffers_to_ignore = [
        name for name, buffer in model.named_buffers() if buffer.dtype == torch.bool
    ]

ref_model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


  from .autonotebook import tqdm as notebook_tqdm
Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


## Task 1 - Finding a suitable dataset

In [2]:
# Function to extract prompt from response
def extract_anthropic_prompt(prompt_and_response: str) -> str:
    search_term = "\n\nAssistant:"
    search_term_idx = prompt_and_response.rfind(search_term)
    assert search_term_idx != -1, f"Prompt and response does not contain '{search_term}'"
    return prompt_and_response[: search_term_idx + len(search_term)]

# Load dataset
def get_static_hh(split: str, sanity_check: bool = False, cache_dir: str = None):
    dataset = load_dataset("Dahoas/static-hh", split=split, cache_dir=cache_dir)
    if sanity_check:
        dataset = dataset.select(range(min(len(dataset), 5)))  # Use a smaller dataset for testing

    def filter_columns(sample):
        return {
            "prompt": sample["prompt"],
            "chosen": sample["chosen"],
            "rejected": sample["rejected"],
        }

    return dataset.map(filter_columns)

# Prepare datasets
sanity_check = True  # Set to False for full dataset
train_dataset = get_static_hh("train", sanity_check=sanity_check)
eval_dataset = get_static_hh("test", sanity_check=sanity_check)

##  Task 2. Training a Model with DPOTrainer

In [3]:
# Define hyperparameter
learning_rates = [5e-5]
batch_sizes = [4]
num_epochs = [3]
betas = [0.1]

# Generate all possible hyperparameter combinations
hyperparameter_combinations = list(itertools.product(learning_rates, batch_sizes, num_epochs, betas))

# Store results
results = []
best_loss = float("inf")  # Initialize best loss as infinity
best_model_path = None

for lr, batch_size, epochs, beta in hyperparameter_combinations:
    print(f"\nTraining with lr={lr}, batch_size={batch_size}, epochs={epochs}, beta={beta}")
    output_dir = f"./dpo_lr{lr}_bs{batch_size}_ep{epochs}_beta{beta}"

    # DPO training configuration
    dpo_config = DPOConfig(
        output_dir=output_dir,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs,
        logging_dir="./logs",
        logging_steps=10,
        save_total_limit=2,
        learning_rate=lr,
        report_to="none",
        beta=beta,  # Temperature parameter for preference weighting
    )

    # Initialize DPOTrainer
    dpo_trainer = DPOTrainer(
        model=model,
        ref_model=ref_model,
        args=dpo_config,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
    )

    # Train model
    dpo_trainer.train()

    # Evaluate model
    eval_results = dpo_trainer.evaluate()
    loss = eval_results.get("eval_loss", None)
    results.append({
        "learning_rate": lr,
        "batch_size": batch_size,
        "epochs": epochs,
        "beta": beta,
        "loss": loss
    })
    if loss is not None and loss < best_loss:
        best_loss = loss
        best_model_path = output_dir
        print(f"New best model found! Saving model at: {best_model_path}")

# Print results
print("\nExperiment Results:")
for res in results:
    print(res)


Training with lr=5e-05, batch_size=4, epochs=3, beta=0.1


  dpo_trainer = DPOTrainer(


Epoch,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
1,No log,1.033892,-1.894423,-1.620894,0.125,-0.273529,-284.84082,-126.990891,-3.056025,-3.12009
2,No log,1.445939,-3.56774,-2.634429,0.125,-0.933312,-301.573975,-137.126236,-3.084869,-3.164944
3,No log,1.634201,-4.152308,-2.937025,0.125,-1.215283,-307.419647,-140.152206,-3.097786,-3.176896


New best model found! Saving model at: ./dpo_lr5e-05_bs4_ep3_beta0.1

Experiment Results:
{'learning_rate': 5e-05, 'batch_size': 4, 'epochs': 3, 'beta': 0.1, 'loss': 1.6342010498046875}


In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Define the path to the best model (replace with your actual best_model_path)
repo_id = "khinhlaing/dop_qwan"  # model from hugging face repo

# Load the fine-tuned model
best_model = AutoModelForCausalLM.from_pretrained(repo_id)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(repo_id)

# Ensure padding token is set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

  from .autonotebook import tqdm as notebook_tqdm
Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


In [2]:
best_model.eval()

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((896,), eps=1e-06)
    (rotary_emb): Qwen2RotaryEmbe

In [4]:
import torch
# Define a single sample prompt
sample_prompt = "How to study effectively?"

# Function to generate a response without repeating the prompt
def generate_response(prompt, max_tokens=100):
    try:
        # Format input as a dialogue
        formatted_prompt = f"Human: {prompt}\n\nAssistant:"

        # Tokenize the input
        input_ids = tokenizer(formatted_prompt, return_tensors="pt").input_ids

        # Generate a response
        with torch.no_grad():
            output_ids = best_model.generate(
                input_ids,
                max_new_tokens=max_tokens,  # Controls output length
                temperature=0.7,  # Adds diversity
                top_p=0.9,  # Nucleus sampling
                do_sample=True,  # Enables varied responses
                pad_token_id=tokenizer.eos_token_id,  # Handles padding properly
            )

        # Decode and clean response
        full_response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        response = full_response.replace(formatted_prompt, "").strip()

        return response
    except Exception as e:
        return f"Error generating response: {str(e)}"

# Generate and print the response
response = generate_response(sample_prompt)
print(f"Prompt: {sample_prompt}\nResponse: {response}")


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Prompt: How to study effectively?
Response: 1. Set up a study schedule, make sure you have all the resources and tools needed.
2. Take notes on everything you read, practice good reading habits, and make sure you understand each chapter.
3. Join a club or organization that interests you, share your knowledge with others and get help from experienced mentors.
4. Practice active listening, be prepared to ask questions and give feedback, and use active learning techniques such as summarizing, making connections and using technology like Google Docs.
5
