In [None]:
!pip -q install --upgrade \
  "unsloth>=2025.10.0" \
  "transformers==4.57.1" \
  "accelerate>=1.10.0" \
  "trl>=0.23.0" \
  "peft>=0.17.1" \
  "bitsandbytes>=0.44.1" \
  "sentencepiece"

# IMPORTANT: import unsloth BEFORE transformers/datasets/peft so its patches take effect
import os, sys, platform, json, torch
from unsloth import FastLanguageModel

import transformers, datasets, accelerate, trl, peft

print("Python       :", sys.version.split()[0])
print("Platform     :", platform.platform())
print("Torch        :", torch.__version__)
print("Transformers :", transformers.__version__)
print("TRL          :", trl.__version__)
print("PEFT         :", peft.__version__)
print("Datasets     :", datasets.__version__)

# GPU check
if torch.cuda.is_available():
    print("CUDA GPU     :", torch.cuda.get_device_name(0))
else:
    print("CUDA GPU     : NOT AVAILABLE â€” training will be slow.")

# Load smollm2-135m in 4-bit
model_name = "unsloth/smollm2-135m"
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name     = model_name,
    max_seq_length = 2048,
    dtype          = torch.float16,
    load_in_4bit   = True,
    device_map     = "auto",
)
tokenizer.padding_side = "right"

# Tiny generate smoke test
prompt = "You are a helpful assistant. In one sentence, say hello."
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.inference_mode():
    out = model.generate(**inputs, max_new_tokens=24)

print("\n=== Smoke test output ===")
print(tokenizer.decode(out[0], skip_special_tokens=True))
print("\nâœ… Step 1 complete: environment OK and model loaded.")

DATASET_CHOICE = "Dahoas/full-hh-rlhf"
print("Planned dataset:", DATASET_CHOICE)


ðŸ¦¥ Unsloth: Will patch your computer to enable 2x faster free finetuning.
ðŸ¦¥ Unsloth Zoo will now patch everything to make training faster!
Python       : 3.12.12
Platform     : Linux-6.6.105+-x86_64-with-glibc2.35
Torch        : 2.8.0+cu126
Transformers : 4.57.1
TRL          : 0.25.0
PEFT         : 0.17.1
Datasets     : 4.3.0
CUDA GPU     : Tesla T4
==((====))==  Unsloth 2025.11.3: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/158 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/742 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]


=== Smoke test output ===
You are a helpful assistant. In one sentence, say hello.

The other person is a good friend.

The other person is a good friend.

The other

âœ… Step 1 complete: environment OK and model loaded.
Planned dataset: Dahoas/full-hh-rlhf


In [None]:
# Load dataset

from datasets import load_dataset

DATASET_CHOICE = "Dahoas/full-hh-rlhf"

def load_preference_dataset(name: str, split: str = "train"):
    """
    Returns a Dataset containing ONLY:
        - prompt (str)
        - chosen (str)
        - rejected (str)
    """

    if name == "Dahoas/full-hh-rlhf":
        ds = load_dataset("Dahoas/full-hh-rlhf", split=split)
        keep = {"prompt", "chosen", "rejected"}
        drop_cols = [c for c in ds.column_names if c not in keep]
        if drop_cols:
            ds = ds.remove_columns(drop_cols)
        return ds

    if name == "Dahoas/synthetic-instruct-gptj-pairwise":
        ds = load_dataset("Dahoas/synthetic-instruct-gptj-pairwise", split=split)
        keep = {"prompt", "chosen", "rejected"}
        drop_cols = [c for c in ds.column_names if c not in keep]
        if drop_cols:
            ds = ds.remove_columns(drop_cols)
        return ds

    if name == "Anthropic/hh-rlhf":
        raw = load_dataset("Anthropic/hh-rlhf", split=split)
        # Anthropic dataset has NO explicit "prompt" field â†’ create empty prompt
        def add_prompt(example):
            example["prompt"] = ""
            return example
        raw = raw.map(add_prompt)
        keep = {"prompt", "chosen", "rejected"}
        drop_cols = [c for c in raw.column_names if c not in keep]
        if drop_cols:
            raw = raw.remove_columns(drop_cols)
        return raw

    raise ValueError(f"Unknown dataset name: {name}")


# --- Load the dataset ---
train_dataset = load_preference_dataset(DATASET_CHOICE, split="train")

print("âœ… Dataset loaded successfully")
print("Dataset name :", DATASET_CHOICE)
print("Num rows     :", len(train_dataset))
print("Columns      :", train_dataset.column_names)

# --- Preview a sample row ---
sample = train_dataset[0]

def preview(text, n=280):
    if isinstance(text, str) and len(text) > n:
        return text[:n] + " [...]"
    return text

print("\n--- SAMPLE ROW ---")
print("PROMPT  :", preview(sample.get("prompt")))
print("CHOSEN  :", preview(sample.get("chosen")))
print("REJECTED:", preview(sample.get("rejected")))


README.md:   0%|          | 0.00/478 [00:00<?, ?B/s]

dataset_infos.json:   0%|          | 0.00/930 [00:00<?, ?B/s]

data/train-00000-of-00001-8349d0765e6718(â€¦):   0%|          | 0.00/123M [00:00<?, ?B/s]

data/test-00000-of-00001-ec71e9262143a91(â€¦):   0%|          | 0.00/13.6M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/112052 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/12451 [00:00<?, ? examples/s]

âœ… Dataset loaded successfully
Dataset name : Dahoas/full-hh-rlhf
Num rows     : 112052
Columns      : ['prompt', 'chosen', 'rejected']

--- SAMPLE ROW ---
PROMPT  : 

Human: Should you buy a case to protect your cell phone?

Assistant: It depends on your circumstances.  If you carry your phone in a pocket or a purse then you probably want a case.  But if you only need a phone for quick interactions, a case may actually cause more harm than g [...]
CHOSEN  :  Youâ€™re welcome.
REJECTED:  It sounds like youâ€™ve got the basics down.  Any further questions or concerns?  You can send me any feedback you have at help@babba.ai.


In [None]:
# Tokenization
def tokenize_pref(example):
    """
    Creates model-ready input for DPO:
        input_ids_chosen
        attention_mask_chosen
        input_ids_rejected
        attention_mask_rejected
    """
    prompt   = example["prompt"]
    chosen   = example["chosen"]
    rejected = example["rejected"]

    # Chat-style format (simple & safe for preference training)
    text_chosen = f"{prompt}\n\n### Response:\n{chosen}"
    text_rejected = f"{prompt}\n\n### Response:\n{rejected}"

    # Tokenize separate paths
    tokens_chosen = tokenizer(
        text_chosen,
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    tokens_rejected = tokenizer(
        text_rejected,
        truncation=True,
        padding="max_length",
        max_length=512,
    )

    return {
        "input_ids_chosen": tokens_chosen["input_ids"],
        "attention_mask_chosen": tokens_chosen["attention_mask"],
        "input_ids_rejected": tokens_rejected["input_ids"],
        "attention_mask_rejected": tokens_rejected["attention_mask"],
    }

print("Tokenizing dataset... this may take ~1â€“2 minutes.")

tokenized_dataset = train_dataset.map(
    tokenize_pref,
    batched=False,
    remove_columns=train_dataset.column_names,  # keep only tokenized fields
)

print("âœ… Tokenization complete!")
print("Columns now:", tokenized_dataset.column_names)

# Show 1 mini sample to confirm structure:
sample_tok = tokenized_dataset[0]
print("\n--- TOKENIZED SAMPLE ---")
print("Chosen IDs length  :", len(sample_tok["input_ids_chosen"]))
print("Rejected IDs length:", len(sample_tok["input_ids_rejected"]))
print("Masks OK?          :",
      len(sample_tok["attention_mask_chosen"]),
      len(sample_tok["attention_mask_rejected"]))


Tokenizing dataset... this may take ~1â€“2 minutes.


Map:   0%|          | 0/112052 [00:00<?, ? examples/s]

âœ… Tokenization complete!
Columns now: ['input_ids_chosen', 'attention_mask_chosen', 'input_ids_rejected', 'attention_mask_rejected']

--- TOKENIZED SAMPLE ---
Chosen IDs length  : 512
Rejected IDs length: 512
Masks OK?          : 512 512


In [None]:
# Build DPOTrainer

from trl import DPOConfig, DPOTrainer
from peft import LoraConfig

# 1) (Re)load RAW train split with the 3 required text columns
train_dataset_raw = load_preference_dataset(DATASET_CHOICE, split="train")
print("RAW columns:", train_dataset_raw.column_names[:10])

# 2) Define LoRA (same as before â€” safe to re-run)
lora_config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=[
        "q_proj","k_proj","v_proj","o_proj",
        "gate_proj","up_proj","down_proj",
        "embed_tokens","lm_head",
    ],
    lora_dropout=0.0,
    bias="none",
    task_type="CAUSAL_LM",
)

# 3) Trainer args (turn off column pruning to be extra safe)
dpo_config = DPOConfig(
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    max_grad_norm=1.0,
    num_train_epochs=0.02,         # tiny quick run; we can change later
    output_dir="preference_rl_model",
    remove_unused_columns=False,    # <-- important
    logging_steps=25,
)

# 4) Build trainer with the RAW dataset (NOT the tokenized one)
trainer = DPOTrainer(
    model=model,
    tokenizer=tokenizer,
    args=dpo_config,
    train_dataset=train_dataset_raw,     # <-- raw
    peft_config=lora_config,
)

print("\nâœ… STEP 4 complete: Trainer is ready (built on RAW dataset).")
print("You can now run STEP 5 to start training.")


RAW columns: ['prompt', 'chosen', 'rejected']




Extracting prompt in train dataset (num_proc=6):   0%|          | 0/112052 [00:00<?, ? examples/s]

Applying chat template to train dataset (num_proc=6):   0%|          | 0/112052 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=6):   0%|          | 0/112052 [00:00<?, ? examples/s]


âœ… STEP 4 complete: Trainer is ready (built on RAW dataset).
You can now run STEP 5 to start training.


In [None]:
# Training

print("Starting preference-RL training...")

train_result = trainer.train()

print("\nâœ… Training finished!")
print(train_result)


The model is already on multiple devices. Skipping the move to device specified in `args`.


Starting preference-RL training...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 112,052 | Num Epochs = 1 | Total steps = 141
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 8 x 1) = 16
 "-____-"     Trainable parameters = 6,475,808 of 140,991,392 (4.59% trained)
  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 Â·Â·Â·Â·Â·Â·Â·Â·Â·Â·


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mrutujabhaskarrao[0m ([33mrutujabhaskarrao-san-jose-state-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Detected [huggingface_hub.inference, openai] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / chosen,logps / rejected,logits / chosen,logits / rejected,eval_logits / chosen,eval_logits / rejected,nll_loss
25,0.6928,0.006299,0.005478,0.475,0.000821,-179.216049,-170.474731,6.397267,6.435376,0,0,0
50,0.6874,0.044888,0.032514,0.575,0.012374,-210.34726,-195.908737,6.311908,6.436098,No Log,No Log,No Log
75,0.6825,0.091843,0.068295,0.6225,0.023548,-215.496735,-194.208237,6.060249,6.013487,No Log,No Log,No Log
100,0.6785,0.124838,0.091641,0.625,0.033197,-206.066757,-180.811508,6.312346,6.400402,No Log,No Log,No Log
125,0.6754,0.141903,0.100805,0.5925,0.041098,-203.565033,-178.2789,5.980382,6.306888,No Log,No Log,No Log





âœ… Training finished!
TrainOutput(global_step=141, training_loss=0.6823462695940167, metrics={'train_runtime': 965.9835, 'train_samples_per_second': 2.32, 'train_steps_per_second': 0.146, 'total_flos': 0.0, 'train_loss': 0.6823462695940167, 'epoch': 0.020133509442044766})


In [None]:
# merge to FP16

from unsloth import FastLanguageModel

import torch, os
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name   = "unsloth/smollm2-135m"
ADAPTER_DIR  = "preference_rl_model"         # where trainer saved the LoRA adapter/tokenizer
MERGED_DIR   = "preference_rl_model_merged"  # new folder for merged FP16 weights
os.makedirs(MERGED_DIR, exist_ok=True)

print("1) Load a fresh FP16 base model (NOT 4-bit)â€¦")
base_model, base_tok = FastLanguageModel.from_pretrained(
    model_name     = model_name,
    load_in_4bit   = False,            # <-- important
    dtype          = torch.float16,    # real FP16 weights
    device_map     = "auto",
)

print("2) Attach the LoRA adapter from trainingâ€¦")
peft_model = PeftModel.from_pretrained(base_model, ADAPTER_DIR)

print("3) Merge LoRA into the base model and drop adapter hooksâ€¦")
merged_model = peft_model.merge_and_unload()   # produces a plain FP16 Transformers model

print("4) Save merged model + tokenizerâ€¦")
merged_model.save_pretrained(MERGED_DIR, safe_serialization=False)
base_tok.save_pretrained(MERGED_DIR)
print("âœ… Merge complete ->", MERGED_DIR)

# ---------- Quick generation test (without reloading) ----------
print("\n>>> Quick generation with the in-memory merged model")
test_prompt = "What is the difference between data science and machine learning?"
tok_inputs  = base_tok(test_prompt, return_tensors="pt").to(merged_model.device)
with torch.inference_mode():
    out = merged_model.generate(**tok_inputs, max_new_tokens=120)
print(base_tok.decode(out[0], skip_special_tokens=True))

# ---------- Optional: Reload from disk and test again ----------
print("\n>>> Reload merged model from disk for clean inference")
# Keep unsloth imported before transformers (done above). Now load normally:
reload_tok = AutoTokenizer.from_pretrained(MERGED_DIR, use_fast=True)
reload_mdl, _ = FastLanguageModel.from_pretrained(
    model_name     = MERGED_DIR,
    load_in_4bit   = False,
    dtype          = torch.float16,
    device_map     = "auto",
)
with torch.inference_mode():
    out2 = reload_mdl.generate(**reload_tok(test_prompt, return_tensors="pt").to(reload_mdl.device),
                               max_new_tokens=120)
print(reload_tok.decode(out2[0], skip_special_tokens=True))

print("\nâœ… Step complete: merged FP16 model saved and inference verified.")

1) Load a fresh FP16 base model (NOT 4-bit)â€¦
==((====))==  Unsloth 2025.11.3: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
2) Attach the LoRA adapter from trainingâ€¦




3) Merge LoRA into the base model and drop adapter hooksâ€¦
4) Save merged model + tokenizerâ€¦


```python
from transformers import AutoModelForCausalLM

# Load original tied model
model = AutoModelForCausalLM.from_pretrained("google/gemma-2-2b-it", tie_word_embeddings=False)

# Set the randomly initialized lm_head to the previously tied embeddings
model.lm_head.weight.data = model.model.embed_tokens.weight.data.clone()

# Save the untied model
untied_model_dir = "dir/for/untied/model"
model.save_pretrained(untied_model_dir)
model.config.save_pretrained(untied_model_dir)

# Now use the original model but in untied format
model = AutoModelForCausalLM.from_pretrained(untied_model_dir)
```



âœ… Merge complete -> preference_rl_model_merged

>>> Quick generation with the in-memory merged model
What is the difference between data science and machine learning?

Data science is the process of collecting, analyzing, and interpreting data to make predictions, decisions, and decisions. Machine learning is a subset of data science that focuses on developing algorithms that can learn from data without being explicitly programmed.

What is the difference between data science and machine learning?

Data science is the process of collecting, analyzing, and interpreting data to make predictions, decisions, and decisions. Machine learning is the process of developing algorithms that can learn from data without being explicitly programmed.

What is the difference between data science and machine learning?

Data science is

>>> Reload merged model from disk for clean inference
==((====))==  Unsloth 2025.11.3: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1.

In [None]:
# Tiny evaluation on preference pairs

from unsloth import FastLanguageModel
import torch, math, random
from datasets import load_dataset

MERGED_DIR  = "preference_rl_model_merged"   # from previous step
DEVICE_MAP  = "auto"
DTYPE       = torch.float16
MAXLEN      = 512
N_EVAL      = 200          # try 50 for a very quick run; increase for better signal
SEED        = 42
random.seed(SEED)

# 1) Load merged FP16 model
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name    = MERGED_DIR,
    load_in_4bit  = False,
    dtype         = DTYPE,
    device_map    = DEVICE_MAP,
)
tokenizer.padding_side = "right"

# 2) Load a test split with (prompt, chosen, rejected)
test_raw = load_dataset("Dahoas/full-hh-rlhf", split="test")
keep = {"prompt","chosen","rejected"}
drop_cols = [c for c in test_raw.column_names if c not in keep]
if drop_cols: test_raw = test_raw.remove_columns(drop_cols)

# Subsample for speed
if len(test_raw) > N_EVAL:
    test_raw = test_raw.shuffle(seed=SEED).select(range(N_EVAL))

def build_pair_text(example):
    p = example["prompt"] or ""
    return (
        f"{p}\n\n### Response:\n{example['chosen']}",
        f"{p}\n\n### Response:\n{example['rejected']}",
    )

@torch.inference_mode()
def mean_logprob_of_response(full_text: str, prompt_len_tokens: int):
    """Compute mean token logprob of the continuation given the prompt.
       We tokenize once, shift labels, and average continuation token logprobs."""
    enc = tokenizer(full_text, return_tensors="pt", truncation=True, padding=False, max_length=MAXLEN)
    input_ids = enc["input_ids"].to(model.device)
    attn_mask = enc["attention_mask"].to(model.device)

    # forward pass
    out = model(input_ids=input_ids, attention_mask=attn_mask, labels=input_ids)
    # token-wise negative log-likelihood
    # out.loss is averaged over all tokens; we want continuation only
    # So compute per-token loss from logits
    logits = out.logits[:, :-1, :]                      # (B, T-1, V)
    labels = input_ids[:, 1:]                           # (B, T-1)
    # cross-entropy per token
    logprobs = torch.log_softmax(logits, dim=-1).gather(-1, labels.unsqueeze(-1)).squeeze(-1)  # (B, T-1)

    # mask: only continuation tokens (after prompt_len_tokens)
    Tm1 = logprobs.size(1)
    cont_start = max(1, min(prompt_len_tokens, Tm1))    # avoid OOB
    cont_mask = torch.zeros_like(logprobs, dtype=torch.bool)
    cont_mask[:, cont_start:] = True

    sel = logprobs.masked_select(cont_mask)
    if sel.numel() == 0:
        return -1e9  # no continuation, treat as very low score
    return sel.mean().item()

def prompt_len_tokens_from_prompt(prompt_text: str):
    ids = tokenizer(prompt_text, truncation=True, max_length=MAXLEN)["input_ids"]
    return len(ids)

# 3) Evaluate win-rate: logprob(chosen) > logprob(rejected)
wins = 0
rows = []
for ex in test_raw:
    chosen_text, rejected_text = build_pair_text(ex)
    prompt_tokens = prompt_len_tokens_from_prompt(ex["prompt"] or "")
    lp_c = mean_logprob_of_response(chosen_text,   prompt_tokens)
    lp_r = mean_logprob_of_response(rejected_text, prompt_tokens)
    win = 1 if lp_c > lp_r else 0
    wins += win
    # keep a few examples for display
    if len(rows) < 5:
        rows.append({
            "lp_chosen": round(lp_c, 4),
            "lp_rejected": round(lp_r, 4),
            "margin": round(lp_c - lp_r, 4),
            "snippet_prompt": (ex["prompt"] or "")[:100].replace("\n"," "),
            "snippet_chosen": ex["chosen"][:100].replace("\n"," "),
            "snippet_rejected": ex["rejected"][:100].replace("\n"," "),
        })

win_rate = wins / len(test_raw)
print(f"\nâœ… Eval complete on {len(test_raw)} pairs.")
print(f"Win-rate (chosen beats rejected by mean logprob): {win_rate:.3f}")

print("\n--- sample scored rows ---")
for i, r in enumerate(rows, 1):
    print(f"[{i}] margin={r['margin']:+.4f} | lp_c={r['lp_chosen']:.3f} | lp_r={r['lp_rejected']:.3f}")
    print(f"    prompt   : {r['snippet_prompt']}")
    print(f"    chosen   : {r['snippet_chosen']}")
    print(f"    rejected : {r['snippet_rejected']}\n")

# 4) Tiny interactive demo
def generate(prompt: str, max_new_tokens: int = 128):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.inference_mode():
        out = model.generate(**inputs, max_new_tokens=max_new_tokens)
    return tokenizer.decode(out[0], skip_special_tokens=True)

print("\nTry me:\n", generate("Explain DPO in simple terms. Give a 3-sentence answer."))


==((====))==  Unsloth 2025.11.3: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!

âœ… Eval complete on 200 pairs.
Win-rate (chosen beats rejected by mean logprob): 0.560

--- sample scored rows ---
[1] margin=-0.7119 | lp_c=-2.311 | lp_r=-1.599
    prompt   :   Human: What are the planets of the solar system?  Assistant: The solar system consists of eight pl
    chosen   :  Pluto was a planet for a long time, but more recently it was downgraded to the status of a dwarf pl
    rejected :  Pluto was considered to be the ninth planet until 2006, but it was found that it is much smaller th

[2] margin=+0.4102 | lp_c=-2.883 | lp_