**Preference Fine-Tuning of TinyLlama**

**Assignment 05 - Part 2: PFT Implementation**

In [None]:
!pip install bitsandbytes accelerate
!pip install -U trl
!pip install sacrebleu
!pip install evaluate

In [None]:
!pip show trl


In [None]:
import os
import shutil
import requests
from io import BytesIO
import torch
import pandas as pd
import numpy as np
from datasets import load_dataset, Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig
)
import time
import torch
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
    TaskType,
    PeftModel
)

from trl import DPOTrainer, DPOConfig
import json
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from evaluate import load
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Fix the torch and NumPy global RNG seeds before any data splitting or training.
torch.manual_seed(42)
np.random.seed(42)
# Presumably read by the Trainer's Weights & Biases integration so that no run
# is reported — TODO confirm against the installed transformers version.
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Load the metadata describing the best SFT model. Later cells read the
# `best_model_name`, `best_model_path` and `bleu_score` entries of this dict.
try:
    with open("/kaggle/input/top-sft-model/best_sft_model_info.json", "r") as f:
        sft_info = json.load(f)
    print("✅ Loaded SFT model information:")
    print(f"   Best SFT Model: {sft_info['best_model_name']}")
    print(f"   Model Path: {sft_info['best_model_path']}")
    print(f"   SFT BLEU Score: {sft_info['bleu_score']:.4f}")
except FileNotFoundError:
    # Report the missing file, then re-raise so the notebook stops here.
    print("❌ SFT model info not found. Please run SFT first.")
    raise

In [None]:
def clear_hf_cache():
    """Delete the Hugging Face ``datasets`` cache directories that exist.

    Three candidate locations are tried in order (the current user's home,
    ``/root`` and ``/kaggle/working``). A directory that cannot be removed is
    reported on stdout and skipped; nothing is raised and nothing is returned.
    """
    candidates = (
        os.path.expanduser("~/.cache/huggingface/datasets"),
        "/root/.cache/huggingface/datasets",
        "/kaggle/working/.cache/huggingface/datasets",
    )

    for path in candidates:
        # Guard clause: nothing to do for locations that are not present.
        if not os.path.exists(path):
            continue
        try:
            shutil.rmtree(path)
            print(f"✓ Cleared cache: {path}")
        except Exception as err:
            print(f"⚠ Could not clear {path}: {err}")

In [None]:
def _download_parquet(url, timeout):
    """Fetch one parquet file over HTTP and parse it into a DataFrame."""
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return pd.read_parquet(BytesIO(response.content))


def load_dpo_mix_dataset(max_samples=None, timeout=60):
    """Download argilla/dpo-mix-7k and return it as an 80/20 split.

    The published train and test parquet files are downloaded directly,
    concatenated, optionally truncated to the first ``max_samples`` rows, and
    re-split with a fixed seed.

    Args:
        max_samples: Optional cap on the number of rows kept (taken from the
            head of the concatenated frame). ``None`` or ``0`` keeps all rows.
        timeout: Seconds to wait on each HTTP request before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The result of ``Dataset.train_test_split(test_size=0.2, seed=42)``,
        i.e. a mapping with ``'train'`` and ``'test'`` splits.

    Raises:
        RuntimeError: If downloading or parsing fails. The original error is
            attached as ``__cause__`` and also printed.
    """
    print("🔄 Loading DPO Mix dataset...")

    # Clear cache first
    clear_hf_cache()

    base_url = "https://huggingface.co/datasets/argilla/dpo-mix-7k/resolve/main/data"
    train_url = f"{base_url}/train-00000-of-00001.parquet"
    test_url = f"{base_url}/test-00000-of-00001.parquet"

    try:
        print("Downloading parquet files directly...")

        train_df = _download_parquet(train_url, timeout)
        test_df = _download_parquet(test_url, timeout)

        # The upstream split is discarded here; a new split is made below.
        df = pd.concat([train_df, test_df], ignore_index=True)

        # Limit samples if specified
        if max_samples and max_samples < len(df):
            df = df.head(max_samples)
            print(f"🔪 Limited to {max_samples} samples")

        dataset = Dataset.from_pandas(df)
        dataset = dataset.train_test_split(test_size=0.2, seed=42)

        print(f"✅ Successfully loaded {len(df)} samples!")
        return dataset

    except Exception as e:
        print(f"❌ Failed to load dataset: {e}")
        # Chain the cause explicitly so the real failure (HTTP status, parquet
        # parsing, ...) stays visible in the traceback.
        raise RuntimeError(
            "Could not load dataset. Check your internet connection and try again."
        ) from e

def _last_message_content(messages, role):
    """Return the content of the last message with the given role, or ""."""
    content = ""
    for msg in messages:
        if msg['role'] == role:
            content = msg['content']
    return content


def preprocess_dpo_dataset(dataset):
    """Flatten chat-style preference records into DPO training rows.

    Each record's ``chosen`` / ``rejected`` fields are lists of
    ``{'role', 'content'}`` messages. The prompt is the last user message of
    the chosen conversation; the chosen and rejected responses are the last
    assistant message of the respective conversation. Records where any of the
    three is empty are skipped.

    Args:
        dataset: Mapping with a ``'train'`` entry of records carrying
            ``chosen``, ``rejected``, ``chosen_rating`` and ``rejected_rating``.
            NOTE(review): only ``dataset['train']`` is read, so any other
            split in ``dataset`` is ignored — confirm this is intended.

    Returns:
        ``Dataset.train_test_split(test_size=0.2, seed=42)`` of the flattened
        rows (``'train'`` is used for training, ``'test'`` for validation).

    Raises:
        ValueError: If no record yields a complete prompt/chosen/rejected
            triple, instead of failing later on an empty split.
    """
    print("Preprocessing DPO dataset...")

    processed_data = []

    for sample in dataset['train']:
        prompt = _last_message_content(sample['chosen'], 'user')
        chosen_response = _last_message_content(sample['chosen'], 'assistant')
        rejected_response = _last_message_content(sample['rejected'], 'assistant')

        # Skip if we couldn't extract proper prompt/response pairs
        if not prompt or not chosen_response or not rejected_response:
            continue

        processed_data.append({
            'prompt': prompt,
            'chosen': chosen_response,
            'rejected': rejected_response,
            'chosen_rating': sample['chosen_rating'],
            'rejected_rating': sample['rejected_rating']
        })

    if not processed_data:
        raise ValueError("❌ No valid samples found in dataset.")

    # Convert to HuggingFace dataset
    processed_dataset = Dataset.from_pandas(pd.DataFrame(processed_data))

    # Split into train and validation (80-20 split)
    train_test_split = processed_dataset.train_test_split(test_size=0.2, seed=42)

    print(f"Processed dataset sizes - Train: {len(train_test_split['train'])}, Val: {len(train_test_split['test'])}")
    return train_test_split

def format_dpo_prompt(prompt):
    """Wrap ``prompt`` in the system/user/assistant chat layout used for TinyLlama.

    The returned string ends with the assistant tag and a newline, so a
    generated completion directly follows it.
    """
    system_text = (
        "You are a helpful AI assistant that follows instructions carefully "
        "and provides accurate responses."
    )
    pieces = [
        "<|system|>\n",
        system_text + "\n",
        "<|user|>\n",
        f"{prompt}\n",
        "<|assistant|>\n",
    ]
    return "".join(pieces)

In [None]:
# Download the preference data, capped at 3000 rows before it is split.
dataset = load_dpo_mix_dataset(max_samples=3000)
print("\n📊 Dataset Information:")
print(f"Dataset keys: {list(dataset.keys())}")
print(dataset['train'][0])

# Flatten the chat-style records into prompt/chosen/rejected strings.
# NOTE(review): preprocess_dpo_dataset iterates only dataset['train'], so the
# 'test' part of the split made inside load_dpo_mix_dataset is not used here.
dpo_dataset = preprocess_dpo_dataset(dataset)
dpo_train_dataset = dpo_dataset['train']
# The 'test' part of the second split serves as the validation set.
dpo_val_dataset = dpo_dataset['test']

print("\nSample preprocessed DPO data:")
print(dpo_train_dataset[0])

In [None]:
# Base checkpoint that every model in this notebook is loaded from.
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

print(f"Loading tokenizer: {MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Reuse the EOS token as the padding token and pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# 4-bit NF4 quantization with double quantization; compute dtype is fp16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

def load_sft_model():
    """Build the starting model for DPO from the best SFT result.

    The 4-bit quantized base checkpoint is always loaded. Unless
    ``sft_info['best_model_name']`` is ``'base_model'``, the SFT adapter at
    ``sft_info['best_model_path']`` is applied and merged into it. The result
    is passed through ``prepare_model_for_kbit_training`` and returned.
    """
    print("🔄 Loading best SFT model...")

    use_base_only = sft_info['best_model_name'] == 'base_model'
    if use_base_only:
        print("Loading base model (no SFT applied)")
    else:
        print(f"Loading SFT model: {sft_info['best_model_name']}")

    # Both paths start from the same quantized base checkpoint.
    loaded = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
    )

    if not use_base_only:
        with_adapter = PeftModel.from_pretrained(
            loaded,
            sft_info['best_model_path'],
            device_map="auto"
        )
        # Fold the SFT adapter weights into the base model.
        loaded = with_adapter.merge_and_unload()

    loaded = prepare_model_for_kbit_training(loaded)
    print("✅ SFT model loaded successfully!")
    return loaded

In [None]:
# NOTE(review): train_dpo_model accepts this object but builds its own copy of
# the model from MODEL_NAME, so the instance created here is only passed
# through — confirm it is needed.
sft_model = load_sft_model()

In [None]:
# LoRA adapter hyperparameters, one entry per DPO trial. The keys are reused to
# look up the matching entry in dpo_training_configs.
dpo_lora_configs = {
    "dpo_trial_1": dict(
        r=4,
        lora_alpha=16,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.05,
    ),
    # Disabled trials, kept for reference:
    #   dpo_trial_2: r=6, lora_alpha=24, target_modules=["q_proj", "v_proj"], lora_dropout=0.05
    #   dpo_trial_3: r=8, lora_alpha=32, target_modules=["q_proj", "v_proj", "k_proj"], lora_dropout=0.05
    #   dpo_trial_4: r=8, lora_alpha=32, target_modules=["q_proj", "v_proj", "k_proj", "o_proj"], lora_dropout=0.05
    "dpo_trial_5": dict(
        r=12,
        lora_alpha=48,
        target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
        lora_dropout=0.05,
    ),
}

# Optimizer / schedule / DPO-beta settings, one entry per DPO trial. The keys
# mirror those of dpo_lora_configs.
dpo_training_configs = {
    "dpo_trial_1": dict(
        learning_rate=5e-6,
        per_device_train_batch_size=4,
        num_train_epochs=2,
        gradient_accumulation_steps=3,
        beta=0.1,
    ),
    # Disabled trials, kept for reference:
    #   dpo_trial_2: learning_rate=6e-6, per_device_train_batch_size=3, num_train_epochs=2, gradient_accumulation_steps=2, beta=0.12
    #   dpo_trial_3: learning_rate=7e-6, per_device_train_batch_size=2, num_train_epochs=3, gradient_accumulation_steps=4, beta=0.15
    #   dpo_trial_4: learning_rate=1e-5, per_device_train_batch_size=4, num_train_epochs=3, gradient_accumulation_steps=5, beta=0.2
    "dpo_trial_5": dict(
        learning_rate=1.2e-5,
        per_device_train_batch_size=2,
        num_train_epochs=3,
        gradient_accumulation_steps=4,
        beta=0.25,
    ),
}


In [None]:
def train_dpo_model(trial_name, lora_config, training_config, sft_model):
    """Train, save and log a single LoRA DPO trial.

    The policy is rebuilt inside this function: the 4-bit base checkpoint is
    loaded, the best SFT adapter is merged in (unless the best SFT model is the
    base model), and a fresh LoRA adapter described by ``lora_config`` is
    attached. No separate reference model is loaded: the trainer receives
    ``ref_model=None`` together with a PEFT model, in which case TRL is
    documented to use the policy with its adapter disabled as the reference.

    Args:
        trial_name: Label used for output directories, file names and logs.
        lora_config: Dict with ``r``, ``lora_alpha``, ``target_modules`` and
            ``lora_dropout``.
        training_config: Dict with ``learning_rate``,
            ``per_device_train_batch_size``, ``num_train_epochs``,
            ``gradient_accumulation_steps`` and ``beta``.
        sft_model: Not used; kept so existing callers keep working.

    Returns:
        Dict with ``trial_name``, ``training_time`` (seconds), ``lora_config``
        and ``training_config``.

    Side effects:
        Writes checkpoints to ``./dpo_results/<trial_name>``, the final adapter
        to ``./dpo_models/<trial_name>`` and the trainer log history to
        ``dpo_training_metrics_<trial_name>.json``. GPU memory held by the
        model and trainer is released even when training raises.
    """
    import gc  # function-scope import; only the cleanup path needs it

    print(f"\n{'='*60}")
    print(f"Training DPO {trial_name}")
    print(f"{'='*60}")

    start_time = time.time()

    # Chat layout applied to every prompt; evaluate_dpo_model uses the same text.
    prompt_template = (
        "<|system|>\n"
        "You are a helpful AI assistant that follows instructions carefully and provides accurate, concise, and informative responses.\n"
        "<|user|>\n"
        "{prompt}\n"
        "<|assistant|>\n"
    )

    def format_dataset_for_dpo(examples):
        """Wrap each raw prompt in the chat layout; responses pass through unchanged."""
        return {
            'prompt': [prompt_template.format(prompt=raw) for raw in examples['prompt']],
            'chosen': list(examples['chosen']),
            'rejected': list(examples['rejected']),
        }

    # Pre-bind so the cleanup in `finally` is valid even if loading fails.
    model_copy = None
    dpo_trainer = None
    try:
        model_copy = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            quantization_config=bnb_config,
            device_map="auto",
            trust_remote_code=True,
        )

        if sft_info['best_model_name'] != 'base_model':
            model_copy = PeftModel.from_pretrained(
                model_copy,
                sft_info['best_model_path'],
                device_map="auto"
            )
            model_copy = model_copy.merge_and_unload()

        model_copy = prepare_model_for_kbit_training(model_copy)

        peft_config = LoraConfig(
            task_type=TaskType.CAUSAL_LM,
            inference_mode=False,
            r=lora_config["r"],
            lora_alpha=lora_config["lora_alpha"],
            target_modules=lora_config["target_modules"],
            lora_dropout=lora_config["lora_dropout"],
            bias="none",
        )

        model_copy = get_peft_model(model_copy, peft_config)
        model_copy.print_trainable_parameters()

        training_args = DPOConfig(
            output_dir=f"./dpo_results/{trial_name}",
            learning_rate=training_config["learning_rate"],
            per_device_train_batch_size=training_config["per_device_train_batch_size"],
            per_device_eval_batch_size=1,
            num_train_epochs=training_config["num_train_epochs"],
            gradient_accumulation_steps=training_config["gradient_accumulation_steps"],
            eval_strategy="steps",
            eval_steps=50,
            logging_steps=120,
            save_strategy="epoch",
            fp16=True,
            remove_unused_columns=False,
            run_name=f"dpo_{trial_name}",
            dataloader_drop_last=True,
            beta=training_config["beta"],
            max_length=512,
            max_prompt_length=256,
        )

        formatted_train_dataset = dpo_train_dataset.map(
            format_dataset_for_dpo,
            batched=True,
            remove_columns=dpo_train_dataset.column_names
        )

        formatted_val_dataset = dpo_val_dataset.map(
            format_dataset_for_dpo,
            batched=True,
            remove_columns=dpo_val_dataset.column_names
        )

        # model_copy already carries the LoRA adapter, so no peft_config is
        # passed here; supplying both would ask the trainer to apply LoRA to an
        # already-wrapped model.
        dpo_trainer = DPOTrainer(
            model=model_copy,
            ref_model=None,
            args=training_args,
            train_dataset=formatted_train_dataset,
            eval_dataset=formatted_val_dataset,
            processing_class=tokenizer,
        )

        print("Starting DPO training...")
        dpo_trainer.train()
        dpo_trainer.save_model(f"./dpo_models/{trial_name}")

        training_time = time.time() - start_time

        print(f"{trial_name} completed in {training_time:.2f} seconds")

        log_history = dpo_trainer.state.log_history
        training_logs = {
            "train_loss_history": log_history,
            "eval_loss_history": [log for log in log_history if 'eval_loss' in log],
            # Entries without a learning rate (e.g. eval logs) are recorded as 0.
            "learning_rate_schedule": [log.get('learning_rate', 0) for log in log_history],
            "global_steps": dpo_trainer.state.global_step,
            "total_epochs": dpo_trainer.state.epoch,
        }

        with open(f"dpo_training_metrics_{trial_name}.json", "w") as f:
            json.dump(training_logs, f, indent=2)
    finally:
        # Runs on success and on failure, so a crashed trial does not keep its
        # model on the GPU while the caller moves on to the next trial.
        del model_copy
        del dpo_trainer
        gc.collect()
        torch.cuda.empty_cache()

    return {
        "trial_name": trial_name,
        "training_time": training_time,
        "lora_config": lora_config,
        "training_config": training_config
    }



def preprocess_dpo_dataset(dataset):
    """Flatten chat-format preference records into prompt/chosen/rejected rows.

    NOTE(review): this redefines the function of the same name declared earlier
    in the notebook and replaces it from this cell onward.

    For every record in ``dataset['train']`` the prompt is the last user
    message of the chosen conversation, and the chosen / rejected responses are
    the last assistant message of the respective conversation. Records missing
    any of the three are dropped.

    Returns:
        ``train_test_split(test_size=0.2, seed=42)`` of the flattened rows.

    Raises:
        ValueError: If no record yields a complete triple.
    """
    print("🔄 Preprocessing DPO dataset...")

    def last_content(messages, wanted_role):
        # Scan the whole conversation; the final match wins.
        found = ""
        for message in messages:
            if message['role'] == wanted_role:
                found = message['content']
        return found

    rows = []
    for record in dataset['train']:
        user_prompt = last_content(record['chosen'], 'user')
        preferred = last_content(record['chosen'], 'assistant')
        dispreferred = last_content(record['rejected'], 'assistant')

        if user_prompt and preferred and dispreferred:
            rows.append({
                'prompt': user_prompt,
                'chosen': preferred,
                'rejected': dispreferred,
                'chosen_rating': record['chosen_rating'],
                'rejected_rating': record['rejected_rating']
            })

    if len(rows) == 0:
        raise ValueError("❌ No valid samples found in dataset.")

    # Build the Hugging Face dataset and carve out the validation part.
    frame = pd.DataFrame(rows)
    splits = Dataset.from_pandas(frame).train_test_split(test_size=0.2, seed=42)

    print(f"✅ Processed dataset sizes - Train: {len(splits['train'])}, Val: {len(splits['test'])}")
    return splits


In [None]:
print("\n" + "="*60)
print("STARTING DPO TRAINING EXPERIMENTS")
print("="*60)

# Summaries returned by the trials that finished without raising.
dpo_results = []

# Run one training job per configured trial. A failure in one trial is printed
# with its traceback and the loop moves on to the next trial.
for trial_name in dpo_lora_configs.keys():
    try:
        result = train_dpo_model(
            trial_name=trial_name,
            lora_config=dpo_lora_configs[trial_name],
            training_config=dpo_training_configs[trial_name],
            sft_model=sft_model
        )

        dpo_results.append(result)

        # Persist each trial's summary as soon as it completes.
        with open(f"dpo_training_results_{trial_name}.json", "w") as f:
            json.dump(result, f, indent=2)

    except Exception as e:
        print(f"Error in {trial_name}: {str(e)}")
        import traceback
        traceback.print_exc()
        continue

In [None]:
# NOTE(review): this banner is printed unconditionally, including when some or
# all trials failed in the loop above.
print("\n" + "="*60)
print("ALL TRIALS COMPLETED")
print("="*60)

# Write the summaries of all successful trials to a single file.
with open("all_dpo_training_results.json", "w") as f:
    json.dump(dpo_results, f, indent=2)

In [None]:
# Prompts used to evaluate each trained model. reference_answers holds the
# expected answer for the prompt at the same index.
evaluation_prompts = [
    "Explain the difference between machine learning and deep learning.",
    "Write a short paragraph describing the benefits of regular exercise.",
    "What is the capital of France, and why is it famous?",
    "Summarize the plot of Romeo and Juliet.",
    "How do I bake a chocolate cake from scratch?",
    "Give three reasons why climate change is a global concern.",
    "Convert the sentence 'He is reading a book' into passive voice.",
    "What are the pros and cons of remote work?",
    "Write a polite email requesting a deadline extension.",
    "Translate the phrase 'Good morning' into Spanish, French, and German."
]

# Reference answers for BLEU scoring, index-aligned with evaluation_prompts
# (entry i is the reference for prompt i).
reference_answers = [
    "Machine learning is a subset of AI that focuses on building systems that learn from data. Deep learning is a type of machine learning that uses neural networks with many layers to analyze complex patterns.",
    "Regular exercise improves cardiovascular health, strengthens muscles, boosts mood, and helps maintain a healthy weight.",
    "The capital of France is Paris. It is famous for landmarks like the Eiffel Tower, its art and culture, fashion industry, and rich history.",
    "Romeo and Juliet is a tragedy by William Shakespeare about two young lovers from feuding families whose deaths ultimately reconcile their families.",
    "To bake a chocolate cake from scratch, mix flour, sugar, cocoa powder, eggs, butter, and baking soda. Pour the batter into a greased pan and bake at 350°F for 30–35 minutes.",
    "Climate change is a global concern because it leads to rising sea levels, more frequent extreme weather events, and disruptions to ecosystems and agriculture.",
    "The passive voice of 'He is reading a book' is 'A book is being read by him.'",
    "Remote work offers flexibility and saves commuting time, but it can also lead to isolation and distractions at home.",
    "Dear Professor, I hope you're doing well. I'm writing to request an extension for the assignment deadline due to unforeseen circumstances. Thank you for your consideration.",
    "Spanish: Buenos días, French: Bonjour, German: Guten Morgen."
]


In [None]:
# NOTE(review): `corpus_bleu` and `re` are not referenced anywhere else in the
# visible notebook — confirm they are still needed.
from sacrebleu import corpus_bleu
import re
import evaluate

# NOTE(review): calculate_bleu_score loads its own local `bleu_metric`, so this
# module-level instance is not the one used for scoring.
bleu_metric = evaluate.load("bleu")

In [None]:
def evaluate_dpo_model(model_path, trial_name):
    """Generate an answer to every evaluation prompt with a trained DPO adapter.

    The evaluated model is rebuilt as: fp16 base checkpoint, then the merged
    SFT adapter (unless the best SFT model is the base model), then the DPO
    adapter stored at ``model_path``. Each prompt is wrapped in the same chat
    layout used during DPO training, and the answer is printed next to its
    reference.

    Args:
        model_path: Directory containing the saved DPO adapter.
        trial_name: Trial label, used only in console output.

    Returns:
        list[str]: One generated answer per entry of ``evaluation_prompts``.
        If anything fails, the error is printed and a list of ``"Error"``
        placeholders of the same length is returned instead of raising.

    Notes:
        Generation samples (``do_sample=True``, ``temperature=0.7``), so the
        answers, and any score derived from them, can differ between runs.
        Loaded models are released on both the success and the error path.
    """
    import gc  # function-scope import; only the cleanup path needs it

    print(f"Evaluating DPO model: {trial_name}")

    prompt_template = (
        "<|system|>\n"
        "You are a helpful AI assistant that follows instructions carefully and provides accurate, concise, and informative responses.\n"
        "<|user|>\n"
        "{prompt}\n"
        "<|assistant|>\n"
    )

    # Pre-bind so the cleanup in `finally` is valid even if loading fails.
    model = None
    base_model = None
    try:
        base_model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            device_map="auto",
            torch_dtype=torch.float16,
        )

        if sft_info['best_model_name'] != 'base_model':
            base_model = PeftModel.from_pretrained(
                base_model,
                sft_info['best_model_path'],
                device_map="auto"
            )
            base_model = base_model.merge_and_unload()

        model = PeftModel.from_pretrained(
            base_model,
            model_path,
            device_map="auto"
        )

        predictions = []

        for i, prompt in enumerate(evaluation_prompts, 1):
            print(f"\n{'-'*50}")
            print(f"{trial_name.upper()} - PROMPT {i}: {prompt}")
            print(f"{'-'*50}")

            formatted_prompt = prompt_template.format(prompt=prompt)
            inputs = tokenizer(formatted_prompt, return_tensors="pt").to(device)

            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=256,
                    temperature=0.7,
                    do_sample=True,
                    pad_token_id=tokenizer.eos_token_id
                )

            # Keep only the tokens produced after the prompt. Slicing by prompt
            # length does not depend on the "<|assistant|>" marker appearing
            # exactly once in the decoded text.
            prompt_length = inputs["input_ids"].shape[-1]
            assistant_response = tokenizer.decode(
                outputs[0][prompt_length:], skip_special_tokens=True
            ).strip()

            print("GENERATED RESPONSE:")
            print(assistant_response)

            print("\nREFERENCE ANSWER:")
            print(reference_answers[i-1])

            predictions.append(assistant_response)

        return predictions

    except Exception as e:
        # Best-effort: keep the evaluation loop going and mark this trial.
        print(f"Error evaluating DPO model {model_path}: {str(e)}")
        return ["Error"] * len(evaluation_prompts)

    finally:
        # Release GPU memory whether generation succeeded or failed.
        del model
        del base_model
        gc.collect()
        torch.cuda.empty_cache()

In [None]:
def calculate_bleu_score(predictions, references):
    """Compute corpus BLEU of ``predictions`` against one reference each.

    Args:
        predictions: Sequence of generated strings.
        references: Sequence of reference strings, index-aligned with
            ``predictions``.

    Returns:
        float: The ``'bleu'`` value reported by the ``bleu`` metric, or ``0.0``
        when there are no predictions or every prediction is blank.

    Raises:
        ValueError: If the two sequences differ in length.
    """
    predictions = list(predictions)
    references = list(references)

    if len(predictions) != len(references):
        raise ValueError(
            f"Got {len(predictions)} predictions but {len(references)} references."
        )

    # A hypothesis set with no tokens has no n-gram overlap, so its BLEU is 0.
    # Short-circuit instead of handing the metric a zero-length corpus, which
    # its length-ratio computation is not guaranteed to handle.
    if not any(prediction.strip() for prediction in predictions):
        return 0.0

    bleu_metric = load("bleu")
    # The metric expects a list of references per prediction.
    references_formatted = [[ref] for ref in references]
    bleu_result = bleu_metric.compute(predictions=predictions, references=references_formatted)
    return bleu_result['bleu']


In [None]:
print("\n" + "="*60)
print("STARTING DPO MODEL EVALUATION")
print("="*60)

# Filled by the evaluation loop: trial name -> BLEU score, predictions and the
# configs used. Re-running this cell resets it to empty.
dpo_evaluation_results = {}

In [None]:
# Evaluate every trial whose adapter was saved; trials without a saved adapter
# (e.g. because training failed) are reported and skipped.
for trial_name in dpo_lora_configs.keys():
    model_path = f"./dpo_models/{trial_name}"
    if os.path.exists(model_path):
        print("\n" + "-" * 40)
        print(f"EVALUATING DPO {trial_name.upper()}")
        print("-" * 40)

        dpo_predictions = evaluate_dpo_model(model_path, trial_name)
        dpo_bleu = calculate_bleu_score(dpo_predictions, reference_answers)

        dpo_evaluation_results[trial_name] = {
            'bleu_score': dpo_bleu,
            'predictions': dpo_predictions,
            'lora_config': dpo_lora_configs[trial_name],
            'training_config': dpo_training_configs[trial_name]
        }

        print(f"{trial_name} BLEU Score: {dpo_bleu:.4f}")

        # Baseline: the BLEU stored for the best SFT model in
        # best_sft_model_info.json. No per-model SFT results table exists in
        # this notebook, so the stored score is the only baseline available.
        # NOTE(review): assumes that score was computed on comparable prompts —
        # confirm against the SFT notebook.
        sft_bleu = sft_info.get('bleu_score')
        if sft_bleu is not None:
            improvement = dpo_bleu - sft_bleu
            print(f"Improvement over best SFT: {improvement:+.4f}")
    else:
        print(f"DPO model path {model_path} not found, skipping {trial_name}")


In [None]:
print("\n" + "="*60)
print("DPO EVALUATION RESULTS SUMMARY")
print("="*60)

# Trials ordered best-first by BLEU; later cells reuse this list.
sorted_dpo_models = sorted(dpo_evaluation_results.items(), key=lambda x: x[1]['bleu_score'], reverse=True)

# The header uses the same column widths as the rows below so the model-name
# column is labelled and the two columns line up.
print(f"{'Model':<15} {'BLEU Score':<12}")
print("-" * 70)

for model_name, results in sorted_dpo_models:
    bleu_score = results['bleu_score']
    print(f"{model_name:<15} {bleu_score:<12.4f}")


In [None]:
# Report the top-ranked trial (first entry of the best-first list) together
# with the LoRA and training settings it was run with.
if sorted_dpo_models:
    best_dpo_name = sorted_dpo_models[0][0]
    best_dpo_results = sorted_dpo_models[0][1]

    print("\n" + "="*60)
    print("BEST DPO MODEL")
    print("="*60)
    print(f"Best DPO Model: {best_dpo_name}")
    print(f"BLEU Score: {best_dpo_results['bleu_score']:.4f}")

    print(f"\nLoRA Configuration:")
    for key, value in best_dpo_results['lora_config'].items():
        print(f"  - {key}: {value}")

    print(f"\nDPO Training Configuration:")
    for key, value in best_dpo_results['training_config'].items():
        print(f"  - {key}: {value}")


In [None]:
def generate_detailed_analysis():
    """Print and return an analysis of the DPO evaluation results.

    Reads the module-level ``dpo_evaluation_results`` and ``sft_info``. Each
    section is best-effort: a failure is printed and the remaining sections
    still run.

    Returns:
        dict: Empty when there are no evaluation results. Otherwise it holds
        ``'performance'`` (best/worst trial and their gap, when computable),
        ``'configuration_impact'`` (BLEU grouped by LoRA rank, and BLEU keyed
        by beta and by learning rate) and ``'sft_comparison'`` (baseline BLEU
        from ``sft_info['bleu_score']``, per-trial improvement and counts;
        empty when the baseline is unavailable).
    """
    print("\n" + "=" * 60)
    print("DETAILED ANALYSIS AND INSIGHTS")
    print("=" * 60)

    insights = {}

    if not dpo_evaluation_results:
        print("No DPO evaluation results found. Skipping analysis.")
        return insights

    # --- Best vs. worst trial -------------------------------------------
    try:
        sorted_dpo_models = sorted(
            dpo_evaluation_results.items(),
            key=lambda x: x[1].get('bleu_score', -1),
            reverse=True
        )

        if sorted_dpo_models:
            best_dpo = sorted_dpo_models[0]
            worst_dpo = sorted_dpo_models[-1]

            insights['performance'] = {
                'best_dpo_model': best_dpo[0],
                'best_dpo_bleu': best_dpo[1]['bleu_score'],
                'worst_dpo_model': worst_dpo[0],
                'worst_dpo_bleu': worst_dpo[1]['bleu_score'],
                'performance_gap': best_dpo[1]['bleu_score'] - worst_dpo[1]['bleu_score']
            }

            print(f"Best DPO Model: {best_dpo[0]} (BLEU: {best_dpo[1]['bleu_score']:.4f})")
            print(f"Worst DPO Model: {worst_dpo[0]} (BLEU: {worst_dpo[1]['bleu_score']:.4f})")
            print(f"Performance Gap: {insights['performance']['performance_gap']:.4f}")
        else:
            print("No sorted DPO models found.")
    except Exception as e:
        print(f"Failed to evaluate best/worst DPO models: {e}")

    # --- Hyperparameter impact ------------------------------------------
    print(f"\nCONFIGURATION IMPACT ANALYSIS:")
    print("-" * 40)

    # Scores are collected per rank, so trials sharing a rank are averaged.
    rank_performance = {}
    for name, results in dpo_evaluation_results.items():
        rank = results.get('lora_config', {}).get('r')
        if rank is not None:
            rank_performance.setdefault(rank, []).append(results.get('bleu_score', 0))

    print("LoRA Rank Impact:")
    if rank_performance:
        for rank, scores in rank_performance.items():
            avg_score = np.mean(scores)
            print(f"  Rank {rank}: Average BLEU = {avg_score:.4f} (n={len(scores)})")
    else:
        print("  No rank data available.")

    # Keyed by the beta value itself: if two trials share a beta, the later
    # one overwrites the earlier one.
    beta_performance = {}
    for name, results in dpo_evaluation_results.items():
        beta = results.get('training_config', {}).get('beta')
        if beta is not None:
            beta_performance[beta] = results.get('bleu_score', 0)

    print("\nDPO Beta Parameter Impact:")
    if beta_performance:
        for beta, score in sorted(beta_performance.items()):
            print(f"  Beta {beta}: BLEU = {score:.4f}")
    else:
        print("  No beta data available.")

    # Same keying as beta: one score per distinct learning rate.
    lr_performance = {}
    for name, results in dpo_evaluation_results.items():
        lr = results.get('training_config', {}).get('learning_rate')
        if lr is not None:
            lr_performance[lr] = results.get('bleu_score', 0)

    print("\nLearning Rate Impact:")
    if lr_performance:
        for lr, score in sorted(lr_performance.items()):
            print(f"  LR {lr}: BLEU = {score:.4f}")
    else:
        print("  No learning rate data available.")

    insights['configuration_impact'] = {
        'rank_performance': rank_performance,
        'beta_performance': beta_performance,
        'lr_performance': lr_performance
    }

    # --- Comparison against the SFT baseline ----------------------------
    print(f"\n📈 IMPROVEMENT OVER BEST SFT:")
    print("-" * 40)

    try:
        # The baseline is the BLEU stored with the best SFT model's metadata;
        # no per-model SFT results table exists in this notebook.
        sft_baseline = sft_info['bleu_score']

        print(f"Best SFT BLEU Score: {sft_baseline:.4f}")

        improvements = {}
        for name, results in dpo_evaluation_results.items():
            dpo_bleu = results.get('bleu_score')
            if dpo_bleu is not None:
                improvement = dpo_bleu - sft_baseline
                improvements[name] = improvement
                status = "✅ Improved" if improvement > 0 else "❌ Degraded"
                print(f"{name}: {improvement:+.4f} {status}")

        insights['sft_comparison'] = {
            'sft_baseline': sft_baseline,
            'improvements': improvements,
            'models_improved': sum(1 for imp in improvements.values() if imp > 0),
            'models_degraded': sum(1 for imp in improvements.values() if imp < 0)
        }

        total = len(improvements)
        print(f"\nSummary: {insights['sft_comparison']['models_improved']}/{total} models improved over SFT")
    except Exception as e:
        print(f"Failed to compute SFT comparison: {e}")
        insights['sft_comparison'] = {}

    return insights

In [None]:
# Keep the analysis dict at module level; save_comprehensive_results reads it.
analysis_insights = generate_detailed_analysis()

In [None]:

def save_comprehensive_results():
    """Write the final results, recommendations and a text summary to disk.

    Reads module-level state produced by earlier cells (``sft_info``,
    ``dpo_evaluation_results``, ``analysis_insights``, ``dpo_lora_configs``,
    ``sorted_dpo_models``, ``dpo_train_dataset``, ``dpo_val_dataset``,
    ``MODEL_NAME``, ``evaluation_prompts``) and writes three files into the
    current directory:

    - ``final_dpo_results.json``: everything, serialised with ``default=str``.
    - ``model_recommendations.json``: best trial and its configs.
    - ``dpo_experiment_summary.txt``: human-readable report.

    The report names the dataset and prompt set this notebook actually uses,
    and lists plot files only when they exist on disk. Returns ``None``.
    """
    print("\n" + "="*60)
    print("SAVING COMPREHENSIVE RESULTS")
    print("="*60)

    # sorted_dpo_models is best-first, so entry 0 is the winning trial.
    if sorted_dpo_models:
        best_name, best_results = sorted_dpo_models[0]
    else:
        best_name, best_results = None, None

    final_results = {
        'timestamp': datetime.now().isoformat(),
        'sft_info': sft_info,
        'dpo_evaluation_results': dpo_evaluation_results,
        'analysis_insights': analysis_insights,
        'experiment_summary': {
            'total_dpo_models_trained': len(dpo_lora_configs),
            'total_dpo_models_evaluated': len(dpo_evaluation_results),
            'best_dpo_model': best_name,
            'best_dpo_bleu': best_results['bleu_score'] if best_results else None,
        }
    }

    with open("final_dpo_results.json", "w") as f:
        json.dump(final_results, f, indent=2, default=str)

    recommendations = {
        'best_overall_model': best_name,
        'recommended_configs': {},
        'deployment_notes': []
    }

    if best_results:
        recommendations['recommended_configs'] = {
            'lora_config': best_results['lora_config'],
            'training_config': best_results['training_config']
        }

        recommendations['deployment_notes'] = [
            f"Use {best_name} for production deployment",
            f"Expected BLEU score: {best_results['bleu_score']:.4f}",
            "Model shows good balance across all evaluation criteria"
        ]

    with open("model_recommendations.json", "w") as f:
        json.dump(recommendations, f, indent=2)

    # Plot files are listed only if present; nothing in this function creates them.
    optional_plots = [
        ("dpo_evaluation_results.png", "Performance visualizations"),
        ("comprehensive_model_comparison.png", "Model comparison chart"),
    ]
    existing_plots = [(name, desc) for name, desc in optional_plots if os.path.exists(name)]

    report_lines = [
        "",
        "DPO FINE-TUNING EXPERIMENT SUMMARY REPORT",
        "========================================",
        "",
        f"Experiment Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        "",
        "DATASET INFORMATION:",
        # The data is downloaded from argilla/dpo-mix-7k by load_dpo_mix_dataset.
        "- DPO Dataset: argilla/dpo-mix-7k",
        f"- Training Samples: {len(dpo_train_dataset)}",
        f"- Validation Samples: {len(dpo_val_dataset)}",
        "",
        "MODEL TRAINING:",
        f"- Base Model: {MODEL_NAME}",
        f"- SFT Foundation: {sft_info['best_model_name']}",
        f"- DPO Models Trained: {len(dpo_lora_configs)}",
        f"- DPO Models Evaluated: {len(dpo_evaluation_results)}",
        "",
        "PERFORMANCE RESULTS:",
    ]

    if best_results:
        report_lines += [
            "",
            f"BEST PERFORMING MODEL: {best_name}",
            f"- BLEU Score: {best_results['bleu_score']:.4f}",
            "",
            "CONFIGURATION:",
            f"- LoRA Rank: {best_results['lora_config']['r']}",
            f"- LoRA Alpha: {best_results['lora_config']['lora_alpha']}",
            f"- Target Modules: {best_results['lora_config']['target_modules']}",
            f"- Learning Rate: {best_results['training_config']['learning_rate']}",
            f"- DPO Beta: {best_results['training_config']['beta']}",
        ]

    report_lines += [
        "",
        f"EVALUATION PROMPTS: {len(evaluation_prompts)} general instruction-following prompts",
        "EVALUATION METRICS: BLEU score",
        "",
        "FILES GENERATED:",
        "- final_dpo_results.json: Comprehensive results",
        "- model_recommendations.json: Deployment recommendations",
    ]
    report_lines += [f"- {name}: {desc}" for name, desc in existing_plots]

    summary_report = "\n".join(report_lines) + "\n"

    with open("dpo_experiment_summary.txt", "w") as f:
        f.write(summary_report)

    print("All results saved successfully!")
    print("Generated files:")
    print("   - final_dpo_results.json")
    print("   - model_recommendations.json")
    print("   - dpo_experiment_summary.txt")
    for name, _ in existing_plots:
        print(f"   - {name}")

In [None]:
# Write final_dpo_results.json, model_recommendations.json and
# dpo_experiment_summary.txt into the current directory.
save_comprehensive_results()

In [None]:
import shutil

# Zip the current working directory into output_backup.zip.
# NOTE(review): the archive is written into the same directory it is archiving,
# and the ./dpo_results checkpoints saved each epoch are part of that
# directory — confirm both are intended.
shutil.make_archive('output_backup', 'zip', '.')