**Preference Fine-Tuning of TinyLlama**

**Assignment 05 - Part 1: SFT Implementation**

In [None]:
!pip install bitsandbytes accelerate
!pip install trl
!pip install sacrebleu
!pip install evaluate

In [None]:
import os
import shutil
import requests
from io import BytesIO
import torch
import pandas as pd
import numpy as np
from datasets import load_dataset, Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig
)
import time
import torch
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
    TaskType,
)

from trl import SFTTrainer
import json
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from evaluate import load
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Fix the NumPy and PyTorch RNG seeds so repeated runs of the notebook are comparable.
np.random.seed(42)
torch.manual_seed(42)

In [None]:
# Use the GPU when PyTorch reports CUDA as available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
def clear_hf_cache(cache_dirs=None):
    """Delete Hugging Face dataset cache directories to avoid stale-cache conflicts.

    Args:
        cache_dirs: Optional iterable of directory paths to remove. When
            omitted, the dataset cache under the current user's home,
            ``/root`` and ``/kaggle/working`` is used.

    Returns:
        list: The directories that were actually removed, in the order given.
    """
    if cache_dirs is None:
        cache_dirs = [
            os.path.expanduser("~/.cache/huggingface/datasets"),
            "/root/.cache/huggingface/datasets",
            "/kaggle/working/.cache/huggingface/datasets",
        ]

    cleared = []
    # dict.fromkeys drops duplicate paths (e.g. "~" expanding to /root) while
    # keeping the original order.
    for cache_dir in dict.fromkeys(cache_dirs):
        if not os.path.exists(cache_dir):
            continue
        try:
            shutil.rmtree(cache_dir)
        except OSError as e:
            # Best effort: a cache that cannot be removed must not abort loading.
            print(f"⚠ Could not clear {cache_dir}: {e}")
        else:
            print(f"✓ Cleared cache: {cache_dir}")
            cleared.append(cache_dir)

    return cleared

def _download_parquet(url, timeout=60):
    """Download one parquet file over HTTP and return it as a pandas DataFrame."""
    # Without a timeout, requests can block forever on a stalled connection.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return pd.read_parquet(BytesIO(response.content))


def load_openassistant_dataset(max_samples=None):
    """Load the OpenAssistant (oasst1) dataset from its parquet files.

    Downloads the train and validation parquet files, converts each message
    table into instruction/response pairs with ``process_oasst_data`` and wraps
    the result in a ``DatasetDict`` with ``train`` and ``validation`` splits.

    Args:
        max_samples: Optional cap passed to ``process_oasst_data`` for the
            training split. The validation split is capped at one tenth of
            this value (at least 1). ``None`` disables both caps.

    Returns:
        DatasetDict: The processed ``train`` and ``validation`` datasets.

    Raises:
        RuntimeError: If downloading or processing fails; the underlying error
            is attached as ``__cause__``.
    """
    print("🔄 Loading OpenAssistant dataset...")

    clear_hf_cache()

    try:
        # URLs for the parquet files
        train_url = "https://huggingface.co/datasets/OpenAssistant/oasst1/resolve/main/data/train-00000-of-00001-b42a775f407cee45.parquet"
        val_url = "https://huggingface.co/datasets/OpenAssistant/oasst1/resolve/main/data/validation-00000-of-00001-134b8fd0c89408b6.parquet"

        print("Downloading training data...")
        train_df = _download_parquet(train_url)

        print("Downloading validation data...")
        val_df = _download_parquet(val_url)

        print(f"✅ Raw training data: {len(train_df)} messages")
        print(f"✅ Raw validation data: {len(val_df)} messages")

        # max_samples // 10 is 0 for max_samples < 10, which process_oasst_data
        # treats as "no limit"; clamp to 1 so a small cap stays a cap.
        val_max_samples = max(1, max_samples // 10) if max_samples else None

        # Process the datasets to create conversation pairs
        train_conversations = process_oasst_data(train_df, max_samples)
        val_conversations = process_oasst_data(val_df, val_max_samples)

        # Convert to HuggingFace datasets
        dataset = DatasetDict({
            'train': Dataset.from_pandas(pd.DataFrame(train_conversations)),
            'validation': Dataset.from_pandas(pd.DataFrame(val_conversations)),
        })

        print(f"✅ Processed training conversations: {len(train_conversations)}")
        print(f"✅ Processed validation conversations: {len(val_conversations)}")

        return dataset

    except Exception as e:
        print(f"❌ Failed to load dataset: {e}")
        # Chain the original error so the real cause stays visible in tracebacks.
        raise RuntimeError(
            "Could not load dataset. Check your internet connection and try again."
        ) from e

def process_oasst_data(df, max_samples=None):
    """Turn an OpenAssistant message table into instruction-response pairs.

    The dataset has a tree structure where each message has a ``parent_id``
    (None for root messages) and a ``role`` of 'prompter' or 'assistant'. Every
    assistant message whose parent is a prompter message yields one pair.

    Args:
        df: DataFrame with the columns ``message_id``, ``parent_id``, ``text``,
            ``role``, ``lang``, ``deleted`` and ``review_result``.
        max_samples: Optional cap on the number of pairs. It is applied before
            the length filter, so fewer pairs than this may be returned.

    Returns:
        list[dict]: Dicts with the keys ``instruction``, ``response`` and
        ``conversation_id`` (``"<prompter id>_<assistant id>"``).
    """
    print("🔄 Processing OpenAssistant conversation trees...")

    # Filter for English messages only and approved messages. The explicit
    # comparisons are kept so non-boolean / missing values never pass.
    keep = (
        (df['lang'] == 'en') &
        (df['deleted'] == False) &
        (df['review_result'] == True)
    )
    df_filtered = df[keep]

    print(f"Filtered to {len(df_filtered)} quality English messages")

    # message_id -> (parent_id, role, text), in table order.
    messages = {}
    for message_id, parent_id, role, text in zip(
        df_filtered['message_id'],
        df_filtered['parent_id'],
        df_filtered['role'],
        df_filtered['text'],
    ):
        messages[message_id] = (parent_id, role, text)

    # Index assistant replies by their parent once, instead of rescanning every
    # message for every prompter (which was quadratic in the table size).
    replies_by_parent = {}
    for message_id, (parent_id, role, text) in messages.items():
        if role == 'assistant':
            replies_by_parent.setdefault(parent_id, []).append((message_id, text))

    conversations = []
    for message_id, (_, role, text) in messages.items():
        if role != 'prompter':
            continue
        for reply_id, reply_text in replies_by_parent.get(message_id, ()):
            conversations.append({
                'instruction': text.strip(),
                'response': reply_text.strip(),
                'conversation_id': f"{message_id}_{reply_id}"
            })

    print(f"Created {len(conversations)} instruction-response pairs")

    # Limit samples if specified
    if max_samples and max_samples < len(conversations):
        conversations = conversations[:max_samples]
        print(f"🔪 Limited to {max_samples} conversation pairs")

    # Filter out very short or very long conversations (lengths in characters)
    filtered_conversations = [
        conv for conv in conversations
        if 10 < len(conv['instruction']) < 2000 and 10 < len(conv['response']) < 2000
    ]

    print(f"After length filtering: {len(filtered_conversations)} conversations")

    return filtered_conversations

In [None]:
# Upper bound on the number of training pairs requested from the loader.
SUBSET_SIZE = 5000
dataset = load_openassistant_dataset(max_samples=SUBSET_SIZE)

# Keep direct handles on both splits; later cells refer to these names.
train_dataset = dataset['train']
val_dataset = dataset['validation']

print("\n📊 Dataset Information:")
print(f"Dataset keys: {list(dataset.keys())}")
print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")

In [None]:
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
print(f"Loading model and tokenizer: {MODEL_NAME}")

# Tokenizer: pad on the right and reuse the EOS token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

# 4-bit NF4 quantization with double quantization; computation runs in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Quantized base model, then prepared for k-bit (LoRA) training.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    quantization_config=bnb_config,
    trust_remote_code=True,
)
model = prepare_model_for_kbit_training(model)

In [None]:
def format_instruction(sample):
    """Render one sample as a system/user/assistant chat transcript.

    Args:
        sample: Mapping with the keys 'instruction' and 'response'.

    Returns:
        str: The system prompt, the instruction under ``<|user|>`` and the
        response under ``<|assistant|>``, separated by newlines.
    """
    template = (
        "<|system|>\n"
        "You are a helpful AI assistant. Provide helpful, accurate, and detailed responses to user questions.\n"
        "<|user|>\n"
        "{instruction}\n"
        "<|assistant|>\n"
        "{response}"
    )
    return template.format(
        instruction=sample['instruction'],
        response=sample['response'],
    )

def preprocess_function(examples):
    """Build the chat-formatted ``text`` column for a batch of samples.

    Args:
        examples: Batch mapping with parallel 'instruction' and 'response'
            sequences.

    Returns:
        dict: ``{"text": [...]}`` with one formatted string per sample.
    """
    responses = examples['response']
    formatted = [
        format_instruction({'instruction': instruction, 'response': responses[idx]})
        for idx, instruction in enumerate(examples['instruction'])
    ]
    return {"text": formatted}

In [None]:
# Replace the raw columns of both splits with the single "text" column
# produced by preprocess_function.
train_dataset = train_dataset.map(preprocess_function, batched=True,
                                remove_columns=train_dataset.column_names)
val_dataset = val_dataset.map(preprocess_function, batched=True,
                            remove_columns=val_dataset.column_names)


# Show the first 500 characters of one formatted sample as a sanity check.
print("Sample formatted text:")
print(train_dataset[0]['text'][:500] + "...")

In [None]:
# LoRA hyperparameters per trial, keyed by trial name. The keys must match
# those of training_configs. Only the uncommented trials are trained; the
# commented-out entries are kept as a record of the other configurations.
lora_configs = {
    # "balanced_trial_1": {
    #     "r": 8,                    # Increased from 4 to 8
    #     "lora_alpha": 32,          # Scaled proportionally (4x rank)
    #     "target_modules": ["q_proj", "v_proj"],  # Core attention modules
    #     "lora_dropout": 0.1,
    # },
    # "balanced_trial_2": {
    #     "r": 12,                   # Increased from 6 to 12
    #     "lora_alpha": 48,          # Scaled proportionally (4x rank)
    #     "target_modules": ["q_proj", "v_proj"],  # Keep to 2 modules for efficiency
    #     "lora_dropout": 0.1,
    # },
    # "balanced_trial_3": {
    #     "r": 6,                    # Increased from 2 to 6 (3x)
    #     "lora_alpha": 24,          # Scaled proportionally (4x rank)
    #     "target_modules": ["q_proj", "v_proj", "k_proj", "o_proj"],  # All attention
    #     "lora_dropout": 0.1,
    # },
    "balanced_trial_4": {
        "r": 16,                   # Increased from 12 to 16
        "lora_alpha": 32,          # Conservative scaling (2x rank)
        "target_modules": ["q_proj", "v_proj"],  # Only key modules
        "lora_dropout": 0.05,      # Keep lower dropout for high rank
    },
    # "balanced_trial_5": {
    #     "r": 10,                   # Increased from 8 to 10
    #     "lora_alpha": 32,          # Moderate scaling
    #     "target_modules": ["q_proj", "v_proj", "k_proj"],  # 3 modules
    #     "lora_dropout": 0.1,
    # },
}


In [None]:
# Optimisation hyperparameters per trial, keyed by the same trial names as
# lora_configs. Only the uncommented trials are trained.
training_configs = {
    # "balanced_trial_1": {
    #     "learning_rate": 2.5e-4,   # Slightly reduced for higher rank
    #     "per_device_train_batch_size": 4,  # Reduced for memory efficiency
    #     "num_train_epochs": 4,      # Increased from 3
    #     "gradient_accumulation_steps": 3,   # Increased to maintain effective batch size
    # },
    # "balanced_trial_2": {
    #     "learning_rate": 2e-4,      # Lower for highest rank trial
    #     "per_device_train_batch_size": 4,   # Consistent with trial 1
    #     "num_train_epochs": 4,      # Increased from 3
    #     "gradient_accumulation_steps": 3,   # Maintain effective batch size
    # },
    # "balanced_trial_3": {
    #     "learning_rate": 2.5e-4,    # Moderate rate for broad coverage
    #     "per_device_train_batch_size": 3,   # Lower due to 4 modules
    #     "num_train_epochs": 5,      # Longer training for broad coverage
    #     "gradient_accumulation_steps": 4,   # Higher accumulation to compensate
    # },
    "balanced_trial_4": {
        "learning_rate": 1.8e-4,    # Most conservative for highest rank
        "per_device_train_batch_size": 4,   # Maintain reasonable batch size
        "num_train_epochs": 5,      # Longer training for complex model
        "gradient_accumulation_steps": 3,   # Balanced accumulation
    },
    # "balanced_trial_5": {
    #     "learning_rate": 2.2e-4,    # Moderate rate
    #     "per_device_train_batch_size": 3,   # Lower for 3 modules
    #     "num_train_epochs": 6,      # Longest training for balanced approach
    #     "gradient_accumulation_steps": 4,   # Higher accumulation
    # },
}

In [None]:
def save_training_metrics(trainer, trial_name):
    """Dump the trainer's log history to ``training_metrics_<trial_name>.json``.

    Args:
        trainer: Object exposing ``state.log_history`` (list of dicts),
            ``state.global_step`` and ``state.epoch``.
        trial_name: Trial identifier used in the output file name.
    """
    state = trainer.state
    history = state.log_history

    eval_entries = []
    learning_rates = []
    for entry in history:
        if 'eval_loss' in entry:
            eval_entries.append(entry)
        # Entries without a learning rate are recorded as 0.
        learning_rates.append(entry.get('learning_rate', 0))

    training_logs = {
        "train_loss_history": history,  # the complete, unfiltered log history
        "eval_loss_history": eval_entries,
        "learning_rate_schedule": learning_rates,
        "global_steps": state.global_step,
        "total_epochs": state.epoch,
    }

    with open(f"training_metrics_{trial_name}.json", "w") as f:
        json.dump(training_logs, f, indent=2)

def _save_bar_chart(labels, values, *, figsize, title, xlabel, ylabel, filename,
                    highlight_max=False):
    """Draw one bar chart with pyplot, save it to ``filename`` and close it."""
    plt.figure(figsize=figsize)
    bars = plt.bar(labels, values)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.xticks(rotation=45)

    if highlight_max:
        # Colour the first bar holding the maximum value.
        bars[values.index(max(values))].set_color('gold')

    plt.tight_layout()
    plt.savefig(filename, dpi=300, bbox_inches='tight')
    plt.close()


def save_comparison_plots(results):
    """Save the BLEU and training-time comparison bar charts as PNG files.

    BLEU scores are read from the module-level ``evaluation_results`` dict;
    training times come from ``results``.

    Args:
        results: Iterable of dicts with the keys 'trial_name' and
            'training_time'.
    """
    labels = list(evaluation_results.keys())
    scores = [evaluation_results[label]['bleu_score'] for label in labels]
    _save_bar_chart(
        labels,
        scores,
        figsize=(12, 6),
        title='BLEU Score Comparison Across Models',
        xlabel='Model',
        ylabel='BLEU Score',
        filename='bleu_comparison.png',
        highlight_max=True,
    )

    durations = [entry['training_time'] for entry in results]
    trials = [entry['trial_name'] for entry in results]
    _save_bar_chart(
        trials,
        durations,
        figsize=(10, 6),
        title='Training Time Comparison',
        xlabel='Trial',
        ylabel='Training Time (seconds)',
        filename='training_time_comparison.png',
    )

def save_hyperparameter_analysis(results):
    """Relate each trial's hyperparameters to its BLEU score.

    Writes one row per evaluated trial to ``hyperparameter_analysis.csv`` and,
    when at least two trials are available, saves a correlation heatmap to
    ``hyperparameter_correlation.png``. Hyperparameters are read from the
    module-level ``lora_configs`` / ``training_configs`` and BLEU scores from
    ``evaluation_results``.

    Args:
        results: Iterable of dicts with the keys 'trial_name' and
            'training_time'.
    """
    # First recorded training time per trial.
    training_times = {}
    for entry in results:
        training_times.setdefault(entry['trial_name'], entry['training_time'])

    numeric_cols = ['bleu_score', 'lora_r', 'lora_alpha', 'learning_rate',
                    'batch_size', 'epochs', 'num_target_modules', 'training_time']

    analysis_data = []
    for trial_name, lora_config in lora_configs.items():
        if trial_name not in evaluation_results:
            continue
        training_config = training_configs[trial_name]
        analysis_data.append({
            'trial_name': trial_name,
            'bleu_score': evaluation_results[trial_name]['bleu_score'],
            'lora_r': lora_config['r'],
            'lora_alpha': lora_config['lora_alpha'],
            'learning_rate': training_config['learning_rate'],
            'batch_size': training_config['per_device_train_batch_size'],
            'epochs': training_config['num_train_epochs'],
            'num_target_modules': len(lora_config['target_modules']),
            # NaN (not the string "0") keeps the column numeric when a trial
            # has no recorded training time.
            'training_time': training_times.get(trial_name, float('nan')),
        })

    # Explicit columns so an empty result still yields a CSV with a header.
    df = pd.DataFrame(analysis_data, columns=['trial_name'] + numeric_cols)
    df.to_csv('hyperparameter_analysis.csv', index=False)

    if len(df) < 2:
        # A correlation needs at least two observations; with fewer, every
        # coefficient is NaN and the heatmap carries no information.
        print("Skipping hyperparameter correlation plot: need at least two evaluated trials.")
        return

    corr_matrix = df[numeric_cols].corr()

    plt.figure(figsize=(10, 8))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0)
    plt.title('Hyperparameter Correlation Matrix')
    plt.tight_layout()
    plt.savefig('hyperparameter_correlation.png', dpi=300, bbox_inches='tight')
    plt.close()

def save_response_analysis():
    """Write a per-prompt breakdown of every model's response to JSON.

    For each entry of the module-level ``evaluation_prompts`` this records the
    reference answer and, per model in ``evaluation_results``, the generated
    response, its length in whitespace-separated words and whether it contains
    a digit. The result is saved to ``detailed_response_analysis.json``.
    """
    # Imported here so the function does not depend on a later notebook cell
    # having imported ``re`` before it is called.
    import re

    digit_pattern = re.compile(r'\d+')
    response_analysis = {}

    for i, prompt in enumerate(evaluation_prompts):
        prompt_analysis = {
            'prompt': prompt,
            'reference_answer': reference_answers[i],
            'responses': {},
            'response_lengths': {},
            'contains_numbers': {}
        }

        for model_name, model_results in evaluation_results.items():
            response = model_results['predictions'][i]
            prompt_analysis['responses'][model_name] = response
            prompt_analysis['response_lengths'][model_name] = len(response.split())
            prompt_analysis['contains_numbers'][model_name] = bool(digit_pattern.search(response))

        response_analysis[f'prompt_{i+1}'] = prompt_analysis

    with open('detailed_response_analysis.json', 'w') as f:
        json.dump(response_analysis, f, indent=2)

def save_experiment_config():
    """Save the experiment configuration to ``experiment_config.json``.

    Collects the model name, dataset sizes, LoRA and training configurations,
    evaluation prompts and references, quantization settings and device
    information from module-level variables.
    """
    cuda_available = torch.cuda.is_available()
    config = {
        'timestamp': datetime.now().isoformat(),
        'model_name': MODEL_NAME,
        'dataset_info': {
            # The notebook trains on OpenAssistant oasst1; the previous value
            # named an unrelated dataset.
            'name': 'OpenAssistant/oasst1',
            'subset_size': SUBSET_SIZE,
            'train_size': len(train_dataset),
            'val_size': len(val_dataset)
        },
        'lora_configs': lora_configs,
        'training_configs': training_configs,
        'evaluation_prompts': evaluation_prompts,
        'reference_answers': reference_answers,
        # Mirrors the values passed to BitsAndBytesConfig when loading the model.
        'quantization_config': {
            'load_in_4bit': True,
            'bnb_4bit_quant_type': 'nf4',
            'bnb_4bit_compute_dtype': 'float16',
            'bnb_4bit_use_double_quant': True
        },
        'device_info': {
            'device': str(device),
            'cuda_available': cuda_available,
            'gpu_count': torch.cuda.device_count() if cuda_available else 0
        }
    }

    with open('experiment_config.json', 'w') as f:
        json.dump(config, f, indent=2)

def save_model_analysis():
    """Save on-disk size and a BLEU-per-MB score for each saved trial model.

    For every trial in the module-level ``lora_configs`` whose directory
    ``./models/<trial_name>`` exists, the total file size is measured and
    combined with the trial's BLEU score from ``evaluation_results``. The
    result is written to ``model_analysis.json``.
    """
    model_analysis = {}

    for trial_name, lora_config in lora_configs.items():
        model_path = f"./models/{trial_name}"
        if not os.path.exists(model_path):
            continue

        # Total size of every file below the model directory, in bytes.
        total_size = sum(
            os.path.getsize(os.path.join(root, file_name))
            for root, _dirs, files in os.walk(model_path)
            for file_name in files
        )
        model_size_mb = total_size / (1024 * 1024)
        bleu_score = evaluation_results.get(trial_name, {}).get('bleu_score', 0)

        model_analysis[trial_name] = {
            'model_size_mb': model_size_mb,
            # NOTE(review): r * 2 * number of target modules is a relative
            # proxy; it does not include the layer dimensions, so it is not
            # the actual trainable-parameter count.
            'lora_parameters': lora_config['r'] * 2 * len(lora_config['target_modules']),
            'target_modules': lora_config['target_modules'],
            # An empty model directory would otherwise divide by zero.
            'efficiency_score': bleu_score / model_size_mb if model_size_mb else 0.0
        }

    with open('model_analysis.json', 'w') as f:
        json.dump(model_analysis, f, indent=2)

def generate_summary_report():
    """Write a markdown summary of the experiment to ``experiment_report.md``.

    Reads the module-level ``MODEL_NAME``, ``SUBSET_SIZE``,
    ``evaluation_results`` (which must contain a 'base_model' entry),
    ``sorted_models``, ``best_model_name`` and ``best_model_results``.
    """
    baseline_bleu = evaluation_results['base_model']['bleu_score']

    # The dataset line names OpenAssistant oasst1, the data this notebook
    # actually loads (it previously named an unrelated dataset).
    report = f"""# Supervised Fine-Tuning Experiment Report

## Experiment Overview
- **Date**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
- **Base Model**: {MODEL_NAME}
- **Dataset**: OpenAssistant oasst1 (subset of {SUBSET_SIZE} samples)
- **Evaluation Metric**: BLEU Score

## Results Summary

### Best Performing Model
- **Model**: {best_model_name}
- **BLEU Score**: {best_model_results['bleu_score']:.4f}
- **Improvement over Base**: {best_model_results['bleu_score'] - baseline_bleu:+.4f}

### All Models Performance
| Model | BLEU Score | Improvement |
|-------|------------|-------------|
"""

    for model_name, model_results in sorted_models:
        improvement = model_results['bleu_score'] - baseline_bleu
        report += f"| {model_name} | {model_results['bleu_score']:.4f} | {improvement:+.4f} |\n"

    report += """
## Hyperparameter Analysis

### Best Configuration
"""
    # The base model entry carries no LoRA / training configuration.
    if best_model_name != 'base_model':
        report += f"""
**LoRA Configuration:**
- Rank (r): {best_model_results['lora_config']['r']}
- Alpha: {best_model_results['lora_config']['lora_alpha']}
- Target Modules: {best_model_results['lora_config']['target_modules']}
- Dropout: {best_model_results['lora_config']['lora_dropout']}

**Training Configuration:**
- Learning Rate: {best_model_results['training_config']['learning_rate']}
- Batch Size: {best_model_results['training_config']['per_device_train_batch_size']}
- Epochs: {best_model_results['training_config']['num_train_epochs']}
- Gradient Accumulation Steps: {best_model_results['training_config']['gradient_accumulation_steps']}
"""

    report += "\n## Files Generated\n"
    generated_files = [
        "training_results_*.json - Individual trial results",
        "all_training_results.json - Combined training results",
        "bleu_evaluation_results.json - BLEU evaluation results",
        "best_sft_model_info.json - Best model info for DPO",
        "experiment_config.json - Complete experiment configuration",
        "hyperparameter_analysis.csv - Hyperparameter analysis data",
        "detailed_response_analysis.json - Detailed response analysis",
        "model_analysis.json - Model size and parameter analysis",
        "*.png - Visualization plots",
        "./models/*/ - Trained model checkpoints"
    ]
    report += "".join(f"- {file_desc}\n" for file_desc in generated_files)

    with open('experiment_report.md', 'w') as f:
        f.write(report)

In [None]:
# Set WANDB_DISABLED before any trainer is created.
# NOTE(review): presumably this turns off Weights & Biases reporting in the
# Hugging Face trainer - confirm against the installed transformers version.
os.environ["WANDB_DISABLED"] = "true"

def train_model(trial_name, lora_config, training_config):
    """Fine-tune a fresh 4-bit copy of the base model with LoRA for one trial.

    Loads ``MODEL_NAME`` with the module-level ``bnb_config``, attaches a LoRA
    adapter, trains with ``SFTTrainer`` on the module-level ``train_dataset`` /
    ``val_dataset``, saves the model to ``./models/<trial_name>`` and writes
    the training metrics via ``save_training_metrics``.

    Args:
        trial_name: Identifier used for the output directories and log lines.
        lora_config: Dict with the keys "r", "lora_alpha", "target_modules"
            and "lora_dropout".
        training_config: Dict with the keys "learning_rate",
            "per_device_train_batch_size", "num_train_epochs" and
            "gradient_accumulation_steps".

    Returns:
        dict: ``trial_name``, ``training_time`` (wall-clock seconds, including
        model loading and saving), ``lora_config`` and ``training_config``.
    """
    import gc  # only needed for the cleanup in the ``finally`` block

    print(f"\n{'='*60}")
    print(f"Training {trial_name}")
    print(f"{'='*60}")

    start_time = time.time()

    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=lora_config["r"],
        lora_alpha=lora_config["lora_alpha"],
        target_modules=lora_config["target_modules"],
        lora_dropout=lora_config["lora_dropout"],
        bias="none",
    )

    model_copy = None
    trainer = None
    try:
        model_copy = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            quantization_config=bnb_config,
            device_map="auto",
            trust_remote_code=True,
        )
        model_copy = prepare_model_for_kbit_training(model_copy)
        model_copy = get_peft_model(model_copy, peft_config)

        model_copy.print_trainable_parameters()

        training_args = TrainingArguments(
            output_dir=f"./results/{trial_name}",
            learning_rate=training_config["learning_rate"],
            per_device_train_batch_size=training_config["per_device_train_batch_size"],
            per_device_eval_batch_size=5,
            num_train_epochs=training_config["num_train_epochs"],
            gradient_accumulation_steps=training_config["gradient_accumulation_steps"],
            eval_strategy="steps",
            eval_steps=100,
            logging_steps=100,
            fp16=True,
        )

        # NOTE(review): the model is already wrapped by get_peft_model above and
        # peft_config is passed again here - confirm how the installed TRL
        # version treats an already-wrapped model.
        trainer = SFTTrainer(
            model=model_copy,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            peft_config=peft_config,
            formatting_func=lambda x: x["text"],
            args=training_args,
        )

        trainer.train()

        trainer.save_model(f"./models/{trial_name}")

        training_time = time.time() - start_time
        save_training_metrics(trainer, trial_name)

        print(f"{trial_name} completed in {training_time:.2f} seconds")
    finally:
        # Drop the references and release cached GPU memory even when training
        # fails: the caller catches the error and starts the next trial, which
        # would otherwise begin with this trial's memory still allocated.
        del model_copy
        del trainer
        gc.collect()
        torch.cuda.empty_cache()

    return {
        "trial_name": trial_name,
        "training_time": training_time,
        "lora_config": lora_config,
        "training_config": training_config
    }

In [None]:
# Train every configured trial; a failing trial is reported and skipped so the
# remaining trials still run.
results = []

for trial_name in lora_configs:
    try:
        result = train_model(
            trial_name=trial_name,
            lora_config=lora_configs[trial_name],
            training_config=training_configs[trial_name],
        )
        results.append(result)

        # Persist each trial's summary as soon as it finishes.
        with open(f"training_results_{trial_name}.json", "w") as f:
            json.dump(result, f, indent=2)
    except Exception as e:
        print(f"Error in {trial_name}: {str(e)}")

In [None]:
# Banner: blank line, rule, title, rule.
print("\n" + "="*60, "ALL TRIALS COMPLETED", "="*60, sep="\n")

# Combined summary of every trial that finished.
with open("all_training_results.json", "w") as f:
    json.dump(results, f, indent=2)

In [None]:
# Fixed prompts used to evaluate every model; index-aligned with
# reference_answers.
evaluation_prompts = [
    "What are the main differences between Python and JavaScript programming languages?",
    "How can I improve my time management skills while working from home?",
    "Explain the concept of climate change and its main causes.",
    "What are some effective strategies for learning a new language?",
    "How does the human immune system work to protect our body from diseases?",
    "What are the key principles of good user interface design?",
    "Explain the difference between artificial intelligence and machine learning.",
    "What are some healthy meal prep ideas for busy professionals?",
    "How can I overcome procrastination and become more productive?",
    "What are the environmental benefits of renewable energy sources?"
]

# Reference answers for BLEU scoring; entry i is the reference for
# evaluation_prompts[i].
reference_answers = [
    "Python and JavaScript are both popular programming languages but serve different purposes. Python is primarily used for backend development, data science, and automation, with a clean, readable syntax. JavaScript is mainly used for web development, both frontend and backend (Node.js), and has more complex syntax. Python is interpreted and dynamically typed, while JavaScript runs in browsers and servers. Python has extensive libraries for scientific computing, while JavaScript excels in web interactivity and real-time applications.",

    "To improve time management while working from home: 1) Create a dedicated workspace and stick to regular hours, 2) Use time-blocking techniques to schedule specific tasks, 3) Eliminate distractions by turning off notifications during focused work, 4) Take regular breaks using techniques like the Pomodoro method, 5) Set clear boundaries between work and personal time, 6) Prioritize tasks using methods like the Eisenhower Matrix, and 7) Use productivity tools and apps to track your time and progress.",

    "Climate change refers to long-term shifts in global temperatures and weather patterns. Main causes include: 1) Greenhouse gas emissions from burning fossil fuels (coal, oil, gas), 2) Deforestation reducing CO2 absorption, 3) Industrial processes releasing methane and other gases, 4) Agriculture contributing to methane emissions, and 5) Transportation burning fossil fuels. These activities trap heat in Earth's atmosphere, leading to rising temperatures, melting ice caps, sea level rise, and extreme weather events.",

    "Effective language learning strategies include: 1) Immerse yourself through movies, music, and books in the target language, 2) Practice speaking regularly with native speakers or language exchange partners, 3) Use spaced repetition systems for vocabulary building, 4) Set specific, achievable daily goals, 5) Focus on commonly used words and phrases first, 6) Practice writing in the language regularly, 7) Use language learning apps and tools consistently, and 8) Don't fear making mistakes as they're part of the learning process.",

    "The human immune system is a complex network that protects against diseases through multiple layers: 1) Physical barriers like skin and mucous membranes, 2) Innate immunity providing immediate, general responses through white blood cells, 3) Adaptive immunity creating specific responses through T-cells and B-cells, 4) Antibodies that remember and quickly respond to previously encountered pathogens, 5) Lymph nodes filtering harmful substances, and 6) Bone marrow producing immune cells. The system coordinates to identify, attack, and remember threats.",

    "Key principles of good UI design include: 1) Simplicity - keep interfaces clean and uncluttered, 2) Consistency - use uniform elements and patterns throughout, 3) Visibility - make important functions easily discoverable, 4) Feedback - provide clear responses to user actions, 5) Error prevention and recovery - help users avoid mistakes and fix them easily, 6) Accessibility - ensure usability for people with different abilities, 7) User control - let users feel in control of their interactions, and 8) Recognition over recall - make options visible rather than requiring memory.",

    "Artificial Intelligence (AI) is the broader concept of machines performing tasks that typically require human intelligence, including reasoning, learning, and problem-solving. Machine Learning (ML) is a subset of AI that focuses specifically on algorithms that can learn and improve from data without being explicitly programmed for every scenario. While AI encompasses rule-based systems, expert systems, and robotics, ML specifically uses statistical techniques to enable computers to improve performance on tasks through experience. All machine learning is AI, but not all AI is machine learning.",

    "Healthy meal prep ideas for busy professionals: 1) Batch cook proteins like grilled chicken, baked salmon, or roasted tofu, 2) Prepare grain bowls with quinoa, brown rice, and vegetables, 3) Make overnight oats with fruits and nuts for breakfast, 4) Pre-cut vegetables and store in containers for easy access, 5) Prepare soups and stews that can be frozen in portions, 6) Make energy balls or healthy snacks in advance, 7) Use sheet pan meals for easy cooking and cleanup, and 8) Invest in quality containers for proper storage and portion control.",

    "To overcome procrastination and boost productivity: 1) Break large tasks into smaller, manageable steps, 2) Use the 'two-minute rule' - if it takes less than two minutes, do it now, 3) Eliminate decision fatigue by planning your day the night before, 4) Create accountability by sharing goals with others, 5) Remove distractions from your environment, 6) Use positive self-talk and focus on progress rather than perfection, 7) Reward yourself for completing tasks, and 8) Address underlying causes like fear of failure or perfectionism through self-reflection.",

    "Environmental benefits of renewable energy include: 1) Significant reduction in greenhouse gas emissions compared to fossil fuels, 2) Improved air quality by eliminating pollutants that cause smog and respiratory problems, 3) Reduced water usage as most renewables require little to no water for operation, 4) Decreased environmental degradation from mining and drilling operations, 5) Protection of ecosystems and wildlife habitats, 6) Reduced acid rain from sulfur dioxide emissions, 7) Lower risk of environmental disasters like oil spills, and 8) Sustainable energy production that doesn't deplete natural resources."
]


# List the evaluation prompts, numbered from 1.
print("Evaluation prompts prepared:")
for i, prompt in enumerate(evaluation_prompts, 1):
    print(f"{i}. {prompt}")

In [None]:
from sacrebleu import corpus_bleu
import re
import evaluate

# BLEU metric loaded through the `evaluate` library; calculate_bleu_score
# calls its compute() method.
bleu_metric = evaluate.load("bleu")

In [None]:
def evaluate_base_model():
    """Generate a response from the untuned base model for every prompt.

    Loads ``MODEL_NAME`` in float16, samples one completion per entry of the
    module-level ``evaluation_prompts`` and prints it next to the matching
    reference answer.

    Returns:
        list[str]: One generated assistant response per evaluation prompt.
    """
    print("Evaluating BASE MODEL")

    base_model = None
    try:
        base_model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            device_map="auto",
            torch_dtype=torch.float16,
        )

        predictions = []

        for i, prompt in enumerate(evaluation_prompts, 1):
            print(f"\n{'-'*50}")
            print(f"BASE MODEL - PROMPT {i}: {prompt}")
            print(f"{'-'*50}")

            formatted_prompt = f"""<|system|>
You are a helpful AI assistant. Provide helpful, accurate, and detailed responses to user questions.
<|user|>
{prompt}
<|assistant|>
"""

            # Place the inputs on the model's own device rather than the
            # global default; device_map="auto" decides where the model lives.
            inputs = tokenizer(formatted_prompt, return_tensors="pt").to(base_model.device)
            prompt_length = inputs["input_ids"].shape[1]

            with torch.no_grad():
                outputs = base_model.generate(
                    **inputs,
                    max_new_tokens=256,
                    temperature=0.7,
                    do_sample=True,
                    pad_token_id=tokenizer.eos_token_id
                )

            # Decode only the newly generated tokens, then cut at the first
            # role marker: if the model goes on to invent further turns,
            # splitting the whole text on "<|assistant|>" and taking the last
            # piece would return the wrong segment.
            response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
            for marker in ("<|user|>", "<|system|>", "<|assistant|>"):
                response = response.split(marker)[0]
            assistant_response = response.strip()

            print(f"GENERATED RESPONSE:")
            print(assistant_response)

            print(f"\nREFERENCE ANSWER:")
            print(reference_answers[i-1])

            predictions.append(assistant_response)

        return predictions
    finally:
        # Clean up, also when loading or generation fails.
        del base_model
        torch.cuda.empty_cache()

In [None]:
def evaluate_sft_model(model_path):
    """Generate a response from a fine-tuned model for every prompt.

    Args:
        model_path: Directory of the saved model; its last path component is
            used as the display name.

    Returns:
        list[str]: One generated assistant response per entry of the
        module-level ``evaluation_prompts``. If loading or generation fails,
        the error is printed and a list of "Error" placeholders of the same
        length is returned instead.
    """
    model_name = model_path.split('/')[-1]
    print(f"Evaluating model: {model_name}")

    model = None
    try:
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            device_map="auto",
            torch_dtype=torch.float16,
        )

        predictions = []

        for i, prompt in enumerate(evaluation_prompts, 1):
            print(f"\n{'-'*50}")
            print(f"{model_name.upper()} - PROMPT {i}: {prompt}")
            print(f"{'-'*50}")

            formatted_prompt = f"""<|system|>
You are a helpful AI assistant. Provide helpful, accurate, and detailed responses to user questions.
<|user|>
{prompt}
<|assistant|>
"""

            # Place the inputs on the model's own device rather than the
            # global default; device_map="auto" decides where the model lives.
            inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)
            prompt_length = inputs["input_ids"].shape[1]

            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=256,
                    temperature=0.7,
                    do_sample=True,
                    pad_token_id=tokenizer.eos_token_id
                )

            # Decode only the newly generated tokens, then cut at the first
            # role marker: if the model goes on to invent further turns,
            # splitting the whole text on "<|assistant|>" and taking the last
            # piece would return the wrong segment.
            response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
            for marker in ("<|user|>", "<|system|>", "<|assistant|>"):
                response = response.split(marker)[0]
            assistant_response = response.strip()

            print(f"GENERATED RESPONSE:")
            print(assistant_response)

            print(f"\nREFERENCE ANSWER:")
            print(reference_answers[i-1])

            predictions.append(assistant_response)

        return predictions

    except Exception as e:
        # Best effort: one broken checkpoint must not stop the remaining
        # evaluations, so report it and return placeholders.
        print(f"Error evaluating model {model_path}: {str(e)}")
        return ["Error"] * len(evaluation_prompts)

    finally:
        # Release the model on the success path and on the error path.
        del model
        torch.cuda.empty_cache()

In [None]:
def calculate_bleu_score(predictions, references):
    """Return the corpus BLEU of ``predictions`` against ``references``.

    Each reference is wrapped in its own single-element list and the pair is
    handed to the module-level ``bleu_metric``; the value stored under the
    ``'bleu'`` key of its result is returned.

    Args:
        predictions: Generated answers, one string per prompt.
        references: Reference answers aligned with ``predictions`` (exactly
            one reference per prediction).

    Returns:
        The metric's ``'bleu'`` value, or ``0.0`` when there are no
        predictions or every prediction is blank.
    """
    # With no hypothesis text there is nothing to score.
    # NOTE(review): the Hugging Face `bleu` metric appears to divide by the
    # hypothesis length in its brevity penalty, so an all-empty batch would
    # raise instead of scoring 0 - confirm against the installed version.
    if not any(prediction.strip() for prediction in predictions):
        return 0.0

    references_formatted = [[ref] for ref in references]
    bleu_result = bleu_metric.compute(predictions=predictions, references=references_formatted)
    return bleu_result['bleu']

In [None]:
# Baseline run: score the base model and store the result under the
# 'base_model' key so later cells can compare fine-tuned trials against it.
print("\n" + "-" * 40, "EVALUATING BASE MODEL", "-" * 40, sep="\n")

base_predictions = evaluate_base_model()
base_bleu = calculate_bleu_score(base_predictions, reference_answers)
evaluation_results['base_model'] = dict(
    bleu_score=base_bleu,
    predictions=base_predictions,
)

print(f"Base Model BLEU Score: {base_bleu:.4f}")

In [None]:
# Score every LoRA trial whose model path exists and record the result
# together with the hyperparameters that produced it.
trial_names = list(lora_configs)
for trial_name in trial_names:
    model_path = f"./models/{trial_name}"

    # Guard clause: nothing to evaluate when the trial left no saved model.
    if not os.path.exists(model_path):
        print(f"Model path {model_path} not found, skipping {trial_name}")
        continue

    print("\n" + "-" * 40, f"EVALUATING {trial_name.upper()}", "-" * 40, sep="\n")

    sft_predictions = evaluate_sft_model(model_path)
    sft_bleu = calculate_bleu_score(sft_predictions, reference_answers)

    evaluation_results[trial_name] = dict(
        bleu_score=sft_bleu,
        predictions=sft_predictions,
        lora_config=lora_configs[trial_name],
        training_config=training_configs[trial_name],
    )

    print(f"{trial_name} BLEU Score: {sft_bleu:.4f}")
    # Signed delta against the base-model BLEU computed earlier.
    improvement = sft_bleu - base_bleu
    print(f"Improvement over base: {improvement:+.4f}")


In [None]:
# Leaderboard: every evaluated model ordered by BLEU, highest first.
print("\n" + "=" * 60, "BLEU SCORE COMPARISON RESULTS", "=" * 60, sep="\n")

sorted_models = sorted(
    evaluation_results.items(),
    key=lambda entry: entry[1]['bleu_score'],
    reverse=True,
)

print(f"{'Model':<15} {'BLEU Score':<12} {'Improvement':<12}")
print("-" * 40)

for model_name, results in sorted_models:
    bleu_score = results['bleu_score']
    # The delta is always measured against the stored base-model score, so
    # the base model's own row shows +0.0000.
    improvement = bleu_score - evaluation_results['base_model']['bleu_score']
    print(f"{model_name:<15} {bleu_score:<12.4f} {improvement:+12.4f}")


In [None]:
def _preview_for_table_cell(response):
    """Return ``response`` cut to 100 characters and made safe for a Markdown table cell."""
    preview = response[:100] + "..." if len(response) > 100 else response
    # Newlines and unescaped pipes would break the table row.
    return preview.replace('\n', ' ').replace('|', '\\|')


def _render_comparison_markdown():
    """Return the readable report: ranking table, then every model's answer per prompt."""
    parts = [
        "# Model Responses Comparison for Manual Evaluation\n\n",
        f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n",
        "---\n\n",
        "## Summary Table\n\n",
        "| Model | BLEU Score | Ranking |\n",
        "|-------|------------|----------|\n",
    ]
    parts.extend(
        f"| {model_name} | {results['bleu_score']:.4f} | #{rank} |\n"
        for rank, (model_name, results) in enumerate(sorted_models, 1)
    )
    parts.append("\n---\n\n")

    for i, prompt in enumerate(evaluation_prompts):
        parts.append(f"## Prompt {i+1}\n\n")
        parts.append(f"**Question:** {prompt}\n\n")

        # Reference answer
        parts.append("### Reference Answer\n")
        parts.append(f"{reference_answers[i]}\n\n")

        # Base model response always comes first, regardless of its rank.
        parts.append("### Base Model Response\n")
        parts.append(f"{evaluation_results['base_model']['predictions'][i]}\n\n")

        # Fine-tuned model responses, in ranking order.
        for model_name, results in sorted_models:
            if model_name != 'base_model':
                parts.append(f"### {model_name} Response (BLEU: {results['bleu_score']:.4f})\n")
                parts.append(f"{results['predictions'][i]}\n\n")

        parts.append("---\n\n")

    return "".join(parts)


def _collect_comparison_rows():
    """Return one dict per prompt holding the reference plus each model's answer and BLEU."""
    rows = []
    for i, prompt in enumerate(evaluation_prompts):
        row = {
            'prompt_id': i + 1,
            'prompt': prompt,
            'reference_answer': reference_answers[i]
        }

        for model_name, results in evaluation_results.items():
            row[f'{model_name}_response'] = results['predictions'][i]
            # BLEU is a per-model value, so it repeats on every row.
            row[f'{model_name}_bleu'] = results['bleu_score']

        rows.append(row)
    return rows


def _render_manual_scoring_markdown():
    """Return the scoring template: rubric, then a fill-in table and full answers per prompt."""
    parts = [
        "# Manual Scoring Template\n\n",
        "Rate each response on a scale of 1-5 for:\n",
        "- Accuracy (A): How factually correct is the response?\n",
        "- Completeness (C): How well does it answer the full question?\n",
        "- Clarity (Cl): How clear and well-structured is the response?\n",
        "- Relevance (R): How relevant is the response to the question?\n\n",
        "Format: A=X, C=X, Cl=X, R=X, Overall=X\n\n",
        "---\n\n",
    ]

    for i, prompt in enumerate(evaluation_prompts):
        parts.append(f"## Prompt {i+1}: {prompt}\n\n")

        parts.append("| Model | Response | Your Scores |\n")
        parts.append("|-------|----------|-------------|\n")
        for model_name, results in sorted_models:
            truncated_response = _preview_for_table_cell(results['predictions'][i])
            parts.append(f"| {model_name} | {truncated_response} | A=_, C=_, Cl=_, R=_, Overall=_ |\n")

        parts.append("\n### Full Responses:\n\n")
        parts.extend(
            f"**{model_name}:**\n{results['predictions'][i]}\n\n"
            for model_name, results in sorted_models
        )

        parts.append("---\n\n")

    return "".join(parts)


def save_model_responses_comparison():
    """Write the three artifacts used for manual comparison of model answers.

    Reads the module-level ``evaluation_prompts``, ``reference_answers``,
    ``evaluation_results`` and ``sorted_models`` and writes, into the current
    working directory:

    * ``model_responses_comparison.md``  - ranking table plus all answers,
    * ``model_responses_comparison.csv`` - one row per prompt,
    * ``manual_scoring_template.md``     - rubric and fill-in tables.

    Each document is fully built before its file is opened, so a failure
    while rendering never leaves a truncated file behind.

    Returns:
        None
    """
    print("Creating model responses comparison file...")

    comparison_content = _render_comparison_markdown()
    with open("model_responses_comparison.md", "w", encoding="utf-8") as f:
        f.write(comparison_content)

    df_responses = pd.DataFrame(_collect_comparison_rows())
    df_responses.to_csv("model_responses_comparison.csv", index=False)

    manual_scoring_content = _render_manual_scoring_markdown()
    with open("manual_scoring_template.md", "w", encoding="utf-8") as f:
        f.write(manual_scoring_content)

    print("✓ Created model_responses_comparison.md - Full readable comparison")
    print("✓ Created model_responses_comparison.csv - Spreadsheet format for analysis")
    print("✓ Created manual_scoring_template.md - Template for your manual evaluation")


In [None]:
# The leaderboard is sorted best-first, so its first entry is the winner.
best_model_name, best_model_results = sorted_models[0]

print("\n" + "=" * 60, "BEST PERFORMING MODEL", "=" * 60, sep="\n")
print(f"Best Model: {best_model_name}")
print(f"BLEU Score: {best_model_results['bleu_score']:.4f}")

# Configuration is only printed for fine-tuned trials, not for the base model.
if best_model_name != 'base_model':
    for section_title, section_key in (
        ("LoRA Configuration:", 'lora_config'),
        ("Training Configuration:", 'training_config'),
    ):
        print(section_title)
        for key, value in best_model_results[section_key].items():
            print(f"  - {key}: {value}")

In [None]:
print("\n" + "=" * 60, "SAMPLE PREDICTIONS COMPARISON", "=" * 60, sep="\n")

# Show at most the first three prompts: reference, base and best model.
for i in range(min(3, len(evaluation_prompts))):
    print("\n" + "=" * 80)
    print(f"PROMPT {i+1}: {evaluation_prompts[i]}")
    print("=" * 80)

    print("\nREFERENCE ANSWER:")
    print(reference_answers[i])

    print("\nBASE MODEL PREDICTION:")
    print(evaluation_results['base_model']['predictions'][i])

    print(f"\nBEST MODEL ({best_model_name}) PREDICTION:")
    # When the base model itself ranks first, repeating its text adds nothing.
    if best_model_name == 'base_model':
        print("Same as base model")
    else:
        print(best_model_results['predictions'][i])


In [None]:
# Persist the complete evaluation record (scores, predictions, configs).
with open("bleu_evaluation_results.json", "w") as f:
    json.dump(evaluation_results, f, indent=2)

# Compact description of the winner. The base model is referenced by
# MODEL_NAME, a fine-tuned trial by its ./models/ path; the base-model entry
# has no config keys, hence .get().
best_model_info = dict(
    best_model_name=best_model_name,
    best_model_path=(
        MODEL_NAME if best_model_name == 'base_model' else f"./models/{best_model_name}"
    ),
    bleu_score=best_model_results['bleu_score'],
    lora_config=best_model_results.get('lora_config'),
    training_config=best_model_results.get('training_config'),
)

with open("best_sft_model_info.json", "w") as f:
    json.dump(best_model_info, f, indent=2)

print("\n" + "=" * 60, "EVALUATION COMPLETED", "=" * 60, sep="\n")
print(f"Best model for DPO: {best_model_name}")
print(f"Best model path: {best_model_info['best_model_path']}")
print(
    "Files saved:",
    "- bleu_evaluation_results.json: Detailed BLEU evaluation results",
    "- best_sft_model_info.json: Best model information for DPO phase",
    sep="\n",
)


In [None]:
print("\nGenerating additional analysis and visualizations...")

# Ordered list of (step, confirmation message). Each step is wrapped in a
# lambda so its helper is only looked up when the step actually runs; the
# confirmation is printed only after the step has returned.
analysis_steps = (
    (lambda: save_hyperparameter_analysis(training_results), "✓ Saved hyperparameter analysis"),
    (lambda: save_response_analysis(), "✓ Saved detailed response analysis"),
    (lambda: save_model_analysis(), "✓ Saved model analysis"),
    (lambda: generate_summary_report(), "✓ Generated comprehensive report"),
    (lambda: save_comparison_plots(training_results), "✓ Saved comparison plots"),
    (lambda: save_model_responses_comparison(), "✓ Saved model responses comparison files"),
)

for run_step, done_message in analysis_steps:
    run_step()
    print(done_message)

In [None]:
import os
import shutil
import tempfile

# Zip the whole working directory into ./output_backup.zip.
#
# The archive is built in a temporary directory and moved into place
# afterwards: creating it directly inside the directory being walked makes
# the half-written zip show up in the file listing, so it gets added to
# itself.
final_archive = os.path.abspath('output_backup.zip')

# A backup left over from a previous run would otherwise be packed into the
# new one. Building in place overwrote it anyway, so removing it keeps the
# end result the same.
if os.path.exists(final_archive):
    os.remove(final_archive)

with tempfile.TemporaryDirectory() as staging_dir:
    staged_archive = shutil.make_archive(
        os.path.join(staging_dir, 'output_backup'), 'zip', '.'
    )
    shutil.move(staged_archive, final_archive)