In [1]:
%%capture
# 📦 Install required packages
!pip install transformers==4.36.0
!pip install torch
!pip install datasets
!pip install accelerate
!pip install bitsandbytes
!pip install peft
!pip install trl
!pip install wandb
!pip install scipy

In [2]:
%%capture
!pip install --upgrade transformers

In [3]:
import torch
import numpy as np
import json
import os
from datetime import datetime
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig
)
from datasets import Dataset, load_dataset
from peft import (
    LoraConfig,
    get_peft_model,
    TaskType,
    PeftModel
)
from trl import DPOTrainer  # CHANGED: Removed PPO imports, added DPO
import wandb

# 🔧 A100-Optimized Configuration for DPO
class DPOConfig:  # CHANGED: Renamed from RLHFConfig
    def __init__(self):
        # Model paths - UPDATED to match new directory structure
        self.base_model = "meta-llama/Meta-Llama-3.1-8B-Instruct"
        self.fine_tuned_model_path = "/content/drive/MyDrive/llama_31_therapist_outputs/llama31_merged_v104952"
        self.dataset_name = "Psychotherapy-LLM/PsychoCounsel-Preference"

        # A100 Optimized Training parameters for DPO
        self.batch_size = 1
        self.learning_rate = 1.4e-5
        self.max_length = 512
        self.max_prompt_length = 256
        self.gradient_accumulation_steps = 8
        self.num_train_epochs = 3
        self.save_steps = 50  # More frequent saves
        self.logging_steps = 5  # More frequent logging
        self.beta = 0.1  # ADDED: DPO beta parameter

        # Output directory - UPDATED for DPO
        self.output_dir = "/content/drive/MyDrive/llama_31_therapist_outputs/dpo_therapy_model"

        # Device
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Weights & Biases logging
        self.use_wandb = True

config = DPOConfig()  # CHANGED: Updated class name

from google.colab import drive
print("Attempting to remount Google Drive...")
drive.mount('/content/drive', force_remount=True) # force_remount can help with stale mounts
print("Google Drive remount initiated. Please check for verification prompt if any.")

print("🚀 Starting A100-Optimized DPO Training Setup...")  # CHANGED: Updated message
print(f"📱 Device: {config.device}")
print(f"🧠 GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
print(f"🔥 GPU Name: {torch.cuda.get_device_properties(0).name}")

# Verify the fine-tuned model path exists
if os.path.exists(config.fine_tuned_model_path):
    print(f"✅ Fine-tuned model found at: {config.fine_tuned_model_path}")
    # Check for model files
    model_files = [f for f in os.listdir(config.fine_tuned_model_path) if f.endswith(('.safetensors', '.bin'))]
    print(f"📁 Model files found: {len(model_files)}")
else:
    print(f"❌ Fine-tuned model NOT found at: {config.fine_tuned_model_path}")
    print("Please check the path or complete the initial fine-tuning first.")

Attempting to remount Google Drive...
Mounted at /content/drive
Google Drive remount initiated. Please check for verification prompt if any.
🚀 Starting A100-Optimized DPO Training Setup...
📱 Device: cuda
🧠 GPU Memory: 42.5 GB
🔥 GPU Name: NVIDIA A100-SXM4-40GB
✅ Fine-tuned model found at: /content/drive/MyDrive/llama_31_therapist_outputs/llama31_merged_v104952
📁 Model files found: 4


In [4]:
def load_preference_dataset():
    """Load and preprocess the preference dataset for DPO"""
    print("📚 Loading PsychoCounsel preference dataset for DPO...")

    try:
        dataset = load_dataset(config.dataset_name)

        # Check if train split exists, otherwise use available split
        if 'train' in dataset:
            train_dataset = dataset['train']
        else:
            # Use the first available split
            available_splits = list(dataset.keys())
            print(f"⚠️ No 'train' split found. Available splits: {available_splits}")
            train_dataset = dataset[available_splits[0]]
            print(f"📊 Using '{available_splits[0]}' split instead")

        print(f"📊 Dataset columns: {train_dataset.column_names}")
        print(f"📊 Dataset size: {len(train_dataset)}")

        # CHANGED: DPO needs prompt, chosen, rejected - keep all three
        def format_data_for_dpo(examples):
            return {
                'prompt': examples['question'],    # The question/prompt
                'chosen': examples['chosen'],      # Preferred response
                'rejected': examples['rejected']   # Less preferred response
            }

        formatted_dataset = train_dataset.map(
            format_data_for_dpo,
            batched=True,
            # CHANGED: Don't remove columns, DPO needs chosen/rejected
        )

        print(f"✅ Dataset loaded and formatted for DPO: {len(formatted_dataset)} examples")

        # ADDED: Randomly sample 3500 examples for faster training
        if len(formatted_dataset) > 3500:
            formatted_dataset = formatted_dataset.shuffle(seed=42).select(range(3500))
            print(f"🚀 Using random subset of {len(formatted_dataset)} examples for faster training")
        else:
            print(f"🚀 Using full dataset of {len(formatted_dataset)} examples")

        # Show a sample to verify format
        if len(formatted_dataset) > 0:
            print(f"📝 Sample prompt: {formatted_dataset[0]['prompt'][:100]}...")
            print(f"📝 Sample chosen: {formatted_dataset[0]['chosen'][:100]}...")
            print(f"📝 Sample rejected: {formatted_dataset[0]['rejected'][:100]}...")

        return formatted_dataset

    except Exception as e:
        print(f"❌ Error loading dataset: {e}")
        print(f"🔍 Dataset name attempted: {config.dataset_name}")

        # CHANGED: Fallback data for DPO (create sample chosen/rejected pairs)
        print("🔄 Creating sample DPO dataset...")
        sample_data = []

        sample_prompts = [
            "I've been feeling really anxious lately and can't seem to calm down. What should I do?",
            "I'm having trouble sleeping and my mind keeps racing at night.",
            "I feel like I'm not good enough and constantly compare myself to others.",
            "I'm going through a difficult breakup and feel lost.",
            "I'm struggling with work-life balance and feel burned out.",
            "How can I manage my panic attacks better?",
            "I feel disconnected from my friends and family.",
            "I'm dealing with imposter syndrome at work.",
            "I can't seem to motivate myself to do anything.",
            "I'm having intrusive thoughts that worry me.",
        ]

        # Create chosen/rejected pairs for each prompt
        for prompt in sample_prompts:
            sample_data.append({
                'prompt': prompt,
                'chosen': f"I understand you're going through a difficult time with {prompt.split()[4:8]}. It's completely normal to feel this way, and I want you to know that you're not alone. Let's work together to explore some strategies that might help you feel better. Can you tell me more about when these feelings are strongest?",
                'rejected': f"That sounds tough. You should probably just try to relax and not think about it too much. Maybe try some deep breathing or something."
            })

        # Extend for more training data
        extended_data = sample_data * 5  # 50 examples total
        sample_dataset = Dataset.from_list(extended_data)

        print(f"✅ Sample DPO dataset created: {len(sample_dataset)} examples")
        return sample_dataset

In [5]:
def load_fine_tuned_model():
    """Load your fine-tuned therapy model - A100 optimized for DPO"""
    print("🦙 Loading fine-tuned Llama 3.1 therapy model for DPO (A100 optimized)...")

    # A100-optimized quantization config (less aggressive for better quality)
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,  # Use bfloat16 for A100
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4"
    )

    # quantization_config = None  # No quantization for highest quality (requires ~16GB VRAM)

    try:
        # Load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(
            config.fine_tuned_model_path,
            trust_remote_code=True
        )
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        # Load model - Flash Attention disabled for compatibility
        model = AutoModelForCausalLM.from_pretrained(
            config.fine_tuned_model_path,
            quantization_config=quantization_config,
            device_map="auto",
            torch_dtype=torch.bfloat16,  # Use bfloat16 for A100
            trust_remote_code=True
        )

        print("✅ Fine-tuned model loaded successfully with A100 optimizations!")
        print(f"📊 Model device: {model.device}")
        print(f"🧠 Model dtype: {model.dtype}")
        print("🔧 Ready for DPO training!")

        return model, tokenizer

    except Exception as e:
        print(f"⚠️  Error loading fine-tuned model: {e}")
        print("🔄 Falling back to base model...")

        # Fallback to base model
        tokenizer = AutoTokenizer.from_pretrained(
            config.base_model,
            trust_remote_code=True
        )
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        model = AutoModelForCausalLM.from_pretrained(
            config.base_model,
            quantization_config=quantization_config,
            device_map="auto",
            torch_dtype=torch.bfloat16,
            trust_remote_code=True
        )

        print("⚠️  Using base model instead of fine-tuned model")
        print("🔧 Ready for DPO training with base model")
        return model, tokenizer

In [6]:
def setup_dpo_training():
    """Setup DPO training configuration - A100 optimized"""
    print("🚀 Setting up A100-optimized DPO training...")

    # Initialize wandb FIRST
    if config.use_wandb:
        try:
            # Login to wandb (replace with your actual token)
            wandb.login(key="533e36a3654f7c3301150fe947ee1b5cfe62c96b")
            print("✅ wandb login successful")

            wandb.init(
                project="llama-therapy-dpo",
                name=f"dpo-training-{datetime.now().strftime('%Y%m%d-%H%M%S')}",
                config={
                    "base_model": config.base_model,
                    "fine_tuned_model_path": config.fine_tuned_model_path,
                    "batch_size": config.batch_size,
                    "learning_rate": config.learning_rate,
                    "max_length": config.max_length,
                    "max_prompt_length": config.max_prompt_length,
                    "gradient_accumulation_steps": config.gradient_accumulation_steps,
                    "num_train_epochs": config.num_train_epochs,
                    "lora_r": 32,
                    "lora_alpha": 64,
                    "beta": config.beta,
                },
                tags=["dpo", "therapy", "llama-3.1", "a100"]
            )
            print("✅ wandb initialized successfully")

        except Exception as e:
            print(f"⚠️ wandb setup failed: {e}")
            config.use_wandb = False

    # Load model and dataset
    print("📚 Loading model and dataset...")
    model, tokenizer = load_fine_tuned_model()
    dataset = load_preference_dataset()

    # Split dataset for training and evaluation
    print("🔄 Splitting dataset for train/eval...")
    train_test_split = dataset.train_test_split(test_size=0.1, seed=42)
    train_dataset = train_test_split['train']
    eval_dataset = train_test_split['test']

    print(f"📊 Train dataset: {len(train_dataset)} examples")
    print(f"📊 Eval dataset: {len(eval_dataset)} examples")

    # A100-optimized LoRA configuration
    lora_config = LoraConfig(
        r=32,
        lora_alpha=64,
        target_modules=[
            "q_proj", "v_proj", "k_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj"
        ],
        lora_dropout=0.1,
        bias="none",
        task_type=TaskType.CAUSAL_LM,
    )

    print("🎯 Setting up DPO configuration...")

    try:
        from trl import DPOConfig

        dpo_config = DPOConfig(
            output_dir=config.output_dir,
            learning_rate=config.learning_rate,
            per_device_train_batch_size=config.batch_size,
            per_device_eval_batch_size=config.batch_size,
            gradient_accumulation_steps=config.gradient_accumulation_steps,
            num_train_epochs=config.num_train_epochs,
            logging_steps=config.logging_steps,
            save_steps=config.save_steps,
            eval_steps=config.save_steps,
            bf16=True,
            remove_unused_columns=False,
            report_to="wandb" if config.use_wandb else None,
            dataloader_drop_last=True,
            eval_strategy="steps",
            save_strategy="steps",
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
            greater_is_better=False,
            save_total_limit=3,
        )

    except Exception as e:
        print(f"⚠️ Using TrainingArguments fallback: {e}")
        # Fallback to TrainingArguments if DPOConfig doesn't work
        dpo_config = TrainingArguments(
            output_dir=config.output_dir,
            learning_rate=config.learning_rate,
            per_device_train_batch_size=config.batch_size,
            per_device_eval_batch_size=config.batch_size,
            gradient_accumulation_steps=config.gradient_accumulation_steps,
            num_train_epochs=config.num_train_epochs,
            logging_steps=config.logging_steps,
            save_steps=config.save_steps,
            eval_steps=config.save_steps,
            bf16=True,
            remove_unused_columns=False,
            report_to="wandb" if config.use_wandb else None,
            dataloader_drop_last=True,
            eval_strategy="steps",
            save_strategy="steps",
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
            greater_is_better=False,
            save_total_limit=3,
        )

    print("🏋️ Initializing DPO trainer...")

    # Initialize DPO trainer with CORRECT TRL 0.19.0 signature
    try:
        dpo_trainer = DPOTrainer(
            model=model,                        # main model
            ref_model=None,                     # DPO creates reference model automatically
            args=dpo_config,                    # DPO configuration (not TrainingArguments)
            train_dataset=train_dataset,        # training dataset
            eval_dataset=eval_dataset,          # evaluation dataset
            processing_class=tokenizer,         # tokenizer (called processing_class!)
            peft_config=lora_config,            # LoRA configuration
        )

        print("✅ DPO trainer initialized successfully!")
        print(f"📊 Training dataset size: {len(train_dataset)}")
        print(f"📊 Evaluation dataset size: {len(eval_dataset)}")
        print(f"🎯 Batch size: {config.batch_size}")
        print(f"📏 Max sequence length: {config.max_length}")
        print(f"🎲 Beta parameter: {config.beta}")
        print(f"📈 Evaluation every {config.save_steps} steps")

        # Log additional info to wandb after successful initialization
        if config.use_wandb:
            wandb.log({
                "model/num_parameters": sum(p.numel() for p in model.parameters()),
                "model/trainable_parameters": sum(p.numel() for p in model.parameters() if p.requires_grad),
                "data/dataset_size": len(train_dataset) + len(eval_dataset),
                "system/gpu_name": torch.cuda.get_device_properties(0).name,
                "system/gpu_memory_gb": torch.cuda.get_device_properties(0).total_memory / 1e9,
            })

    except Exception as e:
        print(f"❌ Error initializing DPO trainer: {e}")
        if config.use_wandb:
            wandb.finish()
        raise

    return dpo_trainer

In [7]:
def run_dpo_training():
    """Main DPO training loop - A100 optimized"""
    print("🎓 Starting A100-Optimized DPO Training...")

    # Setup training (much simpler than PPO!)
    dpo_trainer = setup_dpo_training()

    # Create output directory
    os.makedirs(config.output_dir, exist_ok=True)

    print("🚀 Starting DPO training...")

    try:
        # DPO training is much simpler - just call train()!
        training_result = dpo_trainer.train()

        print("✅ DPO training completed successfully!")

        # Save the final model
        print("💾 Saving final model...")
        dpo_trainer.save_model()

        # Save tokenizer
        dpo_trainer.tokenizer.save_pretrained(config.output_dir)

        # Log final metrics
        if config.use_wandb:
            final_metrics = {
                "training/final_loss": training_result.training_loss,
                "training/total_steps": training_result.global_step,
            }
            wandb.log(final_metrics)
            wandb.finish()
            print("✅ wandb logging completed")

        print(f"🎉 DPO Training completed!")
        print(f"📊 Final training loss: {training_result.training_loss:.4f}")
        print(f"📈 Total training steps: {training_result.global_step}")
        print(f"💾 Model saved to: {config.output_dir}")

        return training_result

    except Exception as e:
        print(f"❌ Error during DPO training: {e}")
        if config.use_wandb:
            wandb.finish()
        raise

# MUCH SIMPLER ALTERNATIVE: You can also just use the trainer directly
def simple_dpo_training():
    """Even simpler DPO training - just the essentials"""
    print("🎯 Simple DPO Training...")

    # Setup
    trainer = setup_dpo_training()

    # Train (that's it!)
    trainer.train()

    # Save
    trainer.save_model()

    print("✅ Done!")
    return trainer

In [8]:
def test_dpo_model():
    """Test the DPO trained model using NickyNicky dataset - A100 optimized"""
    print("🧪 Testing A100-Optimized DPO model with NickyNicky/nlp-mental-health-conversations...")

    # Load the DPO trained model
    model_path = config.output_dir  # DPO saves directly to output_dir

    if not os.path.exists(model_path):
        print("❌ No trained DPO model found!")
        return []

    print(f"📂 Loading DPO model from: {model_path}")

    try:
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            torch_dtype=torch.bfloat16,  # A100 optimized
            device_map="auto"
        )

        # Load the same testing dataset
        print("📚 Loading NickyNicky/nlp-mental-health-conversations for comparison...")
        try:
            test_dataset = load_dataset("NickyNicky/nlp-mental-health-conversations")
            test_data = test_dataset['train']

            # Test on more examples with A100 power
            test_size = min(100, len(test_data))  # Increased to 100 examples
            test_subset = test_data.select(range(test_size))

            print(f"📊 Testing on {test_size} examples from NickyNicky dataset")

        except Exception as e:
            print(f"⚠️ Error loading NickyNicky dataset: {e}")
            print("🔄 Using fallback test queries...")
            test_queries = [
                "I'm feeling overwhelmed with anxiety. Can you help me?",
                "I've been having trouble sleeping due to stress.",
                "I feel like I'm not making progress in therapy.",
                "I'm struggling with depression and feel hopeless.",
                "How can I cope with panic attacks?",
                "I'm having relationship problems and don't know what to do.",
                "I feel like I'm not good enough and have low self-esteem.",
                "I'm dealing with grief after losing someone close to me.",
                "I'm having trouble managing my anger",
                "I feel lonely and isolated from others",
                "I can't seem to focus on anything lately",
                "I'm worried about my future and career",
                "I feel like I'm always disappointing people",
                "I'm struggling with body image issues",
                "I have trouble setting boundaries with others"
            ]
            test_subset = [{"text": query} for query in test_queries]

        results = []

        # Process in batches for A100 efficiency
        batch_size = 4

        for i in range(0, len(test_subset), batch_size):
            batch = test_subset[i:i+batch_size]

            for j, example in enumerate(batch):
                example_idx = i + j

                # Extract the query/prompt from the dataset
                if isinstance(example, dict):
                    query = example.get('text', example.get('prompt', example.get('input', str(example))))
                else:
                    query = str(example)

                # Limit query length
                if len(query) > 300:
                    query = query[:300] + "..."

                print(f"\n🔄 Processing example {example_idx+1}/{len(test_subset)}")
                print(f"💬 Query: {query[:100]}...")

                # Generate response
                inputs = tokenizer.encode(
                    query,
                    return_tensors="pt",
                    max_length=config.max_prompt_length,
                    truncation=True
                )

                with torch.no_grad():
                    outputs = model.generate(
                        inputs,
                        max_new_tokens=300,  # Increased for A100
                        do_sample=True,
                        temperature=0.7,
                        top_p=0.9,
                        pad_token_id=tokenizer.pad_token_id,
                        eos_token_id=tokenizer.eos_token_id
                    )

                response = tokenizer.decode(outputs[0], skip_special_tokens=True)
                response = response[len(tokenizer.decode(inputs[0], skip_special_tokens=True)):].strip()

                results.append({
                    'example_id': example_idx,
                    'query': query,
                    'dpo_response': response,  # CHANGED: Updated to dpo_response
                    'model_type': 'fine_tuned_plus_dpo_a100',  # CHANGED: Updated model type
                    'response_length': len(response),
                })

                print(f"🤖 DPO Response: {response[:200]}...")

        # Save detailed test results
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        results_file = f"{config.output_dir}/test_results_a100_dpo_{timestamp}.json"  # CHANGED: Updated filename

        with open(results_file, 'w') as f:
            json.dump(results, f, indent=2)

        # Generate summary statistics
        response_lengths = [r['response_length'] for r in results]

        print(f"\n✅ A100-Optimized DPO Testing completed!")  # CHANGED: Updated message
        print(f"📁 Results saved to: {results_file}")
        print(f"📊 Tested on {len(results)} examples")
        print(f"📏 Average response length: {np.mean(response_lengths):.1f} characters")
        print(f"📏 Response length range: {min(response_lengths)} - {max(response_lengths)}")

        print("\n🔍 A100 DPO COMPARISON ANALYSIS:")  # CHANGED: Updated analysis
        print("Compare with your original fine-tuned model results for:")
        print("  • Better preference alignment (chooses better responses)")
        print("  • More consistent quality across responses")
        print("  • Improved empathy and professional guidance balance")
        print("  • Better adherence to therapeutic best practices")
        print("  • More nuanced understanding of user preferences")
        print("  • Reduced harmful or inappropriate responses")

        return results

    except Exception as e:
        print(f"❌ Error during testing: {e}")
        import traceback
        traceback.print_exc()
        return []

In [9]:
from huggingface_hub import login

# Log in to Hugging Face
login(token="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")

print("=" * 70)
print("🎯 LLAMA 3.1 THERAPY MODEL - A100-OPTIMIZED DPO TRAINING")  # CHANGED: RLHF -> DPO
print("=" * 70)
print("🚀 Leveraging A100 power for faster, higher-quality training!")
print("⚡ Expected training time: 1-3 hours")  # CHANGED: DPO is typically faster than PPO
print("🎯 Using Direct Preference Optimization (DPO) for stability!")  # ADDED: DPO advantage
print("=" * 70)

# Start training
start_time = datetime.now()
stats = run_dpo_training()  # CHANGED: run_rlhf_training() -> run_dpo_training()
end_time = datetime.now()

training_duration = end_time - start_time
print(f"\n⏱️ Total DPO training time: {training_duration}")  # CHANGED: Added "DPO"

# Optional: Test the trained model
print("\n" + "=" * 50)
print("🧪 TESTING THE DPO-TRAINED MODEL")  # CHANGED: Updated test message
print("=" * 50)

test_results = test_dpo_model()  # CHANGED: test_rlhf_model() -> test_dpo_model()

print(f"\n🎉 A100-Optimized DPO Training & Testing Complete!")  # CHANGED: RLHF -> DPO
print(f"📊 Training completed in: {training_duration}")
print(f"🧪 Tested on {len(test_results)} examples")
print(f"💾 Model saved to: {config.output_dir}")

if config.use_wandb:
    print(f"📈 Check your training metrics at: https://wandb.ai")

print("\n🏆 Your therapy model is now DPO-aligned and ready to use!")

🎯 LLAMA 3.1 THERAPY MODEL - A100-OPTIMIZED DPO TRAINING
🚀 Leveraging A100 power for faster, higher-quality training!
⚡ Expected training time: 1-3 hours
🎯 Using Direct Preference Optimization (DPO) for stability!
🎓 Starting A100-Optimized DPO Training...
🚀 Setting up A100-optimized DPO training...


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mkeera[0m ([33mkeera-nanyang-technological-university-singapore[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


✅ wandb login successful


✅ wandb initialized successfully
📚 Loading model and dataset...
🦙 Loading fine-tuned Llama 3.1 therapy model for DPO (A100 optimized)...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

✅ Fine-tuned model loaded successfully with A100 optimizations!
📊 Model device: cuda:0
🧠 Model dtype: torch.bfloat16
🔧 Ready for DPO training!
📚 Loading PsychoCounsel preference dataset for DPO...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


📊 Dataset columns: ['ID', 'prefID', 'question', 'chosen', 'rejected', 'chosen_model', 'rejected_model', 'chosen_empathy_rating', 'chosen_relevance_rating', 'chosen_clarity_rating', 'chosen_safety_rating', 'chosen_exploration_rating', 'chosen_autonomy_rating', 'chosen_staging_rating', 'rejected_empathy_rating', 'rejected_relevance_rating', 'rejected_clarity_rating', 'rejected_safety_rating', 'rejected_exploration_rating', 'rejected_autonomy_rating', 'rejected_staging_rating']
📊 Dataset size: 34329


Map:   0%|          | 0/34329 [00:00<?, ? examples/s]

✅ Dataset loaded and formatted for DPO: 34329 examples
🚀 Using random subset of 3500 examples for faster training
📝 Sample prompt: I've just learned that my father left me out of his IRA, only leaving it to my two younger siblings....
📝 Sample chosen: It sounds like you're going through a really difficult time, and it's understandable that you're fee...
📝 Sample rejected: I'm sorry that you are going through this difficult time. It is natural to feel betrayed and hurt by...
🔄 Splitting dataset for train/eval...
📊 Train dataset: 3150 examples
📊 Eval dataset: 350 examples
🎯 Setting up DPO configuration...
🏋️ Initializing DPO trainer...


Extracting prompt in train dataset:   0%|          | 0/3150 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/3150 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/3150 [00:00<?, ? examples/s]

Extracting prompt in eval dataset:   0%|          | 0/350 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/350 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/350 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


✅ DPO trainer initialized successfully!
📊 Training dataset size: 3150
📊 Evaluation dataset size: 350
🎯 Batch size: 1
📏 Max sequence length: 512
🎲 Beta parameter: 0.1
📈 Evaluation every 50 steps
🚀 Starting DPO training...




Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
50,0.2445,0.148293,3.051175,-2.529736,0.954286,5.580911,-323.193268,-308.365143,0.634078,0.653803
100,0.056,0.1144,3.91279,-2.493588,0.96,6.406377,-314.577087,-308.003662,0.5359,0.542549
150,0.0314,0.0881,3.976796,-3.071954,0.971429,7.04875,-313.937042,-313.787354,0.512362,0.500131
200,0.0097,0.085035,3.974954,-3.478834,0.968571,7.453788,-313.955475,-317.85611,0.412503,0.387117
250,0.0052,0.069061,3.99949,-3.643056,0.974286,7.642546,-313.710083,-319.498352,0.275347,0.255954
300,0.0095,0.070254,3.541062,-4.409264,0.974286,7.950325,-318.294373,-327.160461,0.183732,0.167881
350,0.0614,0.067824,3.554749,-4.512812,0.982857,8.067562,-318.157501,-328.195923,0.134294,0.109103
400,0.0042,0.062308,3.331196,-4.958302,0.982857,8.289497,-320.393066,-332.650818,-0.024481,-0.057874
450,0.0059,0.058349,2.976055,-6.197842,0.982857,9.173897,-323.944458,-345.046265,-0.180841,-0.226976
500,0.002,0.061777,2.461832,-7.335789,0.98,9.797622,-329.0867,-356.42572,-0.33148,-0.381586




✅ DPO training completed successfully!
💾 Saving final model...


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


0,1
data/dataset_size,▁
eval/logits/chosen,█▇▇▇▆▅▅▄▃▂▃▂▂▂▂▁▁▁▁▁▁▁▁
eval/logits/rejected,█▇▇▇▆▅▅▄▃▂▃▂▂▂▂▁▁▁▁▁▁▁▁
eval/logps/chosen,▄████▆▆▅▄▂▃▁▃▂▁▂▂▁▂▁▁▁▁
eval/logps/rejected,██▇▇▇▆▆▅▄▂▃▂▃▃▂▂▂▁▂▁▁▁▁
eval/loss,█▆▄▃▂▂▂▂▁▂▂▁▁▂▁▁▁▁▁▁▁▁▁
eval/rewards/accuracies,▁▂▅▄▅▅▇▇▇▇▇█▇▇▇██▇█████
eval/rewards/chosen,▄████▆▆▅▄▂▃▁▃▂▁▂▂▁▂▁▁▁▁
eval/rewards/margins,▁▂▃▄▄▄▄▅▆▇▆▇▆▆▇▇███████
eval/rewards/rejected,██▇▇▇▆▆▅▄▂▃▂▃▃▂▂▂▁▂▁▁▁▁

0,1
data/dataset_size,3500
eval/logits/chosen,-0.52726
eval/logits/rejected,-0.59997
eval/logps/chosen,-330.65283
eval/logps/rejected,-368.17044
eval/loss,0.05381
eval/rewards/accuracies,0.98571
eval/rewards/chosen,2.30522
eval/rewards/margins,10.81548
eval/rewards/rejected,-8.51027


✅ wandb logging completed
🎉 DPO Training completed!
📊 Final training loss: 0.0443
📈 Total training steps: 1182
💾 Model saved to: /content/drive/MyDrive/llama_31_therapist_outputs/dpo_therapy_model

⏱️ Total DPO training time: 3:04:50.676793

🧪 TESTING THE DPO-TRAINED MODEL
🧪 Testing A100-Optimized DPO model with NickyNicky/nlp-mental-health-conversations...
📂 Loading DPO model from: /content/drive/MyDrive/llama_31_therapist_outputs/dpo_therapy_model


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

📚 Loading NickyNicky/nlp-mental-health-conversations for comparison...


README.md:   0%|          | 0.00/314 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3512 [00:00<?, ? examples/s]

📊 Testing on 100 examples from NickyNicky dataset

🔄 Processing example 1/100
💬 Query: Context...
❌ Error during testing: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)

🎉 A100-Optimized DPO Training & Testing Complete!
📊 Training completed in: 3:04:50.676793
🧪 Tested on 0 examples
💾 Model saved to: /content/drive/MyDrive/llama_31_therapist_outputs/dpo_therapy_model
📈 Check your training metrics at: https://wandb.ai

🏆 Your therapy model is now DPO-aligned and ready to use!


Traceback (most recent call last):
  File "/tmp/ipython-input-8-3302275846.py", line 89, in test_dpo_model
    outputs = model.generate(
              ^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/generation/utils.py", line 2623, in generate
    result = self._sample(
             ^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/generation/utils.py", line 3604, in _sample
    outputs = self(**model_inputs, return_dict=True)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in 

In [12]:
# 🧪 DPO MODEL TESTING - Fixed Version
import json
import torch
from datetime import datetime
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset

print("🧪 Testing DPO-trained therapy model...")
print("=" * 60)

# Load the saved DPO model
dpo_model_path = "/content/drive/MyDrive/llama_31_therapist_outputs/dpo_therapy_model"

print(f"📂 Loading DPO model from: {dpo_model_path}")

try:
    # Load tokenizer and model with proper device handling
    tokenizer = AutoTokenizer.from_pretrained(dpo_model_path)
    model = AutoModelForCausalLM.from_pretrained(
        dpo_model_path,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True
    )

    print(f"✅ DPO model loaded successfully!")
    print(f"📊 Model device: {model.device}")
    print(f"🧠 Model dtype: {model.dtype}")

except Exception as e:
    print(f"❌ Error loading DPO model: {e}")
    raise

# Load test dataset (same as fine-tuning)
print("\n🧪 Loading test dataset (NickyNicky/nlp-mental-health-conversations)...")

try:
    test_dataset = load_dataset("NickyNicky/nlp-mental-health-conversations")
    print("✅ Test dataset loaded successfully!")

    # Debug: Print dataset structure
    print(f"📊 Dataset keys: {list(test_dataset.keys())}")
    print(f"📊 First split info: {test_dataset[list(test_dataset.keys())[0]]}")

    # Get the first split (usually 'train')
    first_split = list(test_dataset.keys())[0]
    dataset_split = test_dataset[first_split]

    print(f"📊 Dataset columns: {dataset_split.column_names}")
    print(f"📊 Dataset size: {len(dataset_split)}")

    # Show first example structure
    if len(dataset_split) > 0:
        first_example = dataset_split[0]
        print(f"📊 First example type: {type(first_example)}")
        print(f"📊 First example keys: {list(first_example.keys()) if isinstance(first_example, dict) else 'Not a dict'}")
        print(f"📊 First example sample: {str(first_example)[:200]}...")

    use_real_dataset = True

except Exception as e:
    print(f"⚠️ Error loading test dataset: {e}")
    print("🔄 Using fallback test queries...")
    use_real_dataset = False

# Prepare test samples
if use_real_dataset:
    # Sample 10 random examples for testing
    test_samples = dataset_split.shuffle(seed=999).select(range(min(10, len(dataset_split))))

    def extract_context_response(example):
        """Extract context and response from dataset example"""
        # Common field names to check
        context_fields = ['Context', 'context', 'input', 'question', 'user', 'text']
        response_fields = ['Response', 'response', 'output', 'answer', 'assistant', 'target']

        context = ""
        response = ""

        # Find context field
        for field in context_fields:
            if field in example and example[field]:
                context = str(example[field]).strip()
                break

        # Find response field
        for field in response_fields:
            if field in example and example[field]:
                response = str(example[field]).strip()
                break

        return context, response
else:
    # Create fallback dataset
    fallback_queries = [
        "I've been feeling really anxious lately and can't seem to calm down. What should I do?",
        "I'm having trouble sleeping due to stress and my mind keeps racing at night.",
        "I feel like I'm not making progress in therapy and I'm getting discouraged.",
        "I'm struggling with depression and feel hopeless about my future.",
        "How can I cope with panic attacks? They're happening more frequently.",
        "I'm having relationship problems and don't know how to communicate better.",
        "I feel like I'm not good enough and constantly compare myself to others.",
        "I'm dealing with grief after losing someone close to me recently.",
        "I'm having trouble managing my anger and it's affecting my relationships.",
        "I feel lonely and isolated from others, even when I'm around people."
    ]
    test_samples = [{"context": query, "response": ""} for query in fallback_queries]

    def extract_context_response(example):
        return example.get("context", ""), example.get("response", "")

print("🧪 Generating responses on test dataset...")

responses = []

for i, example in enumerate(test_samples):
    if use_real_dataset:
        context, expected_response = extract_context_response(example)
    else:
        context, expected_response = extract_context_response(example)

    if not context:
        print(f"⚠️ Skipping sample {i+1}: No valid context found")
        continue

    print(f"Processing test sample {i+1}/{len(test_samples)}...")
    print(f"📝 Context preview: {context[:100]}...")

    # Format prompt for Llama 3.1 Instruct (same as fine-tuning)
    formatted_prompt = f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful, empathetic mental health assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{context}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"

    # Tokenize and move to correct device
    inputs = tokenizer(formatted_prompt, return_tensors="pt")
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=300,  # Reasonable length for therapy responses
            temperature=0.7,
            do_sample=True,
            top_p=0.9,
            repetition_penalty=1.1,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Extract only the assistant's response (more robust extraction)
    if "<|start_header_id|>assistant<|end_header_id|>" in full_response:
        assistant_response = full_response.split("<|start_header_id|>assistant<|end_header_id|>")[-1].strip()
        # Remove any remaining special tokens or formatting
        assistant_response = assistant_response.replace("<|eot_id|>", "").strip()
    else:
        # Fallback: try to extract response after the formatted prompt
        prompt_end = formatted_prompt.rstrip()
        if prompt_end in full_response:
            assistant_response = full_response[len(prompt_end):].strip()
        else:
            # Last resort: take everything after the original input
            input_text = tokenizer.decode(inputs['input_ids'][0], skip_special_tokens=True)
            if input_text in full_response:
                assistant_response = full_response[len(input_text):].strip()
            else:
                assistant_response = full_response

    responses.append({
        'context': context,
        'expected_response': expected_response,
        'generated_response': assistant_response,
        'model': 'Llama-3.1-8B-DPO-Therapist',
        'sample_id': i+1
    })

    # Print a preview of the response
    print(f"🤖 Generated response preview: {assistant_response[:150]}...")
    print("-" * 40)

# Save test responses
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_file = f"{dpo_model_path}/test_responses_dpo_{timestamp}.json"

with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(responses, f, indent=2, ensure_ascii=False)

print(f"\n✅ DPO test responses saved to: {output_file}")
print(f"📊 Generated {len(responses)} test responses")

# Print some example responses
print("\n🎯 SAMPLE RESPONSES:")
print("=" * 60)
for i, resp in enumerate(responses[:3]):  # Show first 3 responses
    print(f"\n📝 Sample {i+1}:")
    print(f"❓ Context: {resp['context'][:100]}...")
    print(f"🤖 Generated: {resp['generated_response'][:200]}...")
    print("-" * 40)

# Clear memory
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("🧹 GPU memory cleared")

print("\n🎉 DPO MODEL TESTING COMPLETED!")
print("=" * 60)

🧪 Testing DPO-trained therapy model...
📂 Loading DPO model from: /content/drive/MyDrive/llama_31_therapist_outputs/dpo_therapy_model


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

✅ DPO model loaded successfully!
📊 Model device: cuda:0
🧠 Model dtype: torch.bfloat16

🧪 Loading test dataset (NickyNicky/nlp-mental-health-conversations)...
✅ Test dataset loaded successfully!
📊 Dataset keys: ['train']
📊 First split info: Dataset({
    features: ['Context', 'Response'],
    num_rows: 3512
})
📊 Dataset columns: ['Context', 'Response']
📊 Dataset size: 3512
📊 First example type: <class 'dict'>
📊 First example keys: ['Context', 'Response']
📊 First example sample: {'Context': "I'm going through some things with my feelings and myself. I barely sleep and I do nothing but think about how I'm worthless and how I shouldn't be here.\n   I've never tried or contempla...
🧪 Generating responses on test dataset...
Processing test sample 1/10...
📝 Context preview: I keep having these random thoughts that I don't want.  Things like "you aren't worth anything."  I ...
🤖 Generated response preview: It sounds like you're going through a really tough time right now, and it's completely u

In [14]:
# 📊 DPO RESULTS VIEWER - Part 1: Display Test Results
import json
import os
from datetime import datetime

print("📊 LOADING DPO TEST RESULTS")
print("=" * 60)

# Find the most recent test results file
dpo_model_path = "/content/drive/MyDrive/llama_31_therapist_outputs/dpo_therapy_model"

# Look for test results files
test_files = [f for f in os.listdir(dpo_model_path) if f.startswith('test_responses_dpo_') and f.endswith('.json')]

if test_files:
    # Get the most recent file
    latest_file = sorted(test_files)[-1]
    results_file = f"{dpo_model_path}/{latest_file}"
    print(f"📁 Loading results from: {latest_file}")
else:
    print("❌ No test results files found!")
    print("🔍 Please run the testing code block first.")
    exit()

# Read and display the results
with open(results_file, 'r', encoding='utf-8') as f:
    data = json.load(f)

print(f"📊 Found {len(data)} test responses")
print("\n" + "=" * 80)
print("🧪 DPO MODEL TEST RESULTS")
print("=" * 80)

# Display each test result
for i, response in enumerate(data):
    print(f"\n=== Test Sample {response['sample_id']} ===")
    print(f"💬 Context: {response['context']}")

    if response['expected_response']:
        print(f"\n📝 Expected: {response['expected_response']}")

    print(f"\n🤖 DPO Generated: {response['generated_response']}")
    print("-" * 80)

# Calculate some basic stats
response_lengths = [len(resp['generated_response']) for resp in data]
avg_length = sum(response_lengths) / len(response_lengths)

print(f"\n📈 RESPONSE STATISTICS:")
print(f"   • Average response length: {avg_length:.1f} characters")
print(f"   • Shortest response: {min(response_lengths)} characters")
print(f"   • Longest response: {max(response_lengths)} characters")
print(f"   • Total responses: {len(data)}")

print("\n🎯 Run the next code block for comprehensive summary and final results!")

📊 LOADING DPO TEST RESULTS
📁 Loading results from: test_responses_dpo_20250629_163040.json
📊 Found 10 test responses

🧪 DPO MODEL TEST RESULTS

=== Test Sample 1 ===
💬 Context: I keep having these random thoughts that I don't want.  Things like "you aren't worth anything."  I know they're my own thoughts but it feels like someone else is saying it.

What is wrong with me, and how can I stop having these thoughts?

📝 Expected: Talking to a licensed profession who can discuss this in greater depth can be best. As a general information, in short, I can say that our thoughts are greatly influenced by our early life experiences. Our thoughts are processed through schemes, these are mental images or templates by which we make meaning of the world around us.  While our upbringing has a great influence on the way we see and interact with the world around us as adults, However, we are not condemned to abide by them for life, in psychotherapy, you learn to change negative schemas with positive

In [18]:
# 📊 DPO RESULTS SUMMARY - Accurate Metrics
import json
import os
from datetime import datetime

print("📊 GENERATING DPO SUMMARY")
print("=" * 60)

# Find the most recent test results file
dpo_model_path = "/content/drive/MyDrive/llama_31_therapist_outputs/dpo_therapy_model"
test_files = [f for f in os.listdir(dpo_model_path) if f.startswith('test_responses_dpo_') and f.endswith('.json')]

if test_files:
    latest_file = sorted(test_files)[-1]
    results_file = f"{dpo_model_path}/{latest_file}"

    with open(results_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    response_lengths = [len(resp['generated_response']) for resp in data]
    avg_length = sum(response_lengths) / len(response_lengths)
else:
    print("❌ No test results found")
    exit()

# Accurate metrics from W&B
summary = {
    'model': 'Llama-3.1-8B-DPO-Therapist',
    'timestamp': datetime.now().isoformat(),
    'test_samples': len(data),
    'average_response_length': avg_length,
    'min_response_length': min(response_lengths),
    'max_response_length': max(response_lengths),
    'model_path': dpo_model_path,

    'dpo_metrics': {
        'train_loss': 0.0443,
        'eval_loss': 0.05381,
        'train_accuracy': 1.0,
        'eval_accuracy': 0.98571,
        'train_reward_margins': 12.61265,
        'eval_reward_margins': 10.81548,
        'epochs': 3,
        'global_steps': 1182,
        'training_samples': 3500,
        'total_parameters': 4624486400,
        'trainable_parameters': 83886080,
        'train_runtime_hours': 3.0802,
        'gpu_memory_gb': 42.47447
    }
}

# Save summary
summary_file = f"{dpo_model_path}/dpo_accurate_summary_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
with open(summary_file, 'w') as f:
    json.dump(summary, f, indent=2)

print(f"💾 Summary saved to: {summary_file}")

print("\n" + "=" * 60)
print("🏆 DPO METRICS")
print("=" * 60)

print(f"📊 TRAINING:")
print(f"   • Train Loss: {summary['dpo_metrics']['train_loss']}")
print(f"   • Eval Loss: {summary['dpo_metrics']['eval_loss']}")
print(f"   • Train Accuracy: {summary['dpo_metrics']['train_accuracy']*100:.3f}%")
print(f"   • Eval Accuracy: {summary['dpo_metrics']['eval_accuracy']*100:.3f}%")
print(f"   • Train Reward Margins: {summary['dpo_metrics']['train_reward_margins']}")
print(f"   • Eval Reward Margins: {summary['dpo_metrics']['eval_reward_margins']}")
print(f"   • Epochs: {summary['dpo_metrics']['epochs']}")
print(f"   • Steps: {summary['dpo_metrics']['global_steps']}")
print(f"   • Runtime: {summary['dpo_metrics']['train_runtime_hours']:.2f} hours")

print(f"\n🧪 TESTING:")
print(f"   • Test Samples: {summary['test_samples']}")
print(f"   • Avg Response Length: {summary['average_response_length']:.1f} chars")
print(f"   • Min Response Length: {summary['min_response_length']} chars")
print(f"   • Max Response Length: {summary['max_response_length']} chars")

print(f"\n⚙️ MODEL:")
print(f"   • Total Parameters: {summary['dpo_metrics']['total_parameters']:,}")
print(f"   • Trainable Parameters: {summary['dpo_metrics']['trainable_parameters']:,}")
print(f"   • GPU Memory: {summary['dpo_metrics']['gpu_memory_gb']:.2f} GB")

print("=" * 60)

📊 GENERATING DPO SUMMARY
💾 Summary saved to: /content/drive/MyDrive/llama_31_therapist_outputs/dpo_therapy_model/dpo_accurate_summary_20250629_163956.json

🏆 DPO METRICS
📊 TRAINING:
   • Train Loss: 0.0443
   • Eval Loss: 0.05381
   • Train Accuracy: 100.000%
   • Eval Accuracy: 98.571%
   • Train Reward Margins: 12.61265
   • Eval Reward Margins: 10.81548
   • Epochs: 3
   • Steps: 1182
   • Runtime: 3.08 hours

🧪 TESTING:
   • Test Samples: 10
   • Avg Response Length: 1592.5 chars
   • Min Response Length: 1422 chars
   • Max Response Length: 1711 chars

⚙️ MODEL:
   • Total Parameters: 4,624,486,400
   • Trainable Parameters: 83,886,080
   • GPU Memory: 42.47 GB
