In [1]:
%%capture
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install torch>=2.1.0 transformers>=4.36.0 datasets>=2.15.0
!pip install accelerate>=0.24.0 peft>=0.7.0 trl>=0.7.0
!pip install bitsandbytes>=0.41.3 wandb>=0.16.0 xformers
print("✅ All packages installed successfully!")

In [2]:
%%capture
!pip install --upgrade transformers torch torchvision unsloth
!pip install --upgrade datasets accelerate peft trl bitsandbytes wandb xformers

In [3]:
from huggingface_hub import notebook_login
import wandb
print("🔑 Please login to HuggingFace Hub:")
notebook_login()
print("🔑 Please login to Wandb:")
wandb.login()
print("✅ Authentication complete!")

🔑 Please login to HuggingFace Hub:


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

🔑 Please login to Wandb:


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mkeera[0m ([33mkeera-nanyang-technological-university-singapore[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


✅ Authentication complete!


In [4]:
import torch
import gc
from pathlib import Path
from google.colab import drive

print("🔧 Setting up Google Colab environment...")

# Check GPU
if torch.cuda.is_available():
    device_name = torch.cuda.get_device_name()
    memory_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"✅ GPU: {device_name}")
    print(f"✅ GPU Memory: {memory_gb:.1f} GB")

    # Optimize settings based on GPU for Llama 3.1 8B
    if "A100" in device_name:
        print("🚀 A100 GPU detected - using optimized settings for Llama 3.1!")
        batch_size = 4  # Conservative for 8B model stability
        max_seq_length = 2048  # Standard for Llama 3.1 8B
    elif "V100" in device_name:
        print("⚡ V100 GPU detected - using conservative settings")
        batch_size = 2
        max_seq_length = 1024
    else:
        print("⚠️ Other GPU detected - using very conservative settings")
        batch_size = 1
        max_seq_length = 512
else:
    print("❌ No GPU available!")
    raise RuntimeError("This notebook requires a GPU!")

# Mount Google Drive
drive.mount('/content/drive')
output_dir = '/content/drive/MyDrive/llama_31_therapist_outputs'
Path(output_dir).mkdir(parents=True, exist_ok=True)
print(f"✅ Google Drive mounted and output directory created: {output_dir}")

# Clear memory
gc.collect()
torch.cuda.empty_cache()
print("✅ Environment setup complete!")

🔧 Setting up Google Colab environment...
✅ GPU: NVIDIA A100-SXM4-40GB
✅ GPU Memory: 42.5 GB
🚀 A100 GPU detected - using optimized settings for Llama 3.1!
Mounted at /content/drive
✅ Google Drive mounted and output directory created: /content/drive/MyDrive/llama_31_therapist_outputs
✅ Environment setup complete!


In [5]:
CONFIG = {
    # Model settings - Llama 3.1 8B Instruct specific
    'model_name': 'unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit',
    'max_seq_length': max_seq_length,
    'load_in_4bit': True,

    # LoRA settings optimized for Llama 3.1 8B
    'lora_r': 16,  # Good balance for 8B model
    'lora_alpha': 16,  # Matches rank for stability
    'lora_dropout': 0.0,  # No dropout for stable training
    'lora_target_modules': [
        "q_proj", "k_proj", "v_proj", "o_proj",  # Attention layers
        "gate_proj", "up_proj", "down_proj",     # MLP layers
    ],

    # Dataset settings
    'train_dataset_name': 'ShenLab/MentalChat16K',  # For training/validation
    'test_dataset_name': 'NickyNicky/nlp-mental-health-conversations',  # For testing
    'validation_split': 0.1,

    # Training hyperparameters - Conservative for slow stable convergence
    'per_device_train_batch_size': batch_size,
    'per_device_eval_batch_size': batch_size,
    'gradient_accumulation_steps': 4,  # Effective batch size = batch_size * 4
    'warmup_steps': 5,  # Very gradual warmup
    'num_train_epochs': 3,  # 3 epochs should be sufficient
    'learning_rate': 2e-4,  # Conservative learning rate for stability
    'weight_decay': 0.01,  # L2 regularization
    'lr_scheduler_type': 'cosine',  # Smooth decay

    # Logging and saving - More frequent for monitoring
    'logging_steps': 5,
    'eval_steps': 50,
    'save_steps': 50,
    'save_total_limit': 2,
    'output_dir': output_dir,

    # Wandb settings
    'use_wandb': True,
    'wandb_project': 'llama-31-therapist',
    'wandb_entity': None,  # Set to your wandb username if needed

    # Generation settings for inference
    'generation_config': {
        'max_new_tokens': 512,
        'temperature': 0.7,
        'top_p': 0.9,
        'repetition_penalty': 1.1,
        'do_sample': True,
    }
}

print("📋 Configuration loaded for Llama 3.1 8B Instruct training!")
print(f"🎯 Model: {CONFIG['model_name']}")
print(f"🎯 Training dataset: {CONFIG['train_dataset_name']}")
print(f"🎯 Testing dataset: {CONFIG['test_dataset_name']}")
print(f"🎯 Effective batch size: {CONFIG['per_device_train_batch_size'] * CONFIG['gradient_accumulation_steps']}")
print(f"🎯 Max sequence length: {CONFIG['max_seq_length']}")
print(f"🎯 Learning rate: {CONFIG['learning_rate']}")

📋 Configuration loaded for Llama 3.1 8B Instruct training!
🎯 Model: unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit
🎯 Training dataset: ShenLab/MentalChat16K
🎯 Testing dataset: NickyNicky/nlp-mental-health-conversations
🎯 Effective batch size: 16
🎯 Max sequence length: 2048
🎯 Learning rate: 0.0002


In [6]:
from unsloth import FastLanguageModel, is_bfloat16_supported

print("🦙 Loading Llama 3.1 8B Instruct model with Unsloth optimizations...")

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=CONFIG['model_name'],
    max_seq_length=CONFIG['max_seq_length'],
    dtype=None,  # Auto-detect best dtype
    load_in_4bit=CONFIG['load_in_4bit'],
    trust_remote_code=True,
)

# Add LoRA adapters specifically configured for Llama 3.1
model = FastLanguageModel.get_peft_model(
    model,
    r=CONFIG['lora_r'],
    target_modules=CONFIG['lora_target_modules'],
    lora_alpha=CONFIG['lora_alpha'],
    lora_dropout=CONFIG['lora_dropout'],
    bias="none",
    use_gradient_checkpointing="unsloth",  # Unsloth's optimized checkpointing
    random_state=3407,
    use_rslora=False,  # Standard LoRA for stability
    loftq_config=None,
)

# Count parameters
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in model.parameters())

print(f"✅ Llama 3.1 8B Instruct model loaded successfully!")
print(f"📊 Total parameters: {total_params / 1e9:.1f}B")
print(f"📊 Trainable parameters: {trainable_params:,} ({100 * trainable_params / total_params:.2f}%)")
print(f"📊 Memory efficient: 4-bit quantization enabled")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
🦙 Loading Llama 3.1 8B Instruct model with Unsloth optimizations...
Are you certain you want to do remote code execution?
==((====))==  Unsloth 2025.6.8: Fast Llama patching. Transformers: 4.53.0.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.1+cu126. CUDA: 8.0. CUDA Toolkit: 12.6. Triton: 3.3.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.31. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

Unsloth 2025.6.8 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


✅ Llama 3.1 8B Instruct model loaded successfully!
📊 Total parameters: 4.6B
📊 Trainable parameters: 41,943,040 (0.92%)
📊 Memory efficient: 4-bit quantization enabled


In [7]:
from datasets import load_dataset

print("📚 Loading ShenLab/MentalChat16K dataset for training...")

train_dataset = load_dataset(CONFIG['train_dataset_name'])

def format_training_conversation_llama31(example):
    """Format ShenLab/MentalChat16K conversation for Llama 3.1 Instruct training"""
    # Safely handle missing fields by replacing None with empty strings
    instruction = (example.get('instruction') or '').strip()
    input_text = (example.get('input') or '').strip()
    output = (example.get('output') or '').strip()

    # Llama 3.1 Instruct chat format for training
    if input_text:
        # When there's both instruction and input, use system + user format
        conversation = f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful, empathetic mental health assistant. {instruction}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{input_text}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n{output}<|eot_id|><|end_of_text|>"
    else:
        # When only instruction, treat it as user message
        conversation = f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful, empathetic mental health assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{instruction}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n{output}<|eot_id|><|end_of_text|>"

    return {"text": conversation}

# Process training dataset with Llama 3.1 formatting
print("🔄 Formatting training conversations for Llama 3.1 Instruct...")
formatted_train_dataset = train_dataset['train'].map(
    format_training_conversation_llama31,
    remove_columns=train_dataset['train'].column_names,
    desc="Formatting training data for Llama 3.1"
)

# Create train/validation split
dataset_split = formatted_train_dataset.train_test_split(
    test_size=CONFIG['validation_split'],
    seed=42
)
train_dataset = dataset_split['train']
eval_dataset = dataset_split['test']

print(f"✅ Training dataset prepared for Llama 3.1!")
print(f"📊 Training samples: {len(train_dataset):,}")
print(f"📊 Validation samples: {len(eval_dataset):,}")
print(f"📊 Total training samples: {len(train_dataset) + len(eval_dataset):,}")

# Show a sample formatted for Llama 3.1
print("\n📄 Sample training conversation format:")
print(train_dataset[0]['text'][:300] + "...")

📚 Loading ShenLab/MentalChat16K dataset for training...


README.md: 0.00B [00:00, ?B/s]

Interview_Data_6K.csv:   0%|          | 0.00/13.6M [00:00<?, ?B/s]

Synthetic_Data_10K.csv:   0%|          | 0.00/32.8M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16084 [00:00<?, ? examples/s]

🔄 Formatting training conversations for Llama 3.1 Instruct...


Formatting training data for Llama 3.1:   0%|          | 0/16084 [00:00<?, ? examples/s]

✅ Training dataset prepared for Llama 3.1!
📊 Training samples: 14,475
📊 Validation samples: 1,609
📊 Total training samples: 16,084

📄 Sample training conversation format:
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful, empathetic mental health assistant. You are a helpful mental health counselling assistant, please answer the mental health questions based on the patient's description. 
The assistant gives helpful, comprehensive, and ap...


In [8]:
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from trl import SFTTrainer
import gc
import torch
import time
from datetime import datetime

print("⚙️ Setting up training for Llama 3.1 8B Instruct...")

# Initialize wandb with Llama 3.1 specific naming
if CONFIG['use_wandb']:
    run_name = f"llama31-8b-therapist-{datetime.now().strftime('%m%d-%H%M')}"
    wandb.init(
        project=CONFIG['wandb_project'],
        entity=CONFIG.get('wandb_entity'),
        name=run_name,
        config=CONFIG,
        tags=['llama3.1', '8b', 'instruct', 'therapy', 'unsloth', 'a100'],
        notes="Fine-tuning Llama 3.1 8B Instruct: Train on ShenLab, Test on NickyNicky"
    )
    print(f"✅ Wandb initialized: {run_name}")

# Setup training arguments optimized for Llama 3.1
training_args = TrainingArguments(
    per_device_train_batch_size=CONFIG['per_device_train_batch_size'],
    per_device_eval_batch_size=CONFIG['per_device_eval_batch_size'],
    gradient_accumulation_steps=CONFIG['gradient_accumulation_steps'],
    warmup_steps=CONFIG['warmup_steps'],
    num_train_epochs=CONFIG['num_train_epochs'],
    learning_rate=CONFIG['learning_rate'],
    weight_decay=CONFIG['weight_decay'],
    fp16=not is_bfloat16_supported(),
    bf16=is_bfloat16_supported(),
    logging_steps=CONFIG['logging_steps'],
    eval_steps=CONFIG['eval_steps'],
    eval_strategy="steps",
    save_steps=CONFIG['save_steps'],
    save_strategy="steps",
    output_dir=CONFIG['output_dir'],
    optim="adamw_8bit",  # Memory efficient optimizer
    lr_scheduler_type=CONFIG['lr_scheduler_type'],
    seed=3407,
    dataloader_num_workers=2,
    report_to="wandb" if CONFIG['use_wandb'] else None,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    save_total_limit=CONFIG['save_total_limit'],
    logging_first_step=True,
    remove_unused_columns=False,
    dataloader_drop_last=False,
    # Additional settings for stability
    max_grad_norm=1.0,  # Gradient clipping
    adam_epsilon=1e-8,
    warmup_ratio=0.03,
)

# Setup SFT trainer for Llama 3.1 - SIMPLIFIED
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,  # Use raw dataset, not preprocessed
    eval_dataset=eval_dataset,    # Use raw dataset, not preprocessed
    dataset_text_field="text",    # Make sure this matches your dataset column name
    max_seq_length=CONFIG['max_seq_length'],
    args=training_args,
    packing=False,  # Keep packing disabled for stability
    # Remove the custom data_collator - let SFTTrainer handle it
)

# Calculate training estimates
effective_batch_size = CONFIG['per_device_train_batch_size'] * CONFIG['gradient_accumulation_steps']
total_steps = len(train_dataset) // effective_batch_size * CONFIG['num_train_epochs']
estimated_time = total_steps * 3 // 60  # Conservative estimate for 8B model

print(f"✅ Training setup complete for Llama 3.1 8B!")
print(f"📊 Effective batch size: {effective_batch_size}")
print(f"📊 Total training steps: {total_steps:,}")
print(f"📊 Estimated training time: ~{estimated_time} minutes")
print(f"📊 Using precision: {'BF16' if is_bfloat16_supported() else 'FP16'}")
print(f"📊 Memory optimization: 4-bit + LoRA")

# Clear memory before training
gc.collect()
torch.cuda.empty_cache()

print("\n🚀 Starting Llama 3.1 8B Instruct training...")
print("=" * 60)

start_time = time.time()

# Start training
trainer_stats = trainer.train()

training_time = time.time() - start_time

print("\n" + "=" * 60)
print("🎉 Llama 3.1 8B training completed successfully!")
print(f"📊 Final training loss: {trainer_stats.training_loss:.4f}")
print(f"⏱️ Training time: {training_time/60:.1f} minutes")

⚙️ Setting up training for Llama 3.1 8B Instruct...


✅ Wandb initialized: llama31-8b-therapist-0629-0457


Unsloth: Tokenizing ["text"]:   0%|          | 0/14475 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"]:   0%|          | 0/1609 [00:00<?, ? examples/s]

✅ Training setup complete for Llama 3.1 8B!
📊 Effective batch size: 16
📊 Total training steps: 2,712
📊 Estimated training time: ~135 minutes
📊 Using precision: BF16
📊 Memory optimization: 4-bit + LoRA

🚀 Starting Llama 3.1 8B Instruct training...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 14,475 | Num Epochs = 3 | Total steps = 2,715
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 41,943,040/8,000,000,000 (0.52% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss
50,0.9102,0.916632
100,0.846,0.865038
150,0.8432,0.840775
200,0.834,0.827225
250,0.8732,0.815737
300,0.8073,0.809977
350,0.7946,0.801039
400,0.7764,0.79356
450,0.8099,0.788964
500,0.8136,0.784226


Unsloth: Not an error, but LlamaForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient



🎉 Llama 3.1 8B training completed successfully!
📊 Final training loss: 0.7210
⏱️ Training time: 307.9 minutes


In [17]:
print("🔄 Trying alternative save method with smaller file chunks...")

# Try saving with smaller shards to avoid large file upload issues
import datetime
timestamp = datetime.datetime.now().strftime("%H%M%S")
new_save_path = f"/content/drive/MyDrive/llama_31_therapist_outputs/llama31_merged_v{timestamp}"

print(f"💾 Attempting save to: {new_save_path}")

try:
    # Save with smaller max shard size to avoid large file issues
    model.save_pretrained_merged(
        new_save_path,
        tokenizer,
        save_method="merged_16bit",
        max_shard_size="2GB"  # Smaller chunks instead of 5GB files
    )
    print("✅ Save with smaller shards completed!")

except Exception as e:
    print(f"❌ Save failed with error: {e}")
    print("Let's try 4-bit method instead...")

    # Fallback to 4-bit (smaller files)
    try:
        model.save_pretrained_merged(
            f"{new_save_path}_4bit",
            tokenizer,
            save_method="merged_4bit"
        )
        print("✅ 4-bit save completed!")
    except Exception as e2:
        print(f"❌ 4-bit save also failed: {e2}")

🔄 Trying alternative save method with smaller file chunks...
💾 Attempting save to: /content/drive/MyDrive/llama_31_therapist_outputs/llama31_merged_v104952
Found HuggingFace hub cache directory: /root/.cache/huggingface/hub
Checking cache directory for required files...
Cache check failed: model-00001-of-00004.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.
Downloading safetensors index for unsloth/Meta-Llama-3.1-8B-Instruct...


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Unsloth: Merging weights into 16bit:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit:  25%|██▌       | 1/4 [00:37<01:52, 37.64s/it]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit:  50%|█████     | 2/4 [01:36<01:39, 49.95s/it]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit:  75%|███████▌  | 3/4 [02:21<00:47, 47.88s/it]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit: 100%|██████████| 4/4 [02:28<00:00, 37.05s/it]


✅ Save with smaller shards completed!


In [18]:
import os

# Check the new timestamped model folder
model_path = "/content/drive/MyDrive/llama_31_therapist_outputs/llama31_merged_v104952"

print("🔍 Verifying the new save...")
print("=" * 50)

if os.path.exists(model_path):
    files = os.listdir(model_path)

    # Look for the model files
    safetensors_files = [f for f in files if f.endswith('.safetensors') and 'model-' in f]

    print(f"📁 Model directory: {model_path}")
    print(f"🔢 Safetensors model files found: {len(safetensors_files)}")
    print()

    # Check each file and its size
    print("📋 File verification:")
    total_size_gb = 0

    for file in sorted(safetensors_files):
        file_path = os.path.join(model_path, file)
        if os.path.exists(file_path):
            size_gb = os.path.getsize(file_path) / (1024**3)
            total_size_gb += size_gb
            print(f"  ✅ {file}: {size_gb:.2f} GB")
        else:
            print(f"  ❌ {file}: MISSING!")

    print(f"\n💾 Total model size: {total_size_gb:.2f} GB")

    if len(safetensors_files) >= 4:
        print("\n🎉 SUCCESS: All model files appear to be present!")
        print("🚀 Your fine-tuned model should be complete!")
    else:
        print(f"\n⚠️  Found {len(safetensors_files)} files")

else:
    print(f"❌ Model directory not found: {model_path}")

🔍 Verifying the new save...
📁 Model directory: /content/drive/MyDrive/llama_31_therapist_outputs/llama31_merged_v104952
🔢 Safetensors model files found: 4

📋 File verification:
  ✅ model-00001-of-00004.safetensors: 4.63 GB
  ✅ model-00002-of-00004.safetensors: 4.66 GB
  ✅ model-00003-of-00004.safetensors: 4.58 GB
  ✅ model-00004-of-00004.safetensors: 1.09 GB

💾 Total model size: 14.96 GB

🎉 SUCCESS: All model files appear to be present!
🚀 Your fine-tuned model should be complete!


In [19]:
import json

print("🧪 Loading test dataset (NickyNicky/nlp-mental-health-conversations)...")

# Load test dataset
test_dataset = load_dataset(CONFIG['test_dataset_name'])

# Enable inference mode
FastLanguageModel.for_inference(model)

print("🧪 Generating responses on test dataset...")

# Sample a subset for testing (to avoid overwhelming output)
test_samples = test_dataset['train'].shuffle(seed=999).select(range(10))  # Test on 10 samples

responses = []
for i, example in enumerate(test_samples):
    context = example.get('Context', '').strip()
    expected_response = example.get('Response', '').strip()

    print(f"Processing test sample {i+1}/10...")

    # Format prompt specifically for Llama 3.1 Instruct
    formatted_prompt = f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful, empathetic mental health assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{context}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"

    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=CONFIG['generation_config']['max_new_tokens'],
            temperature=CONFIG['generation_config']['temperature'],
            do_sample=CONFIG['generation_config']['do_sample'],
            top_p=CONFIG['generation_config']['top_p'],
            repetition_penalty=CONFIG['generation_config']['repetition_penalty'],
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Extract only the assistant's response
    assistant_response = full_response.split("<|start_header_id|>assistant<|end_header_id|>")[-1].strip()

    responses.append({
        'context': context,
        'expected_response': expected_response,
        'generated_response': assistant_response,
        'model': 'Llama-3.1-8B-Instruct-Therapist'
    })

# Save test responses
with open(f"{CONFIG['output_dir']}/test_responses_nickynicky.json", 'w', encoding='utf-8') as f:
    json.dump(responses, f, indent=2, ensure_ascii=False)

print(f"\n✅ Test responses saved to: {CONFIG['output_dir']}/test_responses_nickynicky.json")

🧪 Loading test dataset (NickyNicky/nlp-mental-health-conversations)...
🧪 Generating responses on test dataset...
Processing test sample 1/10...
Processing test sample 2/10...
Processing test sample 3/10...
Processing test sample 4/10...
Processing test sample 5/10...
Processing test sample 6/10...
Processing test sample 7/10...
Processing test sample 8/10...
Processing test sample 9/10...
Processing test sample 10/10...

✅ Test responses saved to: /content/drive/MyDrive/llama_31_therapist_outputs/test_responses_nickynicky.json


In [20]:
import json

# Read and display the file
with open(f"{CONFIG['output_dir']}/test_responses_nickynicky.json", 'r', encoding='utf-8') as f:
    data = json.load(f)

# Pretty print the results
for i, response in enumerate(data):
    print(f"\n=== Test Sample {i+1} ===")
    print(f"Context: {response['context']}")
    print(f"\nExpected: {response['expected_response']}")
    print(f"\nGenerated: {response['generated_response']}")
    print("-" * 80)


=== Test Sample 1 ===
Context: I keep having these random thoughts that I don't want.  Things like "you aren't worth anything."  I know they're my own thoughts but it feels like someone else is saying it.

What is wrong with me, and how can I stop having these thoughts?

Expected: Talking to a licensed profession who can discuss this in greater depth can be best. As a general information, in short, I can say that our thoughts are greatly influenced by our early life experiences. Our thoughts are processed through schemes, these are mental images or templates by which we make meaning of the world around us.  While our upbringing has a great influence on the way we see and interact with the world around us as adults, However, we are not condemned to abide by them for life, in psychotherapy, you learn to change negative schemas with positive ones. Yeah, if you had less than optimal childhood you would have some sort of negative schemas that unconsciously lead to self-sabotage your effo

In [22]:
# Add the correct datetime import
from datetime import datetime

# Training summary with updated paths
summary = {
    'model': 'Llama-3.1-8B-Instruct',
    'timestamp': datetime.now().isoformat(),
    'training_time_minutes': training_time / 60,
    'final_training_loss': float(trainer_stats.training_loss),
    'total_parameters': total_params,
    'trainable_parameters': trainable_params,
    'trainable_percentage': 100 * trainable_params / total_params,
    'training_samples': len(train_dataset),
    'validation_samples': len(eval_dataset),
    'effective_batch_size': effective_batch_size,
    'total_training_steps': total_steps,
    'gpu_used': torch.cuda.get_device_name(),
    'max_sequence_length': CONFIG['max_seq_length'],
    'learning_rate': CONFIG['learning_rate'],
    'lora_rank': CONFIG['lora_r'],
    'train_dataset': CONFIG['train_dataset_name'],
    'test_dataset': CONFIG['test_dataset_name'],
    'config': CONFIG
}

# Save to the updated output directory
with open(f"{CONFIG['output_dir']}/llama31_training_summary.json", 'w') as f:
    json.dump(summary, f, indent=2)

# Log final metrics to wandb
if CONFIG['use_wandb']:
    wandb.log({
        "training/final_loss": trainer_stats.training_loss,
        "training/total_time_minutes": training_time / 60,
        "training/trainable_params": trainable_params,
        "training/total_params": total_params,
        "training/model": "Llama-3.1-8B-Instruct",
        "training/train_dataset": CONFIG['train_dataset_name'],
        "training/test_dataset": CONFIG['test_dataset_name'],
        "training/status": "completed"
    })

    # Log test responses as table
    test_table = wandb.Table(columns=["Context", "Expected", "Generated"])
    for resp in responses:
        test_table.add_data(resp['context'], resp['expected_response'], resp['generated_response'])
    wandb.log({"test_responses": test_table})

print("\n" + "=" * 60)
print("🎉 LLAMA 3.1 8B THERAPIST TRAINING COMPLETED! 🎉")
print("=" * 60)
print(f"🦙 Model: Llama 3.1 8B Instruct")
print(f"📚 Training dataset: {CONFIG['train_dataset_name']}")
print(f"🧪 Testing dataset: {CONFIG['test_dataset_name']}")
print(f"⏱️ Total training time: {training_time/60:.1f} minutes")
print(f"📊 Final training loss: {trainer_stats.training_loss:.4f}")
print(f"🧠 Trainable parameters: {trainable_params:,} ({100 * trainable_params / total_params:.2f}%)")
print(f"📁 All files saved to: {CONFIG['output_dir']}")
print(f"🔗 Check your Google Drive at: /content/drive/MyDrive/llama_31_therapist_outputs/")
print(f"💾 LoRA model: llama31_lora_model/")
print(f"💾 Merged model: llama31_merged_v104952/")  # Updated to match new naming
print(f"📊 Test results: test_responses_nickynicky.json")

# Clear memory
gc.collect()
torch.cuda.empty_cache()
print("\n✅ Memory cleared. Llama 3.1 training complete!")


🎉 LLAMA 3.1 8B THERAPIST TRAINING COMPLETED! 🎉
🦙 Model: Llama 3.1 8B Instruct
📚 Training dataset: ShenLab/MentalChat16K
🧪 Testing dataset: NickyNicky/nlp-mental-health-conversations
⏱️ Total training time: 307.9 minutes
📊 Final training loss: 0.7210
🧠 Trainable parameters: 41,943,040 (0.92%)
📁 All files saved to: /content/drive/MyDrive/llama_31_therapist_outputs
🔗 Check your Google Drive at: /content/drive/MyDrive/llama_31_therapist_outputs/
💾 LoRA model: llama31_lora_model/
💾 Merged model: llama31_merged_v104952/
📊 Test results: test_responses_nickynicky.json

✅ Memory cleared. Llama 3.1 training complete!
