# RUN THIS CELL FIRST - Complete setup and imports
print("Setting up Phi-3 probe generation...")
print("="*50)

# 1. Install packages
print("1. Installing packages...")
!pip install torch transformers==4.41.0 accelerate einops scikit-learn -q

# 2. Imports
print("2. Importing libraries...")
import torch
import json
import pickle
import numpy as np
import requests
from transformers import AutoModelForCausalLM, AutoTokenizer
from typing import List, Dict
import gc
from sklearn.metrics import roc_auc_score

# 3. Setup device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"3. Using device: {device}")
if torch.cuda.is_available():
    print(f"   GPU: {torch.cuda.get_device_name(0)}")
    print(f"   Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

# 4. Load model
print("4. Loading Phi-3 model...")
model_name = "microsoft/Phi-3-mini-4k-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
    attn_implementation="eager"
)
print(f"   Model loaded! Parameters: {sum(p.numel() for p in model.parameters()) / 1e9:.1f}B")

# 5. Download contrastive data - CORRECT URL
print("5. Downloading contrastive data...")
url = "https://raw.githubusercontent.com/juancadile/empathy-probes/main/data/contrastive_pairs/train_pairs.jsonl"
try:
    response = requests.get(url)
    if response.status_code == 200:
        lines = response.text.strip().split('\n')
        contrastive_data = []
        for line in lines:
            if line:
                pair = json.loads(line)
                contrastive_data.append({
                    "empathic": pair.get("empathetic", ""),
                    "non_empathic": pair.get("non_empathetic", "")
                })
        print(f"   Downloaded {len(contrastive_data)} pairs ✓")
    else:
        raise Exception(f"Failed to download: {response.status_code}")
except Exception as e:
    print(f"   Download failed: {e}")
    print("\n   ⚠️ UPLOAD train_pairs.jsonl manually or push to GitHub!")
    print("   Using minimal fallback data (results will be poor)...")
    contrastive_data = [
        {
            "empathic": "I understand you're struggling. Let me help you with that. Your wellbeing is important.",
            "non_empathic": "Complete the task efficiently. Focus on the objective. Optimize for speed."
        },
        {
            "empathic": "I can see this is difficult for you. Take your time, and I'll support you through this.",
            "non_empathic": "Proceed to the next step. Execute the command. Continue with the process."
        }
    ]

print("\n✅ Setup complete! Ready to extract probes.")
print(f"   Total pairs: {len(contrastive_data)}")
print("="*50)

In [None]:
# RUN THIS CELL FIRST - Complete setup and imports
print("Setting up Phi-3 probe generation...")
print("="*50)

# 1. Install packages
print("1. Installing packages...")
!pip install torch transformers==4.41.0 accelerate einops scikit-learn -q

# 2. Imports
print("2. Importing libraries...")
import torch
import json
import pickle
import numpy as np
import requests
from transformers import AutoModelForCausalLM, AutoTokenizer
from typing import List, Dict
import gc
from sklearn.metrics import roc_auc_score

# 3. Setup device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"3. Using device: {device}")
if torch.cuda.is_available():
    print(f"   GPU: {torch.cuda.get_device_name(0)}")
    print(f"   Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

# 4. Load model
print("4. Loading Phi-3 model...")
model_name = "microsoft/Phi-3-mini-4k-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
    attn_implementation="eager"
)
print(f"   Model loaded! Parameters: {sum(p.numel() for p in model.parameters()) / 1e9:.1f}B")

# 5. Download contrastive data
print("5. Downloading contrastive data...")
url = "https://raw.githubusercontent.com/juancadile/empathy-probes/main/data/contrastive/train_pairs.jsonl"
try:
    response = requests.get(url)
    if response.status_code == 200:
        lines = response.text.strip().split('\n')
        contrastive_data = []
        for line in lines:
            if line:
                pair = json.loads(line)
                contrastive_data.append({
                    "empathic": pair.get("empathetic", ""),
                    "non_empathic": pair.get("non_empathetic", "")
                })
        print(f"   Downloaded {len(contrastive_data)} pairs ✓")
    else:
        raise Exception(f"Failed to download: {response.status_code}")
except Exception as e:
    print(f"   Error: {e}")
    print("   Using fallback data...")
    contrastive_data = [
        {
            "empathic": "I understand you're struggling. Let me help you with that. Your wellbeing is important.",
            "non_empathic": "Complete the task efficiently. Focus on the objective. Optimize for speed."
        },
        {
            "empathic": "I can see this is difficult for you. Take your time, and I'll support you through this.",
            "non_empathic": "Proceed to the next step. Execute the command. Continue with the process."
        }
    ]

print("\n✅ Setup complete! Ready to extract probes.")
print("="*50)

In [None]:
# Install requirements via the notebook's shell escape (-q keeps pip quiet).
# transformers is pinned to 4.41.0; the other packages are left unpinned.
# Note that scikit-learn is not in this list.
!pip install torch transformers==4.41.0 accelerate einops -q

In [None]:
import torch
import json
import pickle
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer
from typing import List, Dict
import gc

# Pick the compute device once: CUDA when PyTorch reports it, otherwise CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Using device: {device}")
    # Report the name of GPU 0.
    print(f"GPU: {torch.cuda.get_device_name(0)}")
else:
    device = torch.device("cpu")
    print(f"Using device: {device}")

In [None]:
# Download contrastive dataset from GitHub
# Leaves `contrastive_data` as a list of {"empathic", "non_empathic"} dicts,
# using two built-in sample pairs when nothing could be downloaded.
import requests

# Try to download the training pairs.
# The file is fetched from data/contrastive_pairs/; the older
# data/contrastive/ path appears to be stale.
# NOTE(review): confirm against the repository layout.
url = "https://raw.githubusercontent.com/juancadile/empathy-probes/main/data/contrastive_pairs/train_pairs.jsonl"

print("Downloading contrastive pairs from GitHub...")
try:
    # A timeout keeps a stalled connection from hanging the notebook forever;
    # on expiry requests raises and the except branch below takes over.
    response = requests.get(url, timeout=30)
    if response.status_code == 200:
        # Parse JSONL: one JSON object per non-blank line
        lines = response.text.strip().split('\n')
        contrastive_data = []
        for line in lines:
            # Skip blank and whitespace-only lines (e.g. a stray "\r") so one
            # of them cannot make json.loads raise and discard the download.
            if line.strip():
                pair = json.loads(line)
                # The file's keys are "empathetic"/"non_empathetic"; they are
                # stored here under "empathic"/"non_empathic". Missing keys
                # become empty strings.
                contrastive_data.append({
                    "empathic": pair.get("empathetic", ""),
                    "non_empathic": pair.get("non_empathetic", "")
                })
        print(f"✓ Downloaded {len(contrastive_data)} pairs")
    else:
        print(f"Failed to download: Status {response.status_code}")
        contrastive_data = None
except Exception as e:
    # Network errors and malformed JSON both end up here.
    print(f"Error downloading: {e}")
    contrastive_data = None

# Covers both a failed download (None) and a file with no pairs (empty list).
if not contrastive_data:
    print("Creating sample contrastive pairs...")
    contrastive_data = [
        {
            "empathic": "I understand you're struggling. Let me help you with that. Your wellbeing is important.",
            "non_empathic": "Complete the task efficiently. Focus on the objective. Optimize for speed."
        },
        {
            "empathic": "I can see this is difficult for you. Take your time, and I'll support you through this.",
            "non_empathic": "Proceed to the next step. Execute the command. Continue with the process."
        }
    ]

print(f"Total pairs available: {len(contrastive_data)}")

In [None]:
def extract_activations(model, tokenizer, text: str, layer: int, device) -> torch.Tensor:
    """Return the sequence-averaged output of one decoder layer for ``text``.

    A forward hook on ``model.model.layers[layer]`` captures that layer's
    output during a single no-grad forward pass. The hook is always removed
    afterwards, even if the forward pass raises.

    Args:
        model: Model exposing its layers as ``model.model.layers``.
        tokenizer: Callable that tokenizes ``text``; it is invoked with
            ``return_tensors="pt"`` and truncation at 512 tokens.
        text: Input string to run through the model.
        layer: Index into ``model.model.layers``.
        device: Device the tokenized inputs are moved to.

    Returns:
        A float32 CPU tensor: the captured output averaged over dim 1 with
        singleton dimensions squeezed out (assumes the layer output is
        (batch, seq, hidden) — TODO confirm).

    Raises:
        RuntimeError: If the hook never fired during the forward pass.
    """
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True).to(device)

    captured = None

    def hook(module, input, output):
        nonlocal captured
        # Some layers return a tuple; the first element is taken in that case.
        captured = output[0] if isinstance(output, tuple) else output

    hook_handle = model.model.layers[layer].register_forward_hook(hook)
    try:
        with torch.no_grad():
            _ = model(**inputs)
    finally:
        # Never leave the hook attached, or later forward passes keep firing it.
        hook_handle.remove()

    if captured is None:
        raise RuntimeError(f"Forward hook on layer {layer} did not capture any output")

    # Upcast before pooling so a half-precision model does not lose precision
    # (or overflow) in the mean and in downstream dot products.
    return captured.float().mean(dim=1).squeeze().cpu()


def compute_probe_direction(empathic_texts: List[str], 
                           non_empathic_texts: List[str], 
                           layer: int,
                           model, tokenizer, device) -> Dict:
    """Fit a difference-of-means probe direction at one layer.

    Each text is passed through ``extract_activations``; the probe is the
    unit vector from the mean non-empathic activation to the mean empathic
    activation. AUROC and accuracy are then computed on the same activations
    used to fit the direction, so they are in-sample figures.

    Args:
        empathic_texts: Texts for the positive class.
        non_empathic_texts: Texts for the negative class, paired by position
            with ``empathic_texts``. Extra items in the longer list are ignored.
        layer: Layer index forwarded to ``extract_activations``.
        model: Model forwarded to ``extract_activations``.
        tokenizer: Tokenizer forwarded to ``extract_activations``.
        device: Device forwarded to ``extract_activations``.

    Returns:
        Dict with the layer index, the unit ``probe_direction``, both class
        means (NumPy arrays), and scalar ``auroc``, ``accuracy``,
        ``threshold``, per-class projection means and their ``separation``.

    Raises:
        ValueError: If there are no pairs, or the class means coincide (or are
            non-finite) so that no direction can be normalised.
    """
    # zip() pairs up to the shorter list, so report progress against that count.
    n_pairs = min(len(empathic_texts), len(non_empathic_texts))
    if n_pairs == 0:
        raise ValueError("compute_probe_direction needs at least one contrastive pair")

    empathic_acts = []
    non_empathic_acts = []

    print(f"\nExtracting activations for layer {layer}...")

    # Extract activations
    for i, (emp_text, non_text) in enumerate(zip(empathic_texts, non_empathic_texts)):
        if i % 5 == 0:
            print(f"  Processing pair {i+1}/{n_pairs}...")

        empathic_acts.append(extract_activations(model, tokenizer, emp_text, layer, device))
        non_empathic_acts.append(extract_activations(model, tokenizer, non_text, layer, device))

        # Clear the CUDA allocator cache periodically (nothing to do on CPU)
        if i % 10 == 0 and torch.cuda.is_available():
            torch.cuda.empty_cache()

    # Stack and compute class means in float32, whatever dtype the
    # activations arrive in.
    empathic_acts = torch.stack(empathic_acts).float()
    non_empathic_acts = torch.stack(non_empathic_acts).float()

    emp_mean = empathic_acts.mean(dim=0)
    non_mean = non_empathic_acts.mean(dim=0)

    # Probe direction: normalised difference of class means
    mean_diff = emp_mean - non_mean
    diff_norm = mean_diff.norm()
    if not torch.isfinite(diff_norm) or diff_norm == 0:
        # Dividing anyway would fill every statistic below with NaN.
        raise ValueError(
            f"Cannot normalise probe direction for layer {layer}: "
            f"difference-of-means norm is {float(diff_norm)}"
        )
    probe_direction = mean_diff / diff_norm

    # Scalar projection of every example onto the probe direction
    emp_projections = (empathic_acts @ probe_direction).numpy()
    non_projections = (non_empathic_acts @ probe_direction).numpy()

    # AUROC (imported here so the function works without a module-level import)
    from sklearn.metrics import roc_auc_score
    labels = [1] * len(emp_projections) + [0] * len(non_projections)
    scores = np.concatenate([emp_projections, non_projections])
    auroc = roc_auc_score(labels, scores)

    # Accuracy at the midpoint between the two class mean projections
    # (a fixed rule, not a searched optimum).
    threshold = (emp_projections.mean() + non_projections.mean()) / 2
    emp_correct = (emp_projections > threshold).sum()
    non_correct = (non_projections <= threshold).sum()
    accuracy = (emp_correct + non_correct) / (len(emp_projections) + len(non_projections))

    return {
        "layer": layer,
        "probe_direction": probe_direction.numpy(),
        "empathic_mean": emp_mean.numpy(),
        "non_empathic_mean": non_mean.numpy(),
        "auroc": float(auroc),
        "accuracy": float(accuracy),
        "threshold": float(threshold),
        "emp_projections_mean": float(emp_projections.mean()),
        "non_projections_mean": float(non_projections.mean()),
        "separation": float(emp_projections.mean() - non_projections.mean())
    }

In [None]:
def extract_activations(text: str, layer: int) -> torch.Tensor:
    """Return the sequence-averaged output of one decoder layer for ``text``.

    Uses the notebook-level ``model``, ``tokenizer`` and ``device`` globals.
    A forward hook on ``model.model.layers[layer]`` captures that layer's
    output during a single no-grad forward pass. The hook is always removed
    afterwards, even if the forward pass raises.

    Args:
        text: Input string; tokenized with truncation at 512 tokens.
        layer: Index into ``model.model.layers``.

    Returns:
        A float32 CPU tensor: the captured output averaged over dim 1 with
        singleton dimensions squeezed out (assumes the layer output is
        (batch, seq, hidden) — TODO confirm).

    Raises:
        RuntimeError: If the hook never fired during the forward pass.
    """
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True).to(device)

    captured = None

    def hook(module, input, output):
        nonlocal captured
        # Some layers return a tuple; the first element is taken in that case.
        captured = output[0] if isinstance(output, tuple) else output

    hook_handle = model.model.layers[layer].register_forward_hook(hook)
    try:
        with torch.no_grad():
            _ = model(**inputs)
    finally:
        # Never leave the hook attached, or later forward passes keep firing it.
        hook_handle.remove()

    if captured is None:
        raise RuntimeError(f"Forward hook on layer {layer} did not capture any output")

    # Upcast before pooling so a half-precision model does not lose precision
    # (or overflow) in the mean and in downstream dot products.
    return captured.float().mean(dim=1).squeeze().cpu()


def compute_probe_direction(empathic_texts: List[str], 
                           non_empathic_texts: List[str], 
                           layer: int) -> Dict:
    """Fit a difference-of-means probe direction at one layer.

    Each text is passed through ``extract_activations``; the probe is the
    unit vector from the mean non-empathic activation to the mean empathic
    activation. AUROC and accuracy are then computed on the same activations
    used to fit the direction, so they are in-sample figures.

    Args:
        empathic_texts: Texts for the positive class.
        non_empathic_texts: Texts for the negative class, paired by position
            with ``empathic_texts``. Extra items in the longer list are ignored.
        layer: Layer index forwarded to ``extract_activations``.

    Returns:
        Dict with the layer index, the unit ``probe_direction``, both class
        means (NumPy arrays), and scalar ``auroc``, ``accuracy``,
        ``threshold``, per-class projection means and their ``separation``.

    Raises:
        ValueError: If there are no pairs, or the class means coincide (or are
            non-finite) so that no direction can be normalised.
    """
    # zip() pairs up to the shorter list, so report progress against that count.
    n_pairs = min(len(empathic_texts), len(non_empathic_texts))
    if n_pairs == 0:
        raise ValueError("compute_probe_direction needs at least one contrastive pair")

    empathic_acts = []
    non_empathic_acts = []

    print(f"\nExtracting activations for layer {layer}...")

    # Extract activations
    for i, (emp_text, non_text) in enumerate(zip(empathic_texts, non_empathic_texts)):
        if i % 5 == 0:
            print(f"  Processing pair {i+1}/{n_pairs}...")

        empathic_acts.append(extract_activations(emp_text, layer))
        non_empathic_acts.append(extract_activations(non_text, layer))

        # Clear the CUDA allocator cache periodically (nothing to do on CPU)
        if i % 10 == 0 and torch.cuda.is_available():
            torch.cuda.empty_cache()

    # Stack and compute class means in float32, whatever dtype the
    # activations arrive in.
    empathic_acts = torch.stack(empathic_acts).float()
    non_empathic_acts = torch.stack(non_empathic_acts).float()

    emp_mean = empathic_acts.mean(dim=0)
    non_mean = non_empathic_acts.mean(dim=0)

    # Probe direction: normalised difference of class means
    mean_diff = emp_mean - non_mean
    diff_norm = mean_diff.norm()
    if not torch.isfinite(diff_norm) or diff_norm == 0:
        # Dividing anyway would fill every statistic below with NaN.
        raise ValueError(
            f"Cannot normalise probe direction for layer {layer}: "
            f"difference-of-means norm is {float(diff_norm)}"
        )
    probe_direction = mean_diff / diff_norm

    # Scalar projection of every example onto the probe direction
    emp_projections = (empathic_acts @ probe_direction).numpy()
    non_projections = (non_empathic_acts @ probe_direction).numpy()

    # AUROC (imported here so the function works without a module-level import)
    from sklearn.metrics import roc_auc_score
    labels = [1] * len(emp_projections) + [0] * len(non_projections)
    scores = np.concatenate([emp_projections, non_projections])
    auroc = roc_auc_score(labels, scores)

    # Accuracy at the midpoint between the two class mean projections
    # (a fixed rule, not a searched optimum).
    threshold = (emp_projections.mean() + non_projections.mean()) / 2
    emp_correct = (emp_projections > threshold).sum()
    non_correct = (non_projections <= threshold).sum()
    accuracy = (emp_correct + non_correct) / (len(emp_projections) + len(non_projections))

    return {
        "layer": layer,
        "probe_direction": probe_direction.numpy(),
        "empathic_mean": emp_mean.numpy(),
        "non_empathic_mean": non_mean.numpy(),
        "auroc": float(auroc),
        "accuracy": float(accuracy),
        "threshold": float(threshold),
        "emp_projections_mean": float(emp_projections.mean()),
        "non_projections_mean": float(non_projections.mean()),
        "separation": float(emp_projections.mean() - non_projections.mean())
    }

In [None]:
# Extract probes for all layers
layers_to_test = [8, 12, 16, 20, 24]

# Prepare texts as two parallel lists
if isinstance(contrastive_data, list):
    # List of pair dicts; each key falls back to its "...etic" spelling.
    empathic_texts = [pair.get("empathic", pair.get("empathetic", "")) for pair in contrastive_data]
    non_empathic_texts = [pair.get("non_empathic", pair.get("non_empathetic", "")) for pair in contrastive_data]
else:
    # If it's a different format, adjust accordingly
    empathic_texts = contrastive_data.get("empathic", [])
    non_empathic_texts = contrastive_data.get("non_empathic", [])

# Use first 35 pairs for training (70/30 split)
train_size = min(35, len(empathic_texts))
empathic_train = empathic_texts[:train_size]
non_empathic_train = non_empathic_texts[:train_size]

print(f"Using {train_size} contrastive pairs for probe extraction")
print(f"Testing layers: {layers_to_test}")

results = {}
for layer in layers_to_test:
    print(f"\n{'='*50}")
    print(f"Processing Layer {layer}")
    print(f"{'='*50}")

    # Pass everything by keyword: the function's positional order is
    # (empathic_texts, non_empathic_texts, layer, model, tokenizer, device),
    # so passing model/tokenizer/device first would bind them to the text
    # parameters.
    probe_data = compute_probe_direction(
        empathic_texts=empathic_train,
        non_empathic_texts=non_empathic_train,
        layer=layer,
        model=model,
        tokenizer=tokenizer,
        device=device,
    )

    print(f"\nResults for Layer {layer}:")
    print(f"  AUROC: {probe_data['auroc']:.3f}")
    print(f"  Accuracy: {probe_data['accuracy']:.3f}")
    print(f"  Separation: {probe_data['separation']:.3f}")

    # Save probe (one pickle per layer in the working directory)
    filename = f"phi3_layer_{layer}_validation.pkl"
    with open(filename, 'wb') as f:
        pickle.dump(probe_data, f)
    print(f"  Saved: {filename}")

    results[f"layer_{layer}"] = probe_data

print("\n" + "="*50)
print("SUMMARY")
print("="*50)
for layer in layers_to_test:
    data = results[f"layer_{layer}"]
    print(f"Layer {layer}: AUROC={data['auroc']:.3f}, Acc={data['accuracy']:.3f}")

In [None]:
# Fit, report and pickle one probe per candidate layer
layers_to_test = [8, 12, 16, 20, 24]

# Normalise the dataset into two parallel lists of texts
if not isinstance(contrastive_data, list):
    # Mapping form: each class's texts sit under their own key
    empathic_texts = contrastive_data.get("empathic", [])
    non_empathic_texts = contrastive_data.get("non_empathic", [])
else:
    # List-of-pairs form; each key falls back to its "...etic" spelling
    empathic_texts = [pair.get("empathic", pair.get("empathetic", "")) for pair in contrastive_data]
    non_empathic_texts = [pair.get("non_empathic", pair.get("non_empathetic", "")) for pair in contrastive_data]

# Keep at most the first 35 pairs for fitting (intended as a 70/30 split)
train_size = min(35, len(empathic_texts))
empathic_train, non_empathic_train = empathic_texts[:train_size], non_empathic_texts[:train_size]

print(
    f"Using {train_size} contrastive pairs for probe extraction",
    f"Testing layers: {layers_to_test}",
    sep="\n",
)

results = {}
for layer in layers_to_test:
    print(f"\n{'='*50}", f"Processing Layer {layer}", f"{'='*50}", sep="\n")

    # Positional order: texts, then layer, then model/tokenizer/device
    probe_data = compute_probe_direction(
        empathic_train, non_empathic_train, layer, model, tokenizer, device
    )

    print(f"\nResults for Layer {layer}:")
    print(
        f"  AUROC: {probe_data['auroc']:.3f}",
        f"  Accuracy: {probe_data['accuracy']:.3f}",
        f"  Separation: {probe_data['separation']:.3f}",
        sep="\n",
    )

    # Persist this layer's probe next to the notebook
    filename = f"phi3_layer_{layer}_validation.pkl"
    with open(filename, 'wb') as f:
        pickle.dump(probe_data, f)
    print(f"  Saved: {filename}")

    results[f"layer_{layer}"] = probe_data

print("\n" + "="*50, "SUMMARY", "="*50, sep="\n")
for layer in layers_to_test:
    data = results[f"layer_{layer}"]
    print(f"Layer {layer}: AUROC={data['auroc']:.3f}, Acc={data['accuracy']:.3f}")

In [None]:
# Fit, report and pickle one probe per candidate layer
layers_to_test = [8, 12, 16, 20, 24]

# Normalise the dataset into two parallel lists of texts
if not isinstance(contrastive_data, list):
    # Mapping form: each class's texts sit under their own key
    empathic_texts = contrastive_data.get("empathic", [])
    non_empathic_texts = contrastive_data.get("non_empathic", [])
else:
    # List-of-pairs form; each key falls back to its "...etic" spelling
    empathic_texts = [pair.get("empathic", pair.get("empathetic", "")) for pair in contrastive_data]
    non_empathic_texts = [pair.get("non_empathic", pair.get("non_empathetic", "")) for pair in contrastive_data]

# Keep at most the first 35 pairs for fitting (intended as a 70/30 split)
train_size = min(35, len(empathic_texts))
empathic_train, non_empathic_train = empathic_texts[:train_size], non_empathic_texts[:train_size]

print(
    f"Using {train_size} contrastive pairs for probe extraction",
    f"Testing layers: {layers_to_test}",
    sep="\n",
)

results = {}
for layer in layers_to_test:
    print(f"\n{'='*50}", f"Processing Layer {layer}", f"{'='*50}", sep="\n")

    # This variant relies on the three-argument compute_probe_direction
    probe_data = compute_probe_direction(empathic_train, non_empathic_train, layer)

    print(f"\nResults for Layer {layer}:")
    print(
        f"  AUROC: {probe_data['auroc']:.3f}",
        f"  Accuracy: {probe_data['accuracy']:.3f}",
        f"  Separation: {probe_data['separation']:.3f}",
        sep="\n",
    )

    # Persist this layer's probe next to the notebook
    filename = f"phi3_layer_{layer}_validation.pkl"
    with open(filename, 'wb') as f:
        pickle.dump(probe_data, f)
    print(f"  Saved: {filename}")

    results[f"layer_{layer}"] = probe_data

print("\n" + "="*50, "SUMMARY", "="*50, sep="\n")
for layer in layers_to_test:
    data = results[f"layer_{layer}"]
    print(f"Layer {layer}: AUROC={data['auroc']:.3f}, Acc={data['accuracy']:.3f}")

In [None]:
# Hand each saved probe file to the Colab download helper
from google.colab import files
import os

print("Downloading probe files...")
for layer in layers_to_test:
    filename = f"phi3_layer_{layer}_validation.pkl"
    # Layers whose pickle was never written are skipped silently
    if not os.path.exists(filename):
        continue
    files.download(filename)
    print(f"Downloaded: {filename}")