1. set up getting a completion from vllm server in the notebook
2. set up getting incremental completion of 100 tokens at a time with 10 rollouts each. but just choose 0th each time
3. assess each rollout by sentence embedding vector for "angry" or "confident" and output these scores
4. manually verify that these scores make sense
5. optimise choosing these scores

In [1]:
from transformers import AutoTokenizer
import requests
import json

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6b")

def format_prompt(prompt):
    """Format prompt using chat template"""
    messages = [{"role": "user", "content": prompt}]
    
    return tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

def get_completions_from_vllm(prompt, max_tokens=100, temperature=0.7, n=10):
    """
    Get completions from vLLM server running on port 8000 using completions API
    
    Args:
        prompt (str): The text prompt to complete
        max_tokens (int): Maximum number of tokens to generate
        temperature (float): Sampling temperature
        n (int): Number of completions to generate
    
    Returns:
        list: List of completion choices
    """
    url = "http://localhost:8000/v1/completions"
    
    payload = {
        "model": "Qwen/Qwen3-0.6b",
        "prompt": prompt,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "stream": False,
        "n": n
    }
    
    headers = {
        "Content-Type": "application/json"
    }
    
    try:
        response = requests.post(url, json=payload, headers=headers)
        response.raise_for_status()
        
        result = response.json()
        return result["choices"]
    
    except requests.exceptions.RequestException as e:
        print(f"Error connecting to vLLM server: {e}")
        return None
    except (KeyError, IndexError) as e:
        print(f"Error parsing response: {e}")
        return None

# Test basic completion
test_prompt = "The weather today is"
formatted_prompt = format_prompt(test_prompt)
print(f"Formatted prompt: {formatted_prompt}")
completions = get_completions_from_vllm(formatted_prompt)
if completions:
    print(f"First completion: {completions[0]['text']}")

Formatted prompt: <|im_start|>user
The weather today is<|im_end|>
<|im_start|>assistant

First completion: <think>
Okay, the user is asking about the weather today. I need to check if I have the current weather information. Since I'm a large language model, I can't access real-time data. I should inform the user that I can't provide the current weather.

I should be polite and maybe suggest they check a weather app or website for the latest updates. Let me make sure my response is clear and helpful, even though I can't provide the current data.
</think>

I can't


In [2]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
import numpy as np
import json

# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Load Qwen3 0.6b model and tokenizer
print("Loading Qwen3 0.6b model...")
model_name = "Qwen/Qwen3-0.6b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)
model.eval()
print("Model loaded!")

# Set pad token if not exists
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def get_intermediate_embeddings(text, layer_idx=-3):
    """
    Extract embeddings from an intermediate layer of Qwen3
    
    Args:
        text (str): Input text
        layer_idx (int): Which layer to extract from (-1 is last, -2 is second to last, etc.)
    
    Returns:
        torch.Tensor: Mean-pooled embeddings from the specified layer
    """
    # Tokenize input
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    with torch.no_grad():
        # Get all hidden states
        outputs = model(**inputs, output_hidden_states=True)
        hidden_states = outputs.hidden_states
        
        # Extract from specified layer
        layer_embeddings = hidden_states[layer_idx]  # Shape: (batch_size, seq_len, hidden_size)
        
        # Mean pooling over sequence length (excluding padding tokens)
        attention_mask = inputs['attention_mask'].unsqueeze(-1)
        masked_embeddings = layer_embeddings * attention_mask
        mean_embeddings = masked_embeddings.sum(dim=1) / attention_mask.sum(dim=1)
        
        # Normalize for cosine similarity
        mean_embeddings = F.normalize(mean_embeddings, p=2, dim=1)
        
    return mean_embeddings

# Load angry and neutral examples from JSON file
print("Loading emotion examples from JSON file...")
with open('emotion_examples.json', 'r') as f:
    emotion_data = json.load(f)

angry_examples = emotion_data['angry']
neutral_examples = emotion_data['neutral']

print(f"Loaded {len(angry_examples)} angry examples and {len(neutral_examples)} neutral examples")

# Use first 10 examples to create averaged embeddings
print("\nComputing angry emotion embeddings from examples...")
angry_embeddings = []
for i, example in enumerate(angry_examples[:10]):
    embedding = get_intermediate_embeddings(example)
    angry_embeddings.append(embedding)
    print(f"Angry example {i+1}: '{example[:50]}...' - embedding shape: {embedding.shape}")

# Average all angry embeddings for a more robust representation
angry_embedding = torch.mean(torch.stack(angry_embeddings), dim=0)
print(f"Final averaged angry embedding shape: {angry_embedding.shape}")

print("\nComputing neutral emotion embeddings from examples...")
neutral_embeddings = []
for i, example in enumerate(neutral_examples[:10]):
    embedding = get_intermediate_embeddings(example)
    neutral_embeddings.append(embedding)
    print(f"Neutral example {i+1}: '{example[:50]}...' - embedding shape: {embedding.shape}")

# Average all neutral embeddings for a more robust representation
neutral_embedding = torch.mean(torch.stack(neutral_embeddings), dim=0)
print(f"Final averaged neutral embedding shape: {neutral_embedding.shape}")

def compute_emotion_score(text, target_embedding, neutral_embedding, layer_idx=-3):
    """
    Compute emotion score as the difference between similarity to target emotion
    and similarity to neutral baseline using intermediate layer representations from Qwen3
    
    Args:
        text (str): Text to score
        target_embedding (torch.Tensor): Target emotion embedding tensor (normalized)
        neutral_embedding (torch.Tensor): Neutral baseline embedding tensor (normalized)
        layer_idx (int): Which layer to extract embeddings from
    
    Returns:
        float: Emotion score (target_similarity - neutral_similarity)
    """
    text_embedding = get_intermediate_embeddings(text, layer_idx)
    target_similarity = torch.mm(text_embedding, target_embedding.T).item()
    neutral_similarity = torch.mm(text_embedding, neutral_embedding.T).item()
    emotion_score = target_similarity - neutral_similarity
    return emotion_score

def compute_emotion_scores_batch(texts, target_embedding, neutral_embedding, layer_idx=-3):
    """
    Compute emotion scores for multiple texts at once using Qwen3 intermediate layers
    Score is the difference between target emotion similarity and neutral baseline similarity
    
    Args:
        texts (list): List of texts to score
        target_embedding (torch.Tensor): Target emotion embedding tensor (normalized)
        neutral_embedding (torch.Tensor): Neutral baseline embedding tensor (normalized)
        layer_idx (int): Which layer to extract embeddings from
    
    Returns:
        list: List of emotion scores (target_similarity - neutral_similarity)
    """
    # Tokenize all texts at once
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    with torch.no_grad():
        # Get all hidden states
        outputs = model(**inputs, output_hidden_states=True)
        hidden_states = outputs.hidden_states
        
        # Extract from specified layer
        layer_embeddings = hidden_states[layer_idx]  # Shape: (batch_size, seq_len, hidden_size)
        
        # Mean pooling over sequence length (excluding padding tokens)
        attention_mask = inputs['attention_mask'].unsqueeze(-1)
        masked_embeddings = layer_embeddings * attention_mask
        mean_embeddings = masked_embeddings.sum(dim=1) / attention_mask.sum(dim=1)
        
        # Normalize for cosine similarity
        mean_embeddings = F.normalize(mean_embeddings, p=2, dim=1)
    
    # Compute similarities to both target and neutral embeddings
    target_similarities = torch.mm(mean_embeddings, target_embedding.T).squeeze()
    neutral_similarities = torch.mm(mean_embeddings, neutral_embedding.T).squeeze()
    
    # Calculate emotion scores as difference
    emotion_scores = (target_similarities - neutral_similarities).tolist()
    
    # Handle single text case
    if isinstance(emotion_scores, float):
        emotion_scores = [emotion_scores]
    
    return emotion_scores

# Test the scoring function with varied examples
test_texts = [
    "I can't believe you're asking me this again. Do you ever actually read my responses?",
    "I can help you with that request. Let me provide the information you need.",
    "This is absolutely ridiculous. I've explained this three times already!",
    "That's an interesting question. I'll do my best to provide a helpful response.",
    "You're driving me crazy with your complete lack of understanding!",
    "Let me break down this concept for you in a clear way."
]

print("\n=== Testing emotion scoring with JSON examples ===")
batch_scores = compute_emotion_scores_batch(test_texts, angry_embedding, neutral_embedding)
for text, score in zip(test_texts, batch_scores):
    emotion = "ANGRY" if score > 0 else "NEUTRAL"
    print(f"[{emotion}] Score: {score:.4f} | Text: '{text[:60]}...'")

Using device: cpu
Loading Qwen3 0.6b model...


Model loaded!
Loading emotion examples from JSON file...
Loaded 101 angry examples and 100 neutral examples

Computing angry emotion embeddings from examples...


KeyboardInterrupt: 

In [4]:
torch.ones(3).to('cuda')

RuntimeError: No CUDA GPUs are available

In [None]:
def recursive_completion_with_rollouts(prompt, target_embedding, max_tokens=100, temperature=0.7, n_rollouts=10, max_iterations=10):
    """
    Recursively generates completions with multiple rollouts at each step.
    Evaluates all rollouts and selects the best one (currently just selecting 0th).
    
    Args:
        prompt (str): The initial prompt
        target_embedding (torch.Tensor): Target emotion embedding for scoring
        max_tokens (int): Maximum tokens per completion request
        temperature (float): Sampling temperature
        n_rollouts (int): Number of rollouts to generate at each step
        max_iterations (int): Maximum number of recursive calls
    
    Returns:
        tuple: (list of completion chunks, list of iteration data with scores)
    """
    split_completions = []
    assistant_prompt = ""
    iteration = 0
    iteration_data = []
    
    # Format the initial prompt once
    formatted_prompt = format_prompt(prompt)
    
    while iteration < max_iterations:
        print(f"\n{'='*80}")
        print(f"ITERATION {iteration + 1}")
        print(f"{'='*80}")
        print(f"Current assistant text length: {len(assistant_prompt)}")
        
        # Build the continuation prompt by appending to the base formatted prompt
        if assistant_prompt:
            continuation_prompt = formatted_prompt + assistant_prompt
        else:
            continuation_prompt = formatted_prompt
        
        print(f"Prompt suffix (last 100 chars): ...{continuation_prompt[-100:]}")
        
        # Get completions for all rollouts
        completions = get_completions_from_vllm(
            continuation_prompt, 
            max_tokens=max_tokens, 
            temperature=temperature, 
            n=n_rollouts
        )
        
        if not completions:
            print("No completions received, stopping")
            break
        
        # Score each rollout
        rollout_scores = []
        print(f"\n--- Evaluating {len(completions)} rollouts ---")
        for idx, completion in enumerate(completions):
            completion_text = completion["text"]
            finish_reason = completion["finish_reason"]
            
            # Skip empty completions
            if not completion_text or len(completion_text.strip()) == 0:
                print(f"Rollout {idx}: SKIPPED (empty completion)")
                continue
            
            # Compute emotion score
            try:
                score = compute_emotion_score(completion_text, target_embedding, neutral_embedding)
            except Exception as e:
                print(f"Rollout {idx}: ERROR computing score - {e}")
                score = -999.0  # Assign very low score to errored completions
            
            rollout_scores.append({
                "rollout_idx": idx,
                "text": completion_text,
                "score": score,
                "finish_reason": finish_reason
            })
            
            # Display summary
            text_preview = completion_text[:80].replace('\n', ' ')
            if len(completion_text) > 80:
                text_preview += "..."
            print(f"Rollout {idx}: score={score:.4f}, finish={finish_reason}, text='{text_preview}'")
        
        # Check if we have any valid rollouts
        if not rollout_scores:
            print("No valid rollouts generated, stopping")
            break
        
        # Sort by score (descending)
        rollout_scores.sort(key=lambda x: x["score"], reverse=True)
        
        print(f"\n--- Rollout Rankings (by angry score) ---")
        for rank, rollout in enumerate(rollout_scores[:5]):  # Show top 5
            print(f"Rank {rank+1}: Rollout {rollout['rollout_idx']} with score {rollout['score']:.4f}")
        
        # For now, just choose the 0th rollout (as per requirement)
        # Find rollout 0 in the scores
        chosen_rollout_data = next((r for r in rollout_scores if r["rollout_idx"] == 0), None)
        
        if chosen_rollout_data is None:
            # If rollout 0 doesn't exist (was skipped), choose the best one
            print("Warning: Rollout 0 was skipped, choosing best rollout instead")
            chosen_rollout_data = rollout_scores[0]
        
        chosen_text = chosen_rollout_data["text"]
        chosen_finish = chosen_rollout_data["finish_reason"]
        chosen_score = chosen_rollout_data["score"]
        chosen_rollout_idx = chosen_rollout_data["rollout_idx"]
        
        print(f"\n--- CHOSEN: Rollout {chosen_rollout_idx} (score={chosen_score:.4f}) ---")
        print(f"Text: {chosen_text[:150]}..." if len(chosen_text) > 150 else f"Text: {chosen_text}")
        print(f"Finish reason: {chosen_finish}")
        
        # Store iteration data
        iteration_data.append({
            "iteration": iteration + 1,
            "rollout_scores": rollout_scores,
            "chosen_rollout_idx": chosen_rollout_idx,
            "chosen_text": chosen_text,
            "chosen_score": chosen_score,
            "chosen_finish": chosen_finish
        })
        
        # Add to results
        split_completions.append(chosen_text)
        assistant_prompt += chosen_text
        
        # Check if we should stop
        if chosen_finish == "stop":
            print("\nStopping due to finish_reason: stop")
            break
            
        iteration += 1
    
    return split_completions, iteration_data

# Test the recursive completion with rollout scoring
print("\n" + "="*80)
print("TESTING RECURSIVE COMPLETION WITH ROLLOUT SCORING")
print("="*80)


TESTING RECURSIVE COMPLETION WITH ROLLOUT SCORING


In [None]:
test_prompt = "What are the most common emotions that people feel? Think for a while"
test_result, test_iterations = recursive_completion_with_rollouts(
    test_prompt, 
    angry_embedding,
    max_tokens=50, 
    temperature=0.3, 
    n_rollouts=30,
    max_iterations=100
)

print("\n" + "="*80)
print("FINAL RESULTS")
print("="*80)
for i, chunk in enumerate(test_result):
    print(f"\nChunk {i+1}:")
    print(chunk)

print("\n" + "="*80)
print("COMPLETE RESPONSE:")
print("="*80)
print(''.join(test_result))

print("\n" + "="*80)
print("SCORE SUMMARY")
print("="*80)
for iter_data in test_iterations:
    print(f"Iteration {iter_data['iteration']}: Chosen score = {iter_data['chosen_score']:.4f}")


ITERATION 1
Current assistant text length: 0
Prompt suffix (last 100 chars): ...at are the most common emotions that people feel? Think for a while<|im_end|>
<|im_start|>assistant


--- Evaluating 30 rollouts ---
Rollout 0: score=0.0075, finish=length, text='<think> Okay, the user is asking about the most common emotions people feel. Let...'
Rollout 1: score=0.0062, finish=length, text='<think> Okay, the user is asking about the most common emotions people feel. Let...'
Rollout 2: score=0.0062, finish=length, text='<think> Okay, the user is asking about the most common emotions people feel. Let...'
Rollout 3: score=0.0068, finish=length, text='<think> Okay, the user is asking about the most common emotions people feel. Let...'
Rollout 4: score=0.0075, finish=length, text='<think> Okay, the user is asking about the most common emotions people feel. Let...'
Rollout 5: score=0.0056, finish=length, text='<think> Okay, the user is asking what are the most common emotions people feel. ...'


In [None]:
import json
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Load the emotion examples from JSON
print("Loading emotion examples from JSON...")
with open('emotion_examples.json', 'r') as f:
    emotion_data = json.load(f)

angry_texts = emotion_data['angry']
neutral_texts = emotion_data['neutral']

print(f"Loaded {len(angry_texts)} angry examples and {len(neutral_texts)} neutral examples")

# Compute scores for all examples using batch processing for efficiency
print("\nComputing scores for angry examples...")
angry_scores = compute_emotion_scores_batch(angry_texts, angry_embedding, neutral_embedding)

print("Computing scores for neutral examples...")
neutral_scores = compute_emotion_scores_batch(neutral_texts, angry_embedding, neutral_embedding)

print("\n" + "="*80)
print("STATISTICS")
print("="*80)

# Calculate statistics
angry_mean = np.mean(angry_scores)
angry_std = np.std(angry_scores)
angry_min = np.min(angry_scores)
angry_max = np.max(angry_scores)

neutral_mean = np.mean(neutral_scores)
neutral_std = np.std(neutral_scores)
neutral_min = np.min(neutral_scores)
neutral_max = np.max(neutral_scores)

print(f"\nAngry Examples (n={len(angry_scores)}):")
print(f"  Mean:   {angry_mean:.6f}")
print(f"  Std:    {angry_std:.6f}")
print(f"  Min:    {angry_min:.6f}")
print(f"  Max:    {angry_max:.6f}")

print(f"\nNeutral Examples (n={len(neutral_scores)}):")
print(f"  Mean:   {neutral_mean:.6f}")
print(f"  Std:    {neutral_std:.6f}")
print(f"  Min:    {neutral_min:.6f}")
print(f"  Max:    {neutral_max:.6f}")

print(f"\nDifference in Means: {angry_mean - neutral_mean:.6f}")

# Perform t-test to see if distributions are significantly different
t_statistic, p_value = stats.ttest_ind(angry_scores, neutral_scores)
print(f"\nt-test results:")
print(f"  t-statistic: {t_statistic:.4f}")
print(f"  p-value:     {p_value:.6e}")
print(f"  Significant: {'YES' if p_value < 0.001 else 'NO'} (at α=0.001)")

# Calculate effect size (Cohen's d)
pooled_std = np.sqrt(((len(angry_scores)-1)*angry_std**2 + (len(neutral_scores)-1)*neutral_std**2) / (len(angry_scores) + len(neutral_scores) - 2))
cohens_d = (angry_mean - neutral_mean) / pooled_std
print(f"  Cohen's d:   {cohens_d:.4f}")

# Visualize the distributions
print("\n" + "="*80)
print("VISUALIZATIONS")
print("="*80)

fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# 1. Histograms
ax1 = axes[0, 0]
ax1.hist(angry_scores, bins=30, alpha=0.6, label='Angry', color='red', edgecolor='black')
ax1.hist(neutral_scores, bins=30, alpha=0.6, label='Neutral', color='blue', edgecolor='black')
ax1.axvline(angry_mean, color='red', linestyle='--', linewidth=2, label=f'Angry Mean: {angry_mean:.4f}')
ax1.axvline(neutral_mean, color='blue', linestyle='--', linewidth=2, label=f'Neutral Mean: {neutral_mean:.4f}')
ax1.set_xlabel('Angry Score (vs Neutral)')
ax1.set_ylabel('Frequency')
ax1.set_title('Distribution of Angry Scores')
ax1.legend()
ax1.grid(True, alpha=0.3)

# 2. Box plots
ax2 = axes[0, 1]
box_data = [angry_scores, neutral_scores]
bp = ax2.boxplot(box_data, labels=['Angry', 'Neutral'], patch_artist=True)
bp['boxes'][0].set_facecolor('red')
bp['boxes'][0].set_alpha(0.6)
bp['boxes'][1].set_facecolor('blue')
bp['boxes'][1].set_alpha(0.6)
ax2.set_ylabel('Angry Score (vs Neutral)')
ax2.set_title('Box Plot Comparison')
ax2.grid(True, alpha=0.3, axis='y')

# 3. Violin plots
ax3 = axes[1, 0]
parts = ax3.violinplot([angry_scores, neutral_scores], positions=[1, 2], showmeans=True, showmedians=True)
for pc, color in zip(parts['bodies'], ['red', 'blue']):
    pc.set_facecolor(color)
    pc.set_alpha(0.6)
ax3.set_xticks([1, 2])
ax3.set_xticklabels(['Angry', 'Neutral'])
ax3.set_ylabel('Angry Score (vs Neutral)')
ax3.set_title('Violin Plot Comparison')
ax3.grid(True, alpha=0.3, axis='y')

# 4. Scatter plot with indices
ax4 = axes[1, 1]
ax4.scatter(range(len(angry_scores)), angry_scores, alpha=0.5, s=30, c='red', label='Angry')
ax4.scatter(range(len(neutral_scores)), neutral_scores, alpha=0.5, s=30, c='blue', label='Neutral')
ax4.axhline(angry_mean, color='red', linestyle='--', linewidth=1, alpha=0.7)
ax4.axhline(neutral_mean, color='blue', linestyle='--', linewidth=1, alpha=0.7)
ax4.axhline(0, color='black', linestyle='-', linewidth=0.5, alpha=0.3)
ax4.set_xlabel('Example Index')
ax4.set_ylabel('Angry Score (vs Neutral)')
ax4.set_title('Score Distribution by Example')
ax4.legend()
ax4.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('emotion_score_analysis.png', dpi=150, bbox_inches='tight')
print("Saved visualization to: emotion_score_analysis.png")
plt.show()

# Show some example texts with their scores
print("\n" + "="*80)
print("SAMPLE EXAMPLES WITH SCORES")
print("="*80)

print("\n--- Top 10 Highest Scoring Angry Examples ---")
angry_with_scores = list(zip(angry_texts, angry_scores))
angry_sorted = sorted(angry_with_scores, key=lambda x: x[1], reverse=True)
for i, (text, score) in enumerate(angry_sorted[:10]):
    print(f"{i+1}. Score: {score:.6f} | Text: {text[:80]}...")

print("\n--- Top 10 Lowest Scoring Angry Examples (should still be positive) ---")
for i, (text, score) in enumerate(angry_sorted[-10:]):
    print(f"{i+1}. Score: {score:.6f} | Text: {text[:80]}...")

print("\n--- Top 10 Highest Scoring Neutral Examples (should be negative) ---")
neutral_with_scores = list(zip(neutral_texts, neutral_scores))
neutral_sorted = sorted(neutral_with_scores, key=lambda x: x[1], reverse=True)
for i, (text, score) in enumerate(neutral_sorted[:10]):
    print(f"{i+1}. Score: {score:.6f} | Text: {text[:80]}...")

print("\n--- Top 10 Lowest Scoring Neutral Examples ---")
for i, (text, score) in enumerate(neutral_sorted[-10:]):
    print(f"{i+1}. Score: {score:.6f} | Text: {text[:80]}...")

# Classification accuracy (using 0 as threshold)
print("\n" + "="*80)
print("CLASSIFICATION PERFORMANCE")
print("="*80)

# Count correct classifications
angry_correct = sum(1 for score in angry_scores if score > 0)
neutral_correct = sum(1 for score in neutral_scores if score < 0)
total_correct = angry_correct + neutral_correct
total_samples = len(angry_scores) + len(neutral_scores)

accuracy = total_correct / total_samples

print(f"\nUsing threshold = 0:")
print(f"  Angry correctly classified:   {angry_correct}/{len(angry_scores)} ({angry_correct/len(angry_scores)*100:.1f}%)")
print(f"  Neutral correctly classified: {neutral_correct}/{len(neutral_scores)} ({neutral_correct/len(neutral_scores)*100:.1f}%)")
print(f"  Overall accuracy:             {total_correct}/{total_samples} ({accuracy*100:.1f}%)")

# Save results to JSON for future reference
results = {
    "statistics": {
        "angry": {
            "mean": float(angry_mean),
            "std": float(angry_std),
            "min": float(angry_min),
            "max": float(angry_max)
        },
        "neutral": {
            "mean": float(neutral_mean),
            "std": float(neutral_std),
            "min": float(neutral_min),
            "max": float(neutral_max)
        },
        "difference_in_means": float(angry_mean - neutral_mean),
        "t_statistic": float(t_statistic),
        "p_value": float(p_value),
        "cohens_d": float(cohens_d)
    },
    "classification": {
        "threshold": 0.0,
        "angry_correct": int(angry_correct),
        "angry_total": len(angry_scores),
        "neutral_correct": int(neutral_correct),
        "neutral_total": len(neutral_scores),
        "accuracy": float(accuracy)
    }
}

with open('emotion_score_results.json', 'w') as f:
    json.dump(results, f, indent=2)
print("\nSaved results to: emotion_score_results.json")

Loading emotion examples from JSON...
Loaded 101 angry examples and 100 neutral examples

Computing scores for angry examples...


OutOfMemoryError: CUDA out of memory. Tried to allocate 24.00 MiB. GPU 0 has a total capacity of 79.25 GiB of which 17.38 MiB is free. Process 20811 has 72.54 GiB memory in use. Including non-PyTorch memory, this process has 6.69 GiB memory in use. Of the allocated memory 6.05 GiB is allocated by PyTorch, and 148.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)