# Phase 2 Multitask Training Analysis

Analysis of training run: `outputs/phase2_multitask/20260125_020510`

## Contents
1. Training Metrics (Loss, Accuracy, Error) vs Epoch
2. Dataset Distribution Analysis
3. Understanding the Negative Loss
4. Summary & Suggestions

In [None]:
import json
import os
from collections import Counter
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Global plotting defaults applied to every figure in this notebook
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 11,
})

# Paths
# The defaults are the locations used for the original run. Set the matching
# environment variable to point the notebook somewhere else without editing
# this cell.
# Training-run directory: metrics.csv is read from here and plots are saved here.
OUTPUT_DIR = Path(os.environ.get(
    'SIM_BENCH_OUTPUT_DIR',
    r'D:\sim-bench\outputs\phase2_multitask\20260125_020510',
))
# Dataset root: scanned for sub-folders named after expression classes.
DATASET_DIR = Path(os.environ.get(
    'AFFECTNET_TRAIN_DIR',
    r'D:/DataSets/AffectNet/train',
))
# JSON landmarks cache: entries are looked up by image path string.
LANDMARKS_CACHE = Path(os.environ.get(
    'AFFECTNET_LANDMARKS_CACHE',
    r'D:\sim-bench\cache\affectnet_landmarks.json',
))

# Expression class mapping: class id -> expression name.
# Ids are assigned by position in the list below, starting at 0.
EXPRESSION_NAMES = dict(enumerate([
    'neutral',
    'happy',
    'sad',
    'surprise',
    'fear',
    'disgust',
    'anger',
    'contempt',
]))

# Folder to class mapping (AffectNet folder names)
# Each tuple lists every folder spelling accepted for one class; the tuple's
# position in the list is the class id.
FOLDER_TO_CLASS = {
    alias: class_id
    for class_id, aliases in enumerate([
        ('neutral',),
        ('happy', 'happiness'),
        ('sad', 'sadness'),
        ('surprise', 'surprised'),
        ('fear', 'fearful'),
        ('disgust', 'disgusted'),
        ('anger', 'angry'),
        ('contempt',),
    ])
    for alias in aliases
}

## 1. Training Metrics Analysis

In [None]:
# Load metrics: read the run's metrics table from the output directory
metrics_path = OUTPUT_DIR / 'metrics.csv'
metrics_df = pd.read_csv(metrics_path)
print(f"Loaded {len(metrics_df)} epochs of metrics")
# Preview the first rows as the cell's displayed output
metrics_df.head()

In [None]:
# Summary statistics
# Reports the first -> last logged value of each tracked metric, plus the best
# validation accuracy together with the epoch it was logged at.
best_val_idx = metrics_df['val_expr_acc'].idxmax()
# Read the epoch from the 'epoch' column rather than assuming "row label + 1":
# the two only agree when epochs are 1-based, contiguous and in row order.
best_val_epoch = metrics_df.loc[best_val_idx, 'epoch']

print("=" * 60)
print("Training Summary")
print("=" * 60)
print(f"Epochs completed: {metrics_df['epoch'].max()}")
print(f"\nExpression Accuracy:")
print(f"  Train: {metrics_df['train_expr_acc'].iloc[0]:.2f}% -> {metrics_df['train_expr_acc'].iloc[-1]:.2f}%")
print(f"  Val:   {metrics_df['val_expr_acc'].iloc[0]:.2f}% -> {metrics_df['val_expr_acc'].iloc[-1]:.2f}%")
print(f"  Best Val: {metrics_df['val_expr_acc'].max():.2f}% (epoch {best_val_epoch})")
print(f"\nLandmark Error:")
print(f"  Train: {metrics_df['train_lm_error'].iloc[0]:.4f} -> {metrics_df['train_lm_error'].iloc[-1]:.4f}")
print(f"  Val:   {metrics_df['val_lm_error'].iloc[0]:.4f} -> {metrics_df['val_lm_error'].iloc[-1]:.4f}")
print(f"\nLoss (note: negative due to uncertainty weighting):")
print(f"  Train: {metrics_df['train_loss'].iloc[0]:.4f} -> {metrics_df['train_loss'].iloc[-1]:.4f}")
print(f"  Val:   {metrics_df['val_loss'].iloc[0]:.4f} -> {metrics_df['val_loss'].iloc[-1]:.4f}")

### 1.1 Loss vs Epoch

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_full, ax_zoom = axes

# (column, line format, legend label) for the two curves drawn on each panel
loss_curves = [
    ('train_loss', 'b-o', 'Train'),
    ('val_loss', 'r-s', 'Val'),
]

# Combined Loss: every logged row, with a zero reference line
for column, line_fmt, label in loss_curves:
    ax_full.plot(metrics_df['epoch'], metrics_df[column], line_fmt, label=label, markersize=4)
ax_full.axhline(y=0, color='gray', linestyle='--', alpha=0.5)
ax_full.set_title('Combined Loss (Uncertainty Weighted)\n⚠️ Negative loss is expected - see explanation below')

# Zoomed in: the same curves with the first row dropped to see convergence behavior
after_first = metrics_df.iloc[1:]
for column, line_fmt, label in loss_curves:
    ax_zoom.plot(after_first['epoch'], after_first[column], line_fmt, label=label, markersize=4)
ax_zoom.set_title('Loss (Epochs 2+, Zoomed)')

# Axis decoration shared by both panels
for ax in (ax_full, ax_zoom):
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Loss')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(OUTPUT_DIR / 'loss_plot.png', dpi=150, bbox_inches='tight')
plt.show()

### 1.2 Expression Accuracy vs Epoch

In [None]:
# Expression accuracy per epoch for train and validation, with the chance level
# marked for reference.
fig, ax = plt.subplots(figsize=(12, 5))

ax.plot(metrics_df['epoch'], metrics_df['train_expr_acc'], 'b-o', label='Train', markersize=5)
ax.plot(metrics_df['epoch'], metrics_df['val_expr_acc'], 'r-s', label='Validation', markersize=5)

# Reference line: accuracy of a uniform random guess over the expression classes.
# Drawn once; 12.5 and 100/8 are the same value.
n_classes = len(EXPRESSION_NAMES)
chance_acc = 100 / n_classes
ax.axhline(y=chance_acc, color='gray', linestyle='--', alpha=0.7,
           label=f'Random (1/{n_classes} = {chance_acc:.1f}%)')

ax.set_xlabel('Epoch')
ax.set_ylabel('Expression Accuracy (%)')
ax.set_title(f'Expression Classification Accuracy\n({n_classes} classes: random = {chance_acc:.1f}%)')
ax.legend(loc='best')
ax.grid(True, alpha=0.3)
# Keep the 0-50 window unless a curve exceeds it, so higher accuracies are not clipped.
highest_acc = metrics_df[['train_expr_acc', 'val_expr_acc']].max().max()
ax.set_ylim([0, max(50, highest_acc * 1.05)])

plt.tight_layout()
plt.savefig(OUTPUT_DIR / 'expression_accuracy_plot.png', dpi=150, bbox_inches='tight')
plt.show()

# Report the figures from the loaded metrics so the text cannot drift from the data.
final_val_acc = metrics_df['val_expr_acc'].iloc[-1]
print(f"\n⚠️ ISSUE: Accuracy plateaus at ~{final_val_acc:.0f}% - only ~{final_val_acc / chance_acc:.1f}x random chance")
print(f"   This suggests the model is not learning meaningful expression features")

### 1.3 Landmark Error vs Epoch

In [None]:
# Landmark regression error per epoch for train and validation.
fig, ax = plt.subplots(figsize=(12, 5))

ax.plot(metrics_df['epoch'], metrics_df['train_lm_error'], 'b-o', label='Train', markersize=5)
ax.plot(metrics_df['epoch'], metrics_df['val_lm_error'], 'r-s', label='Validation', markersize=5)

ax.set_xlabel('Epoch')
ax.set_ylabel('Landmark MSE Error')
ax.set_title('Landmark Regression Error (MSE)\n5-point landmarks, normalized [0,1] coordinates')
ax.legend(loc='best')
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(OUTPUT_DIR / 'landmark_error_plot.png', dpi=150, bbox_inches='tight')
plt.show()

# Report the final validation error from the data instead of a hard-coded figure.
# The logged value is treated as an MSE (as the axis label states), so its square
# root is the error in the same normalized units as the coordinates.
final_lm_mse = metrics_df['val_lm_error'].iloc[-1]
final_lm_rmse = np.sqrt(final_lm_mse)
print(f"\nLandmark error is very low (~{final_lm_mse:.3f} MSE = ~{final_lm_rmse:.2f} normalized error)")
print(f"The model learns landmarks very easily - this task may be too simple")

### 1.4 Uncertainty Weights Analysis

In [None]:
# Plot the learned task weights only when the run logged both weight columns.
weight_columns = ('expression_weight', 'landmark_weight')
if all(column in metrics_df.columns for column in weight_columns):
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    epochs = metrics_df['epoch']
    expr_weight = metrics_df['expression_weight']
    lm_weight = metrics_df['landmark_weight']

    # Weights over time (log-scaled y axis)
    ax = axes[0]
    ax.plot(epochs, expr_weight, 'g-o', label='Expression Weight', markersize=4)
    ax.plot(epochs, lm_weight, 'm-s', label='Landmark Weight', markersize=4)
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Weight (precision = 1/σ²)')
    ax.set_title('Learned Uncertainty Weights\n(Higher = model is more confident)')
    ax.legend()
    ax.grid(True, alpha=0.3)
    ax.set_yscale('log')

    # Weight ratio: landmark weight divided by expression weight, per epoch
    ax = axes[1]
    weight_ratio = lm_weight / expr_weight
    ax.plot(epochs, weight_ratio, 'k-o', markersize=4)
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Landmark/Expression Weight Ratio')
    ax.set_title('Task Weight Imbalance\n(>1 means landmark task dominates)')
    ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig(OUTPUT_DIR / 'uncertainty_weights_plot.png', dpi=150, bbox_inches='tight')
    plt.show()

    # Final-epoch weights quoted in the printed takeaway
    final_lm_weight = lm_weight.iloc[-1]
    final_expr_weight = expr_weight.iloc[-1]
    print(f"\n⚠️ ISSUE: Landmark weight explodes to ~{final_lm_weight:.0f}x")
    print(f"   While expression weight stays at ~{final_expr_weight:.2f}")
    print(f"   This means the model finds landmarks trivially easy, expression very hard")

### 1.5 Learning Rate Schedule

In [None]:
# Logged learning rate per epoch, on a log-scaled y axis
fig, ax = plt.subplots(figsize=(10, 4))

ax.plot(metrics_df['epoch'], metrics_df['learning_rate'], 'b-o', markersize=4)
ax.set(
    xlabel='Epoch',
    ylabel='Learning Rate',
    title='Learning Rate Schedule (Cosine Annealing with Warmup)',
)
ax.grid(True, alpha=0.3)
ax.set_yscale('log')

plt.tight_layout()
plt.show()

## 2. Dataset Distribution Analysis

In [None]:
# Count images per expression class
# Several folder spellings can map to the same class, so counts are accumulated
# under the canonical class name.
class_counts = {}

recognized_folders = (
    entry for entry in DATASET_DIR.iterdir()
    if entry.is_dir() and entry.name.lower() in FOLDER_TO_CLASS
)
for folder in recognized_folders:
    class_name = EXPRESSION_NAMES[FOLDER_TO_CLASS[folder.name.lower()]]
    n_images = sum(
        1
        for pattern in ('*.jpg', '*.png')
        for image_path in folder.glob(pattern)
    )
    class_counts[class_name] = class_counts.get(class_name, 0) + n_images

# Create DataFrame: one row per class id 0-7; classes with no folder get a count of 0
class_ids = list(range(8))
class_names = [EXPRESSION_NAMES[class_id] for class_id in class_ids]
dist_df = pd.DataFrame({
    'class_id': class_ids,
    'class_name': class_names,
    'count': [class_counts.get(name, 0) for name in class_names],
})
total_images = dist_df['count'].sum()
dist_df['percentage'] = dist_df['count'] / total_images * 100

print("Dataset Class Distribution:")
print("=" * 50)
print(dist_df.to_string(index=False))
print(f"\nTotal images: {total_images:,}")

In [None]:
# Two views of the class distribution: absolute counts (bars) and shares (pie)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
bar_ax, pie_ax = axes
colors = plt.cm.Set3(np.linspace(0, 1, 8))

# Bar chart
bars = bar_ax.bar(dist_df['class_name'], dist_df['count'], color=colors, edgecolor='black', linewidth=0.5)
bar_ax.set_xlabel('Expression Class')
bar_ax.set_ylabel('Number of Images')
bar_ax.set_title('Dataset Distribution by Expression')
bar_ax.tick_params(axis='x', rotation=45)

# Add count labels on bars: centred horizontally, 50 data units above the bar top
for rect, n_images in zip(bars, dist_df['count']):
    label_x = rect.get_x() + rect.get_width()/2
    label_y = rect.get_height() + 50
    bar_ax.text(label_x, label_y, f'{n_images:,}', ha='center', va='bottom', fontsize=9)

# Add balanced reference line: the per-class count if all 8 classes were equal
balanced = dist_df['count'].sum() / 8
bar_ax.axhline(y=balanced, color='red', linestyle='--', alpha=0.7, label=f'Balanced ({balanced:.0f})')
bar_ax.legend()

# Pie chart
pie_ax.pie(dist_df['count'], labels=dist_df['class_name'], autopct='%1.1f%%',
           colors=colors, startangle=90)
pie_ax.set_title('Expression Distribution (%)')

plt.tight_layout()
plt.savefig(OUTPUT_DIR / 'dataset_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

# Imbalance analysis: compare the most and least populated classes
largest_row = dist_df['count'].idxmax()
smallest_row = dist_df['count'].idxmin()
max_count = dist_df.loc[largest_row, 'count']
min_count = dist_df.loc[smallest_row, 'count']
# An empty class would divide by zero; report the ratio as infinite instead
if min_count > 0:
    imbalance_ratio = max_count / min_count
else:
    imbalance_ratio = float('inf')

print(f"\nImbalance Analysis:")
print(f"  Max class: {dist_df.loc[largest_row, 'class_name']} ({max_count:,} images)")
print(f"  Min class: {dist_df.loc[smallest_row, 'class_name']} ({min_count:,} images)")
print(f"  Imbalance ratio: {imbalance_ratio:.2f}x")

### 2.1 Landmarks Coverage Analysis

In [None]:
# Load landmarks cache (may take a moment)
print("Loading landmarks cache...")
with open(LANDMARKS_CACHE, 'r', encoding='utf-8') as f:
    landmarks_data = json.load(f)

print(f"Total landmarks entries: {len(landmarks_data):,}")

# Count landmarks per expression class
landmarks_per_class = {name: {'with_landmarks': 0, 'without_landmarks': 0} for name in EXPRESSION_NAMES.values()}

for folder in DATASET_DIR.iterdir():
    if not folder.is_dir():
        continue
    folder_name = folder.name.lower()
    if folder_name not in FOLDER_TO_CLASS:
        continue
    class_name = EXPRESSION_NAMES[FOLDER_TO_CLASS[folder_name]]
    # Scan the same extensions as the class-distribution count (*.jpg and *.png)
    # so the per-class totals here agree with the distribution table.
    for pattern in ('*.jpg', '*.png'):
        for img_path in folder.glob(pattern):
            # Check both forward and backward slash versions of the path,
            # since the cache may have been written with either separator
            path_str = str(img_path)
            alt_path_str = path_str.replace('\\', '/')

            if path_str in landmarks_data or alt_path_str in landmarks_data:
                landmarks_per_class[class_name]['with_landmarks'] += 1
            else:
                landmarks_per_class[class_name]['without_landmarks'] += 1

# Summary: images with a cache entry / all scanned images, per class
print("\nLandmarks coverage per class:")
for class_name, counts in landmarks_per_class.items():
    total = counts['with_landmarks'] + counts['without_landmarks']
    # Classes with no scanned images report 0% rather than dividing by zero
    coverage = counts['with_landmarks'] / total * 100 if total > 0 else 0
    print(f"  {class_name:10s}: {counts['with_landmarks']:5d}/{total:5d} ({coverage:.1f}%)")

## 3. Understanding the Negative Loss

### Why is `train_loss` negative?

The model uses **Uncertainty Weighting** (Kendall et al., 2018) for multi-task learning:

$$L_{total} = \sum_i \left[ \frac{1}{2\sigma_i^2} L_i + \frac{1}{2} \log \sigma_i^2 \right]$$

Where:
- $L_i$ = task loss (expression or landmark)
- $\sigma_i^2$ = learned uncertainty/variance for task $i$
- $\log \sigma_i^2$ = log variance (learnable parameter)

**The issue:**
When $\log \sigma_i^2 < 0$ (meaning $\sigma_i^2 < 1$, i.e., high precision/confidence), the regularization term $\frac{1}{2} \log \sigma_i^2$ becomes **negative**.

If the negative regularization term outweighs the positive weighted loss term, the **total loss becomes negative**.

In [None]:
# Demonstrate the uncertainty weighting math
# Both log-variance columns are read below, so require both before proceeding
# (checking only the expression column would raise KeyError on a partial log).
log_var_columns = ('expression_log_var', 'landmark_log_var')
if all(column in metrics_df.columns for column in log_var_columns):
    print("Uncertainty Weighting Breakdown (last epoch):")
    print("=" * 60)

    expr_log_var = metrics_df['expression_log_var'].iloc[-1]
    lm_log_var = metrics_df['landmark_log_var'].iloc[-1]
    # Per-task regularization term of the weighted loss: 0.5 * log σ²
    expr_reg = 0.5 * expr_log_var
    lm_reg = 0.5 * lm_log_var

    print(f"\nExpression task:")
    print(f"  log_var (log σ²) = {expr_log_var:.4f}")
    print(f"  σ² = exp({expr_log_var:.4f}) = {np.exp(expr_log_var):.4f}")
    print(f"  precision (1/σ²) = {np.exp(-expr_log_var):.4f}")
    print(f"  regularization = 0.5 * log_var = {expr_reg:.4f}")

    # Only flag the landmark term as negative when it actually is
    lm_flag = "  <-- NEGATIVE!" if lm_reg < 0 else ""
    print(f"\nLandmark task:")
    print(f"  log_var (log σ²) = {lm_log_var:.4f}")
    print(f"  σ² = exp({lm_log_var:.4f}) = {np.exp(lm_log_var):.6f}")
    print(f"  precision (1/σ²) = {np.exp(-lm_log_var):.2f}")
    print(f"  regularization = 0.5 * log_var = {lm_reg:.4f}{lm_flag}")

    print(f"\nTotal regularization contribution: {expr_reg + lm_reg:.4f}")
    if lm_reg < 0:
        # Quote the learned variance from the data rather than a fixed number
        print(f"\n⚠️ The landmark regularization term ({lm_reg:.4f}) is strongly negative")
        print(f"   because the model is very confident about landmarks (σ² ≈ {np.exp(lm_log_var):.4f})")
else:
    # Say why nothing is shown instead of skipping silently
    print("Skipping uncertainty breakdown: metrics are missing "
          "'expression_log_var' and/or 'landmark_log_var'")

### Is negative loss a problem?

**Short answer: Not inherently, but it's a symptom.**

The negative loss itself doesn't break optimization (gradients still flow correctly). However, it indicates:

1. **Task imbalance**: Landmark task is too easy compared to expression
2. **Uncertainty collapse**: The model becomes overconfident on landmarks
3. **Gradient imbalance**: Expression task may be under-trained

**Recommendations:**
- Consider decoupling the tasks or using fixed weights
- Add regularization to prevent log_var from going too negative
- Use a harder landmark task (more points, harder samples)

## 4. Summary & Suggestions

### Key Observations:

1. **Expression accuracy plateaus at ~28%** (barely 2x random)
   - Suggests the model isn't learning meaningful expression features
   - Could be data quality, architecture, or learning rate issues

2. **Landmark task is too easy**
   - Error quickly converges to ~0.014 MSE
   - Uncertainty weight explodes to 70x
   - Model focuses on landmarks at expense of expression

3. **Training stopped at epoch 24/30**
   - Likely memory issue on CPU training
   - Consider using GPU or reducing batch size

### Suggestions for Improvement:

1. **Add per-image validation predictions** (highest priority)
   - Save predictions for every validation image each epoch
   - Enables confusion matrix, error analysis by class
   - Can identify systematic failures

2. **Confusion matrix analysis**
   - Which expressions are confused with each other?
   - Is the model just predicting majority class?

3. **Visualize failure cases**
   - What do misclassified images look like?
   - Are there annotation errors in the dataset?

4. **Consider architectural and loss changes:**
   - Separate backbones for each task?
   - Different loss weighting strategy?
   - Focal loss for class imbalance?

5. **Training diagnostics to add:**
   - Per-class accuracy breakdown
   - Gradient norms per layer
   - Feature visualization (t-SNE of embeddings)

In [None]:
# Final summary plot: a 2x2 overview of loss, accuracy, landmark error and class counts
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Panels 1-3 share one train/val line layout. Each entry is
# (axes slot, train column, val column, y label, title, optional reference-line kwargs).
line_panels = [
    # 1. Loss
    (axes[0, 0], 'train_loss', 'val_loss', 'Loss', 'Loss (Uncertainty Weighted)',
     dict(y=0, color='gray', linestyle='--', alpha=0.5)),
    # 2. Expression Accuracy
    (axes[0, 1], 'train_expr_acc', 'val_expr_acc', 'Accuracy (%)', 'Expression Accuracy',
     dict(y=12.5, color='gray', linestyle='--', alpha=0.7, label='Random')),
    # 3. Landmark Error
    (axes[1, 0], 'train_lm_error', 'val_lm_error', 'MSE Error', 'Landmark Error', None),
]
for ax, train_col, val_col, y_label, panel_title, ref_line in line_panels:
    ax.plot(metrics_df['epoch'], metrics_df[train_col], 'b-o', label='Train', markersize=3)
    ax.plot(metrics_df['epoch'], metrics_df[val_col], 'r-s', label='Val', markersize=3)
    if ref_line is not None:
        ax.axhline(**ref_line)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(y_label)
    ax.set_title(panel_title)
    ax.legend()
    ax.grid(True, alpha=0.3)

# 4. Dataset Distribution (uses dist_df built in the dataset distribution section)
ax = axes[1, 1]
colors = plt.cm.Set3(np.linspace(0, 1, 8))
ax.bar(dist_df['class_name'], dist_df['count'], color=colors, edgecolor='black', linewidth=0.5)
ax.set_xlabel('Expression')
ax.set_ylabel('Count')
ax.set_title('Dataset Distribution')
ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.savefig(OUTPUT_DIR / 'training_summary.png', dpi=150, bbox_inches='tight')
plt.show()

banner = "=" * 60
print("\n" + banner)
print("Analysis complete! Plots saved to output directory.")
print(banner)