# MicroBetaBae: Sutton's Bitter Lesson Implementation

This notebook analyzes the training results of MicroBetaBae, an implementation built around the principles of Sutton's Bitter Lesson.

## Key Design Principles:

### 1. **Sutton's Bitter Lesson**: Computation > Knowledge
- Use search-based action selection instead of hand-crafted heuristics
- Let the model learn everything through computation
- Minimal prior knowledge, maximum learning capacity

### 2. **DeepSeek R1**: Reasoning through Search and Reflection
- MCTS-like search for action selection
- Reflection learning from past mistakes
- Replay buffer for reusing past experience during training

### 3. **Micrograd**: Minimal Autograd
- Custom autograd implementation (no PyTorch dependency)
- Efficient memory usage for scaling past 10,000 episodes
- Pure Python implementation for maximum compatibility

### 4. **Scalability**: Efficient Memory Management
- Fixed-size buffers to prevent memory leaks
- Efficient logging and checkpointing
- Works on resource-constrained systems

## What We're Observing:
- **Search Quality**: How does MCTS-like search improve decisions?
- **Reflection Learning**: Does learning from mistakes accelerate convergence?
- **Attention Patterns**: How does attention evolve with search and reflection?
- **Scalability**: Can it maintain performance past 10,000 episodes?


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import json
from pathlib import Path
import seaborn as sns
from sklearn.decomposition import PCA
import time

# Set style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")


def _episode_key(path):
    """Sort key ordering ``<prefix>_<N>.<ext>`` files by episode number N.

    Plain lexicographic sorting places ``stats_ep_10000`` before
    ``stats_ep_9000``, so the "last" file would not be the latest one once
    the episode count gains a digit. Comparing the numeric suffix first
    avoids that; names without a numeric suffix sort first, then by name.
    """
    suffix = path.stem.rpartition('_')[2]
    return (int(suffix) if suffix.isdecimal() else -1, path.name)


# Check if MicroBetaBae training has produced results
output_dir = Path('micro_outputs')
log_dir = output_dir / 'logs'

print("MicroBetaBae Analysis")
print("====================")

if output_dir.exists():
    # Numeric ordering so that stats_files[-1] really is the newest checkpoint.
    stats_files = sorted(output_dir.glob('stats_ep_*.json'), key=_episode_key)
    print(f"Found {len(stats_files)} statistics files")

    if stats_files:
        # Load latest stats
        with open(stats_files[-1], 'r', encoding='utf-8') as f:
            stats = json.load(f)

        print(f"Training episodes: {len(stats['rewards'])}")
        print(f"Elapsed time: {stats.get('elapsed_time', 0):.2f} seconds")
        # The divisor is clamped to at least 1 second so a missing or zero
        # elapsed time cannot cause a division by zero.
        print(f"Episodes per second: {len(stats['rewards']) / max(stats.get('elapsed_time', 1), 1):.2f}")

        # Show recent performance (slicing already copes with < 100 entries)
        recent_rewards = stats['rewards'][-100:]
        recent_lengths = stats['lengths'][-100:]

        print(f"Recent average reward: {np.mean(recent_rewards):.2f}")
        print(f"Recent average length: {np.mean(recent_lengths):.1f} steps")

        if len(stats['rewards']) >= 200:
            early_reward = np.mean(stats['rewards'][:100])
            late_reward = np.mean(stats['rewards'][-100:])
            improvement = late_reward - early_reward
            if early_reward != 0:
                # Divide by the magnitude of the baseline: with a negative
                # early reward a signed divisor would flip the sign of the
                # reported percentage.
                improvement_pct = improvement / abs(early_reward) * 100
                print(f"Learning improvement: {improvement:.2f} ({improvement_pct:.1f}%)")
            else:
                # A relative change is undefined against a zero baseline.
                print(f"Learning improvement: {improvement:.2f} (baseline is 0; % undefined)")

    # Check for attention data
    if log_dir.exists():
        attention_files = sorted(log_dir.glob('attention_ep_*.npy'), key=_episode_key)
        hidden_files = sorted(log_dir.glob('hidden_ep_*.npy'), key=_episode_key)
        print(f"Found {len(attention_files)} attention files")
        print(f"Found {len(hidden_files)} hidden state files")
else:
    print("MicroBetaBae training not found. Training may still be in progress...")


In [None]:
# Analyze MicroBetaBae Learning Dynamics
def _stats_episode_key(path):
    """Sort key that orders ``stats_ep_<N>.json`` files by episode number N.

    Lexicographic ordering would rank ``stats_ep_10000`` below
    ``stats_ep_9000``; comparing the numeric suffix first picks the truly
    latest file. Names without a numeric suffix sort first, then by name.
    """
    suffix = path.stem.rpartition('_')[2]
    return (int(suffix) if suffix.isdecimal() else -1, path.name)


def _improvement_line(early_perf, late_perf):
    """Format the 'Learning improvement' summary line for two mean rewards.

    The percentage is taken relative to the magnitude of the early mean so
    that a negative baseline does not invert its sign, and it is omitted
    when the baseline is exactly zero (relative change undefined).
    """
    improvement = late_perf - early_perf
    if early_perf == 0:
        return f"Learning improvement: {improvement:.2f} (baseline is 0; % undefined)"
    percent = improvement / abs(early_perf) * 100
    return f"Learning improvement: {improvement:.2f} ({percent:.1f}%)"


def analyze_micro_learning():
    """Comprehensive analysis of MicroBetaBae learning.

    Reads the latest ``stats_ep_*.json`` file from the module-level
    ``output_dir``, draws a 2x3 grid of plots (raw rewards, lengths and
    losses, rolling averages, a three-phase reward comparison and the
    reward histogram) and prints a short textual summary.

    Prints a notice and returns ``None`` early when the output directory,
    the statistics files, or the recorded episodes are missing.
    """

    if not output_dir.exists():
        print("No training data available yet!")
        return

    stats_files = list(output_dir.glob('stats_ep_*.json'))
    if not stats_files:
        print("No statistics files found!")
        return

    # Load the statistics of the highest-numbered episode checkpoint.
    latest_stats_file = max(stats_files, key=_stats_episode_key)
    with open(latest_stats_file, 'r', encoding='utf-8') as f:
        stats = json.load(f)

    rewards = np.array(stats['rewards'])
    lengths = np.array(stats['lengths'])
    losses = np.array(stats['losses'])

    # Nothing to plot or average; avoids nan summaries and numpy warnings.
    if rewards.size == 0:
        print("No episodes recorded yet!")
        return

    # Create comprehensive analysis
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))

    # 1. Episode rewards over time
    axes[0, 0].plot(rewards, alpha=0.7, linewidth=0.5)
    axes[0, 0].set_title('Episode Rewards Evolution')
    axes[0, 0].set_xlabel('Episode')
    axes[0, 0].set_ylabel('Reward')
    axes[0, 0].grid(True, alpha=0.3)

    # 2. Episode lengths over time
    axes[0, 1].plot(lengths, alpha=0.7, linewidth=0.5, color='orange')
    axes[0, 1].set_title('Episode Lengths Evolution')
    axes[0, 1].set_xlabel('Episode')
    axes[0, 1].set_ylabel('Steps')
    axes[0, 1].grid(True, alpha=0.3)

    # 3. Episode losses over time
    axes[0, 2].plot(losses, alpha=0.7, linewidth=0.5, color='green')
    axes[0, 2].set_title('Episode Losses Evolution')
    axes[0, 2].set_xlabel('Episode')
    axes[0, 2].set_ylabel('Loss')
    axes[0, 2].grid(True, alpha=0.3)

    # 4. Rolling averages (panel stays empty until a full window exists)
    window = 50
    if len(rewards) >= window:
        kernel = np.ones(window) / window
        rolling_rewards = np.convolve(rewards, kernel, mode='valid')
        rolling_lengths = np.convolve(lengths, kernel, mode='valid')

        axes[1, 0].plot(rolling_rewards, label=f'Reward (window={window})', linewidth=2)
        axes[1, 0].plot(rolling_lengths, label=f'Length (window={window})', linewidth=2)
        axes[1, 0].set_title('Rolling Averages')
        axes[1, 0].set_xlabel('Episode')
        axes[1, 0].set_ylabel('Value')
        axes[1, 0].legend()
        axes[1, 0].grid(True, alpha=0.3)

    # 5. Learning phases analysis
    if len(rewards) >= 300:
        # Divide into thirds; the last phase absorbs any remainder episodes.
        phase_size = len(rewards) // 3
        phases = ['Early', 'Middle', 'Late']
        phase_rewards = [rewards[:phase_size], rewards[phase_size:2*phase_size], rewards[2*phase_size:]]

        phase_means = [np.mean(phase) for phase in phase_rewards]
        phase_stds = [np.std(phase) for phase in phase_rewards]

        x_pos = np.arange(len(phases))
        axes[1, 1].bar(x_pos, phase_means, yerr=phase_stds, capsize=5, alpha=0.7)
        axes[1, 1].set_title('Learning Phases Comparison')
        axes[1, 1].set_xlabel('Phase')
        axes[1, 1].set_ylabel('Average Reward')
        axes[1, 1].set_xticks(x_pos)
        axes[1, 1].set_xticklabels(phases)
        axes[1, 1].grid(True, alpha=0.3)

    # 6. Performance distribution
    axes[1, 2].hist(rewards, bins=30, alpha=0.7, color='purple')
    axes[1, 2].set_title('Reward Distribution')
    axes[1, 2].set_xlabel('Reward')
    axes[1, 2].set_ylabel('Frequency')
    axes[1, 2].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Print key insights (each "final" figure averages the last 100 entries)
    print(f"\nMicroBetaBae Learning Analysis:")
    print(f"Total episodes: {len(rewards)}")
    print(f"Final average reward: {np.mean(rewards[-100:]):.2f}")
    print(f"Final average length: {np.mean(lengths[-100:]):.1f}")
    print(f"Final average loss: {np.mean(losses[-100:]):.4f}")

    # Needs 200 episodes so the first and last 100 do not overlap.
    if len(rewards) >= 200:
        early_perf = np.mean(rewards[:100])
        late_perf = np.mean(rewards[-100:])
        print(_improvement_line(early_perf, late_perf))

    # Check for convergence: low spread of the last 100 rewards
    if len(rewards) >= 500:
        recent_std = np.std(rewards[-100:])
        print(f"Recent performance stability (std): {recent_std:.2f}")
        if recent_std < 10:
            print("✓ Model appears to have converged!")
        else:
            print("⚠ Model still learning/exploring")

# Run the analysis on whatever training output currently exists on disk;
# the function prints a notice and returns early if nothing is available.
analyze_micro_learning()
