# Model Comparison Experiment
실험 날짜: 2024-11-12
목적: 다양한 모델의 BO 성능 비교

## 실험 설정
- Budget: 50
- Target: 1.34 eV (perovskite bandgap)
- Seeds: 5개 (재현성 검증)
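- Models: Transfer Learning DNN, GP, RF, XGBoost (note: the `MODELS` cell in this notebook currently configures only the TL-DNN variants — TL-DNN, TL-DNN-Small, TL-DNN-Large, TL-DNN-HyperOpt)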

In [None]:
# Basic setup
import sys
import os
# Adds '..' (resolved against the current working directory) to the module
# search path. This has to run before the project-local imports below;
# presumably that is where experiment_runner / data_utils live — confirm.
sys.path.append('..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Experiment management tools (project-local modules)
from experiment_runner import ExperimentRunner, model_registry
from data_utils import load_lookup_table, create_param_space

# Plot styling
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

# Record the wall-clock start time in the notebook output
print(f"Experiment started at: {datetime.now()}")

In [None]:
# Load the lookup table and derive the parameter space from it.
# `param_space` is used as a mapping of parameter name -> sized collection
# of candidate values (it is iterated with .items() and len() below).
import math

lookup = load_lookup_table('../../0.Data/lookup_table.pkl')
param_space = create_param_space(lookup)

print("Parameter space dimensions:")
for key, values in param_space.items():
    print(f"  {key}: {len(values)} options")

# math.prod multiplies arbitrary-precision Python ints, so the count cannot
# wrap around the way a fixed-width NumPy integer product can on a very
# large space, and an empty space yields the int 1 rather than a float.
total_combinations = math.prod(len(values) for values in param_space.values())
print(f"\nTotal combinations: {total_combinations}")

In [None]:
# Experiment configuration shared by every model run
BASE_CONFIG = dict(
    budget=50,
    target_value=1.34,
    num_initial=5,
    # 0.125 == 1/8; the original note reads "8:1 ratio" — TODO confirm which
    # quantities the ratio relates
    high_fidelity_ratio=0.125,
    pretrain_epochs=100,
    finetune_epochs=50,
    device='cpu',  # or 'cuda' if available
)


def _tl_dnn_spec(hidden_dim, n_layers, dropout_rate):
    """Return the spec dict for a 'tl_dnn' model with the given architecture."""
    return {
        'class': 'tl_dnn',
        'params': {
            'hidden_dim': hidden_dim,
            'n_layers': n_layers,
            'dropout_rate': dropout_rate,
        },
    }


# Models to test, keyed by display name. Each spec carries the registry
# class key and the keyword arguments passed when the model is created.
MODELS = {
    'TL-DNN': _tl_dnn_spec(128, 3, 0.1),
    'TL-DNN-Small': _tl_dnn_spec(64, 2, 0.1),
    'TL-DNN-Large': _tl_dnn_spec(256, 4, 0.2),
    'TL-DNN-HyperOpt': {
        'class': 'tl_dnn_hyperopt',
        'params': {
            'pretrain_bo_trials': 5,
            'finetune_bo_trials': 5,
        },
    },
}

print(f"Models to test: {list(MODELS)}")

In [None]:
# Instantiate every configured model through the registry.
# `model_instances` maps display name -> model object and is what the
# comparison run consumes. Models that cannot be built are reported and
# left out, so no entry ever holds a different model than its name says.
model_instances = {}

for name, config in MODELS.items():
    try:
        model = model_registry.create(config['class'], **config['params'])
    except Exception as e:  # registry error types are project-defined
        print(f"✗ Failed to create {name}: {e}")
    else:
        model_instances[name] = model
        print(f"✓ Created {name}")
        continue

    # Fallback: build the model directly, bypassing the registry.
    # Only attempted for 'tl_dnn' specs: presumably TransferLearningDNN is
    # the class registered under that key (confirm against the registry).
    # Using it for any other class would run a different model under the
    # failed model's name and pass it keyword arguments meant for another
    # constructor.
    if config['class'] != 'tl_dnn':
        print(f"  → No fallback available for {name} (class {config['class']!r}); skipping")
        continue

    try:
        from models import TransferLearningDNN
        model = TransferLearningDNN(**config['params'])
    except Exception as fallback_error:
        # Keep going so one broken model does not abort the whole setup.
        print(f"  → Fallback for {name} also failed: {fallback_error}; skipping")
    else:
        model_instances[name] = model
        print(f"  → Using fallback for {name}")

In [None]:
# Run the experiments
N_SEEDS = 5  # each model is run 5 times
runner = ExperimentRunner(base_dir='experiment_results')

n_models = len(model_instances)
total_runs = n_models * N_SEEDS
print(f"\nRunning {n_models} models × {N_SEEDS} seeds = {total_runs} total experiments")
print("This may take a while...\n")

# Comparison run; the result is kept as `results_df` for the analysis cells
results_df = runner.run_comparison(
    models=model_instances,
    base_config=BASE_CONFIG,
    n_seeds=N_SEEDS,
    verbose=True,
)

In [None]:
# Results analysis
banner = "=" * 60
print("\n" + banner)
print("RESULTS ANALYSIS")
print(banner)

# Per-model statistics: which aggregations to compute for each result column
aggregations = {
    'best_value': ['mean', 'std', 'min', 'max'],
    'total_cost': ['mean', 'std', 'min', 'max'],
    'n_iterations': ['mean', 'std'],
}
per_model = results_df.groupby('model')
stats = per_model.agg(aggregations)

print("\nModel Performance Statistics:")
print(stats)

# Absolute difference between each run's best value and the target
target = BASE_CONFIG['target_value']
results_df['gap_from_target'] = (results_df['best_value'] - target).abs()

# Regroup so the newly added column is included
gap_by_model = results_df.groupby('model')['gap_from_target']
gap_stats = gap_by_model.agg(['mean', 'std', 'min'])
print("\nGap from Target (1.34 eV):")
print(gap_stats)

In [None]:
# Visualization: box plots, one panel per metric, grouped by model
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# (column, panel title, y-axis label), in row-major panel order
panels = [
    ('best_value', 'Best Value Distribution', 'Bandgap (eV)'),
    ('total_cost', 'Total Cost Distribution', 'Cost'),
    ('gap_from_target', 'Gap from Target', '|Value - 1.34| (eV)'),
    ('n_iterations', 'Number of Iterations', 'Iterations'),
]

for panel_ax, (column, title, ylabel) in zip(axes.flat, panels):
    results_df.boxplot(column=column, by='model', ax=panel_ax)
    if column == 'best_value':
        # Only the best-value panel gets the target reference line and legend
        panel_ax.axhline(y=target, color='r', linestyle='--', label='Target')
        panel_ax.legend()
    panel_ax.set_title(title)
    panel_ax.set_ylabel(ylabel)

plt.suptitle('Model Comparison Results', fontsize=16)
plt.tight_layout()
plt.show()

In [None]:
# Visualization: Performance vs Efficiency
fig, ax = plt.subplots(figsize=(10, 8))

# Mean gap and mean cost for each model
model_means = (
    results_df
    .groupby('model')
    .agg({'gap_from_target': 'mean', 'total_cost': 'mean'})
    .reset_index()
)

# One scatter series per model, showing every individual run
for model_name in model_means['model'].unique():
    runs = results_df.loc[results_df['model'] == model_name]
    ax.scatter(
        runs['total_cost'],
        runs['gap_from_target'],
        label=model_name,
        alpha=0.6,
        s=100,
    )

# Overlay the per-model means as red stars
ax.scatter(
    model_means['total_cost'],
    model_means['gap_from_target'],
    s=200,
    edgecolors='black',
    linewidth=2,
    marker='*',
    c='red',
    label='Mean',
)

# Label each mean marker with its model name
for mean_row in model_means.itertuples(index=False):
    ax.annotate(
        mean_row.model,
        (mean_row.total_cost, mean_row.gap_from_target),
        xytext=(5, 5),
        textcoords='offset points',
    )

ax.set_xlabel('Total Cost', fontsize=12)
ax.set_ylabel('Gap from Target (eV)', fontsize=12)
ax.set_title('Performance vs Efficiency Trade-off', fontsize=14)
ax.legend(loc='best')
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Statistical significance test: pairwise t-tests on gap_from_target
from itertools import combinations

# Imported under an alias on purpose: a bare `from scipy import stats` would
# rebind the global name `stats`, which this file already uses for the
# per-model summary DataFrame.
from scipy import stats as scipy_stats

print("\n" + "="*60)
print("STATISTICAL SIGNIFICANCE TESTS")
print("="*60)

models = results_df['model'].unique()
print("\nPairwise t-tests for gap_from_target:")
print("-" * 40)

# Every unordered pair of models, in order of first appearance
for model1, model2 in combinations(models, 2):
    data1 = results_df[results_df['model'] == model1]['gap_from_target']
    data2 = results_df[results_df['model'] == model2]['gap_from_target']

    # NOTE(review): ttest_ind is called with its defaults here and no
    # multiple-comparison correction is applied across pairs — confirm
    # this is the intended analysis.
    t_stat, p_value = scipy_stats.ttest_ind(data1, data2)

    print(f"{model1} vs {model2}:")
    print(f"  t-statistic: {t_stat:.4f}")
    print(f"  p-value: {p_value:.4f}")

    if p_value < 0.05:
        print("  → Significant difference (p < 0.05)")
    else:
        print("  → No significant difference")
    print()

In [None]:
# Best performing configuration: the run with the smallest gap from target
best_idx = results_df['gap_from_target'].idxmin()
best_run = results_df.loc[best_idx]

banner = "=" * 60
print("\n" + banner)
print("BEST PERFORMING CONFIGURATION")
print(banner)

# (label, column, format spec, unit suffix) for each summary line
summary_fields = [
    ("Model", 'model', "", ""),
    ("Seed", 'seed', "", ""),
    ("Best value", 'best_value', ".6f", " eV"),
    ("Gap from target", 'gap_from_target', ".6f", " eV"),
    ("Total cost", 'total_cost', ".2f", ""),
    ("Iterations", 'n_iterations', "", ""),
    ("Experiment ID", 'experiment_id', "", ""),
]
for label, column, spec, unit in summary_fields:
    print(f"{label}: {best_run[column]:{spec}}{unit}")

# Load the detailed results for that run; list its parameters when present
detailed_results = runner.load_results(best_run['experiment_id'])
if 'best_params' in detailed_results:
    print("\nBest parameters found:")
    for param_name, param_value in detailed_results['best_params'].items():
        print(f"  {param_name}: {param_value}")

In [None]:
# Save summary report: write the per-run results as CSV and a JSON report
# describing the configuration, per-model statistics and best run.
import json
import os

output_dir = 'experiment_results'
os.makedirs(output_dir, exist_ok=True)

# Take one timestamp so the report's date and its file name agree
saved_at = datetime.now()

# Write the raw results first so the report can record the real location
results_path = f"{output_dir}/comparison_results.csv"
results_df.to_csv(results_path, index=False)

# Compute the per-model summary here rather than reading the global `stats`,
# which may have been rebound elsewhere in the file. Aggregating with lists
# gives two-level column labels; they are flattened to "<column>_<statistic>"
# strings because json.dump rejects tuple dictionary keys (`default=` is
# only consulted for values, not keys).
summary = results_df.groupby('model').agg({
    'best_value': ['mean', 'std', 'min', 'max'],
    'total_cost': ['mean', 'std', 'min', 'max'],
    'n_iterations': ['mean', 'std'],
})
summary.columns = [f"{column}_{statistic}" for column, statistic in summary.columns]

report = {
    'experiment_date': saved_at.isoformat(),
    'base_config': BASE_CONFIG,
    'models_tested': list(MODELS.keys()),
    'n_seeds': N_SEEDS,
    'summary_statistics': summary.to_dict(),
    'best_configuration': best_run.to_dict(),
    'results_file': results_path,
}

report_path = f"{output_dir}/report_{saved_at.strftime('%Y%m%d_%H%M%S')}.json"
with open(report_path, 'w', encoding='utf-8') as f:
    # default=str stringifies any value json cannot encode natively
    json.dump(report, f, indent=2, default=str)

print(f"\nReport saved to: {report_path}")
print(f"Results saved to: {results_path}")

## Conclusions

### Key Findings
1. Model performance ranking:
   - 

2. Efficiency analysis:
   - 

3. Robustness (variance across seeds):
   - 

### Recommendations
- 

### Next Steps
- 
