# 14. MatterGen 개선 실험 (v2 - 개별 실행)

## 목적
기존 MatterGen 구조의 **Ca 부족 문제**를 해결하여 산업폐기물과 공정한 비교를 수행합니다.

---

### 개선 사항
- Ca-rich 구조 선별 (Ca/Si >= 1.0)
- Supercell 확장 (2×2×2)
- 물 분자 수를 구조 크기에 비례하여 조정 (supercell 원자 수 ÷ 3, 최소 8개 · 최대 25개)
- **개별 실행 가능** (구조별로 나눠서 실행)

## 1. 환경 설정

In [None]:
import sys
from pathlib import Path
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import gc
import torch
import random

PROJECT_ROOT = Path.cwd().parent.parent
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from ase.io import read, write
from ase import Atoms, Atom
from ase.build import make_supercell

MATTERGEN_DIR = PROJECT_ROOT / 'data' / 'mattergen'
RESULTS_DIR = PROJECT_ROOT / 'data' / 'results'
TRAJ_DIR = PROJECT_ROOT / 'trajectories'

print(f"Project Root: {PROJECT_ROOT}")
print(f"GPU: {torch.cuda.is_available()}")

## 2. 구조 분석 및 Ca-rich 선별

In [None]:
# Load every MatterGen CIF found under MATTERGEN_DIR/<phase>/ and record its
# Ca / Si / O counts so the structures can be ranked by Ca/Si ratio.
STRUCTURE_COLUMNS = [
    'phase', 'file', 'path', 'formula', 'n_atoms',
    'ca', 'si', 'o', 'ca_si', 'atoms',
]
all_structures = []
skipped_files = []

for phase_dir in sorted(MATTERGEN_DIR.iterdir()):
    if not phase_dir.is_dir():
        continue
    for cif_file in sorted(phase_dir.glob('*.cif')):
        # Only the file read is guarded: an unreadable CIF is reported and
        # skipped instead of being dropped silently, and KeyboardInterrupt
        # is no longer swallowed.
        try:
            atoms = read(cif_file)
        except Exception as exc:
            skipped_files.append(str(cif_file))
            print(f"[WARN] Skipping {cif_file}: {type(exc).__name__}: {exc}")
            continue

        symbols = atoms.get_chemical_symbols()
        ca = symbols.count('Ca')
        si = symbols.count('Si')
        o = symbols.count('O')

        all_structures.append({
            'phase': phase_dir.name,
            'file': cif_file.name,
            'path': str(cif_file),
            'formula': atoms.get_chemical_formula(mode='hill'),
            'n_atoms': len(atoms),
            'ca': ca, 'si': si, 'o': o,
            # Structures without Si get ratio 0 so they sort last.
            'ca_si': ca/si if si > 0 else 0,
            'atoms': atoms
        })

if skipped_files:
    print(f"[WARN] {len(skipped_files)} CIF file(s) could not be read")

# Passing the column list keeps sort_values() working when nothing was loaded.
df = pd.DataFrame(all_structures, columns=STRUCTURE_COLUMNS).sort_values('ca_si', ascending=False)
print(f"Total structures: {len(df)}")
print("\nTop 10 Ca-rich:")
print(df[['phase', 'formula', 'n_atoms', 'ca', 'si', 'ca_si']].head(10).to_string())

In [None]:
# Ca-rich candidates: Ca/Si of at least 1.0, containing Si and two or more O atoms.
ca_rich = df.loc[
    (df['si'] > 0) & (df['o'] >= 2) & (df['ca_si'] >= 1.0)
].copy()
print(f"Ca-rich candidates: {len(ca_rich)}")

# Take the first three rows of the candidate table.
selected = ca_rich.iloc[:3]
print("\nSelected for hydration:")
for i, (formula, ratio) in enumerate(zip(selected['formula'], selected['ca_si'])):
    print(f"  [{i}] {formula} (Ca/Si={ratio:.1f})")

## 3. Supercell 확장

In [None]:
# Transformation matrix handed to make_supercell: doubles each lattice vector.
P = [
    [2, 0, 0],
    [0, 2, 0],
    [0, 0, 2],
]
supercell_structures = []

for _, row in selected.iterrows():
    sc = make_supercell(row['atoms'].copy(), P)
    sc_formula = sc.get_chemical_formula(mode='hill')
    sc_symbols = sc.get_chemical_symbols()

    entry = {
        'original_formula': row['formula'],
        'supercell_formula': sc_formula,
        'n_atoms': len(sc),
        'ca': sc_symbols.count('Ca'),
        'si': sc_symbols.count('Si'),
        # The ratio is carried over from the unit cell rather than recomputed.
        'ca_si': row['ca_si'],
        'atoms': sc,
    }
    supercell_structures.append(entry)
    print(f"{row['formula']} -> {sc_formula} ({len(sc)} atoms)")

print(f"\nCreated {len(supercell_structures)} supercells")

## 4. CHGNet 로딩

In [None]:
from chgnet.model import CHGNet
from chgnet.model.dynamics import CHGNetCalculator

# Load CHGNet (no arguments, so whatever model the library loads by default)
# and wrap it in an ASE calculator on CUDA when available, otherwise on the CPU.
model = CHGNet.load()
if torch.cuda.is_available():
    calc = CHGNetCalculator(model, use_device='cuda')
else:
    calc = CHGNetCalculator(model, use_device='cpu')
print("CHGNet loaded")

## 5. 구조 최적화

In [None]:
from ase.optimize import BFGS

# Relax each supercell with BFGS (at most 200 steps, fmax 0.05) using the
# shared CHGNet calculator, and keep a copy of the relaxed geometry.
optimized_structures = []

for i, s in enumerate(supercell_structures):
    atoms = s['atoms'].copy()
    atoms.calc = calc

    print(f"[{i}] Optimizing {s['supercell_formula']}...")
    BFGS(atoms, logfile=None).run(fmax=0.05, steps=200)

    relaxed = dict(s)
    relaxed['opt_atoms'] = atoms.copy()
    optimized_structures.append(relaxed)

    final_energy = atoms.get_potential_energy()
    print(f"    Done. E = {final_energy:.2f} eV")
    gc.collect()

print("\nOptimization complete!")

## 6. 수화 시스템 생성

In [None]:
def _random_water_offsets(bond=0.96, angle=1.82):
    """Return the two O->H displacement vectors of a randomly oriented water.

    Both vectors have length ``bond`` and enclose exactly ``angle`` radians
    for every orientation.
    """
    # First O-H direction: a normalised Gaussian vector is uniform on the sphere.
    u = np.random.normal(size=3)
    while np.linalg.norm(u) < 1e-8:
        u = np.random.normal(size=3)
    u /= np.linalg.norm(u)

    # Random unit vector perpendicular to u; it fixes the molecular plane.
    v = np.random.normal(size=3)
    v -= np.dot(v, u) * u
    while np.linalg.norm(v) < 1e-8:
        v = np.random.normal(size=3)
        v -= np.dot(v, u) * u
    v /= np.linalg.norm(v)

    return bond * u, bond * (np.cos(angle) * u + np.sin(angle) * v)


def add_water(atoms, n_water, min_dist=2.5, expand=1.5):
    """Insert water molecules at random positions around a structure.

    The input is copied, its cell is enlarged by ``expand`` without moving the
    atoms, and up to ``n_water`` molecules are placed at random points of the
    enlarged cell.

    Args:
        atoms: ASE ``Atoms`` object to hydrate (left unmodified).
        n_water: Number of water molecules to try to insert.
        min_dist: Minimum distance allowed between a new water oxygen and
            every atom already present (same length unit as the positions).
        expand: Factor applied to the cell vectors before insertion.

    Returns:
        Tuple ``(hydrated_atoms, added)``. ``added`` can be smaller than
        ``n_water`` because each molecule gets at most 100 placement trials.
    """
    h = atoms.copy()
    cell = np.array(h.get_cell(), dtype=float) * expand
    h.set_cell(cell, scale_atoms=False)

    # Distances are measured to the nearest periodic image along periodic
    # axes; otherwise a water near one cell face can sit on top of atoms
    # entering through the opposite face. Wrapping fractional differences is
    # the usual approximation and is exact for orthogonal cells.
    pbc = np.asarray(h.get_pbc(), dtype=bool)
    wrap = bool(pbc.any()) and abs(np.linalg.det(cell)) > 1e-12
    inv_cell = np.linalg.inv(cell) if wrap else None

    added = 0
    for _ in range(n_water):
        for _ in range(100):
            pos = np.random.rand(3) @ cell
            if len(h) > 0:
                delta = h.positions - pos
                if wrap:
                    frac = delta @ inv_cell
                    frac[:, pbc] -= np.round(frac[:, pbc])
                    delta = frac @ cell
                if np.min(np.linalg.norm(delta, axis=1)) < min_dist:
                    continue  # too close to an existing atom, try again

            # The old construction gave both hydrogens the same polar angle,
            # so the H-O-H angle collapsed toward zero near the poles. Build
            # the molecule with a fixed H-O-H angle instead.
            oh1, oh2 = _random_water_offsets()
            h.append(Atom('O', position=pos))
            h.append(Atom('H', position=pos + oh1))
            h.append(Atom('H', position=pos + oh2))
            added += 1
            break
    return h, added

# Build one hydrated system per optimized supercell.
hydrated_structures = []

for i, s in enumerate(optimized_structures):
    # One water per three framework atoms, clamped to the range 8..25.
    n_water = min(max(s['n_atoms'] // 3, 8), 25)
    h, added = add_water(s['opt_atoms'], n_water)

    # Record the number actually inserted, which add_water returns.
    record = dict(s, hydrated_atoms=h, n_water=added, total_atoms=len(h))
    hydrated_structures.append(record)
    print(f"[{i}] {s['supercell_formula']}: +{added} water -> {len(h)} atoms")

print("\nHydration systems ready!")

---
## 7. MD 시뮬레이션 (개별 실행)

**아래 셀들을 하나씩 실행하세요** (각각 ~7분 소요)

In [None]:
# [필수] MD 함수 정의 - 먼저 실행!
from ase.md.langevin import Langevin
from ase.io.trajectory import Trajectory
from ase import units

# Length of each MD run in picoseconds; run_md turns it into a step count.
MD_PS = 5.0
# Summary dicts returned by run_md, appended by the per-structure cells below.
hydration_results = []

def run_md(idx):
    """Run Langevin MD at 300 K on one hydrated structure and summarise it.

    The trajectory is written every 10 steps to
    ``TRAJ_DIR/improved_<formula>_hydration.traj``.

    Args:
        idx: Index into ``hydrated_structures``.

    Returns:
        dict with the run name, formulas, Ca/Si data, atom and water counts,
        the potential energy before (``e0``) and after (``e1``) the run, and
        their difference ``dE = e1 - e0``.

    Raises:
        IndexError: If ``idx`` is not a valid index of ``hydrated_structures``.
    """
    s = hydrated_structures[idx]
    atoms = s['hydrated_atoms'].copy()
    atoms.calc = calc
    name = f"improved_{s['original_formula'].replace(' ','_')}"

    print(f"[{idx}] {s['supercell_formula']} ({len(atoms)} atoms)")
    e0 = atoms.get_potential_energy()
    print(f"  Initial: {e0:.2f} eV")

    dyn = Langevin(atoms, 1*units.fs, temperature_K=300, friction=0.01)

    # Create the output directory up front so a missing folder cannot abort
    # the run when the trajectory file is opened.
    TRAJ_DIR.mkdir(parents=True, exist_ok=True)
    traj = Trajectory(str(TRAJ_DIR/f"{name}_hydration.traj"), 'w', atoms)
    try:
        dyn.attach(traj.write, interval=10)

        # 1 fs timestep -> 1000 steps per picosecond. Run in chunks of 100
        # steps so tqdm can show progress, then run any leftover steps.
        n_steps = int(MD_PS * 1000)
        n_chunks, remainder = divmod(n_steps, 100)
        for _ in tqdm(range(n_chunks), desc="  Progress"):
            dyn.run(100)
        if remainder:
            dyn.run(remainder)
    finally:
        # Close the file even if the MD loop raises or is interrupted.
        traj.close()

    e1 = atoms.get_potential_energy()
    dE = e1 - e0
    print(f"  Final: {e1:.2f} eV, dE: {dE:.2f} eV")

    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return {'name': name, 'formula': s['original_formula'], 'supercell': s['supercell_formula'],
            'ca': s['ca'], 'si': s['si'], 'ca_si': s['ca_si'], 'n_atoms': s['n_atoms'],
            'n_water': s['n_water'], 'e0': e0, 'e1': e1, 'dE': dE}

# List the hydrated structures with the index to pass to run_md.
n_structures = len(hydrated_structures)
print(f"Structures to run: {n_structures}")
for i, s in enumerate(hydrated_structures):
    label = s['supercell_formula']
    print(f"  [{i}] {label}")

### 7-1. 구조 0 실행

In [None]:
# Run MD for structure 0 and record its summary.
# If no structure was selected, report that instead of letting run_md fail
# with an IndexError.
if len(hydrated_structures) > 0:
    r0 = run_md(0)
    hydration_results.append(r0)
    print(f"\nSaved. Total: {len(hydration_results)}")
else:
    print(f"[SKIP] Structure 0 is not available ({len(hydrated_structures)} hydrated structures)")

### 7-2. 구조 1 실행

In [None]:
# Run MD for structure 1 and record its summary.
# Fewer than two structures may have been selected, so check the index
# instead of letting run_md fail with an IndexError.
if len(hydrated_structures) > 1:
    r1 = run_md(1)
    hydration_results.append(r1)
    print(f"\nSaved. Total: {len(hydration_results)}")
else:
    print(f"[SKIP] Structure 1 is not available ({len(hydrated_structures)} hydrated structures)")

### 7-3. 구조 2 실행

In [None]:
# Run MD for structure 2 and record its summary.
# Fewer than three structures may have been selected, so check the index
# instead of letting run_md fail with an IndexError.
if len(hydrated_structures) > 2:
    r2 = run_md(2)
    hydration_results.append(r2)
    print(f"\nSaved. Total: {len(hydration_results)}")
else:
    print(f"[SKIP] Structure 2 is not available ({len(hydrated_structures)} hydrated structures)")

### 7-4. 결과 확인

In [None]:
# Show how many of the three MD runs have been recorded, with each run's dE.
n_done = len(hydration_results)
print(f"Completed: {n_done}/3")
print(50 * "=")
for r in hydration_results:
    formula, delta = r['formula'], r['dE']
    print(f"  {formula:15s}: dE = {delta:.2f} eV")

---
## 8. 산업폐기물 비교

In [None]:
# Load the industrial-waste screening results and the names of its top candidates.
with open(RESULTS_DIR / 'pipeline_screening_results.json', encoding='utf-8') as f:
    screening = json.load(f)
with open(RESULTS_DIR / 'top_candidates.json', encoding='utf-8') as f:
    top5 = json.load(f)['top_candidates']

print("Industrial Waste Top 5:")
iw_data = []
for name in top5:
    opt = screening['results'].get(name, {}).get('optimization', {})
    # A missing energy used to default to 0, which produced a fake dE
    # (0, or the raw total energy) that then entered the average.
    # Such entries are now reported and left out.
    if 'final_energy' not in opt or 'initial_energy' not in opt:
        print(f"  {name}: [WARN] optimization energies missing - excluded from average")
        continue
    dE = opt['final_energy'] - opt['initial_energy']
    iw_data.append({'name': name, 'dE': dE})
    print(f"  {name}: dE = {dE:.2f} eV")

# NOTE(review): the industrial-waste dE is taken from the 'optimization'
# record, while the MatterGen dE is the energy change over an MD run.
# Confirm the two quantities are meant to be compared directly.
iw_avg = np.mean([d['dE'] for d in iw_data]) if iw_data else float('nan')
# 0 is a placeholder when no MD result has been recorded yet.
mg_avg = np.mean([r['dE'] for r in hydration_results]) if hydration_results else 0

print(f"\n{'='*50}")
print(f"Industrial Waste avg dE: {iw_avg:.2f} eV")
print(f"Improved MatterGen avg dE: {mg_avg:.2f} eV")
print(f"Previous MatterGen avg dE: -66.66 eV")

## 9. 결과 저장

In [None]:
def _to_builtin(value):
    """Convert NumPy scalars/arrays to plain Python objects for json.dump."""
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


# Gather the MD summaries and the averages into one JSON document.
results = {
    'experiment': 'MatterGen Improved (Ca-rich + Supercell)',
    'results': hydration_results,
    'comparison': {
        'iw_avg': float(iw_avg),
        'mg_improved_avg': float(mg_avg),
        'mg_previous_avg': -66.66
    }
}

# The MD summaries may hold NumPy scalars (the averages above are already
# wrapped in float() for that reason), so give json a converter for them.
# The encoding is set explicitly to match the utf-8 used when reading.
with open(RESULTS_DIR / 'mattergen_improved_hydration.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=2, default=_to_builtin)
print("Results saved!")

## 10. 결론

In [None]:
# Print the final comparison and a verdict on the improved MatterGen structures.
print("=" * 60)
print("EXPERIMENT SUMMARY")
print("=" * 60)
print(f"\nIndustrial Waste avg: {iw_avg:.2f} eV")
print(f"MatterGen (previous): -66.66 eV")
print(f"MatterGen (improved): {mg_avg:.2f} eV")

if not hydration_results:
    # mg_avg is only a placeholder in this case; do not grade it.
    print("\n[NO DATA] No MatterGen MD results recorded - run section 7 first")
elif not iw_avg < 0:
    # The 0.8 / 0.5 thresholds below assume a negative industrial-waste
    # average; for zero, positive or NaN values they would be meaningless.
    print("\n[UNDEFINED] Industrial-waste average is not negative - thresholds do not apply")
elif mg_avg <= iw_avg * 0.8:
    # MatterGen reaches at least 80 % of the industrial-waste energy change.
    print("\n[SUCCESS] MatterGen now competitive!")
    print("-> Proceed to 12_Final_Figures")
elif mg_avg <= iw_avg * 0.5:
    # Between 50 % and 80 % of the industrial-waste energy change.
    print("\n[PARTIAL] Improved but gap remains")
else:
    print("\n[LIMITED] Consider re-generating structures")