In [None]:
cd ~/MultiFidelity-ProcessOpt/Perovskites/

In [1]:
import torch

if torch.mps.is_available():
    print("CUDA(mps) 사용 가능합니다.")
    device = torch.device("mps")
else:
    print("CUDA(GPU) 사용 불가, CPU 사용합니다.")
    device = torch.device("cpu")


AttributeError: module 'torch.mps' has no attribute 'is_available'

In [None]:
ls

In [2]:
#!/usr/bin/env python

import json
import pickle
import numpy as np
import pandas as pd
from copy import deepcopy

from olympus.datasets import Dataset
from olympus.objects import (
	ParameterContinuous,
	ParameterDiscrete, 
	ParameterCategorical, 
	ParameterVector
)
from olympus.campaigns import ParameterSpace, Campaign

# from atlas.planners.multi_fidelity.planner import MultiFidelityPlanner



AttributeError: module 'matplotlib.pyplot' has no attribute 'register_cmap'

In [None]:
ls

In [3]:

NUM_RUNS = 1
# BUDGET = 30
COST_BUDGET = 50 # 200.
NUM_INIT_DESIGN = 10
NUM_CHEAP = 8

# lookup table
# organic --> cation --> anion --> bandgap_hse06/bandgap_gga
LOOKUP = pickle.load(open('../0.Data/lookup_table.pkl', 'rb'))
# print(lookup.keys())
# print(lookup['Ethylammonium']['Ge']['F'].keys())



FileNotFoundError: [Errno 2] No such file or directory: '../0.Data/lookup_table.pkl'

In [5]:
def measure_from_label(label_arr, s, label_maps, LOOKUP):
    """
    label_arr: [organic_label, cation_label, anion_label]
    s: 0.1 or 1.0 (fidelity)
    label_maps: {'organic': {...}, 'cation': {...}, 'anion': {...}}
    LOOKUP: 전체 lookup table
    """
    # 1. label_maps 역변환 사전 생성
    reverse_maps = {
        "organic": {v: k for k, v in label_maps["organic"].items()},
        "cation": {v: k for k, v in label_maps["cation"].items()},
        "anion": {v: k for k, v in label_maps["anion"].items()},
    }
    # 2. label에서 원래 카테고리명으로 변환
    organic = reverse_maps["organic"][int(label_arr[0])]
    cation = reverse_maps["cation"][int(label_arr[1])]
    anion = reverse_maps["anion"][int(label_arr[2])]

    # 3. 기존 measure 함수와 동일
    if s == 1.0:
        measurement = np.amin(
            LOOKUP[organic.capitalize()][cation][anion]['bandgap_hse06']
        )
    elif s == 0.1:
        measurement = np.amin(
            LOOKUP[organic.capitalize()][cation][anion]['bandgap_gga']
        )
    else:
        raise ValueError("s(fidelity)는 0.1 또는 1.0만 가능합니다.")
    return measurement

def get_min_hse06_bandgap(param_space):
	organic_options = [o.capitalize() for o in param_space[1].options]
	cation_options = [o.capitalize() for o in param_space[2].options]
	anion_options = [o.capitalize() for o in param_space[3].options]

	hse06_bandgaps = []
	for organic_option in organic_options:
		for cation_option in cation_options:
			for anion_option in anion_options:
				hse06_bandgaps.append(
					np.amin(
						LOOKUP[organic_option][cation_option][anion_option]['bandgap_hse06']
					)
				)
	min_hse06_bandgap = np.amin(hse06_bandgaps)
	return min_hse06_bandgap

def compute_cost(params):
	costs = params[:,0].astype(float)
	return np.sum(costs)



In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

class TransferLearningDNN:
    def __init__(self, input_dim, hidden_dim=64, device='cpu'):
        self.input_dim = input_dim
        self.device = device
        self.hidden_dim = hidden_dim
        self.pretrain_losses = []
        self.finetune_losses = []

        # feature extractor (hidden layers)
        self.feature_net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
        ).to(device)
        # 임시 선형 출력층 (bias=False는 선택)
        self.out_layer = nn.Linear(hidden_dim, 1, bias=False).to(device)

        # 최종 전체 모델
        self.model = nn.Sequential(self.feature_net, self.out_layer)

        self.feature_net = self.feature_net.float()
        self.out_layer = self.out_layer.float()
        self.model = self.model.float()

    def pretrain(self, X_low, y_low, epochs=50, lr=1e-3, verbose=False):
        # low-fidelity 데이터로 선학습
        self.pretrain_losses = []
        X_low = np.asarray(X_low, dtype=np.float32)  # ★ float32로 강제 변환
        y_low = np.asarray(y_low, dtype=np.float32).flatten()  # ★ float32로 강제 변환
        X_tensor = torch.tensor(X_low, dtype=torch.float32).to(self.device)
        y_tensor = torch.tensor(y_low, dtype=torch.float32).view(-1, 1).to(self.device)
        optimizer = optim.Adam(list(self.feature_net.parameters()) + list(self.out_layer.parameters()), lr=lr)
        loss_fn = nn.MSELoss()

        self.model.train()
        for epoch in range(epochs):
            optimizer.zero_grad()
            features = self.feature_net(X_tensor)
            pred = self.out_layer(features)
            loss = loss_fn(pred, y_tensor)
            loss.backward()
            optimizer.step()
            self.pretrain_losses.append(loss.item())
            if verbose and (epoch+1) % 50 == 0:
                print(f'[Pretrain] Epoch {epoch+1}: Loss {loss.item():.4f}')

    def finetune(self, X_high, y_high, epochs=50, lr=1e-4, verbose=False):
        self.finetune_losses = []
        X_high = np.asarray(X_high, dtype=np.float32)  # ★ float32로 강제 변환
        y_high = np.asarray(y_high, dtype=np.float32).flatten()  # ★ float32로 강제 변환
        X_tensor = torch.tensor(X_high, dtype=torch.float32).to(self.device)
        y_tensor = torch.tensor(y_high, dtype=torch.float32).view(-1, 1).to(self.device)
        optimizer = optim.Adam(list(self.feature_net.parameters()) + list(self.out_layer.parameters()), lr=lr)
        loss_fn = nn.MSELoss()
        self.model.train()
        for epoch in range(epochs):
            optimizer.zero_grad()
            features = self.feature_net(X_tensor)
            pred = self.out_layer(features)
            loss = loss_fn(pred, y_tensor)
            loss.backward()
            optimizer.step()
            self.finetune_losses.append(loss.item())
            if verbose and (epoch+1) % 20 == 0:
                print(f'[Finetune] Epoch {epoch+1}: Loss {loss.item():.4f}')

    def predict(self, X):
        X = np.asarray(X, dtype=np.float32)  # ★ float32로 강제 변환
        X_tensor = torch.tensor(X, dtype=torch.float32).to(self.device)
        self.model.eval()
        with torch.no_grad():
            features = self.feature_net(X_tensor)
            y_pred = self.out_layer(features).cpu().numpy().flatten()
        return y_pred

    def extract_features(self, X):
        X = np.asarray(X, dtype=np.float32)  # ★ float32로 강제 변환
        X_tensor = torch.tensor(X, dtype=torch.float32).to(self.device)
        self.model.eval()
        with torch.no_grad():
            features = self.feature_net(X_tensor).cpu().numpy()
        return features


    def get_fitness_func(self, y_best=None, xi=0.01):
        def fitness(x):
            mu = self.predict(np.array([x]))[0]
            if y_best is not None:
                return -(y_best - mu - xi)  # maximize EI → minimize -EI
            else:
                return mu
        return fitness


In [7]:
param_space = {
    "organic": [
        "ethylammonium", "propylammonium", "butylammonium", "isopropylammonium",
        "dimethylammonium", "acetamidinium", "methylammonium", "guanidinium",
        "hydroxylammonium", "formamidinium", "tetramethylammonium", "hydrazinium",
        "ammonium", "trimethylammonium", "azetidinium", "imidazolium"
    ],
    "cation": ["Ge", "Sn", "Pb"],
    "anion": ["F", "Cl", "Br", "I"]
}
label_maps = {
    key: {val: idx for idx, val in enumerate(vals, 1)}   # 1부터 시작하고 싶으면 enumerate(..., 1)
    for key, vals in param_space.items()
}
label_maps

{'organic': {'ethylammonium': 1,
  'propylammonium': 2,
  'butylammonium': 3,
  'isopropylammonium': 4,
  'dimethylammonium': 5,
  'acetamidinium': 6,
  'methylammonium': 7,
  'guanidinium': 8,
  'hydroxylammonium': 9,
  'formamidinium': 10,
  'tetramethylammonium': 11,
  'hydrazinium': 12,
  'ammonium': 13,
  'trimethylammonium': 14,
  'azetidinium': 15,
  'imidazolium': 16},
 'cation': {'Ge': 1, 'Sn': 2, 'Pb': 3},
 'anion': {'F': 1, 'Cl': 2, 'Br': 3, 'I': 4}}

In [8]:
import numpy as np

def sample_param_space(param_space, n_samples, random_state=None):
    rng = np.random.default_rng(random_state)
    param_names = list(param_space.keys())
    samples = []
    for _ in range(n_samples):
        sample = {}
        for key, opts in param_space.items():
            if isinstance(opts, (list, tuple)) and len(opts) > 0 and isinstance(opts[0], (str, int)):  # 범주형
                sample[key] = rng.choice(opts)
            elif isinstance(opts, tuple) and len(opts) == 2 and all(isinstance(x, (int, float)) for x in opts):  # 연속형
                low, high = opts
                sample[key] = float(rng.uniform(low, high))
            else:
                raise ValueError(f"Unknown parameter type for {key}: {opts}")
        samples.append(sample)
    return samples


In [9]:
# NUM_INIT_DESIGN = 10  # 원하는 개수로 지정

# param_space는 이전 단계에서 정의한 딕셔너리 그대로 사용

init_samples = sample_param_space(param_space, NUM_INIT_DESIGN, random_state=42)
for s in init_samples:
    print(s)


{'organic': 'propylammonium', 'cation': 'Pb', 'anion': 'Br'}
{'organic': 'guanidinium', 'cation': 'Sn', 'anion': 'I'}
{'organic': 'propylammonium', 'cation': 'Pb', 'anion': 'F'}
{'organic': 'propylammonium', 'cation': 'Sn', 'anion': 'I'}
{'organic': 'hydrazinium', 'cation': 'Pb', 'anion': 'Br'}
{'organic': 'ammonium', 'cation': 'Sn', 'anion': 'F'}
{'organic': 'trimethylammonium', 'cation': 'Sn', 'anion': 'Br'}
{'organic': 'acetamidinium', 'cation': 'Ge', 'anion': 'I'}
{'organic': 'ammonium', 'cation': 'Sn', 'anion': 'Cl'}
{'organic': 'trimethylammonium', 'cation': 'Sn', 'anion': 'Cl'}


In [10]:
def assign_fidelities(n_samples, high_ratio=0.11111, random_state=None):
    """
    전체 n_samples 중 high_ratio 만큼만 high-fidelity(1.0), 나머지는 low-fidelity(0.1)로 할당
    """
    rng = np.random.default_rng(random_state)
    n_high = max(1, int(round(n_samples * high_ratio)))
    n_low = n_samples - n_high
    fids = [1.0]*n_high + [0.1]*n_low
    rng.shuffle(fids)
    return fids

measurements = []
for params, s in zip(init_samples, assign_fidelities(NUM_INIT_DESIGN, high_ratio=0.11111, random_state=42)):
    if s == 1.0:
        measurement = np.amin(
            LOOKUP[params['organic'].capitalize()][params['cation']][params['anion']]['bandgap_hse06']
        )
    else:
        measurement = np.amin(
            LOOKUP[params['organic'].capitalize()][params['cation']][params['anion']]['bandgap_gga']
        )
    # 관측값 저장
    measurements.append({"params": params, "s": s, "measurement": measurement})
measurements

[{'params': {'organic': 'propylammonium', 'cation': 'Pb', 'anion': 'Br'},
  's': 0.1,
  'measurement': 2.1145},
 {'params': {'organic': 'guanidinium', 'cation': 'Sn', 'anion': 'I'},
  's': 0.1,
  'measurement': 1.0856},
 {'params': {'organic': 'propylammonium', 'cation': 'Pb', 'anion': 'F'},
  's': 1.0,
  'measurement': 5.2155},
 {'params': {'organic': 'propylammonium', 'cation': 'Sn', 'anion': 'I'},
  's': 0.1,
  'measurement': 1.3516},
 {'params': {'organic': 'hydrazinium', 'cation': 'Pb', 'anion': 'Br'},
  's': 0.1,
  'measurement': 2.3859},
 {'params': {'organic': 'ammonium', 'cation': 'Sn', 'anion': 'F'},
  's': 0.1,
  'measurement': 3.8068},
 {'params': {'organic': 'trimethylammonium', 'cation': 'Sn', 'anion': 'Br'},
  's': 0.1,
  'measurement': 1.7405},
 {'params': {'organic': 'acetamidinium', 'cation': 'Ge', 'anion': 'I'},
  's': 0.1,
  'measurement': 1.6986},
 {'params': {'organic': 'ammonium', 'cation': 'Sn', 'anion': 'Cl'},
  's': 0.1,
  'measurement': 1.8983},
 {'params': {

In [11]:
assign_fidelities(NUM_INIT_DESIGN, high_ratio=0.11111, random_state=42)

[0.1, 0.1, 1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]

In [12]:

# 1. 라벨맵 생성
label_maps = {
    key: {val: idx for idx, val in enumerate(vals, 1)}   # 1부터 시작
    for key, vals in param_space.items()
}

# 2. df 변환 및 라벨 적용
df = pd.DataFrame([
    {**obs['params'], 's': obs['s'], 'measurement': obs['measurement']}
    for obs in measurements
])
for col in ['organic', 'cation', 'anion']:
    df[col + '_label'] = df[col].map(label_maps[col])

# 3. 모델 입력/출력 분리
ini_X = df[['organic_label', 'cation_label', 'anion_label', 's']].values
ini_y = df['measurement'].values

print(ini_X[:5])
print(ini_y[:5])

# s 값에 따라 분할
ini_X_low = ini_X[df['s'] == 0.1]
ini_y_low = ini_y[df['s'] == 0.1]

ini_X_high = ini_X[df['s'] == 1.0]
ini_y_high = ini_y[df['s'] == 1.0]


[[ 2.   3.   3.   0.1]
 [ 8.   2.   4.   0.1]
 [ 2.   3.   1.   1. ]
 [ 2.   2.   4.   0.1]
 [12.   3.   3.   0.1]]
[2.1145 1.0856 5.2155 1.3516 2.3859]


In [13]:

# ini_X_low, ini_X_high에서 s를 제외한 3개 컬럼만 남기기
ini_X_low = ini_X_low[:, :3]   # (N_low, 3)
ini_X_high = ini_X_high[:, :3] # (N_high, 3)
ini_X_low = np.asarray(ini_X_low, dtype=np.float32)
ini_X_high = np.asarray(ini_X_high, dtype=np.float32)
ini_y_low = np.asarray(ini_y_low, dtype=np.float32).flatten()
ini_y_high = np.asarray(ini_y_high, dtype=np.float32).flatten()

# 모델 학습
model = TransferLearningDNN(input_dim=ini_X_low.shape[1], hidden_dim=64, device='cpu')

if len(ini_X_low) > 0:
    model.pretrain(ini_X_low, ini_y_low, epochs=100, lr=1e-3, verbose=True)

if len(ini_X_high) > 0:
    model.finetune(ini_X_high, ini_y_high, epochs=50, lr=1e-3, verbose=True)


[Pretrain] Epoch 50: Loss 0.3992
[Pretrain] Epoch 100: Loss 0.2799
[Finetune] Epoch 20: Loss 2.7000
[Finetune] Epoch 40: Loss 0.0963


In [14]:
def append_measurement_to_data(existing_X_low, existing_y_low,
                               existing_X_high, existing_y_high,
                               label_arr, s, label_maps, LOOKUP):
    """
    label_arr: [organic_label, cation_label, anion_label]
    s: 0.1 or 1.0 (fidelity)
    기존 데이터에 label_arr을 s에 따라 붙여주는 함수
    """
    # 1. 측정값 구하기
    measurement = measure_from_label(label_arr, s, label_maps, LOOKUP)
    
    # 2. 데이터 추가 (s는 X에 넣지 않으므로 3개만!)
    label_arr = np.array(label_arr, dtype=np.float32).reshape(1, -1)
    measurement = np.array([measurement], dtype=np.float32)

    if s == 0.1:
        existing_X_low = np.vstack([existing_X_low, label_arr]) if existing_X_low.size else label_arr
        existing_y_low = np.concatenate([existing_y_low, measurement]) if existing_y_low.size else measurement
    elif s == 1.0:
        existing_X_high = np.vstack([existing_X_high, label_arr]) if existing_X_high.size else label_arr
        existing_y_high = np.concatenate([existing_y_high, measurement]) if existing_y_high.size else measurement
    else:
        raise ValueError("s(fidelity)는 0.1 또는 1.0만 가능합니다.")

    return existing_X_low, existing_y_low, existing_X_high, existing_y_high

# # 추천 label이 [2, 3, 3], s=1.0이라고 가정
# ini_X_low, ini_y_low, ini_X_high, ini_y_high = append_measurement_to_data(
#     ini_X_low, ini_y_low, ini_X_high, ini_y_high,
#     next_x_label, 1.0, label_maps, LOOKUP
# )

## DNN4GP

## 정지조건추가

In [15]:
class BayesianLinearRegression:
    def __init__(self, alpha=1.0, beta=25.0):
        self.alpha = alpha
        self.beta = beta
    def fit(self, Phi, y):
        I = np.eye(Phi.shape[1])
        self.K = self.beta * Phi.T @ Phi + self.alpha * I
        self.K_inv = np.linalg.inv(self.K)
        self.m = self.beta * self.K_inv @ Phi.T @ y
    def predict(self, phi_x):
        mean = phi_x @ self.m
        var = phi_x @ self.K_inv @ phi_x.T + 1 / self.beta
        return mean, var

In [16]:
import pickle
import itertools
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.metrics import r2_score, mean_absolute_error

LOOKUP = pickle.load(open('../0.Data/lookup_table.pkl', 'rb'))

organic_options = ['ethylammonium', 'propylammonium', 'butylammonium', 'isopropylammonium', 'dimethylammonium', 'acetamidinium', 'methylammonium', 'guanidinium', 'hydroxylammonium', 'formamidinium', 'tetramethylammonium', 'hydrazinium', 'ammonium', 'trimethylammonium', 'azetidinium', 'imidazolium']
cation_options = ['Ge', 'Sn', 'Pb']
anion_options = ['F', 'Cl', 'Br', 'I']

all_results = []
for i, organic in enumerate(organic_options, 1):
    for j, cation in enumerate(cation_options, 1):
        for k, anion in enumerate(anion_options, 1):
            try:
                bandgap = np.amin(
                    LOOKUP[organic.capitalize()][cation][anion]['bandgap_hse06']
                )
                combo_label = f"{i},{j},{k}"
                all_results.append({
                    'combo': combo_label,
                    'bandgap_hse06': bandgap
                })
            except Exception as e:
                print(f"Skip: {organic}-{cation}-{anion} ({e})")
                continue

ori_gga_data = pd.DataFrame(all_results)
ori_gga_data


Unnamed: 0,combo,bandgap_hse06
0,111,5.3704
1,112,3.1393
2,113,2.7138
3,114,2.2338
4,121,3.9789
...,...,...
187,1624,1.9179
188,1631,4.4501
189,1632,3.6158
190,1633,2.8641


In [17]:
min_idx = ori_gga_data['bandgap_hse06'].idxmin()
min_combo = ori_gga_data.loc[min_idx, 'combo']
min_bandgap = ori_gga_data.loc[min_idx, 'bandgap_hse06']
print(f"Lowest bandgap_hse06: {min_bandgap:.4f} (combo: {min_combo})")
min_combo

Lowest bandgap_hse06: 1.5249 (combo: 12,2,4)


'12,2,4'

In [18]:
from scipy.stats import norm

def penalized_expected_improvement(mu, sigma, y_best, xi=0.01, penalty_scale=1.0, penalty_width=0.1):
    # 기본 EI 계산
    sigma = np.maximum(sigma, 1e-8)
    z = (y_best - mu - xi) / sigma
    ei = (y_best - mu - xi) * norm.cdf(z) + sigma * norm.pdf(z)
    # y_best 근처면 penalty 부여
    penalty = np.exp(-((mu - y_best)**2) / (2 * penalty_width**2))
    penalized_ei = ei - penalty_scale * penalty
    return penalized_ei

def expected_improvement(mu, sigma, y_best, xi=0.01):
    sigma = np.maximum(sigma, 1e-8)
    z = (y_best - mu - xi) / sigma
    ei = (y_best - mu - xi) * norm.cdf(z) + sigma * norm.pdf(z)
    return ei
        

In [None]:

import time

import matplotlib.pyplot as plt
init_samples = sample_param_space(param_space, NUM_INIT_DESIGN, random_state=42)
init_fids = assign_fidelities(NUM_INIT_DESIGN, high_ratio=0.2, random_state=42)
measurements = []
for params, s in zip(init_samples, init_fids):
    measurement = measure_from_label(
        [label_maps['organic'][params['organic']],
         label_maps['cation'][params['cation']],
         label_maps['anion'][params['anion']]],
        s, label_maps, LOOKUP
    )
    measurements.append({"params": params, "s": s, "measurement": measurement})

# 데이터프레임 변환 및 라벨 적용
df = pd.DataFrame([
    {**obs['params'], 's': obs['s'], 'measurement': obs['measurement']}
    for obs in measurements
])
for col in ['organic', 'cation', 'anion']:
    df[col + '_label'] = df[col].map(label_maps[col])

ini_X = df[['organic_label', 'cation_label', 'anion_label', 's']].values
ini_y = df['measurement'].values

# s 값에 따라 분할 (X에서 s를 제거)
ini_X_low = ini_X[df['s'] == 0.1][:, :3]
ini_y_low = ini_y[df['s'] == 0.1]
ini_X_high = ini_X[df['s'] == 1.0][:, :3]
ini_y_high = ini_y[df['s'] == 1.0]

param_ranges = [
    range(1, 17),  # organic_label: 1~16
    range(1, 4),   # cation_label: 1~3
    range(1, 5),   # anion_label: 1~4
]

min_hse06_bandgap = 1.5249
COST_BUDGET = 50


timing_data = []
cost_data = []
best_so_far_curve = []
total_cost = 0.0
best_so_far = np.inf
# N_ITER = 10 # 원하는 반복 횟수
iter_ = 0
all_results = []

cumulative_cost_list = []
for _ in range(100):
    s= 0.1
    while total_cost < COST_BUDGET:
    # for iter_ in range(N_ITER):
        iter_ += 1
        print(f"\n==== Iteration {iter_} ====")
        iter_start = time.time()   # ★★★ 타이밍 시작
        
        model = TransferLearningDNN(input_dim=3, hidden_dim=64, device='cpu')
        if len(ini_X_low) > 0:
            model.pretrain(ini_X_low, ini_y_low, epochs=300, lr=1e-3, verbose=False)
        if len(ini_X_high) > 0:
            model.finetune(ini_X_high, ini_y_high, epochs=150, lr=1e-3, verbose=False)
        
        # BLR 학습
        X_all = np.vstack([ini_X_low, ini_X_high])
        y_all = np.concatenate([ini_y_low, ini_y_high])
        features_all = model.extract_features(X_all)
        blr = BayesianLinearRegression(alpha=1.0, beta=25.0)
        blr.fit(features_all, y_all)
        
        # (2) 전체 조합 생성 (192개)
        all_combinations = list(itertools.product(*param_ranges))  # shape=(192, 3)
        X_grid = np.array(all_combinations, dtype=np.float32)
        features_grid = model.extract_features(X_grid)


        # (3) BLR 예측 및 EI 계산
        y_pred = []
        y_std = []
        for phi in features_grid:
            mu, var = blr.predict(phi)
            y_pred.append(mu)
            y_std.append(np.sqrt(var))
        y_pred = np.array(y_pred)
        y_std = np.array(y_std)
        
        y_best = np.min(y_all)
        
        # Calculate EI for all points
        ei = expected_improvement(y_pred, y_std, y_best)
        
        # Set EI to 0 for already explored points
        if s == 0.1:
            train_combo_set = set(tuple(map(int, row)) for row in np.vstack([ini_X_low, ini_X_high]))
            for i, combo in enumerate(X_grid.astype(int)):
                if tuple(combo) in train_combo_set:
                    ei[i] = 0.0
        elif s == 1:
            train_combo_set = set(tuple(map(int, row)) for row in ini_X_high)
            print(train_combo_set)
            for i, combo in enumerate(X_grid.astype(int)):
                if tuple(combo) in train_combo_set:
                    ei[i] = 0.0
                    
        # Find best unexplored point
        best_idx = np.argmax(ei)
        next_x_label = list(X_grid[best_idx].astype(int))
        print("EI-based recommended label:", next_x_label)

        ori_gga_data['y_pred'] = y_pred
        ori_gga_data['y_std'] = y_std
        
        # 학습에 사용된 조합 set 만들기 (int 변환!)
        train_combo_set = set(tuple(map(int, row)) for row in np.vstack([ini_X_low, ini_X_high]))

        # 전체 192개 조합 중 학습에 쓰인 인덱스 찾기
        train_indices = [i for i, combo in enumerate(X_grid.astype(int)) if tuple(combo) in train_combo_set]


        # ori_gga_data, y_pred, y_std, ei 등 위에서처럼 준비됐다고 가정

        fig, ax1 = plt.subplots(figsize=(18, 7))
        x_idx = range(len(ori_gga_data))

        # True / 예측 / Uncertainty
        ax1.scatter(x_idx, ori_gga_data['bandgap_hse06'], s=40, label='True bandgap', color='royalblue')
        ax1.scatter(x_idx, ori_gga_data['y_pred'], s=40, label='BLR prediction', color='orange', alpha=0.7)
        ax1.fill_between(
            x_idx,
            ori_gga_data['y_pred'] - ori_gga_data['y_std'],
            ori_gga_data['y_pred'] + ori_gga_data['y_std'],
            color='orange', alpha=0.2, label='Pred. std. dev.'
        )
        # 학습 포인트(누적) 검정색으로!
        # === Training points (low/high) ===
        train_indices_low = [i for i, combo in enumerate(X_grid.astype(int)) if tuple(combo) in set(tuple(map(int, row)) for row in ini_X_low)]
        train_indices_high = [i for i, combo in enumerate(X_grid.astype(int)) if tuple(combo) in set(tuple(map(int, row)) for row in ini_X_high)]

        # low fidelity (s=0.1): 파란색 삼각형
        ax1.scatter(
            train_indices_low, ori_gga_data['bandgap_hse06'].iloc[train_indices_low],
            s=110, color='black', label='Training (low, s=0.1)', zorder=10, marker='^'
        )
        # high fidelity (s=1.0): 빨간색 동그라미
        ax1.scatter(
            train_indices_high, ori_gga_data['bandgap_hse06'].iloc[train_indices_high],
            s=110, color='crimson', label='Training (high, s=1.0)', zorder=10, marker='^'
        )

        # === Global optimal 별표 ===
        optimal_combo = '12,2,4'
        optimal_idx = ori_gga_data.index[ori_gga_data['combo'] == optimal_combo].tolist()[0]
        optimal_bandgap = ori_gga_data.loc[optimal_idx, 'bandgap_hse06']
        ax1.scatter(
            optimal_idx, optimal_bandgap,
            marker='*', color='purple', s=250, edgecolor='black',
            label='Global optimum', zorder=20
        )

        ax1.set_ylabel('Bandgap (hse06)', color='navy')
        ax1.set_xlabel('Combinations (organic, cation, anion)')
        ax1.set_xticks(x_idx)
        ax1.set_xticklabels(ori_gga_data['combo'], rotation=90, fontsize=7)

        # 제목 강조: s==1일 때 색상 강조
        if (iter_ % 8 == 0):
            ax1.set_title(f'True Bandgap, Prediction, Uncertainty, and EI\niter: {iter_}',
                          color='crimson', fontsize=18, fontweight='bold', backgroundcolor='#ffe6e6')
        else:
            ax1.set_title(f'True Bandgap, Prediction, Uncertainty, and EI\niter: {iter_}')
        ax1.tick_params(axis='y', labelcolor='navy')

        # EI 오른쪽축
        ax2 = ax1.twinx()
        ax2.plot(x_idx, ei, marker='o', color='forestgreen', label='EI', linewidth=2)
        ax2.scatter(best_idx, ei[best_idx], color='red', s=120, zorder=15, label='Recommended (max EI)')
        ax2.set_ylabel('Expected Improvement (EI)', color='forestgreen')
        ax2.tick_params(axis='y', labelcolor='forestgreen')

        # 범례
        h1, l1 = ax1.get_legend_handles_labels()
        h2, l2 = ax2.get_legend_handles_labels()
        ax1.legend(h1+h2, l1+l2, loc='upper right')

        plt.xlim(-1, len(ori_gga_data))
        plt.tight_layout()
        plt.show()


        # 실제값과 예측값
        y_true = ori_gga_data['bandgap_hse06'].values
        y_pred = ori_gga_data['y_pred'].values

        # R², MAE 계산
        r2 = r2_score(y_true, y_pred)
        mae = mean_absolute_error(y_true, y_pred)

        print(f"R² score: {r2:.4f}")
        print(f"MAE: {mae:.4f}")
        
        y_true = ori_gga_data['bandgap_hse06'].values
        y_pred = ori_gga_data['y_pred'].values

        # 2. 학습 데이터 인덱스
        train_indices = set([i for i, combo in enumerate(X_grid.astype(int)) if tuple(combo) in train_combo_set])
        all_indices = set(range(len(y_true)))
        non_train_indices = all_indices - train_indices

        # 3. 산점도
        plt.figure(figsize=(6, 6))
        # (1) 비학습 데이터 (연한색)
        plt.scatter(
            y_true[list(non_train_indices)], y_pred[list(non_train_indices)],
            alpha=0.4, s=40, color='grey', label='Unmeasured (candidates)'
        )
        # (2) 학습 데이터 (진한색)
        plt.scatter(
            y_true[list(train_indices)], y_pred[list(train_indices)],
            alpha=0.9, s=80, color='black', label='Training points', edgecolor='w'
        )
        # (3) 기준선
        plt.plot([min(y_true), max(y_true)], [min(y_true), max(y_true)], 'r--', label='Ideal: y=x')

        plt.xlabel('Actual value')
        plt.ylabel('Predicted value')
        plt.title(f'Actual vs. Predicted\nR²: {r2:.3f}, MAE: {mae:.3f}')
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        plt.show()



        # (4) Fidelity 할당 (예시: 8:1)
        s = 0.1 if (iter_ % 8 != 0) else 1.0
        # (5) 측정 및 데이터 추가
        measurement = measure_from_label(next_x_label, s, label_maps, LOOKUP)
        ini_X_low, ini_y_low, ini_X_high, ini_y_high = append_measurement_to_data(
            ini_X_low, ini_y_low, ini_X_high, ini_y_high,
            next_x_label, s, label_maps, LOOKUP
        )
        print(f"Measurement: {measurement:.4f} (fidelity={s})")
        
        iter_end = time.time()
        time_taken = iter_end - iter_start
        timing_data.append([0, iter_, time_taken])
        total_cost += s
        cost_data.append([0, iter_, total_cost])
        
        # best-so-far 기록
        if s == 1:
            if measurement < best_so_far:
                best_so_far = measurement
        best_so_far_curve.append([0, iter_, s, best_so_far])
        print(f"Cumulative cost: {total_cost}, best_so_far: {best_so_far}")
        if s == 1.0 and np.isclose(measurement, min_hse06_bandgap, atol=1e-6):
            print('found the min hse06 bandgap!')
            break
    break
    cumulative_cost_list.append(total_cost)


# 전체 타이밍 데이터 저장
# timing_df = pd.DataFrame(timing_data, columns=['run_ix', 'iter', 'time_taken'])
# timing_df.to_csv('TL_timing_results.csv', index=False)

# # 누적 실험비용 저장
# cost_df = pd.DataFrame(cost_data, columns=['run_ix', 'iter', 'cumulative_cost'])
# cost_df.to_csv('TL_cumulative_cost.csv', index=False)

# # best-so-far curve 저장 (regret curve)
# best_so_far_df = pd.DataFrame(best_so_far_curve, columns=['run_ix', 'iter', 's', 'best_so_far'])
# best_so_far_df.to_csv('TL_best_so_far_curve.csv', index=False)

# df_results = pd.DataFrame(all_results)
# df_results.to_csv("TL_all_iter_results.csv", index=False)

In [26]:
def train_model(ini_X_low, ini_y_low, ini_X_high, ini_y_high, device='cpu'):
    model = TransferLearningDNN(input_dim=3, hidden_dim=64, device=device)
    if len(ini_X_low) > 0:
        model.pretrain(ini_X_low, ini_y_low, epochs=300, lr=1e-3, verbose=False)
    if len(ini_X_high) > 0:
        model.finetune(ini_X_high, ini_y_high, epochs=150, lr=1e-3, verbose=False)
    return model

def fit_blr(model, ini_X_low, ini_X_high, ini_y_low, ini_y_high):
    X_all = np.vstack([ini_X_low, ini_X_high])
    y_all = np.concatenate([ini_y_low, ini_y_high])
    features_all = model.extract_features(X_all)
    blr = BayesianLinearRegression(alpha=1.0, beta=25.0)
    blr.fit(features_all, y_all)
    return blr, X_all, y_all

def recommend_next(model, blr, param_ranges, ini_X_low, ini_X_high, ini_y_low, ini_y_high, s):
    # 1. 모든 가능한 조합 생성
    all_combinations = list(itertools.product(*param_ranges))
    X_grid = np.array(all_combinations, dtype=np.float32)
    
    # 2. DNN으로 feature 추출
    features_grid = model.extract_features(X_grid)
    
    # 3. BLR로 예측값과 불확실성 계산
    y_pred, y_std = [], []
    for phi in features_grid:
        mu, var = blr.predict(phi)
        y_pred.append(mu)
        y_std.append(np.sqrt(var))
    y_pred, y_std = np.array(y_pred), np.array(y_std)
    
    # 4. EI 계산을 위한 현재까지의 최적값 (high-fidelity 데이터만 사용)
    y_best = np.min(ini_y_high) if len(ini_y_high) > 0 else np.inf
    
    # 5. Expected Improvement 계산
    ei = expected_improvement(y_pred, y_std, y_best)
    
    # 6. s에 따라 이미 실험한 점 처리
    if s == 1.0:  # high-fidelity
        # high-fidelity로 실험한 점만 제외
        train_combo_set = set(tuple(map(int, row)) for row in ini_X_high)
    else:  # low-fidelity
        # 모든 실험점 제외 (low + high)
        train_combo_set = set(tuple(map(int, row)) for row in np.vstack([ini_X_low, ini_X_high]))
    
    for i, combo in enumerate(X_grid.astype(int)):
        if tuple(combo) in train_combo_set:
            ei[i] = 0.0
            
    # 7. EI가 최대인 점 선택
    best_idx = np.argmax(ei)
    next_x_label = list(X_grid[best_idx].astype(int))
    
    return next_x_label, y_pred, y_std, ei, best_idx, X_grid

def plot_iteration(ori_gga_data, y_pred, y_std, ei, best_idx, X_grid, ini_X_low, ini_X_high, iter_, label_maps):
        # 데이터를 bandgap_hse06 기준으로 정렬
    ori_gga_data = ori_gga_data.sort_values('bandgap_hse06')
    
    # 정렬된 인덱스에 맞춰 ei도 재정렬
    ei = ei[ori_gga_data.index]
    
    # 학습에 사용된 조합 set 만들기 (int 변환!)
    train_combo_set = set(tuple(map(int, row)) for row in np.vstack([ini_X_low, ini_X_high]))

    # 전체 192개 조합 중 학습에 쓰인 인덱스 찾기 (정렬된 인덱스 기준)
    train_indices = [i for i, combo in enumerate(X_grid[ori_gga_data.index].astype(int)) 
                    if tuple(combo) in train_combo_set]

    fig, ax1 = plt.subplots(figsize=(18, 7))
    x_idx = range(len(ori_gga_data))

    # True / 예측 / Uncertainty
    ax1.scatter(x_idx, ori_gga_data['bandgap_hse06'], s=40, label='True bandgap', color='royalblue')
    ax1.scatter(x_idx, ori_gga_data['y_pred'], s=40, label='BLR prediction', color='orange', alpha=0.7)
    ax1.fill_between(
        x_idx,
        ori_gga_data['y_pred'] - ori_gga_data['y_std'],
        ori_gga_data['y_pred'] + ori_gga_data['y_std'],
        color='orange', alpha=0.2, label='Pred. std. dev.'
    )

    # 학습 포인트(누적) - 정렬된 인덱스 기준으로 찾기
    train_indices_low = [i for i, combo in enumerate(X_grid[ori_gga_data.index].astype(int)) 
                        if tuple(combo) in set(tuple(map(int, row)) for row in ini_X_low)]
    train_indices_high = [i for i, combo in enumerate(X_grid[ori_gga_data.index].astype(int)) 
                         if tuple(combo) in set(tuple(map(int, row)) for row in ini_X_high)]

    # low fidelity (s=0.1): 파란색 삼각형
    ax1.scatter(
        train_indices_low, ori_gga_data['bandgap_hse06'].iloc[train_indices_low],
        s=110, color='black', label='Training (low, s=0.1)', zorder=10, marker='^'
    )
    # high fidelity (s=1.0): 빨간색 동그라미
    ax1.scatter(
        train_indices_high, ori_gga_data['bandgap_hse06'].iloc[train_indices_high],
        s=110, color='crimson', label='Training (high, s=1.0)', zorder=10, marker='^'
    )

    # Global optimal 별표 (정렬된 인덱스에서 찾기)
    optimal_combo = '12,2,4'
    optimal_idx = ori_gga_data.index[ori_gga_data['combo'] == optimal_combo].tolist()[0]
    optimal_idx_in_sorted = ori_gga_data.index.get_loc(optimal_idx)  # 정렬된 위치 찾기
    optimal_bandgap = ori_gga_data.loc[optimal_idx, 'bandgap_hse06']
    ax1.scatter(
        optimal_idx_in_sorted, optimal_bandgap,
        marker='*', color='purple', s=250, edgecolor='black',
        label='Global optimum', zorder=20
    )

    ax1.set_ylabel('Bandgap (hse06)', color='navy')
    ax1.set_xlabel('Combinations (organic, cation, anion)')
    ax1.set_xticks(x_idx)
    ax1.set_xticklabels(ori_gga_data['combo'], rotation=90, fontsize=7)

    # 제목 강조
    if (iter_ % 8 == 0):
        ax1.set_title(f'True Bandgap (sorted), Prediction, Uncertainty, and EI\niter: {iter_}',
                      color='crimson', fontsize=18, fontweight='bold', backgroundcolor='#ffe6e6')
    else:
        ax1.set_title(f'True Bandgap (sorted), Prediction, Uncertainty, and EI\niter: {iter_}')
    ax1.tick_params(axis='y', labelcolor='navy')

    # EI 오른쪽축
    ax2 = ax1.twinx()
    ax2.plot(x_idx, ei, marker='o', color='forestgreen', label='EI', linewidth=2)
    # best_idx도 정렬된 인덱스에 맞춰 변환
    best_idx_in_sorted = ori_gga_data.index.get_loc(best_idx)
    ax2.scatter(best_idx_in_sorted, ei[best_idx], color='red', s=120, zorder=15, label='Recommended (max EI)')
    ax2.set_ylabel('Expected Improvement (EI)', color='forestgreen')
    ax2.tick_params(axis='y', labelcolor='forestgreen')

    # 범례
    h1, l1 = ax1.get_legend_handles_labels()
    h2, l2 = ax2.get_legend_handles_labels()
    ax1.legend(h1+h2, l1+l2, loc='upper right')

    plt.xlim(-1, len(ori_gga_data))
    plt.tight_layout()
    plt.show()


    # 실제값과 예측값
    y_true = ori_gga_data['bandgap_hse06'].values
    y_pred = ori_gga_data['y_pred'].values

    # R², MAE 계산
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)

    print(f"R² score: {r2:.4f}")
    print(f"MAE: {mae:.4f}")
    
    y_true = ori_gga_data['bandgap_hse06'].values
    y_pred = ori_gga_data['y_pred'].values

    # 2. 학습 데이터 인덱스
    train_indices = set([i for i, combo in enumerate(X_grid.astype(int)) if tuple(combo) in train_combo_set])
    all_indices = set(range(len(y_true)))
    non_train_indices = all_indices - train_indices

    # 3. 산점도
    plt.figure(figsize=(6, 6))
    # (1) 비학습 데이터 (연한색)
    plt.scatter(
        y_true[list(non_train_indices)], y_pred[list(non_train_indices)],
        alpha=0.4, s=40, color='grey', label='Unmeasured (candidates)'
    )
    # (2) 학습 데이터 (진한색)
    plt.scatter(
        y_true[list(train_indices)], y_pred[list(train_indices)],
        alpha=0.9, s=80, color='black', label='Training points', edgecolor='w'
    )
    # (3) 기준선
    plt.plot([min(y_true), max(y_true)], [min(y_true), max(y_true)], 'r--', label='Ideal: y=x')

    plt.xlabel('Actual value')
    plt.ylabel('Predicted value')
    plt.title(f'Actual vs. Predicted\nR²: {r2:.3f}, MAE: {mae:.3f}')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()




In [1]:
init_samples = sample_param_space(param_space, NUM_INIT_DESIGN, random_state=42)
init_fids = assign_fidelities(NUM_INIT_DESIGN, high_ratio=0.2, random_state=42)
measurements = []
for params, s in zip(init_samples, init_fids):
    measurement = measure_from_label(
        [label_maps['organic'][params['organic']],
         label_maps['cation'][params['cation']],
         label_maps['anion'][params['anion']]],
        s, label_maps, LOOKUP
    )
    measurements.append({"params": params, "s": s, "measurement": measurement})

# 데이터프레임 변환 및 라벨 적용
df = pd.DataFrame([
    {**obs['params'], 's': obs['s'], 'measurement': obs['measurement']}
    for obs in measurements
])
for col in ['organic', 'cation', 'anion']:
    df[col + '_label'] = df[col].map(label_maps[col])

ini_X = df[['organic_label', 'cation_label', 'anion_label', 's']].values
ini_y = df['measurement'].values

# s 값에 따라 분할 (X에서 s를 제거)
ini_X_low = ini_X[df['s'] == 0.1][:, :3]
ini_y_low = ini_y[df['s'] == 0.1]
ini_X_high = ini_X[df['s'] == 1.0][:, :3]
ini_y_high = ini_y[df['s'] == 1.0]

param_ranges = [
    range(1, 17),  # organic_label: 1~16
    range(1, 4),   # cation_label: 1~3
    range(1, 5),   # anion_label: 1~4
]

min_hse06_bandgap = 1.5249
COST_BUDGET = 50
timing_data = []  # 각 iteration의 시간 기록
cost_data = []    # 누적 비용 기록
best_so_far_curve = []  # iteration별 best_so_far 기록
total_cost = 0.0  # 누적 비용
best_so_far = np.inf  # 현재까지 관측된 최소값 (high-fidelity)
iter_ = 0
while total_cost < COST_BUDGET:
    iter_ += 1
    print(f"\n==== Iteration {iter_} ====")
    iter_start = time.time()
    # 8번 중 1번만 high-fidelity(비용 1.0), 나머지는 low-fidelity(비용 0.1)
    s = 0.1 if (iter_ % 8 != 0) else 1.0

    # DNN+BLR 모델 학습
    model = train_model(ini_X_low, ini_y_low, ini_X_high, ini_y_high)

    # BLR 적합 (feature 추출 및 BLR 학습)
    blr, X_all, y_all = fit_blr(model, ini_X_low, ini_X_high, ini_y_low, ini_y_high)

    # 다음 실험 추천 (EI 최대화)
    next_x_label, y_pred, y_std, ei, best_idx, X_grid = recommend_next(
        model, blr, param_ranges, ini_X_low, ini_X_high, ini_y_low, ini_y_high, s
    )
    
    # 현재 iteration의 예측 결과 시각화
    plot_iteration(ori_gga_data, y_pred, y_std, ei, best_idx, X_grid, ini_X_low, ini_X_high, iter_, label_maps)
    # 실제 측정값 얻기 (실험 시뮬레이션)
    measurement = measure_from_label(next_x_label, s, label_maps, LOOKUP)
    # 측정값을 데이터에 추가
    ini_X_low, ini_y_low, ini_X_high, ini_y_high = append_measurement_to_data(
        ini_X_low, ini_y_low, ini_X_high, ini_y_high,
        next_x_label, s, label_maps, LOOKUP
    )

    print()
    print(f"Measurement: {measurement:.4f} (fidelity={s})")
    
    iter_end = time.time()
    time_taken = iter_end - iter_start
    timing_data.append([0, iter_, time_taken])  # 시간 기록
    total_cost += s  # 비용 누적
    cost_data.append([0, iter_, total_cost])  # 비용 기록
    # high-fidelity 측정값이 더 작으면 best_so_far 갱신
    if s == 1:
        if measurement < best_so_far:
            best_so_far = measurement
    best_so_far_curve.append([0, iter_, s, best_so_far])  # best_so_far 기록
    print(f"Cumulative cost: {total_cost}, best_so_far: {best_so_far}")

    # 최소 bandgap을 찾으면 조기 종료
    if s == 1.0 and np.isclose(measurement, min_hse06_bandgap, atol=1e-6):
        print('found the min hse06 bandgap!')
        break

NameError: name 'sample_param_space' is not defined

결과기록용


In [28]:
# 100번의 독립적인 런을 위한 결과 저장 리스트
all_runs_costs = []

for run in range(100):
    print(f"\n===== Starting Run {run+1}/100 =====")
    
    # 초기화
    init_samples = sample_param_space(param_space, NUM_INIT_DESIGN, random_state=run)  # 각 런마다 다른 random_state
    init_fids = assign_fidelities(NUM_INIT_DESIGN, high_ratio=0.2, random_state=run)
    measurements = []
    for params, s in zip(init_samples, init_fids):
        measurement = measure_from_label(
            [label_maps['organic'][params['organic']],
             label_maps['cation'][params['cation']],
             label_maps['anion'][params['anion']]],
            s, label_maps, LOOKUP
        )
        measurements.append({"params": params, "s": s, "measurement": measurement})

    # 데이터프레임 변환 및 초기 데이터 설정
    df = pd.DataFrame([
        {**obs['params'], 's': obs['s'], 'measurement': obs['measurement']}
        for obs in measurements
    ])
    for col in ['organic', 'cation', 'anion']:
        df[col + '_label'] = df[col].map(label_maps[col])

    ini_X = df[['organic_label', 'cation_label', 'anion_label', 's']].values
    ini_y = df['measurement'].values

    ini_X_low = ini_X[df['s'] == 0.1][:, :3]
    ini_y_low = ini_y[df['s'] == 0.1]
    ini_X_high = ini_X[df['s'] == 1.0][:, :3]
    ini_y_high = ini_y[df['s'] == 1.0]

    # 각 런의 변수 초기화
    total_cost = 0.0
    best_so_far = np.inf
    iter_ = 0
    
    while total_cost < COST_BUDGET:
        iter_ += 1
        s = 0.1 if (iter_ % 8 != 0) else 1.0

        # DNN+BLR 모델 학습
        model = train_model(ini_X_low, ini_y_low, ini_X_high, ini_y_high)
        blr, X_all, y_all = fit_blr(model, ini_X_low, ini_X_high, ini_y_low, ini_y_high)
        
        # 다음 실험 추천
        next_x_label, y_pred, y_std, ei, best_idx, X_grid = recommend_next(
            model, blr, param_ranges, ini_X_low, ini_X_high, ini_y_low, ini_y_high, s
        )
        
        # 측정 및 데이터 업데이트
        measurement = measure_from_label(next_x_label, s, label_maps, LOOKUP)
        ini_X_low, ini_y_low, ini_X_high, ini_y_high = append_measurement_to_data(
            ini_X_low, ini_y_low, ini_X_high, ini_y_high,
            next_x_label, s, label_maps, LOOKUP
        )

        total_cost += s
        
        # high-fidelity 측정값이 더 작으면 best_so_far 갱신
        if s == 1.0:
            if measurement < best_so_far:
                best_so_far = measurement
                
        # 최소 bandgap을 찾으면 조기 종료
        if s == 1.0 and np.isclose(measurement, min_hse06_bandgap, atol=1e-6):
            print(f'Run {run+1}: Found the min hse06 bandgap!')
            break
    
    # 각 런이 끝날 때 total_cost 저장
    all_runs_costs.append(total_cost)
    print(f"Run {run+1} completed. Final total_cost: {total_cost:.2f}")

# 모든 런이 끝난 후 결과를 CSV로 저장
results_df = pd.DataFrame({
    'run': range(1, 101),
    'total_cost': all_runs_costs
})
results_df.to_csv('transfer_learning_costs.csv', index=False)
print("\nAll runs completed. Results saved to 'transfer_learning_costs.csv'")


===== Starting Run 1/100 =====
Run 1: Found the min hse06 bandgap!
Run 1 completed. Final total_cost: 39.10

===== Starting Run 2/100 =====
Run 2: Found the min hse06 bandgap!
Run 2 completed. Final total_cost: 32.30

===== Starting Run 3/100 =====
Run 3: Found the min hse06 bandgap!
Run 3 completed. Final total_cost: 30.60

===== Starting Run 4/100 =====
Run 4: Found the min hse06 bandgap!
Run 4 completed. Final total_cost: 6.80

===== Starting Run 5/100 =====
Run 5: Found the min hse06 bandgap!
Run 5 completed. Final total_cost: 30.60

===== Starting Run 6/100 =====
Run 6: Found the min hse06 bandgap!
Run 6 completed. Final total_cost: 40.80

===== Starting Run 7/100 =====
Run 7: Found the min hse06 bandgap!
Run 7 completed. Final total_cost: 25.50

===== Starting Run 8/100 =====
Run 8: Found the min hse06 bandgap!
Run 8 completed. Final total_cost: 6.80

===== Starting Run 9/100 =====
Run 9: Found the min hse06 bandgap!
Run 9 completed. Final total_cost: 30.60

===== Starting Run 1

## Model analysis

In [None]:
import numpy as np

# ori_gga_data: combo, bandgap가 있는 DataFrame
np.random.seed(42)  # 재현성

# 전체 조합 (192, 3)
all_combinations = list(itertools.product(*param_ranges))  # param_ranges는 [range(1,17), range(1,4), range(1,5)]
X_grid = np.array(all_combinations, dtype=np.float32)

# 랜덤 100개 추출
idx_sample = np.random.choice(len(ori_gga_data), size=30, replace=False)
ori_sample = ori_gga_data.iloc[idx_sample]

# (추출된 조합을 array 형태로 만들기)
X_train = np.array([list(map(int, combo.split(','))) for combo in ori_sample['combo']])
y_train = ori_sample['bandgap'].values
# DNN+BLR
model = TransferLearningDNN(input_dim=3, hidden_dim=64, device='cpu')
model.pretrain(X_train, y_train, epochs=300, lr=1e-3, verbose=True)
# high-fidelity 데이터만 쓸 경우 finetune 생략 가능

features_train = model.extract_features(X_train)
blr = BayesianLinearRegression(alpha=1.0, beta=25.0)
blr.fit(features_train, y_train)
features_grid = model.extract_features(X_grid)

y_pred = []
y_std = []
for phi in features_grid:
    mu, var = blr.predict(phi)
    y_pred.append(mu)
    y_std.append(np.sqrt(var))
y_pred = np.array(y_pred)
y_std = np.array(y_std)
features_grid = model.extract_features(X_grid)

y_pred = []
y_std = []
for phi in features_grid:
    mu, var = blr.predict(phi)
    y_pred.append(mu)
    y_std.append(np.sqrt(var))
y_pred = np.array(y_pred)
y_std = np.array(y_std)

ori_gga_data['y_pred'] = y_pred
ori_gga_data['y_std'] = y_std

plt.figure(figsize=(18, 7))
x_idx = range(len(ori_gga_data))

# True bandgap
plt.scatter(x_idx, ori_gga_data['bandgap'], s=40, label='True bandgap', color='royalblue')
# 예측값
plt.scatter(x_idx, ori_gga_data['y_pred'], s=40, label='BLR prediction', color='orange', alpha=0.7)
# 불확실성 범위
plt.fill_between(
    x_idx,
    ori_gga_data['y_pred'] - ori_gga_data['y_std'],
    ori_gga_data['y_pred'] + ori_gga_data['y_std'],
    color='orange', alpha=0.2, label='Pred. std. dev.'
)

# 학습에 사용된 100개 포인트 인덱스 (검정색)
train_indices = sorted(idx_sample)
plt.scatter(
    train_indices, ori_gga_data['bandgap'].iloc[train_indices],
    s=110, color='black', label='Training points', zorder=10, marker='o'
)

plt.ylabel('Bandgap (HSE06)')
plt.xlabel('Combinations (organic, cation, anion)')
plt.xticks(x_idx, ori_gga_data['combo'], rotation=90, fontsize=7)
plt.title('True Bandgap, BLR Prediction, Uncertainty (100 training points)')
plt.legend()
plt.xlim(-1, len(ori_gga_data))
plt.tight_layout()
plt.show()
from sklearn.metrics import r2_score, mean_absolute_error

# 실제값과 예측값
y_true = ori_gga_data['bandgap'].values
y_pred = ori_gga_data['y_pred'].values

# R², MAE 계산
r2 = r2_score(y_true, y_pred)
mae = mean_absolute_error(y_true, y_pred)

print(f"R² score: {r2:.4f}")
print(f"MAE: {mae:.4f}")
