In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import optuna

# Labelled table: drop the row identifier, then hold out 20% of the rows for validation.
train = pd.read_csv('../data/train.csv').drop(columns=['ID'])
X_train, X_val, y_train, y_val = train_test_split(
    train.drop(columns=['y']),  # feature columns
    train['y'],                 # regression target
    test_size=0.2,
    random_state=42,            # fixed seed so the split is reproducible
)

# Test table with the same identifier column removed.
X_test = pd.read_csv('../data/test.csv').drop(columns=['ID'])


  from .autonotebook import tqdm as notebook_tqdm


## 1. Preprocessing

In [3]:
from sklearn.cluster import KMeans

# skew 조절
def skew_adjustment(data):
    """Return a copy of ``data`` with skewed / multi-peaked columns power-transformed.

    Every column is handled according to the first rule that matches:

    1. Absolute sample skewness above 0.5: the whole column is passed through
       a Yeo-Johnson ``PowerTransformer``.
    2. More than one bin of a 10-bin histogram holds an above-average count
       (the "bimodal" heuristic): the column is split into two ``KMeans``
       clusters and each cluster is transformed on its own.
    3. Otherwise the column is left as it is.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric feature frame. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Frame with the same index and columns as ``data``.

    Notes
    -----
    All transformers are fitted on the frame passed in, so calling this
    separately on two splits fits independent transforms for each of them.
    NOTE(review): confirm that is intended for train/validation data.
    """

    def detect_bimodal(data):
        """List columns whose 10-bin histogram has more than one above-average bin."""
        bimodal_features = []
        for col in data.columns:
            hist, _ = np.histogram(data[col].dropna(), bins=10)
            peaks = np.where(hist > np.mean(hist))[0]
            if len(peaks) > 1:
                bimodal_features.append(col)
        return bimodal_features

    def detect_skewness(data, skew_threshold=0.5):
        """List columns whose absolute skewness is larger than ``skew_threshold``."""
        skew_features = data.apply(lambda x: x.skew())
        return skew_features[skew_features.abs() > skew_threshold].index.tolist()

    def power_transform(values):
        """Fit a fresh Yeo-Johnson transformer on ``values`` and return a flat array."""
        # A new transformer per call keeps the fitted state of one column or
        # cluster from being silently overwritten by the next fit.
        fitted = PowerTransformer(method='yeo-johnson').fit_transform(values)
        return np.asarray(fitted).ravel()

    skewed_features = set(detect_skewness(data))
    bimodal_features = set(detect_bimodal(data))

    adjusted_data = data.copy()

    for col in data.columns:
        if col in skewed_features:
            adjusted_data[col] = power_transform(data[[col]])

        elif col in bimodal_features:
            labels = KMeans(n_clusters=2, random_state=42).fit_predict(data[[col]])
            column_values = data[col].to_numpy()

            # Build the full float column first so the result is written in one
            # assignment, whatever the original dtype of the column was.
            transformed = column_values.astype(float)
            for cluster_id in (0, 1):
                in_cluster = labels == cluster_id
                if in_cluster.any():
                    transformed[in_cluster] = power_transform(
                        column_values[in_cluster].reshape(-1, 1)
                    )
            adjusted_data[col] = transformed

        # Any other column keeps the values already present in the copy.

    # Return only after every column has been visited; returning from inside
    # the loop would stop after the first column.
    return adjusted_data
            
# scaler 조정
def scaler(scaler, X_train, X_val, X_test):
    """Fit ``scaler`` on the training split and apply it to all three splits.

    Parameters
    ----------
    scaler : object or None
        Object exposing ``fit_transform`` and ``transform``. ``None`` means
        "no scaling" and returns the inputs unchanged.
    X_train, X_val, X_test
        Feature sets for the three splits.

    Returns
    -------
    tuple
        ``(X_train_scaled, X_val_scaled, X_test_scaled)``.
    """
    if scaler is None:
        return X_train, X_val, X_test

    # Statistics come from the training split only; the other two splits are
    # transformed with those fitted statistics.
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)
    return X_train_scaled, X_val_scaled, X_test_scaled

## 2-1. 일반 Diffusion Model (Diffusion Models for Black-Box Optimization)


In [None]:
# 일반 Diffusion Model 정의

import torch
import torch.nn as nn
import torch.optim as optim

class DiffusionModel(nn.Module):
    """Fully connected encoder/decoder pair used to produce latent features.

    The encoder maps ``input_dim -> 128 -> 64 -> latent_dim`` and the decoder
    mirrors it back to ``input_dim``, with ReLU between the linear layers.
    As written here the network is a deterministic encoder/decoder; this class
    defines no noising or denoising step.
    """

    def __init__(self, input_dim, latent_dim):
        super().__init__()

        # Kept deliberately small for now.
        encoder_layers = [
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, latent_dim),
        ]
        decoder_layers = [
            nn.Linear(latent_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.Linear(128, input_dim),
        ]

        self.encoder = nn.Sequential(*encoder_layers)
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Encode ``x`` and decode it again; returns ``(latent, reconstructed)``."""
        code = self.encoder(x)
        return code, self.decoder(code)

In [None]:
# 모델 학습 & latent variable 생성

# 하이퍼파라미터 설정
input_dim = X_train.shape[1]
latent_dim = 10  # chosen arbitrarily
epochs = 100
batch_size = 32  # NOTE(review): not used below; every step runs on the full training tensor

# Convert the feature frames to float32 tensors
X_train_t = torch.tensor(X_train.values, dtype=torch.float32)
X_val_t = torch.tensor(X_val.values, dtype=torch.float32)
X_test_t = torch.tensor(X_test.values, dtype=torch.float32)

# Model, reconstruction loss and optimiser
diff = DiffusionModel(input_dim, latent_dim)
criterion = nn.MSELoss()
optimizer_diff = optim.Adam(diff.parameters(), lr=0.001)

# Training loop: one full-batch gradient step per epoch
diff.train()
for epoch in range(epochs):
    optimizer_diff.zero_grad()

    latent, reconstructed = diff(X_train_t)
    loss = criterion(reconstructed, X_train_t)

    loss.backward()
    optimizer_diff.step()

    if (epoch+1) % 10 == 0 :
        print(f'Diffusion Model : Epoch {epoch+1}/{epochs}, Loss {loss.item()}')

diff.eval()

# Encode each split with the trained model. The validation latents must come
# from X_val_t; encoding X_train_t here would give them the training rows and
# the training row count.
with torch.no_grad():
    latent_train_diff, _ = diff(X_train_t)
    latent_val_diff, _ = diff(X_val_t)
    latent_test_diff, _ = diff(X_test_t)

latent_train_diff = latent_train_diff.numpy()
latent_val_diff = latent_val_diff.numpy()
latent_test_diff = latent_test_diff.numpy()

## 2-2. Variational Diffusion Model (VDM)

In [None]:
class VariationalDiffusionModel(nn.Module):
    """Encoder/decoder with a Gaussian latent (mean and log-variance heads).

    The encoder outputs ``2 * latent_dim`` values per row: the first half is
    read as the latent mean and the second half as the latent log-variance.
    The decoder maps a ``latent_dim`` vector back to ``input_dim``.
    """

    def __init__(self, input_dim, latent_dim):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, latent_dim * 2)  # mean and logvar
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.Linear(128, input_dim)
        )

    def reparameterize(self, mean, logvar):
        """Draw ``z = mean + eps * std`` while training; return ``mean`` otherwise.

        Sampling is only needed for the training objective. In ``eval()`` mode
        the posterior mean is returned so that encoding the same input twice
        gives the same latent features.
        """
        if not self.training:
            return mean
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mean + eps * std

    def forward(self, x):
        """Return ``(z, reconstructed, mean, logvar)`` for the batch ``x``."""
        h = self.encoder(x)
        # Split the encoder output into two equal halves along the feature axis.
        mean, logvar = torch.chunk(h, 2, dim=1)
        z = self.reparameterize(mean, logvar)
        reconstructed = self.decoder(z)
        return z, reconstructed, mean, logvar

In [None]:
# VDM 초기화 및 학습 설정
vdm = VariationalDiffusionModel(input_dim, latent_dim)
optimizer_vdm = optim.Adam(vdm.parameters(), lr=0.001)

n_train_rows = X_train_t.shape[0]

# Training loop: one full-batch gradient step per epoch
vdm.train()
for epoch in range(epochs):
    optimizer_vdm.zero_grad()

    z, reconstructed, mean, logvar = vdm(X_train_t)

    # Both terms are summed over the feature/latent axis and averaged over the
    # rows, so they sit on the same scale. Mixing a per-element mean for the
    # reconstruction with a whole-batch sum for the KL term lets the KL term
    # outweigh the reconstruction by several orders of magnitude.
    recon_loss = torch.sum((reconstructed - X_train_t) ** 2) / n_train_rows
    kld_loss = -0.5 * torch.sum(1 + logvar - mean.pow(2) - logvar.exp()) / n_train_rows
    loss = recon_loss + kld_loss

    loss.backward()
    optimizer_vdm.step()

    if (epoch + 1) % 10 == 0:
        print(f'VDM - Epoch {epoch+1}/{epochs}, Loss: {loss.item()}')

# After training, build the latent features. The posterior mean (third output
# of forward) is used rather than z, so the features do not depend on a random
# draw and are identical on every run of this cell.
vdm.eval()
with torch.no_grad():
    _, _, latent_train_vdm, _ = vdm(X_train_t)
    _, _, latent_val_vdm, _ = vdm(X_val_t)
    _, _, latent_test_vdm, _ = vdm(X_test_t)

latent_train_vdm = latent_train_vdm.numpy()
latent_val_vdm = latent_val_vdm.numpy()
latent_test_vdm = latent_test_vdm.numpy()

## 3. AutoML을 사용해 성능이 좋은 regression model 5개 사용 (MSE 기준)

In [None]:
from pycaret.regression import setup, compare_models, pull, save_model, load_model, predict_model, create_model

# `target='y'` names a column of the frame passed as `data`, but X_train had
# that column removed when the split was made, so the target is joined back on
# (y_train shares X_train's index).
train_frame = X_train.assign(y=y_train)

regressor = setup(data=train_frame, target='y', session_id=42)
best_model = compare_models()
results = pull()  # score grid produced by compare_models()

# Identifiers of the five models with the lowest MSE
top5_models = results.sort_values(by='MSE').head(5).index.tolist()
top5_models

In [None]:
# Build and persist each of the selected models, keyed by its identifier.
# NOTE(review): `experiment()` further down calls `train_model(...)` as a
# function with five arguments; while this name is bound to a dict that call
# raises TypeError. Confirm which definition is meant to be live.
train_model = dict()
for model_name in top5_models:
    model = create_model(model_name)
    train_model.update({model_name: model})
    save_model(model, model_name)

## 4. 모든 케이스에 대해 실험

In [None]:
from concurrent.futures import ThreadPoolExecutor

# 모든 경우의 수 실험
def experiment():
    """Evaluate every preprocessing combination and collect the validation loss.

    Six cases are run: {skew-adjusted, raw} features x {StandardScaler,
    MinMaxScaler, no scaler}. Each case is submitted to a thread pool and
    calls ``train_model(X_train_case, y_train, X_val_case, y_val, scaler)``.

    Returns
    -------
    dict
        Maps the case name to whatever ``train_model`` returned for it.

    Notes
    -----
    Reads ``X_train``, ``X_val``, ``y_train`` and ``y_val`` from the notebook
    namespace.
    NOTE(review): ``train_model`` must be a callable with the signature above;
    another cell of this notebook binds that name to a dict of fitted models.
    NOTE(review): ``skew_adjustment`` is applied to the training and validation
    frames independently, so each is transformed with its own fit.
    """
    # The skew adjustment does not depend on the scaler: compute it once per
    # split instead of once per case.
    X_train_adjusted = skew_adjustment(X_train)
    X_val_adjusted = skew_adjustment(X_val)

    # Each case gets its own scaler instance because the cases run in parallel
    # threads and a fitted scaler carries state.
    cases = {
        # 1. with skew adjustment
        '1-1 : Skew Adjusted + Standard Scaler' : (X_train_adjusted, X_val_adjusted, StandardScaler()),
        '1-2 : Skew Adjusted + MinMax Scaler' : (X_train_adjusted, X_val_adjusted, MinMaxScaler()),
        '1-3 : Skew Adjusted + Scaler (x)' : (X_train_adjusted, X_val_adjusted, None),

        # 2. without skew adjustment
        '2-1 : Skew Adjusted (x) + Standard Scaler' : (X_train, X_val, StandardScaler()),
        '2-2 : Skew Adjusted (x) + MinMax Scaler' : (X_train, X_val, MinMaxScaler()),
        '2-3 : Skew Adjusted (x) + Scaler (x)' : (X_train, X_val, None),
    }

    def run_case(case_name, case_data):
        """Train/evaluate one case and return ``(case_name, val_loss)``."""
        X_train_case, X_val_case, case_scaler = case_data
        val_loss = train_model(X_train_case, y_train, X_val_case, y_val, case_scaler)
        return case_name, val_loss

    results = {}
    with ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(run_case, case_name, case_data)
            for case_name, case_data in cases.items()
        ]

        # Collect in submission order; result() re-raises any exception that
        # occurred inside the worker.
        for future in futures:
            case_name, val_loss = future.result()
            results[case_name] = val_loss

    return results
    