가상환경 ss_env에서 돌릴 것 !!!

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# import optuna

# Load the labelled data; the ID column is dropped before any modelling.
train = pd.read_csv('../data/train.csv').drop(columns='ID')

# Hold out 20% of the rows for validation (fixed seed so the split is repeatable).
features = train.drop(columns='y')
target = train['y']
X_train, X_val, y_train, y_val = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# Unlabelled test set, with its ID column dropped as well.
X_test = pd.read_csv('../data/test.csv').drop(columns='ID')


## 1. preprocessing

In [7]:
from sklearn.cluster import KMeans

# skew 조절
def skew_adjustment(data):
    """Return a copy of ``data`` with skewed / bimodal columns power-transformed.

    Each column is handled in one of three ways:

    * absolute skewness above 0.5: Yeo-Johnson transform of the whole column;
    * otherwise, more than one histogram bin above the mean bin count
      ("bimodal" by this heuristic): the column is split into two KMeans
      clusters and each cluster is Yeo-Johnson transformed on its own;
    * otherwise: the column is left untouched.

    Args:
        data: pandas DataFrame of numeric feature columns.

    Returns:
        A new DataFrame with the same index and columns; ``data`` itself is
        not modified.

    Note:
        Every transform is fitted on the frame that is passed in, so calling
        this separately on train / validation / test applies a differently
        fitted transform to each split.
    """

    def detect_bimodal(data):
        # Heuristic: a column counts as bimodal when more than one of its
        # 10 histogram bins holds more samples than the average bin.
        bimodal_features = []
        for col in data.columns:
            hist, _ = np.histogram(data[col].dropna(), bins=10)
            peaks = np.where(hist > np.mean(hist))[0]
            if len(peaks) > 1:
                bimodal_features.append(col)
        return bimodal_features

    def detect_skewness(data, skew_threshold=0.5):
        skew_features = data.apply(lambda x: x.skew())
        return skew_features[skew_features.abs() > skew_threshold].index.tolist()

    def yeo_johnson(values):
        # A fresh transformer per fit keeps the columns/clusters independent.
        # ravel() turns the (n, 1) output into the 1-D shape a column needs.
        transformer = PowerTransformer(method='yeo-johnson')
        return transformer.fit_transform(values).ravel()

    skewed_features = set(detect_skewness(data))
    bimodal_features = set(detect_bimodal(data))

    # Untouched columns keep their original values through this copy.
    adjusted_data = data.copy()

    for col in data.columns:
        if col in skewed_features:
            adjusted_data[col] = yeo_johnson(data[[col]])

        elif col in bimodal_features:
            kmeans = KMeans(n_clusters=2, random_state=42)
            labels = kmeans.fit_predict(data[[col]])

            # The transformed values are floats; cast first so that writing
            # them into an integer column does not truncate or warn.
            adjusted_data[col] = adjusted_data[col].astype(float)
            for cluster_id in (0, 1):
                in_cluster = labels == cluster_id
                cluster_values = data.loc[in_cluster, col].to_numpy().reshape(-1, 1)
                adjusted_data.loc[in_cluster, col] = yeo_johnson(cluster_values)

    # Return only after every column has been visited (the return used to sit
    # inside the loop, so only the first column was ever processed).
    return adjusted_data
            
# scaler 조정
def scaler(scaler, X_train, X_val, X_test):
    """Fit ``scaler`` on the training split and apply it to all three splits.

    Args:
        scaler: object exposing ``fit_transform`` and ``transform``.
        X_train: data the scaler is fitted on.
        X_val: validation data, transformed with the fitted scaler.
        X_test: test data, transformed with the fitted scaler.

    Returns:
        Tuple ``(train, val, test)`` of the transformed splits.
    """
    fitted_train = scaler.fit_transform(X_train)
    # Validation and test reuse what was learned from the training split.
    transformed_val, transformed_test = (
        scaler.transform(split) for split in (X_val, X_test)
    )
    return fitted_train, transformed_val, transformed_test

## 2-1. 일반 Diffusion Model  (Diffusion Models for Black-Box Optimization)


In [8]:
# 일반 Diffusion Model 정의

import torch
import torch.nn as nn
import torch.optim as optim

class DiffusionModel(nn.Module):
    """Fully connected encoder/decoder pair.

    The encoder maps ``input_dim`` features through hidden widths 128 and 64
    down to ``latent_dim``; the decoder mirrors that path back to
    ``input_dim``. No noise is injected anywhere in this class, so despite
    its name it behaves as a plain deterministic autoencoder.
    """

    def __init__(self, input_dim, latent_dim):
        super().__init__()
        self.encoder = self._mlp((input_dim, 128, 64, latent_dim))
        self.decoder = self._mlp((latent_dim, 64, 128, input_dim))

    @staticmethod
    def _mlp(widths):
        """Chain Linear layers over consecutive ``widths`` with ReLU between them."""
        layers = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            if layers:
                # Activation goes between Linear layers only, never after the last.
                layers.append(nn.ReLU())
            layers.append(nn.Linear(fan_in, fan_out))
        return nn.Sequential(*layers)

    def forward(self, x):
        """Return ``(latent, reconstructed)`` for the input batch ``x``."""
        code = self.encoder(x)
        return code, self.decoder(code)

In [9]:
# Train the model and generate the latent variables

# Hyperparameters
input_dim = X_train.shape[1]
latent_dim = 10  # chosen arbitrarily
epochs = 100
batch_size = 32  # NOTE: not used below; every epoch is one full-batch step

# Convert the splits to tensors
X_train_t = torch.tensor(X_train.values, dtype=torch.float32)
X_val_t = torch.tensor(X_val.values, dtype=torch.float32)
X_test_t = torch.tensor(X_test.values, dtype=torch.float32)

# Model, reconstruction loss and optimizer
diff = DiffusionModel(input_dim, latent_dim)
criterion = nn.MSELoss()
optimizer_diff = optim.Adam(diff.parameters(), lr=0.001)

# Training loop: minimise the reconstruction error on the training split
for epoch in range(epochs):
    diff.train()
    optimizer_diff.zero_grad()

    latent, reconstructed = diff(X_train_t)
    loss = criterion(reconstructed, X_train_t)

    loss.backward()
    optimizer_diff.step()

    if (epoch+1) % 10 == 0 :
        print(f'Diffusion Model : Epoch {epoch+1}/{epochs}, Loss {loss.item()}')

diff.eval()

# Encode every split with the trained model
with torch.no_grad():
    latent_train_diff, _ = diff(X_train_t)
    # Encode the validation tensor here: this line used to pass X_train_t, which
    # made latent_val_diff a copy of the training latents.
    latent_val_diff, _ = diff(X_val_t)
    latent_test_diff, _ = diff(X_test_t)

latent_train_diff = latent_train_diff.numpy()
latent_val_diff = latent_val_diff.numpy()
latent_test_diff = latent_test_diff.numpy()

Diffusion Model : Epoch 10/100, Loss 0.8206071257591248
Diffusion Model : Epoch 20/100, Loss 0.19196823239326477
Diffusion Model : Epoch 30/100, Loss 0.020276334136724472
Diffusion Model : Epoch 40/100, Loss 0.02699671871960163
Diffusion Model : Epoch 50/100, Loss 0.011248928494751453
Diffusion Model : Epoch 60/100, Loss 0.008029661141335964
Diffusion Model : Epoch 70/100, Loss 0.007224326487630606
Diffusion Model : Epoch 80/100, Loss 0.006354880053550005
Diffusion Model : Epoch 90/100, Loss 0.006301268003880978
Diffusion Model : Epoch 100/100, Loss 0.00611993670463562


## 2-2. Variational Diffusion Model (VDM)

In [None]:
class VariationalDiffusionModel(nn.Module):
    """Encoder/decoder pair with a sampled Gaussian latent code.

    The encoder emits ``2 * latent_dim`` values per row: the first half is
    read as the latent mean and the second half as the log-variance. A latent
    sample is drawn with the reparameterisation trick and decoded back to
    ``input_dim`` features.
    """

    def __init__(self, input_dim, latent_dim):
        super().__init__()
        encoder_layers = [
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 2 * latent_dim),  # mean and log-variance, concatenated
        ]
        decoder_layers = [
            nn.Linear(latent_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.Linear(128, input_dim),
        ]
        self.encoder = nn.Sequential(*encoder_layers)
        self.decoder = nn.Sequential(*decoder_layers)

    def reparameterize(self, mean, logvar):
        """Draw ``mean + eps * std`` with ``eps`` standard normal noise."""
        sigma = (0.5 * logvar).exp()
        noise = torch.randn_like(sigma)
        return mean + noise * sigma

    def forward(self, x):
        """Return ``(z, reconstructed, mean, logvar)`` for the batch ``x``.

        ``z`` is sampled on every call; the code does not distinguish
        between training and evaluation mode.
        """
        stats = self.encoder(x)
        half = stats.size(1) // 2
        mean = stats[:, :half]
        logvar = stats[:, half:]
        latent = self.reparameterize(mean, logvar)
        return latent, self.decoder(latent), mean, logvar

In [None]:
# Initialise the VDM and its optimizer
vdm = VariationalDiffusionModel(input_dim, latent_dim)
optimizer_vdm = optim.Adam(vdm.parameters(), lr=0.001)

# Training loop
for epoch in range(epochs):
    vdm.train()
    optimizer_vdm.zero_grad()

    z, reconstructed, mean, logvar = vdm(X_train_t)
    recon_loss = criterion(reconstructed, X_train_t)
    # KL divergence per row (summed over latent dimensions), averaged over the
    # batch. Summing over the whole batch, as before, made the KL weight grow
    # with the number of rows while the reconstruction term stayed a mean.
    kld_per_row = -0.5 * torch.sum(1 + logvar - mean.pow(2) - logvar.exp(), dim=1)
    kld_loss = kld_per_row.mean()
    loss = recon_loss + kld_loss

    loss.backward()
    optimizer_vdm.step()

    if (epoch + 1) % 10 == 0:
        print(f'VDM - Epoch {epoch+1}/{epochs}, Loss: {loss.item()}')

# After training, generate the latent variables
vdm.eval()
with torch.no_grad():
    # forward() samples z on every call, so the posterior mean (third return
    # value) is used as the latent feature: it is deterministic and gives the
    # same features each time the cell is run.
    _, _, latent_train_vdm, _ = vdm(X_train_t)
    _, _, latent_val_vdm, _ = vdm(X_val_t)
    _, _, latent_test_vdm, _ = vdm(X_test_t)

latent_train_vdm = latent_train_vdm.numpy()
latent_val_vdm = latent_val_vdm.numpy()
latent_test_vdm = latent_test_vdm.numpy()

## 3. AutoML을 사용해 성능이 좋은 regression model 5개 사용 (MSE 기준)

In [11]:
from pycaret.regression import setup, compare_models, pull, save_model, load_model, predict_model, create_model

# Initialise the PyCaret regression experiment on the full `train` frame with target 'y'.
# NOTE(review): `train` still contains the rows that were split off as X_val earlier,
# so the model ranking below is not independent of that hold-out — confirm this is intended.
regressor = setup(data = train, target='y', session_id=42)
# Compare the candidate regressors (score grid shown in the recorded output).
best_model = compare_models()
# Fetch the comparison grid as a DataFrame: one row per model id, with metric columns such as MSE.
results = pull()

# Model ids of the five rows with the lowest MSE.
top5_models = results.sort_values(by='MSE').head(5).index.tolist()
top5_models

Unnamed: 0,Description,Value
0,Session id,42
1,Target,y
2,Target type,Regression
3,Original data shape,"(40118, 12)"
4,Transformed data shape,"(40118, 12)"
5,Transformed train set shape,"(28082, 12)"
6,Transformed test set shape,"(12036, 12)"
7,Numeric features,11
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
br,Bayesian Ridge,1.2057,2.8778,1.6879,0.6054,0.022,0.0146,0.023
lr,Linear Regression,1.2057,2.8779,1.6879,0.6053,0.022,0.0146,0.134
gbr,Gradient Boosting Regressor,1.2082,2.8856,1.6902,0.6043,0.022,0.0146,0.728
ridge,Ridge Regression,1.208,2.8875,1.6908,0.604,0.022,0.0146,0.078
lightgbm,Light Gradient Boosting Machine,1.2128,2.9023,1.6953,0.6019,0.0221,0.0147,0.539
huber,Huber Regressor,1.201,2.9304,1.7038,0.598,0.0221,0.0145,0.054
lar,Least Angle Regression,1.2256,2.9415,1.707,0.5965,0.0222,0.0148,0.015
et,Extra Trees Regressor,1.238,2.9828,1.719,0.5907,0.0223,0.015,0.299
rf,Random Forest Regressor,1.2361,2.9976,1.7231,0.5888,0.0224,0.015,1.126
omp,Orthogonal Matching Pursuit,1.3052,3.2432,1.7936,0.5546,0.0231,0.0158,0.015


['br', 'lr', 'gbr', 'ridge', 'lightgbm']

In [12]:
# Build each of the top-5 model ids with PyCaret and keep the resulting estimators keyed by id.
train_model = {}
for model_name in top5_models:
    model = create_model(model_name)
    train_model[model_name] = model
    # Persist under the model id (the recorded output reports the pipeline and model being saved).
    save_model(model, model_name)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1.2047,2.5206,1.5877,0.6348,0.0186,0.0144
1,1.1854,2.5111,1.5846,0.6334,0.0186,0.0142
2,1.2119,4.2253,2.0556,0.494,0.0353,0.0157
3,1.1853,2.5437,1.5949,0.6493,0.0189,0.0141
4,1.2084,2.6588,1.6306,0.6297,0.0191,0.0144
5,1.2294,2.7242,1.6505,0.6158,0.0196,0.0147
6,1.1992,2.4761,1.5736,0.633,0.0184,0.0143
7,1.2266,2.5953,1.611,0.6283,0.0189,0.0146
8,1.1965,2.5568,1.599,0.6219,0.0188,0.0143
9,1.2096,3.9662,1.9915,0.5132,0.0335,0.0154


Transformation Pipeline and Model Successfully Saved


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1.2047,2.5204,1.5876,0.6348,0.0186,0.0144
1,1.1855,2.5115,1.5848,0.6333,0.0186,0.0142
2,1.2119,4.2261,2.0558,0.4939,0.0354,0.0157
3,1.1853,2.5433,1.5948,0.6494,0.0189,0.0141
4,1.2084,2.6586,1.6305,0.6297,0.0191,0.0144
5,1.2293,2.7239,1.6504,0.6159,0.0196,0.0147
6,1.1993,2.4763,1.5736,0.633,0.0184,0.0143
7,1.2268,2.5962,1.6113,0.6282,0.0189,0.0146
8,1.1965,2.5566,1.5989,0.622,0.0188,0.0143
9,1.2095,3.9661,1.9915,0.5132,0.0335,0.0154


Transformation Pipeline and Model Successfully Saved


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1.2065,2.5252,1.5891,0.6342,0.0186,0.0144
1,1.1822,2.5082,1.5837,0.6338,0.0186,0.0141
2,1.2175,4.2449,2.0603,0.4916,0.0354,0.0157
3,1.1883,2.5604,1.6001,0.6471,0.0189,0.0142
4,1.2181,2.6935,1.6412,0.6249,0.0192,0.0145
5,1.2435,2.7794,1.6671,0.6081,0.0198,0.0149
6,1.1997,2.4664,1.5705,0.6345,0.0184,0.0143
7,1.2241,2.5611,1.6003,0.6332,0.0188,0.0146
8,1.1986,2.5702,1.6032,0.6199,0.0189,0.0143
9,1.204,3.9463,1.9865,0.5157,0.0335,0.0154


Transformation Pipeline and Model Successfully Saved


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1.2072,2.5307,1.5908,0.6333,0.0186,0.0144
1,1.1858,2.5122,1.585,0.6332,0.0186,0.0142
2,1.214,4.2297,2.0566,0.4934,0.0354,0.0157
3,1.1885,2.5571,1.5991,0.6475,0.0189,0.0142
4,1.2119,2.6795,1.6369,0.6268,0.0191,0.0144
5,1.2322,2.7373,1.6545,0.614,0.0196,0.0148
6,1.2021,2.4899,1.5779,0.631,0.0185,0.0143
7,1.2272,2.5909,1.6096,0.629,0.0189,0.0146
8,1.199,2.5674,1.6023,0.6204,0.0189,0.0143
9,1.2122,3.9806,1.9951,0.5115,0.0335,0.0155


Transformation Pipeline and Model Successfully Saved


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1.2113,2.5527,1.5977,0.6302,0.0187,0.0144
1,1.199,2.5681,1.6025,0.6251,0.0188,0.0143
2,1.2226,4.2758,2.0678,0.4879,0.0354,0.0158
3,1.2003,2.6004,1.6126,0.6415,0.019,0.0143
4,1.2218,2.6712,1.6344,0.628,0.0191,0.0146
5,1.2343,2.7358,1.654,0.6142,0.0196,0.0148
6,1.2012,2.4808,1.575,0.6323,0.0185,0.0143
7,1.2292,2.5976,1.6117,0.628,0.0189,0.0147
8,1.2057,2.5971,1.6115,0.616,0.019,0.0144
9,1.2026,3.9438,1.9859,0.516,0.0335,0.0153


Transformation Pipeline and Model Successfully Saved


## 4. 모든 케이스에 대해 실험

In [23]:
from concurrent.futures import ThreadPoolExecutor

# 모든 경우의 수 실험
def experiment(DiffModel, X_train, X_val, X_test, y_train, y_val, train_models,
               *, latent_dim=10, epochs=100, lr=0.001):
    """Evaluate every preprocessing case with every regressor on latent features.

    For each of the six cases (skew adjustment on/off x Standard / MinMax / no
    scaler) the splits are preprocessed, an autoencoder of class ``DiffModel``
    is trained on the training split, all splits are encoded, and each
    regressor is fitted on the training latents and scored on the validation
    latents. The test predictions of the best (lowest validation MSE)
    combination are written into the sample submission.

    Args:
        DiffModel: model class constructed as ``DiffModel(input_dim=...,
            latent_dim=...)`` whose forward pass returns
            ``(latent, reconstructed)``.
        X_train, X_val, X_test: feature DataFrames for the three splits.
        y_train, y_val: targets for the training and validation splits.
        train_models: dict mapping a model name to an estimator with
            ``fit`` / ``predict``.
        latent_dim: size of the latent code.
        epochs: number of full-batch training steps for the autoencoder.
        lr: Adam learning rate for the autoencoder.

    Returns:
        Dict mapping ``'<case> - <model>'`` to the validation MSE.

    Raises:
        ValueError: if ``train_models`` is empty.

    Note:
        ``skew_adjustment`` only accepts one frame, so each split is adjusted
        independently of the others.
    """
    import copy
    import os

    if not train_models:
        raise ValueError('train_models must contain at least one regressor')

    results = {}
    submission_data = {}

    # The adjusted frames are only read by the cases, so compute them once.
    skewed_splits = (skew_adjustment(X_train), skew_adjustment(X_val), skew_adjustment(X_test))
    raw_splits = (X_train, X_val, X_test)

    # case define: splits + scaler class (a class, not an instance, so every
    # case fits its own scaler and nothing is shared between threads)
    cases = {
        # 1. with skew adjustment
        '1-1 : Skew Adjusted + Standard Scaler': (skewed_splits, StandardScaler),
        '1-2 : Skew Adjusted + MinMax Scaler': (skewed_splits, MinMaxScaler),
        '1-3 : Skew Adjusted + Scaler (x)': (skewed_splits, None),

        # 2. without skew adjustment
        '2-1 : Skew Adjusted (x) + Standard Scaler': (raw_splits, StandardScaler),
        '2-2 : Skew Adjusted (x) + MinMax Scaler': (raw_splits, MinMaxScaler),
        '2-3 : Skew Adjusted (x) + Scaler (x)': (raw_splits, None),
    }

    def encode_splits(*splits):
        # np.asarray accepts both DataFrames (no scaler) and ndarrays (scaled).
        tensors = [torch.tensor(np.asarray(part, dtype=np.float32)) for part in splits]
        train_t = tensors[0]

        autoencoder = DiffModel(input_dim=train_t.shape[1], latent_dim=latent_dim)
        optimizer = torch.optim.Adam(autoencoder.parameters(), lr=lr)
        loss_fn = torch.nn.MSELoss()

        # Train on reconstruction error first; an untrained encoder would only
        # be a random projection of the inputs.
        autoencoder.train()
        for _ in range(epochs):
            optimizer.zero_grad()
            _, reconstructed = autoencoder(train_t)
            loss_fn(reconstructed, train_t).backward()
            optimizer.step()

        autoencoder.eval()
        with torch.no_grad():
            return [autoencoder(t)[0].numpy() for t in tensors]

    def run_case(case_name, case_data):
        (X_train_case, X_val_case, X_test_case), scaler_cls = case_data

        if scaler_cls is not None:
            case_scaler = scaler_cls()
            X_train_case = case_scaler.fit_transform(X_train_case)
            X_val_case = case_scaler.transform(X_val_case)
            X_test_case = case_scaler.transform(X_test_case)

        latent_train, latent_val, latent_test = encode_splits(
            X_train_case, X_val_case, X_test_case
        )

        case_results = {}
        case_submission = {}
        for model_name, model in train_models.items():
            # Fit a private copy: the same estimator object must not be fitted
            # from several threads at once.
            case_model = copy.deepcopy(model)
            case_model.fit(latent_train, y_train)

            key = f'{case_name} - {model_name}'
            case_results[key] = mean_squared_error(y_val, case_model.predict(latent_val))
            case_submission[key] = case_model.predict(latent_test)

        return case_results, case_submission

    # Run the cases in parallel; each one works on its own scaler, autoencoder
    # and estimator copies.
    with ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(run_case, case_name, case_data)
            for case_name, case_data in cases.items()
        ]
        for future in futures:
            case_results, case_submission = future.result()
            results.update(case_results)
            submission_data.update(case_submission)

    # Save the predictions of the best-scoring combination as the submission.
    best_case = min(results, key=results.get)

    submission = pd.read_csv('../data/sample_submission.csv')
    submission['y'] = submission_data[best_case]
    os.makedirs('result', exist_ok=True)
    submission.to_csv(f'result/{best_case}_submission.csv',index=False)

    return results

In [26]:
# NOTE: this call passes seven arguments. The TypeError recorded below comes from
# running this cell (In [26]) after the later cell (In [25]) had redefined `experiment`
# with an additional `test_ids` parameter, which this call does not supply.
experiment(DiffusionModel,  X_train, X_val, X_test, y_train, y_val, train_model)

TypeError: experiment() missing 1 required positional argument: 'train_models'

In [25]:
import torch
import torch.nn as nn
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error

class DiffusionModel(nn.Module):
    """Symmetric fully connected encoder and decoder.

    ``input_dim -> 128 -> 64 -> latent_dim`` on the way in and the reverse on
    the way out, with ReLU between the Linear layers. The forward pass is
    deterministic: nothing in this class adds noise.
    """

    def __init__(self, input_dim, latent_dim):
        super().__init__()
        wide, narrow = 128, 64

        encoder_layers = [
            nn.Linear(input_dim, wide),
            nn.ReLU(),
            nn.Linear(wide, narrow),
            nn.ReLU(),
            nn.Linear(narrow, latent_dim),
        ]
        decoder_layers = [
            nn.Linear(latent_dim, narrow),
            nn.ReLU(),
            nn.Linear(narrow, wide),
            nn.ReLU(),
            nn.Linear(wide, input_dim),
        ]
        self.encoder = nn.Sequential(*encoder_layers)
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Encode ``x`` and decode it again; return ``(latent, reconstructed)``."""
        latent = self.encoder(x)
        return latent, self.decoder(latent)

def experiment(DiffModel, X_train, X_val, X_test, y_train, y_val, test_ids, train_models,
               *, latent_dim=10, epochs=100, lr=0.001):
    """Evaluate every preprocessing case with every regressor on latent features.

    For each of the six cases (skew adjustment on/off x Standard / MinMax / no
    scaler) the splits are preprocessed, an autoencoder of class ``DiffModel``
    is trained on the training split, all splits are encoded, and each
    regressor is fitted on the training latents and scored on the validation
    latents. The test predictions of the best (lowest validation MSE)
    combination are written to a CSV file under ``result/``.

    Args:
        DiffModel: model class constructed as ``DiffModel(input_dim=...,
            latent_dim=...)`` whose forward pass returns
            ``(latent, reconstructed)``.
        X_train, X_val, X_test: feature DataFrames for the three splits.
        y_train, y_val: targets for the training and validation splits.
        test_ids: values for the ``Id`` column of the submission, one per
            test row.
        train_models: dict mapping a model name to an estimator with
            ``fit`` / ``predict``.
        latent_dim: size of the latent code.
        epochs: number of full-batch training steps for the autoencoder.
        lr: Adam learning rate for the autoencoder.

    Returns:
        Dict mapping ``'<case> - <model>'`` to the validation MSE.

    Raises:
        ValueError: if ``train_models`` is empty.

    Note:
        ``skew_adjustment`` only accepts one frame, so each split is adjusted
        independently of the others.
    """
    import copy
    import os

    if not train_models:
        raise ValueError('train_models must contain at least one regressor')

    results = {}
    submission_data = {}

    # The adjusted frames are only read by the cases, so compute them once
    # instead of once per case.
    skewed_splits = (skew_adjustment(X_train), skew_adjustment(X_val), skew_adjustment(X_test))
    raw_splits = (X_train, X_val, X_test)

    # Case definitions: splits + scaler class (a class, not an instance, so
    # every case fits its own scaler and nothing is shared between threads).
    cases = {
        # 1. with skew adjustment
        '1-1 : Skew Adjusted + Standard Scaler': (skewed_splits, StandardScaler),
        '1-2 : Skew Adjusted + MinMax Scaler': (skewed_splits, MinMaxScaler),
        '1-3 : Skew Adjusted + Scaler (x)': (skewed_splits, None),

        # 2. without skew adjustment
        '2-1 : Skew Adjusted (x) + Standard Scaler': (raw_splits, StandardScaler),
        '2-2 : Skew Adjusted (x) + MinMax Scaler': (raw_splits, MinMaxScaler),
        '2-3 : Skew Adjusted (x) + Scaler (x)': (raw_splits, None),
    }

    def encode_splits(*splits):
        # torch.tensor cannot take a DataFrame directly (the no-scaler cases),
        # so go through a float32 ndarray for both frames and scaled arrays.
        tensors = [torch.tensor(np.asarray(part, dtype=np.float32)) for part in splits]
        train_t = tensors[0]

        autoencoder = DiffModel(input_dim=train_t.shape[1], latent_dim=latent_dim)
        optimizer = torch.optim.Adam(autoencoder.parameters(), lr=lr)
        loss_fn = torch.nn.MSELoss()

        # Train on reconstruction error first; an untrained encoder would only
        # be a random projection of the inputs.
        autoencoder.train()
        for _ in range(epochs):
            optimizer.zero_grad()
            _, reconstructed = autoencoder(train_t)
            loss_fn(reconstructed, train_t).backward()
            optimizer.step()

        autoencoder.eval()
        with torch.no_grad():
            return [autoencoder(t)[0].numpy() for t in tensors]

    def run_case(case_name, case_data):
        (X_train_case, X_val_case, X_test_case), scaler_cls = case_data

        # Scale the data when the case asks for it
        if scaler_cls is not None:
            case_scaler = scaler_cls()
            X_train_case = case_scaler.fit_transform(X_train_case)
            X_val_case = case_scaler.transform(X_val_case)
            X_test_case = case_scaler.transform(X_test_case)

        latent_train, latent_val, latent_test = encode_splits(
            X_train_case, X_val_case, X_test_case
        )

        case_results = {}
        case_submission = {}
        for model_name, model in train_models.items():
            # Fit a private copy: the same estimator object must not be fitted
            # from several threads at once.
            case_model = copy.deepcopy(model)
            case_model.fit(latent_train, y_train)

            key = f'{case_name} - {model_name}'
            case_results[key] = mean_squared_error(y_val, case_model.predict(latent_val))
            case_submission[key] = case_model.predict(latent_test)

        return case_results, case_submission

    # Run the cases in parallel; each one works on its own scaler, autoencoder
    # and estimator copies, and results are merged only in this thread.
    with ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(run_case, case_name, case_data)
            for case_name, case_data in cases.items()
        ]
        for future in futures:
            case_results, case_submission_data = future.result()
            results.update(case_results)
            submission_data.update(case_submission_data)

    # Save the predictions of the best-scoring combination as the submission.
    best_case = min(results, key=results.get)

    submission = pd.DataFrame({
        'Id': test_ids,
        'Prediction': submission_data[best_case]
    })
    os.makedirs('result', exist_ok=True)
    submission.to_csv(f'result/best_model_{best_case}_submission.csv', index=False)

    print(f"Best case: {best_case}")

    return results
