# 라이브러리

In [97]:
import random
import os
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import root_mean_squared_error, r2_score
import torch

# 전역 설정

In [98]:
class CFG:
    """Global settings shared by the notebook cells."""

    # Directory prefix prepended to the CSV file names when loading data.
    data_path = "data/"

    # Seed used for the random number generators and the CV splitter.
    seed = 42

    # Prefix of the per-fold checkpoint file names.
    model_name = "model-rnncnn"

    # Number of consecutive non-improving epochs that triggers early stopping.
    patience_limit = 5

# 함수 및 클래스

In [99]:
def seed_everything(seed):
    """
    Seed every random number generator used here so runs are repeatable.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and CUDA), exports
    ``PYTHONHASHSEED`` and puts cuDNN into deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    # NOTE: changing PYTHONHASHSEED at runtime does not re-seed the hashing of
    # the already running interpreter; it is only inherited by child processes.
    os.environ['PYTHONHASHSEED'] = str(seed)

    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)

    # Give up cuDNN autotuning in exchange for reproducible kernel selection.
    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True

def train_loop(dataloader, model, loss_fn, optimizer, device):
    """
    Train the model for one pass over ``dataloader``.

    Args:
        dataloader: Sized iterable of batches; each batch is a mapping with
            the keys "seq", "tb" and "y" holding tensors.
        model: Module called as ``model(seq, tb)``.
        loss_fn: Callable ``loss_fn(pred, target)`` returning a scalar tensor.
        optimizer: Optimizer that updates the model's parameters.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        The mean of the per-batch loss values.
    """
    model.train()
    batch_losses = []
    for batch in dataloader:
        seq = batch["seq"].to(device)
        tb = batch["tb"].to(device)
        target = batch["y"].to(device)

        loss = loss_fn(model(seq, tb), target)

        # Clear stale gradients, backpropagate, then update the weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

    # Averaged over the number of batches, not the number of samples.
    return sum(batch_losses) / len(dataloader)

@torch.no_grad()
def test_loop(dataloader,model,loss_fn,device):
    """
    모델 검증 및 추론 기능 함수
    """
    epoch_loss = 0
    model.eval()
    pred_list = []
    for batch in dataloader:

        pred = model(batch["seq"].to(device),batch["tb"].to(device))
        if batch.get("y") is not None:
            loss = loss_fn(pred, batch["y"].to(device))
            epoch_loss += loss.item()

        pred = pred.to("cpu").numpy()
        pred_list.append(pred)

    epoch_loss /= len(dataloader)
    pred = np.concatenate(pred_list)
    return epoch_loss , pred

class TorchDataset(torch.utils.data.Dataset):
    """
    Dataset pairing a sequence input, a tabular input and an optional target.

    Args:
        seq: Indexable collection of per-sample sequence data; its length
            defines the dataset length.
        tb: Indexable collection of per-sample tabular features.
        y: Optional indexable collection of per-sample targets. When omitted
            the returned items carry no "y" key.

    Each item is a dict of float32 tensors with the keys "seq", "tb" and,
    when targets were given, "y".
    """
    def __init__(self, seq, tb, y=None):
        self.seq = seq
        self.tb = tb
        self.y = y

    def __len__(self):
        return len(self.seq)

    @staticmethod
    def _to_float_tensor(value):
        # torch.Tensor(value) is the legacy constructor: given a bare integer
        # it allocates an uninitialised tensor of that many elements instead of
        # wrapping the value. as_tensor always converts the data itself.
        return torch.as_tensor(value, dtype=torch.float32)

    def __getitem__(self, idx):
        item = {
            "seq": self._to_float_tensor(self.seq[idx]),
            "tb": self._to_float_tensor(self.tb[idx]),
        }
        if self.y is not None:
            item["y"] = self._to_float_tensor(self.y[idx])
        return item
    
class ResidualBlock(torch.nn.Module):
    """
    Fully connected residual block: ``act(fx(x) + x)``.

    ``fx`` is Linear -> act -> Dropout -> Linear, all at width ``in_features``
    so the skip connection can be added without a projection.

    Args:
        in_features: Width of the input and output.
        act: Activation module. The same instance is used inside ``fx`` and
            after the skip connection, so a parametric activation shares its
            parameters between both positions.
        dropout: Dropout probability inside ``fx``. Defaults to 0.5, the value
            that used to be hard-coded.
    """
    def __init__(self, in_features, act, dropout=0.5):
        super().__init__()
        # Layer order is kept as-is so state-dict keys (fx.0.*, fx.3.*) stay
        # compatible with previously saved checkpoints.
        self.fx = torch.nn.Sequential(
            torch.nn.Linear(in_features, in_features),
            act,
            torch.nn.Dropout(dropout),
            torch.nn.Linear(in_features, in_features),
        )
        self.act = act

    def forward(self, x):
        """Return ``act(fx(x) + x)``; the output has the same shape as ``x``."""
        return self.act(self.fx(x) + x)

class Net(torch.nn.Module):
    """
    Two-branch regression model.

    The sequence branch runs an LSTM followed by two Conv1d/MaxPool stages and
    global average pooling; the tabular branch runs a Linear+BatchNorm layer
    followed by residual blocks. Both results are concatenated and mapped to a
    single output value by a linear layer.

    Args:
        n_features: Number of features per time step of the sequence input.
        in_features: Number of tabular features.
        lstm_hidden: LSTM hidden size; the conv stages widen it to 2x and 4x.
        act_conv1d: Key of ``act_dict`` for the activation after each conv.
        act_init: Key of ``act_dict`` for the first tabular layer.
        act_res: Key of ``act_dict`` for the residual blocks.
        n_layers: Number of residual blocks in the tabular branch.
        **kwarg: Ignored. Lets callers pass a hyper-parameter dict that also
            holds non-model entries.

    Raises:
        KeyError: If an activation name is not a key of ``act_dict``.
    """
    def __init__(self, n_features,in_features, lstm_hidden=128, act_conv1d="leaky_relu", act_init="leaky_relu",act_res="relu",
                  n_layers = 4, **kwarg):
        super().__init__()
        # Name -> prototype activation. Kept as an attribute for lookups; the
        # layers below never use these instances directly (see new_act).
        self.act_dict = {
            "relu" : torch.nn.ReLU(),
            "leaky_relu" : torch.nn.LeakyReLU(),
            "prelu" : torch.nn.PReLU(),
            "elu" : torch.nn.ELU(),
            "silu" : torch.nn.SiLU(),
            "gelu" : torch.nn.GELU(),
        }

        def new_act(name):
            # Build a fresh module per layer position. Reusing the single
            # prototype everywhere would tie the learnable slope of a
            # parametric activation (PReLU) across all layers that use it.
            # All prototypes are default-constructed, so type(...)() is an
            # exact copy of the prototype's configuration.
            return type(self.act_dict[name])()

        # Layer creation order and attribute names are unchanged so parameter
        # initialisation and state-dict keys match earlier checkpoints.
        self.lstm_layer = torch.nn.LSTM(n_features, lstm_hidden, batch_first=True)
        self.conv1d_block = torch.nn.Sequential( # input shape: B, F, S
            torch.nn.Conv1d(lstm_hidden, lstm_hidden*2, 3),
            new_act(act_conv1d),
            torch.nn.MaxPool1d(2),
            torch.nn.Conv1d(lstm_hidden*2, lstm_hidden*4, 3),
            new_act(act_conv1d),
            torch.nn.MaxPool1d(2),
            torch.nn.AdaptiveAvgPool1d(1), # B, F, 1
            torch.nn.Flatten(), # B, F
        )

        self.init_layer = torch.nn.Sequential(
            torch.nn.Linear(in_features, in_features // 2),
            torch.nn.BatchNorm1d(in_features // 2),
            new_act(act_init)
        )
        res_list = [ ResidualBlock(in_features//2, new_act(act_res)) for _ in range(n_layers) ]
        self.res_seq = torch.nn.Sequential(*res_list)

        # Head over the concatenated sequence (4 * lstm_hidden) and tabular
        # (in_features // 2) representations.
        self.output_layer = torch.nn.Linear(lstm_hidden*4 + in_features//2, 1)

    def forward(self, seq, tb):
        """
        Args:
            seq: Sequence input of shape (B, S, n_features). With kernel size 3
                and two pooling stages, S must be at least 10.
            tb: Tabular input of shape (B, in_features).

        Returns:
            Tensor of shape (B, 1).
        """
        # Only the per-step LSTM outputs are used; the final states are not.
        output, _ = self.lstm_layer(seq)
        # Conv1d expects channels first: (B, S, F) -> (B, F, S).
        seq = self.conv1d_block( output.permute(0, 2, 1) )

        tb = self.init_layer(tb)
        tb = self.res_seq(tb)

        x = torch.cat([seq,tb],dim=1)
        return self.output_layer(x)

# 데이터 불러오기

In [100]:
# Read the four competition CSV files from CFG.data_path:
# time-series inputs and targets for training, time-series inputs for the
# test set, and the submission template.
train_trans, train_target = (
    pd.read_csv(f"{CFG.data_path}x_train.csv"),
    pd.read_csv(f"{CFG.data_path}y_train.csv"),
)
test_trans, submit = (
    pd.read_csv(f"{CFG.data_path}x_test.csv"),
    pd.read_csv(f"{CFG.data_path}submission.csv"),
)

# Displayed as the cell output: the shape of each loaded frame.
tuple(frame.shape for frame in (train_trans, train_target, test_trans, submit))

((22176, 30), (154, 2), (9504, 30), (66, 2))

# 전처리

- 2차원 피처셋 생성

In [101]:
# Columns summarised per sample by their mean only.
cols = [
    "DAT", "fan", "co2", "heater", "window1", "window2",
    "curtain1", "curtain2", "curtain3", "side_curtain", "rain_sensor",
    "crown_diameter", "petiole_length", "leaf_count", "leaf_length",
    "leaf_width", "fruit_count", "plant_height",
    "flower_count", "numbers of plant",
]
agg_dict = {}
for col in cols:
    agg_dict[col] = [(f"{col}_mean", "mean")]

# Columns summarised per sample by six statistics:
# mean, min, max, skewness, kurtosis and standard deviation.
cols = [
    "in_temp", "in_hum", "in_co2",
    "out_temp", "out_hum",
    "solar_rad", "wind_speed", "wind_direction",
]
for col in cols:
    agg_dict[col] = [(f"{col}_{stat}", stat) for stat in ("mean", "min", "max", "skew")]
    # Kurtosis is passed as a callable rather than by name (presumably because
    # the grouped aggregation does not accept a "kurt" alias -- TODO confirm).
    agg_dict[col] += [(f"{col}_kurt", lambda x: x.kurt()), (f"{col}_std", "std")]

In [102]:
# Collapse each sample's time series into one row of summary statistics,
# then align the rows with the order of the target / submission frames.

# Training set: aggregate, flatten the two-level column index down to the
# custom statistic names, and restore Sample_Number as a regular column.
tmp = (
    train_trans.groupby("Sample_Number")
    .agg(agg_dict)
    .droplevel(0, axis=1)
    .reset_index()
)
# The first column of train_target supplies the key (and row order) to merge on.
train_ft = train_target.iloc[:, :1].merge(tmp, how="left", on="Sample_Number")
train_ft = train_ft.drop(columns="Sample_Number")

# Test set: same steps, ordered like the submission template.
tmp = (
    test_trans.groupby("Sample_Number")
    .agg(agg_dict)
    .droplevel(0, axis=1)
    .reset_index()
)
test_ft = submit.iloc[:, :1].merge(tmp, how="left", on="Sample_Number")
test_ft = test_ft.drop(columns="Sample_Number")

# Displayed as the cell output: total number of missing values per frame.
train_ft.isnull().sum().sum(), test_ft.isnull().sum().sum()

(0, 0)

In [103]:
# Min-max scale the tabular features; the ranges are learned on the training
# rows only and then applied to both sets.
# NOTE(review): the scaler is fitted on all training rows before the
# cross-validation split, so validation folds contribute to the scaling ranges.
scaler = MinMaxScaler().fit(train_ft)
train_ft, test_ft = scaler.transform(train_ft), scaler.transform(test_ft)

# Displayed as the cell output: shapes of the scaled feature matrices.
train_ft.shape, test_ft.shape

((154, 68), (66, 68))

- 3차원 피처셋 생성

In [104]:
# Identifier / timestamp columns that are not fed to the sequence branch.
# Other columns that could optionally be dropped as well (currently kept):
#   "DAT", "crown_diameter", "petiole_length", "leaf_count", "leaf_length",
#   "leaf_width", "fruit_count", "plant_height", "flower_count",
#   "numbers of plant"
drop_cols = ["Sample_Number", "time"]

# NOTE: this re-fits the existing `scaler` object on the row-level data, so
# from here on it no longer holds the ranges of the tabular features.
scaler.fit(train_trans.drop(columns=drop_cols))
train_data = scaler.transform(train_trans.drop(columns=drop_cols))
test_data = scaler.transform(test_trans.drop(columns=drop_cols))

# Displayed as the cell output: shapes of the scaled row-level matrices.
train_data.shape, test_data.shape

((22176, 28), (9504, 28))

In [105]:
# Fold the flat (rows, features) matrices into (samples, 144, features).
# NOTE(review): assumes every sample has exactly 144 consecutive rows and that
# samples appear in the same order as in the target frame -- TODO confirm.
n = train_data.shape[1]
train_data, test_data = (
    matrix.reshape(-1, 144, n) for matrix in (train_data, test_data)
)

# Displayed as the cell output: shapes of the 3-D arrays.
train_data.shape, test_data.shape

((154, 144, 28), (66, 144, 28))

- 정답 데이터

In [106]:
# Regression target as a column vector of shape (n_samples, 1).
target = train_target["CO2 final"].to_numpy()[:, np.newaxis]

# Displayed as the cell output.
target.shape

(154, 1)

- 데이터셋 클래스 테스트

In [107]:
# Smoke test: wrap the full training arrays and pull the first batch of two.
dt = TorchDataset(seq=train_data, tb=train_ft, y=target)
dl = torch.utils.data.DataLoader(dataset=dt, batch_size=2, shuffle=False)
batch = next(iter(dl))

# Displayed as the cell output: the dict of batched tensors.
batch

{'seq': tensor([[[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 1.0000],
          [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 1.0000],
          [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 1.0000],
          ...,
          [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 1.0000],
          [0.0000, 1.0000, 0.0000,  ..., 0.0000, 0.0000, 1.0000],
          [0.0000, 1.0000, 0.0000,  ..., 0.0000, 0.0000, 1.0000]],
 
         [[0.0083, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 1.0000],
          [0.0083, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 1.0000],
          [0.0083, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 1.0000],
          ...,
          [0.0083, 1.0000, 0.0000,  ..., 0.0000, 0.0000, 1.0000],
          [0.0083, 1.0000, 0.0000,  ..., 0.0000, 0.0000, 1.0000],
          [0.0083, 1.0000, 0.0000,  ..., 0.0000, 0.0000, 1.0000]]]),
 'tb': tensor([[0.0000, 0.4653, 0.1333, 0.5373, 1.0000, 1.0000, 0.8599, 0.9766, 0.5779,
          0.3291, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0083, 0.0000, 0.0

- 모델 입출력 테스트

In [108]:
# Smoke test: build a default-sized model and run the sample batch through it
# to check that the input and output shapes line up.
model = Net(n_features=train_data.shape[2], in_features=train_ft.shape[1])
model(batch["seq"], batch["tb"])

tensor([[-0.0619],
        [-0.0129]], grad_fn=<AddmmBackward0>)

# 학습
- 학습 및 검증 과정에서 모델 가중치 저장됨

In [109]:
# Cross-validation and optimisation settings.
n_splits = 5  # number of folds (k in k-fold)
epochs = 100  # upper bound on epochs per fold
loss_fn = torch.nn.MSELoss()

# Model input widths taken from the prepared arrays.
n_features = train_data.shape[-1]
in_features = train_ft.shape[-1]

cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=CFG.seed)

if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Stratification labels: the continuous target binned into 10 quantile groups.
groups = pd.qcut(target.ravel(), q=10, labels=np.arange(10))

# Hyper-parameters. The whole dict is forwarded to Net, which ignores the
# entries it does not know ("lr", "batch_size").
hp = dict(
    lstm_hidden=752,
    act_conv1d='relu',
    act_init='silu',
    act_res='relu',
    n_layers=4,
    lr=0.0006,
    batch_size=16,
)

# Displayed as the cell output.
device

'cuda'

In [110]:
# Cross-validated training with early stopping.
# For every fold a fresh model is trained; whenever the validation RMSE
# improves, the weights are written to "<CFG.model_name>-<fold>.pth".
is_holdout = False # True: stop after the first fold (hold-out), False: run every fold
seed_everything(CFG.seed)
best_score1_list = []  # best validation RMSE per fold
best_score2_list = []  # R2 at the epoch of the best RMSE, per fold

for i,(tri,vai) in enumerate(cv.split(train_data, groups)):

    # Fresh model and optimizer for this fold.
    model = Net(n_features, in_features, **hp).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=hp["lr"])

    # Sequence (3-D) inputs and targets for training.
    x_train = train_data[tri]
    y_train = target[tri]

    # Sequence (3-D) inputs and targets for validation.
    x_valid = train_data[vai]
    y_valid = target[vai]

    # Tabular (2-D) inputs for training.
    x_train_ft = train_ft[tri]

    # Tabular (2-D) inputs for validation.
    x_valid_ft = train_ft[vai]

    # Dataset / loader objects for mini-batch training.
    train_dt = TorchDataset(x_train,x_train_ft,y_train)
    valid_dt = TorchDataset(x_valid,x_valid_ft,y_valid)
    train_dl = torch.utils.data.DataLoader(train_dt, batch_size=hp["batch_size"], shuffle=True)
    valid_dl = torch.utils.data.DataLoader(valid_dt, batch_size=hp["batch_size"],shuffle=False)

    # Train epoch by epoch; stop early once the validation RMSE has not
    # improved for CFG.patience_limit consecutive epochs.
    best_score1 = np.inf
    # Reset per fold so a fold can never report the previous fold's R2.
    best_score2 = np.nan
    patience = 0
    for epoch in tqdm(range(epochs)):

        train_loss = train_loop(train_dl, model, loss_fn,optimizer,device )
        valid_loss , pred = test_loop(valid_dl, model, loss_fn,device  )

        score1 = root_mean_squared_error(y_valid, pred )
        score2 = r2_score(y_valid, pred )
        if best_score1 > score1:
            patience = 0
            best_score1 = score1
            best_score2 = score2
            torch.save(model.state_dict(),f"{CFG.model_name}-{i}.pth") # checkpoint the best weights
        else:
            patience += 1

        # >= rather than == so the loop still stops if the limit is
        # configured below 1 or the counter ever skips past it.
        if patience >= CFG.patience_limit:
            break
    print(f"Fold ({i}), BEST RMSE: {best_score1}, BEST R2: {best_score2}")
    best_score1_list.append(best_score1)
    best_score2_list.append(best_score2)

    # The optimizer holds references to the parameters and its own state
    # tensors, so it has to be released together with the model before the
    # cached GPU memory can actually be returned.
    del model, optimizer
    torch.cuda.empty_cache()

    if is_holdout:
        break

  0%|          | 0/100 [00:00<?, ?it/s]

Fold (0), BEST RMSE: 1.5808073225798196, BEST R2: 0.8901060977870137


  0%|          | 0/100 [00:00<?, ?it/s]

Fold (1), BEST RMSE: 1.8126807213463094, BEST R2: 0.8458378853104638


  0%|          | 0/100 [00:00<?, ?it/s]

Fold (2), BEST RMSE: 1.9325174236845286, BEST R2: 0.8469955422740878


  0%|          | 0/100 [00:00<?, ?it/s]

Fold (3), BEST RMSE: 1.0389782303709945, BEST R2: 0.9586458800045733


  0%|          | 0/100 [00:00<?, ?it/s]

Fold (4), BEST RMSE: 1.958089492110944, BEST R2: 0.8294880226840295


In [111]:
# Average the per-fold best scores into one cross-validation summary.
mean_rmse = np.mean(best_score1_list)
mean_r2 = np.mean(best_score2_list)
print(f"RMSE: {mean_rmse}, R2: {mean_r2}")

RMSE: 1.6646146380185194, R2: 0.8742146856120337
