**Table of contents**    
1. Import Library    
2. Define Function for Feature Engineering    
3. Define Custom Dataset Class    
4. Define Model Architecture and Training Functions    
5. Train Models and Generate Predictions    
6. Inference    
7. Prepare Submission File    

<!-- vscode-jupyter-toc-config
	numbering=true
	anchor=false
	flat=true
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

# 1. Import Library

In [1]:
import sys
lib_dir = "g:/My Drive/Storage/Github/hyuckjinkim"
sys.path.append(lib_dir)

from lib.python.graph import MatplotlibFontManager
fm = MatplotlibFontManager()
fm.set_korean_font(check=False)

from lib.python.filesystem_utils import to_pickle, read_pickle
from lib.python.torch import seed_everything
from lib.python.torch.build_model import train, predict, inference
from lib.python.log import get_logger
seed_everything(42)

In [2]:
# import pandas as pd
# pd.set_option("display.max_rows", None)
# pd.set_option("display.max_columns", None)

# train_df = pd.read_csv('data/train/train.csv')

# train_meta1_df = pd.read_csv('data/train/meta/TRAIN_산지공판장_2018-2021.csv')
# train_meta1_df.drop(['품목코드','품종코드','공판장코드'], axis=1, inplace=True)

# train_meta2_df = pd.read_csv('data/train/meta/TRAIN_전국도매_2018-2021.csv')
# train_meta2_df.drop(['품목코드','품종코드','시장코드']  , axis=1, inplace=True)

# train_df.head(2) # ['평년 평균가격(원)','평균가격(원)']
# train_meta1_df.head(2)
# train_meta2_df.head(2)

In [3]:
import os
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from types import SimpleNamespace
from copy import deepcopy
import pickle
from functools import partial
import gc
gc.collect()

import scipy
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# 2. Define Function for Feature Engineering
- 타겟의 필터 조건을 제외한 메타데이터의 필터 조건은 참가자들 각자의 기준에 맞춰 자유롭게 사용가능 
- 밑의 필터 조건은 임의로 제공하는 예시

In [4]:
def variance_threshold_select(data, threshold=0.01, ignore_features=()):
    """Return the numeric columns of ``data`` whose variance is below ``threshold``.

    Args:
        data: DataFrame to inspect; only numeric-dtype columns are considered.
        threshold: A column is reported when its sample variance is strictly
            smaller than this value.
        ignore_features: Iterable of column names that must never be reported.

    Returns:
        list: Names of the low-variance columns, in the order in which they
        appear in ``data``.
    """
    ignored = set(ignore_features)

    # Walk the column index directly instead of going through a set difference:
    # a set would make the order of the result arbitrary between runs.
    cols = [col for col in data.select_dtypes(include=[np.number]).columns if col not in ignored]

    del_features = []
    for col in cols:
        variance = data[col].std() ** 2
        if variance < threshold:
            del_features.append(col)

    return del_features

def get_sparse_columns(data, p=0.75, ignore_features=()):
    """Return the numeric columns of ``data`` that consist mostly of zeros.

    Args:
        data: DataFrame to inspect; only numeric-dtype columns are considered.
        p: A column is reported when its fraction of exact zeros is strictly
            greater than this value.
        ignore_features: Iterable of column names that must never be reported.

    Returns:
        list: Names of the sparse columns, in the order in which they appear
        in ``data``.
    """
    ignored = set(ignore_features)

    # Keep the DataFrame's own column order so the result is deterministic.
    numerical_features = [
        col for col in data.select_dtypes(include=[np.number]).columns if col not in ignored
    ]

    n_rows = len(data)
    del_features = []
    for col in numerical_features:
        zero_percent = (data[col] == 0).sum() / n_rows
        if zero_percent > p:
            del_features.append(col)
    return del_features

def add_trend(data, ignore_features=(), windows=(3,), alphas=(0.5,)):
    """Append simple trend features for every numeric column of ``data``.

    For each numeric column ``col`` (except those in ``ignore_features``) the
    following columns are added:

    * ``{col}_diff``      - first difference, leading NaN back-filled
    * ``{col}_ma{window}`` - rolling mean, leading NaNs back-filled
    * ``{col}_ew{alpha}``  - exponentially weighted mean (``adjust=False``)

    Args:
        data: Input DataFrame; it is not modified.
        ignore_features: Iterable of column names to leave without trend features.
        windows: Rolling-mean window sizes. The default is kept small because
            the test series only has 9 rows.
        alphas: Smoothing factors for the exponentially weighted mean.

    Returns:
        pandas.DataFrame: A copy of ``data`` with the new columns appended.
        New columns follow the column order of ``data``.
    """
    d = data.copy()
    ignored = set(ignore_features)

    # Iterate in DataFrame column order (not via a set) so the layout of the
    # generated features is identical on every run.
    numerical_features = [
        col for col in d.select_dtypes(include=[np.number]).columns if col not in ignored
    ]

    new_columns = {}
    for col in numerical_features:
        new_columns[f'{col}_diff'] = d[col].diff().bfill()
        for window in windows:
            new_columns[f'{col}_ma{window}'] = d[col].rolling(window=window).mean().bfill()
        for alpha in alphas:
            new_columns[f'{col}_ew{alpha}'] = d[col].ewm(alpha=alpha, adjust=False).mean()

    # Attach all new columns in one concat rather than one insert per column.
    d = pd.concat([d, pd.DataFrame(new_columns)], axis=1)
    # Copy once more to get a defragmented frame.
    d = d.copy()

    return d

In [5]:
def process_data(raw_file, 산지공판장_file, 전국도매_file, 품목명, scalers=None, imputers=None, is_train=True):
    """Build the scaled, numeric feature table for one item (품목명).

    The three CSV files are read, price/volume columns are log1p-transformed,
    the target rows for the item are selected, and the remaining rows of the
    same item plus the filtered metadata tables are joined on '시점'.
    Missing values are then imputed and every numeric column is min-max scaled.

    Args:
        raw_file: Path of the main price CSV.
        산지공판장_file: Path of the joint-market metadata CSV.
        전국도매_file: Path of the wholesale-market metadata CSV.
        품목명: Item name; must be a key of the ``conditions`` dict below.
        scalers: ``None`` to fit one MinMaxScaler per numeric column, or a
            dict ``{column: fitted scaler}`` to apply to those columns.
        imputers: ``None`` to fit one median SimpleImputer per column that
            contains nulls, or a dict ``{column: fitted imputer}`` to apply.
        is_train: When true, columns that are at least half null and columns
            whose variance is below ``CFG.variance_threshold`` are dropped.

    Returns:
        tuple: ``(filtered_data, scalers, imputers)``.

    NOTE(review): the variance filter reads the module-level ``CFG`` object,
    so ``CFG`` must exist before this function is called with ``is_train=True``.
    """
    raw_data = pd.read_csv(raw_file)
    산지공판장 = pd.read_csv(산지공판장_file)
    전국도매 = pd.read_csv(전국도매_file)

    # Drop the item / variety / market code columns.
    산지공판장.drop(['품목코드','품종코드','공판장코드'], axis=1, inplace=True)
    전국도매  .drop(['품목코드','품종코드','시장코드']  , axis=1, inplace=True)

    # Keep the year column in only one metadata table, and re-base it so that 2018 becomes 0.
    산지공판장.drop('연도', axis=1, inplace=True)
    전국도매['연도'] -= 2018

    # Outlier handling: negative values are replaced by 0.
    # NOTE(review): only 전국도매 is clipped here, although 산지공판장 has columns with
    # the same names that are also log1p-transformed below — confirm this is intended.
    for col in ['전순 평균가격(원) PreVious SOON', '전달 평균가격(원) PreVious MMonth', '전년 평균가격(원) PreVious YeaR']:
        loc = 전국도매[col] < 0
        전국도매.loc[loc,col] = 0

    # Log transform (log1p) of the price / volume columns of each table.
    raw_cols = ['평년 평균가격(원)', '평균가격(원)']
    산지공판장_cols =  ['총반입량(kg)', '총거래금액(원)', '평균가(원/kg)', '중간가(원/kg)', '최저가(원/kg)', '최고가(원/kg)', '경매 건수', 
                       '전순 평균가격(원) PreVious SOON', '전달 평균가격(원) PreVious MMonth', '전년 평균가격(원) PreVious YeaR', '평년 평균가격(원) Common Year SOON']
    전국도매_cols = ['총반입량(kg)', '총거래금액(원)', '평균가(원/kg)', '고가(20%) 평균가', '중가(60%) 평균가 ', '저가(20%) 평균가', '중간가(원/kg)', '최저가(원/kg)',
                    '최고가(원/kg)', '경매 건수', '전순 평균가격(원) PreVious SOON', '전달 평균가격(원) PreVious MMonth', '전년 평균가격(원) PreVious YeaR', '평년 평균가격(원) Common Year SOON']
    for col in raw_cols:
        raw_data[col] = np.log1p(raw_data[col])
    for col in 산지공판장_cols:
        산지공판장[col] = np.log1p(산지공판장[col])
    for col in 전국도매_cols:
        전국도매[col] = np.log1p(전국도매[col])

    # Filter conditions per item:
    #   'target' - boolean mask selecting the rows whose price is predicted
    #   '공판장'  - {column: allowed values} filter for the joint-market metadata (None = not used)
    #   '도매'    - {column: allowed values} filter for the wholesale metadata (None = not used)
    conditions = {
        '감자': {
            'target': lambda df: (df['품종명'] == '감자 수미') & (df['거래단위'] == '20키로상자') & (df['등급'] == '상'),
            '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['감자'], '품종명': ['수미'], '등급명': ['상']},
            '도매': {'시장명': ['*전국도매시장'], '품목명': ['감자'], '품종명': ['수미']}
        },
        '건고추': {
            'target': lambda df: (df['품종명'] == '화건') & (df['거래단위'] == '30 kg') & (df['등급'] == '상품'),
            '공판장': None, 
            '도매': None  
        },
        '깐마늘(국산)': {
            'target': lambda df: (df['거래단위'] == '20 kg') & (df['등급'] == '상품'),
            '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['마늘'], '품종명': ['깐마늘'], '등급명': ['상']},
            '도매': {'시장명': ['*전국도매시장'], '품목명': ['마늘'], '품종명': ['깐마늘']}
        },
        '대파': {
            'target': lambda df: (df['품종명'] == '대파(일반)') & (df['거래단위'] == '1키로단') & (df['등급'] == '상'),
            '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['대파'], '품종명': ['대파(일반)'], '등급명': ['상']},
            '도매': {'시장명': ['*전국도매시장'], '품목명': ['대파'], '품종명': ['대파(일반)']}
        },
        '무': {
            'target': lambda df: (df['거래단위'] == '20키로상자') & (df['등급'] == '상'),
            '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['무'], '품종명': ['기타무'], '등급명': ['상']},
            '도매': {'시장명': ['*전국도매시장'], '품목명': ['무'], '품종명': ['무']}
        },
        '배추': {
            'target': lambda df: (df['거래단위'] == '10키로망대') & (df['등급'] == '상'),
            '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['배추'], '품종명': ['쌈배추'], '등급명': ['상']},
            '도매': {'시장명': ['*전국도매시장'], '품목명': ['배추'], '품종명': ['배추']}
        },
        '사과': {
            'target': lambda df: (df['품종명'].isin(['홍로', '후지'])) & (df['거래단위'] == '10 개') & (df['등급'] == '상품'),
            '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['사과'], '품종명': ['후지'], '등급명': ['상']},
            '도매': {'시장명': ['*전국도매시장'], '품목명': ['사과'], '품종명': ['후지']}
        },
        '상추': {
            'target': lambda df: (df['품종명'] == '청') & (df['거래단위'] == '100 g') & (df['등급'] == '상품'),
            '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['상추'], '품종명': ['청상추'], '등급명': ['상']},
            '도매': {'시장명': ['*전국도매시장'], '품목명': ['상추'], '품종명': ['청상추']}
        },
        '양파': {
            'target': lambda df: (df['품종명'] == '양파') & (df['거래단위'] == '1키로') & (df['등급'] == '상'),
            '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['양파'], '품종명': ['기타양파'], '등급명': ['상']},
            '도매': {'시장명': ['*전국도매시장'], '품목명': ['양파'], '품종명': ['양파(일반)']}
        },
        '배': {
            'target': lambda df: (df['품종명'] == '신고') & (df['거래단위'] == '10 개') & (df['등급'] == '상품'),
            '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['배'], '품종명': ['신고'], '등급명': ['상']},
            '도매': {'시장명': ['*전국도매시장'], '품목명': ['배'], '품종명': ['신고']}
        }
    }

    # Select the target rows of the requested item.
    raw_품목 = raw_data[raw_data['품목명'] == 품목명]
    target_mask = conditions[품목명]['target'](raw_품목)
    filtered_data = raw_품목[target_mask]

    # Derived features: for every other (variety, unit, grade) combination of the same
    # item, left-join its two price columns onto the target rows by '시점'.
    other_data = raw_품목[~target_mask]
    unique_combinations = other_data[['품종명', '거래단위', '등급']].drop_duplicates()
    for _, row in unique_combinations.iterrows():
        품종명, 거래단위, 등급 = row['품종명'], row['거래단위'], row['등급']
        mask = (other_data['품종명'] == 품종명) & (other_data['거래단위'] == 거래단위) & (other_data['등급'] == 등급)
        temp_df = other_data[mask]
        for col in ['평년 평균가격(원)', '평균가격(원)']:
            new_col_name = f'{품종명}_{거래단위}_{등급}_{col}'
            # The right-hand column collides with the target's own `col`, so the merge
            # suffixes it; the rename then turns '<col>_<new_col_name>' into '<new_col_name>'.
            filtered_data = filtered_data.merge(temp_df[['시점', col]], on='시점', how='left', suffixes=('', f'_{new_col_name}'))
            filtered_data.rename(columns={f'{col}_{new_col_name}': new_col_name}, inplace=True)

    # Joint-market metadata: apply the item's filter, prefix the columns, join on '시점'.
    if conditions[품목명]['공판장']:
        filtered_공판장 = 산지공판장
        for key, value in conditions[품목명]['공판장'].items():
            filtered_공판장 = filtered_공판장[filtered_공판장[key].isin(value)]
        filtered_공판장 = filtered_공판장.add_prefix('공판장_').rename(columns={'공판장_시점': '시점'})
        filtered_data = filtered_data.merge(filtered_공판장, on='시점', how='left')

    # Wholesale metadata: same procedure with the '도매_' prefix.
    if conditions[품목명]['도매']:
        filtered_도매 = 전국도매
        for key, value in conditions[품목명]['도매'].items():
            filtered_도매 = filtered_도매[filtered_도매[key].isin(value)]
        filtered_도매 = filtered_도매.add_prefix('도매_').rename(columns={'도매_시점': '시점'})
        filtered_data = filtered_data.merge(filtered_도매, on='시점', how='left')

    # Training only: drop every column in which at least half of the values are null.
    if is_train:
        null_cnt = filtered_data.isnull().sum()
        null_cnt = null_cnt[null_cnt!=0]
        null_rate = null_cnt / len(filtered_data)
        del_cols = null_rate[null_rate>=0.5].index.tolist()
        filtered_data.drop(del_cols, axis=1, inplace=True)

    # Keep only the time key and the numeric columns.
    numeric_columns = filtered_data.select_dtypes(include=[np.number]).columns.tolist()
    filtered_data = filtered_data[['시점']+numeric_columns]

    # Missing-value imputation: fit a median imputer per null-containing column,
    # or apply the imputers that were passed in.
    #filtered_data[numeric_columns] = filtered_data[numeric_columns].fillna(0)
    if imputers is None:
        null_cnt = filtered_data.isnull().sum()
        null_cols = null_cnt[null_cnt!=0].index.tolist()
    
        imputers = {}
        for col in null_cols:
            imputer = SimpleImputer(strategy='median')
            filtered_data[col] = imputer.fit_transform(filtered_data[col].values.reshape(-1,1))
            imputers[col] = imputer

    else:
        # NOTE(review): only columns that have a fitted imputer are filled here; nulls in
        # any other column are left in place.
        for col in imputers.keys():
            imputer = imputers[col]
            filtered_data[col] = imputer.transform(filtered_data[col].values.reshape(-1,1))

    # # Add trend features (currently disabled)
    # ignore_features = filtered_data.columns[filtered_data.columns.str.contains('연도|등급코드')]
    # filtered_data = add_trend(filtered_data, ignore_features)

    # Min-max scaling: fit one scaler per numeric column, or apply the given scalers.
    numeric_columns = filtered_data.select_dtypes(include=[np.number]).columns.tolist()
    if scalers is None:
        scalers = {}
        for col in numeric_columns:
            scaler = MinMaxScaler()
            filtered_data[col] = scaler.fit_transform(filtered_data[col].values.reshape(-1,1))
            scalers[col] = scaler
    else:
        # Every column that has a scaler must be present in filtered_data, otherwise
        # the lookup below raises KeyError.
        for col in scalers.keys():
            scaler = scalers[col]
            filtered_data[col] = scaler.transform(filtered_data[col].values.reshape(-1,1))

    # Training only: drop columns whose variance (after scaling) is below the threshold.
    # The target price column is never dropped.
    if is_train:
        del_cols = variance_threshold_select(filtered_data, threshold=CFG.variance_threshold, ignore_features=['평균가격(원)'])
        filtered_data.drop(del_cols, axis=1, inplace=True)
        print(f'variance<{CFG.variance_threshold} columns: ({len(del_cols)}) {del_cols}')

        # Sparse-column (mostly zero) removal, currently disabled:
        # del_cols = get_sparse_columns(filtered_data, p=CFG.sparse_threshold, ignore_features=['평균가격(원)'])
        # filtered_data.drop(del_cols, axis=1, inplace=True)
        # print(f'sparse columns: ({len(del_cols)}) {del_cols}')

    return filtered_data, scalers, imputers

# 3. Define Custom Dataset Class

In [6]:
class AgriculturePriceDataset(Dataset):
    """Sliding-window dataset over a preprocessed price DataFrame.

    In training mode every sample is a pair ``(x, y)`` where ``x`` holds
    ``window_size`` consecutive rows of all numeric columns and ``y`` holds the
    next ``prediction_length`` values of the price column. In test mode the
    dataset contains exactly one sample: all rows of the numeric columns.

    Args:
        dataframe: Preprocessed DataFrame; must contain the '평균가격(원)'
            column when ``is_test`` is false.
        window_size: Number of input time steps per training sample.
        prediction_length: Number of target time steps per training sample.
        is_test: When true, expose the whole frame as a single input sample.
    """

    def __init__(self, dataframe, window_size=9, prediction_length=3, is_test=False):
        self.data = dataframe
        self.window_size = window_size
        self.prediction_length = prediction_length
        self.is_test = is_test

        self.price_column = '평균가격(원)'
        self.numeric_columns = self.data.select_dtypes(include=[np.number]).columns.tolist()

        # Extract the feature matrix once; selecting the numeric columns from
        # the DataFrame again for every window is loop-invariant work.
        features = self.data[self.numeric_columns].values

        if self.is_test:
            self.sequences = [features]
        else:
            prices = self.data[self.price_column].values
            n_windows = len(self.data) - self.window_size - self.prediction_length + 1
            self.sequences = []
            for start in range(n_windows):
                split = start + self.window_size
                x = features[start:split]
                y = prices[split:split + self.prediction_length]
                self.sequences.append((x, y))

    def __len__(self):
        """Return the number of samples."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return sample ``idx`` as float tensors: ``(x, y)`` in training mode, ``x`` in test mode."""
        if self.is_test:
            return torch.FloatTensor(self.sequences[idx])
        x, y = self.sequences[idx]
        return torch.FloatTensor(x), torch.FloatTensor(y)

# 4. Define Model Architecture and Training Functions

In [7]:
class LSTM(nn.Module):
    """Two stacked LSTM blocks followed by a linear head.

    The first LSTM maps ``config.d_model`` input features to
    ``config.hidden_size``; the second halves the width; the linear layer maps
    the last time step to ``config.pred_len`` outputs.

    Args:
        config: Object with the attributes ``d_model``, ``hidden_size``,
            ``num_layers``, ``pred_len`` and ``dropout``.
    """

    def __init__(self, config):
        super().__init__()
        self.input_size = config.d_model
        self.hidden_size = config.hidden_size
        self.num_layers = config.num_layers
        self.output_size = config.pred_len
        self.dropout = config.dropout

        # First LSTM block
        self.lstm1 = nn.LSTM(self.input_size, self.hidden_size, self.num_layers, dropout=self.dropout, batch_first=True)

        # Second LSTM block (half the hidden width)
        self.lstm2 = nn.LSTM(self.hidden_size, self.hidden_size//2, self.num_layers, dropout=self.dropout, batch_first=True)

        # Final fully connected layer
        self.fc = nn.Linear(self.hidden_size//2, self.output_size)

    def forward(self, x):
        """Return predictions of shape ``(batch, pred_len)`` for ``x`` of shape ``(batch, seq, d_model)``."""
        # nn.LSTM uses zero initial hidden/cell states when none are passed, so
        # there is no need to allocate them by hand. Letting the layer create
        # them also keeps their dtype and device in line with the input.
        out, _ = self.lstm1(x)
        out, _ = self.lstm2(out)

        # Only the last time step feeds the prediction head.
        out = self.fc(out[:, -1, :])
        return out

In [8]:
# https://github.com/lss-1138/SegRNN/blob/main/models/SegRNN.py
from lib.python.torch.models.SegRNN import Model as SegRNN
from lib.python.torch.models.NLinear import Model as NLinear
from lib.python.torch.models.DLinear import Model as DLinear
from lib.python.torch.models.TimeSeriesTransformer import Model as TimeSeriesTransformer
from lib.python.torch.models.PatchMixer import Model as PatchMixer

class EnsembleModel(nn.Module):
    """Ensemble of SegRNN, NLinear and DLinear forecasters.

    Each backbone output is passed through its own ``Linear(d_model, 1)`` head
    and squeezed on the last axis; the three results are then combined
    according to ``config.ensemble``:

    * ``'average'``     - fixed, equal weights
    * ``'weightedsum'`` - learnable weights, normalised with softmax
    * ``'cat'``         - concatenation -> LayerNorm -> Dropout -> small MLP

    Args:
        config: Configuration object. This class reads ``ensemble``,
            ``d_model``, ``pred_len`` and ``dropout``; the same object is also
            handed to every backbone constructor.

    Other backbones imported in this file (LSTM, TimeSeriesTransformer,
    PatchMixer) are currently not part of the ensemble.
    """

    def __init__(self, config):
        super().__init__()
        assert config.ensemble in ['average','weightedsum','cat'], "ensemble must be one of ['average','weightedsum','cat']"
        self.ensemble = config.ensemble
        self.models = ['SegRNN', 'NLinear', 'DLinear']

        self.segrnn_layer = SegRNN(config)
        self.nlinear_layer = NLinear(config)
        self.dlinear_layer = DLinear(config)

        # Per-backbone heads that collapse the feature axis to a single value.
        self.fc_segrnn = nn.Linear(config.d_model, 1)
        self.fc_nlinear = nn.Linear(config.d_model, 1)
        self.fc_dlinear = nn.Linear(config.d_model, 1)

        # Layers used by the 'cat' mode. They are created in every mode so the
        # state_dict layout does not depend on the chosen ensemble type.
        n_models = len(self.models)
        self.fc_cat = nn.Sequential(
            nn.Linear(n_models*config.pred_len, (n_models//2)*config.pred_len),
            nn.ReLU(),
            nn.Dropout(config.dropout),
            nn.Linear((n_models//2)*config.pred_len, config.pred_len),
        )
        self.layernorm = nn.LayerNorm(n_models*config.pred_len)
        self.dropout = nn.Dropout(config.dropout)

        if self.ensemble == 'weightedsum':
            self.ensemble_weights = nn.Parameter(torch.ones(n_models), requires_grad=True)
        elif self.ensemble == 'average':
            # Register as a buffer so the weights follow .to(device) with the
            # rest of the module. persistent=False keeps them out of the
            # state_dict, exactly as when they were a plain tensor attribute.
            self.register_buffer('ensemble_weights', torch.ones(n_models), persistent=False)

    def forward(self, x):
        """Run all backbones on ``x`` and return the combined prediction."""
        segrnn_out = self.fc_segrnn(self.segrnn_layer(x)).squeeze(-1)
        nlinear_out = self.fc_nlinear(self.nlinear_layer(x)).squeeze(-1)
        dlinear_out = self.fc_dlinear(self.dlinear_layer(x)).squeeze(-1)

        outputs = [segrnn_out, nlinear_out, dlinear_out]

        if self.ensemble in ['average', 'weightedsum']:
            # softmax of equal weights is uniform, so 'average' is a plain mean.
            weights = torch.softmax(self.ensemble_weights, dim=0)
            outputs = torch.stack(outputs, dim=0)
            ensembled = torch.einsum('i,ibk->bk', weights, outputs)
        elif self.ensemble == 'cat':
            ensembled = torch.cat(outputs, dim=-1)
            ensembled = self.layernorm(ensembled)
            ensembled = self.dropout(ensembled)
            ensembled = self.fc_cat(ensembled)

        return ensembled

# 5. Train Models and Generate Predictions

In [9]:
def nmae(true, pred, is_train=True):
    """Return the mean of ``|true - pred| / true``.

    Args:
        true: Ground-truth values (torch tensor when ``is_train`` is true,
            otherwise anything ``np.array`` accepts).
        pred: Predicted values, same kind as ``true``.
        is_train: Selects the torch implementation (true) or the NumPy one (false).

    Returns:
        A torch scalar tensor when ``is_train`` is true, a NumPy scalar otherwise.
    """
    if not is_train:
        true_arr = np.array(true)
        pred_arr = np.array(pred)
        relative_error = np.abs(true_arr - pred_arr) / true_arr
        return np.mean(relative_error)

    relative_error = torch.abs(true - pred) / true
    return torch.mean(relative_error)

def inverse_transform(x, scaler, is_train=True):
    """Map min-max scaled log prices back to the original price scale.

    Args:
        x: Scaled value(s); a torch tensor when ``is_train`` is true, otherwise
            a NumPy-compatible value.
        scaler: Object exposing ``data_min_`` and ``data_max_``; only the first
            entry of each is used.
        is_train: Use ``torch.expm1`` (true) or ``np.expm1`` (false).

    Returns:
        The value(s) on the original scale.
    """
    lower = scaler.data_min_[0]
    upper = scaler.data_max_[0]

    # Step 1: undo the min-max scaling.
    logged = lower + x * (upper - lower)

    # Step 2: undo the log1p transform.
    if is_train:
        return torch.expm1(logged)
    return np.expm1(logged)

def custom_metric(true, pred, scaler):
    """Return the NMAE between ``true`` and ``pred`` on the original price scale.

    Both inputs are first passed through ``inverse_transform`` with ``scaler``
    (torch code path), then compared with ``nmae``.
    """
    true_price, pred_price = (
        inverse_transform(scaled, scaler, is_train=True) for scaled in (true, pred)
    )
    return nmae(true_price, pred_price, is_train=True)

In [10]:
# Training / data configuration (exposed below as the CFG namespace).
config = {
    "train_path": "data/train/train.csv",
    "train_meta1_path": "data/train/meta/TRAIN_산지공판장_2018-2021.csv",
    "train_meta2_path": "data/train/meta/TRAIN_전국도매_2018-2021.csv",
    "learning_rate": 5e-4, # previously 0.001
    "epoch": 10_000,
    "batch_size": 16,
    "output_size": 3,
    "weight_decay": 5e-3,
    "test_size": 0.2,
    "seed": 42,
    "variance_threshold": 0.01,
    "sparse_threshold": 0.5,
    "device": 'cpu',
}

# Model hyper-parameters (exposed below as the MODEL_CFG namespace).
# The section labels are the original author's grouping by model.
model_config = {
    "seq_len": 9,
    "pred_len": 3,
    "dropout": 0.5,

    # SegRNN
    "rnn_type": 'rnn', # rnn, gru, lstm
    "dec_way": 'pmf',  # rmf, pmf
    "seg_len": 3,
    "channel_id": False,
    "revin": True,

    # NLinear
    "channels": 3,
    "individual": True,

    # TimeSeriesTransformer
    "hidden_size": 256,
    "num_layers": 3,
    "num_heads": 8,
    "method": 'multihead', # multihead, performer, linformer

    # PatchMixer
    "patch_len": 9,
    "stride": 1,
    "mixer_kernel_size": 3,
    "head_dropout": 0.5,
    "e_layers": 1,

    # Ensemble
    "ensemble": 'weightedsum', # average, weightedsum, cat
}

# Attribute-style access to both dictionaries.
CFG = SimpleNamespace(**config)
MODEL_CFG = SimpleNamespace(**model_config)
# Items for which one model each is trained and predicted.
품목_리스트 = ['건고추', '사과', '감자', '배', '깐마늘(국산)', '무', '상추', '배추', '양파', '대파']

# Output directory for checkpoints, attribute pickles and the training log;
# it is created if it does not exist yet.
save_dir = f'models/EnsembleModel_{MODEL_CFG.ensemble}_batch{CFG.batch_size}_model3/'
os.makedirs(save_dir, exist_ok=True)
print(f'{save_dir=}')

save_dir='models/EnsembleModel_weightedsum_batch16_model3/'


In [11]:
# from torchinfo import summary

# # [x for x,y in train_loader][0].shape
# input_size = (16,9,299)

# # LSTM : 56,499
# # SegRNN : 181,795
# # NLinear : 96
# # DLinear : 17,940
# # TimeSeriesTransformer : 80,587
# # PatchMixer : 9,022

# summary(LSTM(model_cfg), input_size=input_size)
# summary(SegRNN(model_cfg), input_size=input_size)
# summary(NLinear(model_cfg), input_size=input_size)
# summary(DLinear(model_cfg), input_size=input_size)
# summary(TimeSeriesTransformer(model_cfg), input_size=input_size)
# summary(PatchMixer(model_cfg), input_size=input_size)

In [12]:
# Train one EnsembleModel per item, log the scores and persist everything the
# inference step needs (checkpoint plus preprocessing attributes).
logger = get_logger(save_path=os.path.join(save_dir,'train.log'))
trace_func = logger.info  # use `print` instead to write to stdout

train_nmae_list = []
val_nmae_list = []

for i, 품목명 in enumerate(품목_리스트):
    model_path = os.path.join(save_dir,f'{품목명}.pth')

    trace_func('')
    trace_func('='*170)
    trace_func(f'> [{i+1:02d}/{len(품목_리스트)}] {품목명}')
    trace_func('='*170)
    trace_func('')

    # preprocessing
    trace_func('> Preprocessing')
    train_data, scalers, imputers = process_data(CFG.train_path, CFG.train_meta1_path, CFG.train_meta2_path, 품목명)
    if train_data.isnull().sum().any():
        # Message spelling fixed so it matches the check in the inference cell.
        raise ValueError('Null Value Detected')

    # make dataset
    trace_func('> Make Dataset')
    dataset = AgriculturePriceDataset(train_data)
    trace_func(f'   data shape: ({sum([x.size(0) for x,y in dataset])},{len(dataset.numeric_columns)})')

    # train, validation split
    # NOTE(review): the windows overlap in time and are split at random, so
    # validation windows share rows with training windows — confirm that this
    # is acceptable for the reported val score.
    trace_func('')
    trace_func('> Train')
    tr_data, val_data = train_test_split(dataset, test_size=CFG.test_size, random_state=CFG.seed, shuffle=True)
    train_loader = DataLoader(tr_data, CFG.batch_size, shuffle=True)
    val_loader = DataLoader(val_data, CFG.batch_size, shuffle=False)

    # define model: the input width depends on the item's feature count
    model_cfg = deepcopy(MODEL_CFG)
    model_cfg.enc_in = len(dataset.numeric_columns)
    model_cfg.d_model = len(dataset.numeric_columns)
    model = EnsembleModel(model_cfg).to(CFG.device)

    criterion = nn.HuberLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=CFG.learning_rate, weight_decay=CFG.weight_decay)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=50)

    # Bind this item's price scaler with partial (early binding) for both
    # helpers, instead of a lambda that looks the scaler up at call time.
    price_scaler = scalers[dataset.price_column]
    custom_metric_ = partial(custom_metric, scaler=price_scaler)
    inverse_transform_ = partial(inverse_transform, scaler=price_scaler, is_train=True)

    # train
    best_model = train(
        model, train_loader, val_loader, CFG.epoch,
        optimizer, criterion, scheduler,
        early_stopping=True, early_stopping_patience=200, early_stopping_verbose=False,
        device=CFG.device, metric_period=100, 
        verbose=True, save_model_path=model_path,
        custom_metric=custom_metric_,
    )

    # ensemble model weights
    if model_cfg.ensemble == 'weightedsum':
        ensemble_weights = best_model.state_dict().get('ensemble_weights')
        weight_dict = {name: round(weight.item(),3) for name,weight in zip(best_model.models, torch.softmax(ensemble_weights, dim=0))}
        trace_func('')
        trace_func(f'<Model Weight> {weight_dict}')

    # scoring: the custom metric reported during training is an average over
    # batches, so it differs from the score over the whole set computed here.
    true, pred = predict(best_model, train_loader, device='cpu', inverse_transform=inverse_transform_)
    train_nmae = nmae(true, pred, is_train=False)
    true, pred = predict(best_model, val_loader, device='cpu', inverse_transform=inverse_transform_)
    val_nmae = nmae(true, pred, is_train=False)
    trace_func(f'<Score> {train_nmae=:.4f}, {val_nmae=:.4f}')
    trace_func('')

    # save attributes needed to reproduce the preprocessing at inference time
    attributes = {}
    attributes['scalers'] = scalers
    attributes['imputers'] = imputers
    attributes['features'] = train_data.columns.tolist()
    attributes['model_config'] = model_cfg
    attributes['train_nmae'] = train_nmae
    attributes['val_nmae'] = val_nmae
    to_pickle(attributes, os.path.join(save_dir,f'{품목명}_attributes.pkl'))

    # nmae append
    train_nmae_list.append(train_nmae)
    val_nmae_list.append(val_nmae)

[2024-10-14 06:20:48,312 2279627943.py:11][INFO] 
[2024-10-14 06:20:48,317 2279627943.py:13][INFO] > [01/10] 건고추
[2024-10-14 06:20:48,321 2279627943.py:15][INFO] 
[2024-10-14 06:20:48,321 2279627943.py:18][INFO] > Preprocessing
[2024-10-14 06:20:49,395 log.py:38][INFO] variance<0.01 columns: (3) ['양건_30 kg_상품_평균가격(원)', '화건_30 kg_중품_평균가격(원)', '양건_30 kg_중품_평균가격(원)']
[2024-10-14 06:20:49,404 2279627943.py:24][INFO] > Make Dataset
[2024-10-14 06:20:49,475 2279627943.py:26][INFO]    data shape: (1197,13)
[2024-10-14 06:20:49,475 2279627943.py:29][INFO] 
[2024-10-14 06:20:49,478 2279627943.py:30][INFO] > Train
[2024-10-14 06:21:16,458 log.py:38][INFO] *[00100/10000] loss: 0.0087, val_loss: 0.0061, best: 0.0061(100) | custom: 0.0850, val_custom: 0.0635, best: 0.0635 | elapsed: 25.9s, total: 25.9s, remaining: 2567.6s
[2024-10-14 06:21:38,593 log.py:38][INFO] [00200/10000] loss: 0.0034, val_loss: 0.0024, best: 0.0023(174) | custom: 0.0378, val_custom: 0.0353, best: 0.0353 | elapsed: 22.1s, tota

In [13]:
f'train_nmae={np.mean(train_nmae_list):.4f}, val_nmae={np.mean(val_nmae_list):.4f}'

'train_nmae=0.0414, val_nmae=0.1232'

In [14]:
# true, pred = predict(best_model, train_loader, device='cpu', inverse_transform=inverse_transform)
# print(criterion(torch.tensor(true), torch.tensor(pred)).item())
# true[:5], pred[:5]

# 6. Inference

In [15]:
# For every item: restore the trained model and its preprocessing attributes,
# then predict the next 3 steps for each of the 25 test files.
품목별_predictions = {}

pbar_outer = tqdm(품목_리스트, position=0)
for 품목명 in pbar_outer:
    pbar_outer.set_description(품목명)

    model_path = os.path.join(save_dir,f'{품목명}.pth')

    attributes = read_pickle(os.path.join(save_dir,f'{품목명}_attributes.pkl'))
    scalers = attributes['scalers']
    imputers = attributes['imputers']
    features = attributes['features']
    model_cfg = attributes['model_config']

    # define model
    model = EnsembleModel(model_cfg).to(CFG.device)
    # map_location lets a checkpoint saved on another device load on CFG.device.
    model.load_state_dict(torch.load(model_path, map_location=CFG.device))
    # Dropout must be off for prediction. NOTE(review): `inference` may already
    # switch to eval mode; calling it here is harmless in that case.
    model.eval()

    # inference
    품목_predictions = []
    pbar_inner = tqdm(range(25), desc="테스트 파일 추론 중", position=1, leave=False)
    for i in pbar_inner:
        test_file = f"data/test/TEST_{i:02d}.csv"
        산지공판장_file = f"data/test/meta/TEST_산지공판장_{i:02d}.csv"
        전국도매_file = f"data/test/meta/TEST_전국도매_{i:02d}.csv"

        # Reuse the scalers/imputers fitted during training and keep exactly
        # the columns (and column order) the model was trained on.
        test_data, _, _ = process_data(test_file, 산지공판장_file, 전국도매_file, 품목명, scalers, imputers, is_train=False)
        test_data = test_data[features]
        if test_data.isnull().sum().any():
            raise ValueError('Null Value Detected')

        test_dataset = AgriculturePriceDataset(test_data, is_test=True)
        test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

        # partial binds the scaler now; a lambda would look it up at call time.
        price_scaler = scalers[test_dataset.price_column]
        inverse_transform_ = partial(inverse_transform, scaler=price_scaler, is_train=True)

        predictions = inference(model, test_loader, device='cpu', inverse_transform=inverse_transform_)
        predictions = np.concatenate(predictions)

        if np.isnan(predictions).any():
            pbar_inner.set_postfix({"상태": "NaN"})
            raise ValueError(f'NaN prediction for {품목명} in {test_file}')

        pbar_inner.set_postfix({"상태": "정상"})
        품목_predictions.extend(predictions.flatten())

    품목별_predictions[품목명] = 품목_predictions

  0%|          | 0/10 [00:00<?, ?it/s]

테스트 파일 추론 중:   0%|          | 0/25 [00:00<?, ?it/s]

테스트 파일 추론 중:   0%|          | 0/25 [00:00<?, ?it/s]

테스트 파일 추론 중:   0%|          | 0/25 [00:00<?, ?it/s]

테스트 파일 추론 중:   0%|          | 0/25 [00:00<?, ?it/s]

테스트 파일 추론 중:   0%|          | 0/25 [00:00<?, ?it/s]

테스트 파일 추론 중:   0%|          | 0/25 [00:00<?, ?it/s]

테스트 파일 추론 중:   0%|          | 0/25 [00:00<?, ?it/s]

테스트 파일 추론 중:   0%|          | 0/25 [00:00<?, ?it/s]

테스트 파일 추론 중:   0%|          | 0/25 [00:00<?, ?it/s]

테스트 파일 추론 중:   0%|          | 0/25 [00:00<?, ?it/s]

# 7. Prepare Submission File

In [16]:
# Fill the sample submission with the per-item predictions and write it out.
sample_submission = pd.read_csv('data/sample_submission.csv')

for 품목명, predictions in 품목별_predictions.items():
    sample_submission[품목명] = predictions

# Save the result. The output directory is created first so to_csv cannot fail
# on a missing folder, and the run name is taken from save_dir whether or not
# it ends with a path separator.
out_dir = 'out'
os.makedirs(out_dir, exist_ok=True)
run_name = os.path.basename(os.path.normpath(save_dir))
save_path = f'{out_dir}/baseline_submission_22_{run_name}.csv'
sample_submission.to_csv(save_path, index=False)

In [17]:
sample_submission

Unnamed: 0,시점,감자,건고추,깐마늘(국산),대파,무,배추,사과,상추,양파,배
0,TEST_00+1순,34969.539062,640152.9375,164333.421875,1553.543945,28140.962891,10670.495117,30038.326172,1103.236938,1394.958862,33260.730469
1,TEST_00+2순,34847.652344,667147.2500,179791.015625,1541.194946,25088.607422,7823.345703,28003.697266,839.367310,1231.049805,33138.195312
2,TEST_00+3순,39204.726562,650097.4375,183857.109375,1632.093018,23201.632812,7861.479004,28126.257812,830.924377,952.629517,34087.816406
3,TEST_01+1순,42050.136719,664384.6875,165715.375000,1564.461670,11685.291992,6586.710449,29138.808594,814.812500,1431.211304,30191.724609
4,TEST_01+2순,43982.140625,680506.7500,169933.453125,1553.143677,10781.274414,5593.586914,26506.767578,758.432312,1269.656250,32383.712891
...,...,...,...,...,...,...,...,...,...,...,...
70,TEST_23+2순,38000.015625,632088.1875,165298.046875,1513.435791,8488.772461,3326.313477,26835.990234,878.378418,1376.453857,30995.718750
71,TEST_23+3순,40384.496094,622403.1250,171765.640625,1480.764893,7097.342773,3743.668457,26929.365234,900.954407,1134.075928,32091.849609
72,TEST_24+1순,34866.835938,546989.5625,140206.359375,1483.305054,11140.007812,11109.980469,31361.279297,919.516968,635.989258,37252.289062
73,TEST_24+2순,27758.296875,552698.8750,152943.578125,1201.880005,16273.306641,11381.727539,31009.261719,898.828857,459.833191,38741.613281
