# 농산물 가격 예측을 위한 AI 모델 개발 
- '2024 농산물 가격 예측 AI 경진대회'는 데이터와 AI 기술을 활용하여 농산물 가격 예측 능력을 향상시키는 것을 목표로 합니다.<br>  이 대회는 농업 분야의 복잡한 시계열 데이터를 효율적으로 분석하고 예측할 수 있는 AI 알고리즘 개발에 초점을 맞추고 있습니다. <br> <br>
- 이 대회의 궁극적 목적은 참가자들의 시계열 데이터 분석 및 예측 역량을 강화하고, <br> AI 기술이 실제 농산물 가격 예측과 관련 정책 결정에 어떻게 기여할 수 있는지 탐구하는 것입니다. 

# Import Library

In [1]:
import sys
lib_dir = "g:/My Drive/Storage/Github/hyuckjinkim"
sys.path.append(lib_dir)

from lib.python.graph import MatplotlibFontManager
fm = MatplotlibFontManager()
fm.set_korean_font(check=False)

from lib.python.torch import seed_everything
from lib.python.torch.build_model import train, predict
from lib.python.log import get_logger

seed_everything(42)

In [2]:
# import pandas as pd
# pd.set_option("display.max_rows", None)
# pd.set_option("display.max_columns", None)

# train_df = pd.read_csv('data/train/train.csv')

# train_meta1_df = pd.read_csv('data/train/meta/TRAIN_산지공판장_2018-2021.csv')
# train_meta1_df.drop(['품목코드','품종코드','공판장코드'], axis=1, inplace=True)

# train_meta2_df = pd.read_csv('data/train/meta/TRAIN_전국도매_2018-2021.csv')
# train_meta2_df.drop(['품목코드','품종코드','시장코드']  , axis=1, inplace=True)

# train_df.head(2) # ['평년 평균가격(원)','평균가격(원)']
# train_meta1_df.head(2)
# train_meta2_df.head(2)

In [3]:
import os
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from types import SimpleNamespace
import pickle
import gc
gc.collect()

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# Hyperparameter Setting

# Define Function for Feature Engineering
- 타겟의 필터 조건을 제외한 메타데이터의 필터 조건은 참가자들 각자의 기준에 맞춰 자유롭게 사용가능 
- 밑의 필터 조건은 임의로 제공하는 예시

In [4]:
def year_convert(data):
    data['연도'] -= 2018

    offset = 0.1
    map_dict = {'상순':offset, '중순':offset+1/3, '하순':offset+2/3}
    data['연도'] += data['시점'].str.extract(r'(상순|중순|하순)')[0].map(map_dict)
    
    return data

def process_data(raw_file, 산지공판장_file, 전국도매_file, 품목명, scalers=None):
    raw_data = pd.read_csv(raw_file)
    산지공판장 = pd.read_csv(산지공판장_file)
    전국도매 = pd.read_csv(전국도매_file)

    # 품목코드, 품종코드, 공판장코드, 시장코드 제거
    산지공판장.drop(['품목코드','품종코드','공판장코드'], axis=1, inplace=True)
    전국도매  .drop(['품목코드','품종코드','시장코드']  , axis=1, inplace=True)

    # 연도에 상/중/하순에 대한 정보도 추가
    산지공판장 = year_convert(산지공판장)
    전국도매 = year_convert(전국도매)

    # 이상값(0이하) 처리
    for col in ['전순 평균가격(원) PreVious SOON', '전달 평균가격(원) PreVious MMonth', '전년 평균가격(원) PreVious YeaR']:
        loc = 전국도매[col] < 0
        전국도매.loc[loc,col] = 0

    # log변환
    raw_cols = ['평년 평균가격(원)', '평균가격(원)']
    산지공판장_cols =  ['총반입량(kg)', '총거래금액(원)', '평균가(원/kg)', '중간가(원/kg)', '최저가(원/kg)', '최고가(원/kg)', '경매 건수', 
                       '전순 평균가격(원) PreVious SOON', '전달 평균가격(원) PreVious MMonth', '전년 평균가격(원) PreVious YeaR', '평년 평균가격(원) Common Year SOON']
    전국도매_cols = ['총반입량(kg)', '총거래금액(원)', '평균가(원/kg)', '고가(20%) 평균가', '중가(60%) 평균가 ', '저가(20%) 평균가', '중간가(원/kg)', '최저가(원/kg)',
                    '최고가(원/kg)', '경매 건수', '전순 평균가격(원) PreVious SOON', '전달 평균가격(원) PreVious MMonth', '전년 평균가격(원) PreVious YeaR', '평년 평균가격(원) Common Year SOON']
    for col in raw_cols: raw_data[col] = np.log1p(raw_data[col])
    for col in 산지공판장_cols: 산지공판장[col] = np.log1p(산지공판장[col])
    for col in 전국도매_cols: 전국도매[col] = np.log1p(전국도매[col])

    # 타겟 및 메타데이터 필터 조건 정의
    conditions = {
    '감자': {
        'target': lambda df: (df['품종명'] == '감자 수미') & (df['거래단위'] == '20키로상자') & (df['등급'] == '상'),
        '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['감자'], '품종명': ['수미'], '등급명': ['상']},
        '도매': {'시장명': ['*전국도매시장'], '품목명': ['감자'], '품종명': ['수미']}
    },
    '건고추': {
        'target': lambda df: (df['품종명'] == '화건') & (df['거래단위'] == '30 kg') & (df['등급'] == '상품'),
        '공판장': None, 
        '도매': None  
    },
    '깐마늘(국산)': {
        'target': lambda df: (df['거래단위'] == '20 kg') & (df['등급'] == '상품'),
        '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['마늘'], '품종명': ['깐마늘'], '등급명': ['상']},
        '도매': {'시장명': ['*전국도매시장'], '품목명': ['마늘'], '품종명': ['깐마늘']}
    },
    '대파': {
        'target': lambda df: (df['품종명'] == '대파(일반)') & (df['거래단위'] == '1키로단') & (df['등급'] == '상'),
        '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['대파'], '품종명': ['대파(일반)'], '등급명': ['상']},
        '도매': {'시장명': ['*전국도매시장'], '품목명': ['대파'], '품종명': ['대파(일반)']}
    },
    '무': {
        'target': lambda df: (df['거래단위'] == '20키로상자') & (df['등급'] == '상'),
        '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['무'], '품종명': ['기타무'], '등급명': ['상']},
        '도매': {'시장명': ['*전국도매시장'], '품목명': ['무'], '품종명': ['무']}
    },
    '배추': {
        'target': lambda df: (df['거래단위'] == '10키로망대') & (df['등급'] == '상'),
        '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['배추'], '품종명': ['쌈배추'], '등급명': ['상']},
        '도매': {'시장명': ['*전국도매시장'], '품목명': ['배추'], '품종명': ['배추']}
    },
    '사과': {
        'target': lambda df: (df['품종명'].isin(['홍로', '후지'])) & (df['거래단위'] == '10 개') & (df['등급'] == '상품'),
        '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['사과'], '품종명': ['후지'], '등급명': ['상']},
        '도매': {'시장명': ['*전국도매시장'], '품목명': ['사과'], '품종명': ['후지']}
    },
    '상추': {
        'target': lambda df: (df['품종명'] == '청') & (df['거래단위'] == '100 g') & (df['등급'] == '상품'),
        '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['상추'], '품종명': ['청상추'], '등급명': ['상']},
        '도매': {'시장명': ['*전국도매시장'], '품목명': ['상추'], '품종명': ['청상추']}
    },
    '양파': {
        'target': lambda df: (df['품종명'] == '양파') & (df['거래단위'] == '1키로') & (df['등급'] == '상'),
        '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['양파'], '품종명': ['기타양파'], '등급명': ['상']},
        '도매': {'시장명': ['*전국도매시장'], '품목명': ['양파'], '품종명': ['양파(일반)']}
    },
    '배': {
        'target': lambda df: (df['품종명'] == '신고') & (df['거래단위'] == '10 개') & (df['등급'] == '상품'),
        '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['배'], '품종명': ['신고'], '등급명': ['상']},
        '도매': {'시장명': ['*전국도매시장'], '품목명': ['배'], '품종명': ['신고']}
    }
    }

    # 타겟 데이터 필터링
    raw_품목 = raw_data[raw_data['품목명'] == 품목명]
    target_mask = conditions[품목명]['target'](raw_품목)
    filtered_data = raw_품목[target_mask]

    # 다른 품종에 대한 파생변수 생성
    other_data = raw_품목[~target_mask]
    unique_combinations = other_data[['품종명', '거래단위', '등급']].drop_duplicates()
    for _, row in unique_combinations.iterrows():
        품종명, 거래단위, 등급 = row['품종명'], row['거래단위'], row['등급']
        mask = (other_data['품종명'] == 품종명) & (other_data['거래단위'] == 거래단위) & (other_data['등급'] == 등급)
        temp_df = other_data[mask]
        for col in ['평년 평균가격(원)', '평균가격(원)']:
            new_col_name = f'{품종명}_{거래단위}_{등급}_{col}'
            filtered_data = filtered_data.merge(temp_df[['시점', col]], on='시점', how='left', suffixes=('', f'_{new_col_name}'))
            filtered_data.rename(columns={f'{col}_{new_col_name}': new_col_name}, inplace=True)

    # 공판장 데이터 처리
    if conditions[품목명]['공판장']:
        filtered_공판장 = 산지공판장
        for key, value in conditions[품목명]['공판장'].items():
            filtered_공판장 = filtered_공판장[filtered_공판장[key].isin(value)]
        
        filtered_공판장 = filtered_공판장.add_prefix('공판장_').rename(columns={'공판장_시점': '시점'})
        filtered_data = filtered_data.merge(filtered_공판장, on='시점', how='left')

    # 도매 데이터 처리
    if conditions[품목명]['도매']:
        filtered_도매 = 전국도매
        for key, value in conditions[품목명]['도매'].items():
            filtered_도매 = filtered_도매[filtered_도매[key].isin(value)]
        
        filtered_도매 = filtered_도매.add_prefix('도매_').rename(columns={'도매_시점': '시점'})
        filtered_data = filtered_data.merge(filtered_도매, on='시점', how='left')

    # 수치형 컬럼 처리
    numeric_columns = filtered_data.select_dtypes(include=[np.number]).columns
    filtered_data = filtered_data[['시점'] + list(numeric_columns)]
    filtered_data[numeric_columns] = filtered_data[numeric_columns].fillna(0)

    # 정규화 적용
    if scalers is None:
        scalers = {}
        for col in numeric_columns:
            scaler = MinMaxScaler()
            filtered_data[col] = scaler.fit_transform(filtered_data[col].values.reshape(-1,1))
            scalers[col] = scaler
    else:
        for col in numeric_columns:
            scaler = scalers[col]
            filtered_data[col] = scaler.transform(filtered_data[col].values.reshape(-1,1))

    return filtered_data, scalers


# Define Custom Dataset Class

In [5]:
class AgriculturePriceDataset(Dataset):
    def __init__(self, dataframe, window_size=9, prediction_length=3, is_test=False):
        self.data = dataframe
        self.window_size = window_size
        self.prediction_length = prediction_length
        self.is_test = is_test
        
        self.price_column = '평균가격(원)'
        self.numeric_columns = self.data.select_dtypes(include=[np.number]).columns.tolist()

        self.sequences = []
        if not self.is_test:
            for i in range(len(self.data) - self.window_size - self.prediction_length + 1):
                x = self.data[self.numeric_columns].iloc[i:i+self.window_size].values
                y = self.data[self.price_column].iloc[i+self.window_size:i+self.window_size+self.prediction_length].values
                self.sequences.append((x, y))
        else:
            self.sequences = [self.data[self.numeric_columns].values]
    
    def __len__(self):
        return len(self.sequences)
    
    def __getitem__(self, idx):
        if not self.is_test:
            x, y = self.sequences[idx]
            return torch.FloatTensor(x), torch.FloatTensor(y)
        else:
            return torch.FloatTensor(self.sequences[idx])

# Define Model Architecture and Training Functions

In [6]:
class LinformerSelfAttention(nn.Module):
    def __init__(self, input_dim, seq_len, num_heads, k=32, dropout=0.1):
        super(LinformerSelfAttention, self).__init__()
        self.num_heads = num_heads
        self.head_dim = input_dim // num_heads
        self.k = k  # Low-rank projection dimension
        assert self.head_dim * num_heads == input_dim, "input_dim must be divisible by num_heads"

        # Linear projections for queries, keys, and values
        self.query = nn.Linear(input_dim, input_dim)
        self.key = nn.Linear(input_dim, input_dim)
        self.value = nn.Linear(input_dim, input_dim)

        # Low-rank projection matrices for keys and values
        self.proj_key = nn.Linear(seq_len, seq_len)  # 수정: 투영 차원을 원래 시퀀스 길이와 일치시킴
        self.proj_value = nn.Linear(seq_len, seq_len)  # 수정: 투영 차원을 맞춤

        # Output projection
        self.out = nn.Linear(input_dim, input_dim)

        # Dropout
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        bsz, seq_len, _ = x.size()

        # Project to queries, keys, and values
        queries = self.query(x).view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        keys = self.key(x).view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        values = self.value(x).view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2)

        # Apply low-rank projection to keys and values
        keys = self.proj_key(keys).transpose(1, 2)  # 투영 후 크기 맞춤
        values = self.proj_value(values).transpose(1, 2)  # 투영 후 크기 맞춤

        # Scaled dot-product attention
        scores = torch.matmul(queries, keys.transpose(-2, -1)) / (self.head_dim ** 0.5)
        attn = torch.softmax(scores, dim=-1)
        attn = self.dropout(attn)

        # Attention output
        output = torch.matmul(attn, values)
        output = output.transpose(1, 2).contiguous().view(bsz, seq_len, self.num_heads * self.head_dim)

        return self.out(output)

class PerformerSelfAttention(nn.Module):
    def __init__(self, input_dim, num_heads, kernel_size=32, dropout=0.1):
        super(PerformerSelfAttention, self).__init__()
        self.num_heads = num_heads
        self.head_dim = input_dim // num_heads
        self.kernel_size = kernel_size
        assert self.head_dim * num_heads == input_dim, "input_dim must be divisible by num_heads"

        # Linear projections for queries, keys, and values
        self.query = nn.Linear(input_dim, input_dim)
        self.key = nn.Linear(input_dim, input_dim)
        self.value = nn.Linear(input_dim, input_dim)

        # Output projection
        self.out = nn.Linear(input_dim, input_dim)

        # Dropout
        self.dropout = nn.Dropout(dropout)

    def feature_map(self, x):
        # Random Fourier feature mapping (or kernel approximation)
        return torch.exp(-x ** 2 / 2)

    def forward(self, x):
        bsz, seq_len, _ = x.size()

        # Project to queries, keys, and values
        queries = self.feature_map(self.query(x).view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2))
        keys = self.feature_map(self.key(x).view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2))
        values = self.value(x).view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2)

        # Perform efficient attention
        kv = torch.einsum('bhse,bhsc->bhsec', keys, values)
        qkv = torch.einsum('bhse,bhsec->bhsc', queries, kv)

        # Attention output
        output = qkv.transpose(1, 2).contiguous().view(bsz, seq_len, self.num_heads * self.head_dim)
        return self.out(output)

In [7]:
class Time2Vec(nn.Module):
    def __init__(self, input_dim):
        super(Time2Vec, self).__init__()
        periodic_dim = (input_dim-1) // 2
        self.linear = nn.Linear(input_dim, input_dim - periodic_dim*2)
        self.periodic = nn.Linear(input_dim, periodic_dim)
    
    def forward(self, x):
        linear_out = self.linear(x)
        periodic_sin = torch.sin(self.periodic(x))
        periodic_cos = torch.cos(self.periodic(x))  # cosine 추가
        periodic_out = torch.cat([periodic_sin, periodic_cos], dim=-1)  # sin과 cos 결합
        return torch.cat([linear_out, periodic_out], dim=-1)

# Define Transformer Encoder Block
class TransformerBlock(nn.Module):
    def __init__(self, input_dim, num_heads, dropout, method='multihead', seq_len=None):
        super(TransformerBlock, self).__init__()
        self.method = method
        
        if method == 'multihead':
            self.attention = nn.MultiheadAttention(input_dim, num_heads, dropout=dropout)
        elif method == 'linformer':
            self.attention = LinformerSelfAttention(input_dim, seq_len, num_heads, dropout=dropout)
        elif method == 'performer':
            self.attention = PerformerSelfAttention(input_dim, num_heads, dropout=dropout)

        self.norm1 = nn.LayerNorm(input_dim)
        self.norm2 = nn.LayerNorm(input_dim)
        self.ff = nn.Sequential(
            nn.Linear(input_dim, 4 * input_dim),
            nn.GELU(),
            nn.Linear(4 * input_dim, input_dim),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        if self.method=='multihead':
            attended, _ = self.attention(x, x, x) # 1. Attention
        else:
            attended = self.attention(x)
        x = self.norm1(attended + x)              # 2. 잔차 연결 + Layer Normalization
        feedforward = self.ff(x)                  # 3. Feedforward + BatchNorm 적용
        x = self.norm2(feedforward + x)           # 4. 잔차 연결 + Layer Normalization
        return x

# Define main model architecture
class TimeSeriesTransformer(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_heads, output_size, dropout, method, seq_len):
        super(TimeSeriesTransformer, self).__init__()
        self.max_len = 10

        self.time2vec = Time2Vec(input_size)
        self.embedding = nn.Linear(input_size, hidden_size)
        self.position_encoding = self.generate_position_encoding(hidden_size, self.max_len)
        self.dropout = nn.Dropout(dropout)
        
        self.transformer_blocks = nn.ModuleList([
            TransformerBlock(hidden_size, num_heads, dropout, method, seq_len) 
            for _ in range(num_layers)
        ])
        
        self.output_layer = nn.Sequential(
            nn.Linear(hidden_size, hidden_size // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size // 2, output_size)
        )

    def generate_position_encoding(self, hidden_size, max_len):
        pe = torch.zeros(max_len, hidden_size)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, hidden_size, 2).float() * (-np.log(10000.0) / hidden_size))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        return pe

    def forward(self, x):
        b, s, f = x.shape
        x = self.time2vec(x)
        x = self.embedding(x)
        x = x + self.position_encoding[:, :s, :].to(x.device)
        x = self.dropout(x)
        for transformer in self.transformer_blocks:
            x = transformer(x)
        x = x.mean(dim=1)
        x = self.output_layer(x)
        return x

# Train Models and Generate Predictions

In [8]:
def nmae(true,pred):
    true, pred = np.array(true), np.array(pred)
    return np.mean(np.abs(true - pred) / true)

def minmax_inverse_transform(x, scaler, is_train=True):
    origin = scaler.data_min_[0] + x * (scaler.data_max_[0] - scaler.data_min_[0])
    origin = torch.expm1(origin) if is_train else np.expm1(origin)
    return origin

def variance_threshold_select(data, threshold=0.01, ignore_features=list()):
    cols = data.select_dtypes(include=[np.number]).columns
    cols = list(set(cols)-set(ignore_features))

    del_features = []
    for col in cols:
        variance = train_data[col].std()**2
        if variance<threshold:
            del_features.append(col)
    
    return del_features

In [9]:
def predict(best_model, loader, device, inverse_transform):
    best_model.to(device)
    best_model.eval()
    
    true_list = []
    pred_list = []
    with torch.no_grad():
        for data,label in loader:
            data = data.float().to(device)

            output = best_model(data)
            output = inverse_transform(output)
            output = output.cpu().numpy().tolist()

            label  = inverse_transform(label)
            label = label.cpu().numpy().tolist()

            true_list += label
            pred_list += output

    return true_list, pred_list

def inference(best_model, loader, device, inverse_transform):
    best_model.to(device)
    best_model.eval()
    
    true_list = []
    pred_list = []
    with torch.no_grad():
        for data in loader:
            data = data.float().to(device)

            output = best_model(data)
            output = inverse_transform(output)
            output = output.cpu().numpy().tolist()

            pred_list += output

    return pred_list

In [10]:
config = {
    "learning_rate": 2e-5,
    "epoch": 4048,
    "batch_size": 64,
    "hidden_size": 256,
    "num_layers": 3,
    "output_size": 3,
    "dropout": 0.2,
    "num_heads": 8,
    "weight_decay": 1e-4,
    "test_size": 0.2,
    "seed": 42,
    "threshold": 0.005,
    "device": 'cpu',
}

CFG = SimpleNamespace(**config)
품목_리스트 = ['건고추', '사과', '감자', '배', '깐마늘(국산)', '무', '상추', '배추', '양파', '대파']
# 품목_리스트 = ['감자']

In [12]:
# logger = get_logger(save_path='log/TimeSeriesTransformer_log.log')
# trace_func = logger.info
trace_func = print

os.makedirs('models', exist_ok=True)

품목별_scalers = {}
품목별_delcols = {}
품목별_hyperparams = {}

train_nmae_list = []
val_nmae_list = []

for i, 품목명 in enumerate(품목_리스트):
    model_path = f'models/TimeSeriesTransformer_{품목명}.pth'
    trace_func('')
    trace_func('='*150)
    trace_func(f'> [{i+1:02d}/{len(품목_리스트)}] {품목명}')
    trace_func('='*150)
    trace_func('')

    # preprocessing
    train_data, scalers = process_data("data/train/train.csv", "data/train/meta/TRAIN_산지공판장_2018-2021.csv", "data/train/meta/TRAIN_전국도매_2018-2021.csv", 품목명)
    품목별_scalers[품목명] = scalers
    
    # 분산이 threshold보다 작은 컬럼 제거
    del_cols = variance_threshold_select(train_data, threshold=CFG.threshold, ignore_features=['평균가격(원)'])
    train_data.drop(del_cols, axis=1, inplace=True)
    품목별_delcols[품목명] = del_cols

    # train, validation split
    dataset = AgriculturePriceDataset(train_data)
    tr_data, val_data = train_test_split(dataset, test_size=CFG.test_size, random_state=CFG.seed, shuffle=True)
    train_loader = DataLoader(tr_data, CFG.batch_size, shuffle=True)
    val_loader = DataLoader(val_data, CFG.batch_size, shuffle=False)

    # define model
    품목별_hyperparams[품목명] = dict(
        input_size=len(dataset.numeric_columns),
        hidden_size=CFG.hidden_size,
        num_layers=CFG.num_layers,
        output_size=CFG.output_size,
        dropout=CFG.dropout,
        num_heads=CFG.num_heads,
        method='performer', # multihead, linformer, performer
        seq_len=32,
    )
    model = TimeSeriesTransformer(**품목별_hyperparams[품목명]).to(CFG.device)
    criterion = nn.HuberLoss() #nn.L1Loss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=CFG.learning_rate, weight_decay=CFG.weight_decay)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=50)
    # scheduler = None

    price_scaler = 품목별_scalers[품목명][dataset.price_column]
    inverse_transform = lambda x: minmax_inverse_transform(x, price_scaler)
    # inverse_transform = None

    # train
    best_model = train(
        model, optimizer, train_loader, val_loader, CFG.epoch,
        early_stopping=True, early_stopping_patience=200, early_stopping_verbose=False,
        device='cpu', scheduler=scheduler, metric_period=100, 
        verbose=True, save_model_path=model_path,
        inverse_transform=inverse_transform,
    )

    # scoring
    true, pred = predict(best_model, train_loader, device='cpu', inverse_transform=inverse_transform)
    train_nmae = nmae(true,pred)
    true, pred = predict(best_model, val_loader, device='cpu', inverse_transform=inverse_transform)
    val_nmae = nmae(true,pred)
    trace_func(f'<Score> {train_nmae=:.4f}, {val_nmae=:.4f}')
    trace_func('')

    train_nmae_list.append(train_nmae)
    val_nmae_list.append(val_nmae)


> [01/10] 건고추

*[0100/4048] tr_loss: 88100.1523, val_loss: 32423.4102, best: 32423.4102(100), elapsed: 41.1s, total: 41.1s, remaining: 1622.1s
 [0200/4048] tr_loss: 73002.6914, val_loss: 34428.2969, best: 22214.0449(185), elapsed: 36.8s, total: 78.0s, remaining: 1416.0s
 [0300/4048] tr_loss: 65984.5547, val_loss: 23737.2246, best: 22192.8711(247), elapsed: 34.8s, total: 112.8s, remaining: 1303.1s
 [0400/4048] tr_loss: 62530.0957, val_loss: 28955.0254, best: 22319.3223(392), elapsed: 34.5s, total: 147.3s, remaining: 1259.5s
 [0500/4048] tr_loss: 60944.4863, val_loss: 26231.5332, best: 22319.3223(392), elapsed: 34.9s, total: 182.3s, remaining: 1239.0s
<Stopped> [0592/4048] tr_loss: 68583.0020, val_loss: 24191.9668, best: 22319.3223(392), elapsed: 33.8s, total: 216.0s, remaining: 1166.6s
<Score> train_nmae=0.0579, val_nmae=0.0423


> [02/10] 사과

 [0100/4048] tr_loss: 2680.1761, val_loss: 2638.4236, best: 1907.4352(99), elapsed: 41.7s, total: 41.7s, remaining: 1644.4s
*[0200/4048] tr_loss

In [13]:
f'train_nmae{np.mean(train_nmae_list):.4f}, val_nmae={np.mean(val_nmae_list):.4f}'

'train_nmae0.1138, val_nmae=0.1357'

In [14]:
with open('out/scalers.pkl', 'wb') as pickle_file:
    pickle.dump(품목별_scalers, pickle_file)

with open('out/delcols.pkl', 'wb') as pickle_file:
    pickle.dump(품목별_delcols, pickle_file)

with open('out/hyperparams.pkl', 'wb') as pickle_file:
    pickle.dump(품목별_hyperparams, pickle_file)

In [15]:
# true, pred = predict(best_model, train_loader, device='cpu', inverse_transform=inverse_transform)
# print(criterion(torch.tensor(true), torch.tensor(pred)).item())
# true[:5], pred[:5]

# Inference

In [16]:
with open('out/scalers.pkl', 'rb') as pickle_file:
    품목별_scalers = pickle.load(pickle_file)

with open('out/delcols.pkl', 'rb') as pickle_file:
    품목별_delcols = pickle.load(pickle_file)

with open('out/hyperparams.pkl', 'rb') as pickle_file:
    품목별_hyperparams = pickle.load(pickle_file)

In [17]:
for k,v in 품목별_delcols.items():
    print(k,v)

건고추 []
사과 []
감자 ['감자 수미(햇)_20키로상자_중_평년 평균가격(원)', '감자 수미(저장)_20키로상자_중_평년 평균가격(원)', '감자_20키로상자_중_평년 평균가격(원)', '감자 조풍_20키로상자_중_평년 평균가격(원)', '감자 수미(저장)_20키로상자_상_평년 평균가격(원)', '감자 수미(햇)_20키로상자_하_평년 평균가격(원)', '감자 수미(햇)_20키로상자_특_평년 평균가격(원)', '감자 두백_20키로상자_특_평년 평균가격(원)', '감자 수미(저장)_20키로상자_하_평년 평균가격(원)', '감자 조풍_20키로상자_특_평균가격(원)', '감자_20키로상자_특_평년 평균가격(원)', '홍감자_10키로상자_상_평년 평균가격(원)', '감자 조풍_20키로상자_상_평균가격(원)', '감자 수입_23키로상자_상_평년 평균가격(원)', '감자 조풍_20키로상자_상_평년 평균가격(원)', '감자 조풍_20키로상자_특_평년 평균가격(원)', '감자 수미(햇)_20키로상자_상_평년 평균가격(원)', '감자 조풍_20키로상자_하_평균가격(원)', '감자 두백_20키로상자_상_평년 평균가격(원)', '감자_20키로상자_상_평년 평균가격(원)', '홍감자_10키로상자_중_평년 평균가격(원)', '감자 수미(저장)_20키로상자_특_평년 평균가격(원)', '감자 조풍_20키로상자_중_평균가격(원)', '감자 두백_20키로상자_중_평년 평균가격(원)', '감자 두백_20키로상자_하_평년 평균가격(원)', '감자_20키로상자_하_평년 평균가격(원)', '홍감자_10키로상자_하_평년 평균가격(원)', '홍감자_10키로상자_특_평년 평균가격(원)', '감자 조풍_20키로상자_하_평년 평균가격(원)']
배 ['공판장_등급코드']
깐마늘(국산) ['깐마늘(국산)_20 kg_중품_평년 평균가격(원)', '평년 평균가격(원)']
무 ['다발무_5000키로_상_평년 평균가격(원)', '다발무_1000키로_특_평년 평균가격(원)', '열무_4키로상자_중_평년 평균가격(

In [18]:
품목별_predictions = {}

pbar_outer = tqdm(품목_리스트, position=0)
for 품목명 in pbar_outer:
    pbar_outer.set_description(품목명)
    model_path = f'models/TimeSeriesTransformer_{품목명}.pth'

    # define model
    model = TimeSeriesTransformer(**품목별_hyperparams[품목명]).to(CFG.device)
    model.load_state_dict(torch.load(model_path))

    # inference
    품목_predictions = []
    pbar_inner = tqdm(range(25), desc="테스트 파일 추론 중", position=1, leave=False)
    for i in pbar_inner:
        test_file = f"data/test/TEST_{i:02d}.csv"
        산지공판장_file = f"data/test/meta/TEST_산지공판장_{i:02d}.csv"
        전국도매_file = f"data/test/meta/TEST_전국도매_{i:02d}.csv"

        test_data, _ = process_data(test_file, 산지공판장_file, 전국도매_file, 품목명, scalers=품목별_scalers[품목명])
        test_data.drop(품목별_delcols[품목명], axis=1, inplace=True)
        test_dataset = AgriculturePriceDataset(test_data, is_test=True)
        test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

        price_scaler = 품목별_scalers[품목명][test_dataset.price_column]
        inverse_transform = lambda x: minmax_inverse_transform(x, price_scaler)

        predictions = inference(model, test_loader, device='cpu', inverse_transform=inverse_transform)
        predictions = np.concatenate(predictions)
        
        if np.isnan(predictions).any():
            pbar_inner.set_postfix({"상태": "NaN"})
            raise ValueError
        else:
            pbar_inner.set_postfix({"상태": "정상"})
            품목_predictions.extend(predictions.flatten())

    품목별_predictions[품목명] = 품목_predictions

  0%|          | 0/10 [00:00<?, ?it/s]

테스트 파일 추론 중:   0%|          | 0/25 [00:00<?, ?it/s]

테스트 파일 추론 중:   0%|          | 0/25 [00:00<?, ?it/s]

테스트 파일 추론 중:   0%|          | 0/25 [00:00<?, ?it/s]

테스트 파일 추론 중:   0%|          | 0/25 [00:00<?, ?it/s]

테스트 파일 추론 중:   0%|          | 0/25 [00:00<?, ?it/s]

테스트 파일 추론 중:   0%|          | 0/25 [00:00<?, ?it/s]

테스트 파일 추론 중:   0%|          | 0/25 [00:00<?, ?it/s]

테스트 파일 추론 중:   0%|          | 0/25 [00:00<?, ?it/s]

테스트 파일 추론 중:   0%|          | 0/25 [00:00<?, ?it/s]

테스트 파일 추론 중:   0%|          | 0/25 [00:00<?, ?it/s]

# Prepare Submission File

In [19]:
sample_submission = pd.read_csv('data/sample_submission.csv')

for 품목명, predictions in 품목별_predictions.items():
    sample_submission[품목명] = predictions

# 결과 저장
save_path = 'out/baseline_submission_8.csv'
sample_submission.to_csv(save_path, index=False)

In [20]:
pd.read_csv(save_path)

Unnamed: 0,시점,감자,건고추,깐마늘(국산),대파,무,배추,사과,상추,양파,배
0,TEST_00+1순,42525.945312,506869.65625,183690.796875,1642.616821,15125.791992,11944.115234,30389.492188,1161.661377,903.362244,30842.589844
1,TEST_00+2순,39920.003906,502257.96875,200335.000000,1633.035522,14614.766602,8350.434570,29342.046875,1075.500854,908.290833,30363.361328
2,TEST_00+3순,38372.023438,503067.62500,202348.078125,1644.846558,13541.884766,6710.583496,29171.673828,994.754333,868.916199,31405.814453
3,TEST_01+1순,44263.695312,583607.75000,183654.531250,1529.945190,13934.520508,8440.171875,28856.148438,962.416443,913.465576,30517.808594
4,TEST_01+2순,41750.437500,575217.12500,199787.625000,1513.524658,13433.417969,7233.039062,28257.222656,939.898804,904.462585,30110.117188
...,...,...,...,...,...,...,...,...,...,...,...
70,TEST_23+2순,49752.140625,613444.75000,200405.328125,1477.261719,9706.476562,6125.318848,28466.066406,722.732361,923.762939,30239.214844
71,TEST_23+3순,45767.785156,611481.62500,202500.203125,1516.331421,9382.199219,6080.461426,28575.574219,742.771912,882.899780,31099.503906
72,TEST_24+1순,38576.734375,534925.25000,168197.140625,1524.353271,10799.892578,10938.084961,35017.027344,748.899963,567.506287,34598.562500
73,TEST_24+2순,36103.441406,536602.68750,181658.093750,1633.351074,10580.342773,9403.125977,34105.507812,756.600769,595.250427,33710.242188


In [None]:
tmp = pd.read_csv('data/train/train.csv')
tmp.groupby('품목명')['평균가격(원)'].describe().astype(int)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

In [None]:
품목명 = '무'
train_data, scaler = process_data("data/train/train.csv", "data/train/meta/TRAIN_산지공판장_2018-2021.csv", "data/train/meta/TRAIN_전국도매_2018-2021.csv", 품목명)
scaler = 품목별_scalers[품목명]['평균가격(원)']
tmp = minmax_inverse_transform(train_data['평균가격(원)'], scaler, is_train=False)
tmp.max()

In [None]:
tmp.describe()

In [None]:
i=0

test_file = f"data/test/TEST_{i:02d}.csv"
산지공판장_file = f"data/test/meta/TEST_산지공판장_{i:02d}.csv"
전국도매_file = f"data/test/meta/TEST_전국도매_{i:02d}.csv"

test_data, _ = process_data(test_file, 산지공판장_file, 전국도매_file, 품목명, scalers=품목별_scalers[품목명])

In [None]:
test_data.describe()

In [None]:
train_df = pd.read_csv('data/train/train.csv')
t = train_df[train_df['품목명']==품목명]
t['평균가격(원)'].describe().astype(int)

# plt.hist(t['평균가격(원)'], bins=50)
# plt.yscale('log')

plt.boxplot(t['평균가격(원)'])
plt.show()
plt.boxplot(np.log1p(t['평균가격(원)']))
plt.show()

In [None]:
t[t['평균가격(원)']>10000000]
t.groupby('거래단위')['평균가격(원)'].mean().astype(int)

In [None]:
tt = pd.read_csv('data/sample_submission.csv')
tt

In [None]:
# sample_submission = pd.read_csv('out/baseline_submission.csv')
# sample_submission.head()