# 농산물 가격 예측을 위한 AI 모델 개발 
- '2024 농산물 가격 예측 AI 경진대회'는 데이터와 AI 기술을 활용하여 농산물 가격 예측 능력을 향상시키는 것을 목표로 합니다.<br>  이 대회는 농업 분야의 복잡한 시계열 데이터를 효율적으로 분석하고 예측할 수 있는 AI 알고리즘 개발에 초점을 맞추고 있습니다. <br> <br>
- 이 대회의 궁극적 목적은 참가자들의 시계열 데이터 분석 및 예측 역량을 강화하고, <br> AI 기술이 실제 농산물 가격 예측과 관련 정책 결정에 어떻게 기여할 수 있는지 탐구하는 것입니다. 

# Import Library

In [1]:
import sys
# Machine-specific checkout of the author's helper library; it is appended to
# sys.path before the `lib.*` imports below.
# NOTE(review): hard-coded absolute path — adjust when running elsewhere.
lib_dir = "g:/My Drive/Storage/Github/hyuckjinkim"
sys.path.append(lib_dir)

# Configure matplotlib with a Korean font (presumably so Korean labels render).
from lib.python.graph import MatplotlibFontManager
fm = MatplotlibFontManager()
fm.set_korean_font(check=False)

# Project-local training utilities.
# NOTE: the `predict` imported here is shadowed by the `predict` function
# defined later in this notebook.
from lib.python.torch import seed_everything
from lib.python.torch.build_model import train, predict
from lib.python.log import get_logger

# Fix the seed (42) via the project helper.
seed_everything(42)

In [2]:
# import pandas as pd
# pd.set_option("display.max_rows", None)
# pd.set_option("display.max_columns", None)

# train_df = pd.read_csv('data/train/train.csv')

# train_meta1_df = pd.read_csv('data/train/meta/TRAIN_산지공판장_2018-2021.csv')
# train_meta1_df.drop(['품목코드','품종코드','공판장코드'], axis=1, inplace=True)

# train_meta2_df = pd.read_csv('data/train/meta/TRAIN_전국도매_2018-2021.csv')
# train_meta2_df.drop(['품목코드','품종코드','시장코드']  , axis=1, inplace=True)

# train_df.head(2) # ['평년 평균가격(원)','평균가격(원)']
# train_meta1_df.head(2)
# train_meta2_df.head(2)

In [3]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from tqdm.notebook import tqdm
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from types import SimpleNamespace
from sklearn.preprocessing import MinMaxScaler
import os
import pickle

# Hyperparameter Setting

# Define Function for Feature Engineering
- 타겟의 필터 조건을 제외한 메타데이터의 필터 조건은 참가자들 각자의 기준에 맞춰 자유롭게 사용가능 
- 밑의 필터 조건은 임의로 제공하는 예시

In [4]:
def year_convert(data):
    """Rewrite the '연도' column as a fractional offset from 2018, in place.

    The year is shifted so that 2018 becomes 0, then a fraction is added
    according to which third-of-month marker appears in '시점':
    '상순' -> 0.1, '중순' -> 0.1 + 1/3, '하순' -> 0.1 + 2/3.
    Rows whose '시점' contains none of the markers end up as NaN.

    Args:
        data: DataFrame with a numeric '연도' column and a string '시점' column.

    Returns:
        The same DataFrame object, with '연도' overwritten.
    """
    fraction_by_period = {'상순': 0.1, '중순': 0.1 + 1/3, '하순': 0.1 + 2/3}
    period = data['시점'].str.extract(r'(상순|중순|하순)')[0]
    data['연도'] = (data['연도'] - 2018) + period.map(fraction_by_period)
    return data

# Per-item filter rules.
#   'target' : boolean mask selecting the rows of the raw data that form the
#              prediction target series.
#   '공판장' : {column: allowed values} filter for the 산지공판장 metadata, or None.
#   '도매'   : {column: allowed values} filter for the 전국도매 metadata, or None.
# The metadata filters are example choices and may be tuned freely; only the
# target filters are fixed.
_ITEM_FILTERS = {
    '감자': {
        'target': lambda df: (df['품종명'] == '감자 수미') & (df['거래단위'] == '20키로상자') & (df['등급'] == '상'),
        '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['감자'], '품종명': ['수미'], '등급명': ['상']},
        '도매': {'시장명': ['*전국도매시장'], '품목명': ['감자'], '품종명': ['수미']},
    },
    '건고추': {
        'target': lambda df: (df['품종명'] == '화건') & (df['거래단위'] == '30 kg') & (df['등급'] == '상품'),
        '공판장': None,
        '도매': None,
    },
    '깐마늘(국산)': {
        'target': lambda df: (df['거래단위'] == '20 kg') & (df['등급'] == '상품'),
        '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['마늘'], '품종명': ['깐마늘'], '등급명': ['상']},
        '도매': {'시장명': ['*전국도매시장'], '품목명': ['마늘'], '품종명': ['깐마늘']},
    },
    '대파': {
        'target': lambda df: (df['품종명'] == '대파(일반)') & (df['거래단위'] == '1키로단') & (df['등급'] == '상'),
        '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['대파'], '품종명': ['대파(일반)'], '등급명': ['상']},
        '도매': {'시장명': ['*전국도매시장'], '품목명': ['대파'], '품종명': ['대파(일반)']},
    },
    '무': {
        'target': lambda df: (df['거래단위'] == '20키로상자') & (df['등급'] == '상'),
        '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['무'], '품종명': ['기타무'], '등급명': ['상']},
        '도매': {'시장명': ['*전국도매시장'], '품목명': ['무'], '품종명': ['무']},
    },
    '배추': {
        'target': lambda df: (df['거래단위'] == '10키로망대') & (df['등급'] == '상'),
        '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['배추'], '품종명': ['쌈배추'], '등급명': ['상']},
        '도매': {'시장명': ['*전국도매시장'], '품목명': ['배추'], '품종명': ['배추']},
    },
    '사과': {
        'target': lambda df: (df['품종명'].isin(['홍로', '후지'])) & (df['거래단위'] == '10 개') & (df['등급'] == '상품'),
        '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['사과'], '품종명': ['후지'], '등급명': ['상']},
        '도매': {'시장명': ['*전국도매시장'], '품목명': ['사과'], '품종명': ['후지']},
    },
    '상추': {
        'target': lambda df: (df['품종명'] == '청') & (df['거래단위'] == '100 g') & (df['등급'] == '상품'),
        '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['상추'], '품종명': ['청상추'], '등급명': ['상']},
        '도매': {'시장명': ['*전국도매시장'], '품목명': ['상추'], '품종명': ['청상추']},
    },
    '양파': {
        'target': lambda df: (df['품종명'] == '양파') & (df['거래단위'] == '1키로') & (df['등급'] == '상'),
        '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['양파'], '품종명': ['기타양파'], '등급명': ['상']},
        '도매': {'시장명': ['*전국도매시장'], '품목명': ['양파'], '품종명': ['양파(일반)']},
    },
    '배': {
        'target': lambda df: (df['품종명'] == '신고') & (df['거래단위'] == '10 개') & (df['등급'] == '상품'),
        '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['배'], '품종명': ['신고'], '등급명': ['상']},
        '도매': {'시장명': ['*전국도매시장'], '품목명': ['배'], '품종명': ['신고']},
    },
}


def _attach_meta(filtered_data, meta, criteria, prefix):
    """Filter *meta* by *criteria*, prefix its columns and left-join it on '시점'."""
    for key, allowed in criteria.items():
        meta = meta[meta[key].isin(allowed)]
    meta = meta.add_prefix(prefix).rename(columns={f'{prefix}시점': '시점'})
    return filtered_data.merge(meta, on='시점', how='left')


def process_data(raw_file, 산지공판장_file, 전국도매_file, 품목명, scalers=None):
    """Build the scaled feature table for one item from the raw and meta CSVs.

    Steps:
      1. Read the three CSV files and drop the code columns of the meta files.
      2. Convert the meta '연도' column with ``year_convert``.
      3. Clamp negative values of the three "previous price" columns of the
         전국도매 table to 0.
      4. Select the target rows of ``품목명`` and left-join, per '시점', the
         price columns of every other (품종명, 거래단위, 등급) combination of
         the same item as extra features.
      5. Left-join the filtered 공판장 / 도매 metadata when a filter is defined.
      6. Keep '시점' plus numeric columns, fill NaN with 0 and min-max scale
         every numeric column independently.

    Args:
        raw_file: Path of the raw price CSV.
        산지공판장_file: Path of the 산지공판장 metadata CSV.
        전국도매_file: Path of the 전국도매 metadata CSV.
        품목명: Item name; must be a key of ``_ITEM_FILTERS``.
        scalers: ``None`` to fit a new ``MinMaxScaler`` per numeric column
            (training), or the ``{column: fitted scaler}`` dict returned by a
            previous call (inference).

    Returns:
        ``(filtered_data, scalers)``. When ``scalers`` was supplied, the
        numeric columns of ``filtered_data`` are exactly the keys of
        ``scalers`` in the same order: columns absent from this data are added
        as 0 before scaling, and columns that have no fitted scaler are
        dropped. This keeps the feature layout identical to training.

    Raises:
        KeyError: If ``품목명`` has no filter rule.
    """
    try:
        rules = _ITEM_FILTERS[품목명]
    except KeyError:
        raise KeyError(f"no filter rule defined for item {품목명!r}") from None

    raw_data = pd.read_csv(raw_file)
    산지공판장 = pd.read_csv(산지공판장_file)
    전국도매 = pd.read_csv(전국도매_file)

    # Drop the item / variety / market code columns.
    산지공판장.drop(['품목코드', '품종코드', '공판장코드'], axis=1, inplace=True)
    전국도매.drop(['품목코드', '품종코드', '시장코드'], axis=1, inplace=True)

    # Fold the third-of-month information into the year column.
    산지공판장 = year_convert(산지공판장)
    전국도매 = year_convert(전국도매)

    # Outlier handling: negative "previous price" values are set to 0.
    for col in ['전순 평균가격(원) PreVious SOON', '전달 평균가격(원) PreVious MMonth', '전년 평균가격(원) PreVious YeaR']:
        negative = 전국도매[col] < 0
        전국도매.loc[negative, col] = 0

    # NOTE: a log1p transform of the price / volume columns of all three
    # tables was tried here at one point and is currently disabled.

    # Target rows of this item.
    raw_품목 = raw_data[raw_data['품목명'] == 품목명]
    target_mask = rules['target'](raw_품목)
    filtered_data = raw_품목[target_mask]

    # Derived features: prices of the other variety/unit/grade combinations.
    other_data = raw_품목[~target_mask]
    unique_combinations = other_data[['품종명', '거래단위', '등급']].drop_duplicates()
    for _, row in unique_combinations.iterrows():
        품종명, 거래단위, 등급 = row['품종명'], row['거래단위'], row['등급']
        mask = (other_data['품종명'] == 품종명) & (other_data['거래단위'] == 거래단위) & (other_data['등급'] == 등급)
        temp_df = other_data[mask]
        for col in ['평년 평균가격(원)', '평균가격(원)']:
            new_col_name = f'{품종명}_{거래단위}_{등급}_{col}'
            # Rename before merging so the joined column gets its final name.
            extra = temp_df[['시점', col]].rename(columns={col: new_col_name})
            filtered_data = filtered_data.merge(extra, on='시점', how='left')

    # Optional metadata joins.
    if rules['공판장']:
        filtered_data = _attach_meta(filtered_data, 산지공판장, rules['공판장'], '공판장_')
    if rules['도매']:
        filtered_data = _attach_meta(filtered_data, 전국도매, rules['도매'], '도매_')

    # Keep '시점' plus the numeric feature columns.
    if scalers is None:
        numeric_columns = list(filtered_data.select_dtypes(include=[np.number]).columns)
        # .copy() so the assignments below write to an independent frame.
        filtered_data = filtered_data[['시점'] + numeric_columns].copy()
    else:
        # Inference: reproduce the training column set and order exactly.
        numeric_columns = list(scalers)
        filtered_data = filtered_data.reindex(columns=['시점'] + numeric_columns)
    filtered_data[numeric_columns] = filtered_data[numeric_columns].fillna(0)

    # Min-max scaling, one scaler per column.
    if scalers is None:
        scalers = {}
        for col in numeric_columns:
            scaler = MinMaxScaler()
            filtered_data[col] = scaler.fit_transform(filtered_data[col].values.reshape(-1, 1)).ravel()
            scalers[col] = scaler
    else:
        for col in numeric_columns:
            filtered_data[col] = scalers[col].transform(filtered_data[col].values.reshape(-1, 1)).ravel()

    return filtered_data, scalers


# Define Custom Dataset Class

In [5]:
class AgriculturePriceDataset(Dataset):
    """Sliding-window dataset over the numeric columns of a price table.

    Training mode (``is_test=False``): every start position yields a pair
    ``(x, y)`` where ``x`` holds ``window_size`` consecutive rows of all
    numeric columns and ``y`` holds the following ``prediction_length``
    values of the '평균가격(원)' column.

    Test mode (``is_test=True``): the dataset contains a single item, the
    numeric columns of the whole frame.
    """

    def __init__(self, dataframe, window_size=9, prediction_length=3, is_test=False):
        self.data = dataframe
        self.window_size = window_size
        self.prediction_length = prediction_length
        self.is_test = is_test

        self.price_column = '평균가격(원)'
        self.numeric_columns = self.data.select_dtypes(include=[np.number]).columns.tolist()

        features = self.data[self.numeric_columns]
        if self.is_test:
            # One sequence: the complete frame.
            self.sequences = [features.values]
            return

        prices = self.data[self.price_column]
        horizon = self.window_size + self.prediction_length
        self.sequences = [
            (
                features.iloc[start:start + self.window_size].values,
                prices.iloc[start + self.window_size:start + horizon].values,
            )
            for start in range(len(self.data) - horizon + 1)
        ]

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        item = self.sequences[idx]
        if self.is_test:
            return torch.FloatTensor(item)
        window, target = item
        return torch.FloatTensor(window), torch.FloatTensor(target)

# Define Model Architecture and Training Functions

In [6]:
# class PricePredictionLSTM(nn.Module):
#     def __init__(self, input_size, hidden_size, num_layers, output_size, dropout=0):
#         super(PricePredictionLSTM, self).__init__()
#         self.hidden_size = hidden_size
#         self.num_layers = num_layers
#         self.lstm = nn.LSTM(input_size, hidden_size, num_layers, dropout=dropout, batch_first=True)
#         self.fc = nn.Linear(hidden_size, output_size)
    
#     def forward(self, x):
#         h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
#         c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
#         out, _ = self.lstm(x, (h0, c0))
#         out = self.fc(out[:, -1, :])
#         return out

In [7]:
class PricePredictionLSTM(nn.Module):
    """Two stacked LSTM modules followed by a linear head.

    ``lstm1`` maps ``input_size`` -> ``hidden_size``, ``lstm2`` maps
    ``hidden_size`` -> ``hidden_size // 2``; both use ``num_layers`` layers,
    the given ``dropout`` and ``batch_first=True``. The linear layer maps the
    last time step of ``lstm2`` to ``output_size`` values.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout=0):
        super().__init__()
        narrow_size = hidden_size // 2
        self.lstm1 = nn.LSTM(input_size, hidden_size, num_layers, dropout=dropout, batch_first=True)
        self.lstm2 = nn.LSTM(hidden_size, narrow_size, num_layers, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(narrow_size, output_size)

    @staticmethod
    def _zero_state(lstm, batch_size, device):
        """Return zero-filled (h0, c0) tensors for *lstm* on *device*."""
        shape = (lstm.num_layers, batch_size, lstm.hidden_size)
        return torch.zeros(shape).to(device), torch.zeros(shape).to(device)

    def forward(self, x):
        batch_size = x.size(0)

        # First LSTM, started from an all-zero hidden/cell state.
        hidden, _ = self.lstm1(x, self._zero_state(self.lstm1, batch_size, x.device))

        # Second LSTM, again started from zeros.
        hidden, _ = self.lstm2(hidden, self._zero_state(self.lstm2, batch_size, x.device))

        # Predict from the representation of the final time step.
        return self.fc(hidden[:, -1, :])

# Train Models and Generate Predictions

In [8]:
def nmae(true, pred):
    """Return the mean of ``|true - pred| / true`` over non-zero targets.

    Entries whose target is exactly 0 are excluded, because dividing by them
    would turn the whole score into inf/NaN. For inputs without zero targets
    the result is the plain element-wise mean, as before.

    Args:
        true: Array-like of ground-truth values (any shape).
        pred: Array-like of predictions, broadcastable against ``true``.

    Returns:
        The score as a NumPy float, or NaN when every target is 0.
    """
    true = np.asarray(true, dtype=float)
    pred = np.asarray(pred, dtype=float)

    abs_err = np.abs(true - pred)
    denom = np.broadcast_to(true, abs_err.shape)
    usable = denom != 0
    if not usable.any():
        return np.nan
    return np.mean(abs_err[usable] / denom[usable])

def minmax_inverse_transform(x, scaler, is_train=True):
    """Undo min-max scaling using the first feature of a fitted scaler.

    Computes ``data_min_[0] + x * (data_max_[0] - data_min_[0])``.

    Args:
        x: Scaled value(s); anything supporting ``*`` and ``+`` with a scalar.
        scaler: Object exposing ``data_min_`` and ``data_max_`` sequences.
        is_train: Currently unused. It only selected between the torch and
            numpy variants of a log1p inverse that is disabled.

    Returns:
        The value(s) mapped back to the original range.
    """
    lower = scaler.data_min_[0]
    span = scaler.data_max_[0] - lower
    return lower + x * span

def variance_threshold_select(data, threshold=0.01, ignore_features=None):
    """List the numeric columns of *data* whose variance is below *threshold*.

    The variance is computed on the ``data`` argument itself (sample variance,
    i.e. ``std() ** 2``), and the result follows the column order of ``data``
    so repeated runs give the same list.

    Args:
        data: DataFrame to inspect.
        threshold: Columns with variance strictly below this value are listed.
        ignore_features: Optional iterable of column names that are never
            listed. ``None`` means no column is ignored.

    Returns:
        List of column names to delete.
    """
    ignored = set(ignore_features or ())
    numeric_cols = data.select_dtypes(include=[np.number]).columns
    return [
        col
        for col in numeric_cols
        if col not in ignored and data[col].var() < threshold
    ]

In [9]:
def predict(best_model, loader, device, inverse_transform=None):
    """Run the model over a labelled loader and collect targets and outputs.

    Args:
        best_model: Module to evaluate; it is moved to ``device`` and put in
            eval mode.
        loader: Iterable yielding ``(data, label)`` tensor pairs.
        device: Device on which the forward pass runs.
        inverse_transform: Callable applied to both the outputs and the
            labels (e.g. to undo scaling). ``None`` leaves them unchanged.

    Returns:
        ``(true_list, pred_list)`` as nested Python lists, one entry per
        sample in loader order.
    """
    if inverse_transform is None:
        def inverse_transform(values):
            return values

    best_model.to(device)
    best_model.eval()

    true_list = []
    pred_list = []
    with torch.no_grad():
        for data, label in loader:
            data = data.float().to(device)

            output = inverse_transform(best_model(data))
            label = inverse_transform(label)

            true_list.extend(label.cpu().numpy().tolist())
            pred_list.extend(output.cpu().numpy().tolist())

    return true_list, pred_list

def inference(best_model, loader, device, inverse_transform=None):
    """Run the model over an unlabelled loader and collect its outputs.

    Args:
        best_model: Module to evaluate; it is moved to ``device`` and put in
            eval mode.
        loader: Iterable yielding input tensors (no labels).
        device: Device on which the forward pass runs.
        inverse_transform: Callable applied to the outputs (e.g. to undo
            scaling). ``None`` leaves them unchanged.

    Returns:
        Nested Python list with one entry per sample in loader order.
    """
    if inverse_transform is None:
        def inverse_transform(values):
            return values

    best_model.to(device)
    best_model.eval()

    pred_list = []
    with torch.no_grad():
        for data in loader:
            data = data.float().to(device)
            output = inverse_transform(best_model(data))
            pred_list.extend(output.cpu().numpy().tolist())

    return pred_list

In [10]:
# Training hyperparameters, kept both as a plain dict (`config`) and as an
# attribute-style namespace (`CFG`).
config = dict(
    learning_rate=2e-5,
    epoch=4048,
    batch_size=64,
    hidden_size=256,
    num_layers=2,
    output_size=3,
    dropout=0.2,
    test_size=0.2,
    seed=42,
    threshold=0.005,
)
CFG = SimpleNamespace(**config)

# Items to model, in processing order.
품목_리스트 = [
    '건고추', '사과', '감자', '배', '깐마늘(국산)',
    '무', '상추', '배추', '양파', '대파',
]
# For a quick single-item run: 품목_리스트 = ['감자']

In [11]:
# Train one model per item and record everything inference needs later
# (scalers, dropped columns, model hyperparameters).
logger = get_logger(save_path='log/train_log.log')
os.makedirs('models', exist_ok=True)

# Per-item artifacts, keyed by item name.
품목별_predictions = {}
품목별_scalers = {}
품목별_delcols = {}
품목별_hyperparams = {}

for i, 품목명 in enumerate(품목_리스트):
    model_path = f'models/best_model_{품목명}.pth'
    logger.info('='*150)
    logger.info(f'> [{i+1}/{len(품목_리스트)}] {품목명}')
    logger.info('='*150)

    # Preprocessing: fits new scalers because no `scalers` argument is passed.
    train_data, scalers = process_data("data/train/train.csv", "data/train/meta/TRAIN_산지공판장_2018-2021.csv", "data/train/meta/TRAIN_전국도매_2018-2021.csv", 품목명)
    품목별_scalers[품목명] = scalers
    
    # Drop columns whose variance is below the threshold (the target column
    # is always kept).
    del_cols = variance_threshold_select(train_data, threshold=CFG.threshold, ignore_features=['평균가격(원)'])
    train_data.drop(del_cols, axis=1, inplace=True)
    품목별_delcols[품목명] = del_cols

    # Train / validation split.
    # NOTE(review): the windows produced by AgriculturePriceDataset overlap,
    # and this split is random (shuffle=True), so validation windows share
    # time steps with training windows — consider a chronological split.
    dataset = AgriculturePriceDataset(train_data)
    tr_data, val_data = train_test_split(dataset, test_size=CFG.test_size, random_state=CFG.seed, shuffle=True)
    train_loader = DataLoader(tr_data, CFG.batch_size, shuffle=True)
    val_loader = DataLoader(val_data, CFG.batch_size, shuffle=False)

    # Define the model; the constructor kwargs are stored so the same
    # architecture can be rebuilt at inference time.
    품목별_hyperparams[품목명] = dict(
        input_size=len(dataset.numeric_columns),
        hidden_size=CFG.hidden_size,
        num_layers=CFG.num_layers,
        output_size=CFG.output_size,
        dropout=CFG.dropout,
    )
    model = PricePredictionLSTM(**품목별_hyperparams[품목명])
    # NOTE(review): `criterion` is not passed to train() below — presumably
    # train() uses its own loss; confirm.
    criterion = nn.L1Loss()
    optimizer = torch.optim.AdamW(model.parameters(), CFG.learning_rate)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10)
    # scheduler = None

    # Map scaled prices back to the original price range using the scaler
    # fitted on the target column.
    price_scaler = 품목별_scalers[품목명][dataset.price_column]
    inverse_transform = lambda x: minmax_inverse_transform(x, price_scaler)
    # inverse_transform = None

    # Train (project-local helper) with early stopping, on CPU.
    best_model = train(
        model, optimizer, train_loader, val_loader, CFG.epoch,
        early_stopping=True, early_stopping_patience=30, early_stopping_verbose=False,
        device='cpu', scheduler=scheduler, metric_period=100, 
        verbose=True, save_model_path=model_path,
        inverse_transform=inverse_transform,
    )

    # Scoring: NMAE on inverse-transformed values for both splits.
    true, pred = predict(best_model, train_loader, device='cpu', inverse_transform=inverse_transform)
    train_nmae = nmae(true,pred)
    true, pred = predict(best_model, val_loader, device='cpu', inverse_transform=inverse_transform)
    val_nmae = nmae(true,pred)
    print(f'<Score> {train_nmae=:.4f}, {val_nmae=:.4f}')

[2024-10-10 23:40:53,077 997685439.py:12][INFO] > [1/10] 건고추
[2024-10-10 23:41:10,982 log.py:38][INFO] *[0100/4048] tr_loss: 101845.3359, val_loss: 70643.6641, best: 70643.6641(100), elapsed: 14.5s, total: 14.5s, remaining: 574.0s
[2024-10-10 23:41:27,849 log.py:38][INFO] [0200/4048] tr_loss: 66624.9258, val_loss: 48337.7109, best: 48197.7891(193), elapsed: 16.9s, total: 31.4s, remaining: 649.1s
[2024-10-10 23:41:30,293 log.py:38][INFO] <Stopped> tr_loss: 69515.0645, val_loss: 47952.8906, best: 47712.0547(202), elapsed: 2.4s, total: 33.9s, remaining: 93.2s
[2024-10-10 23:41:30,323 log.py:38][INFO] <Score> train_nmae=0.1088, val_nmae=0.0943
[2024-10-10 23:41:30,325 997685439.py:12][INFO] > [2/10] 사과
[2024-10-10 23:41:47,697 log.py:38][INFO] <Stopped> tr_loss: 3438.5430, val_loss: 3068.8313, best: 2884.6204(68), elapsed: 15.7s, total: 15.7s, remaining: 619.1s
[2024-10-10 23:41:47,720 log.py:38][INFO] <Score> train_nmae=0.1454, val_nmae=0.1351
[2024-10-10 23:41:47,722 997685439.py:12][INF

In [12]:
# Persist the per-item training artifacts so inference can run in a later
# session. open(..., 'wb') does not create missing directories, so make sure
# the output directory exists first.
os.makedirs('out', exist_ok=True)

with open('out/scalers.pkl', 'wb') as pickle_file:
    pickle.dump(품목별_scalers, pickle_file)

with open('out/delcols.pkl', 'wb') as pickle_file:
    pickle.dump(품목별_delcols, pickle_file)

with open('out/hyperparams.pkl', 'wb') as pickle_file:
    pickle.dump(품목별_hyperparams, pickle_file)

In [13]:
# true, pred = predict(best_model, train_loader, device='cpu', inverse_transform=inverse_transform)
# print(criterion(torch.tensor(true), torch.tensor(pred)).item())
# true[:5], pred[:5]

# Inference

In [14]:
def _read_pickle(path):
    """Return the object stored in the pickle file at *path*."""
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)


# Reload the per-item artifacts written by the training step.
품목별_scalers = _read_pickle('out/scalers.pkl')
품목별_delcols = _read_pickle('out/delcols.pkl')
품목별_hyperparams = _read_pickle('out/hyperparams.pkl')

In [15]:
# Show, for each item, the columns that were dropped for low variance.
for item_name, dropped_columns in 품목별_delcols.items():
    print(item_name, dropped_columns)

[2024-10-10 23:47:40,687 log.py:38][INFO] 건고추
[2024-10-10 23:47:40,688 log.py:38][INFO] []
[2024-10-10 23:47:40,689 log.py:38][INFO] 사과
[2024-10-10 23:47:40,690 log.py:38][INFO] []
[2024-10-10 23:47:40,691 log.py:38][INFO] 감자
[2024-10-10 23:47:40,692 log.py:38][INFO] ['감자_20키로상자_중_평년 평균가격(원)', '감자 수미(햇)_20키로상자_상_평년 평균가격(원)', '홍감자_10키로상자_하_평년 평균가격(원)', '감자 수미(저장)_20키로상자_특_평년 평균가격(원)', '감자 수미(저장)_20키로상자_중_평년 평균가격(원)', '감자 조풍_20키로상자_하_평균가격(원)', '감자 두백_20키로상자_하_평년 평균가격(원)', '감자_20키로상자_상_평년 평균가격(원)', '감자_20키로상자_특_평년 평균가격(원)', '홍감자_10키로상자_상_평년 평균가격(원)', '홍감자_10키로상자_중_평년 평균가격(원)', '홍감자_10키로상자_특_평년 평균가격(원)', '감자 수미(햇)_20키로상자_하_평년 평균가격(원)', '감자 수미(햇)_20키로상자_중_평년 평균가격(원)', '감자 조풍_20키로상자_특_평년 평균가격(원)', '감자 수미(햇)_20키로상자_특_평년 평균가격(원)', '감자 조풍_20키로상자_중_평년 평균가격(원)', '감자 수미(저장)_20키로상자_하_평년 평균가격(원)', '감자 두백_20키로상자_상_평년 평균가격(원)', '감자 조풍_20키로상자_상_평균가격(원)', '감자 조풍_20키로상자_상_평년 평균가격(원)', '감자 수미(저장)_20키로상자_상_평년 평균가격(원)', '감자 조풍_20키로상자_중_평균가격(원)', '감자 두백_20키로상자_특_평년 평균가격(원)', '감자_20키로상자_하_평년 평균가격(원)', '감자 수입_

In [29]:
# Predict every test file with each item's saved model.
# Start from an empty result dict so this cell also works in a fresh session
# where only the pickled artifacts were reloaded.
품목별_predictions = {}

pbar_outer = tqdm(품목_리스트, position=0)
for 품목명 in pbar_outer:
    pbar_outer.set_description(품목명)
    model_path = f'models/best_model_{품목명}.pth'

    # Rebuild the architecture used in training and load its weights.
    # map_location='cpu' because inference below runs on the CPU.
    model = PricePredictionLSTM(**품목별_hyperparams[품목명])
    model.load_state_dict(torch.load(model_path, map_location='cpu'))

    # Inference over the 25 test files (TEST_00 .. TEST_24).
    품목_predictions = []
    pbar_inner = tqdm(range(25), desc="테스트 파일 추론 중", position=1, leave=False)
    for i in pbar_inner:
        test_file = f"data/test/TEST_{i:02d}.csv"
        산지공판장_file = f"data/test/meta/TEST_산지공판장_{i:02d}.csv"
        전국도매_file = f"data/test/meta/TEST_전국도매_{i:02d}.csv"

        # Reuse the scalers fitted on the training data and drop the same
        # columns that were dropped during training.
        test_data, _ = process_data(test_file, 산지공판장_file, 전국도매_file, 품목명, scalers=품목별_scalers[품목명])
        test_data.drop(품목별_delcols[품목명], axis=1, inplace=True)
        test_dataset = AgriculturePriceDataset(test_data, is_test=True)
        test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

        price_scaler = 품목별_scalers[품목명][test_dataset.price_column]
        inverse_transform = lambda x: minmax_inverse_transform(x, price_scaler)

        predictions = inference(model, test_loader, device='cpu', inverse_transform=inverse_transform)
        predictions = np.concatenate(predictions)

        if np.isnan(predictions).any():
            pbar_inner.set_postfix({"상태": "NaN"})
            raise ValueError(f"NaN prediction for item {품목명!r} on {test_file}")

        pbar_inner.set_postfix({"상태": "정상"})
        품목_predictions.extend(predictions.flatten())

    품목별_predictions[품목명] = 품목_predictions

  0%|          | 0/10 [00:00<?, ?it/s]

테스트 파일 추론 중:   0%|          | 0/25 [00:00<?, ?it/s]

테스트 파일 추론 중:   0%|          | 0/25 [00:00<?, ?it/s]

테스트 파일 추론 중:   0%|          | 0/25 [00:00<?, ?it/s]

테스트 파일 추론 중:   0%|          | 0/25 [00:00<?, ?it/s]

테스트 파일 추론 중:   0%|          | 0/25 [00:00<?, ?it/s]

테스트 파일 추론 중:   0%|          | 0/25 [00:00<?, ?it/s]

테스트 파일 추론 중:   0%|          | 0/25 [00:00<?, ?it/s]

테스트 파일 추론 중:   0%|          | 0/25 [00:00<?, ?it/s]

테스트 파일 추론 중:   0%|          | 0/25 [00:00<?, ?it/s]

테스트 파일 추론 중:   0%|          | 0/25 [00:00<?, ?it/s]

# Prepare Submission File

In [30]:
# Fill the submission template: one column per item, named after the item,
# holding that item's predictions.
sample_submission = pd.read_csv('data/sample_submission.csv').assign(**품목별_predictions)

# Save the result.
sample_submission.to_csv('out/baseline_submission_6.csv', index=False)

In [31]:
# Read the written submission back to inspect it.
pd.read_csv('out/baseline_submission_6.csv')

Unnamed: 0,시점,감자,건고추,깐마늘(국산),대파,무,배추,사과,상추,양파,배
0,TEST_00+1순,27161.814453,567577.6250,124441.312500,2207.366699,12887.693359,11140.514648,23393.445312,1090.436768,1048.425659,34251.398438
1,TEST_00+2순,26764.187500,576237.0000,125168.007812,2050.588379,13068.929688,9781.863281,23712.386719,1066.190063,1000.773071,34455.289062
2,TEST_00+3순,28427.761719,564376.4375,123401.335938,1958.496704,12544.164062,8935.784180,22054.353516,1103.549316,954.284058,34522.839844
3,TEST_01+1순,25615.847656,595737.0000,123562.492188,1711.304443,13621.152344,11238.201172,22478.740234,1078.932251,1061.366089,31861.457031
4,TEST_01+2순,24973.390625,599965.9375,124310.781250,1686.277100,13840.006836,9847.901367,22972.359375,1055.297852,1010.481812,31976.542969
...,...,...,...,...,...,...,...,...,...,...,...
70,TEST_23+2순,25775.929688,637512.3750,124440.125000,1492.693848,13148.222656,9313.007812,23375.228516,990.667603,1014.497437,31164.898438
71,TEST_23+3순,27713.425781,634582.6250,122666.328125,1492.610718,12543.001953,8551.014648,21563.980469,1031.956665,969.649170,31338.464844
72,TEST_24+1순,35000.527344,571086.5000,115891.414062,1129.684937,9705.466797,8637.441406,24409.968750,901.355713,722.378540,41196.449219
73,TEST_24+2순,31905.011719,579390.3750,116865.234375,1303.866943,9829.894531,8114.021484,24602.863281,859.195374,750.811890,41645.605469


In [34]:
# Summary statistics of the raw training price per item, truncated to int.
tmp = pd.read_csv('data/train/train.csv')
tmp.groupby('품목명')['평균가격(원)'].describe().astype(int)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
품목명,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
감자,4752,11389,17637,0,0,0,19897,128932
건고추,1152,333839,343517,0,0,383419,628852,1070075
깐마늘(국산),288,116155,28119,64500,86151,122518,136621,169100
대파,3024,26621,36827,0,0,4023,43533,266042
무,4752,185279,1096089,0,0,0,3216,13350000
배,576,16742,16532,0,0,19570,31583,53463
배추,3744,8393,7517,0,3343,6527,11527,65108
사과,720,9271,10981,0,0,0,19077,35339
상추,576,867,384,402,591,759,1009,2403
양파,9792,5456,7453,0,0,753,10291,40321


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
# Remove the row/column display limits so full frames are shown.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

In [None]:
# Inspect the training prices of a single item after a scale / inverse-scale
# round trip.
품목명 = '무'
# NOTE: the scalers dict returned here is bound to `scaler` and immediately
# replaced on the next line by the price scaler saved for this item.
train_data, scaler = process_data("data/train/train.csv", "data/train/meta/TRAIN_산지공판장_2018-2021.csv", "data/train/meta/TRAIN_전국도매_2018-2021.csv", 품목명)
scaler = 품목별_scalers[품목명]['평균가격(원)']
tmp = minmax_inverse_transform(train_data['평균가격(원)'], scaler, is_train=False)
tmp.max()

In [None]:
# Summary statistics of `tmp`.
tmp.describe()

In [None]:
# Preprocess one test file (index i) for the current item with the saved
# scalers, for manual inspection.
i=0

test_file = f"data/test/TEST_{i:02d}.csv"
산지공판장_file = f"data/test/meta/TEST_산지공판장_{i:02d}.csv"
전국도매_file = f"data/test/meta/TEST_전국도매_{i:02d}.csv"

test_data, _ = process_data(test_file, 산지공판장_file, 전국도매_file, 품목명, scalers=품목별_scalers[품목명])

In [None]:
# Summary statistics of the scaled test features.
test_data.describe()

In [None]:
# Raw price distribution of the current item.
train_df = pd.read_csv('data/train/train.csv')
t = train_df[train_df['품목명']==품목명]
# NOTE(review): this value is not assigned or printed, and it is not the
# last expression of the cell, so a notebook will not display it.
t['평균가격(원)'].describe().astype(int)

# plt.hist(t['평균가격(원)'], bins=50)
# plt.yscale('log')

# Box plots on the raw scale and on the log1p scale.
plt.boxplot(t['평균가격(원)'])
plt.show()
plt.boxplot(np.log1p(t['평균가격(원)']))
plt.show()

In [None]:
# Rows with a price above 10,000,000.
# NOTE(review): this result is discarded unless it is the last expression of
# the cell; only the group means below would be displayed.
t[t['평균가격(원)']>10000000]
# Mean price per trading unit, truncated to int.
t.groupby('거래단위')['평균가격(원)'].mean().astype(int)

In [None]:
# Look at the submission template.
tt = pd.read_csv('data/sample_submission.csv')
tt

In [None]:
# sample_submission = pd.read_csv('out/baseline_submission.csv')
# sample_submission.head()