# 농산물 가격 예측을 위한 AI 모델 개발 
- '2024 농산물 가격 예측 AI 경진대회'는 데이터와 AI 기술을 활용하여 농산물 가격 예측 능력을 향상시키는 것을 목표로 합니다.<br>  이 대회는 농업 분야의 복잡한 시계열 데이터를 효율적으로 분석하고 예측할 수 있는 AI 알고리즘 개발에 초점을 맞추고 있습니다. <br> <br>
- 이 대회의 궁극적 목적은 참가자들의 시계열 데이터 분석 및 예측 역량을 강화하고, <br> AI 기술이 실제 농산물 가격 예측과 관련 정책 결정에 어떻게 기여할 수 있는지 탐구하는 것입니다. 

# Import Library

In [1]:
import sys
lib_dir = "g:/My Drive/Storage/Github/hyuckjinkim"
sys.path.append(lib_dir)

from lib.python.graph import MatplotlibFontManager
fm = MatplotlibFontManager()
fm.set_korean_font(check=False)

from lib.python.torch.build_model import train, predict

In [2]:
# import pandas as pd
# pd.set_option("display.max_rows", None)
# pd.set_option("display.max_columns", None)

# train_df = pd.read_csv('data/train/train.csv')

# train_meta1_df = pd.read_csv('data/train/meta/TRAIN_산지공판장_2018-2021.csv')
# train_meta1_df.drop(['품목코드','품종코드','공판장코드'], axis=1, inplace=True)

# train_meta2_df = pd.read_csv('data/train/meta/TRAIN_전국도매_2018-2021.csv')
# train_meta2_df.drop(['품목코드','품종코드','시장코드']  , axis=1, inplace=True)

# train_df.head(2) # ['평년 평균가격(원)','평균가격(원)']
# train_meta1_df.head(2)
# train_meta2_df.head(2)

In [3]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from tqdm.notebook import tqdm
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from types import SimpleNamespace
from sklearn.preprocessing import MinMaxScaler
import os

# Hyperparameter Setting

# Define Function for Feature Engineering
- 타겟의 필터 조건을 제외한 메타데이터의 필터 조건은 참가자들 각자의 기준에 맞춰 자유롭게 사용가능 
- 밑의 필터 조건은 임의로 제공하는 예시

In [4]:
def year_convert(data):
    data['연도'] -= 2018

    offset = 0.1
    map_dict = {'상순':offset, '중순':offset+1/3, '하순':offset+2/3}
    data['연도'] += data['시점'].str.extract(r'(상순|중순|하순)')[0].map(map_dict)
    
    return data

def process_data(raw_file, 산지공판장_file, 전국도매_file, 품목명, scalers=None):
    raw_data = pd.read_csv(raw_file)
    산지공판장 = pd.read_csv(산지공판장_file)
    전국도매 = pd.read_csv(전국도매_file)

    # 품목코드, 품종코드, 공판장코드, 시장코드 제거
    산지공판장.drop(['품목코드','품종코드','공판장코드'], axis=1, inplace=True)
    전국도매  .drop(['품목코드','품종코드','시장코드']  , axis=1, inplace=True)

    # 연도에 상/중/하순에 대한 정보도 추가
    산지공판장 = year_convert(산지공판장)
    전국도매 = year_convert(전국도매)

    # 이상값(0이하) 처리
    for col in ['전순 평균가격(원) PreVious SOON', '전달 평균가격(원) PreVious MMonth', '전년 평균가격(원) PreVious YeaR']:
        loc = 전국도매[col] < 0
        전국도매.loc[loc,col] = 0

    # log변환
    raw_cols = ['평년 평균가격(원)', '평균가격(원)']
    산지공판장_cols =  ['총반입량(kg)', '총거래금액(원)', '평균가(원/kg)', '중간가(원/kg)', '최저가(원/kg)', '최고가(원/kg)', '경매 건수', 
                       '전순 평균가격(원) PreVious SOON', '전달 평균가격(원) PreVious MMonth', '전년 평균가격(원) PreVious YeaR', '평년 평균가격(원) Common Year SOON']
    전국도매_cols = ['총반입량(kg)', '총거래금액(원)', '평균가(원/kg)', '고가(20%) 평균가', '중가(60%) 평균가 ', '저가(20%) 평균가', '중간가(원/kg)', '최저가(원/kg)',
                    '최고가(원/kg)', '경매 건수', '전순 평균가격(원) PreVious SOON', '전달 평균가격(원) PreVious MMonth', '전년 평균가격(원) PreVious YeaR', '평년 평균가격(원) Common Year SOON']
    for col in raw_cols: raw_data[col] = np.log1p(raw_data[col])
    for col in 산지공판장_cols: 산지공판장[col] = np.log1p(산지공판장[col])
    for col in 전국도매_cols: 전국도매[col] = np.log1p(전국도매[col])

    # 타겟 및 메타데이터 필터 조건 정의
    conditions = {
    '감자': {
        'target': lambda df: (df['품종명'] == '감자 수미') & (df['거래단위'] == '20키로상자') & (df['등급'] == '상'),
        '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['감자'], '품종명': ['수미'], '등급명': ['상']},
        '도매': {'시장명': ['*전국도매시장'], '품목명': ['감자'], '품종명': ['수미']}
    },
    '건고추': {
        'target': lambda df: (df['품종명'] == '화건') & (df['거래단위'] == '30 kg') & (df['등급'] == '상품'),
        '공판장': None, 
        '도매': None  
    },
    '깐마늘(국산)': {
        'target': lambda df: (df['거래단위'] == '20 kg') & (df['등급'] == '상품'),
        '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['마늘'], '품종명': ['깐마늘'], '등급명': ['상']},
        '도매': {'시장명': ['*전국도매시장'], '품목명': ['마늘'], '품종명': ['깐마늘']}
    },
    '대파': {
        'target': lambda df: (df['품종명'] == '대파(일반)') & (df['거래단위'] == '1키로단') & (df['등급'] == '상'),
        '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['대파'], '품종명': ['대파(일반)'], '등급명': ['상']},
        '도매': {'시장명': ['*전국도매시장'], '품목명': ['대파'], '품종명': ['대파(일반)']}
    },
    '무': {
        'target': lambda df: (df['거래단위'] == '20키로상자') & (df['등급'] == '상'),
        '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['무'], '품종명': ['기타무'], '등급명': ['상']},
        '도매': {'시장명': ['*전국도매시장'], '품목명': ['무'], '품종명': ['무']}
    },
    '배추': {
        'target': lambda df: (df['거래단위'] == '10키로망대') & (df['등급'] == '상'),
        '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['배추'], '품종명': ['쌈배추'], '등급명': ['상']},
        '도매': {'시장명': ['*전국도매시장'], '품목명': ['배추'], '품종명': ['배추']}
    },
    '사과': {
        'target': lambda df: (df['품종명'].isin(['홍로', '후지'])) & (df['거래단위'] == '10 개') & (df['등급'] == '상품'),
        '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['사과'], '품종명': ['후지'], '등급명': ['상']},
        '도매': {'시장명': ['*전국도매시장'], '품목명': ['사과'], '품종명': ['후지']}
    },
    '상추': {
        'target': lambda df: (df['품종명'] == '청') & (df['거래단위'] == '100 g') & (df['등급'] == '상품'),
        '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['상추'], '품종명': ['청상추'], '등급명': ['상']},
        '도매': {'시장명': ['*전국도매시장'], '품목명': ['상추'], '품종명': ['청상추']}
    },
    '양파': {
        'target': lambda df: (df['품종명'] == '양파') & (df['거래단위'] == '1키로') & (df['등급'] == '상'),
        '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['양파'], '품종명': ['기타양파'], '등급명': ['상']},
        '도매': {'시장명': ['*전국도매시장'], '품목명': ['양파'], '품종명': ['양파(일반)']}
    },
    '배': {
        'target': lambda df: (df['품종명'] == '신고') & (df['거래단위'] == '10 개') & (df['등급'] == '상품'),
        '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['배'], '품종명': ['신고'], '등급명': ['상']},
        '도매': {'시장명': ['*전국도매시장'], '품목명': ['배'], '품종명': ['신고']}
    }
    }

    # 타겟 데이터 필터링
    raw_품목 = raw_data[raw_data['품목명'] == 품목명]
    target_mask = conditions[품목명]['target'](raw_품목)
    filtered_data = raw_품목[target_mask]

    # 다른 품종에 대한 파생변수 생성
    other_data = raw_품목[~target_mask]
    unique_combinations = other_data[['품종명', '거래단위', '등급']].drop_duplicates()
    for _, row in unique_combinations.iterrows():
        품종명, 거래단위, 등급 = row['품종명'], row['거래단위'], row['등급']
        mask = (other_data['품종명'] == 품종명) & (other_data['거래단위'] == 거래단위) & (other_data['등급'] == 등급)
        temp_df = other_data[mask]
        for col in ['평년 평균가격(원)', '평균가격(원)']:
            new_col_name = f'{품종명}_{거래단위}_{등급}_{col}'
            filtered_data = filtered_data.merge(temp_df[['시점', col]], on='시점', how='left', suffixes=('', f'_{new_col_name}'))
            filtered_data.rename(columns={f'{col}_{new_col_name}': new_col_name}, inplace=True)

    # 공판장 데이터 처리
    if conditions[품목명]['공판장']:
        filtered_공판장 = 산지공판장
        for key, value in conditions[품목명]['공판장'].items():
            filtered_공판장 = filtered_공판장[filtered_공판장[key].isin(value)]
        
        filtered_공판장 = filtered_공판장.add_prefix('공판장_').rename(columns={'공판장_시점': '시점'})
        filtered_data = filtered_data.merge(filtered_공판장, on='시점', how='left')

    # 도매 데이터 처리
    if conditions[품목명]['도매']:
        filtered_도매 = 전국도매
        for key, value in conditions[품목명]['도매'].items():
            filtered_도매 = filtered_도매[filtered_도매[key].isin(value)]
        
        filtered_도매 = filtered_도매.add_prefix('도매_').rename(columns={'도매_시점': '시점'})
        filtered_data = filtered_data.merge(filtered_도매, on='시점', how='left')

    # 수치형 컬럼 처리
    numeric_columns = filtered_data.select_dtypes(include=[np.number]).columns
    filtered_data = filtered_data[['시점'] + list(numeric_columns)]
    filtered_data[numeric_columns] = filtered_data[numeric_columns].fillna(0)

    # 정규화 적용
    if scalers is None:
        scalers = {}
        for col in numeric_columns:
            scaler = MinMaxScaler()
            filtered_data[col] = scaler.fit_transform(filtered_data[col].values.reshape(-1,1))
            scalers[col] = scaler
    else:
        for col in numeric_columns:
            scaler = scalers[col]
            filtered_data[col] = scaler.transform(filtered_data[col].values.reshape(-1,1))

    return filtered_data, scalers


# Define Custom Dataset Class

In [5]:
class AgriculturePriceDataset(Dataset):
    def __init__(self, dataframe, window_size=9, prediction_length=3, is_test=False):
        self.data = dataframe
        self.window_size = window_size
        self.prediction_length = prediction_length
        self.is_test = is_test
        
        self.price_column = '평균가격(원)'
        self.numeric_columns = self.data.select_dtypes(include=[np.number]).columns.tolist()

        self.sequences = []
        if not self.is_test:
            for i in range(len(self.data) - self.window_size - self.prediction_length + 1):
                x = self.data[self.numeric_columns].iloc[i:i+self.window_size].values
                y = self.data[self.price_column].iloc[i+self.window_size:i+self.window_size+self.prediction_length].values
                self.sequences.append((x, y))
        else:
            self.sequences = [self.data[self.numeric_columns].values]
    
    def __len__(self):
        return len(self.sequences)
    
    def __getitem__(self, idx):
        if not self.is_test:
            x, y = self.sequences[idx]
            return torch.FloatTensor(x), torch.FloatTensor(y)
        else:
            return torch.FloatTensor(self.sequences[idx])

# Define Model Architecture and Training Functions

In [6]:
class PricePredictionLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout=0):
        super(PricePredictionLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        return out

# Train Models and Generate Predictions

In [7]:
config = {
    "learning_rate": 2e-5,
    "epoch": 4048,
    "batch_size": 64,
    "hidden_size": 64,
    "num_layers": 2,
    "output_size": 3,
    "test_size": 0.2,
    "seed": 42,
}

CFG = SimpleNamespace(**config)
품목_리스트 = ['건고추', '사과', '감자', '배', '깐마늘(국산)', '무', '상추', '배추', '양파', '대파']
# 품목_리스트 = ['감자']

In [8]:
# 품목별_scalers[품목명][dataset.price_column].inverse_transform
def minmax_inverse_transform(x, scaler, is_train=True):
    origin = scaler.data_min_[0] + x * (scaler.data_max_[0] - scaler.data_min_[0])
    if is_train:
        origin = torch.exp(origin) - 1
    else:
        origin = np.exp(origin) - 1
    return origin

In [9]:
품목별_predictions = {}
품목별_scalers = {}
os.makedirs('models', exist_ok=True)

for i, 품목명 in enumerate(품목_리스트):
    model_path = f'models/best_model_{품목명}.pth'
    print(f'\n{'='*150}\n> [{i+1}/{len(품목_리스트)}] {품목명}\n{'='*150}\n')

    # preprocessing
    train_data, scalers = process_data("data/train/train.csv", "data/train/meta/TRAIN_산지공판장_2018-2021.csv", "data/train/meta/TRAIN_전국도매_2018-2021.csv", 품목명)
    품목별_scalers[품목명] = scalers
    dataset = AgriculturePriceDataset(train_data)

    # train, validation split
    tr_data, val_data = train_test_split(dataset, test_size=CFG.test_size, random_state=CFG.seed, shuffle=True)
    train_loader = DataLoader(tr_data, CFG.batch_size, shuffle=True)
    val_loader = DataLoader(val_data, CFG.batch_size, shuffle=False)

    # define model
    model = PricePredictionLSTM(
        input_size=len(dataset.numeric_columns),
        hidden_size=CFG.hidden_size,
        num_layers=CFG.num_layers,
        output_size=CFG.output_size,
    )
    criterion = nn.L1Loss()
    optimizer = torch.optim.AdamW(model.parameters(), CFG.learning_rate)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10)
    # scheduler = None

    price_scaler = 품목별_scalers[품목명][dataset.price_column]
    inverse_transform = lambda x: minmax_inverse_transform(x, price_scaler)
    # inverse_transform = None

    # train
    best_model = train(
        model, optimizer, train_loader, val_loader, CFG.epoch,
        early_stopping=True, early_stopping_patience=100, early_stopping_verbose=False,
        device='cpu', scheduler=scheduler, metric_period=50, 
        verbose=True, save_model_path=model_path,
        inverse_transform=inverse_transform,
    )


> [1/10] 건고추

*[0050/4048] tr_loss: 219557.3828, val_loss: 173134.7188, best: 173134.7188(50), elapsed: 3.7s, total: 3.7s, remaining: 295.4s
*[0100/4048] tr_loss: 200742.0938, val_loss: 157930.4219, best: 157930.4219(100), elapsed: 3.6s, total: 7.4s, remaining: 287.2s
*[0150/4048] tr_loss: 178983.7578, val_loss: 134584.3750, best: 134584.3750(150), elapsed: 3.6s, total: 11.0s, remaining: 282.9s
*[0200/4048] tr_loss: 128323.0234, val_loss: 88125.6875, best: 88125.6875(200), elapsed: 3.7s, total: 14.7s, remaining: 282.5s
*[0250/4048] tr_loss: 104030.4570, val_loss: 71600.9297, best: 71600.9297(250), elapsed: 3.9s, total: 18.7s, remaining: 296.8s
*[0300/4048] tr_loss: 104834.4922, val_loss: 70233.2812, best: 70233.2812(300), elapsed: 4.4s, total: 23.1s, remaining: 327.8s
*[0350/4048] tr_loss: 104610.1406, val_loss: 68814.8750, best: 68814.8750(350), elapsed: 4.4s, total: 27.5s, remaining: 325.9s
*[0400/4048] tr_loss: 102005.7617, val_loss: 67076.5547, best: 67076.5547(400), elapsed: 3.9s

In [10]:
def predict(best_model, loader, device, inverse_transform):
    best_model.to(device)
    best_model.eval()
    
    true_list = []
    pred_list = []
    with torch.no_grad():
        for data,label in loader:
            data = data.float().to(device)

            output = best_model(data)
            output = inverse_transform(output)
            output = output.cpu().numpy().tolist()

            label  = inverse_transform(label)
            label = label.cpu().numpy().tolist()

            true_list += label
            pred_list += output

    return true_list, pred_list

def inference(best_model, loader, device, inverse_transform):
    best_model.to(device)
    best_model.eval()
    
    true_list = []
    pred_list = []
    with torch.no_grad():
        for data in loader:
            data = data.float().to(device)

            output = best_model(data)
            output = inverse_transform(output)
            output = output.cpu().numpy().tolist()

            pred_list += output

    return pred_list

In [11]:
# true, pred = predict(best_model, train_loader, device='cpu', inverse_transform=inverse_transform)
# print(criterion(torch.tensor(true), torch.tensor(pred)).item())
# true[:5], pred[:5]

In [19]:
pbar_outer = tqdm(품목_리스트, position=0)
for 품목명 in pbar_outer:
    pbar_outer.set_description(품목명)
    model_path = f'models/best_model_{품목명}.pth'

    # preprocessing
    train_data, scaler = process_data("data/train/train.csv", "data/train/meta/TRAIN_산지공판장_2018-2021.csv", "data/train/meta/TRAIN_전국도매_2018-2021.csv", 품목명)
    품목별_scalers[품목명] = scaler
    dataset = AgriculturePriceDataset(train_data)

    # train, validation split
    tr_data, val_data = train_test_split(dataset, test_size=CFG.test_size, random_state=CFG.seed)
    train_loader = DataLoader(tr_data, CFG.batch_size, shuffle=True)
    val_loader = DataLoader(val_data, CFG.batch_size, shuffle=False)

    # define model
    model = PricePredictionLSTM(
        input_size=len(dataset.numeric_columns),
        hidden_size=CFG.hidden_size,
        num_layers=CFG.num_layers,
        output_size=CFG.output_size,
    )
    model.load_state_dict(torch.load(model_path))

    # inference
    품목_predictions = []
    pbar_inner = tqdm(range(25), desc="테스트 파일 추론 중", position=1, leave=False)
    for i in pbar_inner:
        test_file = f"data/test/TEST_{i:02d}.csv"
        산지공판장_file = f"data/test/meta/TEST_산지공판장_{i:02d}.csv"
        전국도매_file = f"data/test/meta/TEST_전국도매_{i:02d}.csv"
        
        test_data, _ = process_data(test_file, 산지공판장_file, 전국도매_file, 품목명, scalers=품목별_scalers[품목명])
        test_dataset = AgriculturePriceDataset(test_data, is_test=True)
        test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

        predictions = inference(model, test_loader, device='cpu', inverse_transform=inverse_transform)
        predictions_array = np.concatenate(predictions)

        # 예측값을 원래 스케일로 복원
        price_column_index = test_data.columns.get_loc(test_dataset.price_column)
        predictions_reshaped = predictions_array.reshape(-1, 1)
        
        # 가격 열에 대해서만 inverse_transform 적용
        price_scaler = 품목별_scalers[품목명][test_dataset.price_column]
        predictions_original_scale = price_scaler.inverse_transform(predictions_reshaped)
        #print(predictions_original_scale)
        
        if np.isnan(predictions_original_scale).any():
            pbar_inner.set_postfix({"상태": "NaN"})
        else:
            pbar_inner.set_postfix({"상태": "정상"})
            품목_predictions.extend(predictions_original_scale.flatten())

    품목별_predictions[품목명] = 품목_predictions

  0%|          | 0/10 [00:00<?, ?it/s]

테스트 파일 추론 중:   0%|          | 0/25 [00:00<?, ?it/s]

테스트 파일 추론 중:   0%|          | 0/25 [00:00<?, ?it/s]

테스트 파일 추론 중:   0%|          | 0/25 [00:00<?, ?it/s]

테스트 파일 추론 중:   0%|          | 0/25 [00:00<?, ?it/s]

테스트 파일 추론 중:   0%|          | 0/25 [00:00<?, ?it/s]

테스트 파일 추론 중:   0%|          | 0/25 [00:00<?, ?it/s]

테스트 파일 추론 중:   0%|          | 0/25 [00:00<?, ?it/s]

테스트 파일 추론 중:   0%|          | 0/25 [00:00<?, ?it/s]

테스트 파일 추론 중:   0%|          | 0/25 [00:00<?, ?it/s]

테스트 파일 추론 중:   0%|          | 0/25 [00:00<?, ?it/s]

# Prepare Submission File

In [20]:
sample_submission = pd.read_csv('data/sample_submission.csv')

for 품목명, predictions in 품목별_predictions.items():
    sample_submission[품목명] = predictions

# 결과 저장
sample_submission.to_csv('out/baseline_submission_2.csv', index=False)

In [21]:
sample_submission

Unnamed: 0,시점,감자,건고추,깐마늘(국산),대파,무,배추,사과,상추,양파,배
0,TEST_00+1순,39402.344386,825.519388,40437.412822,1993.361973,183770.685105,3902.466552,440294.429588,12291.798734,9887.388630,20384.453007
1,TEST_00+2순,39478.646135,838.424068,42068.813764,1930.211177,181745.858206,3513.722772,427138.775887,10387.277291,10864.845885,19815.690907
2,TEST_00+3순,38884.611926,860.372464,41275.555334,1919.129537,177245.254816,3121.041671,431831.256138,9567.529182,8834.245243,19423.659943
3,TEST_01+1순,45454.679849,421263.080302,125303.177261,2894.849287,10568.380954,15688.484209,17209.832390,1104.576767,1184.401613,20981.862262
4,TEST_01+2순,45910.652578,416158.069756,130844.609961,2975.710887,11417.501638,13759.076211,16779.472052,1009.536278,1269.853558,21072.462074
...,...,...,...,...,...,...,...,...,...,...,...
70,TEST_23+2순,44761.402274,463844.473299,130664.061836,2370.184123,9777.872170,9966.538805,17140.386976,987.677060,1253.188329,20903.566730
71,TEST_23+3순,43942.855160,468823.511117,128008.274095,2508.367281,8440.847760,9403.988863,17345.433497,1001.708176,1084.844786,20741.441249
72,TEST_24+1순,44907.821412,444148.625659,105376.220160,2551.209933,16328.456249,22419.594134,20870.917773,1225.121626,832.276808,27396.264443
73,TEST_24+2순,42038.049009,445259.760441,111785.083052,2493.045818,16071.094975,19880.580412,20688.325333,1266.507841,673.872516,27363.765844


In [None]:
# sample_submission = pd.read_csv('out/baseline_submission.csv')
# sample_submission.head()