In [1]:
import glob
import os
from tqdm import tqdm

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from matplotlib import rc
import seaborn as sns

%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [2]:
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers

In [3]:
# gpu 설정
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

1 Physical GPUs, 1 Logical GPUs


In [4]:
plt.rcParams["font.family"] = 'NanumGothic'

In [5]:
train = pd.read_csv(os.path.join('Data', 'public_data', 'train.csv'))

- public 데이터에는 2020년도만 포함시켜야함
- 2020-09-29 ~ 2020-12-03일 까지

### 데이터 정규화

In [6]:
# 요일은 원핫 인코딩을 적용함, 요일에 대한 의미가 없을 것이라고 판단
train = pd.concat([train, pd.get_dummies(train['요일'])], axis = 1)
train = train.drop(['요일'], axis = 1)

col1 = train.columns[-7:].to_list()
col2 = train.columns[1:-7].to_list()

new_col = ['date'] + col1 + col2
train = train[new_col]

# 0 ~ 1 값으로 정규화 진행
norm = train.iloc[:,8:].max(0)
train.iloc[:,8:] = train.iloc[:,8:]/norm

### 하이퍼파라미터

In [7]:
# device = torch.device("cuda:0") # GPU 사용
# target_n = 21 # 맞춰야하는 품목/품종의 수
# learning_rate = 5e-4 # 학습률
# BATCH_SIZE = 128 # 배치사이즈
# EPOCHS = 50 # 총 eopochs
# teacher_forcing = False # 교사강요 설정
# n_layers = 3 # rnn레이어 층
# dropout = 0.2 # 드롭아웃
window_size = 28 # 인코더 시퀀스 길이
future_size = 28 # 디코더 시퀀스 길이
# hidden_dim = 128 # rnn 히든차원
# save_path = f'./models/best_model.pt' # 모델 저장 경로

### 전처리

In [8]:
x_data = []; y_data = []
for i in range(train.shape[0] - window_size - future_size):
    x = train.iloc[i:i+window_size, 1:].to_numpy()
    y = train.iloc[i+window_size:i+window_size+future_size, 9::2].to_numpy()
#     y_0 = np.zeros([1, y.shape[1]]) # 디코더 첫 입력값? 
    x_data.append(x)
    y_data.append(np.concatenate([y], axis = 0))

In [9]:
x_data = np.array(x_data)
y_data = np.array(y_data)

In [10]:
x_data.shape, y_data.shape

((1677, 28, 49), (1677, 28, 21))

In [11]:
train_test_split = 1
x_train = x_data[:-train_test_split-future_size]
y_train = y_data[:-train_test_split-future_size]
x_val = x_data[-train_test_split:]
y_val = y_data[-train_test_split:]

In [12]:
x_train.shape, y_train.shape, x_val.shape, y_val.shape

((1648, 28, 49), (1648, 28, 21), (1, 28, 49), (1, 28, 21))

### LSTM 모델

- LSTM 입력 포맷
- https://swlock.blogspot.com/2019/04/keras-lstm-understanding-input-and.html

In [13]:
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.callbacks import EarlyStopping

In [14]:
# Build the LSTM model
# return_sequences 이전 정보를 모두 볼지 안볼지 확인하는 옵션
model = Sequential()
model.add(LSTM(28, return_sequences = True, input_shape = (x_train.shape[1], x_train.shape[2])))
model.add(LSTM(64, return_sequences = False))
model.add(Dense(30))
model.add(Dense(21))

In [15]:
model.compile(optimizer='adam', loss = 'mean_absolute_error')

In [16]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 28, 28)            8736      
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                23808     
_________________________________________________________________
dense (Dense)                (None, 30)                1950      
_________________________________________________________________
dense_1 (Dense)              (None, 21)                651       
Total params: 35,145
Trainable params: 35,145
Non-trainable params: 0
_________________________________________________________________


In [19]:
early_stop = EarlyStopping(monitor='loss', patience=2, verbose=1)

In [20]:
model.fit(x_train, y_train, batch_size=1, epochs=200, validation_data=(x_val, y_val), verbose = 1, callbacks = [early_stop])

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 00021: early stopping


<tensorflow.python.keras.callbacks.History at 0x7f3000097fd0>

- loss가 줄어들지 않음
- 여러 배수 실험 필요

In [23]:
x_train.shape,y_train.shape

((1648, 28, 49), (1648, 28, 21))

In [24]:
x_val.shape, y_val.shape

((1, 28, 49), (1, 28, 21))

### test

In [25]:
submission = pd.read_csv('Data/sample_submission.csv')

In [26]:
public_date_list = submission[submission['예측대상일자'].str.contains('2020')]['예측대상일자'].str.split('+').str[0].unique()

In [27]:
public_date_list

array(['2020-09-29', '2020-09-30', '2020-10-01', '2020-10-02',
       '2020-10-03', '2020-10-04', '2020-10-05', '2020-10-06',
       '2020-10-07', '2020-10-08', '2020-10-09', '2020-10-10',
       '2020-10-11', '2020-10-12', '2020-10-13', '2020-10-14',
       '2020-10-15', '2020-10-16', '2020-10-17', '2020-10-18',
       '2020-10-19', '2020-10-20', '2020-10-21', '2020-10-22',
       '2020-10-23', '2020-10-24', '2020-10-25', '2020-10-26',
       '2020-10-27', '2020-10-28', '2020-10-29', '2020-10-30',
       '2020-10-31', '2020-11-01', '2020-11-02', '2020-11-03',
       '2020-11-04', '2020-11-05'], dtype=object)

In [28]:
for date in public_date_list:
    test_df = pd.read_csv(f'Data/public_data/test_files/test_{date}.csv')
    data = pd.read_csv('Data/public_data/train.csv')
    data = pd.concat([data, test_df]).iloc[-window_size:] # 이전 28일을 보고 예측함

    week_day_map = {}
    
    # 요일은 원핫 인코딩을 적용함, 요일에 대한 의미가 없을 것이라고 판단
    data = pd.concat([data, pd.get_dummies(data['요일'])], axis = 1)
    data = data.drop(['요일'], axis = 1)

    col1 = data.columns[-7:].to_list()
    col2 = data.columns[1:-7].to_list()

    new_col = col1 + col2
    data = data[new_col]

    # 0 ~ 1 값으로 정규화 진행
    norm = data.iloc[:,7:].max(0)
    data.iloc[:,7:] = data.iloc[:,7:]/norm
    
    output = model.predict(np.array(data).reshape(1,28,49)) * norm[1::2].to_numpy()
    break
    idx = submission.loc[submission['예측대상일자'].str.contains(date)].index
    submission.loc[idx, '배추_가격(원/kg)':] = output[0]

In [128]:
submission.to_csv('result/01_LSTM.csv', index = False)

- 제출결과 NMAE : 0.40739 점수 나옴
- LSTM 모델 코드를 잘못 짠듯함, 28일 데이터를 보고 2주후 & 4주후 예측하도록 나타나지 않음
- 인풋을 배추, 무, 양파 각각 따로 구성을하고 LSTM 모델을 적용해보는 시도 필요함

- 'forecasting' keyword -> NN이 아닌 모델에서는 반복 수행이 필요함
- Prophet, arima 시도해볼만함