In [1]:
import numpy as np
import pandas as pd
import os
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, SimpleRNN, GRU, LSTM
from keras.optimizers import SGD

In [5]:
DATA_PATH = "../data"  # Folder that holds the CSV files used for training.
TRAIN_FILE = "train.csv"  # File used for both training and prediction.
# Build the path from the two constants above so that editing TRAIN_FILE
# actually changes which file is loaded (the name used to be hard-coded twice).
df_data = pd.read_csv(os.path.join(DATA_PATH, TRAIN_FILE))

In [10]:
# Reshape the wide table into long format: the seven identifier columns are
# kept as-is and every remaining column is unpivoted into a ('ds', 'y') pair
# (presumably 'ds' is the date taken from the column header and 'y' the value
# for that date -- confirm against train.csv).
m_data = pd.melt(
    df_data,
    id_vars=['ID', '제품', '대분류', '중분류', '소분류', '브랜드', '쇼핑몰'],
    var_name='ds',
    value_name='y',
    ignore_index=True,
)

m_data

Unnamed: 0,ID,제품,대분류,중분류,소분류,브랜드,쇼핑몰,ds,y
0,SAMPLE_00000,B002-00001-00001,B002-C001-0002,B002-C002-0007,B002-C003-0038,B002-00001,S001-00001,2022-01-01,0
1,SAMPLE_00001,B002-00002-00001,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-00002,S001-00001,2022-01-01,0
2,SAMPLE_00002,B002-00002-00002,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-00002,S001-00001,2022-01-01,0
3,SAMPLE_00003,B002-00002-00003,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-00002,S001-00001,2022-01-01,0
4,SAMPLE_00004,B002-00002-00004,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-00002,S001-00010,2022-01-01,0
...,...,...,...,...,...,...,...,...,...
13840221,SAMPLE_28889,B002-03798-00046,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-03798,S001-00001,2023-04-24,0
13840222,SAMPLE_28890,B002-03799-00002,B002-C001-0003,B002-C002-0008,B002-C003-0042,B002-03799,S001-00001,2023-04-24,0
13840223,SAMPLE_28891,B002-03799-00003,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-03799,S001-00001,2023-04-24,0
13840224,SAMPLE_28892,B002-03799-00004,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-03799,S001-00001,2023-04-24,0


In [None]:
def make_train(df):
    """Return the rows of ``df`` grouped by ID and date-ordered within each ID.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format frame containing at least the columns ``'ID'`` and ``'ds'``.

    Returns
    -------
    pandas.DataFrame
        A single frame without the ``'ID'`` column. The blocks of rows appear
        in order of first appearance of each ID in ``df``; inside a block the
        rows are sorted by ``'ds'``. The index is a fresh 0..n-1 range.
    """
    # One groupby pass instead of one boolean mask per ID: the mask approach
    # rescans the whole frame for every ID. sort=False keeps the IDs in the
    # order in which they first appear, like iterating over unique().
    frames = [
        group.drop(columns='ID').sort_values('ds', kind='stable')
        for _, group in df.groupby('ID', sort=False)
    ]

    if not frames:
        # pd.concat refuses an empty list; return an empty frame with the
        # expected columns instead.
        return df.drop(columns='ID').reset_index(drop=True)

    # pd.DataFrame(list_of_frames) does not stack the frames row-wise;
    # pd.concat is the call that joins them into one table.
    return pd.concat(frames, ignore_index=True)
# Run make_train on the long-format frame and display the result.
ds_data = make_train(m_data)
ds_data

In [None]:
# Use the 'ds' column as the index of a new frame. set_index returns a copy
# (and removes 'ds' from the columns), so ds_data itself is left untouched and
# this cell can be re-run safely.
ts_data = ds_data.set_index('ds')
ts_data

In [None]:
# Build the input (X) and output (y) data for the model

def ts_train_test_normalize(ts_data, time_steps, for_periods,
                            train_end='2022-12-31', test_start='2023-01-01'):
    """Split a date-indexed series into scaled sliding-window train/test arrays.

    Parameters
    ----------
    ts_data : pandas.DataFrame
        Frame indexed by date and sorted by that index. Only the FIRST column
        is modelled; it must be numeric.
    time_steps : int
        Number of consecutive observations in each input window.
    for_periods : int
        Number of consecutive observations in each target window.
    train_end : str, optional
        Last index label (inclusive) of the training period.
    test_start : str, optional
        First index label (inclusive) of the test period.

    Returns
    -------
    X_train : numpy.ndarray, shape (n_train_windows, time_steps, 1)
        Scaled input windows taken from the training period.
    y_train : numpy.ndarray, shape (n_train_windows, for_periods)
        Scaled targets: the ``for_periods`` values that follow each window.
    X_test : numpy.ndarray, shape (n_test_windows, time_steps, 1)
        Scaled input windows for the test period. The first window consists of
        the last ``time_steps`` training values.
    sc : sklearn.preprocessing.MinMaxScaler
        Scaler fitted on the training values only; use ``sc.inverse_transform``
        to map model output back to the original units.

    Raises
    ------
    ValueError
        If ``time_steps`` or ``for_periods`` is smaller than 1, or if the
        training period is too short to build a single window.
    """
    from sklearn.preprocessing import MinMaxScaler

    if time_steps < 1 or for_periods < 1:
        raise ValueError("time_steps and for_periods must be positive integers")

    # Windows are built from column 0 only, so the scaler is fitted on that
    # column alone; this keeps sc.inverse_transform usable on the predictions.
    target = ts_data.iloc[:, [0]]

    # training & test split (label-based, both bounds inclusive)
    ts_train = target.loc[:train_end].values
    ts_test = target.loc[test_start:].values
    ts_train_len = len(ts_train)
    ts_test_len = len(ts_test)

    if ts_train_len < time_steps + for_periods:
        raise ValueError(
            "training period has %d rows but time_steps + for_periods = %d rows are required"
            % (ts_train_len, time_steps + for_periods)
        )

    # Scale to [0, 1] using statistics of the training period only.
    sc = MinMaxScaler(feature_range=(0, 1))
    ts_train_scaled = sc.fit_transform(ts_train)

    # Slide over the SCALED training data. The upper bound guarantees that every
    # target slice holds exactly for_periods values, whatever for_periods is.
    X_train = []
    y_train = []
    for i in range(time_steps, ts_train_len - for_periods + 1):
        X_train.append(ts_train_scaled[i - time_steps:i, 0])
        y_train.append(ts_train_scaled[i:i + for_periods, 0])

    # Recurrent layers expect (samples, time steps, features).
    X_train = np.array(X_train).reshape(-1, time_steps, 1)
    y_train = np.array(y_train)

    # The first test window needs the last time_steps training observations,
    # so prepend them to the test period and apply the already-fitted scaler.
    inputs = np.concatenate((ts_train[-time_steps:], ts_test), axis=0)
    inputs = sc.transform(inputs.reshape(-1, 1))

    X_test = [
        inputs[i - time_steps:i, 0]
        for i in range(time_steps, ts_test_len + time_steps - for_periods)
    ]
    # reshape(-1, ...) also copes with an empty list of windows.
    X_test = np.array(X_test).reshape(-1, time_steps, 1)

    return X_train, y_train, X_test, sc


In [None]:
def LSTM_model(X_train, y_train, X_test, sc, epochs=50, batch_size=150):
    """Build, train and apply a two-layer LSTM forecaster.

    Parameters
    ----------
    X_train : array of shape (samples, time_steps, 1)
        Input windows used for fitting.
    y_train : array of shape (samples, n_outputs)
        Targets used for fitting. The width of the output layer is taken from
        this array.
    X_test : array of shape (samples, time_steps, 1)
        Input windows to predict on.
    sc : object with an ``inverse_transform`` method
        Scaler used to map the predictions back to the original units.
    epochs : int, optional
        Number of training epochs passed to ``fit``.
    batch_size : int, optional
        Mini-batch size passed to ``fit``.

    Returns
    -------
    my_LSTM_model
        The fitted Keras model.
    LSTM_prediction
        Predictions for ``X_test`` after ``sc.inverse_transform``.
    """
    # One output unit per forecast step, so the network always matches the
    # targets instead of assuming exactly two steps.
    n_outputs = y_train.shape[1] if len(y_train.shape) > 1 else 1

    # Architecture: two stacked LSTM layers followed by a dense output layer.
    my_LSTM_model = Sequential()
    my_LSTM_model.add(LSTM(units=50,
                           return_sequences=True,
                           input_shape=(X_train.shape[1], 1),
                           activation='tanh'))
    my_LSTM_model.add(LSTM(units=50, activation='tanh'))
    my_LSTM_model.add(Dense(units=n_outputs))

    # Compile. The optimizer comes from the same `keras` package as the layers
    # above; the tf.keras `legacy` optimizer namespace is not available in
    # every TensorFlow/Keras release.
    learning_rate = 0.01
    momentum = 0.9
    nesterov = False
    optimizer = SGD(learning_rate=learning_rate, momentum=momentum, nesterov=nesterov)
    my_LSTM_model.compile(optimizer=optimizer, loss='mean_squared_error')

    # Fit on the training windows.
    my_LSTM_model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=0)

    # Predict on the test windows ...
    LSTM_prediction = my_LSTM_model.predict(X_test)

    # ... and undo the scaling so the result is in the original units.
    LSTM_prediction = sc.inverse_transform(LSTM_prediction)

    return my_LSTM_model, LSTM_prediction

In [None]:
# Train the LSTM and predict on the test windows.
# NOTE(review): X_train, y_train, X_test and sc are not assigned by any cell
# visible in this notebook; presumably they are the return values of
# ts_train_test_normalize(ts_data, ...) -- confirm and add that call (with the
# intended time_steps / for_periods) before this cell so a fresh kernel can run it.
my_LSTM_model, LSTM_prediction = LSTM_model(X_train, y_train, X_test, sc)

In [None]:
# First forecast step of every test window.
y_pred = pd.DataFrame(LSTM_prediction[:, 0])

# Observed values for the same rows. Selecting a row slice together with a
# column label requires .loc. The slice starts at the first day of the test
# period (2023-01-01) because the k-th test window predicts the k-th test row.
y_test = (
    ts_data.loc['2023-01-01':, 'y']
    .iloc[:len(LSTM_prediction)]
    .reset_index(drop=True)
)