### 라이브러리 import

In [None]:
import tensorflow as tf
import numpy as np
import random
import os
import pandas as pd
from tensorflow.keras.layers import LSTM, Dense, Flatten, Dropout, Dropout
from tensorflow.keras.models import Sequential
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error
from tensorflow.keras.backend import clear_session
import gc
from tqdm import tqdm

### 한글 폰트

In [None]:
# Configure matplotlib to draw text with the Gulim font so that the Korean
# axis labels used later in this notebook render correctly.
from matplotlib import font_manager, rc
# NOTE: absolute Windows font path; this cell will fail on machines where
# the file does not exist at this location.
font_path = "C:/Windows/Fonts/gulim.ttc"
# Resolve the font family name stored inside the font file.
font = font_manager.FontProperties(fname=font_path).get_name()
# Make that family the default for every figure created afterwards.
rc('font', family=font)

### seed 고정

In [None]:
def seed_everything(seed: int = 42):
    '''
    description:
        Apply one seed to every random source this notebook uses:
        the PYTHONHASHSEED environment variable, Python's `random`,
        NumPy's global RNG and TensorFlow's global RNG.
    Args:
        seed : integer seed shared by all generators (default 42)
    Returns:
        None
    '''
    os.environ["PYTHONHASHSEED"] = str(seed)
    # The remaining seeders all take the seed as their single argument.
    for apply_seed in (random.seed, np.random.seed, tf.random.set_seed):
        apply_seed(seed)

In [None]:
# Fix all random seeds up front so repeated runs of the notebook are comparable.
seed_everything(42)

### Pandas setting 변경

In [None]:
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option('display.max_columns', None)

### 데이터 불러오기

In [None]:
# Load the dataset from the Excel workbook (path is relative to the working directory).
df = pd.read_excel('./data/dataset.xlsx')

In [None]:
# Preview the first rows of the loaded data (notebook cell output).
df.head()

In [None]:
# Keep only the columns used by the pipeline.
# 'VAR16' is deliberately last: prepare_data() uses it as the label and
# inverse_series() restores the label from the LAST scaled column.
# Note that 'VAR7' is not part of this selection.
df = df[['VAR0', 'VAR1', 'VAR2', 'VAR3', 'VAR4', 'VAR5', 
         'VAR6', 'VAR8', 'VAR9', 'VAR11', 'VAR13',
       'VAR17', 'VAR18', 'VAR24', 'VAR25', 'VAR26', 
         'VAR16']]

In [None]:
# Remove 'VAR7' if it is present. The column-selection cell above already
# omits 'VAR7', so a strict drop raises KeyError when cells run in order;
# errors='ignore' makes this cell safe whichever cells ran before it.
# Re-binding (instead of inplace=True) also avoids mutating a frame that
# was itself produced by a column selection.
df = df.drop(columns='VAR7', errors='ignore')

In [None]:
# Display the remaining column names (notebook cell output).
df.columns

In [None]:
# Display the pairwise correlation matrix of the columns.
# NOTE(review): if VAR0/VAR1/VAR2 hold non-numeric values, recent pandas
# versions may require numeric_only=True here — confirm against the data.
df.corr()

### 함수 선언 및 구현

In [None]:
def scaling(train, test):
    '''
    description:
        Fit a MinMaxScaler on the train data only and apply it to both
        train and test. The scaled arrays are wrapped back into DataFrames
        that carry the original column names (the original row index is
        not carried over).
    Args:
        train    : train DataFrame used to fit the scaler
        test     : test DataFrame transformed with the fitted scaler
    Returns:
        sc       : fitted MinMaxScaler
        train_sc : scaled train DataFrame
        test_sc  : scaled test DataFrame
    '''
    sc = MinMaxScaler()

    # fit + transform on train, transform only on test (no leakage from test)
    train_sc = pd.DataFrame(sc.fit_transform(train), columns=train.columns)
    test_sc = pd.DataFrame(sc.transform(test), columns=test.columns)

    # NOTE(review): open question left on this line by the original author —
    # "Are there 229 separate district models, or is a single model trained
    # on all 229 districts at once?"
    return sc, train_sc, test_sc

In [None]:
def make_dataset(data, label, window_size=6):
    '''
    description:
        Turn a feature DataFrame and a label Series into sliding-window
        samples for an LSTM. Window k covers rows [k, k + window_size) of
        `data` and is paired with the label at row k + window_size.
        When no full window-plus-label fits (the forecasting case), the
        whole of `data` is reshaped into windows and the label array is empty.
    Args:
        data        : feature DataFrame (X) to convert
        label       : label Series (y) to convert
        window_size : number of consecutive rows per window
    Returns:
        windows : array of shape (n_samples, window_size, n_columns)
        targets : array of the labels following each window
    '''
    starts = range(len(data) - window_size)  # empty when len(data) <= window_size

    windows = [np.array(data.iloc[s:s + window_size]) for s in starts]
    targets = [np.array(label.iloc[s + window_size]) for s in starts]

    if len(windows) == 0:
        # forecasting: no label row is available, use the rows as-is
        windows = data.values.reshape(-1, window_size, len(data.columns))

    return np.array(windows), np.array(targets)

In [None]:
def split_X_y(df, label_name):
    '''
    description:
        Separate a DataFrame into its feature columns and its label column.
    Args:
        df         : DataFrame to split
        label_name : name of the column to use as the label
    Returns:
        features   : X, every column except the label (original order kept)
        label      : y, the label column
    '''
    remaining = df.drop(columns=label_name).columns
    features = df[list(remaining)]
    label = df[label_name]
    return features, label

In [None]:
def split_tr_ts(df, train_size, window_size):
    '''
    description:
        Split a DataFrame into train and test parts by row position.
        The test part starts `window_size` rows before the train boundary
        so that, after windowing, the first test prediction corresponds to
        row `train_size` and no test row is lost.
    Args:
        df          : DataFrame to split
        train_size  : number of leading rows used for training
        window_size : window length used later when building samples
    Returns:
        train       : rows [0, train_size)
        test        : rows [max(train_size - window_size, 0), end)
    '''
    train = df[:train_size]
    # Clamp at 0: a negative start would silently slice from the END of the
    # frame when train_size < window_size instead of from the beginning.
    test_start = max(train_size - window_size, 0)
    test = df[test_start:]
    return train, test

In [None]:
def prepare_data(df, train_size, window_size):
    '''
    description:
        Run the whole data-preparation pipeline: train/test split,
        min-max scaling, X/y separation on the 'VAR16' label column and
        conversion into windowed arrays.
    Args:
        df             : source DataFrame
        train_size     : number of rows in the train split
        window_size    : window length
    Returns:
        sc             : fitted MinMaxScaler
        train_features : windowed X of the train data
        test_features  : windowed X of the test data
        train_label    : y of the train data
        test_label     : y of the test data
    '''
    label_col = 'VAR16'

    # 1) positional train / test split
    raw_train, raw_test = split_tr_ts(df, train_size, window_size)

    # 2) scaling (scaler is fit on the train part)
    sc, scaled_train, scaled_test = scaling(raw_train, raw_test)

    # 3) X / y separation followed by windowing, per split
    train_X, train_y = make_dataset(*split_X_y(scaled_train, label_col), window_size)
    test_X, test_y = make_dataset(*split_X_y(scaled_test, label_col), window_size)

    return sc, train_X, test_X, train_y, test_y

In [None]:
def model_creation(train_features):
    '''
    description:
        Build and compile the forecasting network: an LSTM(64) that returns
        the full sequence, an LSTM(16) that returns its last output, and a
        single-unit Dense output layer. Compiled with the Adam optimizer
        and MSE loss.
    Args:
        train_features : windowed train X; axes 1 and 2 of its shape give
                         the input shape of the first layer
    Returns:
        model          : compiled (untrained) model
    '''
    window_shape = (train_features.shape[1], train_features.shape[2])

    model = Sequential([
        LSTM(64, input_shape=window_shape, activation='tanh', return_sequences=True),
        LSTM(16, activation='tanh', return_sequences=False),
        Dense(1),
    ])
    model.compile(optimizer='adam', loss='mse')
    return model

In [None]:
def fit_model(model, train_features, train_label, epochs, batch_size):
    '''
    description:
        Train the given model silently (verbose=0) on the train data.
    Args:
        model          : model to train
        train_features : X of the train data
        train_label    : y of the train data
        epochs         : number of epochs
        batch_size     : batch size
    Returns:
        model          : the same model object, after training
        history        : value returned by model.fit
    '''
    fit_options = {'epochs': epochs, 'batch_size': batch_size, 'verbose': 0}
    training_history = model.fit(train_features, train_label, **fit_options)
    return model, training_history

In [None]:
def modeling(train_features, train_label, epochs=35, batch_size=32):
    '''
    description:
        Create a fresh model and train it on the given data.
    Args:
        train_features : X of the train data
        train_label    : y of the train data
        epochs         : number of training epochs (default 35, the value
                         previously hard-coded here)
        batch_size     : training batch size (default 32, the value
                         previously hard-coded here)
    Returns:
        model_fit      : trained model
    '''
    model = model_creation(train_features)
    # The training history is not used by callers of this function.
    model_fit, _history = fit_model(model, train_features, train_label, epochs, batch_size)
    return model_fit

In [None]:
def inverse_series(sc, series, n_features):
    '''
    description:
        Map a scaled label series back to its original scale. The scaler
        expects a full-width matrix, so the series is placed in the LAST
        column behind `n_features` zero-filled placeholder columns before
        inverse_transform is applied. Only the last column of the result
        is meaningful.
    Args:
        sc         : scaler used for scaling
        series     : scaled values (array)
        n_features : number of feature columns that existed when scaling
    Returns:
        array of shape (len(series), n_features + 1) in the original scale
    '''
    placeholder = np.zeros((len(series), n_features))
    label_column = series.reshape(-1, 1)
    full_width = np.hstack([placeholder, label_column])
    return sc.inverse_transform(full_width)

In [None]:
def predict_test(model_fit, test_features, test_label, sc, forecast_flg=False):
    '''
    description:
        Predict on the test windows and convert the results back to the
        original scale.
    Args:
        model_fit     : trained model
        test_features : windowed X of the test data
        test_label    : y of the test data (not used when forecasting)
        sc            : scaler used for scaling
        forecast_flg  : True when forecasting, i.e. no test label exists
    Returns:
        inv_test      : actual values in the original scale, or None when
                        forecast_flg is set
        inv_pred      : predicted values in the original scale
    '''
    n_features = test_features.shape[2]

    scaled_pred = model_fit.predict(test_features)
    inv_pred = inverse_series(sc, scaled_pred, n_features)

    # Actual values can only be restored when labels exist.
    inv_test = None if forecast_flg else inverse_series(sc, test_label, n_features)
    return inv_test, inv_pred

In [None]:
def print_mae(inv_actual, inv_pred):
    '''
    description:
        Print the mean absolute error between actual and predicted values,
        compared on the last column of each array.
    Args:
        inv_actual : actual values (2-D array, label in the last column)
        inv_pred   : predicted values (2-D array, label in the last column)
    Returns:
        None
    '''
    actual_values = inv_actual[:, -1]
    predicted_values = inv_pred[:, -1]
    mae = mean_absolute_error(actual_values, predicted_values)
    print(mae)

In [None]:
def plot_result(y, inv_pred, train_size):
    '''
    description:
        Plot the full actual series together with the predictions.
        Predictions are taken from the last column of `inv_pred`, rounded,
        and positioned on the x axis from `train_size` to len(y) - 1.
    Args:
        y          : full y series (train + test)
        inv_pred   : predicted values (2-D array, label in the last column)
        train_size : number of rows in the train data
    Returns:
        None
    '''
    inv_pred = pd.DataFrame(inv_pred[:, -1].round(0))
    # Align predictions with the test region of the actual series.
    inv_pred.index = list(range(train_size, len(y)))

    plt.figure(figsize=(15, 5))
    plt.plot(y.reset_index(drop=True), label='actual')
    plt.plot(inv_pred, label='prediction')
    plt.xlabel('시간')
    plt.ylabel('VAR16')
    # The label= arguments above are only rendered once a legend is requested.
    plt.legend()
    plt.show()

In [None]:
def evaluate_model(y, inv_actual, inv_pred, train_size):
    '''
    description:
        Evaluate a prediction: print its MAE and plot it against the
        actual series.
    Args:
        y          : full y series
        inv_actual : test label values
        inv_pred   : predicted values
        train_size : size of the train data
    Returns:
        None
    '''
    print_mae(inv_actual, inv_pred)
    plot_result(y, inv_pred, train_size)

In [None]:
# Train one model per (VAR1, VAR2) group and either evaluate it on the
# held-out rows or store its forecast, depending on forecast_flg.
sd = df['VAR1'].unique()
train_size = 79
window_size = 6
forecast_flg = True
# A list (not a set) keeps the column order deterministic; sets are
# unordered and are rejected as `columns` by recent pandas versions.
pred_columns = ['VAR0', 'VAR1', 'VAR2', 'VAR16']
pred_df = pd.DataFrame(columns=pred_columns)
pred_rows = []  # per-group forecast frames, concatenated once after the loop

# Wrap the array itself so tqdm knows the total number of iterations.
for v in tqdm(sd):
    sd_df = df[df['VAR1'] == v]
    for vv in sd_df['VAR2'].unique():
        # Build the frame for one district (VAR2 within VAR1).
        sgg = sd_df[sd_df['VAR2'] == vv].drop(['VAR1', 'VAR2'], axis=1)
        sgg = sgg.set_index('VAR0')

        # Data preparation
        sc, train_features, test_features, train_label, test_label = prepare_data(sgg, train_size, window_size)

        # Model creation and training
        model_fit = modeling(train_features, train_label)

        # Prediction
        inv_test, inv_pred = predict_test(model_fit, test_features, test_label, sc, forecast_flg)

        if not forecast_flg:
            # Performance evaluation
            evaluate_model(sgg['VAR16'], inv_test, inv_pred, train_size)
        else:
            # Keep the forecast for this district.
            pred_rows.append(pd.DataFrame({'VAR0': '2022-06',
                                           'VAR1': v,
                                           'VAR2': vv,
                                           'VAR16': inv_pred[:, -1].round(0)}))

        # Release the per-district model before building the next one.
        gc.collect()
        clear_session()

# One concat at the end instead of growing pred_df inside the loop.
if pred_rows:
    pred_df = pd.concat(pred_rows, ignore_index=True)[pred_columns]