In [63]:
import tensorflow as tf
import numpy as np
import random
import os
import pandas as pd
from tensorflow.keras.layers import LSTM, Dense, Flatten, Dropout, Dropout
from tensorflow.keras.models import Sequential
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from tensorflow.keras.backend import clear_session
import gc

In [2]:
from matplotlib import font_manager, rc
# Register a Korean-capable font so the Korean axis labels used by the plots
# in this notebook can be rendered.
font_path = "C:/Windows/Fonts/gulim.ttc"
if os.path.isfile(font_path):
    font = font_manager.FontProperties(fname=font_path).get_name()
    rc('font', family=font)
else:
    # The font file only exists on Windows installations; keep matplotlib's
    # default font instead of aborting the whole notebook elsewhere.
    print(f"Font file not found: {font_path}; keeping matplotlib's default font.")

In [3]:
def seed_everything(seed: int = 42):
    """Seed every random source this notebook relies on.

    Applies the same seed to Python's ``random`` module, NumPy's global
    generator and TensorFlow's global generator, and exports it as
    ``PYTHONHASHSEED``.

    Args:
        seed: Integer seed shared by all generators.
    """
    # NOTE(review): exporting PYTHONHASHSEED from a running interpreter
    # presumably only affects child processes, not this kernel -- confirm.
    os.environ["PYTHONHASHSEED"] = str(seed)
    for set_seed in (random.seed, np.random.seed, tf.random.set_seed):
        set_seed(seed)

In [4]:
# Seed the random generators before the data is loaded and the models are built below.
seed_everything(42)

In [5]:
# Load the source dataset from an Excel workbook (path is relative to the working directory).
df = pd.read_excel('./data/dataset.xlsx')

In [6]:
# Keep the identifier columns (STDR_YM, CTPRVN_NM, SIGNGU_NM), the feature
# columns, and the target '피보험자수'. The target is listed last:
# inverse_series() re-attaches predictions as the LAST scaled column, so the
# target must stay last once the identifier columns are removed.
df = df[['STDR_YM', 'CTPRVN_NM', 'SIGNGU_NM', 'LTRS_CO', '중견기업수', '중소기업수', 'TOT_ENTRPRS_CO', 'ECNY_CO',
       'RETIRE_CO', '지급액', '수출건수', '수출금액', '수입건수', '수입금액', '무역수지',
       '취득자수', '상실자수', '수급자격신청자수', '수급자격인정자수', '실업급여지급자수', '실업급여지급건수',
       '실업급여지급액', '신규성립사업장수', '소멸사업장수', 'elec_use', 'EMPLY_CO', '피보험자수']]

In [123]:
# Remove EMPLY_CO from the modelling frame; afterwards the target '피보험자수'
# is the last column. Non-mutating and tolerant of an already-dropped column,
# so re-running this cell does not raise KeyError.
df = df.drop(columns='EMPLY_CO', errors='ignore')

In [104]:
def scaling(train, test):
    """Min-max scale ``train`` and ``test`` with a scaler fitted on ``train`` only.

    Args:
        train: DataFrame used to fit the scaler.
        test: DataFrame transformed with the already-fitted scaler.

    Returns:
        Tuple ``(sc, train_sc, test_sc)``: the fitted ``MinMaxScaler`` and the
        scaled frames. The scaled frames keep the original column names but
        get a fresh default index.
    """
    sc = MinMaxScaler()
    train_sc = pd.DataFrame(sc.fit_transform(train), columns=train.columns)
    test_sc = pd.DataFrame(sc.transform(test), columns=test.columns)
    return sc, train_sc, test_sc

In [150]:
def make_dataset(data, label, window_size=6):
    """Turn a feature frame and a label series into sliding-window samples.

    Sample ``k`` consists of rows ``[k, k + window_size)`` of ``data``; its
    target is ``label`` at position ``k + window_size``.

    Args:
        data: DataFrame of features, one row per time step.
        label: Series (or frame) of targets, positionally aligned with ``data``.
        window_size: Number of consecutive rows per sample.

    Returns:
        Tuple ``(features, labels)`` of NumPy arrays. When ``data`` has no more
        than ``window_size`` rows, no (window, target) pair exists; the whole
        frame is then reshaped into ``(-1, window_size, n_columns)`` and the
        label array is empty. With exactly ``window_size`` rows this gives a
        single input window without a target.
    """
    starts = range(len(data) - window_size)
    windows = [np.array(data.iloc[s:s + window_size]) for s in starts]
    targets = [np.array(label.iloc[s + window_size]) for s in starts]
    if len(windows) == 0:
        # Nothing to pair with a target: use the rows themselves as window(s).
        windows = data.values.reshape(-1, window_size, len(data.columns))
    return np.array(windows), np.array(targets)

In [106]:
def split_X_y(df, label_name):
    """Separate a frame into its feature columns and its label column(s).

    Args:
        df: DataFrame containing both features and label.
        label_name: Column name (or names) to treat as the label.

    Returns:
        Tuple ``(features, label)``: ``df`` restricted to every column except
        ``label_name``, and ``df[label_name]``.
    """
    remaining = df.drop(columns=label_name).columns.tolist()
    return df[remaining], df[label_name]

In [121]:
def split_tr_ts(df, train_size, window_size):
    """Split a time-ordered frame into a train part and a test part.

    The test part starts ``window_size`` rows before the end of the train
    part, so the two parts overlap by ``window_size`` rows.

    Args:
        df: Time-ordered data (sliced with ``df[a:b]``).
        train_size: Number of leading rows that form the train part.
        window_size: Number of trailing train rows repeated at the start of
            the test part.

    Returns:
        Tuple ``(train, test)``.
    """
    test_start = train_size - window_size
    return df[:train_size], df[test_start:]

In [127]:
def prepare_data(df, train_size, window_size):
    """Build scaled, windowed train/test arrays for one region's time series.

    Steps: train/test split, min-max scaling fitted on the train part, removal
    of the target column '피보험자수' from the features, and sliding-window
    construction.

    Args:
        df: Time-ordered frame containing the features and '피보험자수'.
        train_size: Number of leading rows used for training.
        window_size: Number of time steps per input window.

    Returns:
        Tuple ``(sc, train_features, test_features, train_label, test_label)``
        where ``sc`` is the fitted scaler and the rest are NumPy arrays.
    """
    target = '피보험자수'

    # Train / test split
    raw_train, raw_test = split_tr_ts(df, train_size, window_size)

    # Scaling
    sc, scaled_train, scaled_test = scaling(raw_train, raw_test)

    # Feature / target split
    train_X, train_y = split_X_y(scaled_train, target)
    test_X, test_y = split_X_y(scaled_test, target)

    # Sliding windows
    train_features, train_label = make_dataset(train_X, train_y, window_size)
    test_features, test_label = make_dataset(test_X, test_y, window_size)

    return sc, train_features, test_features, train_label, test_label

In [109]:
def model_creation(train_features):
    """Create and compile the two-layer LSTM regressor.

    Architecture: LSTM(64, returning sequences) -> LSTM(16) -> Dense(1),
    compiled with the Adam optimizer and mean-squared-error loss.

    Args:
        train_features: Array whose axes 1 and 2 give the input shape of the
            first LSTM layer.

    Returns:
        The compiled ``Sequential`` model (not yet trained).
    """
    window_shape = (train_features.shape[1], train_features.shape[2])

    # Layers are created and added one at a time, in this exact order.
    model = Sequential()
    model.add(
        LSTM(
            64,
            input_shape=window_shape,
            activation='tanh',
            return_sequences=True,
        )
    )
    model.add(LSTM(16, activation='tanh', return_sequences=False))
    model.add(Dense(1))

    model.compile(optimizer='adam', loss='mse')
    return model

In [110]:
def fit_model(model, train_features, train_label, epochs, batch_size):
    """Train ``model`` silently and hand back both the model and its history.

    Args:
        model: Object exposing ``fit``.
        train_features: Training inputs.
        train_label: Training targets.
        epochs: Number of training epochs.
        batch_size: Mini-batch size.

    Returns:
        Tuple ``(model, history)`` where ``history`` is whatever ``model.fit``
        returned.
    """
    fit_options = dict(epochs=epochs, batch_size=batch_size, verbose=0)
    history = model.fit(train_features, train_label, **fit_options)
    return model, history

In [111]:
def modeling(train_features, train_label, epochs=35, batch_size=32):
    """Create a fresh model and train it on the given windows.

    Args:
        train_features: Training input windows, passed to ``model_creation``
            and ``fit_model``.
        train_label: Training targets.
        epochs: Number of training epochs (default 35, the value previously
            hard-coded here).
        batch_size: Mini-batch size (default 32, the value previously
            hard-coded here).

    Returns:
        The trained model; the training history is discarded.
    """
    model = model_creation(train_features)
    model_fit, _ = fit_model(model, train_features, train_label, epochs, batch_size)
    return model_fit

In [112]:
def inverse_series(sc, series, n_features):
    """Undo the scaling of a single scaled column.

    ``series`` is placed as the LAST column of a matrix whose first
    ``n_features`` columns are zeros, and that matrix is passed to
    ``sc.inverse_transform``. Only the last column of the result corresponds
    to ``series``; the other columns are the inverse transform of zeros.

    Args:
        sc: Fitted scaler exposing ``inverse_transform``.
            # assumes the target was the last column when ``sc`` was fitted
        series: NumPy array of scaled values (any shape; flattened to a column).
        n_features: Number of zero-filled columns placed before ``series``.

    Returns:
        Array of shape ``(len(series), n_features + 1)``.
    """
    column = series.reshape(-1, 1)
    filler = np.zeros((len(series), n_features))
    return sc.inverse_transform(np.hstack((filler, column)))

In [113]:
def predict_test(model_fit, test_features, test_label, sc, forecast_flg=False):
    """Predict on the test windows and map the results back to original units.

    Args:
        model_fit: Trained model exposing ``predict``.
        test_features: Input windows; axis 2 is the number of feature columns.
        test_label: Scaled true targets (ignored when ``forecast_flg`` is True).
        sc: Fitted scaler handed to ``inverse_series``.
        forecast_flg: When True there are no true targets to compare against.

    Returns:
        Tuple ``(inv_test, inv_pred)``. ``inv_test`` is ``None`` in forecast
        mode; otherwise both are the arrays returned by ``inverse_series``.
    """
    pred = model_fit.predict(test_features)
    n_features = test_features.shape[2]

    inv_pred = inverse_series(sc, pred, n_features)
    inv_test = None if forecast_flg else inverse_series(sc, test_label, n_features)
    return inv_test, inv_pred

In [114]:
def calc_mse(inv_actual, inv_pred):
    """Print the mean squared error between the last columns of two arrays.

    Args:
        inv_actual: 2-D array whose last column holds the actual values.
        inv_pred: 2-D array whose last column holds the predicted values.
    """
    actual_target = inv_actual[:, -1]
    predicted_target = inv_pred[:, -1]
    print(mean_squared_error(actual_target, predicted_target))

In [115]:
def plot_result(y, inv_pred, train_size):
    """Plot the actual series together with the test-period predictions.

    Args:
        y: Series of actual values over the whole period.
        inv_pred: 2-D array whose last column holds the predictions in
            original units; it must contain exactly ``len(y) - train_size``
            rows because it is indexed at positions ``train_size .. len(y)-1``.
        train_size: Position on the x-axis at which the predictions start.
    """
    # Use a separate name so the caller's argument is not shadowed.
    pred_frame = pd.DataFrame(inv_pred[:, -1].round(0))
    pred_frame.index = list(range(train_size, len(y)))

    plt.figure(figsize=(15, 5))
    plt.plot(y.reset_index(drop=True), label='actual')
    plt.plot(pred_frame, label='prediction')
    plt.xlabel('시간')
    plt.ylabel('피보험자수')
    # The line labels above are only displayed once a legend is drawn.
    plt.legend()
    plt.show()

In [116]:
def evaluate_model(y, inv_actual, inv_pred, train_size, plot=False):
    """Report the test error and optionally plot predictions against actuals.

    Args:
        y: Series of actual values over the whole period (used for plotting).
        inv_actual: 2-D array whose last column holds the actual test values.
        inv_pred: 2-D array whose last column holds the predicted test values.
        train_size: Position at which the test period starts (used for plotting).
        plot: When True, also draw the actual-vs-prediction chart. Defaults to
            False, so by default only the MSE is printed.
    """
    # The MSE helper defined in this notebook is calc_mse; no print_mse exists.
    calc_mse(inv_actual, inv_pred)
    if plot:
        plot_result(y, inv_pred, train_size)

In [152]:
# Train one model per district (SIGNGU_NM) within each province (CTPRVN_NM)
# and collect the predictions into pred_df.
sd = df['CTPRVN_NM'].unique()
train_size = 79
window_size = 6
forecast_flg = True
# Period label written to every output row.
forecast_ym = '2021-06'
# An ordered list (not a set) keeps the output column order deterministic.
pred_columns = ['STDR_YM', 'CTPRVN_NM', 'SIGNGU_NM', '피보험자수']
# Per-district result frames; concatenated once after the loop instead of
# growing pred_df with a concat on every iteration.
pred_rows = []

for v in sd:
    sd_df = df[df['CTPRVN_NM'] == v]
    for vv in sd_df['SIGNGU_NM'].unique():
        # Build the district-level frame: drop the region names and index by period.
        sgg = sd_df[sd_df['SIGNGU_NM'] == vv].drop(['CTPRVN_NM', 'SIGNGU_NM'], axis=1)
        sgg = sgg.set_index('STDR_YM')

        # Prepare the data
        sc, train_features, test_features, train_label, test_label = prepare_data(sgg, train_size, window_size)

        # Build and train the model
        model_fit = modeling(train_features, train_label)

        # Predict
        inv_test, inv_pred = predict_test(model_fit, test_features, test_label, sc, forecast_flg)

        # Evaluate (only possible when true test targets exist)
        if not forecast_flg:
            evaluate_model(sgg['피보험자수'], inv_test, inv_pred, train_size)

        # Store the prediction(s); the last column of inv_pred is the target.
        pred_rows.append(pd.DataFrame({'STDR_YM': forecast_ym,
                                       'CTPRVN_NM': v,
                                       'SIGNGU_NM': vv,
                                       '피보험자수': inv_pred[:, -1].round(0)}))

        # Release the model's resources before the next district is trained.
        gc.collect()
        clear_session()

# pd.concat raises on an empty list, so fall back to an empty, typed-by-name frame.
if pred_rows:
    pred_df = pd.concat(pred_rows, ignore_index=True)
else:
    pred_df = pd.DataFrame(columns=pred_columns)



In [154]:
# Write the forecasts to CSV without the row index, encoded as cp949.
# NOTE(review): cp949 is presumably chosen so the Korean text opens correctly
# in Excel on Korean Windows -- confirm with the consumers of this file.
pred_df.to_csv('./pred_df.csv', encoding='cp949', index=False)

In [155]:
# Display the collected forecasts.
pred_df

Unnamed: 0,CTPRVN_NM,피보험자수,SIGNGU_NM,STDR_YM
0,강원도,44810.0,강릉시,2021-06
1,강원도,5840.0,고성군,2021-06
2,강원도,15973.0,동해시,2021-06
3,강원도,14978.0,삼척시,2021-06
4,강원도,14370.0,속초시,2021-06
...,...,...,...,...
224,충청북도,25528.0,제천시,2021-06
225,충청북도,7438.0,증평군,2021-06
226,충청북도,38859.0,진천군,2021-06
227,충청북도,203520.0,청주시,2021-06
