# 0. Initialization On Colab

In [None]:
import sys

from google.colab import drive

# Mount Google Drive into the Colab runtime, remounting if it is already mounted.
drive.mount('/content/drive', force_remount=True)

# Drive folder (relative to 'My Drive') that holds this project.
FOLDERNAME = 'Colab Notebooks/21_BigCon'
assert FOLDERNAME is not None, "[!] Enter the foldername."

# Make the project's datasets folder importable.
sys.path.append(
    '/content/drive/My Drive/{}/datasets'.format(FOLDERNAME)
)

# 1. Import Libraries and Set the Working Directory

In [None]:
import matplotlib
%matplotlib inline

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import os
import warnings
# Silence every warning for the rest of the session; note that this also hides
# deprecation notices emitted by the libraries used below.
warnings.filterwarnings("ignore")

In [None]:
# Work from the datasets folder so the relative CSV paths used below resolve.
# NOTE(review): this path omits the 'Colab Notebooks/' prefix that FOLDERNAME
# (used for the sys.path entry at the top of the notebook) contains - confirm
# which of the two locations is the right one.
os.chdir('/content/drive/My Drive/21_BigCon/datasets')

# 2. Load Data

In [None]:
# Load the per-cluster datasets.
cluster0 = pd.read_csv('data_cluster0.csv')
cluster1 = pd.read_csv('data_cluster1.csv')
cluster2 = pd.read_csv('data_cluster2.csv')

for frame in (cluster0, cluster1, cluster2):
  # Discard the index column that was written into the CSV.
  frame.drop(columns='Unnamed: 0', inplace=True)
  # Use 'base_date' (as a string) as the DataFrame index instead of a column.
  frame['base_date'] = frame['base_date'].map(str)
  frame.set_index('base_date', inplace=True)
cluster0

# 3. 시계열 모형을 바탕으로 21년 7~8월의 feature값(X) 예측해 cluster별로 csv 파일로 저장

In [None]:
# GridSearch를 이용해 최적의 p, d, q를 찾아 7~8월 예측값까지 얻어내는 함수 작성
from statsmodels.tsa.arima_model import ARIMA
import itertools

def grid_search_pdq(a):
  '''
  Grid-search the ARIMA order (p, d, q) and forecast the next two periods.

  Every order with p, d, q in {0, 1, 2} is fitted to `a`; the fitted model
  with the lowest AIC is used for the forecast. Orders that cannot be fitted
  or that yield a non-finite AIC are skipped.

  - Input
  1. a : Series holding one feature of one emd (administrative district)
  - Output : [july, august], the two-step-ahead forecasts of the best model
             (July and August 2021 in this notebook)
  - Raises : ValueError when no order in the grid could be fitted to `a`

  NOTE(review): this relies on the legacy `statsmodels.tsa.arima_model.ARIMA`
  results API, in which `forecast()` returns a tuple whose first element is
  the array of point forecasts - confirm against the installed statsmodels.
  '''
  best_aic = None
  best_fit = None
  last_error = None

  # All 27 combinations of p, d and q.
  for order in itertools.product(range(0, 3), repeat=3):
    try:
      fit = ARIMA(a, order=order).fit()
      aic = fit.aic
    except Exception as err:
      # Many orders are simply not estimable for a given series; skip them,
      # but remember the reason in case nothing at all can be fitted.
      last_error = err
      continue

    if not np.isfinite(aic):
      # A NaN/inf AIC cannot be ranked against the other candidates.
      continue

    # '<=' keeps the later order on an exact AIC tie.
    if best_aic is None or aic <= best_aic:
      best_aic, best_fit = aic, fit

  if best_fit is None:
    raise ValueError(
        'grid_search_pdq: no ARIMA(p, d, q) order could be fitted to the series'
    ) from last_error

  # Reuse the already-fitted best model instead of fitting it a second time.
  july, august = best_fit.forecast(steps=2)[0]
  return [july, august]

In [None]:
## Loop
# 1. Forecast the July-August 2021 feature values (X) with the time-series model:
#    for each emd (administrative district) of the cluster
#      for each feature
#        take the series `a` of that feature over base_date
#        apply grid_search_pdq(a) to obtain the 2021-07 and 2021-08 forecasts
#    store the forecasts as one '202107' row and one '202108' row per emd,
#    with the features as columns, and write them to one CSV per cluster.

for cluster_no, cluster in enumerate([cluster0, cluster1, cluster2]):
  emds = list(np.unique(cluster['emd_nm']))
  # NOTE(review): assumes the first three columns are not features - confirm.
  features = list(cluster.columns[3:])

  forecast_rows = []
  for emd in emds:
    emd_rows = cluster[cluster['emd_nm'] == emd]
    july_row = {'emd_nm': emd, 'base_date': '202107'}
    august_row = {'emd_nm': emd, 'base_date': '202108'}
    for feature in features:
      a = emd_rows[feature].astype('float32')
      july_row[feature], august_row[feature] = grid_search_pdq(a)
    forecast_rows.extend([july_row, august_row])

  # Same layout as the frame that used to be grown with DataFrame.append
  # (removed in pandas 2.0): every column of the cluster followed by
  # 'base_date', with the non-forecast columns left empty, and index labels
  # continuing after the rows of the cluster.
  forecasts = pd.DataFrame(
      forecast_rows,
      columns=list(cluster.columns) + ['base_date'],
      index=pd.RangeIndex(len(cluster), len(cluster) + len(forecast_rows)),
  )

  # Output order: all July rows first, then all August rows.
  predicted = pd.concat([
      forecasts[forecasts['base_date'] == '202107'],
      forecasts[forecasts['base_date'] == '202108'],
  ])
  predicted.to_csv('cluster' + str(cluster_no) + '_feature_predicted.csv')
    

# 4. Train data 정제 및 Training

In [None]:
# Reload the forecast feature files written above and drop the columns that
# are not model inputs, so they can be fed to the trained regressors.
cluster0_feature_predicted=pd.read_csv('cluster0_feature_predicted.csv')
cluster1_feature_predicted=pd.read_csv('cluster1_feature_predicted.csv')
cluster2_feature_predicted=pd.read_csv('cluster2_feature_predicted.csv')

for cluster_predicted in [cluster0_feature_predicted,cluster1_feature_predicted,cluster2_feature_predicted]:
  # NOTE(review): this assigns a two-column DataFrame as the index; what index
  # that produces depends on the pandas version - confirm it is the intended
  # (base_date, emd_nm) labelling.
  cluster_predicted.index=cluster_predicted[['base_date','emd_nm']]
  # 'Unnamed: 0' is the index column saved in the CSV; 'em_cnt' and 'em_g' are
  # dropped from the training features as well.
  cluster_predicted.drop(['Unnamed: 0','em_cnt','em_g','base_date'],axis=1,inplace=True)

# One-hot encode the district name.
cluster0_feature_predicted=pd.get_dummies(cluster0_feature_predicted,columns=['emd_nm'])
cluster2_feature_predicted=pd.get_dummies(cluster2_feature_predicted,columns=['emd_nm'])
cluster1_feature_predicted=pd.get_dummies(cluster1_feature_predicted,columns=['emd_nm'])

cluster0_feature_predicted

In [None]:
# Separate the training target (y) of every cluster.
cluster0_y, cluster1_y, cluster2_y = (
    frame['em_g'] for frame in (cluster0, cluster1, cluster2)
)

In [None]:
# Separate the training features (X) of every cluster: one-hot encode the
# district name and remove the 'em_g' and 'em_cnt' columns.
cluster0_x, cluster1_x, cluster2_x = (
    pd.get_dummies(frame, columns=['emd_nm']).drop(columns=['em_g', 'em_cnt'])
    for frame in (cluster0, cluster1, cluster2)
)

## Training for cluster0

In [None]:
# cluster0에 대한 training

import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid explored for the XGBoost regressor.
param = {
    'max_depth': [2, 3, 4],
    'n_estimators': range(300, 600, 100),
    'colsample_bytree': [0.5, 0.7, 1],
    'colsample_bylevel': [0.5, 0.7, 1],
}

# Exhaustive search over the grid with 10-fold cross-validation,
# using all available cores.
model = XGBRegressor()
grid_search = GridSearchCV(model, param, cv=10, n_jobs=-1)

grid_search.fit(cluster0_x, cluster0_y)

In [None]:
# Tabulate the cross-validation results recorded by the grid search.
cv_result = pd.DataFrame(grid_search.cv_results_)
cv_result

In [None]:
# Report the best hyper-parameter combination and its cross-validated score.
print("best params : ", grid_search.best_params_)
print('best score : ', grid_search.best_score_)

In [None]:
# Fit one regressor with fixed hyper-parameters on cluster0 and predict the
# target for the forecast features.
# NOTE(review): presumably these values were copied from the grid-search
# output above - confirm they still match grid_search.best_params_.
model = XGBRegressor(
    max_depth=2,
    n_estimators=400,
    colsample_bytree=1,
    colsample_bylevel=1,
)
model.fit(cluster0_x, cluster0_y)
model.predict(cluster0_feature_predicted)

In [None]:
# Predict the target for the forecast cluster0 features with the fitted grid search.
grid_search.predict(cluster0_feature_predicted)

In [None]:
# Show the cluster0 predictions as an 'em_g' column, labelled with the index
# of the forecast feature frame.
pd.DataFrame(
    grid_search.predict(cluster0_feature_predicted),
    index=cluster0_feature_predicted.index,
    columns=['em_g'],
)

## Training for cluster1

In [None]:
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid explored for the XGBoost regressor.
param = {
    'max_depth': [2, 3, 4],
    'n_estimators': range(300, 600, 100),
    'colsample_bytree': [0.5, 0.7, 1],
    'colsample_bylevel': [0.5, 0.7, 1],
}

# Exhaustive search over the grid with 10-fold cross-validation,
# using all available cores.
model = XGBRegressor()
grid_search = GridSearchCV(model, param, cv=10, n_jobs=-1)

grid_search.fit(cluster1_x, cluster1_y)

# Report the best hyper-parameter combination and its cross-validated score.
print("best params : ", grid_search.best_params_)
print('best score : ', grid_search.best_score_)

In [None]:
# Show the cluster1 predictions as an 'em_g' column, labelled with the index
# of the forecast feature frame.
pd.DataFrame(
    grid_search.predict(cluster1_feature_predicted),
    index=cluster1_feature_predicted.index,
    columns=['em_g'],
)

## Training for cluster2

In [None]:
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid explored for the XGBoost regressor.
param = {
    'max_depth': [2, 3, 4],
    'n_estimators': range(300, 600, 100),
    'colsample_bytree': [0.5, 0.7, 1],
    'colsample_bylevel': [0.5, 0.7, 1],
}

# Exhaustive search over the grid with 10-fold cross-validation,
# using all available cores.
model = XGBRegressor()
grid_search = GridSearchCV(model, param, cv=10, n_jobs=-1)

grid_search.fit(cluster2_x, cluster2_y)

# Report the best hyper-parameter combination and its cross-validated score.
print("best params : ", grid_search.best_params_)
print('best score : ', grid_search.best_score_)

In [None]:
# Show the cluster2 predictions as an 'em_g' column, labelled with the index
# of the forecast feature frame.
pd.DataFrame(
    grid_search.predict(cluster2_feature_predicted),
    index=cluster2_feature_predicted.index,
    columns=['em_g'],
)