In [75]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV,  RandomizedSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix, f1_score, roc_auc_score, mean_absolute_error
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.utils.class_weight import compute_class_weight
from scipy.stats import randint
from bayes_opt import BayesianOptimization
from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from tensorflow.keras.layers import Conv1D, RNN, GRU, LSTM, Dense, Input, BatchNormalization, Dropout, concatenate, Flatten, add, MaxPool1D, RepeatVector
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.losses import Huber 
from tensorflow.keras.regularizers import L1
from tensorflow.keras.metrics import AUC
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from tensorflow import one_hot
import tensorflow as tf
import os
import pandas as pd
import numpy as np 
import seaborn as sns
import statsmodels.api as sm
import matplotlib.pyplot as plt
import time
import math
import random
import warnings
import pickle

In [2]:
warnings.filterwarnings('ignore')

# 함수 정의

In [None]:
def mae_cv(model, df):
    """Expanding-window time-series cross-validation scored by MAE.

    The frame is split on its ``year`` labels ('A'..'F') and ``month``. Fold
    ``k`` (k = 0..3) trains on every row of the first ``k + 1`` years plus
    month 1 of the following year, and evaluates on the remaining months of
    that following year plus month 1 of the year after it.

    Parameters
    ----------
    model : estimator exposing ``fit(x, y)`` and ``predict(x)``; the same
        object is re-fitted on every fold.
    df : DataFrame accepted by ``make_xy`` that also has ``year`` and
        ``month`` columns.

    Returns
    -------
    The mean of the four per-fold mean absolute errors.
    """
    years = ['A', 'B', 'C', 'D', 'E', 'F']
    is_january = df['month'] == 1
    fold_scores = []
    for fold in range(4):
        seen_years = years[:fold + 1]
        in_next_year = df['year'] == years[fold + 1]
        in_year_after = df['year'] == years[fold + 2]

        train_mask = df['year'].isin(seen_years) | (in_next_year & is_january)
        test_mask = (in_next_year & ~is_january) | (in_year_after & is_january)

        x_fit, y_fit = make_xy(df[train_mask])
        x_eval, y_eval = make_xy(df[test_mask])
        model.fit(x_fit, y_fit)
        fold_scores.append(mean_absolute_error(y_eval, model.predict(x_eval)))
    return np.mean(fold_scores)

In [4]:
def make_weather_df(df:pd.DataFrame, weather:str):
    """Return the rows of ``df`` whose ``month`` belongs to the given season.

    Season definitions used here:
    spring = months 2-4, summer = 5-7, fall = 8-10, winter = 11, 12 and 1.

    Parameters
    ----------
    df : DataFrame with a numeric ``month`` column. It is not modified.
    weather : one of ``'spring'``, ``'summer'``, ``'fall'``, ``'winter'``.

    Returns
    -------
    A new DataFrame holding the matching rows in their original order, with a
    fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``weather`` is not one of the four supported season names.
    """
    season_months = {
        'spring': range(2, 5),
        'summer': range(5, 8),
        'fall': range(8, 11),
        'winter': [11, 12, 1],
    }
    if weather not in season_months:
        # Fail with a clear message instead of an unbound-variable error.
        raise ValueError(
            f"unknown season {weather!r}; expected one of {sorted(season_months)}"
        )
    m_list = season_months[weather]
    # Boolean indexing already yields a new frame, so no explicit copy is needed.
    return df[df['month'].isin(m_list)].reset_index(drop = True)

In [5]:
def make_xy(df:pd.DataFrame):
    """Split a frame into a feature matrix and the ``land_temp`` target.

    Features are every column of ``df`` except ``area``, ``year``, ``month``,
    ``week``, ``day``, ``hour``, ``fog`` and ``land_temp``, followed by
    one-hot columns for the month/week key (``'m<month>w<week>'``) and for
    ``fog``. The month/week dummies cover only the combinations present in
    ``df``; the fog dummies always cover the same seven codes.

    NOTE(review): ``hour`` is discarded without being encoded - confirm that
    this is intended.

    Parameters
    ----------
    df : DataFrame containing at least the columns named above. It is not
        modified.

    Returns
    -------
    (x, y) : the feature DataFrame and a one-column ``land_temp`` DataFrame,
        both keeping the index of ``df``.
    """
    fog_levels = ['C', 'F', 'G', 'H', 'R', 'S', 'X']
    non_features = ['area', 'year', 'month', 'week', 'day', 'land_temp']

    month_week = 'm' + df['month'].astype(str) + 'w' + df['week'].astype(str)
    feats = df.assign(mmwk = month_week).drop(columns = non_features)
    # Fixed categories keep the fog dummy columns identical across subsets.
    feats['fog'] = pd.Categorical(feats['fog'], categories = fog_levels)

    encoded = [pd.get_dummies(feats[col]) for col in ('mmwk', 'fog')]
    feats = pd.concat([feats, *encoded], axis = 1).drop(columns = ['mmwk', 'hour', 'fog'])
    return feats, df[['land_temp']]

In [77]:
def make_x(df:pd.DataFrame):
    """Build the feature matrix for data without a target column (test data).

    Keeps every column of ``df`` except ``area``, ``year``, ``month``,
    ``week``, ``day``, ``hour`` and ``fog``, and appends one-hot columns for
    the month/week key (``'m<month>w<week>'``) and for ``fog``. The month/week
    dummies cover only the combinations present in ``df``; the fog dummies
    always cover the same seven codes.

    NOTE(review): ``hour`` is discarded without being encoded - confirm that
    this is intended.

    Parameters
    ----------
    df : DataFrame containing at least the columns named above. It is not
        modified.

    Returns
    -------
    The feature DataFrame, keeping the index of ``df``.
    """
    fog_levels = ['C', 'F', 'G', 'H', 'R', 'S', 'X']
    non_features = ['area', 'year', 'month', 'week', 'day']

    month_week = 'm' + df['month'].astype(str) + 'w' + df['week'].astype(str)
    feats = df.assign(mmwk = month_week).drop(columns = non_features)
    # Fixed categories keep the fog dummy columns identical across subsets.
    feats['fog'] = pd.Categorical(feats['fog'], categories = fog_levels)

    encoded = [pd.get_dummies(feats[col]) for col in ('mmwk', 'fog')]
    return pd.concat([feats, *encoded], axis = 1).drop(columns = ['mmwk', 'hour', 'fog'])

In [78]:
def make_lagged(df:pd.DataFrame, columns:list, lag:int):
    """Append lagged copies of ``columns`` to ``df``.

    For every ``k`` in ``1..lag`` and every name ``c`` in ``columns`` a column
    ``'<c>-<k>'`` is added that holds the value of ``c`` from ``k`` rows
    earlier. The first ``lag`` rows, which have no complete history, are
    removed, so the result has ``len(df) - lag`` rows.

    Rows are addressed by position, so the index labels of ``df`` do not
    matter; the caller is responsible for passing the rows in time order.

    Parameters
    ----------
    df : source DataFrame. It is not modified.
    columns : list of column names to lag.
    lag : number of lag steps, ``0 <= lag <= len(df)``.

    Returns
    -------
    A new DataFrame with a fresh 0..n-1 index: the original columns first,
    then all lag-1 columns, then all lag-2 columns, and so on.

    Raises
    ------
    ValueError
        If ``lag`` is negative or larger than the number of rows.
    """
    n_rows = len(df)
    if lag < 0:
        raise ValueError(f"lag must be non-negative, got {lag}")
    if lag > n_rows:
        raise ValueError(f"lag ({lag}) exceeds the number of rows ({n_rows})")

    source = df[columns]
    pieces = [df.iloc[lag:].reset_index(drop = True)]
    for step in range(1, lag + 1):
        # Rows lag-step .. n_rows-step-1 line up with output rows 0 .. n_rows-lag-1.
        # Slicing (instead of shift) introduces no NaN and keeps the dtypes.
        shifted = source.iloc[lag - step:n_rows - step].reset_index(drop = True)
        shifted.columns = [f'{name}-{step}' for name in shifted.columns]
        pieces.append(shifted)
    # A single concat avoids re-copying the growing frame on every step.
    return pd.concat(pieces, axis = 1)

In [79]:
def repeat_lagged(df:pd.DataFrame, columns:list, lag:int):
    """Build lagged features separately for every ``area`` and stack them.

    ``make_lagged`` is applied to each area's rows on their own, so lagged
    values never come from a different area.

    Parameters
    ----------
    df : DataFrame with an ``area`` column.
    columns : list of column names to lag.
    lag : number of lag steps passed on to ``make_lagged``.

    Returns
    -------
    The per-area results concatenated in order of first appearance of each
    area, with a fresh 0..n-1 index.
    """
    per_area = [
        make_lagged(df[df['area'] == area_id].reset_index(drop = True), columns, lag)
        for area_id in df.area.unique()
    ]
    return pd.concat(per_area).reset_index(drop = True)

# 데이터

In [9]:
# Directory prefix (including the trailing slash) for the input data files.
path = 'data/'

In [10]:
# Load the training table; the first CSV column is an index that is read and
# then discarded in favour of a fresh 0..n-1 index.
train = (
    pd.read_csv(path+'train0624.csv', index_col = 0)
    .reset_index(drop = True)
    .astype({'month': int, 'day': int, 'hour': int, 'area': int})
)
# Split each month into three parts by day of month: 1 = days below 11,
# 2 = days below 21, 3 = the rest.
train['week'] = np.select([train['day'] < 11, train['day'] < 21], [1, 2], default = 3)

In [11]:
# -99.9 is treated as a missing-value marker. For each (column, fog code) pair,
# a -99.9 reading becomes 0 unless the row's fog code equals the paired code.
# NOTE(review): presumably the paired code flags the matching phenomenon
# (e.g. 'S' for snow) - confirm against the data description.
for column, fog_code in (('snow', 'S'), ('solar_time', 'C'), ('solar_amt', 'C'),
                         ('rain', 'R'), ('tf_rain', 'R')):
    train[column] = np.where((train[column] == -99.9) & (train['fog'] != fog_code), 0, train[column])

# Any -99.9 still left, in any column, becomes NaN ...
train = train.replace(-99.9, np.nan)
# ... except in the two solar columns, where missing is stored as 0.
train = train.fillna({'solar_amt': 0, 'solar_time': 0})

# Negative tf_rain is floored at 0 before dividing by 60; humid is divided by 100.
train['tf_rain'] = train['tf_rain'].mask(train['tf_rain'] < 0, 0) / 60
train['humid'] = train['humid'] / 100

In [12]:
# Load the test table with a fresh 0..n-1 index and integer calendar columns.
test = (
    pd.read_csv(path+'imputed_test_data0624.csv')
    .reset_index(drop = True)
    .astype({'month': int, 'day': int, 'hour': int})
)
# Area codes 1/2/3 are relabelled to the letters 'a'/'b'/'c'.
test['area'] = test['area'].replace({1: 'a', 2: 'b', 3: 'c'})
# Split each month into three parts by day of month: 1 = days below 11,
# 2 = days below 21, 3 = the rest.
test['week'] = np.select([test['day'] < 11, test['day'] < 21], [1, 2], default = 3)

In [13]:
# -99.9 is treated as a missing-value marker. For each (column, fog code) pair,
# a -99.9 reading becomes 0 unless the row's fog code equals the paired code.
# NOTE(review): presumably the paired code flags the matching phenomenon
# (e.g. 'S' for snow) - confirm against the data description.
for column, fog_code in (('snow', 'S'), ('solar_time', 'C'), ('solar_amt', 'C'),
                         ('rain', 'R'), ('tf_rain', 'R')):
    test[column] = np.where((test[column] == -99.9) & (test['fog'] != fog_code), 0, test[column])

# Any -99.9 still left, in any column, becomes NaN ...
test = test.replace(-99.9, np.nan)
# ... except in the two solar columns, where missing is stored as 0.
test['solar_amt'] = test['solar_amt'].fillna(0)
test['solar_time'] = test['solar_time'].fillna(0)

# Fill the remaining gaps by linear interpolation along the row order.
# Only numeric columns are interpolated: calling interpolate() on a frame that
# still contains object columns (e.g. 'fog') is deprecated in recent pandas,
# and those columns were never changed by it anyway.
# NOTE(review): rows of different areas are interpolated as one sequence, so a
# gap at an area boundary borrows values from the neighbouring area - confirm
# this is acceptable.
numeric_cols = test.select_dtypes(include = 'number').columns
test[numeric_cols] = test[numeric_cols].interpolate(method = 'linear')

# Negative tf_rain is floored at 0 before dividing by 60; humid is divided by 100.
test['tf_rain'] = np.where(test['tf_rain'] < 0, 0, test['tf_rain'])/60
test['humid'] = test['humid']/100

In [82]:
# Hour-of-day effect: estimate an additive seasonal component of land_temp
# with a period of 24 rows, separately for each of the areas 1..10.
seasonal = {}
for area in range(1, 11):
    # land_temp of one area over the whole training period, in row order
    area_temp = train.loc[train['area'] == area, 'land_temp'].reset_index(drop = True)
    decomposed = sm.tsa.seasonal_decompose(area_temp, model = 'additive', period = 24)
    # The component repeats every 24 rows, so the first 24 values describe it fully.
    seasonal['area' + str(area)] = decomposed.seasonal[:24]

# One shared 24-value profile: the mean over the ten areas.
season = pd.DataFrame(seasonal).mean(axis = 1)

# Attach the profile value of each row's hour as the 'season' feature; hours
# outside 0..23 get 0.
# NOTE(review): this assumes position k of every area's series is hour k, i.e.
# each series starts at hour 0 with no missing rows - confirm.
for frame in (train, test):
    frame['season'] = np.select(
        [frame['hour'] == hour for hour in range(24)],
        [season[hour] for hour in range(24)],
        default = 0,
    )

In [81]:
# Columns for which lagged copies are added.
cols = ['temp', 'dp_temp', 'humid', 'wind', 'rain', 'tf_rain', 'solar_amt', 'solar_time']
# 24 lag steps per column, built per area, for the training and the test frame.
train_lagged, test_lagged = (repeat_lagged(frame, cols, 24) for frame in (train, test))

# 모델 선택

In [16]:
def cat_cv(depth, random_strength, bagging_temperature, l2_leaf_reg, border_count):
    """Objective function for the Bayesian optimisation of CatBoost.

    The optimiser supplies every argument as a float. ``depth`` and
    ``border_count`` are truncated to integers, ``random_strength`` and
    ``bagging_temperature`` are interpreted as base-10 exponents, and
    ``l2_leaf_reg`` is used as given.

    Returns
    -------
    The negated ``mae_cv`` score of the resulting model on ``train_lagged``,
    so that maximising this function minimises the cross-validated MAE.
    """
    params = {
        'depth': int(depth),
        'random_strength': 10 ** random_strength,
        'bagging_temperature': 10 ** bagging_temperature,
        'l2_leaf_reg': l2_leaf_reg,
        'border_count': int(border_count),
    }
    candidate = CatBoostRegressor(random_state = 0, verbose = False, loss_function = "MAE", **params)
    return -mae_cv(candidate, train_lagged)
    
# Search bounds for cat_cv. random_strength and bagging_temperature are
# searched as base-10 exponents.
cat_param_ranges = {
    'depth': (1, 10),
    'random_strength': (-5, 3),
    'bagging_temperature': (-5, 5),
    'l2_leaf_reg': (1, 500),
    'border_count': (1, 1024),
}

BO = BayesianOptimization(f = cat_cv, pbounds = cat_param_ranges, random_state = 0)
start = time.time()
time_limit = 3600*3  # time budget for the optimisation, in seconds
# Each pass adds one random point and one guided step; the budget is only
# checked between passes, so the last pass may run past the limit.
while time.time() - start < time_limit:
    BO.maximize(init_points = 1, n_iter = 1)

# Convert the best point found into CatBoost arguments, applying the same
# transformations as cat_cv.
best_params = dict(BO.max['params'])
best_params.update(
    depth = int(best_params['depth']),
    border_count = int(best_params['border_count']),
    bagging_temperature = 10**best_params['bagging_temperature'],
    random_strength = 10**best_params['random_strength'],
)

# Refit on the full lagged training frame and report the in-sample MAE.
best_model = CatBoostRegressor(random_state = 0, verbose = False, loss_function = "MAE", **best_params)
x_train, y_train = make_xy(train_lagged)
_ = best_model.fit(x_train, y_train)
y_pred = best_model.predict(x_train)
print("CAT train MAE:", round(mean_absolute_error(y_train, y_pred), 3))

|   iter    |  target   | baggin... | border... |   depth   | l2_lea... | random... |
-------------------------------------------------------------------------------------
| [0m1        [0m | [0m-1.561   [0m | [0m0.4881   [0m | [0m732.6    [0m | [0m6.425    [0m | [0m272.9    [0m | [0m-1.611   [0m |
| [0m2        [0m | [0m-1.766   [0m | [0m-0.6712  [0m | [0m456.1    [0m | [0m1.083    [0m | [0m383.1    [0m | [0m1.104    [0m |
|   iter    |  target   | baggin... | border... |   depth   | l2_lea... | random... |
-------------------------------------------------------------------------------------
| [95m3        [0m | [95m-1.533   [0m | [95m1.459    [0m | [95m448.7    [0m | [95m9.026    [0m | [95m481.9    [0m | [95m-1.932   [0m |
| [0m4        [0m | [0m-1.55    [0m | [0m-0.7621  [0m | [0m451.2    [0m | [0m7.837    [0m | [0m479.6    [0m | [0m-4.328   [0m |
|   iter    |  target   | baggin... | border... |   depth   | l2_lea... | random.

# 제출용

In [83]:
# Build the feature matrix for the lagged test frame (no target column) and
# store the fitted model's predictions next to the rows they belong to.
x_test = make_x(test_lagged) 
test_lagged['y_pred'] = best_model.predict(x_test) # generate predictions for the test data

In [87]:
# Read only the 'SUMMER' sheet of the submission template instead of parsing
# every sheet of the workbook and discarding all but one.
summer = pd.read_excel(path+'subminssionUser.xlsx', sheet_name='SUMMER')

In [88]:
# Key used to link submission rows to test predictions: station + year + MMDDHH.
summer['idx'] = summer['STN'] + summer['YEAR'] + summer['MMDDHH'].astype(str)
# The same key rebuilt from the test columns. Day and hour are zero-padded to
# two digits; month is left unpadded.
# NOTE(review): this matches only if MMDDHH was read as a number (no leading
# zero) - confirm.
test_lagged['idx'] = (
    test_lagged['area']
    + test_lagged['year']
    + test_lagged['month'].astype(str)
    + test_lagged['day'].astype(str).str.zfill(2)
    + test_lagged['hour'].astype(str).str.zfill(2)
)

In [89]:
# Left join so the result keeps exactly the rows and order of the submission
# template. Any 'y_pred' left over from a previous run of this cell is dropped
# first, so re-running does not create 'y_pred_x'/'y_pred_y'. validate= makes
# the join fail loudly if a key occurs more than once in the predictions,
# which would otherwise silently duplicate submission rows.
summer = (
    summer.drop(columns = 'y_pred', errors = 'ignore')
    .merge(test_lagged[['idx', 'y_pred']], on = 'idx', how = 'left', validate = 'many_to_one')
)

In [92]:
# Number of submission rows that did not receive a prediction from the join.
summer['y_pred'].isna().sum() # check whether any NaN values remain

0

In [72]:
# Write the completed submission table to the working directory, without the index.
summer.to_csv('summer_result.csv', index = False)