In [1]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV,  RandomizedSearchCV, cross_val_score, TimeSeriesSplit
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix, f1_score, roc_auc_score, mean_absolute_error
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, VotingRegressor
from sklearn.utils.class_weight import compute_class_weight
from sklearn.pipeline import Pipeline
from sklearn.compose import TransformedTargetRegressor
from scipy.stats import randint
from bayes_opt import BayesianOptimization
from lightgbm import LGBMRegressor

from tensorflow import one_hot
import tensorflow as tf
import os
import pandas as pd
import numpy as np 
import seaborn as sns
import statsmodels.api as sm
import matplotlib.pyplot as plt
import time
import math
import random
import warnings
import itertools

from skopt import BayesSearchCV
from skopt.space import Real, Integer

warnings.filterwarnings('ignore')

## [LGBM] area ensemble
- LGBM Ensemble
- lag 24시간 추가(temp, dp_temp, wind, rain, humid, solar_amt, solar_time)
- 상호작용항 추가
   - 일사량의 누적합과 상대습도, 기온, 풍속
   - 상대습도와 after_rain, tf_rain, dp_temp
   - after_rain과 풍속(wind)

- 3차 상호작용항
    - 기온, 상대습도, 풍속
    - 기온, 일사량, 일조시간
    - 기온, 상대습도, 일사량

- 상호작용항의 lag(1,2,3,24)
    - 대상변수: 'humid_after_rain','humid_tf_rain', 'humid_dp_temp'
    - 대상변수 설명: 상대습도와 after_rain, tf_rain, dp_temp의 상호작용항

- day, month 범주화

#### 1. 함수 정의

In [31]:
warnings.filterwarnings('ignore')
def feature_extraction(original_df:pd.DataFrame):
    df = original_df.copy()
    
    #abnormal value adjustment
    df['solar_amt'] = np.where(surface['solar_amt'] < 0, 0, surface['solar_amt'])
    df['solar_time'] = np.where(surface['solar_time'] < 0, 0, surface['solar_time'])
    df['rain'] = np.where(surface['rain'] < 0, 0, surface['rain'])
    df['tf_rain'] = np.where(surface['tf_rain'] < 0, 0, surface['tf_rain'])/60 #0~1
    df['humid'] = np.where(surface['humid'] < 0, 0, surface['humid'])/100 #0~1
    df['snow'] = np.where(surface['snow'] == -99.9, 0, surface['snow'])
    
    #preprocessing
    df['hour_sin'] = np.sin(2 * np.pi * df['hour']/24.0)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour']/24.0)
    month_days = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
    df['day_cos'] = np.cos(2 * np.pi * df['day'] / df['month'].map(lambda x: month_days[x-1]))
    df['day_sin'] = np.sin(2 * np.pi * df['day'] / df['month'].map(lambda x: month_days[x-1]))
    
    #difference
    df['temp_diff'] = df['temp'] - df['dp_temp']
    
    #interaction
    interaction_cols = ["temp", "dp_temp", "humid", "wind", "solar_amt"]
    interaction_terms = []
    for terms in itertools.combinations(interaction_cols, 2):
        term_name = '_'.join(terms)
        df[term_name] = df[list(terms)].product(axis=1)
    df['temp_dp'] = df['temp']*df['dp_temp']
    
    return df

In [32]:
def process_group(group, wthr):
    group = group.reset_index(drop=True)
    group[wthr+'_diff'] = group[wthr].diff()
    group['after_'+wthr] = 0
    index_list = group[group[wthr+'_diff'] < 0].index
    for index in index_list:
        for i in range(24):
            if index + i < len(group):
                group.loc[index + i, 'after_'+wthr] = i + 1
    return group

def add_lag(original_df:pd.DataFrame, lag_cols: list, pct_cols: list, lags: list):
    # lag_cols: lag 넣을 변수들 목록 
    # pct_cols: percentage change lag 넣을 변수들 목록 
    # lags: 몇 시간 전 값의 lag를 설정할 것인지
    
    df = original_df.copy()
    
    #lag
    for col in pct_cols:
        for i in lags:
            df[col + '_pct_' + str(i)] = df.groupby('area')[col].pct_change(periods=i)
    for col in lag_cols:
        for i in lags:
            df[col + '_lag_' + str(i)] = df.groupby('area')[col].shift(periods=i)

    #cumulative 
    df['rain_mask'] = (df['tf_rain'] != 0).astype(int)
    df['rain_group'] = (df['rain_mask'].diff() < 0).astype(int).cumsum()
    df['rain_count'] = df.groupby(['area', 'rain_group'])['rain_mask'].apply(lambda x: x.cumsum() * x)
    df.drop(['rain_mask', 'rain_group'], axis=1, inplace=True)
    df['solar_amt_sum'] = df.groupby('area')['solar_amt'].rolling(window=3).sum().reset_index(drop=True)
    df = df.groupby('area').apply(process_group, wthr="snow")
    df.drop('snow_diff', axis=1, inplace = True)
    df.reset_index(drop = True, inplace = True)
    df = df.groupby('area').apply(process_group, wthr="rain")
    df.drop('rain_diff', axis=1, inplace = True)
    df.reset_index(drop = True, inplace = True)

    #interaction
    df["temp_after_snow"] = df[["temp", "after_snow"]].product(axis=1)
    df["solar_amt_after_snow"] = df[["solar_amt", "after_snow"]].product(axis=1)
    df["temp_after_rain"] = df[["temp", "after_rain"]].product(axis=1)
    df["solar_amt_after_rain"] = df[["solar_amt", "after_rain"]].product(axis=1)
    
    return df

In [33]:
def make_weather_df(original_df:pd.DataFrame, weather:str, adj:bool, fog_split:dict):
    # adj: snow adjustment 시행여부
    # fog split: fog 변수 어떻게 분할한건지 
    
    df = original_df.copy()
    if adj == True:
        df['solar_amt'] = df['solar_amt'].div(1 + np.abs(df['snow']))
        df['temp'] = df['temp'].div(1 + np.abs(df['snow'])*8)
    if weather == 'spring':
        m_list = range(2, 5)
    elif weather == 'summer':
        m_list = range(5, 8)
    elif weather == 'fall':
        m_list = range(8, 11)
    elif weather == 'winter':
        m_list = [range(11, 13)] + [1] 
    
    df['fog'] = surface['fog'].replace(fog_split)
    df = pd.get_dummies(df, columns = ['fog'])
    
    if weather in ['summer', 'fall']:
        snow_cols = [x for x in df.columns if x.endswith('snow')]
        df.drop(snow_cols, axis = 1, inplace = True)
    
    df = df[df['month'].isin(m_list)].reset_index(drop = True)
    # month_day 범주화
    df['new_day'] = df['day'].apply(lambda x: 1 if x in range(1,8) else (2 if x in range(8,15) else (3 if x in range(15,22) else 4)))
    df['MMDD'] = df['month'].astype('int').astype('str') + '_' + df['new_day'].astype('int').astype('str').str.zfill(2)
    df = pd.get_dummies(df, columns = ['MMDD'])
    df.drop(['new_day'], axis=1, inplace=True)    
    
    return df

In [1]:
# Cross-Validation
from sklearn.model_selection._split import _BaseKFold, indexable, _num_samples
from sklearn.utils.validation import _deprecate_positional_args

class GroupTimeSeriesSplit(_BaseKFold):
    @_deprecate_positional_args
    def __init__(self,
                 n_splits=5,
                 *,
                 max_train_size=None
                 ):
        super().__init__(n_splits, shuffle=False, random_state=None)
        self.max_train_size = max_train_size

    def split(self, X, y=None, groups=None):
        if groups is None:
            raise ValueError(
                "The 'groups' parameter should not be None")
        X, y, groups = indexable(X, y, groups)
        n_samples = _num_samples(X)
        n_splits = self.n_splits
        n_folds = n_splits + 1
        group_dict = {}
        u, ind = np.unique(groups, return_index=True)
        unique_groups = u[np.argsort(ind)]
        n_samples = _num_samples(X)
        n_groups = _num_samples(unique_groups)
        for idx in np.arange(n_samples):
            if (groups[idx] in group_dict):
                group_dict[groups[idx]].append(idx)
            else:
                group_dict[groups[idx]] = [idx]
        if n_folds > n_groups:
            raise ValueError(
                ("Cannot have number of folds={0} greater than"
                 " the number of groups={1}").format(n_folds,
                                                     n_groups))
        group_test_size = n_groups // n_folds
        group_test_starts = range(n_groups - n_splits * group_test_size,
                                  n_groups, group_test_size)
        for group_test_start in group_test_starts:
            train_array = []
            test_array = []
            for train_group_idx in unique_groups[:group_test_start]:
                train_array_tmp = group_dict[train_group_idx]
                train_array = np.sort(np.unique(
                                      np.concatenate((train_array,
                                                      train_array_tmp)),
                                      axis=None), axis=None)
            train_end = train_array.size
            if self.max_train_size and self.max_train_size < train_end:
                train_array = train_array[train_end -
                                          self.max_train_size:train_end]
            for test_group_idx in unique_groups[group_test_start:
                                                group_test_start +
                                                group_test_size]:
                test_array_tmp = group_dict[test_group_idx]
                test_array = np.sort(np.unique(
                                              np.concatenate((test_array,
                                                              test_array_tmp)),
                                     axis=None), axis=None)
            yield [int(i) for i in train_array], [int(i) for i in test_array]

#### 2. train data 전처리

In [62]:
path = 'C:/Users/hyuns/OneDrive - 고려대학교/문서/WeatherContest/data/'
surface = pd.read_csv(path + 'train0624.csv')
surface.drop('Unnamed: 0', axis = 1, inplace = True)
surface = surface.astype({'month':'int', 
                                       'day' : 'int', 
                                       'hour':'int',
                                       'area':'int'})
surface_fe = feature_extraction(original_df = surface) #lag 안한 변수만 추가 
surface_fe 

Unnamed: 0,area,year,month,day,hour,temp,dp_temp,humid,wind,rain,...,temp_humid,temp_wind,temp_solar_amt,dp_temp_humid,dp_temp_wind,dp_temp_solar_amt,humid_wind,humid_solar_amt,wind_solar_amt,temp_dp
0,1,A,2,1,0,-9.9,-10.7,0.939,0.6,0.0,...,-9.2961,-5.94,-0.000000,-10.0473,-6.42,-0.000000,0.5634,0.000000,0.000000,105.93
1,1,A,2,1,1,-10.8,-11.6,0.938,0.6,0.0,...,-10.1304,-6.48,-0.000000,-10.8808,-6.96,-0.000000,0.5628,0.000000,0.000000,125.28
2,1,A,2,1,2,-11.4,-12.1,0.946,0.7,0.0,...,-10.7844,-7.98,-0.000000,-11.4466,-8.47,-0.000000,0.6622,0.000000,0.000000,137.94
3,1,A,2,1,3,-11.6,-12.5,0.934,0.6,0.0,...,-10.8344,-6.96,-0.000000,-11.6750,-7.50,-0.000000,0.5604,0.000000,0.000000,145.00
4,1,A,2,1,4,-11.8,-12.7,0.930,0.6,0.0,...,-10.9740,-7.08,-0.000000,-11.8110,-7.62,-0.000000,0.5580,0.000000,0.000000,149.86
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
438235,10,F,1,31,19,5.7,-7.2,0.392,3.9,0.0,...,2.2344,22.23,1.074818,-2.8224,-28.08,-1.357665,1.5288,0.073917,0.735402,-41.04
438236,10,F,1,31,20,5.2,-8.0,0.381,4.9,0.0,...,1.9812,25.48,0.000000,-3.0480,-39.20,-0.000000,1.8669,0.000000,0.000000,-41.60
438237,10,F,1,31,21,4.6,-7.9,0.398,6.6,0.0,...,1.8308,30.36,0.000000,-3.1442,-52.14,-0.000000,2.6268,0.000000,0.000000,-36.34
438238,10,F,1,31,22,3.6,-6.6,0.474,7.1,0.0,...,1.7064,25.56,0.000000,-3.1284,-46.86,-0.000000,3.3654,0.000000,0.000000,-23.76


In [63]:
# temp, dp_temp, wind, rain, humid, solar_amt, solar_time 1~24 lag 변수 + 기타 lag 변수들과 interaction들 추가
surface_lag = add_lag(original_df = surface_fe, 
                      lag_cols = ['temp', 'dp_temp', 'wind', 'rain', 'humid', 'solar_amt', 'solar_time'], #lag 추가하는 변수들 
                      pct_cols = [], #percetnage change 추가하는 변수들 
                      lags = range(1,25)) #추가 lag들 
#fall: 맒음 / 안개 / 기타 구분 & snow에 대한 adjustment x 
f = {x:3 for x in surface.fog.unique()} 
f.update({'C': 1, 'X':1, 'F':2})
fall = make_weather_df(original_df = surface_lag, 
                         weather = 'fall', 
                         adj = False, 
                         fog_split = f)

#summer: 맑음 / 비 / 기타 구분 & snow에 대한 adjustment x 
f = {x:3 for x in surface.fog.unique()} 
f.update({'C': 1, 'R':2})
summer = make_weather_df(original_df = surface_lag, 
                         weather = 'summer', 
                         adj = False, 
                         fog_split = f)

In [64]:
# lag 만들때 summer 필요하므로 summer의 마지막부분만 남기기
summer = summer[summer.month == 7]
summer

Unnamed: 0,area,year,month,day,hour,temp,dp_temp,humid,wind,rain,...,MMDD_5_03,MMDD_5_04,MMDD_6_01,MMDD_6_02,MMDD_6_03,MMDD_6_04,MMDD_7_01,MMDD_7_02,MMDD_7_03,MMDD_7_04
1464,1,A,7,1,0,25.1,19.7,0.721,1.0,0.0,...,0,0,0,0,0,0,1,0,0,0
1465,1,A,7,1,1,23.5,20.1,0.809,0.6,0.0,...,0,0,0,0,0,0,1,0,0,0
1466,1,A,7,1,2,22.8,20.2,0.853,1.1,0.0,...,0,0,0,0,0,0,1,0,0,0
1467,1,A,7,1,3,21.8,20.0,0.896,0.8,0.0,...,0,0,0,0,0,0,1,0,0,0
1468,1,A,7,1,4,21.2,19.9,0.926,0.8,0.0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110395,10,E,7,31,19,29.4,25.9,0.816,4.6,0.0,...,0,0,0,0,0,0,0,0,0,1
110396,10,E,7,31,20,28.6,26.2,0.873,3.9,0.0,...,0,0,0,0,0,0,0,0,0,1
110397,10,E,7,31,21,28.2,26.1,0.886,3.5,0.0,...,0,0,0,0,0,0,0,0,0,1
110398,10,E,7,31,22,28.0,26.2,0.901,4.5,0.0,...,0,0,0,0,0,0,0,0,0,1


In [None]:
# train - validation set
train_df = fall[fall['year'] != 'E'].reset_index(drop = True)
test_df = fall[fall['year'] == 'E'].reset_index(drop = True)

summer_train = summer[summer.year != 'E'].reset_index(drop = True)
summer_test = summer[summer.year == 'E'].reset_index(drop = True)

#### 3. test data 전처리

In [65]:
# 검증 데이터
surface = pd.read_csv(path+'imputed_test_data0624.csv')
surface['area'] = surface['area'].astype(int)
surface['month'] = surface['month'].astype(int)
surface['day'] = surface['day'].astype(int)
surface['hour'] = surface['hour'].astype(int)
surface_fe = feature_extraction(original_df = surface)
# 'temp', 'dp_temp', 'wind', 'rain', 'humid', 'solar_amt', 'solar_time' 1~24 lag 변수와 
# 해당 lag의 percentage change 추가 + 기타 lag 변수들과 interaction들 추가
surface_lag = add_lag(original_df = surface_fe, 
                      lag_cols = ['temp', 'dp_temp', 'wind', 'rain', 'humid', 'solar_amt', 'solar_time'], #lag 추가하는 변수들 
                      pct_cols = [], #percetnage change 추가하는 변수들 => 없음.
                      lags = range(1,25)) #24시간lag들 

#fall: 맒음 / 안개 / 기타 구분 & snow에 대한 adjustment x 
f = {x:3 for x in surface.fog.unique()} 
f.update({'C': 1, 'X':1, 'F':2})
fall_test = make_weather_df(original_df = surface_lag, 
                         weather = 'fall', 
                         adj = False, 
                         fog_split = f)

#real summer test
f = {x:3 for x in surface.fog.unique()} 
f.update({'C': 1, 'R':2})
summer_rt = make_weather_df(original_df = surface_lag, 
                         weather = 'summer', 
                         adj = False, 
                         fog_split = f)

In [66]:
fall_test

Unnamed: 0,area,year,month,day,hour,temp,dp_temp,humid,wind,rain,...,MMDD_10_03,MMDD_10_04,MMDD_8_01,MMDD_8_02,MMDD_8_03,MMDD_8_04,MMDD_9_01,MMDD_9_02,MMDD_9_03,MMDD_9_04
0,1,F,8,1,0,25.5,24.0,0.919,2.7,0.0,...,0,0,1,0,0,0,0,0,0,0
1,1,F,8,1,1,25.3,24.1,0.931,2.2,1.1,...,0,0,1,0,0,0,0,0,0,0
2,1,F,8,1,2,25.3,24.1,0.930,2.7,0.9,...,0,0,1,0,0,0,0,0,0,0
3,1,F,8,1,3,25.2,24.2,0.942,2.4,4.9,...,0,0,1,0,0,0,0,0,0,0
4,1,F,8,1,4,25.0,24.2,0.955,4.0,7.5,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6619,3,F,10,31,19,15.8,9.6,0.669,2.2,0.0,...,0,1,0,0,0,0,0,0,0,0
6620,3,F,10,31,20,15.2,9.4,0.685,2.8,0.0,...,0,1,0,0,0,0,0,0,0,0
6621,3,F,10,31,21,15.0,9.3,0.687,3.3,0.0,...,0,1,0,0,0,0,0,0,0,0
6622,3,F,10,31,22,14.8,9.2,0.692,3.1,0.0,...,0,1,0,0,0,0,0,0,0,0


#### 4. train, test 데이터 새로운 변수 추가

In [67]:
df_train = fall
df_test = fall_test

In [68]:
# fall train set에 대해 
# with cumulative sum of solar_amt
df_train['solar_sum_humid'] = df_train[['solar_amt_sum','humid']].product(axis=1)
df_train['solar_sum_temp'] = df_train[['solar_amt_sum','temp']].product(axis=1)
df_train['solar_sum_wind'] = df_train[['solar_amt_sum','wind']].product(axis=1)

# humidity
df_train['humid_after_rain'] = df_train[['humid', 'after_rain']].product(axis=1)
df_train['humid_tf_rain'] = df_train[['humid','tf_rain']].product(axis=1)
df_train['humid_dp_temp'] = df_train[['humid','dp_temp']].product(axis=1)

# cold weather
df_train['after_rain_wind'] = df_train[['after_rain','wind']].product(axis=1)
#-----------------------------------------------------------------------------------------#
# fall test set에 대해
# with cumulative sum of solar_amt
df_test['solar_sum_humid'] = df_test[['solar_amt_sum','humid']].product(axis=1)
df_test['solar_sum_temp'] = df_test[['solar_amt_sum','temp']].product(axis=1)
df_test['solar_sum_wind'] = df_test[['solar_amt_sum','wind']].product(axis=1)

# humidity
df_test['humid_after_rain'] = df_test[['humid', 'after_rain']].product(axis=1)
df_test['humid_tf_rain'] = df_test[['humid','tf_rain']].product(axis=1)
df_test['humid_dp_temp'] = df_test[['humid','dp_temp']].product(axis=1)

# cold weather
df_test['after_rain_wind'] = df_test[['after_rain','wind']].product(axis=1)

#--------------------------------------------------------------------------------------------#
# 3 terms interacted
df_train['humid_temp_wind'] = df_train[['humid','temp','wind']].product(axis=1)
df_train['temp_solar_amt_time'] = df_train[['temp','solar_amt', 'solar_time']].product(axis =1)
df_train['temp_humid_solaramt'] = df_train[['temp', 'solar_amt', 'humid']].product(axis =1)

# 3 terms interacted
df_test['humid_temp_wind'] = df_test[['humid','temp','wind']].product(axis=1)
df_test['temp_solar_amt_time'] = df_test[['temp','solar_amt', 'solar_time']].product(axis =1)
df_test['temp_humid_solaramt'] = df_test[['temp', 'solar_amt', 'humid']].product(axis =1)

fall = df_train
fall_test = df_test

In [69]:
# summer에 대해서도 똑같이 처리하고, lag 추가
df_train = summer
df_test = summer_rt

In [70]:
# with cumulative sum of solar_amt
df_train['solar_sum_humid'] = df_train[['solar_amt_sum','humid']].product(axis=1)
df_train['solar_sum_temp'] = df_train[['solar_amt_sum','temp']].product(axis=1)
df_train['solar_sum_wind'] = df_train[['solar_amt_sum','wind']].product(axis=1)

# humidity
df_train['humid_after_rain'] = df_train[['humid', 'after_rain']].product(axis=1)
df_train['humid_tf_rain'] = df_train[['humid','tf_rain']].product(axis=1)
df_train['humid_dp_temp'] = df_train[['humid','dp_temp']].product(axis=1)

# cold weather
df_train['after_rain_wind'] = df_train[['after_rain','wind']].product(axis=1)
#-----------------------------------------------------------------------------------------#
# fall test set에 대해
# with cumulative sum of solar_amt
df_test['solar_sum_humid'] = df_test[['solar_amt_sum','humid']].product(axis=1)
df_test['solar_sum_temp'] = df_test[['solar_amt_sum','temp']].product(axis=1)
df_test['solar_sum_wind'] = df_test[['solar_amt_sum','wind']].product(axis=1)

# humidity
df_test['humid_after_rain'] = df_test[['humid', 'after_rain']].product(axis=1)
df_test['humid_tf_rain'] = df_test[['humid','tf_rain']].product(axis=1)
df_test['humid_dp_temp'] = df_test[['humid','dp_temp']].product(axis=1)

# cold weather
df_test['after_rain_wind'] = df_test[['after_rain','wind']].product(axis=1)

#--------------------------------------------------------------------------------------------#
# 3 terms interacted
df_train['humid_temp_wind'] = df_train[['humid','temp','wind']].product(axis=1)
df_train['temp_solar_amt_time'] = df_train[['temp','solar_amt', 'solar_time']].product(axis =1)
df_train['temp_humid_solaramt'] = df_train[['temp', 'solar_amt', 'humid']].product(axis =1)

# 3 terms interacted
df_test['humid_temp_wind'] = df_test[['humid','temp','wind']].product(axis=1)
df_test['temp_solar_amt_time'] = df_test[['temp','solar_amt', 'solar_time']].product(axis =1)
df_test['temp_humid_solaramt'] = df_test[['temp', 'solar_amt', 'humid']].product(axis =1)

summer = df_train
summer_rt = df_test

In [71]:
# 상호작용항에 대한 lag 추가
def add_lag_inter(original_df:pd.DataFrame, lag_cols: list, lags: list):
    # lag_cols: lag 넣을 변수들 목록 
    # pct_cols: percentage change lag 넣을 변수들 목록 
    # lags: 어느 lag들 넣을건지
    
    df = original_df.copy()
    
    #lag
    for col in lag_cols:
        for i in lags:
            df[col + '_lag_' + str(i)] = df.groupby('area')[col].shift(periods=i)    
    return df

In [72]:
# concatenate and add lag for interaction terms
concat = pd.concat([summer, fall])
concat_test = pd.concat([summer_rt, fall_test])

concat = add_lag_inter(original_df = concat, lag_cols = ['humid_after_rain','humid_tf_rain', 'humid_dp_temp'], 
                       lags = [1,2,3,24])

concat_test = add_lag_inter(original_df = concat_test, lag_cols = ['humid_after_rain','humid_tf_rain', 'humid_dp_temp'], 
                       lags = [1,2,3,24])

fall = concat[concat.month != 7].reset_index(drop=True)
fall_test = concat_test[concat_test.month != 7].reset_index(drop=True)
fall.drop(['MMDD_5_01', 'MMDD_5_02', 'MMDD_5_03', 'MMDD_5_04', 'MMDD_6_01',
       'MMDD_6_02', 'MMDD_6_03', 'MMDD_6_04', 'MMDD_7_01', 'MMDD_7_02',
       'MMDD_7_03', 'MMDD_7_04'], axis = 1, inplace=True)

fall_test.drop(['MMDD_5_01', 'MMDD_5_02', 'MMDD_5_03', 'MMDD_5_04', 'MMDD_6_01',
       'MMDD_6_02', 'MMDD_6_03', 'MMDD_6_04', 'MMDD_7_01', 'MMDD_7_02',
       'MMDD_7_03', 'MMDD_7_04'], axis = 1, inplace=True)

fall_test = fall_test[fall_test.month.isin([8,9,10])]

#### 5. Cross-Validation, Model Fitting

In [76]:
# train-validation
train_df = fall[fall['year'] != 'E'].reset_index(drop = True)
test_df = fall[fall['year'] == 'E'].reset_index(drop = True)

In [3]:
X_train = train_df.drop(['year','land_temp', 'month','day','hour', 'day_cos', 'day_sin'], axis =1)
y_train = train_df[['area', 'land_temp']]
X_test = test_df.drop(['year','land_temp', 'month','day','hour', 'day_cos', 'day_sin'], axis = 1)
y_test = test_df[['area', 'land_temp']]
X_r_test = fall_test.drop(['area','year', 'month','day','hour', 'day_cos', 'day_sin'], axis =1)

In [5]:
# area 별로 분리 (1 - 10)
for i in range(1, 11):
    globals()['X_train'+str(i)] = X_train[X_train['area'] == i].reset_index(drop=True)
    globals()['y_train'+str(i)] = y_train[y_train['area'] == i].reset_index(drop=True)
    globals()['X_test'+str(i)] = X_test[X_test['area'] == i].reset_index(drop=True)
    globals()['y_test'+str(i)] = y_test[y_test['area'] == i].reset_index(drop=True)

    # 특성 변수와 타겟 변수 분리
    globals()['X_train'+str(i)] = globals()['X_train'+str(i)].drop(['area'], axis=1).reset_index(drop=True)
    globals()['y_train'+str(i)] = globals()['y_train'+str(i)]['land_temp'].reset_index(drop=True)

    # 테스트 데이터
    globals()['X_test'+str(i)] = globals()['X_test'+str(i)].drop(['area'], axis=1).reset_index(drop=True)
    globals()['y_test'+str(i)] = globals()['y_test'+str(i)]['land_temp'].reset_index(drop=True)

In [9]:
# LGBM Ensemble
for i in range(1, 11):           

    # 시계열 교차 검증
    tscv = TimeSeriesSplit(n_splits=5)

    # Bayesian Optimization을 위한 탐색 공간 정의
    param_lgbm = {'regressor__learning_rate' : Real(0.01, 0.2, prior = 'log-uniform'),
                 'regressor__n_estimators' : Integer(500, 1500),
                 'regressor__max_depth': Integer(5, 30),
                 'regressor__num_leaves': Integer(40, 60),
                 'regressor__min_child_samples': Integer(10, 30),
                 'regressor__subsample': Real(0.8, 1.0, prior='uniform'),
                 'regressor__colsample_bytree': Real(0.8, 1.0, prior='uniform'),
                 'regressor__reg_alpha': Real(0.0, 1.0, prior='uniform'),
                 'regressor__reg_lambda': Real(0.0, 1.0, prior='uniform')
}

    # CatBoostRegressor를 포함한 TransformedTargetRegressor 생성
    model = TransformedTargetRegressor(regressor=LGBMRegressor(random_state=42, n_jobs=-1))


    bs_lgbm = BayesSearchCV(estimator=model,
                           search_spaces=param_lgbm, 
                           scoring='neg_mean_absolute_error',
                           cv=tscv, 
                           n_jobs=-1,
                           random_state=42, 
                           refit=True, 
                           return_train_score=True, 
                           optimizer_kwargs={'base_estimator': 'GP'}, 
                           verbose=1)

    # 모델 훈련 및 예측
    bs_lgbm.fit(globals()['X_train'+str(i)], globals()['y_train'+str(i)])
    print('Best hyperparameter'+str(i), ':', bs_lgbm.best_params_)
    print('Train MAE'+str(i), ':', -bs_lgbm.best_score_)

    # 테스트 데이터 예측
    y_pred = bs_lgbm.predict(globals()['X_test'+str(i)])
    print('Test MAE'+str(i), ':', mean_absolute_error(globals()['y_test'+str(i)], y_pred))

    # 최종 모델 저장
    globals()['model_lgbm'+str(i)] = bs_lgbm.best_estimator_

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi



Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best hyperparameter2 : OrderedDict([('regressor__colsample_bytree', 0.8), ('regressor__learning_rate', 0.01), ('regressor__max_depth', 30), ('regressor__min_child_samples', 10), ('regressor__n_estimators', 500), ('regressor__num_leaves', 40), ('regressor__reg_alpha', 1.0), ('regressor__reg_lambda', 0.0), ('regressor__subsample', 1.0)])
Train MAE2 : 1.7862794773004906
Test MAE2 : 1.3642494659889663
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best hyperparameter7 : OrderedDict([('regressor__colsample_bytree', 0.8), ('regressor__learning_rate', 0.01362322496371902), ('regressor__max_depth', 5), ('regressor__min_child_samples', 10), ('regressor__n_estimators', 1052), ('regressor__num_leaves', 40), ('regressor__reg_alpha', 1.0), ('regressor__reg_lambda', 1.0), ('regressor__subsample', 0.8033462098150629)])
Train MAE7 : 1.5910471278909217
Test MAE7 : 1.1967625854148272
Fitting 5 folds for each of 1 candidat

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

In [10]:
lgbm_models = []

for i in range(1, 11):    
    # pretrained model 추가
    lgbm_models.append(('lgbm_' + str(i), globals()['model_lgbm'+str(i)]))

# VotingRegressor ensemble model 생성
ensemble_model = VotingRegressor(estimators = lgbm_models)

# Fit the ensemble model

ensemble_X_train = pd.DataFrame()
ensemble_y_train = pd.DataFrame()

for i in range(1, 11):
    ensemble_X_train = pd.concat([ensemble_X_train, globals()['X_train'+str(i)]], axis=0)
    ensemble_y_train = pd.concat([ensemble_y_train, globals()['y_train'+str(i)]], axis=0)

ensemble_model.fit(ensemble_X_train, ensemble_y_train)
globals()['model_ensemble'] = ensemble_model
# 테스트 데이터 예측
predictions = []
for j in range(1, 11):
    temp = ensemble_model.predict(globals()['X_test'+str(j)])
    temp_mae = mean_absolute_error(temp, globals()['y_test'+str(j)])
    predictions.append(temp_mae)
print('Test MAE for Ensemble model :', np.mean(predictions))

  y = column_or_1d(y, warn=True)


Test MAE for Ensemble model : 1.2297889264119362


In [83]:
X_r_test = fall_test.drop(['area','year', 'month','day','hour', 'day_cos', 'day_sin'], axis =1)
y_test_pred2 = ensemble_model.predict(X_r_test)