In [None]:
# year별로 time series split을 위한 class 정의
import numpy as np
from sklearn.model_selection._split import _BaseKFold, indexable, _num_samples
from sklearn.utils.validation import _deprecate_positional_args

class GroupTimeSeriesSplit(_BaseKFold):
    """Time-series cross-validator that never splits a group across folds.

    Groups are ordered by their first appearance in ``groups``.  The last
    ``n_splits`` blocks of ``n_groups // (n_splits + 1)`` groups are used, one
    after another, as test sets; for each of them every earlier group forms the
    training set (expanding window).

    Parameters
    ----------
    n_splits : int, default=5
        Number of train/test splits to generate.
    max_train_size : int, default=None
        If set, only the last ``max_train_size`` training samples are kept.
    """

    @_deprecate_positional_args
    def __init__(self,
                 n_splits=5,
                 *,
                 max_train_size=None
                 ):
        super().__init__(n_splits, shuffle=False, random_state=None)
        self.max_train_size = max_train_size

    def split(self, X, y=None, groups=None):
        """Yield ``(train_indices, test_indices)`` as lists of Python ints.

        Raises
        ------
        ValueError
            If ``groups`` is None or there are fewer distinct groups than
            ``n_splits + 1``.
        """
        if groups is None:
            raise ValueError(
                "The 'groups' parameter should not be None")
        X, y, groups = indexable(X, y, groups)
        # Use a plain ndarray so element access is always positional; a pandas
        # Series with a non-default index would otherwise be looked up by label.
        groups = np.asarray(groups)
        n_splits = self.n_splits
        n_folds = n_splits + 1

        # Distinct group labels in order of first appearance.
        labels, first_seen = np.unique(groups, return_index=True)
        unique_groups = labels[np.argsort(first_seen)]
        n_groups = _num_samples(unique_groups)
        if n_folds > n_groups:
            raise ValueError(
                ("Cannot have number of folds={0} greater than"
                 " the number of groups={1}").format(n_folds,
                                                     n_groups))

        # Map each group label to the positions of its samples.
        group_dict = {}
        for idx, label in enumerate(groups):
            group_dict.setdefault(label, []).append(idx)

        def _positions(group_labels):
            # Sample positions of the given groups, sorted ascending.
            return np.sort(np.concatenate(
                [group_dict[label] for label in group_labels]))

        group_test_size = n_groups // n_folds
        group_test_starts = range(n_groups - n_splits * group_test_size,
                                  n_groups, group_test_size)
        for group_test_start in group_test_starts:
            train_array = _positions(unique_groups[:group_test_start])
            train_end = train_array.size
            if self.max_train_size and self.max_train_size < train_end:
                train_array = train_array[train_end -
                                          self.max_train_size:train_end]
            test_array = _positions(
                unique_groups[group_test_start:
                              group_test_start + group_test_size])
            yield ([int(i) for i in train_array],
                   [int(i) for i in test_array])

In [1]:
import numpy as np
import pandas as pd
import pywt
from statsmodels.robust import mad
import matplotlib.pyplot as plt
import seaborn as sns

import itertools
from catboost import CatBoostRegressor
from sklearn.compose import TransformedTargetRegressor
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.early_stop import no_progress_loss
from sklearn.metrics import mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

In [None]:
# NOTE(review): hard-coded absolute path; file names are appended with plain
# string concatenation below, so the value must end with a path separator.
path = 'C:/Users/' # folder where the data files are stored

# 함수 정의

In [3]:
# Wavelet denoising
def WT(df, col, wavelet='db5', thresh=0.63):
    """Denoise one column with a soft-thresholded discrete wavelet transform.

    Args:
        df: DataFrame holding the signal.
        col: name of the column to denoise.
        wavelet: wavelet name passed to PyWavelets.
        thresh: fraction of the signal's maximum (NaNs ignored) used as the
            soft-threshold value.

    Returns:
        numpy array with the reconstructed signal, same length as ``df[col]``.
    """
    signal = df[col].values
    thresh = thresh*np.nanmax(signal)
    coeff = pywt.wavedec(signal, wavelet, mode="per")
    # Shrink every coefficient array except the first one returned by wavedec.
    coeff[1:] = [pywt.threshold(c, value=thresh, mode="soft") for c in coeff[1:]]
    # Rebuild the time series from the thresholded coefficients.
    reconstructed_signal = pywt.waverec(coeff, wavelet, mode="per")
    # For odd-length input the reconstruction carries one extra trailing
    # sample; trim it so the result can be re-indexed like the input.
    return reconstructed_signal[:len(signal)]

In [4]:
def feature_extraction(original_df:pd.DataFrame):
    """Return a cleaned copy of the raw frame with a few derived features.

    Negative values are zeroed in columns that cannot be negative by
    definition, 'tf_rain' and 'humid' are rescaled, the -99.9 sentinel in
    'snow' becomes 0, the hour is encoded as sine/cosine of a 24-hour cycle
    and 'temp_diff' (temperature minus dew-point temperature) is added.
    The input frame is not modified.
    """
    df = original_df.copy()

    # Imputation can leave values below zero; clamp them where the variable
    # cannot be negative.
    for name in ('solar_amt', 'solar_time', 'rain'):
        df[name] = df[name].mask(df[name] < 0, 0)
    df['tf_rain'] = df['tf_rain'].mask(df['tf_rain'] < 0, 0) / 60  # 0~1
    df['humid'] = df['humid'].mask(df['humid'] < 0, 0) / 100  # 0~1
    df['snow'] = df['snow'].mask(df['snow'] == -99.9, 0)

    # Cyclical encoding so that hour 23 and hour 0 end up close together.
    hour_angle = 2 * np.pi * df['hour']/24.0
    df['hour_sin'] = np.sin(hour_angle)
    df['hour_cos'] = np.cos(hour_angle)

    # Temperature minus dew-point temperature.
    df['temp_diff'] = df['temp'] - df['dp_temp']

    return df

In [5]:
def process_group(group, wthr):
    """Add '<wthr>_diff' and 'after_<wthr>' columns to one area's rows.

    'after_<wthr>' counts 1..24 over the 24 rows starting at every row whose
    value is lower than the previous row's value (negative diff); all other
    rows are 0.  When two such windows overlap the later one wins.  The
    returned frame has a fresh 0..n-1 index.
    """
    diff_col = wthr + '_diff'
    after_col = 'after_' + wthr

    group = group.reset_index(drop=True)
    group[diff_col] = group[wthr].diff()
    group[after_col] = 0

    n_rows = len(group)
    drop_rows = group.index[group[diff_col] < 0]
    for start in drop_rows:
        # Number the rows start, start+1, ... as 1, 2, ..., 24 (clipped at the end).
        for step, row in enumerate(range(start, min(start + 24, n_rows)), start=1):
            group.loc[row, after_col] = step
    return group

def add_lag(original_df:pd.DataFrame, lag_cols: list, pct_cols: list, lags: list):
    """Add per-area lag, percentage-change, interaction and spell features.

    Args:
        original_df: frame with at least 'area', 'temp', 'dp_temp', 'humid',
            'wind', 'solar_amt', 'solar_time', 'tf_rain', 'rain' and 'snow',
            plus every column named in ``lag_cols`` / ``pct_cols``.
        lag_cols: columns that get ``<col>_lag_<k>`` shifted copies.
        pct_cols: columns that get ``<col>_pct_<k>`` percentage changes.
        lags: look-back steps ``k`` applied to both lists.

    Returns:
        A new frame, ordered by area with a fresh 0..n-1 index, holding the
        original columns plus the generated features.
    """
    df = original_df.copy()

    # Percentage changes and lags, computed within each area.
    for col in pct_cols:
        for i in lags:
            df[col + '_pct_' + str(i)] = df.groupby('area')[col].pct_change(periods=i)
    for col in lag_cols:
        for i in lags:
            df[col + '_lag_' + str(i)] = df.groupby('area')[col].shift(periods=i)

    # Pairwise interactions.
    interaction_cols = ["temp", "humid", "wind", "solar_amt"]
    for terms in itertools.combinations(interaction_cols, 2):
        df['_'.join(terms)] = df[list(terms)].product(axis=1)
    df['temp_dp'] = df['temp']*df['dp_temp']
    # Three-way interactions.
    df['humid_temp_wind'] = df[['humid','temp','wind']].product(axis=1)
    df['temp_solar_amt_time'] = df[['temp','solar_amt', 'solar_time']].product(axis =1)
    df['temp_humid_solaramt'] = df[['temp', 'solar_amt', 'humid']].product(axis =1)

    def _spell_count(active):
        # Running length of each consecutive run of True values, 0 elsewhere.
        # A new run id starts whenever the mask falls from 1 to 0; the
        # cumulative sum is taken per (area, run id).  GroupBy.cumsum keeps
        # the original index, so the result aligns with ``df`` regardless of
        # pandas' group_keys behaviour.
        mask = active.astype(int)
        run_id = (mask.diff() < 0).astype(int).cumsum()
        return mask.groupby([df['area'], run_id]).cumsum() * mask

    df['rain_count'] = _spell_count(df['tf_rain'] != 0)
    df['snow_count'] = _spell_count(df['snow'] != 0)

    # 3-row rolling sum per area.  transform() returns values on df's own
    # index, so this stays correct even when df is not stored area by area
    # with a 0..n-1 index.
    df['solar_amt_sum'] = df.groupby('area')['solar_amt'].transform(
        lambda s: s.rolling(window=3).sum())

    # after_snow / after_rain: process each area separately, stitch the pieces
    # back together with a fresh index and discard the helper diff column.
    for wthr in ("snow", "rain"):
        pieces = [process_group(part, wthr) for _, part in df.groupby('area')]
        df = pd.concat(pieces, ignore_index=True).drop(columns=wthr + '_diff')

    return df

In [6]:
def make_weather_df(original_df:pd.DataFrame, surface: pd.DataFrame, weather:str, adj:bool, fog_split:dict):
    """Build the modelling frame for one season.

    Args:
        original_df: feature frame with 'month', 'day', 'snow', 'temp',
            'solar_amt', 'hour_sin' and 'hour_cos' columns.
        surface: raw frame providing the 'fog' column; only consulted when
            ``original_df`` has no 'fog' column of its own.
        weather: one of 'spring' (months 2-4), 'summer' (5-7), 'fall' (8-10)
            or 'winter' (11, 12, 1).
        adj: whether to apply the snow adjustment.
        fog_split: mapping from raw fog codes to the category to encode.

    Returns:
        A new frame restricted to the season's months, with one-hot encoded
        'fog' and month/week-of-month ('MMDD') columns.

    Raises:
        ValueError: if ``weather`` is not a known season.
    """
    season_months = {
        'spring': range(2, 5),
        'summer': range(5, 8),
        'fall': range(8, 11),
        'winter': [11, 12, 1],
    }
    if weather not in season_months:
        raise ValueError(
            "weather must be one of {0}, got {1!r}".format(
                sorted(season_months), weather))
    m_list = season_months[weather]

    df = original_df.copy()
    if adj:
        # Damp the temperature on rows with a non-zero snow value.
        snowy = df['snow'] != 0
        df.loc[snowy, 'temp'] = df.loc[snowy, 'temp'] / (1 + df.loc[snowy, 'snow'].abs() * 8)

        # Damp solar_amt where there is snow (>= 0.1) and positive radiation.
        snowy_sunny = (df['snow'] >= 0.1) & (df['solar_amt'] > 0)
        df.loc[snowy_sunny, 'solar_amt'] = (
            df.loc[snowy_sunny, 'solar_amt'] / (1 + df.loc[snowy_sunny, 'snow'].abs()))

        # Remove the daily cycle on snowy rows.
        df.loc[snowy, ['hour_sin', 'hour_cos']] = 0

    # Prefer the frame's own 'fog' column: it is row-aligned by construction,
    # whereas ``surface`` only lines up when both frames share the same index.
    fog_source = df['fog'] if 'fog' in df.columns else surface['fog']
    df['fog'] = fog_source.replace(fog_split)
    df = pd.get_dummies(df, columns = ['fog'])

    # Summer and fall: drop the columns whose name ends with 'snow'.
    # NOTE(review): names such as 'snow_count' do not end with 'snow' and are
    # therefore kept - confirm this is intended.
    if weather in ['summer', 'fall']:
        snow_cols = [x for x in df.columns if x.endswith('snow')]
        df = df.drop(columns=snow_cols)

    df = df[df['month'].isin(m_list)].reset_index(drop = True)

    # Week-of-month bucket: days 1-7 -> 1, 8-14 -> 2, 15-21 -> 3, otherwise 4.
    day = df['day']
    week_bucket = np.select(
        [day.isin(range(1, 8)), day.isin(range(8, 15)), day.isin(range(15, 22))],
        [1, 2, 3],
        default=4)
    week_label = pd.Series(week_bucket, index=df.index).astype('int').astype('str').str.zfill(2)
    df['MMDD'] = df['month'].astype('int').astype('str') + '_' + week_label
    df = pd.get_dummies(df, columns = ['MMDD'])

    return df

# 데이터 

In [7]:
# Load the training data, discard the first CSV column, then derive the
# basic features.
surface = pd.read_csv(path + 'train0624.csv')
surface = surface.iloc[:, 1:]
surface_fe = feature_extraction(surface)
surface_fe.head()

Unnamed: 0,area,year,month,day,hour,temp,dp_temp,humid,wind,rain,tf_rain,land_temp,solar_amt,solar_time,snow,fog,hour_sin,hour_cos,temp_diff
0,1.0,A,2.0,1.0,0.0,-9.9,-10.7,0.939,0.6,0.0,0.0,-1.3,0.0,0.0,0.0,H,0.0,1.0,0.8
1,1.0,A,2.0,1.0,1.0,-10.8,-11.6,0.938,0.6,0.0,0.0,-1.5,0.0,0.0,0.0,H,0.258819,0.965926,0.8
2,1.0,A,2.0,1.0,2.0,-11.4,-12.1,0.946,0.7,0.0,0.0,-1.7,0.0,0.0,0.0,H,0.5,0.866025,0.7
3,1.0,A,2.0,1.0,3.0,-11.6,-12.5,0.934,0.6,0.0,0.0,-1.8,0.0,0.0,0.0,H,0.707107,0.707107,0.9
4,1.0,A,2.0,1.0,4.0,-11.8,-12.7,0.93,0.6,0.0,0.0,-2.0,0.0,0.0,0.0,H,0.866025,0.5,0.9


In [8]:
# Unlike spring, summer and fall, winter cannot be split by the 'year' column
# alone: a winter covers months 11, 12 and 1, and month 1 belongs to the next
# year.  Build a frame with a real datetime column instead, temporarily
# mapping the anonymised years A-F to calendar years.
year_mapping = {'A': 2017, 'B': 2018, 'C': 2019, 'D': 2020, 'E': 2021, 'F' : 2022} # placeholder years
temp = surface_fe.assign(year=surface_fe['year'].map(year_mapping))

# Combine 'year', 'month', 'day' and 'hour' into the datetime column 'ds'.
date_parts = temp[['year', 'month', 'day', 'hour']]
temp['ds'] = pd.to_datetime(date_parts, format='%Y-%m-%d %H')

In [9]:
def assign_label(row):
    """Map a row's 'ds' timestamp to a winter-season label.

    Every timestamp before the first cut-off gets 'A', then 'B' ... 'E' for
    each following 1 February cut-off; timestamps on or after 2022-02-01
    yield None.
    """
    season_cutoffs = (
        ('2018-02-01', 'A'),  # November of year A ~ January of year B
        ('2019-02-01', 'B'),  # November of year B ~ January of year C
        ('2020-02-01', 'C'),  # November of year C ~ January of year D
        ('2021-02-01', 'D'),  # November of year D ~ January of year E
        ('2022-02-01', 'E'),  # November of year E ~ January of year F
    )
    stamp = row['ds']
    for cutoff, label in season_cutoffs:
        if stamp < pd.Timestamp(cutoff):
            return label
    return None

In [10]:
# Label every row with its season group; rows on or after 2022-02-01 get None.
temp["ds_label"] = temp.apply(assign_label, axis=1)

In [11]:
# Wavelet transform: applied to dew-point temperature and humidity,
# separately for every (season label, area) pair.
wv_parts = []

for j in ['A','B','C','D','E']:
    for i in range(1,11):
        # The season label lives in `temp`; its boolean mask is used to
        # select the matching rows of `surface_fe`.
        tt = surface_fe[(surface_fe.area==i) & (temp.ds_label==j)]
        tt_1 = WT(tt, 'dp_temp', wavelet='db5', thresh=0.85)
        tt_2 = WT(tt, 'humid', wavelet='db5', thresh=0.85)

        tt_1 = pd.Series(tt_1, index=tt.index, name='dp_temp')
        tt_2 = pd.Series(tt_2, index=tt.index, name='humid')

        tt_df = pd.concat([tt_1, tt_2, tt.drop(['dp_temp','humid'], axis=1)], axis=1)
        wv_parts.append(tt_df)

# Concatenate once instead of growing a DataFrame inside the loop.
wv_df = pd.concat(wv_parts, axis=0)

In [12]:
# Row/column count of the wavelet-smoothed frame.
wv_df.shape

(438240, 19)

In [13]:
# Adding both lag and pct features would create too many variables, so only
# the pct features (which had high feature importance) are added.
surface_lag = add_lag(original_df = wv_df, 
                      lag_cols = [], # columns that get lag features
                      pct_cols = ['temp', 'dp_temp', 'humid', 'wind', 'solar_amt'], # columns that get percentage-change features
                      lags = [1, 2, 3, 6, 12, 24]) # look-back steps to apply
surface_lag.head()

Unnamed: 0,dp_temp,humid,area,year,month,day,hour,temp,wind,rain,...,wind_solar_amt,temp_dp,humid_temp_wind,temp_solar_amt_time,temp_humid_solaramt,rain_count,snow_count,solar_amt_sum,after_snow,after_rain
0,-12.515989,0.654553,1.0,A,2.0,1.0,0.0,-9.9,0.6,0.0,...,0.0,123.90829,-3.888045,-0.0,-0.0,0,0,,0,0
1,-12.295515,0.681314,1.0,A,2.0,1.0,1.0,-10.8,0.6,0.0,...,0.0,132.791559,-4.414912,-0.0,-0.0,0,0,,0,0
2,-12.03952,0.711918,1.0,A,2.0,1.0,2.0,-11.4,0.7,0.0,...,0.0,137.250523,-5.681106,-0.0,-0.0,0,0,0.0,0,0
3,-11.739106,0.746786,1.0,A,2.0,1.0,3.0,-11.6,0.6,0.0,...,0.0,136.17363,-5.197632,-0.0,-0.0,0,0,0.0,0,0
4,-11.431909,0.777542,1.0,A,2.0,1.0,4.0,-11.8,0.6,0.0,...,0.0,134.896526,-5.504994,-0.0,-0.0,0,0,0.0,0,0


In [14]:
variables = ['temp', 'dp_temp', 'rain', 'solar_amt', 'snow']
# Oldest-to-newest weights (sum to 1) so the most recent value counts most.
weights = np.array([0.2, 0.3, 0.5])

def weighted_moving_average(x):
    """Trailing weighted moving average using the module-level ``weights``.

    Args:
        x: 1-D sequence of numbers (array or Series).

    Returns:
        numpy array with the same length as ``x``.  The first
        ``len(weights) - 1`` entries are NaN because the window is incomplete
        there; an input shorter than the window yields all NaN.
    """
    values = np.asarray(x, dtype=float)
    window = weights.size
    if values.size < window:
        # np.convolve would swap its operands here and return the wrong length.
        return np.full(values.size, np.nan)
    # np.convolve flips its kernel, so reverse the weights to keep
    # weights[-1] on the newest sample.
    wma = np.convolve(values, weights[::-1], mode='valid')
    return np.concatenate((np.full(window - 1, np.nan), wma))

for var in variables: # apply the 3-step weighted moving average within each area
    surface_lag[var + '_wma_3'] = surface_lag.groupby('area')[var].transform(weighted_moving_average)

In [15]:
# 1- and 2-step lags of the three-term interactions, computed within each area.
for col, step in itertools.product(['temp_humid_solaramt', "temp_solar_amt_time"], [1, 2]):
    lagged = surface_lag.groupby('area')[col].shift(periods=step)
    surface_lag[col + '_lag_' + str(step)] = lagged

In [16]:
# winter: fog code 'C' -> 1, 'R' and 'S' -> 2, every other code -> 3,
# with the snow adjustment enabled.
f = dict.fromkeys(surface.fog.unique(), 3)
f.update(C=1, R=2, S=2)
winter = make_weather_df(
    original_df=surface_lag,
    surface=surface,
    weather='winter',
    adj=True,
    fog_split=f,
)

In [17]:
# Preview the winter training frame.
winter.head()

Unnamed: 0,dp_temp,humid,area,year,month,day,hour,temp,wind,rain,...,MMDD_11_03,MMDD_11_04,MMDD_12_01,MMDD_12_02,MMDD_12_03,MMDD_12_04,MMDD_1_01,MMDD_1_02,MMDD_1_03,MMDD_1_04
0,3.702674,0.796182,1.0,A,11.0,1.0,0.0,6.8,0.2,0.0,...,0,0,0,0,0,0,0,0,0,0
1,3.76101,0.795138,1.0,A,11.0,1.0,1.0,6.6,0.4,0.0,...,0,0,0,0,0,0,0,0,0,0
2,3.831334,0.783625,1.0,A,11.0,1.0,2.0,6.6,0.3,0.0,...,0,0,0,0,0,0,0,0,0,0
3,3.916705,0.760403,1.0,A,11.0,1.0,3.0,6.4,0.3,0.0,...,0,0,0,0,0,0,0,0,0,0
4,4.001779,0.737672,1.0,A,11.0,1.0,4.0,6.3,0.3,0.0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# Total number of missing values in `winter`.
# NOTE(review): the execution counter jumps from 17 to 19, so a cell may be
# missing here - confirm how NaN values from the pct/rolling/lag features are
# handled before this check.
winter.isnull().sum().sum()

0

## baseline

In [20]:
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.early_stop import no_progress_loss
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import VotingRegressor

In [21]:
# For the per-area group time series split, restrict the frame that carries the
# datetime column to the winter months only.
# NOTE(review): `temp` is rebound here and later matched to `winter` by row
# position - assumes both frames share the same row order; TODO confirm.
temp = temp[temp.month.isin([11, 12, 1])].reset_index(drop=True)
winter.shape, temp.shape

((110400, 87), (110400, 21))

In [22]:
def gen_ensemble_model(train_df):
    """Tune one CatBoost regressor per area and bundle them in a VotingRegressor.

    For each area 1-10 the hyperparameters depth, learning_rate, l2_leaf_reg
    and iterations are searched with hyperopt's TPE.  Each candidate is scored
    by the mean MAE over a 4-split GroupTimeSeriesSplit whose groups are the
    labels returned by ``assign_label``.

    Args:
        train_df: frame with the feature columns plus 'area', 'land_temp'
            (the target), 'month', 'day', 'hour' and 'year'.

    Returns:
        A VotingRegressor built from the ten configured CatBoostRegressor
        instances; neither the ensemble nor its members are fitted here.

    Side effects:
        Reads the module-level ``temp`` frame and writes ``X_tv<i>``,
        ``y_tv<i>`` and ``model_cat<i>`` (i = 1..10) into ``globals()``.
    """

    # Separate features and target.
    X_tv = train_df.drop(['month', 'day', 'hour', 'land_temp', 'year'], axis = 1).reset_index(drop=True)
    y_tv = train_df[['area','land_temp']].reset_index(drop=True)

    # Split the training data by area (1 - 10) and publish each part as a global.
    for i in range(1, 11):
        globals()['X_tv'+str(i)] = X_tv[X_tv['area'] == i].reset_index(drop=True).drop(['area'], axis=1)
        globals()['y_tv'+str(i)] = y_tv.loc[y_tv['area'] == i, 'land_temp'].reset_index(drop=True)

    for i in range(1, 11):
        print('area', i)
        # Hyperparameter search space.
        space = {
            'depth': hp.quniform('depth', 3, 10, 1),
            'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
            'l2_leaf_reg': hp.quniform('l2_leaf_reg', 2, 30, 4),
            'iterations' : hp.quniform('iterations', 100, 500, 50)
            }

        # Rows of the module-level `temp` frame for this area; assumes `temp`
        # is row-aligned with train_df - TODO confirm.
        temptemp = temp[y_tv['area']==i].reset_index(drop=True)
        groups = temptemp.apply(assign_label, axis=1).values
        def objective(params, random_seed):
            # NOTE(review): only 'depth' is cast to int here; 'l2_leaf_reg' and
            # 'iterations' are passed exactly as sampled.
            model = CatBoostRegressor(
                depth = int(params['depth']),
                learning_rate = params['learning_rate'],
                l2_leaf_reg = params['l2_leaf_reg'],
                iterations = params['iterations'],
                random_seed = random_seed,
                verbose = False
            )

            # Group time series split so that every fold holds months 11, 12 and 1
            # together (November ~ January form one group).
            gscv = GroupTimeSeriesSplit(n_splits=4) 
            mae = []

            for train_index, valid_index in gscv.split(globals()['X_tv'+str(i)], groups=groups):
                X_train, X_valid = globals()['X_tv'+str(i)].loc[train_index], globals()['X_tv'+str(i)].loc[valid_index]
                y_train, y_valid = globals()['y_tv'+str(i)].loc[train_index], globals()['y_tv'+str(i)].loc[valid_index]

                # The same model object is refitted on every fold.
                model.fit(X_train, y_train, verbose=False)
                y_pred = model.predict(X_valid)
                mae.append(mean_absolute_error(y_valid, y_pred))

            return {'loss': np.mean(mae), 'status': STATUS_OK}

        # Bayesian optimisation with TPE.
        def run_model(random_seed):
            objective_with_seed = lambda params: objective(params, random_seed)

            best = fmin(fn = objective_with_seed, 
                        space = space, 
                        algo = tpe.suggest,
                        trials = Trials(),
                        max_evals = 100,
                        early_stop_fn = no_progress_loss(20), # stop early after 20 evaluations without improvement
                        rstate = np.random.default_rng(random_seed)) # seeded so every run gives consistent values

            # Integer hyperparameters come back as floats with a trailing .0.
            best['depth'] = int(best['depth'])
            best['l2_leaf_reg'] = int(best['l2_leaf_reg'])
            best['iterations'] = int(best['iterations'])

            # Configure a model with the selected hyperparameters (not fitted here).
            model = CatBoostRegressor(
                random_state = random_seed,
                verbose = False,
                **best)

            globals()['model_cat'+str(i)] = model

        run_model(0)

    # Collect the per-area models into one ensemble.
    catboost_models = []

    for i in range(1,11): 
        catboost_models.append(('catboost_'+str(i), globals()['model_cat'+str(i)]))

    ensemble_model = VotingRegressor(estimators = catboost_models)

    return ensemble_model

In [23]:
# Run the per-area hyperparameter search; prints the area number and a hyperopt
# progress bar for each area.
ensemble_model = gen_ensemble_model(winter)

area 1
 45%|█████████████████████▌                          | 45/100 [32:07<39:15, 42.83s/trial, best loss: 1.283454967051718]
area 2
 45%|█████████████████████▌                          | 45/100 [17:56<21:55, 23.92s/trial, best loss: 2.502920735718021]
area 3
 28%|████████████▌                                | 28/100 [23:53<1:01:25, 51.18s/trial, best loss: 1.4845983110357552]
area 4
 47%|██████████████████████                         | 47/100 [36:44<41:26, 46.91s/trial, best loss: 1.3288507639924165]
area 5
 53%|█████████████████████████▍                      | 53/100 [28:16<25:04, 32.01s/trial, best loss: 1.387779976061489]
area 6
 47%|██████████████████████                         | 47/100 [41:48<47:08, 53.37s/trial, best loss: 1.2659759773125252]
area 7
 42%|███████████████████▋                           | 42/100 [36:05<49:50, 51.56s/trial, best loss: 1.1405801059470608]
area 8
 29%|█████████████▋                                 | 29/100 [18:39<45:40, 38.60s/trial, best loss: 1.63

In [24]:
# Separate features and target.
dropped_cols = ['land_temp', 'hour', 'day', 'month', 'year']
X_tv = winter.drop(columns=dropped_cols).reset_index(drop=True)
y_tv = winter[['area', 'land_temp']].reset_index(drop=True)

# Split the training data by area (1 - 10) and publish each part as a global.
for i in range(1, 11):
    area_X = X_tv[X_tv['area'] == i].reset_index(drop=True)
    area_y = y_tv.loc[y_tv['area'] == i, 'land_temp']
    globals()[f'X_tv{i}'] = area_X.drop('area', axis=1)
    globals()[f'y_tv{i}'] = area_y.reset_index(drop=True)

In [25]:
# Fit the ensemble model.
# Put the per-area splits back together with a single concat each.  The target
# is built from Series only, so it is always a 1-D Series; growing it from an
# empty DataFrame made its container type depend on the pandas version.
ensemble_X_tv = pd.concat([globals()['X_tv'+str(i)] for i in range(1, 11)], axis=0)
ensemble_y_tv = pd.concat([globals()['y_tv'+str(i)] for i in range(1, 11)], axis=0)

ensemble_model.fit(ensemble_X_tv, ensemble_y_tv)

## 예측 내보내기

In [27]:
# Load the imputed test data from the data folder.
test = pd.read_csv(path+"imputed_test_data0624.csv")

In [28]:
# Apply the same basic feature extraction as for the training data.
test_fe = feature_extraction(original_df = test) 
test_fe.head()

Unnamed: 0,area,year,month,day,hour,temp,dp_temp,humid,wind,rain,tf_rain,solar_amt,solar_time,snow,fog,hour_sin,hour_cos,temp_diff
0,1.0,F,2.0,1.0,0.0,0.6,-2.0,0.825,2.7,0.0,0.0,0.0,0.0,3.1,G,0.0,1.0,2.6
1,1.0,F,2.0,1.0,1.0,0.0,-5.2,0.683,3.2,0.0,0.0,0.0,0.0,3.1,R,0.258819,0.965926,5.2
2,1.0,F,2.0,1.0,2.0,-0.3,-6.4,0.637,2.7,0.0,0.0,0.0,0.0,3.1,C,0.5,0.866025,6.1
3,1.0,F,2.0,1.0,3.0,-1.0,-4.5,0.772,2.1,0.2,0.116667,0.0,0.0,4.1,R,0.707107,0.707107,3.5
4,1.0,F,2.0,1.0,4.0,-1.4,-3.1,0.883,2.9,0.6,0.05,0.0,0.0,4.7,R,0.866025,0.5,1.7


In [29]:
# Wavelet transform.
# Unlike the training preprocessing, the test data covers a single year, so
# the transform is applied per area only.
wv_parts = []

for i in range(1, 4):
    tt = test_fe[test_fe.area==i]
    tt_1 = WT(tt, 'dp_temp', wavelet='db5', thresh=0.85)
    tt_2 = WT(tt, 'humid', wavelet='db5', thresh=0.85)

    tt_1 = pd.Series(tt_1, index=tt.index, name='dp_temp')
    tt_2 = pd.Series(tt_2, index=tt.index, name='humid')

    tt_df = pd.concat([tt_1, tt_2, tt.drop(['dp_temp','humid'], axis=1)], axis=1)
    wv_parts.append(tt_df)

# Concatenate once instead of growing a DataFrame inside the loop.
wv_df = pd.concat(wv_parts, axis=0)

In [30]:
test_fe.shape, wv_df.shape # check that both shapes are identical

((26280, 18), (26280, 18))

In [31]:
# Add the pct features exactly as in the training pipeline.
test_lag = add_lag(original_df = wv_df, 
                      lag_cols = [], # columns that get lag features
                      pct_cols = ['temp', 'dp_temp', 'humid', 'wind', 'solar_amt'], # columns that get percentage-change features
                      lags = [1, 2, 3, 6, 12, 24]) 
test_lag.head()

Unnamed: 0,dp_temp,humid,area,year,month,day,hour,temp,wind,rain,...,wind_solar_amt,temp_dp,humid_temp_wind,temp_solar_amt_time,temp_humid_solaramt,rain_count,snow_count,solar_amt_sum,after_snow,after_rain
0,-9.507007,0.558835,1.0,F,2.0,1.0,0.0,0.6,2.7,0.0,...,0.0,-5.704204,0.905312,0.0,0.0,0,1,,0,0
1,-9.532661,0.558806,1.0,F,2.0,1.0,1.0,0.0,3.2,0.0,...,0.0,-0.0,0.0,0.0,0.0,0,2,,0,0
2,-9.559404,0.558803,1.0,F,2.0,1.0,2.0,-0.3,2.7,0.0,...,0.0,2.867821,-0.452631,-0.0,-0.0,0,3,0.0,0,0
3,-9.586635,0.558815,1.0,F,2.0,1.0,3.0,-1.0,2.1,0.2,...,0.0,9.586635,-1.173511,-0.0,-0.0,1,4,0.0,0,0
4,-9.614442,0.558822,1.0,F,2.0,1.0,4.0,-1.4,2.9,0.6,...,0.0,13.460219,-2.268818,-0.0,-0.0,2,5,0.0,0,0


In [32]:
# Add the 3-step weighted moving average within each area.
for var in variables:
    test_lag[var + '_wma_3'] = test_lag.groupby('area')[var].transform(weighted_moving_average)

In [33]:
# 1- and 2-step lags of the three-term interactions, computed within each area.
for col, step in itertools.product(['temp_humid_solaramt', "temp_solar_amt_time"], [1, 2]):
    lagged = test_lag.groupby('area')[col].shift(periods=step)
    test_lag[col + '_lag_' + str(step)] = lagged

In [34]:
# winter: fog code 'C' -> 1, 'R' and 'S' -> 2, every other code -> 3,
# with the snow adjustment enabled.
f = dict.fromkeys(test.fog.unique(), 3)
f.update(C=1, R=2, S=2)
winter_test = make_weather_df(
    original_df=test_lag,
    surface=test,
    weather='winter',
    adj=True,
    fog_split=f,
)

In [35]:
# List the columns of the winter test frame.
winter_test.columns

Index(['dp_temp', 'humid', 'area', 'year', 'month', 'day', 'hour', 'temp',
       'wind', 'rain', 'tf_rain', 'solar_amt', 'solar_time', 'snow',
       'hour_sin', 'hour_cos', 'temp_diff', 'temp_pct_1', 'temp_pct_2',
       'temp_pct_3', 'temp_pct_6', 'temp_pct_12', 'temp_pct_24',
       'dp_temp_pct_1', 'dp_temp_pct_2', 'dp_temp_pct_3', 'dp_temp_pct_6',
       'dp_temp_pct_12', 'dp_temp_pct_24', 'humid_pct_1', 'humid_pct_2',
       'humid_pct_3', 'humid_pct_6', 'humid_pct_12', 'humid_pct_24',
       'wind_pct_1', 'wind_pct_2', 'wind_pct_3', 'wind_pct_6', 'wind_pct_12',
       'wind_pct_24', 'solar_amt_pct_1', 'solar_amt_pct_2', 'solar_amt_pct_3',
       'solar_amt_pct_6', 'solar_amt_pct_12', 'solar_amt_pct_24', 'temp_humid',
       'temp_wind', 'temp_solar_amt', 'humid_wind', 'humid_solar_amt',
       'wind_solar_amt', 'temp_dp', 'humid_temp_wind', 'temp_solar_amt_time',
       'temp_humid_solaramt', 'rain_count', 'snow_count', 'solar_amt_sum',
       'after_snow', 'after_rain', 'temp_

In [36]:
# Keep a full copy (including 'area', 'year', 'month') for the export below.
winter_test_copy = winter_test.copy()

In [37]:
# Remove the identifier/time columns before prediction.  Rebinding with
# errors="ignore" (instead of an in-place drop) keeps this cell re-runnable.
winter_test = winter_test.drop(columns=["area", "year", "hour", "month", "day"], errors="ignore")

In [38]:
# Predict on the winter test frame with the fitted ensemble.
y_pred_1 = ensemble_model.predict(winter_test)

In [39]:
# Store the predictions next to the identifier columns in a 'test' column.
winter_test_copy["test"] = y_pred_1

In [40]:
# Export the predictions to a CSV file in the data folder.
winter_test_copy[["area", "year", "month", "test"]].to_csv(path+"winter_0705_ver1.csv")