In [55]:
import numpy as np
import pandas as pd
import pywt
from statsmodels.robust import mad
import matplotlib.pyplot as plt
import seaborn as sns

import itertools
from sklearn.ensemble import VotingRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.compose import TransformedTargetRegressor
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.early_stop import no_progress_loss
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from skopt import gp_minimize
from skopt.space import Real, Integer
from skopt.utils import use_named_args
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings('ignore')

plt.rc('font', family='NanumGothic') # 한글 표현 for Windows
plt.rcParams['axes.unicode_minus'] = False  # 마이너스 부호 표현
%matplotlib inline
pd.set_option('display.max_rows', None)

In [None]:
# Cross-Validation
from sklearn.model_selection._split import _BaseKFold, indexable, _num_samples
from sklearn.utils.validation import _deprecate_positional_args

class GroupTimeSeriesSplit(_BaseKFold):
    @_deprecate_positional_args
    def __init__(self,
                 n_splits=5,
                 *,
                 max_train_size=None
                 ):
        super().__init__(n_splits, shuffle=False, random_state=None)
        self.max_train_size = max_train_size

    def split(self, X, y=None, groups=None):
        if groups is None:
            raise ValueError(
                "The 'groups' parameter should not be None")
        X, y, groups = indexable(X, y, groups)
        n_samples = _num_samples(X)
        n_splits = self.n_splits
        n_folds = n_splits + 1
        group_dict = {}
        u, ind = np.unique(groups, return_index=True)
        unique_groups = u[np.argsort(ind)]
        n_samples = _num_samples(X)
        n_groups = _num_samples(unique_groups)
        for idx in np.arange(n_samples):
            if (groups[idx] in group_dict):
                group_dict[groups[idx]].append(idx)
            else:
                group_dict[groups[idx]] = [idx]
        if n_folds > n_groups:
            raise ValueError(
                ("Cannot have number of folds={0} greater than"
                 " the number of groups={1}").format(n_folds,
                                                     n_groups))
        group_test_size = n_groups // n_folds
        group_test_starts = range(n_groups - n_splits * group_test_size,
                                  n_groups, group_test_size)
        for group_test_start in group_test_starts:
            train_array = []
            test_array = []
            for train_group_idx in unique_groups[:group_test_start]:
                train_array_tmp = group_dict[train_group_idx]
                train_array = np.sort(np.unique(
                                      np.concatenate((train_array,
                                                      train_array_tmp)),
                                      axis=None), axis=None)
            train_end = train_array.size
            if self.max_train_size and self.max_train_size < train_end:
                train_array = train_array[train_end -
                                          self.max_train_size:train_end]
            for test_group_idx in unique_groups[group_test_start:
                                                group_test_start +
                                                group_test_size]:
                test_array_tmp = group_dict[test_group_idx]
                test_array = np.sort(np.unique(
                                              np.concatenate((test_array,
                                                              test_array_tmp)),
                                     axis=None), axis=None)
            yield [int(i) for i in train_array], [int(i) for i in test_array]

In [56]:
# 웨이블릿 변환 코드
def WT(df, col, wavelet='db5', thresh=0.63):
    signal = df[col].values
    thresh = thresh*np.nanmax(signal)
    coeff = pywt.wavedec(signal, wavelet, mode="per" )
    coeff[1:] = (pywt.threshold(i, value=thresh, mode="soft" ) for i in coeff[1:])
    reconstructed_signal = pywt.waverec(coeff, wavelet, mode="per" )
    return reconstructed_signal

In [57]:
def gen_feature(df):
    
    # abnormal value adjustment
    df['solar_amt'] = df['solar_amt'].apply(lambda x: 0 if x<0 else x)
    df['solar_time'] = df['solar_time'].apply(lambda x: 0 if x<0 else x)
    df['rain'] = df['rain'].apply(lambda x: 0 if x<0 else x)
    df['tf_rain'] = df['tf_rain'].apply(lambda x: 0 if x<0 else x/60)
    df['humid'] = df['humid'].apply(lambda x: 0 if x<0 else x/100)
    df['snow'] = df['snow'].apply(lambda x: 0 if x<=-99 else x)
    
    # preprocessing
    df['hour_sin'] = np.sin(2 * np.pi * df['hour']/24.0)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour']/24.0)
    
    # difference
    df['temp_diff'] = df['temp'] - df['dp_temp']
    
    return df

In [58]:
def process_group(group, wthr):
    group = group.reset_index(drop=True)
    group[f'{wthr}_diff'] = group[wthr].diff()
    group[f'after_{wthr}'] = 0
    idx_list = group[group[f'{wthr}_diff']>0].index
    for idx in idx_list:
        for i in range(24):
            if idx+i < len(group):
                group.loc[idx+i, 'after_'+wthr] = i+1
                
    return group

# 웨이블릿 변환한 dataframe 반환
def gen_wv_df(original_df: pd.DataFrame, t: str):
    # t : 'train' or 'test'
    df = original_df.copy()
    
    wv_df = pd.DataFrame()
    if t == 'train':
        year_ls = ['A','B','C','D','E']
        area_ls = range(1,11)
    elif t == 'test':
        year_ls = ['F']
        area_ls = range(1,4)
    
    # 이슬점과 습도에 대한 웨이블릿 변환 후 dataframe 반환
    for j in year_ls:
        for i in area_ls:
            temp = df[(df.area==i) & (df.year==j)]

            temp_1 = WT(temp, 'dp_temp', wavelet='db5', thresh=0.85)
            temp_2 = WT(temp, 'humid', wavelet='db5', thresh=0.85)

            temp_1 = pd.Series(temp_1, index=temp.index, name='dp_temp')
            temp_2 = pd.Series(temp_2, index=temp.index, name='humid')

            temp_df = pd.concat([temp_1, temp_2, temp.drop(['dp_temp','humid'], axis=1)], axis=1)
            wv_df = pd.concat([wv_df, temp_df], axis=0)
    
    return wv_df

def add_lag(original_df: pd.DataFrame, lag_cols: list, lags: list, t: str):
    df = original_df.copy()
    
    # 웨이블릿 변환한 데이터프레임 반환
    df = gen_wv_df(df, t)
    
    for col in lag_cols:
        for i in lags:
            df[f'{col}_lag_{i}'] = df.groupby(['area'])[col].shift(periods=i)
            
    #cumulative
    df['rain_mask'] = (df['tf_rain'] != 0).astype(int)
    df['rain_group'] = (df['rain_mask'].diff() < 0).astype(int).cumsum()
    df['rain_count'] = df.groupby(['area', 'rain_group'])['rain_mask'].apply(lambda x: x.cumsum() * x).reset_index(drop=True)
    df.drop(['rain_mask', 'rain_group'], axis=1, inplace=True)
    
    df['solar_amt_sum'] = df.groupby('area')['solar_amt'].rolling(window=3).sum().reset_index(drop=True)
    
    df = df.groupby('area').apply(process_group, wthr="snow")
    df.drop('snow_diff', axis=1, inplace = True)
    df.reset_index(drop = True, inplace = True)
    
    df = df.groupby('area').apply(process_group, wthr="rain")
    df.drop('rain_diff', axis=1, inplace = True)
    df.reset_index(drop = True, inplace = True)

    # interaction 추가한 항
    df['temp_dp_temp'] = df[['temp', 'dp_temp']].product(axis=1)
    df['temp_humid'] = df[['temp','humid']].product(axis=1)
    df['temp_wind'] = df[['temp','wind']].product(axis=1)
    
    return df

In [59]:
def make_weather_df(original_df:pd.DataFrame, surface: pd.DataFrame, weather:str, adj:bool, fog_split:dict):
    # adj: snow adjustment 여부 
    # fog split: fog 변수 분할 방법
    
    df = original_df.copy()
    if adj == True:
        # temp에 대한 처리
        idx = df[df.snow!=0].index
        temp = df.loc[idx,'snow'].apply(lambda x: 1+abs(x)*8)
        df.loc[idx,'temp'] = df.loc[idx,'temp']/(temp)
        
        # solar_amt에 대한 처리
        idx_2 = df[(df.snow>=0.1)&(df.solar_amt>0)].index
        df.loc[idx_2, 'solar_amt'] = (df.loc[idx_2,'solar_amt']) / (df.loc[idx_2,'snow'].apply(lambda x: 1+abs(x)))
        
        # 주기성 제거
        df.loc[idx, ['hour_sin','hour_cos']] = 0
        
    if weather == 'spring':
        m_list = range(2, 5)
    elif weather == 'summer':
        m_list = range(5, 8)
    elif weather == 'fall':
        m_list = range(8, 11)
    elif weather == 'winter':
        m_list = [range(11, 13)] + [1] 
    
    df['fog'] = surface['fog'].replace(fog_split)
    df = pd.get_dummies(df, columns = ['fog'])
    
    # 여름, 가을 계절 : 적설량 변수 제거
    if weather in ['summer', 'fall']:
        snow_cols = [x for x in df.columns if x.endswith('snow')]
        df.drop(snow_cols, axis = 1, inplace = True)
    
    # 계절별 데이터 출력
    df = df[df['month'].isin(m_list)].reset_index(drop = True)
    
    # day -> week, month_week 범주화
    df['new_day'] = df['day'].apply(lambda x: 1 if x in range(1,8) else (2 if x in range(8,15) else (3 if x in range(15,22) else 4)))
    df['MMDD'] = df['month'].astype('int').astype('str') + '_' + df['new_day'].astype('int').astype('str').str.zfill(2)
    df = pd.get_dummies(df, columns = ['MMDD'])
    df.drop(['new_day'], axis=1, inplace=True)
        
    return df

In [60]:
# 가중평균 특성변수 생성
def create_weighted_average_feature(data, variables, lags):
    for variable in variables:
        for lag in lags:
            data[f'{variable}_lag{lag}'] = data[variable].shift(lag)
    for variable in variables:
        weights = [1 / lag for lag in lags]
        weighted_average = np.average(data[[f'{variable}_lag{lag}' for lag in lags]], axis=1, weights=weights)
        data[f'{variable}_weighted_avg'] = weighted_average
    data.drop([f'{variable}_lag{lag}' for variable in variables for lag in lags], axis=1, inplace=True)
    return data.dropna()

# create_weighted_average_feature함수를 이용해 가중평균합 변수 생성 & train/val set으로 분리
def gen_train_val(original_df):
    df = original_df.copy()

    # train / val dataframe 생성
    train = df[(df.year<='D') & (df.area.isin([1,2,3,4,5,6,7]))].reset_index(drop=True)
    val = df[(df.year=='E') & (df.area.isin([8,9,10]))].reset_index(drop=True)

    # 가중평균 해줄 변수
    variables = ['temp', 'dp_temp', 'rain', 'solar_amt', 'snow']
    lags = [1,2,3]
        
    # weighted average feature : area별로 생성
    df_train = pd.DataFrame()
    for i in np.unique(train.area).tolist():
        temp = train[train.area==i]        
        temp_ = create_weighted_average_feature(temp, variables, lags)
        df_train = pd.concat([df_train, temp_], axis=0)

    df_val = pd.DataFrame()
    for i in np.unique(val.area).tolist():
        temp = val[val.area==i]
        temp_ = create_weighted_average_feature(temp, variables, lags)
        df_val = pd.concat([df_val, temp_], axis=0)

    return df_train, df_val

# create_weighted_average_feature함수를 이용해 가중평균합 변수 생성 & train/val set으로 분리
# 여기서 train set = train + val
def gen_train_test(original_df):
    df = original_df.copy()

    # train / test dataframe 생성
    train = df[df.year<='E'].reset_index(drop=True)
    test = df[df.year=='F'].reset_index(drop=True)

    # weighted average feature area별로 생성
    df_train = pd.DataFrame()
    for i in np.unique(train.area).tolist():
        temp = train[train.area==i]
        variables = ['temp', 'dp_temp', 'rain', 'solar_amt', 'snow']
        lags = [1,2,3]
        temp_ = create_weighted_average_feature(temp, variables, lags)
        df_train = pd.concat([df_train, temp_], axis=0)

    df_test = pd.DataFrame()
    for i in np.unique(test.area).tolist():
        temp = test[test.area==i]
        temp_ = create_weighted_average_feature(temp, variables, lags)
        df_test = pd.concat([df_test, temp_], axis=0)

    return df_train, df_test

In [61]:
# 지역별 10개의 catboost 모델 튜닝 (Gaussian Process 이용) 후 ensemble 모형 생성
def gen_ensemble_model(train_df):
    
    # feature/target 분리
    X_tv = train_df.drop(['month','day','hour','land_temp'], axis = 1).reset_index(drop=True)
    y_tv = train_df[['area','land_temp']].reset_index(drop=True)
    
    # train area 별로 분리 (1 - 10)
    for i in range(1, 11):
        globals()['X_tv'+str(i)] = X_tv[X_tv['area'] == i].reset_index(drop=True).drop(['area'], axis=1)
        globals()['y_tv'+str(i)] = y_tv.loc[y_tv['area'] == i, 'land_temp'].reset_index(drop=True)
        
    # area 별로 catboost model tuning 후 저장
    for i in range(1, 11):
        print('area', i)
        space = [
            Integer(1,10, name='depth'),
            Real(0.01,0.3, 'uniform', name='learning_rate'),
            Integer(2,30,name='l2_leaf_reg'),
            Integer(100,500,name='iterations')
        ]
        groups = globals()['X_tv'+str(i)].year.reset_index(drop=True).values
        globals()['X_tv'+str(i)].drop(['year'], axis=1, inplace=True)
        
        @use_named_args(space)
        def objective(**params):
            model = CatBoostRegressor(random_seed=0, verbose = False)
            model.set_params(**params)

            gscv = GroupTimeSeriesSplit(n_splits=4)
            mae = []
            
            for train_index, valid_index in gscv.split(globals()['X_tv'+str(i)], groups=groups):
                X_train, X_valid = globals()['X_tv'+str(i)].loc[train_index], globals()['X_tv'+str(i)].loc[valid_index]
                y_train, y_valid = globals()['y_tv'+str(i)].loc[train_index], globals()['y_tv'+str(i)].loc[valid_index]

                model.fit(X_train, y_train, verbose=False)
                y_pred = model.predict(X_valid)
                mae.append(mean_absolute_error(y_valid, y_pred))
                
            return np.mean(mae)
        
        best = gp_minimize(objective, space, n_calls=50, random_state=0)
        
        best_params = {
            'depth': best.x[0],
            'learning_rate': best.x[1],
            'l2_leaf_reg': best.x[2],
            'iterations': best.x[3]
        }
        
        model = CatBoostRegressor(random_seed=0, verbose = False)
        model.set_params(**best_params)
        globals()['model_cat'+str(i)] = model
        
    catboost_models = []
    
    for i in range(1,11):
        catboost_models.append(('catboost_'+str(i), globals()['model_cat'+str(i)]))
        
    ensemble_model = VotingRegressor(estimators = catboost_models)
    
    return ensemble_model

In [62]:
dat = pd.read_excel('./subminssionUser.xlsx', sheet_name=None)['SPRING']

In [63]:
import numpy as np
import pandas as pd

path = 'C:/Users/dongk/Data_Handling/공모전/날씨 빅데이터 콘테스트/datasets/'
surface_train = pd.read_csv(path+'train0624.csv')
surface_train.drop(['Unnamed: 0'], axis=1, inplace=True)
surface_test = pd.read_csv(path+'imputed_test_data0624.csv')
surface_test.insert(11, column='land_temp', value=0)

area 1,2,3 : 3월 23일 14시 결측치

## train set

In [67]:
surface_train_fe = gen_feature(surface_train)

In [68]:
#'temp', 'dp_temp', 'humid' 3개 변수에 대해 1, 2, 12, 24 lag 변수와 해당 lag의 percentage change 추가 + 기타 lag 변수들과 interaction들 추가
surface_train_lag = add_lag(original_df = surface_train_fe, 
                            lag_cols = ['temp', 'dp_temp', 'humid'], #lag 추가하는 변수들 
                            lags = [1,2,12,24],
                            t = 'train') #추가 lag들
#spring: 맑음 / 눈&비 / 기타 구분 & snow에 대한 adjustment 
f = {x:3 for x in np.unique(surface_train.fog)}
f.update({'C':1, 'R':2, 'S':2})
spring_train = make_weather_df(original_df = surface_train_lag,
                               surface = surface_train,
                               weather = 'spring',
                               adj = True, #snow adjustment 여부
                               fog_split = f)

spring_train = spring_train.dropna(axis=0)

## Val set 검증

- train set의 area와 test set의 area가 상이한 특징 
- 이러한 test set의 특성을 반영하기 위해 train set : A,B,C,D년도 & 1,2,3,4,5,6,7 area 로 구성하였고 val set : E년도 & 8,9,10 area로 구성하여 검증을 진행하였다.

In [46]:
train_df, val_df = gen_train_val(spring_train)

In [47]:
groups = train_df.year.reset_index(drop=True).values
X_train = train_df.drop(['year','land_temp','day','area','hour','month'], axis=1).reset_index(drop=True)
y_train = train_df['land_temp'].reset_index(drop=True)

X_val = val_df.drop(['year','land_temp','day','area','hour','month'], axis=1).reset_index(drop=True)
y_val = val_df['land_temp'].reset_index(drop=True)

model = CatBoostRegressor(silent = True, random_state=0)
model.fit(X_train, y_train)

mae_ls = list()
for i in np.unique(val_df.area).tolist():
    globals()[f'X_val_{i}'] = val_df[val_df.area == i].drop(['year','land_temp','day','area','hour','month'], axis=1).reset_index(drop=True)
    globals()[f'y_val_{i}'] = val_df[val_df.area == i]['land_temp'].reset_index(drop=True)

    globals()[f'y_pred_{i}'] = model.predict(globals()[f'X_val_{i}'])
    mae = mean_absolute_error(globals()[f'y_val_{i}'], globals()[f'y_pred_{i}'])
    mae_ls.append(mae)

print(f'mae: {np.round(mae_ls, 4)}')
print(f'total mae: {np.mean(mae_ls):.4f}')

mae: [1.4346 1.5813 1.4805]
total mae: 1.4988


## test set

In [64]:
surface_test_fe = gen_feature(surface_test)

In [65]:
#'temp', 'dp_temp', 'humid' 3개 변수에 대해 1, 2, 12, 24 lag 변수와 해당 lag의 percentage change 추가 + 기타 lag 변수들과 interaction들 추가
surface_test_lag = add_lag(original_df = surface_test_fe, 
                      lag_cols = ['temp', 'dp_temp', 'humid'], #lag 추가하는 변수들 
                      lags = [1, 2, 12, 24],
                      t = 'test') #추가 lag들 
#spring: 맑음 / 눈&비 / 기타 구분 & snow에 대한 adjustment 
f = {x:3 for x in np.unique(surface_test.fog)}
f.update({'C':1, 'R':2, 'S':2})
spring_test = make_weather_df(original_df = surface_test_lag,
                              surface = surface_test,
                              weather = 'spring',
                              adj = True, #snow adjustment 여부 
                              fog_split = f)

spring_test = spring_test.dropna(axis=0)

In [None]:
print(f'train shape: {spring_train.shape}')
print(f'test shape: {spring_test.shape}')

- 전체 데이터를 학습 시키는데 사용할 데이터 구성

In [69]:
spring = pd.concat([spring_train, spring_test], axis=0).reset_index(drop=True)
spring.head(3)

Unnamed: 0,dp_temp,humid,area,year,month,day,hour,temp,wind,rain,...,MMDD_2_03,MMDD_2_04,MMDD_3_01,MMDD_3_02,MMDD_3_03,MMDD_3_04,MMDD_4_01,MMDD_4_02,MMDD_4_03,MMDD_4_04
0,-11.130176,0.72179,1.0,A,2.0,2.0,0.0,-9.0,0.8,0.0,...,0,0,0,0,0,0,0,0,0,0
1,-11.184849,0.733955,1.0,A,2.0,2.0,1.0,-9.9,0.7,0.0,...,0,0,0,0,0,0,0,0,0,0
2,-11.239775,0.740426,1.0,A,2.0,2.0,2.0,-10.7,0.7,0.0,...,0,0,0,0,0,0,0,0,0,0


In [70]:
spring[spring.isna().any(axis=1)]

Unnamed: 0,dp_temp,humid,area,year,month,day,hour,temp,wind,rain,...,MMDD_2_03,MMDD_2_04,MMDD_3_01,MMDD_3_02,MMDD_3_03,MMDD_3_04,MMDD_4_01,MMDD_4_02,MMDD_4_03,MMDD_4_04


In [71]:
# # create weighted average feature
train_df, test_df = gen_train_test(spring)
print(f'train: {train_df.shape}')
print(f'test: {test_df.shape}')

train: (105952, 57)
test: (6327, 57)


In [72]:
test_df['MMDDHH'] = test_df['month'].astype('int').astype('str') + test_df['day'].astype('int').astype('str').str.zfill(2) + test_df['hour'].astype('int').astype('str').str.zfill(2)
test_df['MMDDHH'] = test_df['MMDDHH'].astype('int64')

In [73]:
test_df = test_df.set_index('MMDDHH')
test_df.head(3)

Unnamed: 0_level_0,dp_temp,humid,area,year,month,day,hour,temp,wind,rain,...,MMDD_3_04,MMDD_4_01,MMDD_4_02,MMDD_4_03,MMDD_4_04,temp_weighted_avg,dp_temp_weighted_avg,rain_weighted_avg,solar_amt_weighted_avg,snow_weighted_avg
MMDDHH,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20203,-10.232145,0.56385,1.0,F,2.0,2.0,3.0,-4.4,2.3,0.0,...,0,0,0,0,0,-3.945455,-10.224309,0.017304,0.0,0.0
20204,-10.236587,0.563445,1.0,F,2.0,2.0,4.0,-4.7,1.4,0.0,...,0,0,0,0,0,-4.236364,-10.229201,0.0,0.0,0.0
20205,-10.241075,0.563039,1.0,F,2.0,2.0,5.0,-4.8,1.3,0.0,...,0,0,0,0,0,-4.527273,-10.233768,0.0,0.0,0.0


In [74]:
print(test_df.shape)

(6327, 57)


## 검증데이터 예측

In [None]:
ensemble_model = gen_ensemble_model(train_df)

area 1
area 2
area 3
area 4
area 5
area 6
area 7
area 8


In [None]:
# feature/target 분리
X_tv = train_df.drop(['land_temp','hour','day','month'], axis = 1).reset_index(drop=True)
y_tv = train_df[['area', 'land_temp']].reset_index(drop=True)

# train area 별로 분리 (1 - 10)
for i in range(1, 11):
    globals()['X_tv'+str(i)] = X_tv[X_tv['area'] == i].reset_index(drop=True).drop(['area','year'], axis=1)
    globals()['y_tv'+str(i)] = y_tv.loc[y_tv['area'] == i, 'land_temp'].reset_index(drop=True)

In [None]:
# Fit the ensemble model
ensemble_X_tv = pd.DataFrame()
ensemble_y_tv = pd.DataFrame()

for i in range(1,11):
    ensemble_X_tv = pd.concat([ensemble_X_tv, globals()['X_tv'+str(i)]], axis=0)
    ensemble_y_tv = pd.concat([ensemble_y_tv, globals()['y_tv'+str(i)]], axis=0)

ensemble_model.fit(ensemble_X_tv, ensemble_y_tv)

In [None]:
for i in test_df.area.unique():
    globals()[f'X_test_{i}'] = test_df[test_df.area==i].drop(['land_temp','hour','day','year','area','month'], axis=1)
    globals()[f'y_pred_{i}'] = ensemble_model.predict(globals()[f'X_test_{i}'])
    
    globals()[f'y_pred_{int(i)}'] = pd.Series(globals()[f'y_pred_{i}'], index = globals()[f'X_test_{i}'].index, name='pred')
    globals()[f'y_pred_{int(i)}'] = pd.DataFrame(globals()[f'y_pred_{int(i)}'])
    globals()[f'y_pred_{int(i)}'].reset_index(inplace=True)

In [None]:
pred_df = pd.DataFrame()
for i in range(1,4):
    area = dat.STN.unique().tolist()
    temp = dat[dat.STN==area[i-1]]
    temp_ = pd.merge(temp,globals()[f'y_pred_{int(i)}'], on='MMDDHH', how='left')
    pred_df = pd.concat([pred_df, temp_], axis=0)
    
pred_df.drop(['TS'], axis=1, inplace=True)

In [None]:
pred_df[pred_df.isna().any(axis=1)]

## Spring NA 값 예측 모델

- lag feature로 인해 생기는 nan 값을 예측하기 위해, shifting 시켜주지 않고 데이터 구성
- nan 값이 많지 않아 전체 모델링에 큰 영향을 미치지 않을 것이라 판단

### Train set

In [37]:
surface_train_fe = gen_feature(surface_train)

surface_train_wv = gen_wv_df(surface_train_fe, t: 'train')

#spring: 맑음 / 눈&비 / 기타 구분 & snow에 대한 adjustment 
f = {x:3 for x in np.unique(surface_train.fog)}
f.update({'C':1, 'R':2, 'S':2})
spring_train = make_weather_df(original_df = surface_train_wv,
                               surface = surface_train,
                               weather = 'spring',
                               adj = True, #snow adjustment 하는지 
                               fog_split = f)

### Test set

In [39]:
surface_test_fe = gen_feature(surface_test)

surface_test_wv = gen_wv_df(surface_test_fe, t: 'test')

#spring: 맑음 / 눈&비 / 기타 구분 & snow에 대한 adjustment 
f = {x:3 for x in np.unique(surface_test.fog)}
f.update({'C':1, 'R':2, 'S':2})
spring_test = make_weather_df(original_df = surface_test_wv,
                              surface = surface_test,
                              weather = 'spring',
                              adj = True, #snow adjustment 하는지 
                              fog_split = f)

In [None]:
print(f'train shape: {spring_train.shape}')
print(f'test shape: {spring_test.shape}')

(6408, 33)


In [41]:
train_df, test_df = spring_train.copy(), spring_test.copy()

test_df['MMDDHH'] = test_df['month'].astype('int').astype('str') + test_df['day'].astype('int').astype('str').str.zfill(2) + test_df['hour'].astype('int').astype('str').str.zfill(2)
test_df['MMDDHH'] = test_df['MMDDHH'].astype('int64')

test_df = test_df.set_index('MMDDHH')

### na 모델 생성 및 예측

In [111]:
ensemble_na_model = gen_ensemble_model(train_df)

area 1
area 2
area 3
area 4
area 5
area 6
area 7
area 8
area 9
area 10


In [42]:
# feature/target 분리
X_tv = train_df.drop(['land_temp','hour','day','month'], axis = 1).reset_index(drop=True)
y_tv = train_df[['area', 'land_temp']].reset_index(drop=True)

# train area 별로 분리 (1 - 10)
for i in range(1, 11):
    globals()['X_tv'+str(i)] = X_tv[X_tv['area'] == i].reset_index(drop=True).drop(['area','year'], axis=1)
    globals()['y_tv'+str(i)] = y_tv.loc[y_tv['area'] == i, 'land_temp'].reset_index(drop=True)

# Fit the ensemble model
ensemble_X_tv = pd.DataFrame()
ensemble_y_tv = pd.DataFrame()

for i in range(1,11):
    ensemble_X_tv = pd.concat([ensemble_X_tv, globals()['X_tv'+str(i)]], axis=0)
    ensemble_y_tv = pd.concat([ensemble_y_tv, globals()['y_tv'+str(i)]], axis=0)

ensemble_na_model.fit(ensemble_X_tv, ensemble_y_tv)

for i in test_df.area.unique():
    globals()[f'X_test_{i}'] = test_df[test_df.area==i].drop(['land_temp','hour','day','year','area','month'], axis=1)
    globals()[f'y_pred_{i}'] = ensemble_na_model.predict(globals()[f'X_test_{i}'])
    
    globals()[f'y_pred_{int(i)}'] = pd.Series(globals()[f'y_pred_{i}'], index = globals()[f'X_test_{i}'].index, name='pred')
    globals()[f'y_pred_{int(i)}'] = pd.DataFrame(globals()[f'y_pred_{int(i)}'])
    globals()[f'y_pred_{int(i)}'].reset_index(inplace=True)
    
pred_nona = pd.DataFrame()
for i in range(1,4):
    area = dat.STN.unique().tolist()
    temp = dat[dat.STN==area[i-1]]
    temp_ = pd.merge(temp,globals()[f'y_pred_{int(i)}'], on='MMDDHH', how='left')
    pred_nona = pd.concat([pred_nona, temp_], axis=0)
    
pred_nona.drop(['TS'], axis=1, inplace=True)

In [43]:
pred_nona[pred_nona.isna().any(axis=1)]

Unnamed: 0,STN,YEAR,MMDDHH,pred


In [44]:
pred_nona.shape

(6405, 4)

In [None]:
# lag변수로 인해 생기는 결측치 대체
idx = pred_df[pred_df.isna().any(axis=1)].index
pred_df.loc[idx,'pred'] = pred_nona.loc[idx,'pred']
pred_df[pred_df.isna().any(axis=1)]

In [45]:
pred_df.to_csv('제출 df.csv')