In [18]:
import pandas as pd
import os 

import matplotlib.pyplot as plt
plt.rcParams['font.family'] ='Malgun Gothic'
plt.rcParams['axes.unicode_minus'] =False

import seaborn as sns
import numpy as np

from pytimekr import pytimekr

import warnings
warnings.filterwarnings(action='ignore')

In [19]:
database = pd.read_csv('./data/electric_train_cp949.csv', encoding='cp949', index_col=0)
database['electric_train.tm'] = pd.to_datetime(database['electric_train.tm'])

print(database.shape)

(7593355, 16)


In [20]:
elec_cols = ['electric_train.'+ a for a in ['tm', 'hh24', 'weekday', 'week_name', 'sum_qctr', 'n', 'sum_load', 'n_mean_load', 'elec']]

weat_cols = ['electric_train.'+ a for a in ['num', 'stn', 'nph_ta', 'nph_hm', 'nph_ws_10m', 'nph_rn_60m', 'nph_ta_chi']]

reset_order_cols = elec_cols + weat_cols

df_new = database[reset_order_cols]
colunms = {}
for col in reset_order_cols:
    colunms[col] = col.split('.')[1]

df_new = df_new.rename(columns=colunms)

df_new['year'] = df_new['tm'].dt.year
df_new['month'] = df_new['tm'].dt.month
df_new['day'] = df_new['tm'].dt.day
df_new = df_new.sort_values(by='tm')

# 시즌을 결정하는 함수
def get_season(month):
    if month in [12, 1, 2]:
        return 'Winter'
    elif month in [3, 4, 5]:
        return 'Spring'
    elif month in [6, 7, 8]:
        return 'Summer'
    elif month in [9, 10, 11]:
        return 'Fall'

# 'season' 컬럼 추가
df_new['season'] = df_new['tm'].dt.month.apply(get_season)


df_new.head()

Unnamed: 0,tm,hh24,weekday,week_name,sum_qctr,n,sum_load,n_mean_load,elec,num,stn,nph_ta,nph_hm,nph_ws_10m,nph_rn_60m,nph_ta_chi,year,month,day,season
2647991,2020-01-01 01:00:00,1,2,0,63850,39,9732.96,250.490543,99.63,13615,140,-8.5,74.5,0.9,0.0,-5.8,2020,1,1,Winter
5208382,2020-01-01 01:00:00,1,2,0,14750,20,2401.12,116.010234,103.49,18235,565,-8.8,25.6,1.9,0.0,-4.6,2020,1,1,Winter
5182078,2020-01-01 01:00:00,1,2,0,71630,32,12139.16,363.252769,104.43,18234,565,-8.8,25.6,1.9,0.0,-4.6,2020,1,1,Winter
5155774,2020-01-01 01:00:00,1,2,0,45840,28,6723.84,225.924357,106.29,18233,512,-6.3,31.3,2.5,0.0,-7.8,2020,1,1,Winter
1280137,2020-01-01 01:00:00,1,2,0,76450,44,12949.24,287.25077,102.45,11272,941,2.4,68.0,1.2,0.0,-5.3,2020,1,1,Winter


### 공휴일은 주말로 표기

In [21]:
kr_holidays_2021 = pytimekr.holidays(year=2021)
kr_holidays_2022 = pytimekr.holidays(year=2022)
kr_holidays_2023 = pytimekr.holidays(year=2023)

df_new.loc[df_new['tm'].isin(kr_holidays_2021),'week_name']=1
df_new.loc[df_new['tm'].isin(kr_holidays_2022),'week_name']=1
df_new.loc[df_new['tm'].isin(kr_holidays_2023),'week_name']=1

In [22]:
cols_for_test = [
        'tm', 'year', 'season', 'month','day', 'hh24', 'weekday','week_name', 
        'num',
        'stn', 'nph_ta','nph_hm', 'nph_ws_10m',
       'nph_rn_60m', 'nph_ta_chi', 'elec']

df_train = df_new[cols_for_test]

df_train.head()
# weekday: 요일, weel_name: 주중, 주말

Unnamed: 0,tm,year,season,month,day,hh24,weekday,week_name,num,stn,nph_ta,nph_hm,nph_ws_10m,nph_rn_60m,nph_ta_chi,elec
2647991,2020-01-01 01:00:00,2020,Winter,1,1,1,2,0,13615,140,-8.5,74.5,0.9,0.0,-5.8,99.63
5208382,2020-01-01 01:00:00,2020,Winter,1,1,1,2,0,18235,565,-8.8,25.6,1.9,0.0,-4.6,103.49
5182078,2020-01-01 01:00:00,2020,Winter,1,1,1,2,0,18234,565,-8.8,25.6,1.9,0.0,-4.6,104.43
5155774,2020-01-01 01:00:00,2020,Winter,1,1,1,2,0,18233,512,-6.3,31.3,2.5,0.0,-7.8,106.29
1280137,2020-01-01 01:00:00,2020,Winter,1,1,1,2,0,11272,941,2.4,68.0,1.2,0.0,-5.3,102.45


In [23]:
# 온도별 elec의 이상치 처리 
for idx in list(df_train.loc[df_train['elec'] < 0, 'nph_ta'].index):
    area_num = df_train.loc[idx]['num']
    season = df_train.loc[idx]['season']
    year = df_train.loc[idx]['year']
    temp = df_train.loc[idx]['nph_ta']
    df_train.loc[idx, 'elec'] = \
        df_train.loc[(df_train.loc[idx]['num'] == area_num) & (df_train.loc[idx]['year'] == year) & (df_train.loc[idx]['season'] == season) & (df_train['nph_ta'] == temp), :]['elec'].mean()
        # 해당 지역의 해당 연도의 해당 시즌의 해당 온도와 같은 날의 elec의 평균

In [24]:
# 풍속이 음수인 값을 갖는 이상치 처리

ano_list = list(df_train.loc[df_train['nph_ws_10m'] < 0, 'nph_ws_10m'].index)
for idx in ano_list:
    pre_idx = idx - 1
    nex_idx = idx + 1

    previous_value = df_train.loc[pre_idx, 'nph_ws_10m']
    next_value = df_train.loc[nex_idx, 'nph_ws_10m']
    mean_val = (previous_value + next_value) / 2
    df_train.loc[idx, 'nph_ws_10m'] = mean_val

In [None]:
## https://dacon.io/competitions/official/235680/codeshare/2366?page=1&dtype=recent
## sin cos 함수를 이용한 시간의 연속적 표현 (cyclical time encoding)
df_train['sin_time'] = np.sin(2*np.pi*df_train.hour/24)
df_train['cos_time'] = np.cos(2*np.pi*df_train.hour/24)

In [16]:
# from tqdm import tqdm
# #######################################
# ## 건물별, 요일별, 시간별 발전량 평균 넣어주기
# #######################################
# power_mean = pd.pivot_table(train, values = 'elec', index = ['num', 'hh24', 'weekday'], aggfunc = np.mean).reset_index()
# tqdm.pandas()
# train['weekday_hour_mean'] = train.progress_apply(lambda x : power_mean.loc[(power_mean.num == x['num']) & (power_mean.hh24 == x['hh24']) & (power_mean.weekday == x['weekday']) ,'elec'].values[0], axis = 1)

# #######################################
# ## 건물별 시간별 발전량 평균 넣어주기
# #######################################
# power_hour_mean = pd.pivot_table(train, values = 'elec', index = ['num', 'hh24'], aggfunc = np.mean).reset_index()
# tqdm.pandas()
# train['hour_mean'] = train.progress_apply(lambda x : power_hour_mean.loc[(power_hour_mean.num == x['num']) & (power_hour_mean.hh24 == x['hh24']) ,'elec'].values[0], axis = 1)

# #######################################
# ## 건물별 시간별 발전량 표준편차 넣어주기
# #######################################
# power_hour_std = pd.pivot_table(train, values = 'elec', index = ['num', 'hh24'], aggfunc = np.std).reset_index()
# tqdm.pandas()
# train['hour_std'] = train.progress_apply(lambda x : power_hour_std.loc[(power_hour_std.num == x['num']) & (power_hour_std.hh24 == x['hh24']) ,'elec'].values[0], axis = 1)



  6%|▌         | 463654/7593355 [03:40<53:04, 2239.10it/s]  

In [12]:
df_train.head()

Unnamed: 0,tm,year,season,month,day,hh24,weekday,week_name,num,stn,nph_ta,nph_hm,nph_ws_10m,nph_rn_60m,nph_ta_chi,elec
2647991,2020-01-01 01:00:00,2020,Winter,1,1,1,2,0,13615,140,-8.5,74.5,0.9,0.0,-5.8,99.63
5208382,2020-01-01 01:00:00,2020,Winter,1,1,1,2,0,18235,565,-8.8,25.6,1.9,0.0,-4.6,103.49
5182078,2020-01-01 01:00:00,2020,Winter,1,1,1,2,0,18234,565,-8.8,25.6,1.9,0.0,-4.6,104.43
5155774,2020-01-01 01:00:00,2020,Winter,1,1,1,2,0,18233,512,-6.3,31.3,2.5,0.0,-7.8,106.29
1280137,2020-01-01 01:00:00,2020,Winter,1,1,1,2,0,11272,941,2.4,68.0,1.2,0.0,-5.3,102.45
