# 준비

In [20]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [5]:
train = pd.read_csv("data/train.csv", encoding="cp949")
test = pd.read_csv("data/test.csv", encoding="cp949")
submission = pd.read_csv("data/sample_submission.csv", encoding="cp949")

In [8]:
train.tail(3)

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유
122397,60,2020-08-24 21,3572.208,27.3,1.8,71.0,0.0,0.0,1.0,1.0
122398,60,2020-08-24 22,3299.184,27.1,1.8,74.0,0.0,0.0,1.0,1.0
122399,60,2020-08-24 23,3204.576,27.1,2.6,75.0,0.0,0.0,1.0,1.0


In [9]:
test.tail(3)

Unnamed: 0,num,date_time,기온(°C),풍속(m/s),습도(%),"강수량(mm, 6시간)","일조(hr, 3시간)",비전기냉방설비운영,태양광보유
10077,60,2020-08-31 21,27.9,4.1,68.0,,0.0,1.0,1.0
10078,60,2020-08-31 22,,,,,,,
10079,60,2020-08-31 23,,,,,,,


* num : 건물번호

# 전처리

## 변수 영문명 변경

In [10]:
cols = ['num', 'date_time', 'power', 'temp', 'wind','hum' ,'prec', 'sun', 'non_elec', 'solar']
train.columns = cols

## 시간 요일피처 생성

In [12]:
date = pd.to_datetime(train.date_time)
train['hour'] = date.dt.hour
train['day'] = date.dt.weekday
train['month'] = date.dt.month
train['week'] = date.dt.weekofyear

  train['week'] = date.dt.weekofyear


In [13]:
train

Unnamed: 0,num,date_time,power,temp,wind,hum,prec,sun,non_elec,solar,hour,day,month,week
0,1,2020-06-01 00,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0,0,0,6,23
1,1,2020-06-01 01,8135.640,17.7,2.9,91.0,0.3,0.0,0.0,0.0,1,0,6,23
2,1,2020-06-01 02,8107.128,17.5,3.2,91.0,0.0,0.0,0.0,0.0,2,0,6,23
3,1,2020-06-01 03,8048.808,17.1,3.2,91.0,0.0,0.0,0.0,0.0,3,0,6,23
4,1,2020-06-01 04,8043.624,17.0,3.3,92.0,0.0,0.0,0.0,0.0,4,0,6,23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122395,60,2020-08-24 19,4114.368,27.8,2.3,68.0,0.0,0.7,1.0,1.0,19,0,8,35
122396,60,2020-08-24 20,3975.696,27.3,1.2,71.0,0.0,0.0,1.0,1.0,20,0,8,35
122397,60,2020-08-24 21,3572.208,27.3,1.8,71.0,0.0,0.0,1.0,1.0,21,0,8,35
122398,60,2020-08-24 22,3299.184,27.1,1.8,74.0,0.0,0.0,1.0,1.0,22,0,8,35


## 건물별, 요일별, 시간별 발전량 평균

In [21]:
power_mean = pd.pivot_table(train, values = 'power', index = ['num', 'hour', 'day'], aggfunc = np.mean).reset_index()
tqdm.pandas()
train['day_hour_mean'] = train.progress_apply(lambda x : power_mean.loc[(power_mean.num == x['num']) & (power_mean.hour == x['hour']) & (power_mean.day == x['day']) ,'power'].values[0], axis = 1)

100%|████████████████████████████████████████████████████████████████████████| 122400/122400 [01:04<00:00, 1885.54it/s]


## 건물별, 시간별 발전량 평균

In [25]:
power_hour_mean = pd.pivot_table(train, values = 'power', index = ['num', 'hour'], aggfunc = np.mean).reset_index()
tqdm.pandas()
train['hour_mean'] = train.progress_apply(lambda x : power_hour_mean.loc[(power_hour_mean.num == x['num']) & (power_hour_mean.hour == x['hour']) ,'power'].values[0], axis = 1)

100%|████████████████████████████████████████████████████████████████████████| 122400/122400 [00:44<00:00, 2750.61it/s]


In [26]:
train

Unnamed: 0,num,date_time,power,temp,wind,hum,prec,sun,non_elec,solar,hour,day,month,week,day_hour_mean,hour_mean
0,1,2020-06-01 00,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0,0,0,6,23,8528.627077,8540.373176
1,1,2020-06-01 01,8135.640,17.7,2.9,91.0,0.3,0.0,0.0,0.0,1,0,6,23,8513.723077,8517.174776
2,1,2020-06-01 02,8107.128,17.5,3.2,91.0,0.0,0.0,0.0,0.0,2,0,6,23,8496.625846,8509.055718
3,1,2020-06-01 03,8048.808,17.1,3.2,91.0,0.0,0.0,0.0,0.0,3,0,6,23,8480.076923,8493.313129
4,1,2020-06-01 04,8043.624,17.0,3.3,92.0,0.0,0.0,0.0,0.0,4,0,6,23,8472.051692,8479.522165
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122395,60,2020-08-24 19,4114.368,27.8,2.3,68.0,0.0,0.7,1.0,1.0,19,0,8,35,3536.584615,3552.788329
122396,60,2020-08-24 20,3975.696,27.3,1.2,71.0,0.0,0.0,1.0,1.0,20,0,8,35,3467.364923,3487.246306
122397,60,2020-08-24 21,3572.208,27.3,1.8,71.0,0.0,0.0,1.0,1.0,21,0,8,35,3359.298462,3412.515388
122398,60,2020-08-24 22,3299.184,27.1,1.8,74.0,0.0,0.0,1.0,1.0,22,0,8,35,3228.369231,3269.548800


## 건물별 시간별 발전량 표준편차

In [29]:
power_hour_std = pd.pivot_table(train, values = 'power', index = ['num', 'hour'], aggfunc = np.std).reset_index()
tqdm.pandas()
train['hour_std'] = train.progress_apply(lambda x : power_hour_std.loc[(power_hour_std.num == x['num']) & (power_hour_std.hour == x['hour']) ,'power'].values[0], axis = 1)

100%|████████████████████████████████████████████████████████████████████████| 122400/122400 [00:44<00:00, 2741.26it/s]


## 공휴일 변수 추가

In [30]:
train['holiday'] = train.apply(lambda x : 0 if x['day']<5 else 1, axis = 1)

In [43]:
holiday = pd.read_csv("holiday2004-2022.csv")

In [47]:
holiday_list = holiday["date"].to_list()

In [48]:
def holiday_check(x):
    if x in holiday_list:
        return 1
    else:
        return 0

In [53]:
train["date"] = train["date_time"].map(lambda x: x.split()[0])

In [56]:
train["holiday2"] = train["date"].map(lambda x: holiday_check(x))

In [57]:
train["holiday"] = train["holiday"]+train["holiday2"]

In [61]:
train.loc[train["holiday"]==2, "holiday"]=1

In [67]:
train = train.drop(["holiday2","date"], axis=1)

## 시간 -> sin, cos

In [68]:
train['sin_time'] = np.sin(2*np.pi*train.hour/24)
train['cos_time'] = np.cos(2*np.pi*train.hour/24)

## 불쾌지수

In [69]:
train['THI'] = 9/5*train['temp'] - 0.55*(1-train['hum']/100)*(9/5*train['hum']-26)+32

In [71]:
def CDH(xs):
    ys = []
    for i in range(len(xs)):
        if i < 11:
            ys.append(np.sum(xs[:(i+1)]-26))
        else:
            ys.append(np.sum(xs[(i-11):(i+1)]-26))
    return np.array(ys)

cdhs = np.array([])
for num in range(1,61,1):
    temp = train[train['num'] == num]
    cdh = CDH(temp['temp'].values)
    cdhs = np.concatenate([cdhs, cdh])
train['CDH'] = cdhs

train.drop(['non_elec','solar','hour'], axis = 1, inplace = True)

Unnamed: 0,num,date_time,power,temp,wind,hum,prec,sun,day,month,week,day_hour_mean,hour_mean,hour_std,holiday,sin_time,cos_time,THI,CDH
0,1,2020-06-01 00,8179.056,17.6,2.5,92.0,0.8,0.0,0,6,23,8528.627077,8540.373176,118.793252,0,0.0,1.0,57.5376,-8.4
1,1,2020-06-01 01,8135.64,17.7,2.9,91.0,0.3,0.0,0,6,23,8513.723077,8517.174776,137.989738,0,0.258819,0.965926,57.0389,-16.7
2,1,2020-06-01 02,8107.128,17.5,3.2,91.0,0.0,0.0,0,6,23,8496.625846,8509.055718,122.381197,0,0.5,0.866025,56.6789,-25.2
3,1,2020-06-01 03,8048.808,17.1,3.2,91.0,0.0,0.0,0,6,23,8480.076923,8493.313129,122.054777,0,0.707107,0.707107,55.9589,-34.1
4,1,2020-06-01 04,8043.624,17.0,3.3,92.0,0.0,0.0,0,6,23,8472.051692,8479.522165,124.472447,0,0.866025,0.5,56.4576,-43.1


In [76]:
train

Unnamed: 0,num,date_time,power,temp,wind,hum,prec,sun,day,month,week,day_hour_mean,hour_mean,hour_std,holiday,sin_time,cos_time,THI,CDH
0,1,2020-06-01 00,8179.056,17.6,2.5,92.0,0.8,0.0,0,6,23,8528.627077,8540.373176,118.793252,0,0.000000,1.000000,57.5376,-8.4
1,1,2020-06-01 01,8135.640,17.7,2.9,91.0,0.3,0.0,0,6,23,8513.723077,8517.174776,137.989738,0,0.258819,0.965926,57.0389,-16.7
2,1,2020-06-01 02,8107.128,17.5,3.2,91.0,0.0,0.0,0,6,23,8496.625846,8509.055718,122.381197,0,0.500000,0.866025,56.6789,-25.2
3,1,2020-06-01 03,8048.808,17.1,3.2,91.0,0.0,0.0,0,6,23,8480.076923,8493.313129,122.054777,0,0.707107,0.707107,55.9589,-34.1
4,1,2020-06-01 04,8043.624,17.0,3.3,92.0,0.0,0.0,0,6,23,8472.051692,8479.522165,124.472447,0,0.866025,0.500000,56.4576,-43.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122395,60,2020-08-24 19,4114.368,27.8,2.3,68.0,0.0,0.7,0,8,35,3536.584615,3552.788329,316.517477,0,-0.965926,0.258819,65.0736,30.8
122396,60,2020-08-24 20,3975.696,27.3,1.2,71.0,0.0,0.0,0,8,35,3467.364923,3487.246306,296.244268,0,-0.866025,0.500000,64.9029,32.3
122397,60,2020-08-24 21,3572.208,27.3,1.8,71.0,0.0,0.0,0,8,35,3359.298462,3412.515388,263.115812,0,-0.707107,0.707107,64.9029,32.5
122398,60,2020-08-24 22,3299.184,27.1,1.8,74.0,0.0,0.0,0,8,35,3228.369231,3269.548800,256.238151,0,-0.500000,0.866025,65.4504,31.3


In [None]:

AIzaSyBJ6Eddm_k0UQ57GjwOZQW__XDm7tefrNA