In [50]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [51]:
train = pd.read_csv('train_pr.csv')
test = pd.read_csv('test_pr.csv')

In [52]:
train['date_time'] = pd.to_datetime(train['date_time'])
test['date_time'] = pd.to_datetime(test['date_time'])
train['month'] = train['date_time'].dt.month
train['weekday'] = train['date_time'].dt.weekday
train['day'] = train['date_time'].dt.day
train['hour'] = train['date_time'].dt.hour
test['month'] = test['date_time'].dt.month
test['weekday'] = test['date_time'].dt.weekday
test['day'] = test['date_time'].dt.day
test['hour'] = test['date_time'].dt.hour

In [53]:
def change_h(x) :
    
    if x ==23  or 0<= x < 9 : return 0 
    elif x ==9 or x == 12 or 17<= x < 23:  return 1
    elif 10 <= x <12 or 13<= x < 17 : return 2

train['day_c'] = train['hour'].apply(change_h)
test['day_c'] = test['hour'].apply(change_h)

train.loc[(train['weekday']==5) & (train['day_c']==2),'day_c'] = 1
train.loc[(train['weekday']==6),'day_c'] = 0

test.loc[(test['weekday']==5) & (test['day_c']==2),'day_c'] = 1
test.loc[(test['weekday']==6),'day_c'] = 0

In [54]:
def holiday(x):
    if str(x.date()) == '2020-06-06' or '2020-08-15' or '2020-08-17':
        return 1
    elif x.weekday() == 5 or x.weekday() == 6:
        return 1
    else: 
        return 0

train['holiday'] = train['date_time'].apply(lambda x: holiday(x))
test['holiday'] = test['date_time'].apply(lambda x: holiday(x))

In [55]:
test = test.rename(columns={'강수량(mm, 6시간)':'강수량(mm)'})
test = test.rename(columns={'일조(hr, 3시간)':'일조(hr)'})

#강수량의 경우 수치가 0인 경우가 과반수를 넘어 카테고리형태로 변환
def binary(x,y):
    x[y].astype(int)
    for i in range(len(x)):
        if x[y][i] == 0:
            x[y][i] = 0
        else: 
            x[y][i] = 1

binary(train, '강수량(mm)')
binary(test, '강수량(mm)')

In [57]:
sun = train.groupby('num')['태양광보유'].unique()
non_ele = train.groupby('num')['비전기냉방설비운영'].unique()

In [58]:
test['태양광보유'] = test['num'].map(sun).astype(int)
test['비전기냉방설비운영'] = test['num'].map(non_ele).astype(int)

In [59]:
train['불쾌지수'] = 9/5*train['기온(°C)'] - 0.55*(1-train['습도(%)']/100)*(9/5*train['기온(°C)']-26) + 32
test['불쾌지수'] = 9/5*test['기온(°C)'] - 0.55*(1-test['습도(%)']/100)*(9/5*test['기온(°C)']-26) + 32

train['불쾌지수_척도'] = pd.cut(train['불쾌지수'], bins=[0, 68, 75, 80, 200], labels = [1,2,3,4])
test['불쾌지수_척도'] = pd.cut(test['불쾌지수'], bins=[0, 68, 75, 80, 200], labels = [1,2,3,4])

In [60]:
train['체감온도(°C)'] = 13.12 + 0.6215 * train['기온(°C)'] - 11.37 * (train['풍속(m/s)']*3.6)**0.16 + 0.3965 * (train['풍속(m/s)']*3.6)**0.16 * train['기온(°C)']
test['체감온도(°C)'] = 13.12 + 0.6215 * test['기온(°C)'] - 11.37 * (test['풍속(m/s)']*3.6)**0.16 + 0.3965 * (test['풍속(m/s)']*3.6)**0.16 * test['기온(°C)']

In [61]:
train['체감온도(°C)'] = train['체감온도(°C)'].round(1)
test['체감온도(°C)'] = test['체감온도(°C)'].round(1)

In [63]:
import plotly.express as px
px.line(train, x='hour', y='전력사용량(kWh)', color='불쾌지수_척도')

In [64]:
px.line(train, x='hour', y='전력사용량(kWh)', color='체감온도(°C)')

In [65]:
answer = test['전력사용량(kWh)']

In [66]:
train2 = train.drop(['전력사용량(kWh)','date_time'], axis=1)
test2 = test.drop(['date_time','전력사용량(kWh)'], axis=1)

In [67]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(train2)
train2_scaled = pd.DataFrame(scaler.transform(train2), columns=train2.columns)
test2_scaled = pd.DataFrame(scaler.transform(test2), columns=test2.columns)

In [68]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import KFold
li = LGBMRegressor(iterator=10000, random_state=42, learning_rate=0.2, n_estimators=1500, max_depth=7)

In [69]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
i = 0
result2 = 0
for train_idx, test_idx in kf.split(train2_scaled, train['전력사용량(kWh)']):
    x_train, x_val = train2_scaled.iloc[train_idx], train2_scaled.iloc[test_idx]
    y_train, y_val = train['전력사용량(kWh)'][train_idx], train['전력사용량(kWh)'][test_idx]
    li.fit(x_train, y_train, eval_set=[(x_val, y_val)], verbose=100, early_stopping_rounds=300)
    result2 += li.predict(test2_scaled)/5
    i += 1
    print(f'----------------------------------------n_fold:{i}----------------------------------------')

Training until validation scores don't improve for 300 rounds
[100]	valid_0's l2: 19420.5
[200]	valid_0's l2: 19275.6
[300]	valid_0's l2: 19395.7
Early stopping, best iteration is:
[46]	valid_0's l2: 18454.9
----------------------------------------n_fold:1----------------------------------------
Training until validation scores don't improve for 300 rounds
[100]	valid_0's l2: 23977.7
[200]	valid_0's l2: 23610.7
[300]	valid_0's l2: 23838.5
[400]	valid_0's l2: 24156.2
Early stopping, best iteration is:
[192]	valid_0's l2: 23528
----------------------------------------n_fold:2----------------------------------------
Training until validation scores don't improve for 300 rounds
[100]	valid_0's l2: 14503.4
[200]	valid_0's l2: 15014.6
[300]	valid_0's l2: 15375.9
Early stopping, best iteration is:
[82]	valid_0's l2: 14320
----------------------------------------n_fold:3----------------------------------------
Training until validation scores don't improve for 300 rounds
[100]	valid_0's l2: 19

In [70]:
def SMAPE(true, pred):
    return np.mean((np.abs(true-pred))*2/(np.abs(true) + np.abs(pred)))*100

print(SMAPE(answer,result2))

11.596021618321517


In [71]:
from xgboost import XGBRegressor
xgb = XGBRegressor(learning_rate=0.1,
                   n_estimators=1500,
                   colsample_bytree=0.9, 
                   subsample=0.7, 
                   n_jobs=-1,
                   scale_pos_weight=1.4,
                   tree_method='auto')

In [72]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
i = 0
result = 0
best_score = 0
for train_idx, test_idx in kf.split(train2_scaled, train['전력사용량(kWh)']):
    x_train, x_val = train2_scaled.iloc[train_idx], train2_scaled.iloc[test_idx]
    y_train, y_val = train['전력사용량(kWh)'][train_idx], train['전력사용량(kWh)'][test_idx]
    xgb.fit(x_train, y_train, eval_set=[(x_val, y_val)], verbose=50, early_stopping_rounds=100)
    best_socre = xgb.best_score/5
    result += xgb.predict(test2_scaled)/5
    i += 1
    print(f'----------------------------------------n_fold:{i}----------------------------------------')

[0]	validation_0-rmse:1402.77893
[50]	validation_0-rmse:118.82623
[100]	validation_0-rmse:106.97968
[150]	validation_0-rmse:104.88599
[200]	validation_0-rmse:103.73743
[250]	validation_0-rmse:103.31482
[300]	validation_0-rmse:103.23281
[350]	validation_0-rmse:103.13125
[400]	validation_0-rmse:103.10684
[450]	validation_0-rmse:103.05820
[500]	validation_0-rmse:102.98946
[550]	validation_0-rmse:102.99277
[600]	validation_0-rmse:103.00494
[615]	validation_0-rmse:103.01358
----------------------------------------n_fold:1----------------------------------------
[0]	validation_0-rmse:1389.44482



Use subset (sliced data) of np.ndarray is not recommended because it will generate extra copies and increase memory consumption



[50]	validation_0-rmse:145.92804
[100]	validation_0-rmse:129.84351
[150]	validation_0-rmse:125.99238
[200]	validation_0-rmse:123.63782
[250]	validation_0-rmse:122.87347
[300]	validation_0-rmse:122.44272
[350]	validation_0-rmse:122.41792
[400]	validation_0-rmse:122.36134
[450]	validation_0-rmse:122.33903
[500]	validation_0-rmse:122.33614
[526]	validation_0-rmse:122.32942
----------------------------------------n_fold:2----------------------------------------
[0]	validation_0-rmse:1413.91223
[50]	validation_0-rmse:131.39767
[100]	validation_0-rmse:122.89651
[150]	validation_0-rmse:120.93033
[200]	validation_0-rmse:120.51318
[250]	validation_0-rmse:120.17223
[300]	validation_0-rmse:120.01916
[350]	validation_0-rmse:120.06137
[400]	validation_0-rmse:119.94340
[450]	validation_0-rmse:119.88508
[500]	validation_0-rmse:119.85865
[550]	validation_0-rmse:119.86049
[600]	validation_0-rmse:119.87820
[602]	validation_0-rmse:119.88107
----------------------------------------n_fold:3----------------

In [73]:
print(SMAPE(answer,result))

8.175538907003013
