# xgb alpha

## preprocess

In [1]:
import random
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedKFold

from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

os.makedirs('subs', exist_ok=True)
# import warnings
# warnings.filterwarnings(action='ignore')

In [2]:
seed = 990313

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(seed) # Seed 고정

In [3]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
infos = pd.read_csv('building_info.csv')
infos = infos.replace('-', np.NaN)
infos[['태양광용량(kW)', 'ESS저장용량(kWh)', 'PCS용량(kW)']] = infos[['태양광용량(kW)', 'ESS저장용량(kWh)', 'PCS용량(kW)']].astype(float)
train_df.head()

Unnamed: 0,num_date_time,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%),일조(hr),일사(MJ/m2),전력소비량(kWh)
0,1_20220601 00,1,20220601 00,18.6,,0.9,42.0,,,1085.28
1,1_20220601 01,1,20220601 01,18.0,,1.1,45.0,,,1047.36
2,1_20220601 02,1,20220601 02,17.7,,1.5,45.0,,,974.88
3,1_20220601 03,1,20220601 03,16.7,,1.4,48.0,,,953.76
4,1_20220601 04,1,20220601 04,18.4,,2.8,43.0,,,986.4


In [4]:
infos['태양광용량(kW)'] = infos['태양광용량(kW)'].fillna(0)
infos['태양광용량(kW)'] = np.where(infos['태양광용량(kW)'] > 0, 1, 0)
infos['태양광용량(kW)'].value_counts()

태양광용량(kW)
0    64
1    36
Name: count, dtype: int64

In [5]:
infos.isnull().sum()

건물번호             0
건물유형             0
연면적(m2)          0
냉방면적(m2)         0
태양광용량(kW)        0
ESS저장용량(kWh)    95
PCS용량(kW)       95
dtype: int64

In [6]:
train_df = pd.merge(train_df, infos[['건물번호', '건물유형', '태양광용량(kW)']], how = 'outer',on = '건물번호')
test_df = pd.merge(test_df, infos[['건물번호', '건물유형', '태양광용량(kW)']], how = 'outer',on = '건물번호')
# train_df = pd.merge(train_df, infos[['건물번호', '건물유형', '연면적(m2)', '냉방면적(m2)']], how = 'outer',on = '건물번호')
# test_df = pd.merge(test_df, infos[['건물번호', '건물유형', '연면적(m2)', '냉방면적(m2)']], how = 'outer',on = '건물번호')

In [8]:
date = pd.to_datetime(train_df.일시)
train_df['hour'] = date.dt.hour
train_df['day'] = date.dt.weekday
train_df['month'] = date.dt.month
#train_df['week'] = date.dt.weekofyear
train_df['week'] = date.dt.isocalendar().week  # 수정된 부분

date = pd.to_datetime(test_df.일시)
test_df['hour'] = date.dt.hour
test_df['day'] = date.dt.weekday
test_df['month'] = date.dt.month
#test_df['week'] = date.dt.weekofyear
test_df['week'] = date.dt.isocalendar().week  # 수정된 부분

# train_df[['강수량(mm)', '풍속(m/s)', '습도(%)']] = train_df.groupby(['month', '건물번호'])[['강수량(mm)', '풍속(m/s)', '습도(%)']].transform(lambda x: x.fillna(x.mean()))

for month in train_df['month'].unique():
    train_df.loc[train_df['month'] == month, ['강수량(mm)', '풍속(m/s)', '습도(%)']] = train_df.loc[train_df['month'] == month, ['강수량(mm)', '풍속(m/s)', '습도(%)']].fillna(train_df.loc[train_df['month'] == month, ['강수량(mm)', '풍속(m/s)', '습도(%)']].mean())
# for month in train_df['month'].unique():
#     train_df.loc[train_df['month'] == month, ['풍속(m/s)', '습도(%)']] = train_df.loc[train_df['month'] == month, ['풍속(m/s)', '습도(%)']].fillna(train_df.loc[train_df['month'] == month, ['풍속(m/s)', '습도(%)']].mean())

In [9]:
train_df = train_df.fillna(0)
test_df = test_df.fillna(0)

In [10]:
train_df.loc[(train_df['건물번호'] == 95) & (train_df['전력소비량(kWh)'] < 1), '전력소비량(kWh)'] = np.NaN
train_df['전력소비량(kWh)'] = train_df['전력소비량(kWh)'].interpolate(limit_direction='both', method='linear').round(3)

In [11]:
power_mean = pd.pivot_table(train_df, values = '전력소비량(kWh)', index = ['건물번호', 'day', 'month'], aggfunc = np.mean).reset_index()
train_df['month_day_mean'] = train_df.apply(lambda x : power_mean.loc[(power_mean.건물번호 == x['건물번호']) & (power_mean.day == x['day']) & (power_mean.month == x['month']) ,'전력소비량(kWh)'].values[0], axis = 1)
test_df['month_day_mean'] = test_df.apply(lambda x : power_mean.loc[(power_mean.건물번호 == x['건물번호']) & (power_mean.day == x['day']) & (power_mean.month == x['month']) ,'전력소비량(kWh)'].values[0], axis = 1)

In [None]:
power_mean = pd.pivot_table(train_df, values = '전력소비량(kWh)', index = ['건물번호', 'hour', 'day'], aggfunc = np.mean).reset_index()
train_df['day_hour_mean'] = train_df.apply(lambda x : power_mean.loc[(power_mean.건물번호 == x['건물번호']) & (power_mean.hour == x['hour']) & (power_mean.day == x['day']) ,'전력소비량(kWh)'].values[0], axis = 1)
test_df['day_hour_mean'] = test_df.apply(lambda x : power_mean.loc[(power_mean.건물번호 == x['건물번호']) & (power_mean.hour == x['hour']) & (power_mean.day == x['day']) ,'전력소비량(kWh)'].values[0], axis = 1)

power_std = pd.pivot_table(train_df, values = '전력소비량(kWh)', index = ['건물번호', 'hour', 'day'], aggfunc = np.std).reset_index()
train_df['day_hour_std'] = train_df.apply(lambda x : power_std.loc[(power_std.건물번호 == x['건물번호']) & (power_std.hour == x['hour']) & (power_std.day == x['day']) ,'전력소비량(kWh)'].values[0], axis = 1)
test_df['day_hour_std'] = test_df.apply(lambda x : power_std.loc[(power_std.건물번호 == x['건물번호']) & (power_std.hour == x['hour']) & (power_std.day == x['day']) ,'전력소비량(kWh)'].values[0], axis = 1)

power_hour_mean = pd.pivot_table(train_df, values = '전력소비량(kWh)', index = ['건물번호', 'hour'], aggfunc = np.mean).reset_index()
train_df['hour_mean'] = train_df.apply(lambda x : power_hour_mean.loc[(power_hour_mean.건물번호 == x['건물번호']) & (power_hour_mean.hour == x['hour']) ,'전력소비량(kWh)'].values[0], axis = 1)
test_df['hour_mean'] = test_df.apply(lambda x : power_hour_mean.loc[(power_hour_mean.건물번호 == x['건물번호']) & (power_hour_mean.hour == x['hour']) ,'전력소비량(kWh)'].values[0], axis = 1)

power_hour_std = pd.pivot_table(train_df, values = '전력소비량(kWh)', index = ['건물번호', 'hour'], aggfunc = np.std).reset_index()
train_df['hour_std'] = train_df.apply(lambda x : power_hour_std.loc[(power_hour_std.건물번호 == x['건물번호']) & (power_hour_std.hour == x['hour']) ,'전력소비량(kWh)'].values[0], axis = 1)
test_df['hour_std'] = test_df.apply(lambda x : power_hour_std.loc[(power_hour_std.건물번호 == x['건물번호']) & (power_hour_std.hour == x['hour']) ,'전력소비량(kWh)'].values[0], axis = 1)


power_mean = pd.pivot_table(train_df, values = '전력소비량(kWh)', index = ['건물유형', 'hour', 'day'], aggfunc = np.mean).reset_index()
train_df['type_day_hour_mean'] = train_df.apply(lambda x : power_mean.loc[(power_mean.건물유형 == x['건물유형']) & (power_mean.hour == x['hour']) & (power_mean.day == x['day']) ,'전력소비량(kWh)'].values[0], axis = 1)
test_df['type_day_hour_mean'] = test_df.apply(lambda x : power_mean.loc[(power_mean.건물유형 == x['건물유형']) & (power_mean.hour == x['hour']) & (power_mean.day == x['day']) ,'전력소비량(kWh)'].values[0], axis = 1)

power_std = pd.pivot_table(train_df, values = '전력소비량(kWh)', index = ['건물유형', 'hour', 'day'], aggfunc = np.std).reset_index()
train_df['type_day_hour_std'] = train_df.apply(lambda x : power_std.loc[(power_std.건물유형 == x['건물유형']) & (power_std.hour == x['hour']) & (power_std.day == x['day']) ,'전력소비량(kWh)'].values[0], axis = 1)
test_df['type_day_hour_std'] = test_df.apply(lambda x : power_std.loc[(power_std.건물유형 == x['건물유형']) & (power_std.hour == x['hour']) & (power_std.day == x['day']) ,'전력소비량(kWh)'].values[0], axis = 1)

power_hour_mean = pd.pivot_table(train_df, values = '전력소비량(kWh)', index = ['건물유형', 'hour'], aggfunc = np.mean).reset_index()
train_df['type_hour_mean'] = train_df.apply(lambda x : power_hour_mean.loc[(power_hour_mean.건물유형 == x['건물유형']) & (power_hour_mean.hour == x['hour']) ,'전력소비량(kWh)'].values[0], axis = 1)
test_df['type_hour_mean'] = test_df.apply(lambda x : power_hour_mean.loc[(power_hour_mean.건물유형 == x['건물유형']) & (power_hour_mean.hour == x['hour']) ,'전력소비량(kWh)'].values[0], axis = 1)

power_hour_std = pd.pivot_table(train_df, values = '전력소비량(kWh)', index = ['건물유형', 'hour'], aggfunc = np.std).reset_index()
train_df['type_hour_std'] = train_df.apply(lambda x : power_hour_std.loc[(power_hour_std.건물유형 == x['건물유형']) & (power_hour_std.hour == x['hour']) ,'전력소비량(kWh)'].values[0], axis = 1)
test_df['type_hour_std'] = test_df.apply(lambda x : power_hour_std.loc[(power_hour_std.건물유형 == x['건물유형']) & (power_hour_std.hour == x['hour']) ,'전력소비량(kWh)'].values[0], axis = 1)


### 공휴일 변수 추가
train_df['holiday'] = train_df.apply(lambda x : 0 if x['day']<5 else 1, axis = 1)
train_df.loc[('20220601' <= train_df.일시)&(train_df.일시 < '20220602'), 'holiday'] = 1
train_df.loc[('20220606' <= train_df.일시)&(train_df.일시 < '20220607'), 'holiday'] = 1
train_df.loc[('20220815' <= train_df.일시)&(train_df.일시 < '20220816'), 'holiday'] = 1

test_df['holiday'] = test_df.apply(lambda x : 0 if x['day']<5 else 1, axis = 1)

train_df['sin_time'] = np.sin(2*np.pi*train_df.hour/24)
train_df['cos_time'] = np.cos(2*np.pi*train_df.hour/24)
test_df['sin_time'] = np.sin(2*np.pi*test_df.hour/24)
test_df['cos_time'] = np.cos(2*np.pi*test_df.hour/24)

train_df['THI'] = 9/5*train_df['기온(C)'] - 0.55*(1-train_df['습도(%)']/100)*(9/5*train_df['습도(%)']-26)+32
test_df['THI'] = 9/5*test_df['기온(C)'] - 0.55*(1-test_df['습도(%)']/100)*(9/5*test_df['습도(%)']-26)+32

def CDH(xs):
    ys = []
    for i in range(len(xs)):
        if i < 11:
            ys.append(np.sum(xs[:(i+1)]-26))
        else:
            ys.append(np.sum(xs[(i-11):(i+1)]-26))
    return np.array(ys)

train_df['CDH'] = 0
for num in range(1,101,1):
    temp = train_df[train_df['건물번호'] == num]
    cdh = CDH(temp['기온(C)'].values)
    train_df.loc[train_df['건물번호'] == num, 'CDH'] = cdh

test_df['CDH'] = 0
for num in range(1,101,1):
    temp = test_df[test_df['건물번호'] == num]
    cdh = CDH(temp['기온(C)'].values)
    test_df.loc[test_df['건물번호'] == num, 'CDH'] = cdh

In [None]:
train_df['7_shifted_전력소비량'] = train_df['전력소비량(kWh)'].shift(24*7)
train_df = train_df[train_df.일시 >= '20220608'].reset_index(drop=True)

for i in train_df['건물번호'].unique():
    test_df.loc[test_df['건물번호'] == i, '7_shifted_전력소비량'] = train_df.loc[train_df['건물번호'] == i, '전력소비량(kWh)'][-7*24:].values

In [None]:
# # 화씨온도
# train_df['temperature_F'] = (train_df['기온(C)'] * 9/5) + 32
# test_df['temperature_F'] = (test_df['기온(C)'] * 9/5) + 32

# 둘째, 넷째 주 일요일
# train_df['2_4_sunday'] = 0
# train_df.loc[('20220612' <= train_df.일시)&(train_df.일시 < '20220613'), '2_4_sunday'] = 1
# train_df.loc[('20220626' <= train_df.일시)&(train_df.일시 < '20220617'), '2_4_sunday'] = 1
# train_df.loc[('20220710' <= train_df.일시)&(train_df.일시 < '20220711'), '2_4_sunday'] = 1
# train_df.loc[('20220724' <= train_df.일시)&(train_df.일시 < '20220725'), '2_4_sunday'] = 1
# train_df.loc[('20220814' <= train_df.일시)&(train_df.일시 < '20220815'), '2_4_sunday'] = 1

# test_df['2_4_sunday'] = 0
# test_df.loc[('20220828' <= test_df.일시)&(test_df.일시 < '20220829'), '2_4_sunday'] = 1

In [None]:
train_x = train_df.drop(columns=['num_date_time', '일시', '일조(hr)', '일사(MJ/m2)', '전력소비량(kWh)', '건물유형'])
train_x_type = train_df.drop(columns=['num_date_time', '일시', '일조(hr)', '일사(MJ/m2)', '전력소비량(kWh)'])
train_y = train_df['전력소비량(kWh)']

test_x = test_df[train_x.columns]
test_x_type = test_df[train_x_type.columns]
train_x.head()

Unnamed: 0,건물번호,기온(C),강수량(mm),풍속(m/s),습도(%),태양광용량(kW),hour,day,month,week,...,type_day_hour_mean,type_day_hour_std,type_hour_mean,type_hour_std,holiday,sin_time,cos_time,THI,CDH,7_shifted_전력소비량
0,1,19.0,1.889045,3.2,61.0,0,0,2,6,23,...,1620.812462,886.546805,1616.129012,909.159339,0,0.0,1.0,48.2249,-38.9,1085.28
1,1,18.8,1.889045,2.6,61.0,0,1,2,6,23,...,1608.544,906.0297,1603.843635,921.827876,0,0.258819,0.965926,47.8649,-45.1,1047.36
2,1,18.5,1.889045,2.6,62.0,0,2,2,6,23,...,1577.389538,900.72973,1575.108376,916.04389,0,0.5,0.866025,47.4096,-51.4,974.88
3,1,18.1,1.889045,2.2,63.0,0,3,2,6,23,...,1561.107077,900.693834,1553.335094,908.635022,0,0.707107,0.707107,46.7941,-58.0,953.76
4,1,17.2,1.889045,3.2,66.0,0,4,2,6,23,...,1546.894154,885.287208,1543.278965,900.520543,0,0.866025,0.5,45.6064,-64.9,986.4


## modeling

In [None]:
# custom objective function for forcing model not to underestimate
def weighted_mse(alpha = 1):
    def weighted_mse_fixed(label, pred):
        residual = (label - pred).astype("float")
        grad = np.where(residual>0, -2*alpha*residual, -2*residual)
        hess = np.where(residual>0, 2*alpha, 2.0)
        return grad, hess
    return weighted_mse_fixed

def SMAPE(true, pred):
    return np.mean((np.abs(true-pred))/(np.abs(true) + np.abs(pred))) * 100


In [None]:
# train_x_type["hour"] = train_x_type["hour"].astype("category")
# test_x_type["hour"] = test_x_type["hour"].astype("category")

In [None]:
alphas = range(15, 35)

In [None]:
from tqdm import tqdm

valid_df = train_x.copy()
valid_df['전력소비량(kWh)'] = 0
valid_df['pred'] = 0
pred_df = test_x.copy()
pred_df['전력소비량(kWh)'] = 0

labels = []

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

for i in tqdm(train_x['건물번호'].unique()):
    X = train_x[train_x['건물번호'] == i].drop(columns='건물번호')
    y = train_y[train_x['건물번호'] == i]
    valid_df.loc[train_x['건물번호'] == i, '전력소비량(kWh)'] = y
    y = np.log1p(y)
    x_test = test_x[test_x['건물번호'] == i].drop(columns='건물번호')

    cols = X.columns[X.nunique() != 1]
    X = X[cols]
    x_test = x_test[cols]

    tmp_preds = []
    for alpha in alphas:

        model = XGBRegressor(random_state=seed)
        # model = CatBoostRegressor(random_state=seed)
        # model = LGBMRegressor(random_state=seed)
        # model = RandomForestRegressor(random_state=seed)

        results = []

        for fold_, (train_index, val_index) in enumerate(skf.split(X, X['day'])):

            x_train, x_valid = X.iloc[train_index], X.iloc[val_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[val_index]

            model.set_params(**{'objective':weighted_mse(alpha)})
            model.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], verbose=0)

            val_pred = model.predict(x_valid)
            valid_df.loc[x_valid.index, 'pred'] += np.expm1(val_pred)
            pred = model.predict(x_test)
            results.append(pred)

        preds = np.mean(results, axis=0)
        preds = np.expm1(preds)
        tmp_preds.append(preds)

    preds = np.mean(tmp_preds, axis=0)
    pred_df.loc[test_x['건물번호'] == i, '전력소비량(kWh)'] = preds

    labels.append([valid_df.loc[train_x['건물번호'] == i, '전력소비량(kWh)'], preds])

# 2.171737450532235
valid_df['pred'] = valid_df['pred'] / len(alphas)
val_score = SMAPE(valid_df['전력소비량(kWh)'], valid_df['pred'])
val_score

100%|██████████| 100/100 [1:59:25<00:00, 71.65s/it]


1.7183125894172449

In [None]:
train_x_type["건물번호"] = train_x_type["건물번호"].astype("category")
test_x_type["건물번호"] = test_x_type["건물번호"].astype("category")

valid_df_type = train_x_type.copy()
valid_df_type['전력소비량(kWh)'] = 0
valid_df_type['pred'] = 0
pred_df_type = test_x_type.copy()
pred_df_type['전력소비량(kWh)'] = 0

labels_type = []

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

for i in tqdm(train_x_type['건물유형'].unique()):
    X = train_x_type[train_x_type['건물유형'] == i].drop(columns='건물유형')
    y = train_y[train_x_type['건물유형'] == i]
    valid_df_type.loc[train_x_type['건물유형'] == i, '전력소비량(kWh)'] = y
    y = np.log1p(y)
    x_test = test_x_type[test_x_type['건물유형'] == i].drop(columns='건물유형')

    tmp_preds = []
    for alpha in alphas:
        model = XGBRegressor(random_state=seed, tree_method="hist", enable_categorical=True)
        # model = CatBoostRegressor(random_state=seed)
        # model = LGBMRegressor(random_state=seed)
        # model = RandomForestRegressor(random_state=seed)

        results = []

        for fold_, (train_index, val_index) in enumerate(skf.split(X, X['day'])):

            x_train, x_valid = X.iloc[train_index], X.iloc[val_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[val_index]

            model.set_params(**{'objective':weighted_mse(alpha)})
            model.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], verbose=0)

            val_pred = model.predict(x_valid)
            valid_df_type.loc[x_valid.index, 'pred'] += np.expm1(val_pred)
            pred = model.predict(x_test)
            results.append(pred)


        preds = np.mean(results, axis=0)
        preds = np.expm1(preds)
        tmp_preds.append(preds)

    preds = np.mean(tmp_preds, axis=0)
    pred_df_type.loc[test_x_type['건물유형'] == i, '전력소비량(kWh)'] = preds

    labels_type.append([valid_df_type.loc[train_x_type['건물유형'] == i, '전력소비량(kWh)'], preds])

# 3.462504576825228
valid_df_type['pred'] = valid_df_type['pred'] / len(alphas)
type_val_score = SMAPE(valid_df_type['전력소비량(kWh)'], valid_df_type['pred'])
type_val_score

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_x_type["건물번호"] = test_x_type["건물번호"].astype("category")
100%|██████████| 12/12 [39:13<00:00, 196.16s/it]


2.287520125412716

In [None]:
# (1.9531097965617121, 2.557635430408023, 2.2553726134848677)
# (1.9530552780909811, 2.5519303451985462, 2.2524928116447636)
val_score, type_val_score, np.mean([val_score, type_val_score])

(1.7183125894172449, 2.287520125412716, 2.00291635741498)

In [None]:
from sklearn.linear_model import LinearRegression

counts = 0
w = 0.97
# 1.9803304293430894
for num in train_df['건물번호'].unique():
    X = pd.concat([valid_df.loc[valid_df['건물번호'] == num, 'pred'], valid_df_type.loc[valid_df_type['건물번호'] == num, 'pred']], axis=1).values
    y = valid_df.loc[valid_df['건물번호'] == num, '전력소비량(kWh)']
    reg = LinearRegression().fit(X, y)
    p = reg.predict(X)
    score = SMAPE(valid_df.loc[valid_df['건물번호'] == num, '전력소비량(kWh)'], p)
    # print(f'{num}, score: {score}')
    pred_x = pd.concat([pred_df.loc[pred_df['건물번호'] == num, '전력소비량(kWh)'], pred_df_type.loc[pred_df['건물번호'] == num, '전력소비량(kWh)']], axis=1).values
    pred = reg.predict(pred_x)

    threshold = pd.pivot_table(train_df.loc[train_df['건물번호'] == num], values = '전력소비량(kWh)', index = ['day', 'hour'], aggfunc = min).reset_index()
    df_pred = pred_df.loc[pred_df['건물번호'] == num].reset_index(drop=True)
    for j in range(len(pred)):
        min_power = threshold.loc[(threshold.day == df_pred.day[j])&(threshold.hour == df_pred.hour[j]), '전력소비량(kWh)'].values[0]
        if pred[j] < min_power:
            pred[j] = min_power
            counts += 1

    threshold = pd.pivot_table(train_df.loc[train_df['건물번호'] == num], values = '전력소비량(kWh)', index = ['day', 'hour'], aggfunc = max).reset_index()
    df_pred = pred_df.loc[pred_df['건물번호'] == num].reset_index(drop=True)
    for j in range(len(pred)):
        max_power = threshold.loc[(threshold.day == df_pred.day[j])&(threshold.hour == df_pred.hour[j]), '전력소비량(kWh)'].values[0]
        if pred[j] > max_power:
            pred[j] = max_power
            counts += 1

    pred_df.loc[pred_df['건물번호'] == num, '전력소비량(kWh)'] = pred

    valid_df.loc[valid_df['건물번호'] == num, 'pred'] = p

SMAPE(valid_df['전력소비량(kWh)'], valid_df['pred'])

1.6871001637864327

In [None]:
counts # 573

1580

In [None]:
type_val_score / (val_score + type_val_score), SMAPE(valid_df['전력소비량(kWh)'], valid_df['pred'])

(0.5710473422777008, 1.6871001637864327)

In [None]:
# plt.figure(figsize=(26, 300))
# for i in train_df['건물번호'].unique():
#     plt.subplot(train_df['건물번호'].nunique(), 1, i)
#     plt.title(i)
#     plt.plot(range(len(labels[i-1][0])), labels[i-1][0])
#     plt.plot(range(len(labels[i-1][0]), len(labels[i-1][0])+len(labels[i-1][1])), pred_df[pred_df['건물번호']==i]['전력소비량(kWh)'])
# plt.show()

In [None]:
submission = pd.read_csv('sample_submission.csv')
submission['answer'] = pred_df['전력소비량(kWh)']
submission.to_csv('jh_990313.csv', index=False)