In [6]:
import random
import pandas as pd
import numpy as np
import os
import holidays
# South Korean public-holiday calendar for 2024 (the year of the data timestamps).
korea_holidays = holidays.SouthKorea(years=2024)

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation and data-quality warnings raised by
# the libraries used below; consider narrowing the filter by category.
warnings.filterwarnings(action='ignore') 

from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor
from pycaret import regression

In [7]:
def smape(y_true, y_pred):
    """Return the symmetric mean absolute percentage error, in percent.

    Each element contributes ``2 * |y_true - y_pred| / (|y_true| + |y_pred|)``
    and the mean of those terms is multiplied by 100.  Where both values are
    zero the denominator is swapped for ``1e-8``, so that term evaluates to 0
    rather than dividing by zero.

    Args:
        y_true: Array-like of observed values.
        y_pred: Array-like of predicted values, broadcastable with ``y_true``.

    Returns:
        The SMAPE as a NumPy float (0 for a perfect match, at most 200 for
        finite inputs).
    """
    actual = np.array(y_true)
    forecast = np.array(y_pred)

    abs_error = np.abs(actual - forecast)
    scale = np.abs(actual) + np.abs(forecast)

    # A zero scale implies a zero error, so any positive stand-in keeps the term at 0.
    safe_scale = np.where(scale == 0, 1e-8, scale)
    return 100 * np.mean(2 * abs_error / safe_scale)

In [20]:
# Read the competition tables from the working directory: the train/test
# frames, the per-building metadata, and the submission template.
train, test = pd.read_csv('./train.csv'), pd.read_csv('./test.csv')
info = pd.read_csv('./building_info.csv')
submission = pd.read_csv('./sample_submission.csv')

In [11]:
# Show the submission template: a `num_date_time` key and an `answer` column per row.
submission

Unnamed: 0,num_date_time,answer
0,1_20240825 00,0
1,1_20240825 01,0
2,1_20240825 02,0
3,1_20240825 03,0
4,1_20240825 04,0
...,...,...
16795,100_20240831 19,0
16796,100_20240831 20,0
16797,100_20240831 21,0
16798,100_20240831 22,0


In [14]:
24*7*100 # 24 hours x 7 days x 100 buildings

16800

In [24]:
# Print the building metadata in two chunks: the first 50 rows, then the rest.
# NOTE(review): presumably split so pandas does not elide the middle rows.
print(info.iloc[:50])
print(info.iloc[50:])

    건물번호      건물유형     연면적(m2)   냉방면적(m2) 태양광용량(kW) ESS저장용량(kWh) PCS용량(kW)
0      1        호텔   82912.710   77586.00         -            -         -
1      2        상용   40658.900   30392.82         -            -         -
2      3        병원  560431.000  418992.00    278.58            -         -
3      4        호텔   41813.290   23715.71         -            -         -
4      5        학교  403749.390  248507.00   1983.05         1025       250
5      6        상용  157835.000  157835.00         -            -         -
6      7      건물기타  118346.000   78237.38    389.76            -         -
7      8        학교  167751.000   82112.00    217.92            -         -
8      9        호텔  136757.000  109024.00         -            -         -
9     10        호텔  435993.500  341983.00         -            -         -
10    11       아파트  271233.000  233263.00         -            -         -
11    12        학교  581897.000  311452.00   1349.03            -         -
12    13       연구소   9737

In [21]:
# Display the rows of the third building type.  Iterating a GroupBy yields
# (key, sub-frame) pairs in sorted key order, so position 2 is taken directly
# instead of first packing the pairs into an object-dtype DataFrame.
[group for _, group in info.groupby('건물유형')][2]

Unnamed: 0,건물번호,건물유형,연면적(m2),냉방면적(m2),태양광용량(kW),ESS저장용량(kWh),PCS용량(kW)
32,33,공공,138156.0,88119.0,344.96,-,-
37,38,공공,61375.47,19800.0,83.2,-,-
49,50,공공,135785.69,100746.2,1039.86,-,-
65,66,공공,145056.0,60193.0,540.57,-,-
67,68,공공,111365.97,36356.07,94.38,-,-
71,72,공공,77599.0,48251.0,1340.1,2000,1000
79,80,공공,373141.0,75340.0,790.08,201.1,101
91,92,공공,27915.29,5628.0,322.9,209,100


In [26]:
# Inspect only the training rows that belong to building 1.
train.loc[train['건물번호'] == 1]

Unnamed: 0,num_date_time,건물번호,일시,기온(°C),강수량(mm),풍속(m/s),습도(%),일조(hr),일사(MJ/m2),전력소비량(kWh)
0,1_20240601 00,1,20240601 00,18.3,0.0,2.6,82.0,0.0,0.00,5794.80
1,1_20240601 01,1,20240601 01,18.3,0.0,2.7,82.0,0.0,0.00,5591.85
2,1_20240601 02,1,20240601 02,18.1,0.0,2.6,80.0,0.0,0.00,5338.17
3,1_20240601 03,1,20240601 03,18.0,0.0,2.6,81.0,0.0,0.00,4554.42
4,1_20240601 04,1,20240601 04,17.8,0.0,1.3,81.0,0.0,0.00,3602.25
...,...,...,...,...,...,...,...,...,...,...
2035,1_20240824 19,1,20240824 19,29.3,0.0,2.7,75.0,0.0,0.32,7216.53
2036,1_20240824 20,1,20240824 20,28.3,0.0,2.7,72.0,0.0,0.01,5860.98
2037,1_20240824 21,1,20240824 21,27.8,0.0,1.5,73.0,0.0,0.00,4135.86
2038,1_20240824 22,1,20240824 22,27.3,0.0,1.0,75.0,0.0,0.00,4571.97


In [27]:
# Inspect only the test rows that belong to building 1.
test.loc[test['건물번호'] == 1]

Unnamed: 0,num_date_time,건물번호,일시,기온(°C),강수량(mm),풍속(m/s),습도(%)
0,1_20240825 00,1,20240825 00,26.5,0.0,0.7,80.0
1,1_20240825 01,1,20240825 01,26.1,0.0,0.0,80.0
2,1_20240825 02,1,20240825 02,25.9,0.0,0.3,83.0
3,1_20240825 03,1,20240825 03,25.7,0.0,1.1,83.0
4,1_20240825 04,1,20240825 04,25.5,0.0,1.0,86.0
...,...,...,...,...,...,...,...
163,1_20240831 19,1,20240831 19,29.7,0.0,2.5,53.0
164,1_20240831 20,1,20240831 20,28.8,0.0,2.2,56.0
165,1_20240831 21,1,20240831 21,27.9,0.0,1.4,66.0
166,1_20240831 22,1,20240831 22,27.4,0.0,0.3,72.0


In [45]:
# Rebuild `train` from the raw CSV in the long format used for forecasting:
#   * `timestamp`  - parsed from the last 11 characters of `num_date_time`
#                    ("YYYYMMDD HH"), i.e. the part after the building prefix;
#   * `item_id`    - the building number;
#   * `target`     - the electricity consumption to forecast.
# The vectorized `.str` slice replaces a per-row `apply(lambda ...)`, and the
# whole cell is one pipeline starting from the file, so re-running it always
# yields the same frame.
train = (
    pd.read_csv('./train.csv')
    .assign(
        timestamp=lambda frame: pd.to_datetime(
            frame['num_date_time'].str[-11:], format='%Y%m%d %H'
        )
    )
    # The raw key columns are redundant once `timestamp` exists.
    .drop(columns=['num_date_time', '일시'])
    .rename(columns={'건물번호': 'item_id', '전력소비량(kWh)': 'target'})
)
train

Unnamed: 0,item_id,기온(°C),강수량(mm),풍속(m/s),습도(%),일조(hr),일사(MJ/m2),target,timestamp
0,1,18.3,0.0,2.6,82.0,0.0,0.00,5794.80,2024-06-01 00:00:00
1,1,18.3,0.0,2.7,82.0,0.0,0.00,5591.85,2024-06-01 01:00:00
2,1,18.1,0.0,2.6,80.0,0.0,0.00,5338.17,2024-06-01 02:00:00
3,1,18.0,0.0,2.6,81.0,0.0,0.00,4554.42,2024-06-01 03:00:00
4,1,17.8,0.0,1.3,81.0,0.0,0.00,3602.25,2024-06-01 04:00:00
...,...,...,...,...,...,...,...,...,...
203995,100,29.1,0.0,4.4,76.0,0.4,0.18,3276.00,2024-08-24 19:00:00
203996,100,28.6,0.0,3.7,74.0,0.0,0.00,3197.52,2024-08-24 20:00:00
203997,100,28.3,0.0,2.9,74.0,0.0,0.00,3006.60,2024-08-24 21:00:00
203998,100,28.0,0.0,1.7,76.0,0.0,0.00,2649.72,2024-08-24 22:00:00


In [49]:
# Wrap the long-format frame for AutoGluon; `train` carries the `item_id` and
# `timestamp` columns created in the preprocessing cell.
data = TimeSeriesDataFrame(train)
# NOTE(review): prediction_length is 72 hourly steps (3 days), while the
# submission template was sized as 24*7 hours per building -- confirm the horizon.
# NOTE(review): no known covariates are declared, so the weather columns are
# listed as past covariates in the fit log -- confirm this is intended.
# NOTE(review): models are scored on RMSE although a `smape` helper is defined
# earlier in the notebook -- confirm which metric should drive model selection.
predictor = TimeSeriesPredictor( 
    prediction_length=24*3,
    target="target",
    eval_metric='RMSE',
)
# Fix the random seed for both training and prediction.
predictor.fit(data, random_seed=42)
# NOTE(review): presumably retrains the fitted models on all data without the
# held-out validation window -- confirm against the installed AutoGluon version.
predictor.refit_full()
# Forecast the next `prediction_length` steps after the end of each series in `data`.
predj = predictor.predict(data, random_seed=42)

No path specified. Models will be saved in: "AutogluonModels\ag-20250730_130129\"
TimeSeriesPredictor.fit() called
Fitting with arguments:
{'enable_ensemble': True,
 'evaluation_metric': 'RMSE',
 'excluded_model_types': None,
 'hyperparameter_tune_kwargs': None,
 'hyperparameters': 'default',
 'num_val_windows': 1,
 'prediction_length': 72,
 'random_seed': 42,
 'target': 'target',
 'time_limit': None,
 'verbosity': 2}
Provided training data set with 204000 rows, 100 items (item = single time series). Average time series length is 2040.0. Data frequency is 'H'.
Global seed set to 42
AutoGluon will save models to AutogluonModels\ag-20250730_130129\
AutoGluon will gauge predictive performance using evaluation metric: 'RMSE'
	This metric's sign has been flipped to adhere to being 'higher is better'. The reported score can be multiplied by -1 to get the metric value.

Provided dataset contains following columns:
	target:           'target'
	past covariates:  ['기온(°C)', '강수량(mm)', '풍속(m/s)',