In [1]:
import random
import pandas as pd
import numpy as np
import os

from sklearn.ensemble import RandomForestRegressor

import warnings
warnings.filterwarnings(action='ignore') 

In [2]:
def seed_everything(seed):
    '''
    fix random seed
    '''
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

In [19]:
# Feature engineering 1 - scaling --> effects none ...

In [40]:
'''
feature extraction
'''

train_df = pd.read_csv('./data/train.csv')

train_df = train_df.fillna(0)


#시계열 특성을 학습에 반영하기 위해 일시를 월, 일, 시간으로 나눕니다
train_df['month'] = train_df['일시'].apply(lambda x : int(x[4:6]))
train_df['day'] = train_df['일시'].apply(lambda x : int(x[6:8]))
train_df['time'] = train_df['일시'].apply(lambda x : int(x[9:11]))


In [41]:
train_df

Unnamed: 0,num_date_time,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%),일조(hr),일사(MJ/m2),전력소비량(kWh),month,day,time
0,1_20220601 00,1,20220601 00,18.6,0.0,0.9,42.0,0.0,0.0,1085.28,6,1,0
1,1_20220601 01,1,20220601 01,18.0,0.0,1.1,45.0,0.0,0.0,1047.36,6,1,1
2,1_20220601 02,1,20220601 02,17.7,0.0,1.5,45.0,0.0,0.0,974.88,6,1,2
3,1_20220601 03,1,20220601 03,16.7,0.0,1.4,48.0,0.0,0.0,953.76,6,1,3
4,1_20220601 04,1,20220601 04,18.4,0.0,2.8,43.0,0.0,0.0,986.40,6,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
203995,100_20220824 19,100,20220824 19,23.1,0.0,0.9,86.0,0.5,0.0,881.04,8,24,19
203996,100_20220824 20,100,20220824 20,22.4,0.0,1.3,86.0,0.0,0.0,798.96,8,24,20
203997,100_20220824 21,100,20220824 21,21.3,0.0,1.0,92.0,0.0,0.0,825.12,8,24,21
203998,100_20220824 22,100,20220824 22,21.0,0.0,0.3,94.0,0.0,0.0,640.08,8,24,22


In [42]:
## outer join with building info
building_info = pd.read_csv("./data/building_info.csv")
building_info

Unnamed: 0,건물번호,건물유형,연면적(m2),냉방면적(m2),태양광용량(kW),ESS저장용량(kWh),PCS용량(kW)
0,1,건물기타,110634.00,39570.00,-,-,-
1,2,건물기타,122233.47,99000.00,-,-,-
2,3,건물기타,171243.00,113950.00,40,-,-
3,4,건물기타,74312.98,34419.62,60,-,-
4,5,건물기타,205884.00,150000.00,-,2557,1000
...,...,...,...,...,...,...,...
95,96,호텔및리조트,93314.00,60500.00,-,-,-
96,97,호텔및리조트,55144.67,25880.00,-,-,-
97,98,호텔및리조트,53578.62,17373.75,-,-,-
98,99,호텔및리조트,53499.00,40636.00,-,-,-


In [43]:
merged_train_df = pd.merge(train_df, building_info, on='건물번호', how='outer')
merged_train_df.replace('-', 0, inplace=True)

merged_train_df.fillna(0)

Unnamed: 0,num_date_time,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%),일조(hr),일사(MJ/m2),전력소비량(kWh),month,day,time,건물유형,연면적(m2),냉방면적(m2),태양광용량(kW),ESS저장용량(kWh),PCS용량(kW)
0,1_20220601 00,1,20220601 00,18.6,0.0,0.9,42.0,0.0,0.0,1085.28,6,1,0,건물기타,110634.00,39570.00,0,0,0
1,1_20220601 01,1,20220601 01,18.0,0.0,1.1,45.0,0.0,0.0,1047.36,6,1,1,건물기타,110634.00,39570.00,0,0,0
2,1_20220601 02,1,20220601 02,17.7,0.0,1.5,45.0,0.0,0.0,974.88,6,1,2,건물기타,110634.00,39570.00,0,0,0
3,1_20220601 03,1,20220601 03,16.7,0.0,1.4,48.0,0.0,0.0,953.76,6,1,3,건물기타,110634.00,39570.00,0,0,0
4,1_20220601 04,1,20220601 04,18.4,0.0,2.8,43.0,0.0,0.0,986.40,6,1,4,건물기타,110634.00,39570.00,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203995,100_20220824 19,100,20220824 19,23.1,0.0,0.9,86.0,0.5,0.0,881.04,8,24,19,호텔및리조트,57497.84,40035.23,0,0,0
203996,100_20220824 20,100,20220824 20,22.4,0.0,1.3,86.0,0.0,0.0,798.96,8,24,20,호텔및리조트,57497.84,40035.23,0,0,0
203997,100_20220824 21,100,20220824 21,21.3,0.0,1.0,92.0,0.0,0.0,825.12,8,24,21,호텔및리조트,57497.84,40035.23,0,0,0
203998,100_20220824 22,100,20220824 22,21.0,0.0,0.3,94.0,0.0,0.0,640.08,8,24,22,호텔및리조트,57497.84,40035.23,0,0,0


In [44]:
#시계열 특성을 학습에 반영하기 위해 일시를 월, 일, 시간으로 나눕니다
merged_train_df['month'] = merged_train_df['일시'].apply(lambda x : int(x[4:6]))
merged_train_df['day'] = merged_train_df['일시'].apply(lambda x : int(x[6:8]))
merged_train_df['time'] = merged_train_df['일시'].apply(lambda x : int(x[9:11]))


merged_train_df = merged_train_df.drop(columns=['num_date_time', '일시', '일조(hr)', '일사(MJ/m2)'])

In [45]:
merged_train_df

Unnamed: 0,건물번호,기온(C),강수량(mm),풍속(m/s),습도(%),전력소비량(kWh),month,day,time,건물유형,연면적(m2),냉방면적(m2),태양광용량(kW),ESS저장용량(kWh),PCS용량(kW)
0,1,18.6,0.0,0.9,42.0,1085.28,6,1,0,건물기타,110634.00,39570.00,0,0,0
1,1,18.0,0.0,1.1,45.0,1047.36,6,1,1,건물기타,110634.00,39570.00,0,0,0
2,1,17.7,0.0,1.5,45.0,974.88,6,1,2,건물기타,110634.00,39570.00,0,0,0
3,1,16.7,0.0,1.4,48.0,953.76,6,1,3,건물기타,110634.00,39570.00,0,0,0
4,1,18.4,0.0,2.8,43.0,986.40,6,1,4,건물기타,110634.00,39570.00,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203995,100,23.1,0.0,0.9,86.0,881.04,8,24,19,호텔및리조트,57497.84,40035.23,0,0,0
203996,100,22.4,0.0,1.3,86.0,798.96,8,24,20,호텔및리조트,57497.84,40035.23,0,0,0
203997,100,21.3,0.0,1.0,92.0,825.12,8,24,21,호텔및리조트,57497.84,40035.23,0,0,0
203998,100,21.0,0.0,0.3,94.0,640.08,8,24,22,호텔및리조트,57497.84,40035.23,0,0,0


In [53]:
building_types = list(merged_train_df['건물유형'].unique())

In [7]:
'''
feature extraction
'''

train_df = pd.read_csv('./data/train.csv')

train_df = train_df.fillna(0)


#시계열 특성을 학습에 반영하기 위해 일시를 월, 일, 시간으로 나눕니다
train_df['month'] = train_df['일시'].apply(lambda x : int(x[4:6]))
train_df['day'] = train_df['일시'].apply(lambda x : int(x[6:8]))
train_df['time'] = train_df['일시'].apply(lambda x : int(x[9:11]))


train_x = train_df.drop(columns=['num_date_time', '일시', '일조(hr)', '일사(MJ/m2)', '전력소비량(kWh)'])

train_y = train_df['전력소비량(kWh)']

# Model Fit

In [36]:
'''
random forest fit
'''
# model = RandomForestRegressor()
# model.fit(reduced_train_x, train_y)

In [64]:
# K-Fold
from sklearn.model_selection import KFold

models_dict = {}

for building_type in building_types:
    train_x = merged_train_df[merged_train_df['건물유형']==building_type]
    train_x.fillna(0)
    train_x.drop(columns=["건물유형"])
    train_y = train_x['전력소비량(kWh)']

    n_splits = 5  # 폴드 개수
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=0)

    best_model = None
    best_score = -float('inf')  # 초기화: 음수 무한대로 설정

    for train_idx, test_idx in kf.split(train_df):
        X_train, X_test = train_x.to_numpy()[train_idx], train_x.to_numpy()[test_idx]
        y_train, y_test = train_y.to_numpy()[train_idx], train_y.to_numpy()[test_idx]

        # 모델 훈련 및 평가 (여기에서는 간단한 선형 회귀 모델 사용)
        model = RandomForestRegressor()
        model.fit(X_train, y_train)
        score = model.score(X_test, y_test)

        if score > best_score:
            best_score = score
            best_model = model

        print(f"{building_type} Test Score:", score)
    models_dict[building_type] = best_model

IndexError: index 30600 is out of bounds for axis 0 with size 30600

In [63]:
merged_train_df.shape

(204000, 15)

In [61]:
train_x

Unnamed: 0,건물번호,기온(C),강수량(mm),풍속(m/s),습도(%),전력소비량(kWh),month,day,time,건물유형,연면적(m2),냉방면적(m2),태양광용량(kW),ESS저장용량(kWh),PCS용량(kW)
0,1,18.6,0.0,0.9,42.0,1085.28,6,1,0,건물기타,110634.00,39570.00,0,0,0
1,1,18.0,0.0,1.1,45.0,1047.36,6,1,1,건물기타,110634.00,39570.00,0,0,0
2,1,17.7,0.0,1.5,45.0,974.88,6,1,2,건물기타,110634.00,39570.00,0,0,0
3,1,16.7,0.0,1.4,48.0,953.76,6,1,3,건물기타,110634.00,39570.00,0,0,0
4,1,18.4,0.0,2.8,43.0,986.40,6,1,4,건물기타,110634.00,39570.00,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30595,15,21.7,0.0,1.6,91.0,1819.44,8,24,19,건물기타,167012.31,167012.31,0,0,0
30596,15,21.8,0.0,1.7,92.0,1918.17,8,24,20,건물기타,167012.31,167012.31,0,0,0
30597,15,21.4,0.0,1.1,92.0,1999.53,8,24,21,건물기타,167012.31,167012.31,0,0,0
30598,15,21.4,0.0,0.3,92.0,2043.63,8,24,22,건물기타,167012.31,167012.31,0,0,0


# Inference & Submission

In [26]:
### Test set pre-process
test_df = pd.read_csv('./data/test.csv')

test_df['month'] = test_df['일시'].apply(lambda x : int(x[4:6]))
test_df['day'] = test_df['일시'].apply(lambda x : int(x[6:8]))
test_df['time'] = test_df['일시'].apply(lambda x : int(x[9:11]))

test_x = test_df.drop(columns=['num_date_time', '일시'])


In [14]:
test_df

Unnamed: 0,num_date_time,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%),month,day,time
0,1_20220825 00,1,20220825 00,23.5,0.0,2.2,72,8,25,0
1,1_20220825 01,1,20220825 01,23.0,0.0,0.9,72,8,25,1
2,1_20220825 02,1,20220825 02,22.7,0.0,1.5,75,8,25,2
3,1_20220825 03,1,20220825 03,22.1,0.0,1.3,78,8,25,3
4,1_20220825 04,1,20220825 04,21.8,0.0,1.0,77,8,25,4
...,...,...,...,...,...,...,...,...,...,...
16795,100_20220831 19,100,20220831 19,22.5,0.0,0.9,84,8,31,19
16796,100_20220831 20,100,20220831 20,20.7,0.0,0.4,95,8,31,20
16797,100_20220831 21,100,20220831 21,20.2,0.0,0.4,98,8,31,21
16798,100_20220831 22,100,20220831 22,20.1,0.0,1.1,97,8,31,22


In [16]:
# inference
preds = best_model.predict(test_x)


In [17]:
# sub mission
submission = pd.read_csv('./data/sample_submission.csv')
submission['answer'] = preds
submission.head(5)

Unnamed: 0,num_date_time,answer
0,1_20220825 00,2114.1696
1,1_20220825 01,2100.0192
2,1_20220825 02,1974.7272
3,1_20220825 03,1950.408
4,1_20220825 04,1938.1152


In [18]:
submission.to_csv("./data/submission_random_forest.csv", index=False)