In [1]:
import pandas as pd
import numpy as np
import random
import os

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader 

from tqdm.auto import tqdm

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(42) # Seed 고정

In [3]:
train_df = pd.read_csv('./data/train.csv')
test_df = pd.read_csv('./data/test.csv')
info_df = pd.read_csv('./data/building_info.csv')
sample_submission = pd.read_csv('./data/sample_submission.csv')

In [4]:
#  df.loc['B']
build_map_dictionary = {}
for i in range(100):
     build_map_dictionary.update({
         i+1 : info_df['건물유형'][i]
     })
area_warm_dictionary = {}
for i in range(100):
     area_warm_dictionary.update({
         i+1 : info_df['연면적(m2)'][i]
     })
area_cool_map_dictionary = {}
for i in range(100):
     area_cool_map_dictionary.update({
         i+1 : info_df['냉방면적(m2)'][i]
     })


In [5]:
train_df['건물유형'] = train_df['건물번호'].map(build_map_dictionary) 
train_df['연면적(m2)'] = train_df['건물번호'].map(area_warm_dictionary) 
train_df['냉방면적(m2)'] = train_df['건물번호'].map(area_cool_map_dictionary) 

In [6]:
# 강수량 결측치 0.0으로 채우기
train_df['강수량(mm)'].fillna(0.0, inplace=True)

# 풍속, 습도 결측치 평균으로 채우고 반올림하기
train_df['풍속(m/s)'].fillna(round(train_df['풍속(m/s)'].mean(),2), inplace=True)
train_df['습도(%)'].fillna(round(train_df['습도(%)'].mean(),2), inplace=True)

In [7]:
train_df['month'] = train_df['일시'].apply(lambda x : float(x[4:6]))
train_df['day'] = train_df['일시'].apply(lambda x : float(x[6:8]))
train_df['time'] = train_df['일시'].apply(lambda x : float(x[9:11]))
train_df

Unnamed: 0,num_date_time,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%),일조(hr),일사(MJ/m2),전력소비량(kWh),건물유형,연면적(m2),냉방면적(m2),month,day,time
0,1_20220601 00,1,20220601 00,18.6,0.0,0.9,42.0,,,1085.28,건물기타,110634.00,39570.00,6.0,1.0,0.0
1,1_20220601 01,1,20220601 01,18.0,0.0,1.1,45.0,,,1047.36,건물기타,110634.00,39570.00,6.0,1.0,1.0
2,1_20220601 02,1,20220601 02,17.7,0.0,1.5,45.0,,,974.88,건물기타,110634.00,39570.00,6.0,1.0,2.0
3,1_20220601 03,1,20220601 03,16.7,0.0,1.4,48.0,,,953.76,건물기타,110634.00,39570.00,6.0,1.0,3.0
4,1_20220601 04,1,20220601 04,18.4,0.0,2.8,43.0,,,986.40,건물기타,110634.00,39570.00,6.0,1.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203995,100_20220824 19,100,20220824 19,23.1,0.0,0.9,86.0,0.5,,881.04,호텔및리조트,57497.84,40035.23,8.0,24.0,19.0
203996,100_20220824 20,100,20220824 20,22.4,0.0,1.3,86.0,0.0,,798.96,호텔및리조트,57497.84,40035.23,8.0,24.0,20.0
203997,100_20220824 21,100,20220824 21,21.3,0.0,1.0,92.0,,,825.12,호텔및리조트,57497.84,40035.23,8.0,24.0,21.0
203998,100_20220824 22,100,20220824 22,21.0,0.0,0.3,94.0,,,640.08,호텔및리조트,57497.84,40035.23,8.0,24.0,22.0


In [8]:
# 순서 재배치
train_df = train_df[train_df.columns[:9].to_list() + train_df.columns[10:].to_list() + train_df.columns[9:10].to_list()]


In [9]:
train_df

Unnamed: 0,num_date_time,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%),일조(hr),일사(MJ/m2),건물유형,연면적(m2),냉방면적(m2),month,day,time,전력소비량(kWh)
0,1_20220601 00,1,20220601 00,18.6,0.0,0.9,42.0,,,건물기타,110634.00,39570.00,6.0,1.0,0.0,1085.28
1,1_20220601 01,1,20220601 01,18.0,0.0,1.1,45.0,,,건물기타,110634.00,39570.00,6.0,1.0,1.0,1047.36
2,1_20220601 02,1,20220601 02,17.7,0.0,1.5,45.0,,,건물기타,110634.00,39570.00,6.0,1.0,2.0,974.88
3,1_20220601 03,1,20220601 03,16.7,0.0,1.4,48.0,,,건물기타,110634.00,39570.00,6.0,1.0,3.0,953.76
4,1_20220601 04,1,20220601 04,18.4,0.0,2.8,43.0,,,건물기타,110634.00,39570.00,6.0,1.0,4.0,986.40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203995,100_20220824 19,100,20220824 19,23.1,0.0,0.9,86.0,0.5,,호텔및리조트,57497.84,40035.23,8.0,24.0,19.0,881.04
203996,100_20220824 20,100,20220824 20,22.4,0.0,1.3,86.0,0.0,,호텔및리조트,57497.84,40035.23,8.0,24.0,20.0,798.96
203997,100_20220824 21,100,20220824 21,21.3,0.0,1.0,92.0,,,호텔및리조트,57497.84,40035.23,8.0,24.0,21.0,825.12
203998,100_20220824 22,100,20220824 22,21.0,0.0,0.3,94.0,,,호텔및리조트,57497.84,40035.23,8.0,24.0,22.0,640.08


In [10]:
build_type_map = {}
for idx, type in enumerate(train_df['건물유형'].unique()):
    build_type_map.update({
        type : idx
    })
train_df['building'] = train_df['건물유형'].map(build_type_map)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['building'] = train_df['건물유형'].map(build_type_map)


In [11]:
building_df = pd.get_dummies(train_df['building'],prefix='building')

train_df=train_df.drop(['건물유형','building','일조(hr)','일사(MJ/m2)','건물번호','num_date_time','일시'],axis=1)
train_df.columns

converted_train_df = pd.concat([building_df, train_df], axis = 1)

In [12]:
# train_df.drop(['건물유형','building','일조(hr)','일사(MJ/m2)','건물번호','num_date_time','일시'],axis=1)
# train_df.drop('건물유형',axis=1)
# train_df.head()
building_df.columns
converted_train_df.columns
# train_df.columns

# train_df['건물번호']

Index(['building_0', 'building_1', 'building_2', 'building_3', 'building_4',
       'building_5', 'building_6', 'building_7', 'building_8', 'building_9',
       'building_10', 'building_11', '기온(C)', '강수량(mm)', '풍속(m/s)', '습도(%)',
       '연면적(m2)', '냉방면적(m2)', 'month', 'day', 'time', '전력소비량(kWh)'],
      dtype='object')

In [13]:
# train_df['건물유형']

In [14]:
converted_train_df

Unnamed: 0,building_0,building_1,building_2,building_3,building_4,building_5,building_6,building_7,building_8,building_9,...,기온(C),강수량(mm),풍속(m/s),습도(%),연면적(m2),냉방면적(m2),month,day,time,전력소비량(kWh)
0,1,0,0,0,0,0,0,0,0,0,...,18.6,0.0,0.9,42.0,110634.00,39570.00,6.0,1.0,0.0,1085.28
1,1,0,0,0,0,0,0,0,0,0,...,18.0,0.0,1.1,45.0,110634.00,39570.00,6.0,1.0,1.0,1047.36
2,1,0,0,0,0,0,0,0,0,0,...,17.7,0.0,1.5,45.0,110634.00,39570.00,6.0,1.0,2.0,974.88
3,1,0,0,0,0,0,0,0,0,0,...,16.7,0.0,1.4,48.0,110634.00,39570.00,6.0,1.0,3.0,953.76
4,1,0,0,0,0,0,0,0,0,0,...,18.4,0.0,2.8,43.0,110634.00,39570.00,6.0,1.0,4.0,986.40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203995,0,0,0,0,0,0,0,0,0,0,...,23.1,0.0,0.9,86.0,57497.84,40035.23,8.0,24.0,19.0,881.04
203996,0,0,0,0,0,0,0,0,0,0,...,22.4,0.0,1.3,86.0,57497.84,40035.23,8.0,24.0,20.0,798.96
203997,0,0,0,0,0,0,0,0,0,0,...,21.3,0.0,1.0,92.0,57497.84,40035.23,8.0,24.0,21.0,825.12
203998,0,0,0,0,0,0,0,0,0,0,...,21.0,0.0,0.3,94.0,57497.84,40035.23,8.0,24.0,22.0,640.08


In [15]:
len(converted_train_df.columns
)

22

In [16]:
# 하이퍼파라미터
input_size = 22  # feature의 개수
hidden_size = 64
num_layers = 2
output_size = 1
num_epochs = 5
window_size = 24  # 예측에 사용될 시간 윈도우 크기
batch_size = 64
learning_rate = 0.001

In [17]:
class TimeSeriesDataset(Dataset):
    def __init__(self, df, window_size):
        self.df = df
        self.window_size = window_size

    def __len__(self):
        return len(self.df) - self.window_size

    def __getitem__(self, idx):
        x = torch.tensor(self.df[idx:idx+self.window_size, :], dtype=torch.float)
#         print(x)
        if self.df.shape[1] > 1:
            y = torch.tensor(self.df[idx+self.window_size, -1], dtype=torch.float)
        else:
            y = None
            
        return x, y

def create_data_loader(df, window_size, batch_size):
    dataset = TimeSeriesDataset(df, window_size)
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    return data_loader

In [18]:
# normalization
scaler = MinMaxScaler()
train_data = scaler.fit_transform(converted_train_df.values)
train_loader = create_data_loader(train_data, window_size, batch_size)

In [19]:
# train_df.values[0]

In [20]:
class BiLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(BiLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, output_size)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(device)
        c0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(device)
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        return out

In [21]:
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTM, self).__init__()

        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device) 
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)

        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])

        return out

In [22]:
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('mps:0' if torch.backends.mps.is_available() else 'cpu')
print(f"current device: {device}")

model = BiLSTM(input_size, hidden_size, num_layers, output_size).to(device)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

current device: mps:0


In [None]:
for epoch in range(num_epochs):
    for i, (inputs, labels) in enumerate(train_loader):
        inputs = inputs.to(device)
#         print(inputs)
        labels = labels.unsqueeze(1).to(device)
#         print(labels)
        # Forward
        outputs = model(inputs)
#         print(outputs)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (i+1) % 300 == 0:
            print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' 
                   .format(epoch+1, num_epochs, i+1, len(train_loader), loss.item()))

In [None]:
# 학습 데이터에서 마지막 행 가져오기
last_train_data = train_df.drop(['num_date_time', '건물번호', '일시',], axis=1).loc[204000-24:,:]

# 실수형 데이터로 변환
test_df['습도(%)'] = test_df['습도(%)'].astype('float64')

# 날짜 데이터 추가
test_df['month'] = test_df['일시'].apply(lambda x : float(x[4:6]))
test_df['day'] = test_df['일시'].apply(lambda x : float(x[6:8]))
test_df['time'] = test_df['일시'].apply(lambda x : float(x[9:11]))

# 전력소비량 열 생성
final_df = pd.concat((test_df.drop(['num_date_time', '건물번호', '일시',], axis=1), pd.DataFrame(np.zeros(test_df.shape[0]))),axis=1)
final_df = final_df.rename({0:'전력소비량(kWh)'},axis=1)

In [None]:
test_df = pd.concat((last_train_data, final_df)).reset_index(drop=True)
test_data = scaler.transform(test_df.values) # train과 동일하게 scaling
test_data.shape

In [None]:
model.eval()

test_predictions = []

with torch.no_grad():
    for i in range(test_data.shape[0] - window_size):
        x = torch.Tensor(test_data[i:i+window_size,:]).to(device)
        new_x = model(x.view(1,window_size,-1))
        
        test_data[i+window_size,-1] = new_x # 입력 업데이트
        test_predictions.append(new_x.detach().cpu().numpy().item()) # 예측 결과 저장