In [1]:
import psycopg2

import copy
import random
import numpy as np
import pandas as pd

import joblib
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

import torch
from torch import nn
from torch.utils.data import Subset, DataLoader

from Dataset.Embedding_Dataset import Embedding_Dataset
from Model.Embedding import Embedding

from Dataset.Apartment_Complex_Dataset import Apartment_Complex_Dataset
from Model.LSTM import LSTM
from Model.GRU import GRU
from Model.Transformer import Transformer

from Dataset.Dong_Dataset import Dong_Dataset
from Model.LSTM_Attention import LSTMAttention
from Model.GRU_Attention import GRUAttention
from Model.Transformer_Attention import TransformerAttention

from utils import RMSE, save_train_val_losses

# Seed every random number generator used in this notebook (Python, NumPy, PyTorch CPU
# and CUDA) with the same value so repeated runs draw the same numbers.
SEED = 1234
for _seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_fn(SEED)

# Ask cuDNN for deterministic kernels and switch off its auto-tuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Compute device: the GPU when CUDA is available, otherwise the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Load the three source tables; the paths are relative to the notebook's working directory.
table_1, table_2, table_3 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../데이터/Table/table_1.csv',
        '../데이터/Table/table_2.csv',
        '../데이터/Table/table_3.csv',
    )
)

## Embedding

In [2]:
# Upper bound on the number of training epochs per run.
epochs = 10000

# Hyper-parameter grid for the embedding model: learning rates and batch sizes.
embedding_lrs = [0.01, 0.001, 0.0001, 0.00001, 0.000001]
embedding_batchs = [2 ** power for power in range(6, 12)]  # 64, 128, ..., 2048

# Layer widths passed to the Embedding model constructor.
encoder_dim_1, encoder_dim_2, encoder_dim_3 = 128, 256, 512
embedding_dim = 1024
decoder_dim_1, decoder_dim_2, decoder_dim_3 = 512, 256, 128

In [None]:
# Build the project dataset from the three tables.
dataset = Embedding_Dataset(table_1, table_2, table_3)

# Split fractions for train / validation / test.
train_ratio, val_ratio, test_ratio = 0.7, 0.2, 0.1

dataset_length = len(dataset)
train_size = int(train_ratio * dataset_length)
val_size = int(val_ratio * dataset_length)
test_size = int(test_ratio * dataset_length)

# The split is contiguous and unshuffled: indices are taken in dataset order.
# The test range runs to the end of the dataset, so it may contain more items than
# test_size when the int() truncations above drop a remainder.
train_indices = range(train_size)
val_indices = range(train_size, train_size + val_size)
test_indices = range(train_size + val_size, dataset_length)

train_dataset, val_dataset, test_dataset = (
    Subset(dataset, split_indices)
    for split_indices in (train_indices, val_indices, test_indices)
)

In [None]:
# Grid search over (learning rate, batch size) for the embedding model.
# For each combination: train with early stopping on the validation loss, save the model
# with the best-validation weights, write the per-epoch loss curves, and record one
# summary row. The summary table is exported once all runs are done.
result_records = []  # one dict per finished run; turned into results_df at the end
max_early_stop_count = 3  # consecutive non-improving epochs tolerated before stopping

for embedding_lr in embedding_lrs:
    for embedding_batch in embedding_batchs:
        train_dataloader = DataLoader(train_dataset, batch_size=embedding_batch, shuffle=False, drop_last=True)
        val_dataloader = DataLoader(val_dataset, batch_size=embedding_batch, shuffle=False, drop_last=True)

        # With drop_last=True a split smaller than the batch size yields zero batches,
        # which would make the average-loss divisions below fail.
        if len(train_dataloader) == 0 or len(val_dataloader) == 0:
            print(f'Skipping lr={embedding_lr}, batch={embedding_batch}: a split is smaller than one batch.')
            continue

        model = Embedding(encoder_dim_1, encoder_dim_2, encoder_dim_3, embedding_dim, decoder_dim_1, decoder_dim_2, decoder_dim_3).to(DEVICE)
        criterion = RMSE()
        optimizer = torch.optim.Adam(model.parameters(), lr=embedding_lr)

        train_losses = []
        val_losses = []

        early_stop_count = 0
        best_val_loss = float('inf')
        best_model_weights = None

        for epoch in range(epochs):
            # ---- training pass ----
            model.train()
            total_train_loss = 0
            for data in train_dataloader:
                inputs = data[0].to(DEVICE)
                target = data[1].to(DEVICE)
                output = model(inputs)

                train_loss = criterion(output, target)
                total_train_loss += train_loss.item()

                optimizer.zero_grad()
                train_loss.backward()
                optimizer.step()

            avg_train_loss = total_train_loss / len(train_dataloader)
            train_losses.append(avg_train_loss)

            # ---- validation pass ----
            model.eval()
            total_val_loss = 0
            with torch.no_grad():
                for data in val_dataloader:
                    inputs = data[0].to(DEVICE)
                    target = data[1].to(DEVICE)
                    output = model(inputs)

                    val_loss = criterion(output, target)
                    total_val_loss += val_loss.item()

            avg_val_loss = total_val_loss / len(val_dataloader)
            val_losses.append(avg_val_loss)

            # Keep a snapshot of the best weights; count epochs without improvement.
            if best_val_loss > avg_val_loss:
                best_val_loss = avg_val_loss
                best_model_weights = copy.deepcopy(model.state_dict())
                early_stop_count = 0
            else:
                early_stop_count += 1

            if early_stop_count >= max_early_stop_count:
                print(f'Epoch [{epoch+1}/{epochs}], Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f} \nEarly Stop Triggered!')
                break

            print(f'Epoch [{epoch+1}/{epochs}], Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}')

        # `epoch` keeps its last value after the loop, so the file names carry the number
        # of epochs actually run, whether or not early stopping fired.
        run_prefix = f'../데이터/Checkpoint/emb/embedding_lr_{embedding_lr}_batch_{embedding_batch}_epochs_{epoch+1}'

        # Checkpoint the best-validation weights for every run, including runs that reach
        # `epochs` without early stopping. best_model_weights stays None only when the
        # validation loss never improved on infinity (e.g. it was NaN throughout).
        if best_model_weights is not None:
            model.load_state_dict(best_model_weights)
            torch.save(model, f'{run_prefix}.pth')

        save_train_val_losses(train_losses, val_losses, run_prefix)

        # Collect plain records; DataFrame.append returned a new frame (its result was
        # previously discarded) and no longer exists in pandas 2.x.
        result_records.append({
                'Learning Rate': embedding_lr,
                'Batch Size': embedding_batch,
                'Epochs': epoch + 1,
                'Train Loss': min(train_losses),
                'Validation Loss': min(val_losses)
        })

results_df = pd.DataFrame(result_records, columns=['Learning Rate', 'Batch Size', 'Epochs', 'Train Loss', 'Validation Loss'])
results_df.to_excel('../데이터/Checkpoint/emb/embedding_results.xlsx')

In [None]:
from itertools import product
from pathlib import Path


def _read_losses(path):
    """Parse a loss log holding one float per line, ignoring blank lines."""
    with open(path, 'r') as loss_file:
        return [float(line) for line in loss_file if line.strip()]


# Rebuild the summary table from the loss logs on disk: one row per run, holding the
# minimum training and validation loss of that run.
checkpoint_dir = Path('../데이터/Checkpoint/emb')
train_suffix = '_train_losses.txt'
val_suffix = '_val_losses.txt'

combinations = list(product(embedding_lrs, embedding_batchs))

result_records = []
for embedding_lr, embedding_batch in combinations:
    # The epoch count in the file name is not known in advance, so match it with a
    # wildcard rather than probing a fixed range of candidate numbers.
    run_pattern = f'embedding_lr_{embedding_lr}_batch_{embedding_batch}_epochs_*{train_suffix}'
    for train_losses_file_path in sorted(checkpoint_dir.glob(run_pattern)):
        val_losses_file_path = train_losses_file_path.with_name(
            train_losses_file_path.name[:-len(train_suffix)] + val_suffix
        )
        try:
            train_losses = _read_losses(train_losses_file_path)
            val_losses = _read_losses(val_losses_file_path)
        except (OSError, ValueError) as error:
            # Missing partner file or unparsable content: report it and move on,
            # keeping the best-effort behaviour without hiding unrelated errors.
            print(f'Skipping {train_losses_file_path.name}: {error}')
            continue

        if not train_losses or not val_losses:
            print(f'Skipping {train_losses_file_path.name}: empty loss log')
            continue

        result_records.append({
            'Learning Rate': embedding_lr,
            'Batch Size': embedding_batch,
            'Train Loss': min(train_losses),
            'Validation Loss': min(val_losses)
        })

# Best (lowest) validation loss first.
results_df = (
    pd.DataFrame(result_records, columns=['Learning Rate', 'Batch Size', 'Train Loss', 'Validation Loss'])
    .sort_values('Validation Loss')
    .reset_index(drop=True)
)
results_df.to_excel('../데이터/Checkpoint/emb/embedding_results.xlsx')

In [None]:
# Number of time steps (one per row of table_2), sliding-window length, and the sample
# layout to build ('DL' or 'ML').
time, window_size, ML_DL = len(table_2), 7, 'DL'

# Work on copies so the tables loaded from disk stay unmodified.
table_1_copy, table_2_copy, table_3_copy = (
    source_table.copy() for source_table in (table_1, table_2, table_3)
)

# Standardisation
scaler = StandardScaler()

# table_1: standardise every column except the identifier / text columns.
complex_feature_cols = [col for col in table_1_copy.columns if col not in ['aid','location','name']]
table_1_copy[complex_feature_cols] = scaler.fit_transform(table_1_copy[complex_feature_cols])

# table_2: refit the same scaler object on the first 135 rows only, then transform all
# rows with those statistics.
# NOTE(review): 135 is presumably the length of the training period - confirm.
economy_feature_cols = [col for col in table_2_copy.columns if col not in ['did','year','month']]
scaler.fit(table_2_copy[economy_feature_cols][:135])
table_2_copy[economy_feature_cols] = scaler.transform(table_2_copy[economy_feature_cols])

# table_3: rescale price by 1e-4 (original note: expressed in units of 억, i.e. 100 million won).
table_3_copy['price'] = table_3_copy['price'] * 0.0001

# Rename dongs: maps an old dong name to the name that replaces it.
old_and_new_dongs = {'용산동5가':'한강로동','한강로2가':'한강로동','창동':'창제동','돈암동':'정릉동','거여동':'위례동','문정동':'위례동','장지동':'위례동','문배동':'원효로동','산천동':'원효로동','신창동':'원효로동','원효로1가':'원효로동','화곡동':'우장산동','내발산동':'우장산동','영등포동8가':'영등포동','양평동3가':'양평동','안암동1가':'안암동','염리동':'아현동','성수동2가':'성수2가제2동','성수동1가':'성수1가제1동','중동':'성산동','노고산동':'서교동','신정동':'서강동','창전동':'서강동','삼선동4가':'삼선동','보문동3가':'보문동','동소문동7가':'동선동','당산동4가':'당산제2동','당산동5가':'당산제2동','당산동':'당산제2동','당산동3가':'당산제1동','당산동1가':'당산제1동','당산동2가':'당산제1동','본동':'노량진동','신수동':'노고산동','대흥동':'노고산동','금호동4가':'금호동','금호동2가':'금호동','충무로4가':'광희동','방화동':'공항동','도화동':'공덕동','신공덕동':'공덕동','일원동':'개포동'}
def change_dongs(location):
    """Return `location` with its third space-separated token renamed via `old_and_new_dongs`.

    The string is split on single spaces and only the token at index 2 is looked up.
    The mapping is applied once, not transitively (a renamed token is not looked up again).

    Args:
        location: Space-separated location string.

    Returns:
        The location with the third token replaced when it is a key of
        `old_and_new_dongs`. Strings with fewer than three tokens and non-string
        values (e.g. missing entries) are returned unchanged.
    """
    if not isinstance(location, str):
        return location
    parts = location.split(' ')
    # Guard the index: a location with fewer than three tokens has no token to rename.
    if len(parts) > 2 and parts[2] in old_and_new_dongs:
        parts[2] = old_and_new_dongs[parts[2]]
    return ' '.join(parts)
table_1_copy['location'] = table_1_copy['location'].apply(change_dongs)

# Sliding-window samples collected over all apartment complexes.
# DL: each input is a (window_size, feature_dim) tensor and each target a (1,) price tensor;
#     one sample per complex and window position (the target is 0 where no price exists).
# ML: each input is a flattened (1, window_size * feature_dim) tensor; only windows whose
#     target price is non-zero are kept.
apartment_complexes_embedding_matrix_with_window_size = []
apartment_complexes_price_with_window_size = []

# The string 'None' is this notebook's sentinel for "no embedding: use the raw features".
use_embedding = embedding_dim != 'None'

# NOTE(review): the model is constructed here and no checkpoint is loaded in this cell, so
# the encoder runs with whatever weights Embedding.__init__ produces. Load the selected
# trained checkpoint before relying on these embeddings.
model = Embedding(encoder_dim_1, encoder_dim_2, encoder_dim_3, embedding_dim, decoder_dim_1, decoder_dim_2, decoder_dim_3).to(DEVICE) if use_embedding else 'None'
if use_embedding:
    model.eval()

# Every table_1 column except the identifier / text columns is a static complex feature.
complex_feature_columns = [col for col in table_1_copy.columns if col not in ['aid','location','name']]
apartment_complexes_locations = table_1_copy['location']
apartment_complexes_names = table_1_copy['name']
apartment_complexes_aids = table_1_copy['aid']
apartment_complexes_values = np.ascontiguousarray(table_1_copy[complex_feature_columns].to_numpy(dtype=np.float32))

# The economy series is the same for every complex, so build its tensor once.
economy_values = table_2_copy[['call_rate','m2']].values
economy_tensor = torch.FloatTensor(economy_values).to(DEVICE) # (time, 2)

# Iterate table_1 row by row: each row is one apartment complex.
for apartment_complex_location, apartment_complex_name, apartment_complex_aid, apartment_complex_values in zip(
    apartment_complexes_locations, apartment_complexes_names, apartment_complexes_aids, apartment_complexes_values
):
    # Static features repeated for every time step, then joined with the economy series.
    apartment_complex_tensor = torch.FloatTensor(apartment_complex_values).to(DEVICE).repeat(time, 1) # (time, n_static)
    encoder_input_tensor = torch.cat((apartment_complex_tensor, economy_tensor), dim=1) # (time, n_static + 2)

    if use_embedding:
        # Encode all time steps in one batched forward pass (model is in eval mode).
        with torch.no_grad():
            apartment_complex_embedding_matrix_tensor = model.encoder(encoder_input_tensor) # (time, embedding_dim)
    else:
        apartment_complex_embedding_matrix_tensor = encoder_input_tensor

    # Price per time step for this complex; time steps without a record become 0.
    price_values = pd.DataFrame({'did': range(0, time)}).merge(table_3_copy[table_3_copy['aid'] == apartment_complex_aid][['did','price']], on='did', how='outer').fillna(0).set_index('did').values
    price_tensor = torch.FloatTensor(price_values).to(DEVICE) # (time, 1)

    window_count = apartment_complex_embedding_matrix_tensor.shape[0] - window_size
    if ML_DL == 'DL':
        for i in range(window_count):
            apartment_complexes_embedding_matrix_with_window_size.append(apartment_complex_embedding_matrix_tensor[i:i+window_size, :])
            apartment_complexes_price_with_window_size.append(price_tensor[i+window_size, :])
    elif ML_DL == 'ML':
        for i in range(window_count):
            if price_tensor[i+window_size, :] != 0: # keep only windows whose target has a price
                # Flatten the whole window row-major, so all window_size steps are kept
                # side by side; moved to the CPU like the original zero-initialised buffer.
                apartment_complex_embedding_matrix_concat_tensor = apartment_complex_embedding_matrix_tensor[i:i+window_size, :].reshape(1, -1).cpu()
                apartment_complexes_embedding_matrix_with_window_size.append(apartment_complex_embedding_matrix_concat_tensor) # (1, window_size * feature_dim)
                apartment_complexes_price_with_window_size.append(price_tensor[i+window_size, :]) # (1, )
    else:
        raise ValueError("Invalid value for 'ML_DL'. It must be either 'DL' or 'ML'.")