In [1]:
# import psycopg2

import copy
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# import xgboost as xgb
# import lightgbm as lgb
# from catboost import CatBoostRegressor

import joblib
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

import torch
from torch import nn
from torch.utils.data import Subset, DataLoader

from Dataset.Embedding_Dataset import Embedding_Dataset
from Model.Embedding import Embedding

from Dataset.Apartment_Complex_Dataset import Apartment_Complex_Dataset
from Model.LSTM import LSTM
from Model.GRU import GRU
from Model.Transformer import Transformer
from Model.Informer import Informer
from Model.Pyraformer import Pyraformer
from Model.N_BEATS import NBeats
from Model.NLinear import NLinear

from Dataset.District_Dataset import District_Dataset
from Model.LSTM_Attention import LSTMAttention
from Model.GRU_Attention import GRUAttention
from Model.Transformer_Attention import TransformerAttention
from Model.Informer_Attention import InformerAttention
from Model.Pyraformer_Attention import PyraformerAttention

from utils import RMSE, rmse, mse, mae, save_train_val_losses, early_stops, plot_train_val_losses

# Seed every random number generator used in this notebook so runs are repeatable.
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
# cuDNN benchmark mode auto-tunes the convolution algorithm and may select a
# different one from run to run, which undermines the deterministic setting
# above; it is therefore disabled.
torch.backends.cudnn.benchmark = False

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Alternative (disabled): read the three tables directly from PostgreSQL.
# The password is intentionally NOT written here; when it is omitted from the
# connection string, libpq (which psycopg2 wraps) takes it from the PGPASSWORD
# environment variable or from the ~/.pgpass password file.
# connection_info = "host=localhost dbname=postgres user=postgres port=5432"
# conn = psycopg2.connect(connection_info)
# table_1_query = '''
#     SELECT * FROM building
#     '''
# table_2_query = '''
#     SELECT * FROM economy
#     '''
# table_3_query = '''
#     SELECT * FROM building_price
#     '''
# table_1 = pd.read_sql(table_1_query,conn)
# table_2 = pd.read_sql(table_2_query,conn)
# table_3 = pd.read_sql(table_3_query,conn)

# Load the tables from CSV files instead.
# NOTE(review): presumably exports of the building / economy / building_price
# tables queried in the disabled block above -- confirm.
table_1 = pd.read_csv('../데이터/Table/table_1.csv')
table_2 = pd.read_csv('../데이터/Table/table_2.csv')
table_3 = pd.read_csv('../데이터/Table/table_3.csv')

In [10]:
# Lookup used by change_dongs: key = dong name as found in `location`,
# value = the name it is replaced with (presumably old -> current names).
# NOTE(review): the lookup is applied once, so chained entries are not followed
# (e.g. '신수동' -> '노고산동' even though '노고산동' -> '서교동') -- confirm intended.
old_and_new_dongs = {'용산동5가':'한강로동','한강로2가':'한강로동','창동':'창제동','돈암동':'정릉동','거여동':'위례동','문정동':'위례동','장지동':'위례동','문배동':'원효로동','산천동':'원효로동','신창동':'원효로동','원효로1가':'원효로동','화곡동':'우장산동','내발산동':'우장산동','영등포동8가':'영등포동','양평동3가':'양평동','안암동1가':'안암동','염리동':'아현동','성수동2가':'성수2가제2동','성수동1가':'성수1가제1동','중동':'성산동','노고산동':'서교동','신정동':'서강동','창전동':'서강동','삼선동4가':'삼선동','보문동3가':'보문동','동소문동7가':'동선동','당산동4가':'당산제2동','당산동5가':'당산제2동','당산동':'당산제2동','당산동3가':'당산제1동','당산동1가':'당산제1동','당산동2가':'당산제1동','본동':'노량진동','신수동':'노고산동','대흥동':'노고산동','금호동4가':'금호동','금호동2가':'금호동','충무로4가':'광희동','방화동':'공항동','도화동':'공덕동','신공덕동':'공덕동','일원동':'개포동'}
def change_dongs(location):
    """Replace the third space-separated token of `location` via old_and_new_dongs.

    Args:
        location: Address string whose tokens are separated by single spaces.

    Returns:
        The address with its third token swapped for the mapped name when that
        token is a key of `old_and_new_dongs`. Non-string values (e.g. NaN from
        a missing cell) and addresses with fewer than three tokens are returned
        unchanged instead of raising.
    """
    # Missing cells arrive as float NaN / None; there is nothing to rewrite.
    if not isinstance(location, str):
        return location
    parts = location.split(' ')
    # Only addresses that actually carry a third token can be remapped.
    if len(parts) > 2:
        parts[2] = old_and_new_dongs.get(parts[2], parts[2])
    return ' '.join(parts)
# Rewrite the dong token of every address in table_1.
table_1['location'] = table_1['location'].apply(change_dongs)

# Standardise every table_1 column except the identifier/text columns.
table_1_feature_cols = [col for col in table_1.columns if col not in ['aid','location','name']]
scaler = StandardScaler()
table_1[table_1_feature_cols] = scaler.fit_transform(table_1[table_1_feature_cols])

# Re-fit the same scaler object on the first 135 rows of table_2 only, then
# apply it to all of table_2.
# NOTE(review): 135 presumably marks the end of the training period -- confirm.
table_2_feature_cols = [col for col in table_2.columns if col not in ['did','year','month']]
scaler.fit(table_2[table_2_feature_cols][:135])
table_2[table_2_feature_cols] = scaler.transform(table_2[table_2_feature_cols])

# Left-join table_3 onto table_1 by 'aid', then table_2 by 'did', ordered by 'did'.
table_merge = (
    table_1
    .merge(table_3, how='left', on='aid')
    .merge(table_2, how='left', on='did')
    .sort_values(by='did')
)

# Model inputs: every column that is not an identifier, date part or the target.
excluded_from_inputs = ['aid','location','name','did','year','month','price']
input_cols = [col for col in table_merge.columns if col not in excluded_from_inputs]
input_values = table_merge[input_cols].values
# Target: price multiplied by 1e-4 (unit of the raw price is not shown here).
output_values = table_merge[['price']].values * 0.0001

In [22]:
# Write the merged table to an Excel workbook without the row index.
table_merge.to_excel(
    '../데이터/Table/전체데이터.xlsx',
    index=False,
)

In [23]:
# Column subset displayed from table_merge in the next cell. It is defined
# explicitly here because no other visible cell assigns `columns`, so the
# notebook would otherwise fail with NameError on a fresh kernel.
columns = [
    'lat',
    'lng',
    'year_built',
    'education',
    'household',
    'parking',
    'complex_facility',
    'move_in_day',
    'subway_distance',
    'convenience_facility',
    'call_rate',
    'm2',
]
columns

['lat',
 'lng',
 'year_built',
 'education',
 'household',
 'parking',
 'complex_facility',
 'move_in_day',
 'subway_distance',
 'convenience_facility',
 'call_rate',
 'm2']

In [17]:
# Show the selected columns of the merged table (all rows).
table_merge.loc[:, columns]

Unnamed: 0,lat,lng,year_built,education,household,parking,complex_facility,move_in_day,subway_distance,convenience_facility,call_rate,m2
0,-1.151331,0.800366,-1.080250,-0.784628,-0.546221,-0.147409,-0.789355,1.128989,-1.047801,0.899218,0.733665,-1.763061
36428,-0.890615,0.286372,-1.473084,-1.338345,2.139815,0.740824,-0.789355,1.487926,0.101926,-0.158244,0.733665,-1.763061
36660,-0.933928,0.320179,-0.818360,-0.784628,-0.101661,-0.147409,-0.789355,0.922555,-0.760369,-1.215707,0.733665,-1.763061
36920,-1.119778,0.401985,-1.342139,-0.784628,-0.323318,-0.147409,-0.789355,1.264349,1.539085,0.899218,0.733665,-1.763061
37153,-1.145696,0.425000,0.360145,-0.784628,-0.679464,-0.147409,-0.789355,-0.424977,1.539085,0.370487,0.733665,-1.763061
...,...,...,...,...,...,...,...,...,...,...,...,...
15407,-0.800259,-1.335940,-0.032690,-0.784628,0.057733,-0.147409,1.003477,-0.033896,-0.185506,0.899218,0.546090,5.465540
39480,0.056382,0.596076,-1.211194,0.322805,-0.566145,-0.147409,-0.072222,1.198633,0.101926,-0.686976,0.546090,5.465540
4860,1.306131,0.352236,1.407705,-0.230912,0.716478,-0.147409,-0.789355,-1.402500,0.964222,-1.744438,0.546090,5.465540
26623,0.941836,0.922835,1.538650,-0.230912,-0.466524,-0.147409,-0.789355,-1.576790,0.964222,0.370487,0.546090,5.465540


### Embedding

In [None]:
# General hyperparameters. Apart from `embedding_dim` (which the embedding
# parameter cell assigns again), none of these are read by the visible
# embedding cells -- presumably they are consumed further down the notebook
# (TODO confirm).
window_size = 12
embedding_dim = 1024
hidden_dim = 1024
sub = True  # NOTE(review): meaning of this flag is not visible here -- confirm
lr = 0.0001
epochs = 10_000

In [None]:
# Parameters
# Split fractions. Only `train_ratio` is read by the visible split cell, which
# uses everything after the training portion as the validation set.
train_ratio = 0.7
val_ratio = 0.0
test_ratio = 0.3
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Optimiser / loop settings for the embedding model.
embedding_lr = 0.00001
embedding_weight_decay = 0
# Single source of truth for the batch size (an earlier `embedding_batch = 64`
# in this cell was always overwritten by this value and has been removed).
embedding_batch = 128
embedding_epochs = 150

# Layer widths passed to the Embedding model constructor.
encoder_dim_1 = 128
encoder_dim_2 = 256
encoder_dim_3 = 512
embedding_dim = 1024
decoder_dim_1 = 512
decoder_dim_2 = 256
decoder_dim_3 = 128

In [None]:
# Build the embedding dataset and split it by position: the first
# `train_ratio` share of samples trains the model, the remainder validates it.
dataset = Embedding_Dataset(table_1, table_2, table_3, DEVICE)
dataset_length = len(dataset)
split_point = int(train_ratio * dataset_length)

train_indices, val_indices = range(split_point), range(split_point, dataset_length)
train_dataset, val_dataset = Subset(dataset, train_indices), Subset(dataset, val_indices)

# Both loaders keep the sample order and drop the final incomplete batch.
train_dataloader, val_dataloader = (
    DataLoader(subset, batch_size=embedding_batch, shuffle=False, drop_last=True)
    for subset in (train_dataset, val_dataset)
)

In [None]:
# Train the embedding model with RMSE loss and Adam, validating after every
# epoch and stopping early when the `early_stops` helper says so.
model = Embedding(encoder_dim_1, encoder_dim_2, encoder_dim_3, embedding_dim, decoder_dim_1, decoder_dim_2, decoder_dim_3).to(DEVICE)
criterion = RMSE()
optimizer = torch.optim.Adam(model.parameters(), lr=embedding_lr, weight_decay=embedding_weight_decay)

train_losses = []
val_losses = []
best_val_loss = float('inf')
consecutive_val_loss_increases = 0
max_consecutive_val_loss_increases = 3


def _embedding_checkpoint_path(epochs_trained):
    """Return the checkpoint path that encodes this run's hyperparameters.

    NOTE(review): the 'd1' tag has no underscore before its value; it is kept
    as-is so the rest of the name stays compatible with existing files.
    """
    return (
        f'../데이터/Checkpoint/embedding_tr_{train_ratio}_lr_{embedding_lr}_wd_{embedding_weight_decay}'
        f'_batch_{embedding_batch}_epochs_{epochs_trained}'
        # The e2 tag now reports encoder_dim_2 (it previously repeated encoder_dim_1).
        f'_e1_{encoder_dim_1}_e2_{encoder_dim_2}_e3_{encoder_dim_3}_emb_{embedding_dim}'
        f'_d1{decoder_dim_1}_d2_{decoder_dim_2}_d3_{decoder_dim_3}.pth'
    )


# Both loaders drop incomplete batches, so a split smaller than one batch yields
# zero batches; fail with a clear message instead of a ZeroDivisionError below.
if len(train_dataloader) == 0 or len(val_dataloader) == 0:
    raise ValueError('train/val dataloader is empty: dataset split is smaller than embedding_batch')

for epoch in range(embedding_epochs):
    # ---- training pass ----
    model.train()
    total_train_loss = 0
    for data in train_dataloader:
        # Named `inputs` (not `input`) so the builtin is not shadowed in the kernel.
        inputs = data[0].to(DEVICE)
        targets = data[1].to(DEVICE)
        outputs = model(inputs).to(DEVICE)

        train_loss = criterion(outputs, targets)
        total_train_loss += train_loss.item()

        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()

    avg_train_loss = total_train_loss / len(train_dataloader)
    train_losses.append(avg_train_loss)

    # ---- validation pass (no gradients) ----
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for data in val_dataloader:
            inputs = data[0].to(DEVICE)
            targets = data[1].to(DEVICE)
            outputs = model(inputs).to(DEVICE)

            val_loss = criterion(outputs, targets)
            total_val_loss += val_loss.item()

    avg_val_loss = total_val_loss / len(val_dataloader)
    val_losses.append(avg_val_loss)
    best_val_loss = min(best_val_loss, avg_val_loss)

    # `early_stops` returns the stop flag together with the updated counter.
    early_stop, consecutive_val_loss_increases = early_stops(val_losses, consecutive_val_loss_increases, max_consecutive_val_loss_increases)

    if early_stop:
        print(f'Epoch [{epoch+1}/{embedding_epochs}], Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f} \nEarly Stop Triggered!')
        # The whole model object is pickled (not just its state_dict).
        torch.save(model, _embedding_checkpoint_path(epoch + 1))
        break

    print(f'Epoch [{epoch+1}/{embedding_epochs}], Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}')
else:
    # The loop ran to completion without early stopping: still persist the
    # trained model (previously it was saved only on early stop).
    if train_losses:
        torch.save(model, _embedding_checkpoint_path(embedding_epochs))

plot_train_val_losses(train_losses, val_losses)