In [None]:
import copy
import glob
import random
from itertools import product

import psycopg2

import numpy as np
import pandas as pd

import joblib
from sklearn.metrics import mean_squared_error

import torch
from torch import nn
from torch.utils.data import Subset, DataLoader

from Dataset.Embedding_Dataset import Embedding_Dataset
from Model.Embedding import Embedding

from Dataset.Apartment_Complex_Dataset import Apartment_Complex_Dataset
from Model.LSTM import LSTM
from Model.GRU import GRU
from Model.Transformer import Transformer

from Dataset.Dong_Dataset import Dong_Dataset
from Model.LSTM_Attention import LSTMAttention
from Model.GRU_Attention import GRUAttention
from Model.Transformer_Attention import TransformerAttention

from utils import RMSE, save_train_val_losses

# --- Reproducibility -------------------------------------------------------
# Seed every random number generator used in this notebook with one value.
SEED = 1234
for seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed,
    torch.cuda.manual_seed_all,
):
    seed_fn(SEED)

# Ask cuDNN for deterministic kernels and disable its auto-tuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# --- Compute device --------------------------------------------------------
# Use the GPU when one is visible, otherwise fall back to the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# --- Source tables ---------------------------------------------------------
# Three CSV tables, loaded with pandas defaults.
table_1 = pd.read_csv('../데이터/Table/table_1.csv')
table_2 = pd.read_csv('../데이터/Table/table_2.csv')
table_3 = pd.read_csv('../데이터/Table/table_3.csv')

## Embedding

In [None]:
# Hyper-parameter grid searched when training the embedding model.
embedding_lrs = [
    1e-2,
    1e-3,
    1e-4,
    1e-5,
    1e-6,
]
embedding_batchs = [2 ** exponent for exponent in range(6, 12)]  # 64 ... 2048

# Upper bound on training epochs per grid point.
embedding_epochs = 10_000

# Layer widths passed to the Embedding model constructor, in argument order:
# three encoder widths, the embedding width, then three decoder widths.
encoder_dim_1, encoder_dim_2, encoder_dim_3 = 128, 256, 512
embedding_dim = 1024
decoder_dim_1, decoder_dim_2, decoder_dim_3 = 512, 256, 128

In [None]:
# Build the embedding dataset from the three loaded tables.
dataset = Embedding_Dataset(table_1, table_2, table_3)
dataset_length = len(dataset)

# Split fractions for the train / validation / test partitions.
train_ratio, val_ratio, test_ratio = 0.7, 0.2, 0.1

# Partition sizes, truncated to whole samples.
train_size = int(train_ratio * dataset_length)
val_size = int(val_ratio * dataset_length)
test_size = int(test_ratio * dataset_length)

# Contiguous, unshuffled index ranges. The test range runs to the end of the
# dataset, so it absorbs any rounding remainder and may exceed test_size.
val_end = train_size + val_size
train_indices = range(train_size)
val_indices = range(train_size, val_end)
test_indices = range(val_end, dataset_length)

train_dataset, val_dataset, test_dataset = (
    Subset(dataset, indices)
    for indices in (train_indices, val_indices, test_indices)
)

In [None]:
# Grid search over (learning rate, batch size) for the Embedding model.
#
# For every combination: train with Adam on RMSE, stop early once the
# validation loss has failed to improve for `max_early_stop_count` consecutive
# epochs, restore the best weights seen, save the model and the per-epoch loss
# curves, and record one summary row. All rows are written to Excel at the end.
results_columns = ['Learning Rate', 'Batch Size', 'Epochs', 'Train Loss', 'Validation Loss']
# Rows are collected in a plain list: `DataFrame.append` returns a new frame
# (its result was previously discarded) and no longer exists in pandas >= 2.0.
results_records = []
for embedding_lr in embedding_lrs:
    for embedding_batch in embedding_batchs:
        # NOTE(review): drop_last=True discards the final partial batch; if a split
        # has fewer samples than the batch size the loader is empty and the
        # average-loss divisions below fail — confirm the split sizes.
        train_dataloader = DataLoader(train_dataset, batch_size=embedding_batch, shuffle=False, drop_last=True)
        val_dataloader = DataLoader(val_dataset, batch_size=embedding_batch, shuffle=False, drop_last=True)

        model = Embedding(encoder_dim_1, encoder_dim_2, encoder_dim_3, embedding_dim, decoder_dim_1, decoder_dim_2, decoder_dim_3).to(DEVICE)
        criterion = RMSE()
        optimizer = torch.optim.Adam(model.parameters(), lr=embedding_lr)

        train_losses = []
        val_losses = []

        max_early_stop_count = 3
        early_stop_count = 0
        best_val_loss = float('inf')
        best_model_weights = None

        for epoch in range(embedding_epochs):
            # ---- training pass ----
            model.train()
            total_train_loss = 0
            for data in train_dataloader:
                # `inputs` rather than `input`, so the builtin is not shadowed.
                inputs = data[0].to(DEVICE)
                target = data[1].to(DEVICE)
                output = model(inputs).to(DEVICE)

                train_loss = criterion(output, target)
                total_train_loss += train_loss.item()

                optimizer.zero_grad()
                train_loss.backward()
                optimizer.step()

            avg_train_loss = total_train_loss / len(train_dataloader)
            train_losses.append(avg_train_loss)

            # ---- validation pass (no gradients) ----
            model.eval()
            total_val_loss = 0
            with torch.no_grad():
                for data in val_dataloader:
                    inputs = data[0].to(DEVICE)
                    target = data[1].to(DEVICE)
                    output = model(inputs).to(DEVICE)

                    val_loss = criterion(output, target)
                    total_val_loss += val_loss.item()

            avg_val_loss = total_val_loss / len(val_dataloader)
            val_losses.append(avg_val_loss)

            # ---- early-stopping bookkeeping ----
            if best_val_loss > avg_val_loss:
                best_val_loss = avg_val_loss
                # Deep copy: state_dict() returns references to the live tensors.
                best_model_weights = copy.deepcopy(model.state_dict())
                early_stop_count = 0
            else:
                early_stop_count += 1

            if early_stop_count >= max_early_stop_count:
                print(f'Epoch [{epoch+1}/{embedding_epochs}], Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f} \nEarly Stop Triggered!')
                break

            print(f'Epoch [{epoch+1}/{embedding_epochs}], Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}')

        # Restore and save the best weights whether training stopped early or used
        # every epoch (previously a run that never early-stopped was not saved).
        # best_model_weights stays None if the validation loss never improved
        # (e.g. it was NaN from the first epoch); keep the current weights then.
        if best_model_weights is not None:
            model.load_state_dict(best_model_weights)
        torch.save(model, f'../데이터/Checkpoint/emb/embedding_lr_{embedding_lr}_batch_{embedding_batch}_epochs_{epoch+1}.pth')

        save_train_val_losses(train_losses, val_losses, f'../데이터/Checkpoint/emb/embedding_lr_{embedding_lr}_batch_{embedding_batch}_epochs_{epoch+1}')

        results_records.append({
            'Learning Rate': embedding_lr,
            'Batch Size': embedding_batch,
            'Epochs': epoch + 1,
            'Train Loss': min(train_losses),
            'Validation Loss': min(val_losses)
        })

results_df = pd.DataFrame(results_records, columns=results_columns)
results_df.to_excel('../데이터/Checkpoint/emb/embedding_results.xlsx')

In [None]:
# Rebuild the embedding grid-search summary from the loss logs on disk.
#
# For every (learning rate, batch size) pair, find each saved
# `..._epochs_<N>_train_losses.txt` file and its matching `_val_losses.txt`,
# take the minimum of each curve, and write one row per run to Excel, sorted by
# validation loss (best first).
combinations = list(product(embedding_lrs, embedding_batchs))

results_columns = ['Learning Rate', 'Batch Size', 'Train Loss', 'Validation Loss']
train_suffix = '_train_losses.txt'
val_suffix = '_val_losses.txt'

# Rows are collected in a list and turned into a frame once: `DataFrame.append`
# was removed in pandas 2.0, and the old bare `except` hid that error, which
# produced an empty summary without any message.
results_records = []
for embedding_lr, embedding_batch in combinations:
    run_prefix = f'../데이터/Checkpoint/emb/embedding_lr_{embedding_lr}_batch_{embedding_batch}_epochs_'
    # Glob for the files that exist instead of probing a fixed range of epoch
    # numbers, so runs of any length are picked up.
    train_losses_pattern = glob.escape(run_prefix) + '*' + train_suffix
    for train_losses_file_path in sorted(glob.glob(train_losses_pattern)):
        val_losses_file_path = train_losses_file_path[:-len(train_suffix)] + val_suffix
        try:
            # One loss value per line; blank lines are ignored.
            with open(train_losses_file_path, 'r') as train_losses_file:
                train_losses = [float(line) for line in train_losses_file if line.strip()]
            with open(val_losses_file_path, 'r') as val_losses_file:
                val_losses = [float(line) for line in val_losses_file if line.strip()]

            min_train_loss = min(train_losses)
            min_val_loss = min(val_losses)
        except (OSError, ValueError) as error:
            # Best effort: a missing, unreadable, empty or malformed log skips
            # that run, but the skip is reported rather than hidden.
            print(f'Skipping {train_losses_file_path}: {error}')
            continue

        results_records.append({
            'Learning Rate': embedding_lr,
            'Batch Size': embedding_batch,
            'Train Loss': min_train_loss,
            'Validation Loss': min_val_loss
        })

results_df = (
    pd.DataFrame(results_records, columns=results_columns)
    .sort_values('Validation Loss')
    .reset_index(drop=True)
)
results_df.to_excel('../데이터/Checkpoint/emb/embedding_results.xlsx')