In [1]:
import warnings
warnings.filterwarnings(action='ignore')

import os
import gc
import math
import random
import pickle
import pandas as pd
import numpy as np
import multiprocessing
from tqdm.auto import tqdm

from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler

from transformers import get_cosine_schedule_with_warmup

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset, sampler

In [2]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [3]:
random_seed = 41  # seed handed to seed_everything() further down in this cell

def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (the default generator plus all CUDA devices), puts cuDNN into
    deterministic mode with benchmarking disabled, and exports
    ``PYTHONHASHSEED``.

    Args:
        seed: Integer seed applied to all of the generators above.
    """
    # The environment variable is only read at interpreter start-up, so it
    # affects processes launched after this call, not the running kernel.
    os.environ["PYTHONHASHSEED"] = str(seed)

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Use deterministic cuDNN kernels and turn off the benchmark autotuner.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    
def seed_worker(worker_id):
    """Seed ``random`` and NumPy from PyTorch's current initial seed.

    Written to be passed as ``worker_init_fn`` to a ``DataLoader``.

    Args:
        worker_id: Worker index supplied by the caller; not used here.
    """
    # Fold the 64-bit torch seed into the 32-bit range NumPy accepts.
    derived_seed = torch.initial_seed() % (1 << 32)
    random.seed(derived_seed)
    np.random.seed(derived_seed)

seed_everything(seed=random_seed) # Fix the random seed

In [4]:
# Load the train and test tables from disk and show their shapes.
train_csv_path = "./data/df_train6.csv"
test_csv_path = "./data/df_test6.csv"

train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

(train.shape, test.shape)

((193, 33), (175, 33))

In [5]:
# Labels become a LongTensor; every column except 'id' and 'class' is kept
# as a feature and converted to a plain NumPy array.
non_feature_columns = ['id', 'class']

y = torch.LongTensor(train['class'].values)
X = train.drop(columns=non_feature_columns).to_numpy()
X_test = test.drop(columns=non_feature_columns).to_numpy()
y

tensor([0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1,
        0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1,
        1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0,
        0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
        0])

In [6]:
# One-hot encode the two class labels, then move the result to `device`.
one_hot_labels = nn.functional.one_hot(y, num_classes=2)
y2 = one_hot_labels.to(device).long()

In [7]:
# Stack the train rows on top of the test rows (axis 0 is the default).
total = np.concatenate((X, X_test))
total.shape

(368, 31)

In [8]:
class Encoder(nn.Module):
    """Feed-forward encoder that compresses a feature vector to ``latent_dim``.

    Three Linear -> GELU -> Dropout(0.1) stages are applied in sequence:
    ``n_features -> latent_dim**2 -> latent_dim*3 -> latent_dim``.

    The stages are called ``lstm0``..``lstm2`` although they hold ``nn.Linear``
    layers; the attribute names determine the ``state_dict`` keys, so they
    are left unchanged.

    Args:
        n_features: Size of the last dimension of the input.
        latent_dim: Size of the encoded representation.
    """

    def __init__(self, n_features, latent_dim):
        super().__init__()

        wide_dim = latent_dim ** 2
        mid_dim = latent_dim * 3

        self.lstm0 = nn.Sequential(
            nn.Linear(n_features, wide_dim),
            nn.GELU(),
            nn.Dropout(0.1),
        )
        self.lstm1 = nn.Sequential(
            nn.Linear(wide_dim, mid_dim),
            nn.GELU(),
            nn.Dropout(0.1),
        )
        self.lstm2 = nn.Sequential(
            nn.Linear(mid_dim, latent_dim),
            nn.GELU(),
            nn.Dropout(0.1),
        )

    def forward(self, x):
        """Return the latent encoding of ``x``."""
        hidden = x
        for stage in (self.lstm0, self.lstm1, self.lstm2):
            hidden = stage(hidden)
        return hidden


class Decoder(nn.Module):
    """Feed-forward decoder that maps a latent vector back to feature space.

    Three Linear -> GELU -> Dropout(0.1) stages expand the input
    ``latent_dim -> latent_dim*3 -> latent_dim**2 -> latent_dim*2``, and a
    final plain ``nn.Linear`` projects to ``n_features``.

    The stages are called ``lstm0``..``lstm2`` although they hold ``nn.Linear``
    layers; the attribute names determine the ``state_dict`` keys, so they
    are left unchanged.

    Args:
        n_features: Size of the last dimension of the output.
        latent_dim: Size of the last dimension of the input.
    """

    def __init__(self, n_features, latent_dim):
        super().__init__()
        self.latent_dim = latent_dim

        mid_dim = latent_dim * 3
        wide_dim = latent_dim ** 2
        head_dim = latent_dim * 2

        self.lstm0 = nn.Sequential(
            nn.Linear(latent_dim, mid_dim),
            nn.GELU(),
            nn.Dropout(0.1),
        )
        self.lstm1 = nn.Sequential(
            nn.Linear(mid_dim, wide_dim),
            nn.GELU(),
            nn.Dropout(0.1),
        )
        self.lstm2 = nn.Sequential(
            nn.Linear(wide_dim, head_dim),
            nn.GELU(),
            nn.Dropout(0.1),
        )

        # Output projection; no activation or dropout follows it.
        self.linear = nn.Linear(in_features=head_dim, out_features=n_features)

    def forward(self, x):
        """Return the reconstruction computed from the latent input ``x``."""
        hidden = x
        for stage in (self.lstm0, self.lstm1, self.lstm2):
            hidden = stage(hidden)
        return self.linear(hidden)


class AutoEncoder(nn.Module):
    """Encoder/decoder pair that returns both the encoding and the reconstruction.

    Args:
        n_features: Input (and reconstructed output) width.
        latent_dim: Width of the encoding produced by the encoder.
        device: Passed to ``.to()`` for both sub-modules.
    """

    def __init__(self, n_features=31, latent_dim=7, device=None):
        super().__init__()

        encoder = Encoder(n_features, latent_dim)
        self.encoder = encoder.to(device)

        decoder = Decoder(n_features, latent_dim)
        self.decoder = decoder.to(device)

    def forward(self, x):
        """Return ``(encoding, reconstruction)`` for the input ``x``."""
        latent = self.encoder(x)
        reconstruction = self.decoder(latent)
        return latent, reconstruction

In [9]:
# Build the autoencoder with its default sizes, restore the saved weights,
# and switch to evaluation mode.
encoder_model = AutoEncoder().to(device)
saved_weights = torch.load('./models/AutoEncoder_total.pt', map_location=device)
encoder_model.load_state_dict(saved_weights)
encoder_model.eval()

AutoEncoder(
  (encoder): Encoder(
    (lstm0): Sequential(
      (0): Linear(in_features=31, out_features=49, bias=True)
      (1): GELU(approximate='none')
      (2): Dropout(p=0.1, inplace=False)
    )
    (lstm1): Sequential(
      (0): Linear(in_features=49, out_features=21, bias=True)
      (1): GELU(approximate='none')
      (2): Dropout(p=0.1, inplace=False)
    )
    (lstm2): Sequential(
      (0): Linear(in_features=21, out_features=7, bias=True)
      (1): GELU(approximate='none')
      (2): Dropout(p=0.1, inplace=False)
    )
  )
  (decoder): Decoder(
    (lstm0): Sequential(
      (0): Linear(in_features=7, out_features=21, bias=True)
      (1): GELU(approximate='none')
      (2): Dropout(p=0.1, inplace=False)
    )
    (lstm1): Sequential(
      (0): Linear(in_features=21, out_features=49, bias=True)
      (1): GELU(approximate='none')
      (2): Dropout(p=0.1, inplace=False)
    )
    (lstm2): Sequential(
      (0): Linear(in_features=49, out_features=14, bias=True)
    

In [10]:
# Push every row of `total` through the loaded autoencoder and collect, per
# row, the latent encoding and the reconstruction error.
# The second tensor in the dataset is an all-zero placeholder; the loop does
# not use it.
train_dataset = TensorDataset(torch.from_numpy(total).type(torch.float), torch.zeros(len(total)).type(torch.float))
# batch_size=1 so that nn.MSELoss (which averages over the whole batch by
# default) yields one reconstruction error per row; shuffle=False keeps the
# results in the same order as `total`.
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=False, num_workers=2, worker_init_fn=seed_worker)

encodings = []
errors = []
criterion = nn.MSELoss().to(device)
# Inference only: no_grad() stops autograd from recording a graph for every
# forward pass. The collected values are unchanged.
with torch.no_grad():
    for x, _label in tqdm(iter(train_loader)):
        x = x.to(device)

        encoded_features, decoded_features = encoder_model(x)
        encodings += encoded_features.detach().cpu().numpy().tolist()

        # Mean squared error between the input row and its reconstruction.
        loss = criterion(x, decoded_features)
        errors.append(loss.detach().cpu())

print(len(encodings), len(errors))

  0%|          | 0/368 [00:00<?, ?it/s]

368 368


In [11]:
# Turn the collected lists into arrays and append the reconstruction error
# as one extra column after the latent features.
encodings = np.array(encodings)
errors = np.array(errors)[:, np.newaxis]
values = np.hstack([encodings, errors])
values.shape

(368, 8)

In [12]:
# One "encodings_<i>" column per latent dimension, followed by "errors".
latent_columns = [f"encodings_{idx}" for idx in range(len(encodings[0]))]
df = pd.DataFrame(values, columns=latent_columns + ["errors"])
df

Unnamed: 0,encodings_0,encodings_1,encodings_2,encodings_3,encodings_4,encodings_5,encodings_6,errors
0,0.038362,-0.123360,0.028561,-0.007297,-0.014100,-0.022615,-0.071094,0.027966
1,0.018671,0.003465,-0.168691,-0.089982,-0.076945,0.159816,0.338336,0.047560
2,0.160368,0.028850,0.055501,-0.033374,0.042429,0.094334,-0.099921,0.078756
3,-0.052722,0.162948,-0.049646,0.077969,-0.072254,0.062971,-0.052153,0.046547
4,0.016150,0.077639,-0.008436,0.060002,-0.028739,0.026300,-0.134074,0.024998
...,...,...,...,...,...,...,...,...
363,-0.161607,-0.003663,-0.010378,-0.040992,0.144521,-0.028976,-0.080442,0.042534
364,0.103941,-0.034385,-0.122455,0.058987,-0.001012,0.226782,0.057702,0.016201
365,0.102044,-0.053995,-0.068710,0.085086,0.070044,0.063672,0.108152,0.080130
366,0.060815,0.051417,0.044992,-0.076959,0.077362,-0.004386,-0.092469,0.032291


In [13]:
# Write the encodings and errors to CSV without the row index.
df.to_csv(path_or_buf="./data/ae_values.csv", index=False)