In [1]:
import warnings
warnings.filterwarnings(action='ignore')

import os
import gc
import math
import random
import pickle
import pandas as pd
import numpy as np
import multiprocessing
from tqdm.auto import tqdm

from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler

from transformers import get_cosine_schedule_with_warmup

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset, sampler

In [2]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [3]:
# Seed value handed to seed_everything() in this cell.
random_seed = 41

def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy, and PyTorch (the default
    generator plus all CUDA devices), disables cuDNN benchmarking, enables
    cuDNN deterministic mode, and writes the seed to the ``PYTHONHASHSEED``
    environment variable.

    Args:
        seed: Seed value applied to each generator.
    """
    # Generators first, in the same order for every call.
    generator_seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in generator_seeders:
        apply_seed(seed)

    # cuDNN switches: no auto-tuner, deterministic mode on.
    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True

    os.environ["PYTHONHASHSEED"] = f"{seed}"
    
def seed_worker(worker_id):
    """Seed NumPy and ``random`` from PyTorch's current initial seed.

    Used as the ``worker_init_fn`` of a ``DataLoader`` in this notebook.

    Args:
        worker_id: Index supplied by the caller; it is not used.
    """
    # Reduce the torch seed modulo 2**32 before passing it on.
    derived_seed = torch.initial_seed() % (1 << 32)
    for apply_seed in (np.random.seed, random.seed):
        apply_seed(derived_seed)

seed_everything(seed=random_seed) # Fix the random seed

In [4]:
# Read the train and test tables from CSV, then show both shapes.
train_csv, test_csv = "./data/df_train.csv", "./data/df_test.csv"
train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)
(train.shape, test.shape)

((262, 48), (175, 47))

In [5]:
# Labels: the 'class' column as an int64 tensor.
y = torch.LongTensor(train['class'].values)

# Feature matrices: drop the identifier column (and, for train, the label column).
X = train.drop(columns=['id', 'class']).to_numpy()
X_test = test.drop(columns=['id']).to_numpy()
y

tensor([1, 2, 1, 0, 2, 1, 1, 1, 1, 2, 1, 1, 2, 2, 1, 0, 0, 0, 0, 2, 1, 1, 0, 1,
        2, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 2, 1, 0, 1, 1, 1, 1, 1, 0, 2, 1, 1, 0,
        2, 0, 0, 2, 0, 2, 1, 1, 0, 0, 2, 2, 1, 0, 2, 2, 0, 2, 2, 0, 1, 2, 2, 2,
        0, 1, 1, 1, 2, 1, 2, 0, 1, 1, 2, 0, 2, 0, 2, 2, 2, 1, 1, 1, 2, 2, 1, 1,
        2, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2,
        1, 1, 2, 0, 0, 1, 1, 0, 2, 2, 2, 0, 1, 2, 0, 1, 2, 0, 2, 1, 1, 2, 2, 1,
        1, 1, 2, 2, 0, 1, 0, 2, 2, 2, 1, 0, 1, 1, 1, 1, 1, 0, 1, 2, 1, 1, 1, 0,
        0, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 0, 0, 2, 0, 2, 2, 1, 1, 2, 1, 2, 2, 1,
        1, 1, 0, 1, 1, 2, 1, 2, 0, 1, 0, 1, 0, 2, 2, 1, 2, 1, 2, 1, 0, 0, 1, 2,
        0, 0, 2, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 2, 2, 1, 2, 1, 2, 1, 0, 2, 0,
        0, 2, 2, 1, 1, 1, 0, 1, 0, 1, 0, 2, 1, 0, 1, 1, 1, 1, 2, 0, 0, 1])

In [6]:
# One-hot encode the labels over 3 classes, then move the result to `device`.
y2 = nn.functional.one_hot(y, num_classes=3)
y2 = y2.to(device).long()

In [7]:
# Stack the train rows on top of the test rows (concatenation along axis 0).
total = np.concatenate((X, X_test))
total.shape

(437, 46)

In [8]:
# Fit the scaler on the combined train + test matrix, then replace `total`
# with its scaled version.
scaler = StandardScaler()
scaler.fit(total)
total = scaler.transform(total)
total.shape

(437, 46)

In [9]:
class Encoder(nn.Module):
    """Linear encoder that maps input features to a two-part latent code.

    The layers are plain ``nn.Linear`` modules despite their ``lstm*``
    attribute names; those names are the keys of the module's state dict,
    so they are kept as they are.

    Args:
        n_features: Width of the input feature vector.
        latent_dim: Total latent width. The two output heads have widths
            ``3`` and ``latent_dim - 3``.
    """

    def __init__(self, n_features, latent_dim):
        super().__init__()

        wide = latent_dim ** 2
        narrow = latent_dim * 3

        # Shared trunk: n_features -> latent_dim**2 -> latent_dim*3.
        self.lstm0 = nn.Linear(n_features, wide)
        self.lstm1 = nn.Linear(wide, narrow)
        # Two heads that both read the trunk output.
        self.lstm2_1 = nn.Linear(narrow, 3)
        self.lstm2_2 = nn.Linear(narrow, latent_dim - 3)

    def forward(self, x):
        """Return the pair ``(3-wide head, (latent_dim - 3)-wide head)`` for ``x``."""
        trunk = self.lstm1(self.lstm0(x))
        return self.lstm2_1(trunk), self.lstm2_2(trunk)


class Decoder(nn.Module):
    """Linear decoder that maps a latent code back to the feature space.

    The layers are plain ``nn.Linear`` modules despite the ``lstm*``
    attribute names; those names are the keys of the module's state dict,
    so they are kept as they are.

    Args:
        n_features: Width of the reconstructed feature vector.
        latent_dim: Width of the latent code fed into the decoder.
    """

    def __init__(self, n_features, latent_dim):
        super().__init__()
        self.latent_dim = latent_dim

        # Layer widths: latent_dim -> *3 -> **2 -> *2 -> n_features.
        width_a = latent_dim * 3
        width_b = latent_dim ** 2
        width_c = latent_dim * 2

        self.lstm0 = nn.Linear(latent_dim, width_a)
        self.lstm1 = nn.Linear(width_a, width_b)
        self.lstm2 = nn.Linear(width_b, width_c)

        self.linear = nn.Linear(in_features=width_c, out_features=n_features)

    def forward(self, x):
        """Apply the four linear layers in sequence and return the result."""
        for layer in (self.lstm0, self.lstm1, self.lstm2, self.linear):
            x = layer(x)
        return x


class AutoEncoder(nn.Module):
    """Encoder / decoder pair that returns both the latent code and the reconstruction.

    Args:
        n_features: Width of the input (and reconstructed) feature vector.
        latent_dim: Width of the latent code.
        device: Passed to ``.to()`` on both sub-modules.
    """

    def __init__(self, n_features=46, latent_dim=3+24, device=None):
        super().__init__()

        self.encoder = Encoder(n_features, latent_dim).to(device)
        self.decoder = Decoder(n_features, latent_dim).to(device)

    def forward(self, x):
        """Return ``(latent, reconstruction)`` for the batch ``x``."""
        # Join the encoder's two heads along dim 1 to form the full latent code.
        latent = torch.cat(self.encoder(x), dim=1)
        reconstruction = self.decoder(latent)
        return latent, reconstruction

In [10]:
# Build the network on `device`, restore the saved weights, and switch to eval mode.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
encoder_model = AutoEncoder().to(device)
saved_weights = torch.load('./models/AutoEncoder_total.pt', map_location=device)
encoder_model.load_state_dict(saved_weights)
encoder_model.eval()

AutoEncoder(
  (encoder): Encoder(
    (lstm0): Linear(in_features=46, out_features=729, bias=True)
    (lstm1): Linear(in_features=729, out_features=81, bias=True)
    (lstm2_1): Linear(in_features=81, out_features=3, bias=True)
    (lstm2_2): Linear(in_features=81, out_features=24, bias=True)
  )
  (decoder): Decoder(
    (lstm0): Linear(in_features=27, out_features=81, bias=True)
    (lstm1): Linear(in_features=81, out_features=729, bias=True)
    (lstm2): Linear(in_features=729, out_features=54, bias=True)
    (linear): Linear(in_features=54, out_features=46, bias=True)
  )
)

In [11]:
# Wrap the scaled `total` matrix in a dataset. The second tensor is a dummy
# target of zeros that the loop below never reads. Despite the `train_` prefix,
# the loader iterates over every row of `total`.
train_dataset = TensorDataset(torch.from_numpy(total).type(torch.float), torch.zeros(len(total)).type(torch.float))
# batch_size=1 so the mean-reduced MSE below is one reconstruction error per row.
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=False, num_workers=2, worker_init_fn=seed_worker)

encodings = []
errors = []
criterion = nn.MSELoss().to(device)

# Inference only: with autograd disabled, no graph is built for each forward pass.
with torch.no_grad():
    for x, _ in tqdm(train_loader):
        x = x.to(device)

        encoded_features, decoded_features = encoder_model(x)
        encodings += encoded_features.cpu().numpy().tolist()

        loss = criterion(x, decoded_features)
        # Store a plain Python float instead of a 0-d tensor.
        errors.append(loss.item())

print(len(encodings), len(errors))

  0%|          | 0/437 [00:00<?, ?it/s]

437 437


In [12]:
# Combine the latent codes and the reconstruction errors into one matrix,
# with the error as the last column.
encodings = np.array(encodings)
# reshape(-1, 1) instead of expand_dims: on the first run the result is the same
# (n, 1) column, and re-running this cell leaves an existing (n, 1) column as it is
# rather than adding another axis, which would make the concatenation fail.
errors = np.array(errors).reshape(-1, 1)
values = np.concatenate([encodings, errors], axis=1)
values.shape

(437, 28)

In [13]:
# Column names: one "encodings_<i>" per latent dimension, then "errors".
column_names = [f"encodings_{idx}" for idx in range(len(encodings[0]))]
column_names.append("errors")
df = pd.DataFrame(values, columns=column_names)
df

Unnamed: 0,encodings_0,encodings_1,encodings_2,encodings_3,encodings_4,encodings_5,encodings_6,encodings_7,encodings_8,encodings_9,...,encodings_18,encodings_19,encodings_20,encodings_21,encodings_22,encodings_23,encodings_24,encodings_25,encodings_26,errors
0,-3.235218,4.172423,-3.853053,-1.533935,2.644730,3.716698,-0.299315,-2.650086,0.783824,-0.406989,...,1.082240,-1.530903,-1.215420,-3.295104,0.245303,-0.055747,-1.089265,-1.649269,0.001300,0.008067
1,-3.164804,6.937546,-0.615424,0.798070,-2.365728,-1.246152,0.571691,0.340047,-0.374678,2.935323,...,-0.065099,-0.072708,-2.343938,0.866952,-0.382145,-2.748686,0.459386,0.983944,3.129371,0.009998
2,-3.450719,6.478036,-4.994792,-1.083578,-1.050726,-0.028033,-1.456105,-1.262530,-1.064003,2.599940,...,-3.403145,-1.100375,-1.358432,-1.019482,3.274151,3.080497,2.445034,1.530065,0.137097,0.040302
3,-5.078038,10.407413,-5.961476,-0.954541,-2.045539,0.913142,3.408923,-2.241139,1.254574,1.749235,...,-0.824590,1.962076,-0.140499,-1.557903,3.796697,3.281870,3.378306,1.271765,-2.542374,0.008206
4,-4.685338,6.908415,-5.594373,1.017355,-2.475780,1.170744,-1.731986,2.448224,-0.242145,-0.236407,...,-2.392161,-1.289728,-2.297753,2.706107,0.366826,1.816569,1.143190,4.631185,-0.518664,0.036693
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
432,-6.713977,4.797306,-6.859674,-0.427648,-1.465041,0.584409,-1.570887,0.707021,0.202451,-1.051615,...,0.948983,-1.454844,3.104134,-1.628868,1.587918,-1.379632,-0.742597,3.711644,0.533880,0.028781
433,-3.505607,6.129325,-2.930341,-0.359151,1.557368,0.367298,1.640623,2.402381,-2.167556,0.352788,...,-3.493095,0.081309,-3.520641,-0.392295,2.330890,1.521016,0.899050,4.925332,0.413980,0.015013
434,-1.853786,3.641998,0.470991,-1.175941,3.441618,1.693989,2.112146,1.143281,-0.975709,-2.530986,...,-1.952293,1.092013,-1.333903,1.238227,1.457066,0.588651,-1.395687,1.455708,-0.818322,0.050879
435,-3.249345,8.500051,-4.339647,-2.227715,-1.511902,0.807777,-0.969590,-1.554576,0.884828,-0.540825,...,-0.068143,-3.087835,0.714141,0.378201,2.740462,-0.029870,-0.452864,-0.030688,2.088013,0.007352


In [14]:
# Write the encodings and reconstruction errors to CSV, without the index column.
df.to_csv("./data/ae_values.csv", index=False)