배치를 직접 나눠서 원핫인코딩 시, 메모리초과 안나도록!

## 1. 결측치처리
**해당 노트북**
+ 전처리방법10 + vae 활용 + validation set 확인

In [1]:
import pandas as pd
import numpy as np
import random
import os
import gc

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42) # Seed 고정

### 1.1. 전처리방법2 데이터 가져오기

In [2]:
train = pd.read_parquet('./data/train_preprocess_10.parquet')
test = pd.read_parquet('./data/test_preprocess_10.parquet')
sample_submission = pd.read_csv('sample_submission.csv', index_col = 0)

print(train.shape)
print(train.Delay.value_counts())

(976567, 10)
Not_Delayed    205036
Delayed         43985
Name: Delay, dtype: int64


In [3]:
print(train['Tail_Number'].nunique(), test['Tail_Number'].nunique())


6426 6493


### 1.3. label & unlabel split  / label_train & label_validation split

#### 1배치 데이터 흐름
1. vae에는 X_train_labeled와 X_unlabeled를 각각 onehot으로 만들어서 합쳐서 넣어주기
2. classifier에는 X_train_labeled를 onehot으로 만든 것 넣어주기


#### 필요한 것
1. labeled와 unlabeled 나누기
2. labeled에서 train과 validation 분리하기
3. X_train_labeld & X_unlabeled 를 이용한 onehot encoding
4. 전체 데이터에 onehot 적용하면 데이터 크기 너무 커지므로, 배치로 처리하기

#### 1.3.1. 데이터 쪼개기

In [4]:
# 1. labeled & unlabeld split
train_labeled , train_unlabeled = train[train['Delay'].notnull()], train[train['Delay'].isnull()]

X_labeled, y_labeled = train_labeled.drop(['ID','Delay'], axis=1), train_labeled[['Delay']]
change_cate2num = {'Not_Delayed':0, "Delayed":1}
y_labeled['Delay'] = y_labeled['Delay'].apply(lambda x : change_cate2num[x])
X_unlabeled = train_unlabeled.drop(['ID','Delay'], axis=1)

print(X_labeled.shape, X_unlabeled.shape)


# 2. train_labeled & val_labeled split
from sklearn.model_selection import train_test_split
X_train_labeled, X_val_labeled, y_train_labeled, y_val_labeled = train_test_split(X_labeled, y_labeled, test_size=0.2, random_state=42)


# 3. X_unlabled 의 크기를 X_train_labeled와 맞춰주기
# X_unlabeled = X_unlabeled.iloc[:len(X_train_labeled),:]
# print(X_unlabeled.shape, X_train_labeled.shape )

(249021, 8) (727546, 8)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_labeled['Delay'] = y_labeled['Delay'].apply(lambda x : change_cate2num[x])


#### 1.3.2. encoder 만들기

In [5]:
y_labeled

Unnamed: 0,Delay
5,0
6,0
8,0
10,1
12,0
...,...
999962,0
999963,1
999969,1
999985,0


In [6]:

# 3. 데이터 정리 & onehotencoding
from sklearn.preprocessing import OneHotEncoder


cate_cols = ['Origin_Airport_ID', 'Destination_Airport_ID', 'Carrier_ID(DOT)', 'Tail_Number','Day','EDT','EAT']

numeric_cols = ['Distance']

## 3.1. VAE 훈련에 쓸 데이터 : X_train_labeled, X_unlabeled
### 3.1.1. 데이터 정리
X_vae_train = pd.concat([X_train_labeled, X_unlabeled])
X_vae_train_cate = X_vae_train[cate_cols]

encoder = OneHotEncoder(handle_unknown='ignore')
encoder.fit(X_vae_train_cate)


### 1.4. 데이터 만들기

In [7]:
import torch
# 필요한 것 : X_train_labeled, y_train_labeled, X_vae_train, cate_cols, numeric_cols
def make_label_batch(batch_num, num_of_batch):
    # 1. 현재 배치 가져오기 - 추후 X_train_labeled를 섞는 과정도 필요
    n = len(X_train_labeled)
    start_loc = n//num_of_batch*(batch_num-1)
    end_loc = n//num_of_batch*batch_num
    
    X_cur_batch = X_train_labeled.iloc[start_loc:end_loc]
    y_cur_batch = y_train_labeled.iloc[start_loc:end_loc]
    
    # 2. category onehot으로 변환하기
    X_sample_category = X_cur_batch[cate_cols]
    X_sample_category = encoder.transform(X_sample_category)
    X_sample_category = X_sample_category.toarray()  # 추가된 코드: X_sample_category를 2차원 배열로 변환 : 희소행렬로 반환되는 onehot encoding 결과를, 일반적인 Numpy 배열로 변환해줌
    
    # 3. numeric은 0~1 사이로 바꿔주기
    X_sample_numeric = X_cur_batch[numeric_cols]
    max_values = X_vae_train[numeric_cols].max()
    X_sample_numeric = np.array(X_sample_numeric /max_values.values) # 0~1사이로 변환
#     print(X_sample_category.shape, X_sample_numeric.shape)
    
    # 4. category & numeric 합치기
    X_sample = np.hstack([X_sample_category,X_sample_numeric])

    # 5. 텐서로 변환하기 : (batch_size, column dim)
    X_sample = torch.tensor(X_sample, dtype=torch.float32)  
    y_sample = torch.tensor(y_cur_batch.values, dtype=torch.float32)
    
    return X_sample, y_sample

In [8]:
# 필요한 것 : X_unlabeled, X_vae_train, cate_cols, numeric_cols
def make_unlabel_batch(batch_num,num_of_batch):
    # 1. 현재 배치 가져오기 - 추후 X_unlabeled 섞는 과정도 필요
    n = len(X_train_labeled) # 임시방편
    start_loc = n//num_of_batch*(batch_num-1)
    end_loc = n//num_of_batch*batch_num
    
    X_cur_batch = X_unlabeled.iloc[start_loc:end_loc]
    
    # 2. category onehot으로 변환하기
    X_sample_category = X_cur_batch[cate_cols]
    X_sample_category = encoder.transform(X_sample_category)
    X_sample_category = X_sample_category.toarray()  # 추가된 코드: X_sample_category를 2차원 배열로 변환 : 희소행렬로 반환되는 onehot encoding 결과를, 일반적인 Numpy 배열로 변환해줌
    
    # 3. numeric은 0~1 사이로 바꿔주기
    X_sample_numeric = X_cur_batch[numeric_cols]
    max_values = X_vae_train[numeric_cols].max()
    X_sample_numeric = np.array(X_sample_numeric /max_values.values) # 0~1사이로 변환
#     print(X_sample_category.shape, X_sample_numeric.shape)
    
    # 4. category & numeric 합치기
    X_sample = np.hstack([X_sample_category,X_sample_numeric])

    # 5. 텐서로 변환하기 : (batch_size, column dim)
    X_sample = torch.tensor(X_sample, dtype=torch.float32)  
    
    return X_sample

In [9]:
# validation X를 tensor로 만들기
def make_validation_tensor():
    X_cur_batch = X_val_labeled.copy()
    
    # 2. category onehot으로 변환하기
    X_sample_category = X_cur_batch[cate_cols]
    X_sample_category = encoder.transform(X_sample_category)
    X_sample_category = X_sample_category.toarray()  # 추가된 코드: X_sample_category를 2차원 배열로 변환 : 희소행렬로 반환되는 onehot encoding 결과를, 일반적인 Numpy 배열로 변환해줌
    
    # 3. numeric은 0~1 사이로 바꿔주기
    X_sample_numeric = X_cur_batch[numeric_cols]
    max_values = X_vae_train[numeric_cols].max()
    X_sample_numeric = np.array(X_sample_numeric /max_values.values) # 0~1사이로 변환
#     print(X_sample_category.shape, X_sample_numeric.shape)
    
    # 4. category & numeric 합치기
    X_sample = np.hstack([X_sample_category,X_sample_numeric])

    # 5. 텐서로 변환하기 : (batch_size, column dim)
    X_sample = torch.tensor(X_sample, dtype=torch.float32)  
    
    return X_sample    

In [10]:
X_val_labeled_tensor = make_validation_tensor()
X_val_labeled_tensor.shape

torch.Size([49805, 7663])

In [11]:
# test는 batch 마다 예측 후 이어서 정답 만들기
def make_test_batch(batch_num,num_of_batch):
    n = len(X_test) # 임시방편
    start_loc = n//num_of_batch*(batch_num-1)
    end_loc = n//num_of_batch*batch_num 
    
    X_cur_batch = X_test.iloc[start_loc:end_loc] if batch_num!= num_of_batch else X_test.iloc[start_loc:]
    
    # 2. category onehot으로 변환하기
    X_sample_category = X_cur_batch[cate_cols]
    X_sample_category = encoder.transform(X_sample_category)
    X_sample_category = X_sample_category.toarray()  # 추가된 코드: X_sample_category를 2차원 배열로 변환 : 희소행렬로 반환되는 onehot encoding 결과를, 일반적인 Numpy 배열로 변환해줌
    
    # 3. numeric은 0~1 사이로 바꿔주기 & Nan 값 0으로 처리
    X_sample_numeric = X_cur_batch[numeric_cols].fillna(0)
    max_values = X_vae_train[numeric_cols].max()
    X_sample_numeric = np.array(X_sample_numeric /max_values.values) # 0~1사이로 변환
#     print(X_sample_category.shape, X_sample_numeric.shape)
    
    # 4. category & numeric 합치기
    X_sample = np.hstack([X_sample_category,X_sample_numeric])

    # 5. 텐서로 변환하기 : (batch_size, column dim)
    X_sample = torch.tensor(X_sample, dtype=torch.float32)  
    
    return X_sample

### 모델생성

In [12]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

# Define the VAE model
class VAE(nn.Module):
    def __init__(self, input_dim, hidden_dim, latent_dim):
        super(VAE, self).__init__()

        # Encoder
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, latent_dim * 2)
        )

        # Decoder
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, input_dim),
            nn.Sigmoid()
        )

    def reparameterize(self, mu, log_var):
        std = torch.exp(0.5 * log_var)
        eps = torch.randn_like(std)
        return mu + eps * std

    def forward(self, x):
        h = self.encoder(x)
        mu, log_var = torch.chunk(h, 2, dim=1)
        z = self.reparameterize(mu, log_var)
        x_hat = self.decoder(z)
        return x_hat, mu, log_var


In [13]:
# 분류기 생성
class Classifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(Classifier, self).__init__()
        self.network = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
#             nn.Softmax(dim=1)
        )

    def forward(self, x):
        return self.network(x)

    
# 분류기 생성
class Classifier_drop(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(Classifier_drop, self).__init__()
        self.network = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
#             nn.Softmax(dim=1)
        )

    def forward(self, x):
        return self.network(x)


In [48]:
# Hyperparameters
input_dim = X_val_labeled_tensor.shape[1]
print('input shape : ', X_val_labeled_tensor.shape[1])
hidden_dim = 64
latent_dim = 3
learning_rate = 0.001

# Initialize the model, optimizer and loss function
device = 'cuda:0'
vae = VAE(input_dim, hidden_dim, latent_dim)
vae.to(device)

reconstruction_loss = nn.BCELoss(reduction='sum')


# classifier 초기화
classifier = Classifier_drop(input_dim = latent_dim, hidden_dim = latent_dim//2 , output_dim= 2)
classifier.to(device)
classification_loss = nn.CrossEntropyLoss()


# optimizer
combined_parameters = list(vae.parameters()) + list(classifier.parameters())
optimizer = optim.Adam(combined_parameters, lr=learning_rate)


# vae.load_state_dict(torch.load('./VAE_new.pth'))
# classifier.load_state_dict(torch.load('./classifier_new.pth'))


input shape :  7663


### 모델훈련
현재 갑자기 training이 안되는 문제 발생

In [49]:
from tqdm import tqdm
from torch.optim.lr_scheduler import StepLR
device = 'cuda:0'
vae.to(device)
classifier.to(device)

scheduler = StepLR(optimizer, step_size=1, gamma=0.95)


for epoch in tqdm(range(30)):
    vae.train()
    classifier.train()
    # epoch 마다 data shuffle
    tmp = pd.concat([X_train_labeled, y_train_labeled], axis=1)
    tmp = tmp.sample(frac=1).reset_index(drop=True)
    X_train_labeled = tmp.drop(columns=['Delay'])
    y_train_labeled = tmp[['Delay']]
    
    # epoch 마다 loss 세기
    loss_dict = {'total_loss':0, 'labeled_loss':0, 'unlabeled_loss':0, 'class_loss':0}
    num_of_batch = 50
    for batch_num in range(1,num_of_batch+1):
        labeled_data, labeled_label = make_label_batch(batch_num, num_of_batch)
        unlabeled_data = make_unlabel_batch(batch_num, num_of_batch)

        optimizer.zero_grad()

        # Labeled data에 대한 VAE 훈련
        reconstructed_labeled_data, mu, log_var = vae(labeled_data.to(device))
        labeled_loss = reconstruction_loss(reconstructed_labeled_data, labeled_data.to(device)) - 0.5 * torch.sum(1 + log_var - mu.pow(2) - log_var.exp())

        # Unlabeled data에 대한 VAE 훈련
        reconstructed_unlabeled_data, mu_unlabeled, log_var_unlabeled = vae(unlabeled_data.to(device))
        unlabeled_loss = reconstruction_loss(reconstructed_unlabeled_data, unlabeled_data.to(device)) - 0.5 * torch.sum(1 + log_var_unlabeled - mu_unlabeled.pow(2) - log_var_unlabeled.exp())

        # Labeled data에 대한 Classifier 훈련
        latent_labeled_data = vae.reparameterize(mu, log_var)
        classifier_output = classifier(latent_labeled_data)
        labeled_label_indices = torch.argmax(labeled_label, dim=1).to(device) # 다중 타겟을 원핫으로 변환
        class_loss = classification_loss(classifier_output, labeled_label_indices)

        # 총 Loss 계산 및 업데이트
        total_loss = labeled_loss + unlabeled_loss + class_loss
        total_loss.backward()
        optimizer.step()
        

        loss_dict['total_loss'] += total_loss.item()
        loss_dict['labeled_loss'] += labeled_loss.item()
        loss_dict['unlabeled_loss'] += unlabeled_loss.item()
        loss_dict['class_loss'] += class_loss.item()

#         if batch_num % 10 ==0:
#             print(batch_num)

    print(f"epoch:{epoch+1}, total:{loss_dict['total_loss']/num_of_batch : .4f}, labeled_loss:{loss_dict['labeled_loss']/num_of_batch : .4f}, unlabeled_loss:{loss_dict['unlabeled_loss']/num_of_batch : .4f}, class_loss: {loss_dict['class_loss']/num_of_batch : .4f}")
    scheduler.step()
    # validation 점수 측정
    validation_score()
    torch.save(vae.state_dict(), f'./VAE_{epoch+1}.pth') # 모델 저장
    torch.save(classifier.state_dict(), f'./classifier_dropout_{epoch+1}.pth') # 모델 저장

  0%|                                                                                           | 0/30 [00:00<?, ?it/s]

epoch:1, total: 29697280.3600, labeled_loss: 14850167.9700, unlabeled_loss: 14847112.2200, class_loss:  0.6498


  3%|██▊                                                                                | 1/30 [01:26<41:59, 86.87s/it]

 Validation Log Loss = 0.6491
epoch:2, total: 3857490.9950, labeled_loss: 1929733.5731, unlabeled_loss: 1927756.7925, class_loss:  0.5984


  7%|█████▌                                                                             | 2/30 [02:53<40:24, 86.60s/it]

 Validation Log Loss = 0.6177
epoch:3, total: 661494.9750, labeled_loss: 331317.5363, unlabeled_loss: 330176.9031, class_loss:  0.5343


 10%|████████▎                                                                          | 3/30 [04:19<38:52, 86.38s/it]

 Validation Log Loss = 0.5734
epoch:4, total: 544019.6625, labeled_loss: 271809.2719, unlabeled_loss: 272209.9394, class_loss:  0.4467


 13%|███████████                                                                        | 4/30 [05:46<37:31, 86.60s/it]

 Validation Log Loss = 0.5229
epoch:5, total: 513525.0094, labeled_loss: 256650.8584, unlabeled_loss: 256873.7872, class_loss:  0.3624


 17%|█████████████▊                                                                     | 5/30 [07:13<36:06, 86.66s/it]

 Validation Log Loss = 0.4951
epoch:6, total: 495899.6356, labeled_loss: 247821.1244, unlabeled_loss: 248078.2078, class_loss:  0.3059


 20%|████████████████▌                                                                  | 6/30 [08:39<34:39, 86.66s/it]

 Validation Log Loss = 0.4976
epoch:7, total: 483214.8887, labeled_loss: 241673.3788, unlabeled_loss: 241541.2366, class_loss:  0.2712


 23%|███████████████████▎                                                               | 7/30 [10:07<33:20, 86.98s/it]

 Validation Log Loss = 0.5092
epoch:8, total: 473571.2894, labeled_loss: 236819.6284, unlabeled_loss: 236751.4219, class_loss:  0.2401


 27%|██████████████████████▏                                                            | 8/30 [11:34<31:54, 87.03s/it]

 Validation Log Loss = 0.5311


 27%|██████████████████████▏                                                            | 8/30 [11:59<32:58, 89.95s/it]


KeyboardInterrupt: 

In [22]:
# torch.save(vae.state_dict(), './VAE2.pth') # 모델 저장
# torch.save(classifier.state_dict(), './classifier_dropout1.pth') # 모델 저장

### classifier 성능 비교 1

In [16]:
# 1. train 이용한 경우 validation
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier


# 데이터 수치화
qual_col = ['Origin_Airport_ID', 'Destination_Airport_ID', 'Carrier_ID(DOT)', 'Tail_Number','Day','EDT','EAT','Distance']
X_train_labeled_le = X_train_labeled.copy()
X_val_labeled_le = X_val_labeled.copy()
test_le = test.copy()
for i in qual_col:
    le = LabelEncoder()
    le=le.fit(train[i]) # 사용할 수 있는 전체 X를 이용해서 LE
    X_train_labeled_le[i]=le.transform(X_train_labeled_le[i])
    X_val_labeled_le[i]=le.transform(X_val_labeled_le[i])
    
    for label in np.unique(test[i]):
        if label not in le.classes_: # train에 없는 label인 경우
            le.classes_ = np.append(le.classes_, label)
    test_le[i]=le.transform(test_le[i])
print('Done.')


# 모델 정의
models = {
    "Extra Trees Classifier": ExtraTreesClassifier(random_state=42),
    "Random Forest Classifier": RandomForestClassifier(random_state=42),
    "Light Gradient Boosting Machine": LGBMClassifier(random_state=42),
    "Decision Tree Classifier": DecisionTreeClassifier(random_state=42),
    "Gradient Boosting Classifier": GradientBoostingClassifier(random_state=42),
    "Ada Boost Classifier": AdaBoostClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(random_state=42),
}

# 각 모델의 성능을 비교
for name, model in models.items():
    model.fit(X_train_labeled_le, y_train_labeled)
    y_pred = model.predict_proba(X_val_labeled_le)
    loss = log_loss(y_val_labeled, y_pred)
    print(f"{name}: Log Loss = {loss:.4f}")

Done.


  model.fit(X_train_labeled_le, y_train_labeled)


Extra Trees Classifier: Log Loss = 0.5567


  model.fit(X_train_labeled_le, y_train_labeled)


Random Forest Classifier: Log Loss = 0.4794


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Light Gradient Boosting Machine: Log Loss = 0.4409
Decision Tree Classifier: Log Loss = 10.3076


  y = column_or_1d(y, warn=True)


Gradient Boosting Classifier: Log Loss = 0.4465


  y = column_or_1d(y, warn=True)


Ada Boost Classifier: Log Loss = 0.6821


  y = column_or_1d(y, warn=True)


Logistic Regression: Log Loss = 0.4759


### classifier성능비교 2 - catboost

In [38]:
# X 값 object 화
X_train_labeled_cate = X_train_labeled.copy()
X_val_labeled_cate = X_val_labeled.copy()

X_train_labeled_cate = X_train_labeled_cate.astype({'EDT':object, 'EAT':object, 'Distance':object, 'Origin_Airport_ID':object, \
                     'Destination_Airport_ID':object, 'Carrier_ID(DOT)':object, 'Day':object})
X_val_labeled_cate = X_val_labeled_cate.astype({'EDT':object, 'EAT':object, 'Distance':object, 'Origin_Airport_ID':object, \
                     'Destination_Airport_ID':object, 'Carrier_ID(DOT)':object, 'Day':object})

print("Object Done.")

Object Done.


In [40]:
# y값 0,1 비율로 가중치 설정
counts = list(y_train_labeled.value_counts())
class_weight = [counts[1]/sum(counts), counts[0]/sum(counts)]
print("weight :", class_weight)

weight : [0.17642157256445265, 0.8235784274355473]


In [41]:
from catboost import CatBoostClassifier, Pool
cat_features = [i for i in range(8)]
model = CatBoostClassifier(random_seed=42, cat_features=cat_features, class_weights=class_weight, verbose=0)
model.fit(X_train_labeled_cate, y_train_labeled)
y_pred = model.predict_proba(X_val_labeled_cate)
loss = log_loss(y_val_labeled, y_pred)
print(f"Catboost : Log Loss = {loss:.4f}")

  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,


Logistic Regression: Log Loss = 0.6224


### semi-supervised 신경망 validation 성능

In [46]:
# 2. semi-supervised 된 신경망 모델의 validation set 예측성능
from sklearn.metrics import log_loss
def validation_score():
    with torch.no_grad():
        vae.eval()
        classifier.eval()

        ## 잠재벡터화
        reconstructed_labeled_data, mu, log_var = vae(X_val_labeled_tensor.to(device))
        X_val_latent_labeled_data = vae.reparameterize(mu, log_var)

        ## 예측
        y_pred = classifier(X_val_latent_labeled_data)
        loss = log_loss(y_val_labeled, torch.softmax(y_pred,dim=1).cpu().detach().numpy()) # softmax로 확률로 바꿔줘야!
        print(f" Validation Log Loss = {loss:.4f}")
#         print(torch.softmax(y_pred,dim=1)[:20,:])
validation_score()

 Validation Log Loss = 0.5325


## submission

### 1. 모델 가져와서, 최선의 가중치 가져오기

In [14]:
# Hyperparameters
input_dim = X_val_labeled_tensor.shape[1]
print('input shape : ', X_val_labeled_tensor.shape[1])
hidden_dim = 64
latent_dim = 3

# Initialize the model, optimizer and loss function
device = 'cuda:0'
vae = VAE(input_dim, hidden_dim, latent_dim)
# classifier 초기화
classifier = Classifier_drop(input_dim = latent_dim, hidden_dim = latent_dim//2 , output_dim= 2)

# 저장된 가중치 가져오기
vae.load_state_dict(torch.load('VAE _8.pth'))
classifier.load_state_dict(torch.load('classifier_dropout_5.pth'))


input shape :  7663


<All keys matched successfully>

### 2. 예측하기
10개로 나눠서 진행

In [20]:
# 데이터 가져오기
X_test = test.drop(columns=['ID'])
test_batch = make_test_batch(1,20)

# 예측하기
device = 'cuda:2'
with torch.no_grad():
    vae.to(device)
    classifier.to(device)
    vae.eval()
    classifier.eval()

    ## 잠재벡터화
    reconstructed_labeled_data, mu, log_var = vae(test_batch.to(device))
    X_test_latent_labeled_data = vae.reparameterize(mu, log_var)

    ## 예측
    y_pred = classifier(X_test_latent_labeled_data)
    y_pred = torch.softmax(y_pred,dim=1).cpu().detach().numpy()

In [21]:
# 데이터 가져오기
for batch_num in range(2, 21):
    test_batch = make_test_batch(batch_num,20)

    # 예측하기
    device = 'cuda:2'
    with torch.no_grad():
        vae.to(device)
        classifier.to(device)
        vae.eval()
        classifier.eval()

        ## 잠재벡터화
        reconstructed_labeled_data, mu, log_var = vae(test_batch.to(device))
        X_test_latent_labeled_data = vae.reparameterize(mu, log_var)

        ## 예측
        y_pred_batch_num = classifier(X_test_latent_labeled_data)
        y_pred_batch_num = torch.softmax(y_pred_batch_num,dim=1).cpu().detach().numpy()
    y_pred = np.vstack([y_pred, y_pred_batch_num])
    print('batch num : ', batch_num)

batch num :  2
batch num :  3
batch num :  4
batch num :  5
batch num :  6
batch num :  7
batch num :  8
batch num :  9
batch num :  10
batch num :  11
batch num :  12
batch num :  13
batch num :  14
batch num :  15
batch num :  16
batch num :  17
batch num :  18
batch num :  19
batch num :  20


In [22]:
y_pred.shape

(1000000, 2)

In [23]:
# 제출본 만들기
sample_submission = pd.read_csv('sample_submission.csv', index_col = 0)
submission = pd.DataFrame(data=y_pred, columns=sample_submission.columns, index=sample_submission.index)

In [24]:
submission.to_csv('./submission/submission_semi-supervise-vae-with1stcod.csv', index=True)