배치를 직접 나눠서 원핫인코딩 시, 메모리초과 안나도록!

## 1. 결측치처리
**해당 노트북**
+ 전처리방법2 + x결측치삭제 + vae 활용 + validation set 확인

In [1]:
import pandas as pd
import numpy as np
import random
import os
import gc

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42) # Seed 고정

### 1.1. 전처리방법2 데이터 가져오기

In [2]:
train = pd.read_parquet('./data/train_preprocess_2.parquet')
# test = pd.read_parquet('./test.parquet')
test = pd.read_parquet('./data/test_preprocess_2.parquet')
sample_submission = pd.read_csv('sample_submission.csv', index_col = 0)

print(train.shape)
print(train.Delay.value_counts())

(1000000, 19)
Not_Delayed    210001
Delayed         45000
Name: Delay, dtype: int64


### 1.2. 남은 결측치 처리 - 삭제

In [3]:
# print(train.isnull().sum())
# print(train.dropna().shape)
# print(train.dropna().isnull().sum())
train = train.dropna(subset=['Estimated_Departure_Time','Estimated_Arrival_Time','Carrier_Code(IATA)','Airline','Carrier_ID(DOT)'])
print(train.isnull().sum())


# 레이블(Delay)을 제외한 결측값이 존재하는 변수들을 unknown으로 대체합니다.
NaN_col = ['Origin_State','Destination_State','Airline','Estimated_Departure_Time', 'Estimated_Arrival_Time','Carrier_Code(IATA)','Carrier_ID(DOT)']

# for col in NaN_col:
    # mode = train[col].mode()[0]
    # train[col] = train[col].fillna(mode)
    
#     if col in test.columns:
#         test[col] = test[col].fillna('Unknown')
print('Done.')

ID                               0
Month                            0
Day_of_Month                     0
Estimated_Departure_Time         0
Estimated_Arrival_Time           0
Cancelled                        0
Diverted                         0
Origin_Airport                   0
Origin_Airport_ID                0
Origin_State                     0
Destination_Airport              0
Destination_Airport_ID           0
Destination_State                0
Distance                         0
Airline                          0
Carrier_Code(IATA)               0
Carrier_ID(DOT)                  0
Tail_Number                      0
Delay                       520399
dtype: int64
Done.


In [4]:
# test.head()

In [5]:
# 새로운 column 생성
# train['Estimated_Duration'] = train['Estimated_Arrival_Time'] -  train['Estimated_Departure_Time']
# test['Estimated_Duration'] = test['Estimated_Arrival_Time'] -  test['Estimated_Departure_Time']
test['Tail_Number'].nunique()

6493

### 1.3. label & unlabel split  / label_train & label_validation split

#### 1배치 데이터 흐름
1. vae에는 X_train_labeled와 X_unlabeled를 각각 onehot으로 만들어서 합쳐서 넣어주기
2. classifier에는 X_train_labeled를 onehot으로 만든 것 넣어주기


#### 필요한 것
1. labeled와 unlabeled 나누기
2. labeled에서 train과 validation 분리하기
3. X_train_labeld & X_unlabeled 를 이용한 onehot encoding
4. 전체 데이터에 onehot 적용하면 데이터 크기 너무 커지므로, 배치로 처리하기

#### 1.3.1. 데이터 쪼개기

In [6]:
# 1. labeled & unlabeld split
train_labeled , train_unlabeled = train[train['Delay'].notnull()], train[train['Delay'].isnull()]

X_labeled, y_labeled = train_labeled.drop(['ID','Delay'], axis=1), train_labeled[['Delay']]
change_cate2num = {'Not_Delayed':0, "Delayed":1}
y_labeled['Delay'] = y_labeled['Delay'].apply(lambda x : change_cate2num[x])
X_unlabeled = train_unlabeled.drop(['ID','Delay'], axis=1)

print(X_labeled.shape, X_unlabeled.shape)


# 2. train_labeled & val_labeled split
from sklearn.model_selection import train_test_split
X_train_labeled, X_val_labeled, y_train_labeled, y_val_labeled = train_test_split(X_labeled, y_labeled, test_size=0.2, random_state=42)


# 3. X_unlabled 의 크기를 X_train_labeled와 맞춰주기
# X_unlabeled = X_unlabeled.iloc[:len(X_train_labeled),:]
# print(X_unlabeled.shape, X_train_labeled.shape )

(178176, 17) (520399, 17)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_labeled['Delay'] = y_labeled['Delay'].apply(lambda x : change_cate2num[x])


#### 1.3.2. encoder 만들기

In [7]:

# 3. 데이터 정리 & onehotencoding
from sklearn.preprocessing import OneHotEncoder
cate_cols = ['Month', 'Day_of_Month', 'Cancelled', 'Diverted', 'Origin_Airport',
       'Origin_Airport_ID', 'Origin_State', 'Destination_Airport',
       'Destination_Airport_ID', 'Destination_State', 'Airline',
       'Carrier_Code(IATA)', 'Carrier_ID(DOT)', 'Tail_Number']


# Airport 2개 삭제함
cate_cols = ['Month', 'Day_of_Month', 'Cancelled', 'Diverted', 
       'Origin_Airport_ID', 'Origin_State', 
       'Destination_Airport_ID', 'Destination_State', 'Airline',
       'Carrier_Code(IATA)', 'Carrier_ID(DOT)', 'Tail_Number']

# cate_cols = ['Month', 'Day_of_Month',
#        'Origin_Airport_ID',
#        'Destination_Airport_ID', 
#              'Airline']

numeric_cols = ['Estimated_Departure_Time','Estimated_Arrival_Time','Distance']
numeric_cols = ['Distance']

## 3.1. VAE 훈련에 쓸 데이터 : X_train_labeled, X_unlabeled
### 3.1.1. 데이터 정리
X_vae_train = pd.concat([X_train_labeled, X_unlabeled])
X_vae_train_cate = X_vae_train[cate_cols]

encoder = OneHotEncoder(handle_unknown='ignore')
encoder.fit(X_vae_train_cate)


### 1.4. 데이터 만들기

In [8]:
import torch
# 필요한 것 : X_train_labeled, y_train_labeled, X_vae_train, cate_cols, numeric_cols
def make_label_batch(batch_num, num_of_batch):
    # 1. 현재 배치 가져오기 - 추후 X_train_labeled를 섞는 과정도 필요
    n = len(X_train_labeled)
    start_loc = n//num_of_batch*(batch_num-1)
    end_loc = n//num_of_batch*batch_num
    
    X_cur_batch = X_train_labeled.iloc[start_loc:end_loc]
    y_cur_batch = y_train_labeled.iloc[start_loc:end_loc]
    
    # 2. category onehot으로 변환하기
    X_sample_category = X_cur_batch[cate_cols]
    X_sample_category = encoder.transform(X_sample_category)
    X_sample_category = X_sample_category.toarray()  # 추가된 코드: X_sample_category를 2차원 배열로 변환 : 희소행렬로 반환되는 onehot encoding 결과를, 일반적인 Numpy 배열로 변환해줌
    
    # 3. numeric은 0~1 사이로 바꿔주기
    X_sample_numeric = X_cur_batch[numeric_cols]
    max_values = X_vae_train[numeric_cols].max()
    X_sample_numeric = np.array(X_sample_numeric /max_values.values) # 0~1사이로 변환
#     print(X_sample_category.shape, X_sample_numeric.shape)
    
    # 4. category & numeric 합치기
    X_sample = np.hstack([X_sample_category,X_sample_numeric])

    # 5. 텐서로 변환하기 : (batch_size, column dim)
    X_sample = torch.tensor(X_sample, dtype=torch.float32)  
    y_sample = torch.tensor(y_cur_batch.values, dtype=torch.float32)
    
    return X_sample, y_sample

In [9]:
# 필요한 것 : X_unlabeled, X_vae_train, cate_cols, numeric_cols
def make_unlabel_batch(batch_num,num_of_batch):
    # 1. 현재 배치 가져오기 - 추후 X_unlabeled 섞는 과정도 필요
    n = len(X_train_labeled) # 임시방편
    start_loc = n//num_of_batch*(batch_num-1)
    end_loc = n//num_of_batch*batch_num
    
    X_cur_batch = X_unlabeled.iloc[start_loc:end_loc]
    
    # 2. category onehot으로 변환하기
    X_sample_category = X_cur_batch[cate_cols]
    X_sample_category = encoder.transform(X_sample_category)
    X_sample_category = X_sample_category.toarray()  # 추가된 코드: X_sample_category를 2차원 배열로 변환 : 희소행렬로 반환되는 onehot encoding 결과를, 일반적인 Numpy 배열로 변환해줌
    
    # 3. numeric은 0~1 사이로 바꿔주기
    X_sample_numeric = X_cur_batch[numeric_cols]
    max_values = X_vae_train[numeric_cols].max()
    X_sample_numeric = np.array(X_sample_numeric /max_values.values) # 0~1사이로 변환
#     print(X_sample_category.shape, X_sample_numeric.shape)
    
    # 4. category & numeric 합치기
    X_sample = np.hstack([X_sample_category,X_sample_numeric])

    # 5. 텐서로 변환하기 : (batch_size, column dim)
    X_sample = torch.tensor(X_sample, dtype=torch.float32)  
    
    return X_sample

In [10]:
# validation X를 tensor로 만들기
def make_validation_tensor():
    X_cur_batch = X_val_labeled.copy()
    
    # 2. category onehot으로 변환하기
    X_sample_category = X_cur_batch[cate_cols]
    X_sample_category = encoder.transform(X_sample_category)
    X_sample_category = X_sample_category.toarray()  # 추가된 코드: X_sample_category를 2차원 배열로 변환 : 희소행렬로 반환되는 onehot encoding 결과를, 일반적인 Numpy 배열로 변환해줌
    
    # 3. numeric은 0~1 사이로 바꿔주기
    X_sample_numeric = X_cur_batch[numeric_cols]
    max_values = X_vae_train[numeric_cols].max()
    X_sample_numeric = np.array(X_sample_numeric /max_values.values) # 0~1사이로 변환
#     print(X_sample_category.shape, X_sample_numeric.shape)
    
    # 4. category & numeric 합치기
    X_sample = np.hstack([X_sample_category,X_sample_numeric])

    # 5. 텐서로 변환하기 : (batch_size, column dim)
    X_sample = torch.tensor(X_sample, dtype=torch.float32)  
    
    return X_sample    

In [11]:
X_val_labeled_tensor = make_validation_tensor()
X_val_labeled_tensor.shape

torch.Size([35636, 7375])

In [12]:
def make_test_batch(batch_num,num_of_batch):
    n = len(X_test) # 임시방편
    start_loc = n//num_of_batch*(batch_num-1)
    end_loc = n//num_of_batch*batch_num 
    
    X_cur_batch = X_test.iloc[start_loc:end_loc] if batch_num!= num_of_batch else X_test.iloc[start_loc:]
    
    # 2. category onehot으로 변환하기
    X_sample_category = X_cur_batch[cate_cols]
    X_sample_category = encoder.transform(X_sample_category)
    X_sample_category = X_sample_category.toarray()  # 추가된 코드: X_sample_category를 2차원 배열로 변환 : 희소행렬로 반환되는 onehot encoding 결과를, 일반적인 Numpy 배열로 변환해줌
    
    # 3. numeric은 0~1 사이로 바꿔주기 & Nan 값 0으로 처리
    X_sample_numeric = X_cur_batch[numeric_cols].fillna(0)
    max_values = X_vae_train[numeric_cols].max()
    X_sample_numeric = np.array(X_sample_numeric /max_values.values) # 0~1사이로 변환
#     print(X_sample_category.shape, X_sample_numeric.shape)
    
    # 4. category & numeric 합치기
    X_sample = np.hstack([X_sample_category,X_sample_numeric])

    # 5. 텐서로 변환하기 : (batch_size, column dim)
    X_sample = torch.tensor(X_sample, dtype=torch.float32)  
    
    return X_sample

### 모델생성

In [13]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

# Define the VAE model
class VAE(nn.Module):
    def __init__(self, input_dim, hidden_dim, latent_dim):
        super(VAE, self).__init__()

        # Encoder
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, latent_dim * 2)
        )

        # Decoder
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, input_dim),
            nn.Sigmoid()
        )

    def reparameterize(self, mu, log_var):
        std = torch.exp(0.5 * log_var)
        eps = torch.randn_like(std)
        return mu + eps * std

    def forward(self, x):
        h = self.encoder(x)
        mu, log_var = torch.chunk(h, 2, dim=1)
        z = self.reparameterize(mu, log_var)
        x_hat = self.decoder(z)
        return x_hat, mu, log_var


In [14]:
# 분류기 생성
class Classifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(Classifier, self).__init__()
        self.network = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
#             nn.Softmax(dim=1)
        )

    def forward(self, x):
        return self.network(x)

    
    # 분류기 생성
class Classifier_drop(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(Classifier_drop, self).__init__()
        self.network = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
#             nn.Softmax(dim=1)
        )

    def forward(self, x):
        return self.network(x)


In [64]:
# VAE LOSS
# import torch
# import torch.nn.functional as F

# def vae_loss(reconstructed_data, original_data, mu, log_var):
#     # Reconstruction Loss
#     recon_loss = F.binary_cross_entropy(reconstructed_data, original_data, reduction='mean')

#     # KL Divergence Loss
#     kl_divergence = -0.5 * torch.sum(1 + log_var - mu.pow(2) - log_var.exp())

#     # Total Loss
#     total_loss = recon_loss + kl_divergence
#     return total_loss


In [20]:
# Hyperparameters
input_dim = X_val_labeled_tensor.shape[1]
print('input shape : ', X_val_labeled_tensor.shape[1])
hidden_dim = 64
latent_dim = 4
learning_rate = 0.001

# Initialize the model, optimizer and loss function
device = 'cuda:0'
vae = VAE(input_dim, hidden_dim, latent_dim)
vae.to(device)

reconstruction_loss = nn.BCELoss(reduction='sum')


# classifier 초기화
classifier = Classifier_drop(input_dim = latent_dim, hidden_dim = latent_dim//2 , output_dim= 2)
classifier.to(device)
classification_loss = nn.CrossEntropyLoss()


# optimizer
combined_parameters = list(vae.parameters()) + list(classifier.parameters())
optimizer = optim.Adam(combined_parameters, lr=learning_rate)


# vae.load_state_dict(torch.load('./VAE_new.pth'))
# classifier.load_state_dict(torch.load('./classifier_new.pth'))


input shape :  7375


### 모델훈련
현재 갑자기 training이 안되는 문제 발생

In [21]:
from tqdm import tqdm
from torch.optim.lr_scheduler import StepLR
device = 'cuda:0'
vae.to(device)
classifier.to(device)

scheduler = StepLR(optimizer, step_size=1, gamma=0.95)


for epoch in tqdm(range(30)):
    vae.train()
    classifier.train()
    # epoch 마다 data shuffle
    tmp = pd.concat([X_train_labeled, y_train_labeled], axis=1)
    tmp = tmp.sample(frac=1).reset_index(drop=True)
    X_train_labeled = tmp.drop(columns=['Delay'])
    y_train_labeled = tmp[['Delay']]
    
    # epoch 마다 loss 세기
    loss_dict = {'total_loss':0, 'labeled_loss':0, 'unlabeled_loss':0, 'class_loss':0}
    num_of_batch = 50
    for batch_num in range(1,num_of_batch+1):
        labeled_data, labeled_label = make_label_batch(batch_num, num_of_batch)
        unlabeled_data = make_unlabel_batch(batch_num, num_of_batch)

        optimizer.zero_grad()

        # Labeled data에 대한 VAE 훈련
        reconstructed_labeled_data, mu, log_var = vae(labeled_data.to(device))
        labeled_loss = reconstruction_loss(reconstructed_labeled_data, labeled_data.to(device)) - 0.5 * torch.sum(1 + log_var - mu.pow(2) - log_var.exp())

        # Unlabeled data에 대한 VAE 훈련
        reconstructed_unlabeled_data, mu_unlabeled, log_var_unlabeled = vae(unlabeled_data.to(device))
        unlabeled_loss = reconstruction_loss(reconstructed_unlabeled_data, unlabeled_data.to(device)) - 0.5 * torch.sum(1 + log_var_unlabeled - mu_unlabeled.pow(2) - log_var_unlabeled.exp())

        # Labeled data에 대한 Classifier 훈련
        latent_labeled_data = vae.reparameterize(mu, log_var)
        classifier_output = classifier(latent_labeled_data)
        labeled_label_indices = torch.argmax(labeled_label, dim=1).to(device) # 다중 타겟을 원핫으로 변환
        class_loss = classification_loss(classifier_output, labeled_label_indices)

        # 총 Loss 계산 및 업데이트
        total_loss = labeled_loss + unlabeled_loss + class_loss
        total_loss.backward()
        optimizer.step()
        

        loss_dict['total_loss'] += total_loss.item()
        loss_dict['labeled_loss'] += labeled_loss.item()
        loss_dict['unlabeled_loss'] += unlabeled_loss.item()
        loss_dict['class_loss'] += class_loss.item()

#         if batch_num % 10 ==0:
#             print(batch_num)

    print(f"epoch:{epoch+1}, total:{loss_dict['total_loss']/num_of_batch : .4f}, labeled_loss:{loss_dict['labeled_loss']/num_of_batch : .4f}, unlabeled_loss:{loss_dict['unlabeled_loss']/num_of_batch : .4f}, class_loss: {loss_dict['class_loss']/num_of_batch : .4f}")
    scheduler.step()
    # validation 점수 측정
    validation_score()
    torch.save(vae.state_dict(), f'./VAE _{epoch+1}.pth') # 모델 저장
    torch.save(classifier.state_dict(), f'./classifier_dropout_{epoch+1}.pth') # 모델 저장

  0%|                                                                                           | 0/30 [00:00<?, ?it/s]

epoch:1, total: 19720871.8000, labeled_loss: 9861194.1400, unlabeled_loss: 9859677.6300, class_loss:  0.4245


  3%|██▊                                                                                | 1/30 [01:00<29:04, 60.15s/it]

 Validation Log Loss = 0.5265
epoch:2, total: 1942144.8088, labeled_loss: 971284.0006, unlabeled_loss: 970860.3800, class_loss:  0.3739


  7%|█████▌                                                                             | 2/30 [02:00<28:03, 60.11s/it]

 Validation Log Loss = 0.4980
epoch:3, total: 623441.4125, labeled_loss: 311534.8294, unlabeled_loss: 311906.2969, class_loss:  0.2929


 10%|████████▎                                                                          | 3/30 [03:00<27:01, 60.06s/it]

 Validation Log Loss = 0.4727
epoch:4, total: 502822.8200, labeled_loss: 251547.8484, unlabeled_loss: 251274.7656, class_loss:  0.2038


 13%|███████████                                                                        | 4/30 [04:00<26:01, 60.04s/it]

 Validation Log Loss = 0.4807
epoch:5, total: 457620.3956, labeled_loss: 228878.1237, unlabeled_loss: 228742.1344, class_loss:  0.1379


 17%|█████████████▊                                                                     | 5/30 [05:00<25:00, 60.04s/it]

 Validation Log Loss = 0.5273
epoch:6, total: 431453.6650, labeled_loss: 215820.5934, unlabeled_loss: 215632.9728, class_loss:  0.1024


 20%|████████████████▌                                                                  | 6/30 [06:00<24:01, 60.05s/it]

 Validation Log Loss = 0.5833
epoch:7, total: 413369.1544, labeled_loss: 206765.9219, unlabeled_loss: 206603.1434, class_loss:  0.0829


 23%|███████████████████▎                                                               | 7/30 [07:00<23:01, 60.05s/it]

 Validation Log Loss = 0.6329
epoch:8, total: 400480.2319, labeled_loss: 200331.3362, unlabeled_loss: 200148.8334, class_loss:  0.0711


 27%|██████████████████████▏                                                            | 8/30 [08:00<22:00, 60.04s/it]

 Validation Log Loss = 0.6772
epoch:9, total: 390512.3744, labeled_loss: 195336.0103, unlabeled_loss: 195176.3050, class_loss:  0.0636


 30%|████████████████████████▉                                                          | 9/30 [09:00<21:00, 60.03s/it]

 Validation Log Loss = 0.7170
epoch:10, total: 382862.2519, labeled_loss: 191473.2334, unlabeled_loss: 191388.9534, class_loss:  0.0580


 33%|███████████████████████████▎                                                      | 10/30 [10:00<20:00, 60.05s/it]

 Validation Log Loss = 0.7506
epoch:11, total: 376663.0912, labeled_loss: 188387.5572, unlabeled_loss: 188275.4697, class_loss:  0.0532


 37%|██████████████████████████████                                                    | 11/30 [11:00<19:00, 60.04s/it]

 Validation Log Loss = 0.7869
epoch:12, total: 371556.0781, labeled_loss: 185847.2875, unlabeled_loss: 185708.7297, class_loss:  0.0494


 40%|████████████████████████████████▊                                                 | 12/30 [12:00<18:00, 60.04s/it]

 Validation Log Loss = 0.8164
epoch:13, total: 367099.2162, labeled_loss: 183584.1009, unlabeled_loss: 183515.0716, class_loss:  0.0461


 43%|███████████████████████████████████▌                                              | 13/30 [13:00<17:00, 60.05s/it]

 Validation Log Loss = 0.8453
epoch:14, total: 363339.7625, labeled_loss: 181713.4247, unlabeled_loss: 181626.3056, class_loss:  0.0434


 47%|██████████████████████████████████████▎                                           | 14/30 [14:00<16:01, 60.10s/it]

 Validation Log Loss = 0.8677
epoch:15, total: 360069.1644, labeled_loss: 180069.8081, unlabeled_loss: 179999.3266, class_loss:  0.0409


 50%|█████████████████████████████████████████                                         | 15/30 [15:00<15:01, 60.11s/it]

 Validation Log Loss = 0.8930
epoch:16, total: 357096.9556, labeled_loss: 178601.7997, unlabeled_loss: 178495.1244, class_loss:  0.0393


 53%|███████████████████████████████████████████▋                                      | 16/30 [16:01<14:01, 60.11s/it]

 Validation Log Loss = 0.9152
epoch:17, total: 354492.9869, labeled_loss: 177319.5003, unlabeled_loss: 177173.4544, class_loss:  0.0375


 57%|██████████████████████████████████████████████▍                                   | 17/30 [17:01<13:00, 60.07s/it]

 Validation Log Loss = 0.9327
epoch:18, total: 352208.9988, labeled_loss: 176151.2884, unlabeled_loss: 176057.6784, class_loss:  0.0358


 60%|█████████████████████████████████████████████████▏                                | 18/30 [18:01<12:00, 60.06s/it]

 Validation Log Loss = 0.9524
epoch:19, total: 350182.3275, labeled_loss: 175139.1909, unlabeled_loss: 175043.1066, class_loss:  0.0346


 63%|███████████████████████████████████████████████████▉                              | 19/30 [19:01<11:00, 60.07s/it]

 Validation Log Loss = 0.9672
epoch:20, total: 348297.0094, labeled_loss: 174190.1875, unlabeled_loss: 174106.7913, class_loss:  0.0328


 67%|██████████████████████████████████████████████████████▋                           | 20/30 [20:01<10:00, 60.06s/it]

 Validation Log Loss = 0.9849
epoch:21, total: 346505.9213, labeled_loss: 173341.6137, unlabeled_loss: 173164.2747, class_loss:  0.0316


 70%|█████████████████████████████████████████████████████████▍                        | 21/30 [21:01<09:00, 60.06s/it]

 Validation Log Loss = 0.9987
epoch:22, total: 344910.5300, labeled_loss: 172486.0212, unlabeled_loss: 172424.4781, class_loss:  0.0306


 73%|████████████████████████████████████████████████████████████▏                     | 22/30 [22:01<08:00, 60.11s/it]

 Validation Log Loss = 1.0157
epoch:23, total: 343444.7444, labeled_loss: 171759.9494, unlabeled_loss: 171684.7650, class_loss:  0.0296


 77%|██████████████████████████████████████████████████████████████▊                   | 23/30 [23:01<07:00, 60.09s/it]

 Validation Log Loss = 1.0284
epoch:24, total: 342151.0894, labeled_loss: 171131.0681, unlabeled_loss: 171019.9900, class_loss:  0.0288


 80%|█████████████████████████████████████████████████████████████████▌                | 24/30 [24:01<06:00, 60.10s/it]

 Validation Log Loss = 1.0383
epoch:25, total: 340887.2800, labeled_loss: 170492.9534, unlabeled_loss: 170394.2969, class_loss:  0.0279


 83%|████████████████████████████████████████████████████████████████████▎             | 25/30 [25:01<05:00, 60.10s/it]

 Validation Log Loss = 1.0510
epoch:26, total: 339779.3394, labeled_loss: 169924.9069, unlabeled_loss: 169854.4022, class_loss:  0.0271


 87%|███████████████████████████████████████████████████████████████████████           | 26/30 [26:01<04:00, 60.09s/it]

 Validation Log Loss = 1.0624
epoch:27, total: 338769.0375, labeled_loss: 169448.9047, unlabeled_loss: 169320.1031, class_loss:  0.0263


 90%|█████████████████████████████████████████████████████████████████████████▊        | 27/30 [27:02<03:00, 60.11s/it]

 Validation Log Loss = 1.0708
epoch:28, total: 337834.0631, labeled_loss: 168945.2613, unlabeled_loss: 168888.7700, class_loss:  0.0257


 93%|████████████████████████████████████████████████████████████████████████████▌     | 28/30 [28:02<02:00, 60.10s/it]

 Validation Log Loss = 1.0838
epoch:29, total: 336858.1281, labeled_loss: 168475.9609, unlabeled_loss: 168382.1384, class_loss:  0.0252


 97%|███████████████████████████████████████████████████████████████████████████████▎  | 29/30 [29:02<01:00, 60.12s/it]

 Validation Log Loss = 1.0895
epoch:30, total: 336055.3344, labeled_loss: 168095.5022, unlabeled_loss: 167959.8012, class_loss:  0.0244


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [30:02<00:00, 60.08s/it]

 Validation Log Loss = 1.0994





In [22]:
# torch.save(vae.state_dict(), './VAE2.pth') # 모델 저장
# torch.save(classifier.state_dict(), './classifier_dropout1.pth') # 모델 저장

### classifier 성능 비교

In [93]:
# 1. train 이용한 경우 validation
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier


# 데이터 수치화
qual_col = ['Origin_Airport', 'Origin_State', 'Destination_Airport', 'Destination_State', 'Airline', 'Carrier_Code(IATA)', 'Tail_Number']
X_train_labeled_le = X_train_labeled.copy()
X_val_labeled_le = X_val_labeled.copy()
test_le = test.copy()
for i in qual_col:
    le = LabelEncoder()
    le=le.fit(train[i]) # 사용할 수 있는 전체 X를 이용해서 LE
    X_train_labeled_le[i]=le.transform(X_train_labeled_le[i])
    X_val_labeled_le[i]=le.transform(X_val_labeled_le[i])
    
    for label in np.unique(test[i]):
        if label not in le.classes_: # train에 없는 label인 경우
            le.classes_ = np.append(le.classes_, label)
    test_le[i]=le.transform(test_le[i])
print('Done.')


# 모델 정의
models = {
    "Extra Trees Classifier": ExtraTreesClassifier(random_state=42),
    "Random Forest Classifier": RandomForestClassifier(random_state=42),
    "Light Gradient Boosting Machine": LGBMClassifier(random_state=42),
    "Decision Tree Classifier": DecisionTreeClassifier(random_state=42),
    "Gradient Boosting Classifier": GradientBoostingClassifier(random_state=42),
    "Ada Boost Classifier": AdaBoostClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(random_state=42),
}

# 각 모델의 성능을 비교
for name, model in models.items():
    model.fit(X_train_labeled_le, y_train_labeled)
    y_pred = model.predict_proba(X_val_labeled_le)
    loss = log_loss(y_val_labeled, y_pred)
    print(f"{name}: Log Loss = {loss:.4f}")

Done.


  model.fit(X_train_labeled_le, y_train_labeled)


Extra Trees Classifier: Log Loss = 0.5265


  model.fit(X_train_labeled_le, y_train_labeled)


Random Forest Classifier: Log Loss = 0.4804


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Light Gradient Boosting Machine: Log Loss = 0.4399
Decision Tree Classifier: Log Loss = 10.2489


  y = column_or_1d(y, warn=True)


Gradient Boosting Classifier: Log Loss = 0.4437


  y = column_or_1d(y, warn=True)


Ada Boost Classifier: Log Loss = 0.6821


  y = column_or_1d(y, warn=True)


Logistic Regression: Log Loss = 0.4554


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [18]:
# 2. semi-supervised 된 신경망 모델의 validation set 예측성능
from sklearn.metrics import log_loss
def validation_score():
    with torch.no_grad():
        vae.eval()
        classifier.eval()

        ## 잠재벡터화
        reconstructed_labeled_data, mu, log_var = vae(X_val_labeled_tensor.to(device))
        X_val_latent_labeled_data = vae.reparameterize(mu, log_var)

        ## 예측
        y_pred = classifier(X_val_latent_labeled_data)
        loss = log_loss(y_val_labeled, torch.softmax(y_pred,dim=1).cpu().detach().numpy()) # softmax로 확률로 바꿔줘야!
        print(f" Validation Log Loss = {loss:.4f}")
#         print(torch.softmax(y_pred,dim=1)[:20,:])
validation_score()

 Validation Log Loss = 0.6823


### submission

In [67]:
# 데이터 가져오기
X_test = test.drop(columns=['ID'])
test_batch = make_test_batch(1,10)

# 예측하기
device = 'cuda:2'
with torch.no_grad():
    vae.to(device)
    classifier.to(device)
    vae.eval()
    classifier.eval()

    ## 잠재벡터화
    reconstructed_labeled_data, mu, log_var = vae(test_batch.to(device))
    X_test_latent_labeled_data = vae.reparameterize(mu, log_var)

    ## 예측
    y_pred = classifier(X_test_latent_labeled_data)
    y_pred = torch.softmax(y_pred,dim=1).cpu().detach().numpy()

In [68]:
# 데이터 가져오기
# X_test = test.drop(columns=['ID'])
for batch_num in range(2, 11):
    test_batch = make_test_batch(batch_num,10)

    # 예측하기
    device = 'cuda:2'
    with torch.no_grad():
        vae.to(device)
        classifier.to(device)
        vae.eval()
        classifier.eval()

        ## 잠재벡터화
        reconstructed_labeled_data, mu, log_var = vae(test_batch.to(device))
        X_test_latent_labeled_data = vae.reparameterize(mu, log_var)

        ## 예측
        y_pred_batch_num = classifier(X_test_latent_labeled_data)
        y_pred_batch_num = torch.softmax(y_pred_batch_num,dim=1).cpu().detach().numpy()
    y_pred = np.vstack([y_pred, y_pred_batch_num])
    print('batch num : ', batch_num)

batch num :  2
batch num :  3
batch num :  4
batch num :  5
batch num :  6
batch num :  7
batch num :  8
batch num :  9
batch num :  10


In [69]:
y_pred.shape

(1000000, 2)

In [70]:
# 제출본 만들기
sample_submission = pd.read_csv('sample_submission.csv', index_col = 0)
submission = pd.DataFrame(data=y_pred, columns=sample_submission.columns, index=sample_submission.index)

In [71]:
submission.to_csv('./submission/submission_semi-supervise-vae2.csv', index=True)