In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
os.chdir('/content/drive/MyDrive/github/Dacon/항공편지연예측AI경진대회(2023.04.03-2023.05.08)')
os.getcwd()

'/content/drive/MyDrive/github/Dacon/항공편지연예측AI경진대회(2023.04.03-2023.05.08)'

## 1. 결측치처리
**해당 노트북**
+ 전처리방법2 + x결측치삭제 + vae 활용 + validation set 확인

In [1]:
import pandas as pd
import numpy as np
import random
import os
import gc

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42) # Seed 고정

### 1.1. 전처리방법2 데이터 가져오기

In [2]:
train = pd.read_parquet('./data/train_preprocess_2.parquet')
# test = pd.read_parquet('./test.parquet')
test = pd.read_parquet('./data/test_preprocess_2.parquet')
sample_submission = pd.read_csv('sample_submission.csv', index_col = 0)

print(train.shape)
print(train.Delay.value_counts())

(1000000, 19)
Not_Delayed    210001
Delayed         45000
Name: Delay, dtype: int64


### 1.2. 남은 결측치 처리 - 삭제

In [3]:
# print(train.isnull().sum())
# print(train.dropna().shape)
# print(train.dropna().isnull().sum())
train = train.dropna(subset=['Estimated_Departure_Time','Estimated_Arrival_Time','Carrier_Code(IATA)','Airline','Carrier_ID(DOT)'])
print(train.isnull().sum())


# 레이블(Delay)을 제외한 결측값이 존재하는 변수들을 unknown으로 대체합니다.
NaN_col = ['Origin_State','Destination_State','Airline','Estimated_Departure_Time', 'Estimated_Arrival_Time','Carrier_Code(IATA)','Carrier_ID(DOT)']

for col in NaN_col:
    # mode = train[col].mode()[0]
    # train[col] = train[col].fillna(mode)
    
    if col in test.columns:
        test[col] = test[col].fillna('Unknown')
print('Done.')

ID                               0
Month                            0
Day_of_Month                     0
Estimated_Departure_Time         0
Estimated_Arrival_Time           0
Cancelled                        0
Diverted                         0
Origin_Airport                   0
Origin_Airport_ID                0
Origin_State                     0
Destination_Airport              0
Destination_Airport_ID           0
Destination_State                0
Distance                         0
Airline                          0
Carrier_Code(IATA)               0
Carrier_ID(DOT)                  0
Tail_Number                      0
Delay                       520399
dtype: int64
Done.


### 1.3. label & unlabel split  / label_train & label_validation split

#### 1배치 데이터 흐름
1. vae에는 X_train_labeled와 X_unlabeled를 각각 onehot으로 만들어서 합쳐서 넣어주기
2. classifier에는 X_train_labeled를 onehot으로 만든 것 넣어주기


#### 필요한 것
1. labeled와 unlabeled 나누기
2. labeled에서 train과 validation 분리하기
3. X_train_labeld & X_unlabeled 를 이용한 onehot encoding
4. 전체 데이터에 onehot 적용하면 데이터 크기 너무 커지므로, 배치로 처리하기

In [10]:
# 1. labeled & unlabeld split
train_labeled , train_unlabeled = train[train['Delay'].notnull()], train[train['Delay'].isnull()]

X_labeled, y_labeled = train_labeled.drop(['ID','Delay'], axis=1), train_labeled[['Delay']]
change_cate2num = {'Not_Delayed':0, "Delayed":1}
y_labeled['Delay'] = y_labeled['Delay'].apply(lambda x : change_cate2num[x])
X_unlabeled = train_unlabeled.drop(['ID','Delay'], axis=1)

print(X_labeled.shape, X_unlabeled.shape)


# 2. train_labeled & val_labeled split
from sklearn.model_selection import train_test_split
X_train_labeled, X_val_labeled, y_train_labeled, y_val_labeled = train_test_split(X_labeled, y_labeled, test_size=0.2, random_state=42)


# 3. 데이터 정리 & onehotencoding
from sklearn.preprocessing import OneHotEncoder
cate_cols = ['Month', 'Day_of_Month', 'Cancelled', 'Diverted', 'Origin_Airport',
       'Origin_Airport_ID', 'Origin_State', 'Destination_Airport',
       'Destination_Airport_ID', 'Destination_State', 'Airline',
       'Carrier_Code(IATA)', 'Carrier_ID(DOT)', 'Tail_Number']


# Airport 2개 삭제함
cate_cols = ['Month', 'Day_of_Month', 'Cancelled', 'Diverted', 
       'Origin_Airport_ID', 'Origin_State', 
       'Destination_Airport_ID', 'Destination_State', 'Airline',
       'Carrier_Code(IATA)', 'Carrier_ID(DOT)', 'Tail_Number']

numeric_cols = ['Estimated_Departure_Time','Estimated_Arrival_Time','Distance']

## 3.1. VAE 훈련에 쓸 데이터 : X_train_labeled, X_unlabeled
### 3.1.1. 데이터 정리
X_vae_train = pd.concat([X_train_labeled, X_unlabeled])
X_vae_train_cate = X_vae_train[cate_cols]

encoder = OneHotEncoder()
encoder.fit(X_vae_train_cate)


# X_vae_train_numeric = X_vae_train[numeric_cols].astype(np.float32)

### 3.1.2. 범주형 변수를 원-핫 인코딩으로 변환


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_labeled['Delay'] = y_labeled['Delay'].apply(lambda x : change_cate2num[x])


(178176, 17) (520399, 17)


In [55]:
import numpy as np
from torch.utils.data.dataset import Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import torch

class Flight_labeled(Dataset): 
    def __init__(self, X_train_labeled, y_train_labeled, encoder):
        # 1. 데이터 받아오기
        self.X_train_labeled = X_train_labeled
        self.y_train_labeled =  y_train_labeled
        
        self.cate_cols = ['Month', 'Day_of_Month', 'Cancelled', 'Diverted', 'Origin_Airport_ID', \
                          'Origin_State', 'Destination_Airport_ID', 'Destination_State', 'Airline',\
                          'Carrier_Code(IATA)', 'Carrier_ID(DOT)', 'Tail_Number']

        self.numeric_cols = ['Estimated_Departure_Time','Estimated_Arrival_Time','Distance']

        
        
    # 사용 가능한 데이터 개수 return
    def __len__(self):
        return len(self.X_train_labeled)
    
    def __getitem__(self, i):
        # onehot encoding 후, tensor로 반환하기
        # 1. category는 onehot으로 변환하고, numeric은 category onehot 뒤에 붙이기
        X_sample_category = self.X_train_labeled[self.cate_cols].iloc[i,:].to_frame().T
#         print('1: ', X_sample_category.dtype, X_sample_category.shape)
        
        X_sample_category = encoder.transform(X_sample_category)
#         print('2: ', X_sample_category.dtype, X_sample_category.shape)
        
        X_sample_category = X_sample_category.toarray()  # 추가된 코드: X_sample_category를 2차원 배열로 변환 : 희소행렬로 반환되는 onehot encoding 결과를, 일반적인 Numpy 배열로 변환해줌
#         print('3: ', X_sample_category.dtype, X_sample_category.shape)
        
        X_sample_numeric = np.array(self.X_train_labeled[self.numeric_cols].iloc[i,:]).reshape(1,-1)
#         print('4: ', X_sample_numeric.dtype, X_sample_numeric.shape)
        
        X_sample = np.hstack([X_sample_category,X_sample_numeric])
#         print('5: ', X_sample.dtype, X_sample.shape)
        
        # 2. 텐서로 변환하기
        X_sample = torch.tensor(X_sample, dtype=torch.float32)
        y_sample = torch.tensor(self.y_train_labeled.iloc[i,:].values, dtype=torch.float32)
        

        return X_sample, y_sample
    

class Flight_unlabeled(Dataset):
    def __init__(self, X_unlabeled, encoder):
        self.X_unlabeled =  X_unlabeled

        
        
    # 사용 가능한 데이터 개수 return
    def __len__(self):
        return len(self.X_unlabeled)
    
    def __getitem__(self, i):
        # onehot encoding 후, tensor로 반환하기
        # 1. category는 onehot으로 변환하고, numeric은 category onehot 뒤에 붙이기
        X_sample_category = self.X_unlabeled[self.cate_cols].iloc[i,:].to_frame().T
#         print('1: ', X_sample_category.dtype, X_sample_category.shape)
        
        X_sample_category = encoder.transform(X_sample_category)
#         print('2: ', X_sample_category.dtype, X_sample_category.shape)
        
        X_sample_category = X_sample_category.toarray()  # 추가된 코드: X_sample_category를 2차원 배열로 변환 : 희소행렬로 반환되는 onehot encoding 결과를, 일반적인 Numpy 배열로 변환해줌
#         print('3: ', X_sample_category.dtype, X_sample_category.shape)
        
        X_sample_numeric = np.array(self.X_train_labeled[self.numeric_cols].iloc[i,:]).reshape(1,-1)
#         print('4: ', X_sample_numeric.dtype, X_sample_numeric.shape)
        
        X_sample = np.hstack([X_sample_category,X_sample_numeric])
#         print('5: ', X_sample.dtype, X_sample.shape)
        
        # 2. 텐서로 변환하기
        X_sample = torch.tensor(X_sample, dtype=torch.float32)
        return X_sample
    


In [57]:
from torch.utils.data.dataloader import DataLoader
import tqdm

label_dataset = Flight_labeled(X_train_labeled,y_train_labeled, encoder)
unlabel_dataset = Flight_unlabeled(X_unlabeled, encoder)
print(len(label_dataset),len(unlabel_dataset))

label_loader = DataLoader(label_dataset, batch_size=128)
unlabel_loader = DataLoader(unlabel_dataset, batch_size=32)


iterator = tqdm.tqdm(label_loader)
for data, label in iterator:
#     print(len(data), label.sum())
    print("Data batch shape:", data.shape)
    print("Label batch shape:", label.shape)

142540 520399


  0%|                                                                                 | 1/1114 [00:01<33:06,  1.78s/it]

Data batch shape: torch.Size([128, 1, 7377])
Label batch shape: torch.Size([128, 1])


  0%|▏                                                                                | 2/1114 [00:03<33:16,  1.80s/it]

Data batch shape: torch.Size([128, 1, 7377])
Label batch shape: torch.Size([128, 1])


  0%|▏                                                                                | 3/1114 [00:05<33:14,  1.80s/it]

Data batch shape: torch.Size([128, 1, 7377])
Label batch shape: torch.Size([128, 1])


  0%|▎                                                                                | 4/1114 [00:07<33:30,  1.81s/it]

Data batch shape: torch.Size([128, 1, 7377])
Label batch shape: torch.Size([128, 1])


  0%|▎                                                                                | 5/1114 [00:09<33:27,  1.81s/it]

Data batch shape: torch.Size([128, 1, 7377])
Label batch shape: torch.Size([128, 1])


  1%|▍                                                                                | 6/1114 [00:10<33:12,  1.80s/it]

Data batch shape: torch.Size([128, 1, 7377])
Label batch shape: torch.Size([128, 1])


  1%|▌                                                                                | 7/1114 [00:12<33:01,  1.79s/it]

Data batch shape: torch.Size([128, 1, 7377])
Label batch shape: torch.Size([128, 1])


  1%|▌                                                                                | 8/1114 [00:14<33:16,  1.81s/it]

Data batch shape: torch.Size([128, 1, 7377])
Label batch shape: torch.Size([128, 1])


  1%|▋                                                                                | 9/1114 [00:16<33:16,  1.81s/it]

Data batch shape: torch.Size([128, 1, 7377])
Label batch shape: torch.Size([128, 1])


  1%|▋                                                                               | 10/1114 [00:18<33:17,  1.81s/it]

Data batch shape: torch.Size([128, 1, 7377])
Label batch shape: torch.Size([128, 1])


  1%|▊                                                                               | 11/1114 [00:19<33:02,  1.80s/it]

Data batch shape: torch.Size([128, 1, 7377])
Label batch shape: torch.Size([128, 1])


  1%|▊                                                                               | 12/1114 [00:21<32:59,  1.80s/it]

Data batch shape: torch.Size([128, 1, 7377])
Label batch shape: torch.Size([128, 1])


  1%|▉                                                                               | 13/1114 [00:23<33:04,  1.80s/it]

Data batch shape: torch.Size([128, 1, 7377])
Label batch shape: torch.Size([128, 1])


  1%|█                                                                               | 14/1114 [00:25<33:08,  1.81s/it]

Data batch shape: torch.Size([128, 1, 7377])
Label batch shape: torch.Size([128, 1])


  1%|█                                                                               | 15/1114 [00:27<33:08,  1.81s/it]

Data batch shape: torch.Size([128, 1, 7377])
Label batch shape: torch.Size([128, 1])


  1%|█                                                                               | 15/1114 [00:28<34:41,  1.89s/it]


KeyboardInterrupt: 

In [45]:
# encoder.categories_

# 

[array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12], dtype=int64),
 array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31],
       dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([10135, 10136, 10140, 10141, 10146, 10154, 10155, 10157, 10158,
        10165, 10170, 10185, 10208, 10245, 10257, 10268, 10275, 10279,
        10299, 10333, 10361, 10372, 10397, 10408, 10409, 10423, 10431,
        10434, 10466, 10469, 10529, 10551, 10558, 10561, 10562, 10577,
        10581, 10599, 10620, 10627, 10631, 10643, 10666, 10676, 10685,
        10693, 10713, 10721, 10728, 10731, 10732, 10739, 10747, 10754,
        10779, 10781, 10785, 10792, 10800, 10821, 10849, 10868, 10874,
        10918, 10926, 10967, 10980, 10990, 10994, 11003, 11013, 11027,
        11042, 11049, 11057, 11066, 11067, 11076, 11092, 11097, 11109,
        11111, 11122, 11140, 11146, 11150, 11193, 11203, 11233, 11252,
     

## 아래는 아직 작성 안한 부분

In [39]:
# import numpy as np
# from torch.utils.data.dataset import Dataset
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import OneHotEncoder

# class Flight(Dataset): # 1. 클래스 선언
#     def __init__(self):
#         # 1. 데이터 읽기 & label, unlabel 나누기
#         self.data = train
        
#         self.train_labeled , self.train_unlabeled = self.train[train['Delay'].notnull()], self.train[train['Delay'].isnull()]
        
#         self.X_labeled, self.y_labeled = self.train_labeled.drop(['ID','Delay'], axis=1), self.train_labeled['Delay']
        
#         ## y값 0,1로 바꾸기
#         change_cate2num = {'Not_Delayed':0, "Delayed":1}
#         y_labeled['Delay'] = y_labeled['Delay'].apply(lambda x : change_cate2num[x])
        
#         self.X_unlabeled = self.train_unlabeled.drop(['ID','Delay'], axis=1)

#         print(self.X_labeled.shape, self.X_unlabeld.shape)


#         # 2. train_labeled & val_labeled split
#         self.X_train_labeled, self.X_val_labeled, self.y_train_labeled, self.y_val_labeled = train_test_split(X_labeled, y_labeled, test_size=0.2, random_state=42)

        
        
#         # 3. 데이터 정리 & onehotencoding
#         cate_cols = ['Month', 'Day_of_Month', 'Cancelled', 'Diverted', 'Origin_Airport',
#                'Origin_Airport_ID', 'Origin_State', 'Destination_Airport',
#                'Destination_Airport_ID', 'Destination_State', 'Airline',
#                'Carrier_Code(IATA)', 'Carrier_ID(DOT)', 'Tail_Number']

#         numeric_cols = ['Estimated_Departure_Time','Estimated_Arrival_Time','Distance']

#         ## 3.1. VAE 훈련에 쓸 데이터 : X_train_labeled, X_unlabeled
#         ### 3.1.1. 데이터 정리
#         self.X_vae_train = pd.concat([self.X_train_labeled, self.X_unlabeled])
#         self.X_vae_train_cate = self.X_vae_train[cate_cols]
#         self.X_vae_train_numeric = self.X_vae_train[numeric_cols].astype(np.float32)

#         ### 3.1.2. 범주형 변수를 원-핫 인코딩으로 변환
#         encoder = OneHotEncoder()
#         self.X_vae_train_onehot_categorical = encoder.fit_transform(self.X_vae_train_cate).toarray()
#         self.X_vae_train_encoded = np.hstack((self.X_vae_train_onehot_categorical, self.X_vae_train_numeric.values))

        
#         ## 3.2. classifier에 쓸 데이터 : X_train_labeled, y_train_labeled
#         self.X_classifier_train_cate = self.X_train_labeled[cate_cols]
#         self.X_classifier_train_numeric = self.X_train_labeled[numeric_cols].astype(np.float32)
#         self.X_classifier_onehot_cate = encoder.fit_transform(self.X_classifier_train_cate).toarray()
#         self.X_classifier_train_encoded = np.hstack((self.X_classifier_onehot_cate, self.X_classifier_train_numeric.values))
        
#         ## 3.3. 성능 평가에 쓸 데이터 : X_val_labeled, y_val_labeled
        
        
#     # 사용 가능한 데이터 개수 return
#     def __len__(self):
#         return
    
#     def __getitem__(self, i):
#         return 

In [34]:
y_labeled

Unnamed: 0,Delay
6,Not_Delayed
8,Not_Delayed
10,Delayed
12,Not_Delayed
13,Not_Delayed
...,...
999950,Not_Delayed
999955,Delayed
999963,Delayed
999985,Not_Delayed


In [28]:
# # 1. labeled & unlabeld split
# train_labeled , train_unlabeled = train[train['Delay'].notnull()], train[train['Delay'].isnull()]

# X_labeled, y_labeled = train_labeled.drop(['ID','Delay'], axis=1), train_labeled['Delay']
# change_cate2num = {'Not_Delayed':0, "Delayed":1}
# y_labeled['Delay'] = y_labeled['Delay'].apply(lambda x : change_cate2num[x])
# X_unlabeled = train_unlabeled.drop(['ID','Delay'], axis=1)

# print(X_labeled.shape, X_unlabeld.shape)


# # 2. train_labeled & val_labeled split
# from sklearn.model_selection import train_test_split
# X_train_labeled, X_val_labeled, y_train_labeled, y_val_labeled = train_test_split(X_labeled, y_labeled, test_size=0.2, random_state=42)


# # 3. 데이터 정리 & onehotencoding
# from sklearn.preprocessing import OneHotEncoder
# cate_cols = ['Month', 'Day_of_Month', 'Cancelled', 'Diverted', 'Origin_Airport',
#        'Origin_Airport_ID', 'Origin_State', 'Destination_Airport',
#        'Destination_Airport_ID', 'Destination_State', 'Airline',
#        'Carrier_Code(IATA)', 'Carrier_ID(DOT)', 'Tail_Number']

# numeric_cols = ['Estimated_Departure_Time','Estimated_Arrival_Time','Distance']

# ## 3.1. VAE 훈련에 쓸 데이터 : X_train_labeled, X_unlabeled
# ### 3.1.1. 데이터 정리
# X_vae_train = pd.concat([X_train_labeled, X_unlabeled])
# X_vae_train_cate = X_vae_train[cate_cols]
# X_vae_train_numeric = X_vae_train[numeric_cols].astype(np.float32)

# ### 3.1.2. 범주형 변수를 원-핫 인코딩으로 변환
# encoder = OneHotEncoder()
# X_vae_train_onehot_categorical = encoder.fit_transform(X_vae_train_cate).toarray()
# X_vae_train_encoded = np.hstack((X_vae_train_onehot_categorical, X_vae_train_numeric.values))

# ## 3.2. classifier에 쓸 데이터 : X_train_labeled, y_train_labeled


# ## 3.3. 성능 평가에 쓸 데이터 : X_val_labeled, y_val_labeled

(178176, 17) (520399, 18)


MemoryError: Unable to allocate 40.1 GiB for an array with shape (662939, 8123) and data type float64

## 2. 준지도학습진행

### 전처리방법7 저장

In [None]:
save_idx = input('몇 번째 전처리 방법인지 정수-정수를 입력하세요 : ')
train_save_name = 'train_preprocess_' + save_idx
test_save_name = 'test_preprocess_' + save_idx
train.to_parquet(f'./data/{train_save_name}.parquet')
test.to_parquet(f'./data/{test_save_name}.parquet')

몇 번째 전처리 방법인지 정수-정수를 입력하세요 : 4
