# Pytorch
- 구글의 tensorflow와 유사한 딥러닝 라이브러리
- 페이스북 인공지능 연구팀에 의해 주로 개발
- torch
    - 텐서 변환및 다양한 수학 함수와 클래스가 포함 되어 있다.
- torch.nn
    - 신경망을 구축하기 위한 레이어(층), 활성화 함수, 손실함수등 에대한 함수와 클래스가 포함 되어 있다.
- torch.utils.data
    - 미니 배치 학습을 위한 데이터셋 구성관련 함수와 클래스가 포함 되어 있다.
- torch.optim
    - optimizer 관련 함수와 클래스가 포함 되어 있다.
- https://pytorch.org/

# 타이타닉 데이터셋을 딥러닝 학습 시켜보기
1. 데이터 전처리
2. 미니배치 단위 학습을 위해 데이터셋 클래스 구현
3. 딥러닝 모델 인공신경망 구현
4. 하이퍼파라미터 정의(손실함수및 옵티마이저 선택등.)
5. 학습 및 테스트 loop 구현

## 데이터 전처리

In [112]:
import torch
import numpy as np
import pandas as pd

- 구글 드라이브 연결

In [113]:
# from google.colab import drive
# drive.mount('/content/drive')
# 코랩일 경우

- 데이터 경로 변수

In [114]:
# DATA_PATH = "/content/drive/MyDrive/data/"
# DATA_PATH
# 코랩일 경우
# train = pd.read_csv(f"{DATA_PATH}titanic_train.csv") # 학습데이터
# test = pd.read_csv(f"{DATA_PATH}titanic_test.csv") # 테스트 데이터
# train.shape , test.shape

In [115]:
# 절대 경로 윈도우
absolute_path = "D:\\NLP_Papers_Review\\data\\titanic_train.csv"

# 절대 경로를 이용해 데이터 읽기
df_absolute = pd.read_csv(absolute_path)
print("Data from absolute path:")
print(df_absolute.head())


Data from absolute path:
   passengerid  survived  pclass                      name  gender   age  \
0          494         0       1   Artagaveytia, Mr. Ramon    male  71.0   
1          462         0       3       Morley, Mr. William    male  34.0   
2         1286         0       3  Kink-Heilmann, Mr. Anton    male  29.0   
3         1130         1       2     Hiltunen, Miss. Marta  female  18.0   
4          461         1       1       Anderson, Mr. Harry    male  48.0   

   sibsp  parch    ticket     fare cabin embarked  
0      0      0  PC 17609  49.5042   NaN        C  
1      0      0    364506   8.0500   NaN        S  
2      3      1    315153  22.0250   NaN        S  
3      1      1    250650  13.0000   NaN        S  
4      0      0     19952  26.5500   E12        S  


In [116]:
import os
os.getcwd()
# 현재 디렉토리 출력

'd:\\NLP_Papers_Review'

In [117]:
# 상대 경로 (현재 작업 디렉토리가 D:\NLP_Papers_Review 라고 가정)
relative_path = "..\\data\\titanic_train.csv" # 폴더위치를 한단계 올림

#os.chdir("d:\\NLP_Papers_Review") 위치를 번경
df_relative = pd.read_csv(relative_path)
print("Data from relative path:")
print(df_relative.head())
# 현재 작업 위치가 

FileNotFoundError: [Errno 2] No such file or directory: '..\\data\\titanic_train.csv'

In [118]:
os.chdir("d:\\NLP_Papers_Review")
from glob import glob
csv_files = glob("data/titanic_*.csv")

(test,train)=csv_files
train=pd.read_csv(train)
test=pd.read_csv(test)
train

Unnamed: 0,passengerid,survived,pclass,name,gender,age,sibsp,parch,ticket,fare,cabin,embarked
0,494,0,1,"Artagaveytia, Mr. Ramon",male,71.0,0,0,PC 17609,49.5042,,C
1,462,0,3,"Morley, Mr. William",male,34.0,0,0,364506,8.0500,,S
2,1286,0,3,"Kink-Heilmann, Mr. Anton",male,29.0,3,1,315153,22.0250,,S
3,1130,1,2,"Hiltunen, Miss. Marta",female,18.0,1,1,250650,13.0000,,S
4,461,1,1,"Anderson, Mr. Harry",male,48.0,0,0,19952,26.5500,E12,S
...,...,...,...,...,...,...,...,...,...,...,...,...
911,738,1,1,"Lesurer, Mr. Gustave J",male,35.0,0,0,PC 17755,512.3292,B101,C
912,518,0,3,"Ryan, Mr. Patrick",male,,0,0,371110,24.1500,,Q
913,664,0,3,"Coleff, Mr. Peju",male,36.0,0,0,349210,7.4958,,S
914,109,0,3,"Rekic, Mr. Tido",male,38.0,0,0,349249,7.8958,,S


In [119]:
csv_files

['data\\titanic_test.csv', 'data\\titanic_train.csv']

- 결측치 확인하기

In [120]:
train.isnull().sum()

passengerid      0
survived         0
pclass           0
name             0
gender           0
age            180
sibsp            0
parch            0
ticket           0
fare             0
cabin          706
embarked         0
dtype: int64

In [121]:
test.isnull().sum()

passengerid      0
pclass           0
name             0
gender           0
age             83
sibsp            0
parch            0
ticket           0
fare             1
cabin          308
embarked         2
dtype: int64

In [122]:
null_col=train.isnull().columns[train.isnull().sum()>0]

for col in null_col:
    if pd.api.types.is_numeric_dtype(train[col]):# 숫자형이면 평균으로 채우기
        train[col].fillna(train[col].mean(), inplace=True)
    elif pd.api.types.is_string_dtype(train[col]):# 문자열이면 최빈값으로 채우기
        train[col].fillna(train[col].mode()[0], inplace=True)


null_col=test.isnull().columns[test.isnull().sum()>0]

for col in null_col:
    if pd.api.types.is_numeric_dtype(test[col]):# 숫자형이면 평균으로 채우기
        test[col].fillna(test[col].mean(), inplace=True)
    elif pd.api.types.is_string_dtype(train[col]):# 문자열이면 최빈값으로 채우기
        test[col].fillna(test[col].mode()[0], inplace=True)
        
train.isnull().sum().sum() , test.isnull().sum().sum()

(0, 0)

- 학습 데이터에서 얻은 통계량을 이용하여 결측치를 채워야한다.

In [80]:
age_mean = train["age"].mean()
fare_median = train["fare"].median()
cabin_unk = "UNK"
embarked_mode = train["embarked"].mode()[0]
age_mean , fare_median ,cabin_unk , embarked_mode

(29.824741935483875, 13.8583, 'UNK', 'S')

- 학습데이터 결측치 처리

In [29]:
train["age"] = train["age"].fillna(age_mean)
train["cabin"] = train["cabin"].fillna(cabin_unk)

- 테스트데이터 결측치 처리

In [30]:
test["age"] = test["age"].fillna(age_mean)
test["fare"] = test["fare"].fillna(fare_median)
test["cabin"] = test["cabin"].fillna(cabin_unk)
test["embarked"] = test["embarked"].fillna(embarked_mode)

In [31]:
train.isnull().sum().sum() , test.isnull().sum().sum()

(3, 0)

In [123]:
train.isnull().sum()

passengerid    0
survived       0
pclass         0
name           0
gender         0
age            0
sibsp          0
parch          0
ticket         0
fare           0
cabin          0
embarked       0
dtype: int64

- 특성으로 사용할 변수 추가하기

In [124]:
cols = ["age","sibsp","parch","fare","pclass","gender","embarked"]
train_ft = train[cols].copy()
test_ft = test[cols].copy()
train_ft.shape, test_ft.shape

((916, 7), (393, 7))

- 범주형 변수 원핫인코딩하여 특성으로 추가하기

In [125]:
from sklearn.preprocessing import OneHotEncoder
cols = ['gender','embarked']
enc = OneHotEncoder(handle_unknown = 'ignore')
enc.fit(train_ft[cols])

In [86]:
# none_numeric_col=[col for col in train.columns if train[col].dtype not in ['float64', 'int64']]

#train.drop(columns=["name"],inplace=True)

In [83]:
# [print(col) for col in test.columns if test[col].dtype not in ['float64', 'int64']]

# test.drop(columns=["name"],inplace=True)

name
gender
ticket
cabin
embarked


In [87]:
# none_numeric_col

['gender', 'ticket', 'cabin', 'embarked']

In [126]:
# 학습 데이터
tmp = pd.DataFrame(
    enc.transform(train_ft[cols]).toarray(),
    columns = enc.get_feature_names_out()
)
train_ft = pd.concat([train_ft,tmp],axis=1)
train_ft.head(),train_ft.shape

(    age  sibsp  parch     fare  pclass  gender embarked  gender_female  \
 0  71.0      0      0  49.5042       1    male        C            0.0   
 1  34.0      0      0   8.0500       3    male        S            0.0   
 2  29.0      3      1  22.0250       3    male        S            0.0   
 3  18.0      1      1  13.0000       2  female        S            1.0   
 4  48.0      0      0  26.5500       1    male        S            0.0   
 
    gender_male  embarked_C  embarked_Q  embarked_S  
 0          1.0         1.0         0.0         0.0  
 1          1.0         0.0         0.0         1.0  
 2          1.0         0.0         0.0         1.0  
 3          0.0         0.0         0.0         1.0  
 4          1.0         0.0         0.0         1.0  ,
 (916, 12))

In [127]:
# 테스트 데이터
tmp = pd.DataFrame(
    enc.transform(test_ft[cols]).toarray(),
    columns = enc.get_feature_names_out()
)
test_ft = pd.concat([test_ft,tmp],axis=1)
test_ft.head()

Unnamed: 0,age,sibsp,parch,fare,pclass,gender,embarked,gender_female,gender_male,embarked_C,embarked_Q,embarked_S
0,62.0,0,0,26.55,1,male,S,0.0,1.0,0.0,0.0,1.0
1,28.0,0,0,47.1,1,male,S,0.0,1.0,0.0,0.0,1.0
2,24.0,0,0,9.5,3,male,S,0.0,1.0,0.0,0.0,1.0
3,29.824742,0,0,7.7333,3,female,Q,1.0,0.0,0.0,1.0,0.0
4,18.5,0,0,7.2833,3,female,Q,1.0,0.0,0.0,1.0,0.0


- 파생변수 생성과정에서 생긴 결측치 확인 해보기

In [128]:
train_ft.isnull().sum().sum(), test_ft.isnull().sum().sum()

(0, 0)

In [129]:
cols = ["gender","embarked"]
train_ft = train_ft.drop(columns=cols)
test_ft = test_ft.drop(columns=cols)

- Min-Max Scaling

In [130]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(train_ft)

In [131]:
train_ft = scaler.transform(train_ft) # 학습 데이터
train_ft

array([[0.88726043, 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.42377552, 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ],
       [0.36114243, 0.375     , 0.11111111, ..., 0.        , 0.        ,
        1.        ],
       ...,
       [0.44882876, 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ],
       [0.473882  , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ],
       [0.23587624, 0.125     , 0.11111111, ..., 0.        , 0.        ,
        1.        ]])

In [132]:
test_ft = scaler.transform(test_ft) # 테스트 데이터
test_ft

array([[0.77452086, 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ],
       [0.34861581, 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ],
       [0.29850933, 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ],
       ...,
       [0.26092948, 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ],
       [0.37147366, 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ],
       [0.33608919, 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [133]:
target = train["survived"].to_numpy()
target

array([0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,

In [134]:
train_ft.shape, test_ft.shape, target.shape

((916, 10), (393, 10), (916,))

## 데이터셋 클래스 구현
- DATASET과 DATALOADER
    - Pytorch에서는 미니배치 단위 학습을 위해 데이터셋을 좀 더 쉽게 다룰 수 있도록 유용한 도구로서 torch.utils.data.Dataset과 torch.utils.data.DataLoader를 제공
    - Dataset은 학습데이터와 정답을 저장해서 인덱싱을 통해 반환할수 있는 클래스
    - DataLoader 는 Dataset의 데이터를 쉽게 접근할 수 있도록 iterable 객체로 만들어 준다.
    - DataLoader 을 사용하면 미니 배치, 셔플(shuffle) 등 간단히 수행

In [135]:
class TitanicDataset(torch.utils.data.Dataset):
    def __init__(self,x, y=None): # 데이터를 받아 인스턴스 변수안에 저장
        self.x = x
        self.y = y
        if self.y is not None:
            self.y = self.y.reshape(-1,1)

    def __len__(self): # 총 샘플 수를 반환해주기 위해
        return len(self.x)

    def __getitem__(self,idx): # 인덱싱을 통해 데이터를 반환해주기 위해!
        item = {}
        item["x"] = torch.Tensor(self.x[idx]) # float32 텐서로 변환
        if self.y is not None:
            item["y"] = torch.Tensor(self.y[idx])
        return item

In [136]:
dt = TitanicDataset(train_ft, target)
dt[0]

{'x': tensor([0.8873, 0.0000, 0.0000, 0.0966, 0.0000, 0.0000, 1.0000, 1.0000, 0.0000,
         0.0000]),
 'y': tensor([0.])}

In [137]:
dt[0:5]

{'x': tensor([[0.8873, 0.0000, 0.0000, 0.0966, 0.0000, 0.0000, 1.0000, 1.0000, 0.0000,
          0.0000],
         [0.4238, 0.0000, 0.0000, 0.0157, 1.0000, 0.0000, 1.0000, 0.0000, 0.0000,
          1.0000],
         [0.3611, 0.3750, 0.1111, 0.0430, 1.0000, 0.0000, 1.0000, 0.0000, 0.0000,
          1.0000],
         [0.2233, 0.1250, 0.1111, 0.0254, 0.5000, 1.0000, 0.0000, 0.0000, 0.0000,
          1.0000],
         [0.5991, 0.0000, 0.0000, 0.0518, 0.0000, 0.0000, 1.0000, 0.0000, 0.0000,
          1.0000]]),
 'y': tensor([[0.],
         [0.],
         [0.],
         [1.],
         [1.]])}

In [142]:
dl = torch.utils.data.DataLoader(dt,batch_size=2, shuffle=False)
dl # 슬라이싱과 인덱싱이 지원이 안될 뿐 반복가능한 객체이다.
next(iter(dl))

{'x': tensor([[0.8873, 0.0000, 0.0000, 0.0966, 0.0000, 0.0000, 1.0000, 1.0000, 0.0000,
          0.0000],
         [0.4238, 0.0000, 0.0000, 0.0157, 1.0000, 0.0000, 1.0000, 0.0000, 0.0000,
          1.0000]]),
 'y': tensor([[0.],
         [0.]])}

## 딥러닝 모델 인공신경망 구현
- PyTorch에서 신경망 모델은 torch.nn.Module 을 상속받는 클래스(class)를 생성하여 정의
- `__init__` 메소드 에서 신경망의 계층(layer)들을 정의
- `forward` 메소드 에서 신경망에 텐서를 어떻게 전달할지 지정


In [None]:
train_ft.shape[1]

10

In [143]:
class Net(torch.nn.Module):
    def __init__(self, n_features):
        super(Net, self).__init__()
        self.fc_layer1 = torch.nn.Linear(n_features, 8)
        self.batch_norm1 = torch.nn.BatchNorm1d(8)
        self.relu1 = torch.nn.ReLU()
        self.dropout1 = torch.nn.Dropout(0.5)  
        self.fc_layer2 = torch.nn.Linear(8, 4)
        self.batch_norm2 = torch.nn.BatchNorm1d(4)
        self.relu2 = torch.nn.ReLU()
        self.dropout2 = torch.nn.Dropout(0.5)  

        self.out_layer = torch.nn.Linear(4, 1)

    def forward(self, x):
        x = self.fc_layer1(x)
        x = self.batch_norm1(x)
        x = self.relu1(x)
        x = self.dropout1(x)  

        x = self.fc_layer2(x)
        x = self.batch_norm2(x)
        x = self.relu2(x)
        x = self.dropout2(x)  

        return self.out_layer(x)

In [144]:
class Net(torch.nn.Module):
    def __init__(self, n_features):
        super(Net, self).__init__()
        self.seq = torch.nn.Sequential(
            torch.nn.Linear(n_features, 8),
            torch.nn.BatchNorm1d(8),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.5),
            torch.nn.Linear(8, 4),
            torch.nn.BatchNorm1d(4),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.5),
            torch.nn.Linear(4, 1)
        )

    def forward(self, x):
        return self.seq(x)

In [145]:
batch = next(iter(dl))
batch

{'x': tensor([[0.8873, 0.0000, 0.0000, 0.0966, 0.0000, 0.0000, 1.0000, 1.0000, 0.0000,
          0.0000],
         [0.4238, 0.0000, 0.0000, 0.0157, 1.0000, 0.0000, 1.0000, 0.0000, 0.0000,
          1.0000]]),
 'y': tensor([[0.],
         [0.]])}

In [149]:
model=Net(train_ft.shape[1])
model(batch['x'])

tensor([[ 0.5440],
        [-0.4110]], grad_fn=<AddmmBackward0>)

In [None]:
model = Net(train_ft.shape[1]) #인풋 초기화
model(batch["x"]) # 학습

tensor([[ 0.0403],
        [-0.0021]], grad_fn=<AddmmBackward0>)

## 하이퍼파라미터 정의(손실함수및 옵티마이저 선택등.)


In [150]:
batch_size = 5 # 배치 크기
loss_fn = torch.nn.BCEWithLogitsLoss() # 손실 함수

In [152]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device # 장치 문자열

'cpu'

In [153]:
model = Net(train_ft.shape[1]).to(device) # 모델 객체 생성후 gpu 로 장치 이동!!

In [154]:
optimizer = torch.optim.Adam(model.parameters() ,lr=0.001) # 옵티마이저 객체 생성

  _torch_pytree._register_pytree_node(


## 학습 및 테스트 loop 구현

In [155]:
train_dt = TitanicDataset(train_ft, target)
train_dl = torch.utils.data.DataLoader(train_dt, batch_size=batch_size, shuffle=True)
train_dl

<torch.utils.data.dataloader.DataLoader at 0x20d31905490>

- 예측

In [None]:
test_dt = TitanicDataset(test_ft)
test_dl = torch.utils.data.DataLoader(test_dt,batch_size=batch_size, shuffle=False)

In [161]:
from tqdm.auto import tqdm

def train_loop(dataloader,model,loss_fn,optimizer,device):
    epoch_loss=0
    model.train()
    
    for batch in tqdm(dataloader):
        pred=model(batch['x'].to(device))
        loss=loss_fn(pred,batch['y'].to(device))
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        epoch_loss+=loss.item()
    epoch_loss/=len(dataloader)
    
    return epoch_loss

In [162]:
@torch.no_grad()
def test_loop(dataloader,model,loss_fn,device):
    model.eval()
    epoch_loss=0
    act_func=torch.nn.Sigmoid()
    pred_list=[]
    
    for batch in dataloader:
        pred=model(batch['x'].to(device))
        if batch.get('y') is True:
            loss=loss_fn(pred,batch['y'].to(device))
            epoch_loss+=loss.item()
            
        pred=act_func(pred).to("cpu").numpy() 
        pred_list.append(pred)
        
    epoch_loss /= len(dataloader)
    pred = np.concatenate(pred_list)
    return epoch_loss, pred

In [159]:
batch_size = 32 # 배치 사이즈
loss_fn = torch.nn.BCEWithLogitsLoss() # 손실 객체
device = "cuda" if torch.cuda.is_available() else "cpu" # 장치 문자열
epochs = 100 # 최대 가능한 에폭수
n_splits = 5 # cv 에서 K 개수
n_features = train_ft.shape[1] # 피처 개수

from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)

In [166]:
is_holdout = False
save_dir = "D:\\NLP_Papers_Review\\weight\\titanic"
#save_dir = "./weight/titanic" 상대 경로 현재 최상위 디렉토리 인지 확인이 필요함.
#os.chdir("d:\\NLP_Papers_Review")

best_score_list = []
for i, (tri, vai) in enumerate( cv.split(train_ft) ):
    # 학습용 데이터
    x_train = train_ft[tri]
    y_train = target[tri]

    # 검증용 데이터
    x_valid = train_ft[vai]
    y_valid = target[vai]

    # 학습용 데이터로더 객체
    train_dt = TitanicDataset(x_train, y_train)
    train_dl = torch.utils.data.DataLoader(train_dt, batch_size=batch_size, shuffle=True)

    # 검증용 데이터로더 객체
    valid_dt = TitanicDataset(x_valid, y_valid)
    valid_dl = torch.utils.data.DataLoader(valid_dt, batch_size=batch_size, shuffle=False)

    # 모델 객체와 옵티마이저 객체 생성
    model = Net(n_features).to(device)
    optimizer = torch.optim.Adam( model.parameters() )

    best_score = 0 # 현재 최고 점수
    patience = 0 # 조기 종료 조건을 주기 위한 변수
    for epoch in range(epochs):
        train_loss = train_loop(dataloader=train_dl, model=model, loss_fn=loss_fn, optimizer=optimizer, device=device)
        valid_loss, pred= test_loop(dataloader=valid_dl, model=model, loss_fn=loss_fn, device=device)

        score = roc_auc_score(y_valid, pred)

        print(train_loss, valid_loss, score)
        if score > best_score:
            best_score = score # 최고 점수 업데이트
            patience = 0
            torch.save(model.state_dict(), os.path.join(save_dir, f"model_titanic_{i}.pth")) # 최고 점수 모델 가중치 저장

        patience += 1
        if patience == 5:
            break

    print(f"{i}번째 폴드 최고 AUC: {best_score}")
    best_score_list.append(best_score)
    if is_holdout:
        break

  0%|          | 0/23 [00:00<?, ?it/s]

0.7520837135936903 0.0 0.8288847117794487


  0%|          | 0/23 [00:00<?, ?it/s]

0.7339629090350607 0.0 0.8381578947368422


  0%|          | 0/23 [00:00<?, ?it/s]

0.710543718027032 0.0 0.8550751879699248


  0%|          | 0/23 [00:00<?, ?it/s]

0.6804018227950387 0.0 0.8515664160401002


  0%|          | 0/23 [00:00<?, ?it/s]

0.6766118536824766 0.0 0.8525689223057645


  0%|          | 0/23 [00:00<?, ?it/s]

0.6597550951916239 0.0 0.8612155388471178


  0%|          | 0/23 [00:00<?, ?it/s]

0.651971301306849 0.0 0.8610902255639097


  0%|          | 0/23 [00:00<?, ?it/s]

0.6387711659721707 0.0 0.8598370927318295


  0%|          | 0/23 [00:00<?, ?it/s]

0.6125424776388251 0.0 0.8590852130325815


  0%|          | 0/23 [00:00<?, ?it/s]

0.5888865901076276 0.0 0.8584586466165414
0번째 폴드 최고 AUC: 0.8612155388471178


  0%|          | 0/23 [00:00<?, ?it/s]

0.6866033491880997 0.0 0.5510786015373171


  0%|          | 0/23 [00:00<?, ?it/s]

0.6648113857144895 0.0 0.7181998512273742


  0%|          | 0/23 [00:00<?, ?it/s]

0.6517256342846415 0.0 0.8568063476320358


  0%|          | 0/23 [00:00<?, ?it/s]

0.643267413844233 0.0 0.8776345152491941


  0%|          | 0/23 [00:00<?, ?it/s]

0.6363360130268595 0.0 0.8742871311678652


  0%|          | 0/23 [00:00<?, ?it/s]

0.6277154580406521 0.0 0.8766426977436151


  0%|          | 0/23 [00:00<?, ?it/s]

0.6133688221807065 0.0 0.8823456484006942


  0%|          | 0/23 [00:00<?, ?it/s]

0.6046866515408391 0.0 0.9009422266303001


  0%|          | 0/23 [00:00<?, ?it/s]

0.6048671097859092 0.0 0.9109843788742872


  0%|          | 0/23 [00:00<?, ?it/s]

0.5901409854059634 0.0 0.9117282420034714


  0%|          | 0/23 [00:00<?, ?it/s]

0.5836742831313092 0.0 0.9197867592363005


  0%|          | 0/23 [00:00<?, ?it/s]

0.5926261259161908 0.0 0.9283411852219192


  0%|          | 0/23 [00:00<?, ?it/s]

0.5836420992146367 0.0 0.9314406149268534


  0%|          | 0/23 [00:00<?, ?it/s]

0.571166121441385 0.0 0.9311926605504588


  0%|          | 0/23 [00:00<?, ?it/s]

0.5718952378501063 0.0 0.9321844780560377


  0%|          | 0/23 [00:00<?, ?it/s]

0.5448518615701924 0.0 0.9324324324324325


  0%|          | 0/23 [00:00<?, ?it/s]

0.5533811100151228 0.0 0.933300272749814


  0%|          | 0/23 [00:00<?, ?it/s]

0.5482854402583578 0.0 0.9363997024547484


  0%|          | 0/23 [00:00<?, ?it/s]

0.5615877768267756 0.0 0.9356558393255641


  0%|          | 0/23 [00:00<?, ?it/s]

0.5518767626389213 0.0 0.9354078849491694


  0%|          | 0/23 [00:00<?, ?it/s]

0.5370168349017268 0.0 0.9377634515249194


  0%|          | 0/23 [00:00<?, ?it/s]

0.5496993699799413 0.0 0.9381353830895116


  0%|          | 0/23 [00:00<?, ?it/s]

0.5512232495390851 0.0 0.9383833374659063


  0%|          | 0/23 [00:00<?, ?it/s]

0.533228144697521 0.0 0.9404909496652616


  0%|          | 0/23 [00:00<?, ?it/s]

0.5409322082996368 0.0 0.93962310934788


  0%|          | 0/23 [00:00<?, ?it/s]

0.5364713280097299 0.0 0.9386312918423011


  0%|          | 0/23 [00:00<?, ?it/s]

0.5316134121107019 0.0 0.9397470865360773


  0%|          | 0/23 [00:00<?, ?it/s]

0.533934077490931 0.0 0.9416067443590378


  0%|          | 0/23 [00:00<?, ?it/s]

0.5272648839846902 0.0 0.9413587899826431


  0%|          | 0/23 [00:00<?, ?it/s]

0.5253679039685623 0.0 0.941606744359038


  0%|          | 0/23 [00:00<?, ?it/s]

0.5348635976729186 0.0 0.9418546987354327


  0%|          | 0/23 [00:00<?, ?it/s]

0.5339441364226134 0.0 0.9430944706174064


  0%|          | 0/23 [00:00<?, ?it/s]

0.5406829598157302 0.0 0.9419786759236302


  0%|          | 0/23 [00:00<?, ?it/s]

0.5454908233621846 0.0 0.9433424249938013


  0%|          | 0/23 [00:00<?, ?it/s]

0.5252316568208777 0.0 0.9461939003223407


  0%|          | 0/23 [00:00<?, ?it/s]

0.527501282484635 0.0 0.9496652615918671


  0%|          | 0/23 [00:00<?, ?it/s]

0.5110470419344695 0.0 0.9502851475328541


  0%|          | 0/23 [00:00<?, ?it/s]

0.5295330558134161 0.0 0.9494173072154725


  0%|          | 0/23 [00:00<?, ?it/s]

0.5254271626472473 0.0 0.9497892387800645


  0%|          | 0/23 [00:00<?, ?it/s]

0.5196569341680278 0.0 0.9496652615918669


  0%|          | 0/23 [00:00<?, ?it/s]

0.5169452415860217 0.0 0.9487974212744855
1번째 폴드 최고 AUC: 0.9502851475328541


  0%|          | 0/23 [00:00<?, ?it/s]

0.6946948611217997 0.0 0.7920824579831933


  0%|          | 0/23 [00:00<?, ?it/s]

0.6718392812687418 0.0 0.8285845588235294


  0%|          | 0/23 [00:00<?, ?it/s]

0.6576686682908431 0.0 0.861016281512605


  0%|          | 0/23 [00:00<?, ?it/s]

0.6409643422002378 0.0 0.86390493697479


  0%|          | 0/23 [00:00<?, ?it/s]

0.6228334204010342 0.0 0.8607536764705882


  0%|          | 0/23 [00:00<?, ?it/s]

0.6184183151825614 0.0 0.858390231092437


  0%|          | 0/23 [00:00<?, ?it/s]

0.6168888444485872 0.0 0.861672794117647


  0%|          | 0/23 [00:00<?, ?it/s]

0.6019000514693882 0.0 0.8631171218487395
2번째 폴드 최고 AUC: 0.86390493697479


  0%|          | 0/23 [00:00<?, ?it/s]

0.6659123068270476 0.0 0.7013649778652238


  0%|          | 0/23 [00:00<?, ?it/s]

0.6467910357143568 0.0 0.7616207575012297


  0%|          | 0/23 [00:00<?, ?it/s]

0.6351877917414126 0.0 0.8239670437776685


  0%|          | 0/23 [00:00<?, ?it/s]

0.6018616971762284 0.0 0.844872110181997


  0%|          | 0/23 [00:00<?, ?it/s]

0.6111931852672411 0.0 0.859382685686178


  0%|          | 0/23 [00:00<?, ?it/s]

0.5975091120471125 0.0 0.8700811608460404


  0%|          | 0/23 [00:00<?, ?it/s]

0.5928404668103093 0.0 0.8726635514018692


  0%|          | 0/23 [00:00<?, ?it/s]

0.5802696036255878 0.0 0.8834849975405804


  0%|          | 0/23 [00:00<?, ?it/s]

0.5652696671693221 0.0 0.8866822429906542


  0%|          | 0/23 [00:00<?, ?it/s]

0.5659962788872097 0.0 0.8875430398425973


  0%|          | 0/23 [00:00<?, ?it/s]

0.547247465537942 0.0 0.890002459419577


  0%|          | 0/23 [00:00<?, ?it/s]

0.5607763075310251 0.0 0.8933226758484998


  0%|          | 0/23 [00:00<?, ?it/s]

0.5393634645835214 0.0 0.8918470241023119


  0%|          | 0/23 [00:00<?, ?it/s]

0.5176680347193843 0.0 0.8924618789965567


  0%|          | 0/23 [00:00<?, ?it/s]

0.5479832563711249 0.0 0.8923389080177079


  0%|          | 0/23 [00:00<?, ?it/s]

0.5176487487295399 0.0 0.8940605017215937


  0%|          | 0/23 [00:00<?, ?it/s]

0.5116662784763004 0.0 0.8954131824889326


  0%|          | 0/23 [00:00<?, ?it/s]

0.5071213996928671 0.0 0.8951057550418102


  0%|          | 0/23 [00:00<?, ?it/s]

0.5126235964505569 0.0 0.8954131824889325


  0%|          | 0/23 [00:00<?, ?it/s]

0.5052784473999686 0.0 0.8943679291687162


  0%|          | 0/23 [00:00<?, ?it/s]

0.5199221489222153 0.0 0.8936915887850467
3번째 폴드 최고 AUC: 0.8954131824889326


  0%|          | 0/23 [00:00<?, ?it/s]

0.7878119271734486 0.0 0.2687283391095707


  0%|          | 0/23 [00:00<?, ?it/s]

0.7575207948684692 0.0 0.42828579045587845


  0%|          | 0/23 [00:00<?, ?it/s]

0.7070726311725118 0.0 0.6021727539322846


  0%|          | 0/23 [00:00<?, ?it/s]

0.7007409645163495 0.0 0.73980271927486


  0%|          | 0/23 [00:00<?, ?it/s]

0.6815934310788694 0.0 0.7900559850706478


  0%|          | 0/23 [00:00<?, ?it/s]

0.6685822476511416 0.0 0.8341109037589975


  0%|          | 0/23 [00:00<?, ?it/s]

0.655243570389955 0.0 0.8363103172487336


  0%|          | 0/23 [00:00<?, ?it/s]

0.6523832186408665 0.0 0.8456411623567051


  0%|          | 0/23 [00:00<?, ?it/s]

0.6426937942919524 0.0 0.8509730738469741


  0%|          | 0/23 [00:00<?, ?it/s]

0.6216131319170413 0.0 0.8529725406558251


  0%|          | 0/23 [00:00<?, ?it/s]

0.6196996414143107 0.0 0.8572380698480405


  0%|          | 0/23 [00:00<?, ?it/s]

0.6199395786161008 0.0 0.8677685950413222


  0%|          | 0/23 [00:00<?, ?it/s]

0.6009772886400637 0.0 0.8731005065315914


  0%|          | 0/23 [00:00<?, ?it/s]

0.5994615710299948 0.0 0.8813649693415089


  0%|          | 0/23 [00:00<?, ?it/s]

0.6014861231264861 0.0 0.8805651826179685


  0%|          | 0/23 [00:00<?, ?it/s]

0.5910700144975082 0.0 0.8864302852572646


  0%|          | 0/23 [00:00<?, ?it/s]

0.5775335418141406 0.0 0.8906958144494802


  0%|          | 0/23 [00:00<?, ?it/s]

0.5841302845789038 0.0 0.8904292188749667


  0%|          | 0/23 [00:00<?, ?it/s]

0.5670367466366809 0.0 0.8945614502799254


  0%|          | 0/23 [00:00<?, ?it/s]

0.5330074533172275 0.0 0.8962943215142628


  0%|          | 0/23 [00:00<?, ?it/s]

0.554929511702579 0.0 0.8992268728339109


  0%|          | 0/23 [00:00<?, ?it/s]

0.5665230815825255 0.0 0.8988269794721409


  0%|          | 0/23 [00:00<?, ?it/s]

0.5284111396126125 0.0 0.9016262330045323


  0%|          | 0/23 [00:00<?, ?it/s]

0.5511016301486803 0.0 0.9013596374300187


  0%|          | 0/23 [00:00<?, ?it/s]

0.5665520390738612 0.0 0.9033591042388697


  0%|          | 0/23 [00:00<?, ?it/s]

0.5559878077196039 0.0 0.903625699813383


  0%|          | 0/23 [00:00<?, ?it/s]

0.5367736751618593 0.0 0.9038922953878966


  0%|          | 0/23 [00:00<?, ?it/s]

0.5537945122822471 0.0 0.9045587843241802


  0%|          | 0/23 [00:00<?, ?it/s]

0.5296575841696366 0.0 0.9049586776859503


  0%|          | 0/23 [00:00<?, ?it/s]

0.5533452150614365 0.0 0.9037589976006397


  0%|          | 0/23 [00:00<?, ?it/s]

0.538318571837052 0.0 0.9029592108770994


  0%|          | 0/23 [00:00<?, ?it/s]

0.5303959276365198 0.0 0.9033591042388697


  0%|          | 0/23 [00:00<?, ?it/s]

0.5247863893923552 0.0 0.9037589976006397
4번째 폴드 최고 AUC: 0.9049586776859503


In [169]:
test_dt = TitanicDataset(test_ft)
test_dl = torch.utils.data.DataLoader(test_dt, batch_size=batch_size, shuffle=False)

pred_list=[]
for i in range(n_splits):
    model=Net(n_features).to(device)
    state_dict=torch.load(os.path.join(save_dir, f"model_titanic_{i}.pth"))
    model.load_state_dict(state_dict)
    _, pred = test_loop(test_dl, model, loss_fn, device)
    pred_list.append(pred)

In [181]:
np.array(pred_list).shape

(5, 393, 1)

In [176]:
pred = np.mean(pred_list,axis=0) 
np.where(pred > 0.5, 1, 0)

array([[0],
       [0],
       [0],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [1],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [1],
       [1],
       [0],
       [0],
       [0],
       [1],
       [1],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
    

In [177]:
np.array(list(map(lambda p: 1 if p > 0.5 else 0, pred)))

array([0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,