In [1]:
import pandas as pd
import numpy as np
import random
import os
import torch
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from google.colab import drive
drive.mount('/content/drive')

DATA_PATH = "/content/drive/MyDrive/data/"

SEED = 42  # seed value passed to reset_seeds before training

# Load the data
train = pd.read_csv(f"{DATA_PATH}titanic_train.csv")  # training data
test = pd.read_csv(f"{DATA_PATH}titanic_test.csv")  # test data

# Missing-value handling.
# Every fill value is derived from the training split only, then applied to
# BOTH splits, so train and test are imputed identically and nothing from the
# test set leaks into the statistics.
age_mean = train["age"].mean()
fare_median = train["fare"].median()
cabin_unk = "UNK"
embarked_mode = train["embarked"].mode()[0]
fill_values = {
    "age": age_mean,
    "fare": fare_median,
    "cabin": cabin_unk,
    "embarked": embarked_mode,
}
for frame in (train, test):
    for col, value in fill_values.items():
        # No-op for columns that have no missing values in this split.
        frame[col] = frame[col].fillna(value)

# Select the columns used as model features
feature_cols = ["age", "sibsp", "parch", "fare", "pclass", "gender", "embarked"]
train_ft = train[feature_cols].copy()
test_ft = test[feature_cols].copy()

# One-hot encode the categorical columns (encoder is fitted on train only;
# categories unseen at fit time are encoded as all zeros).
cat_cols = ['gender', 'embarked']
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(train_ft[cat_cols])


def _one_hot_encode(frame):
    """Return `frame` with `cat_cols` replaced by their one-hot columns.

    The encoded block reuses `frame.index` so the column-wise concat aligns
    row by row even when the frame does not carry a default RangeIndex.
    """
    encoded = pd.DataFrame(
        enc.transform(frame[cat_cols]).toarray(),
        columns=enc.get_feature_names_out(),
        index=frame.index,
    )
    return pd.concat([frame.drop(columns=cat_cols), encoded], axis=1)


train_ft = _one_hot_encode(train_ft)
test_ft = _one_hot_encode(test_ft)

# Min-Max scaling (ranges learned from train, applied to both splits).
# From here on train_ft / test_ft are NumPy arrays, not DataFrames.
scaler = MinMaxScaler()
scaler.fit(train_ft)
train_ft = scaler.transform(train_ft)
test_ft = scaler.transform(test_ft)

# Target labels, reshaped to 2-D (n_samples, 1)
target = train["survived"].to_numpy().reshape(-1, 1)

train_ft.shape, test_ft.shape, target.shape

Mounted at /content/drive


((916, 10), (393, 10), (916, 1))

# 재현성 함수(Reproducibility)



In [2]:
def reset_seeds(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's `random`, NumPy's global generator and PyTorch (CPU and
    all CUDA devices), and configures cuDNN for deterministic behaviour.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    # Exported for child processes; the hash seed of the already running
    # interpreter is fixed at start-up and is not changed by this line.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one (safe without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning may select different kernels from run to run; keep it off.
    torch.backends.cudnn.benchmark = False

# 데이터셋 클래스 구현
- 미니 배치 학습을 위해 데이터셋 클래스를 구현하고, DataLoader 클래스를 사용해서 미니 배치 학습이 가능한 상태를 만듦

In [3]:
class TitanicDataset(torch.utils.data.Dataset):
    """Map-style dataset that serves feature rows and, optionally, labels.

    Each item is a dict with key "x" (float32 tensor built from `x[idx]`)
    and, when labels were supplied, key "y" (float32 tensor built from
    `y[idx]`).

    Args:
        x: Indexable collection of feature rows (e.g. a 2-D NumPy array).
        y: Optional indexable collection of labels with the same length as
            `x`. Leave as None for unlabeled data such as the test set.

    Raises:
        ValueError: If `y` is given and its length differs from `x`.
    """

    def __init__(self, x, y=None):
        if y is not None and len(x) != len(y):
            raise ValueError(
                f"x and y must have the same length, got {len(x)} and {len(y)}"
            )
        self.x = x
        self.y = y

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        # torch.as_tensor with an explicit dtype converts values; the legacy
        # torch.Tensor(...) constructor interprets a bare integer as a SIZE
        # and would return uninitialised memory for scalar labels.
        item = {"x": torch.as_tensor(self.x[idx], dtype=torch.float32)}

        if self.y is not None:
            item["y"] = torch.as_tensor(self.y[idx], dtype=torch.float32)

        return item

In [4]:
# Smoke test: wrap the scaled training features and the labels in the dataset
# and display the last sample to check the {"x", "y"} item layout.
dt = TitanicDataset(train_ft, target)
dt[-1]

{'x': tensor([0.2359, 0.1250, 0.1111, 0.0717, 0.5000, 0.0000, 1.0000, 0.0000, 0.0000,
         1.0000]),
 'y': tensor([0.])}

In [5]:
# Tiny unshuffled loader (2 samples per batch) used only to inspect how the
# per-item dicts are collated into a batch; the first batch is displayed.
dl = torch.utils.data.DataLoader(dt, batch_size=2, shuffle=False)
next(iter(dl))

{'x': tensor([[0.8873, 0.0000, 0.0000, 0.0966, 0.0000, 0.0000, 1.0000, 1.0000, 0.0000,
          0.0000],
         [0.4238, 0.0000, 0.0000, 0.0157, 1.0000, 0.0000, 1.0000, 0.0000, 0.0000,
          1.0000]]),
 'y': tensor([[0.],
         [0.]])}

# 인공 신경망 모델 클래스 구현

In [6]:
class Net(torch.nn.Module):
    """Feed-forward network that maps a feature vector to a single logit.

    Three hidden stages (12, 8 and 4 units), each made of
    Linear -> BatchNorm1d -> LeakyReLU, followed by a final Linear layer with
    one output. No activation is applied to the output.

    Args:
        n_features: Number of input features per sample.
    """

    def __init__(self, n_features):
        super().__init__()
        widths = (n_features, 12, 8, 4)

        stack = []
        # Consecutive width pairs describe one hidden stage each.
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack.append(torch.nn.Linear(fan_in, fan_out))
            stack.append(torch.nn.BatchNorm1d(fan_out))
            stack.append(torch.nn.LeakyReLU())
        # Output head: one raw score per sample.
        stack.append(torch.nn.Linear(widths[-1], 1))

        self.seq = torch.nn.Sequential(*stack)

    def forward(self, x):
        """Run `x` through the layer stack and return the result."""
        return self.seq(x)

In [7]:
# Sanity check: push one batch from the inspection loader through a freshly
# built network and display the raw outputs (one value per sample).
# The model is built on the CPU here, so the batch is not moved to a device.
batch = next(iter(dl))
model = Net(train_ft.shape[1])  # input width = number of feature columns
model(batch["x"])

tensor([[-0.4456],
        [-0.4197]], grad_fn=<AddmmBackward0>)

# 하이퍼파라미터 정의
- 손실함수, 배치사이즈 등

In [8]:
# Training configuration shared by the cells below.
batch_size = 32
n_features = train_ft.shape[1]  # input width = number of feature columns

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Binary cross-entropy computed on raw logits (the network ends without an
# activation), so no sigmoid is applied before the loss.
loss_fn = torch.nn.BCEWithLogitsLoss()

# 학습 loop 구현

In [9]:
# Seed first so that weight initialisation and batch shuffling are repeatable.
reset_seeds(SEED)
model = Net(n_features).to(device)
optimizer = torch.optim.Adam(model.parameters())
train_dt = TitanicDataset(train_ft, target)
train_dl = torch.utils.data.DataLoader(train_dt, batch_size=batch_size, shuffle=True)

n_epochs = 10

model.train()
for _ in range(n_epochs):
    epoch_loss = 0

    for batch in train_dl:
        features = batch["x"].to(device)
        labels = batch["y"].to(device)

        # Clear gradients left over from the previous step, then do one
        # forward / backward / update cycle.
        optimizer.zero_grad()
        loss = loss_fn(model(features), labels)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    # Report the mean of the per-batch losses for this epoch.
    epoch_loss = epoch_loss / len(train_dl)
    print(epoch_loss)

0.6511606820698442
0.5812931019684364
0.5402021366974403
0.5049006209291261
0.4832064971841615
0.46134479498041087
0.45694437006424216
0.43889735382178735
0.4304857675371499
0.4153964375627452


# 테스트 데이터에 대한 배치 단위 예측 loop 구현

In [11]:
# Disabled alternative, kept for reference: score the whole test set in a
# single forward pass instead of iterating over batches.
# NOTE(review): test_x is not moved to `device` here; presumably it would need
# `.to(device)` when the model lives on the GPU — confirm before re-enabling.
# test_x = torch.Tensor(test_ft)
# model.eval()
# model(test_x)

In [12]:
# No labels are passed for the test set, so each item carries only "x".
# shuffle=False keeps the batches in the original row order of test_ft.
test_dt = TitanicDataset(test_ft)
test_dl = torch.utils.data.DataLoader(test_dt, batch_size=batch_size, shuffle=False)

In [13]:
# Batched inference on the test set: collect one NumPy array of sigmoid
# outputs per batch in pred_list.
act = torch.nn.Sigmoid()
pred_list = []

model.eval()  # switch BatchNorm layers to their running statistics

with torch.no_grad():  # gradients are not needed for prediction
    for batch in test_dl:
        logits = model(batch["x"].to(device))
        # Squash logits into (0, 1), then bring them back to host memory.
        pred_list.append(act(logits).cpu().numpy())

In [14]:
# Stitch the per-batch arrays together along the first axis into a single
# array and display its shape.
pred = np.concatenate(pred_list)
pred.shape

(393, 1)