# 타이타닉 생존자 예측 (v1)

- 입력: 데이터의 독립변인(특성) 전부 — 단, 사용하는 특성의 수는 적을수록 좋음
- 출력: 승객의 생존 여부 예측 (0 = 사망, 1 = 생존)

## 과정

1. 데이터 불러오기
2. 데이터 전처리
3. 데이터 분할
4. 데이터 정규화
---
5. 학습/검증/테스트 데이터 준비 (NumPy 배열을 텐서로 변환해야 함)
6. 모델 생성(a. 베이스라인, b. 개선)
7. 학습
8. 예측
9. 평가

> 주의: 5번부터는 PyTorch를 사용해서 코드 작성

In [1]:
import pandas as pd
import numpy as np
import torch

from torch.utils.data import Dataset, DataLoader

In [2]:
def load_data(file_path):
    """Read the CSV at ``file_path`` into a pandas DataFrame and return it.

    Nothing is inspected or modified here. Things worth checking on the
    returned frame before going further:

    - its shape (each row is one observation, each column one variable),
    - the column dtypes (``df.info()``),
    - the column names.

    Args:
        file_path: Anything ``pd.read_csv`` accepts (a path or a file-like
            object).

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    return pd.read_csv(file_path)

In [3]:
def preprocess_data(df):
    """Turn the raw passenger frame into a numeric feature matrix and labels.

    Steps, in order: drop rows whose ``Survived`` label is missing, drop the
    identifier / free-text columns, encode ``Sex`` as 0/1, one-hot encode
    ``Embarked``, keep only the numeric columns as features, and fill any
    remaining missing feature values with that column's mean.

    Args:
        df: Raw DataFrame. Must contain a ``Survived`` column. The caller's
            frame is not modified (a copy is taken first).

    Returns:
        A tuple ``(X, y, feature_cols)`` where ``X`` is the 2-D NumPy array
        of feature values, ``y`` is the NumPy array of ``Survived`` values,
        and ``feature_cols`` lists the column names in the same order as the
        columns of ``X``.
    """
    df = df.copy()
    target_col = "Survived"
    columns_to_drop = ["PassengerId", "Name", "Ticket", "Cabin"]

    # Rows without a label cannot be used for supervised training.
    df = df.dropna(subset=[target_col])
    # errors="ignore" tolerates frames in which some of these columns are
    # already absent, matching the "if ... in df.columns" guards below.
    df = df.drop(columns=columns_to_drop, errors="ignore")

    if "Sex" in df.columns:
        # Values other than "male"/"female" become NaN and are mean-filled below.
        df["Sex"] = df["Sex"].map({"male": 0, "female": 1})
    if "Embarked" in df.columns:
        # Request a numeric dtype explicitly: pandas >= 2.0 emits bool dummy
        # columns by default, and the np.number filter below would then
        # silently discard every Embarked_* feature.
        df = pd.get_dummies(
            df, columns=["Embarked"], prefix="Embarked", dtype=float
        )

    numeric_cols = df.select_dtypes(include=[np.number]).columns.to_list()
    feature_cols = [col for col in numeric_cols if col != target_col]
    X = df[feature_cols].copy()
    y = df[target_col].copy()
    # NOTE: the fill value is the mean over every row passed in, i.e. it is
    # computed before any later train/test split.
    X = X.fillna(X.mean())
    return X.values, y.values, feature_cols

In [4]:
def split_data(X, y, train_ratio=0.8, seed=None):
    """Randomly split features and labels into a training and a test part.

    The rows are shuffled with a random permutation; the first
    ``int(len(X) * train_ratio)`` shuffled rows form the training set and
    the rest form the test set. ``X`` and ``y`` are indexed with the same
    permutation, so rows and labels stay aligned.

    Args:
        X: Feature array, indexable by an integer index array (e.g. NumPy).
        y: Label array with the same length as ``X``.
        train_ratio: Fraction of the samples placed in the training set;
            must lie in ``[0, 1]``.
        seed: Optional seed for a reproducible split. When ``None`` (the
            default) the global NumPy random state is used, exactly as
            before this parameter existed.

    Returns:
        ``(X_train, X_test, y_train, y_test)``.

    Raises:
        ValueError: If ``X`` and ``y`` differ in length or ``train_ratio``
            is outside ``[0, 1]``.
    """
    n_samples = len(X)
    if len(y) != n_samples:
        raise ValueError(
            f"X and y must have the same length, got {n_samples} and {len(y)}"
        )
    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError(f"train_ratio must be in [0, 1], got {train_ratio}")

    n_train = int(n_samples * train_ratio)
    if seed is None:
        # Keep honouring np.random.seed(...) set by the caller.
        indices = np.random.permutation(n_samples)
    else:
        # A dedicated generator leaves the global random state untouched.
        indices = np.random.default_rng(seed).permutation(n_samples)

    train_indices = indices[:n_train]
    test_indices = indices[n_train:]
    return X[train_indices], X[test_indices], y[train_indices], y[test_indices]

In [5]:
def normalize_features(X_train, X_test):
    """Standardize both splits using statistics of the training split only.

    The per-column mean and standard deviation are computed along axis 0 of
    ``X_train`` and then applied to ``X_train`` and ``X_test`` alike.

    Args:
        X_train: Training feature array.
        X_test: Test feature array with the same column layout.

    Returns:
        ``(X_train_norm, X_test_norm, mean, std)`` where ``std`` already
        includes the small constant added to it.
    """
    center = X_train.mean(axis=0)
    # The added 1e-8 keeps the division finite for zero-variance columns.
    scale = X_train.std(axis=0) + 1e-8

    def _standardize(values):
        return (values - center) / scale

    return _standardize(X_train), _standardize(X_test), center, scale

In [6]:
class TitanicDataset(Dataset):
    """Dataset pairing a float feature tensor with a long label tensor.

    Indexing returns the ``(features, label)`` pair stored at that position.
    """

    def __init__(self, X, y):
        """Convert ``X`` to a FloatTensor and ``y`` to a LongTensor."""
        features = torch.FloatTensor(X)
        # This is a classification problem, so labels are kept as a LongTensor.
        labels = torch.LongTensor(y)
        self.X = features
        self.y = labels

    def __len__(self):
        """Return the length of the feature tensor."""
        return len(self.X)

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair at ``index``."""
        sample = self.X[index]
        label = self.y[index]
        return sample, label

In [7]:
import torch.nn as nn
class LinearClassificationModel(nn.Module):
    """Baseline classifier consisting of one ``nn.Linear`` layer.

    The layer maps ``input_dim`` features to ``num_classes`` outputs; the
    forward pass returns that layer's output unchanged.
    """

    def __init__(self, input_dim, num_classes):
        """Create the single linear layer, stored as ``self.linear``."""
        super().__init__()
        self.linear = nn.Linear(input_dim, num_classes)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        logits = self.linear(x)
        return logits

In [8]:
def train_model(model, train_loader, criterion, optimizer, epochs=100):
    """Train ``model`` on ``train_loader`` and record per-epoch metrics.

    Each epoch runs one optimizer step per batch. The epoch loss is the mean
    of the per-batch loss values; the epoch accuracy is the percentage of
    samples whose highest-scoring output matches the label. A progress line
    is printed every tenth epoch.

    Args:
        model: Module called as ``model(batch_X)``; put into training mode.
        train_loader: Sized iterable yielding ``(batch_X, batch_y)`` pairs.
        criterion: Loss callable taking ``(predictions, batch_y)``.
        optimizer: Optimizer over the model's parameters.
        epochs: Number of passes over ``train_loader``.

    Returns:
        ``(loss_history, acc_history)``, two lists with one entry per epoch.
    """
    model.train()
    loss_history, acc_history = [], []

    for epoch_no in range(1, epochs + 1):
        running_loss = 0.0
        n_correct = 0
        n_seen = 0

        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            predictions = model(batch_X)
            loss = criterion(predictions, batch_y)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            # Predicted class = index of the largest output in each row.
            predicted = torch.max(predictions.detach(), 1).indices
            n_seen += batch_y.size(0)
            n_correct += (predicted == batch_y).sum().item()

        avg_loss = running_loss / len(train_loader)
        acc = 100 * n_correct / n_seen
        loss_history.append(avg_loss)
        acc_history.append(acc)

        if epoch_no % 10 == 0:
            print(f"{epoch_no} : {avg_loss}, {acc}")

    return loss_history, acc_history

In [9]:
# [1] Load the data
df = load_data("data/titanic/train.csv")
# [2] Preprocess the data into a feature array, a label array and the feature names
X, y, feature_cols = preprocess_data(df)
# [3] Split the data (default train_ratio of split_data is used)
X_train, X_test, y_train, y_test = split_data(X, y)
# [4] Normalize the data; mean/std come from the training split and are kept for reuse
X_train_norm, X_test_norm, mean, std = normalize_features(X_train, X_test)

In [10]:
# [5a] Wrap the normalized arrays and labels as tensor-backed datasets
train_dataset = TitanicDataset(X_train_norm, y_train)
test_dataset = TitanicDataset(X_test_norm, y_test)
# [5b] Batch the datasets; only the training loader is shuffled
batch_size = 32 # hyperparameter (a)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# The size of the input data and of the output must be known up front:
# one input per feature column, one output per class.
input_dim = X_train_norm.shape[1]
num_classes = 2
model = LinearClassificationModel(input_dim, num_classes)

In [11]:
# Cross-entropy loss over the model's two outputs, optimized with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# Runs with train_model's default of 100 epochs; as the last expression of the
# cell, the returned (loss_history, acc_history) tuple is displayed below.
train_model(model, train_loader, criterion, optimizer)

10 : 0.5189841260080752, 77.9494382022472
20 : 0.4718442507412123, 78.23033707865169
30 : 0.45304796877114667, 79.21348314606742
40 : 0.4460901112660118, 79.35393258426966
50 : 0.4425584809935611, 79.49438202247191
60 : 0.44347754509552667, 79.21348314606742
70 : 0.4503637591133947, 79.49438202247191
80 : 0.4404614813949751, 79.49438202247191
90 : 0.44180770283160004, 79.21348314606742
100 : 0.4476490467786789, 79.49438202247191


([0.6455990983092267,
  0.6214880710062773,
  0.6022348948146986,
  0.5892752473768981,
  0.564428132513295,
  0.5505003851392994,
  0.5453239951444708,
  0.5328584559585737,
  0.5256781694681748,
  0.5189841260080752,
  0.5101650862590127,
  0.5094598790873652,
  0.49903669823770935,
  0.4928934457509414,
  0.4892320257166158,
  0.4832260582758033,
  0.4823308263135993,
  0.4958657803742782,
  0.4759669770365176,
  0.4718442507412123,
  0.48019252782282623,
  0.4796934425830841,
  0.47388859676278156,
  0.4642861964909927,
  0.47025074129519256,
  0.46833237487336865,
  0.4581053995567819,
  0.4690203018810438,
  0.4570896871711897,
  0.45304796877114667,
  0.47488500760949176,
  0.46349727589151135,
  0.4545773034510405,
  0.4591495420621789,
  0.4668999638246453,
  0.4519225151642509,
  0.45835418156955554,
  0.4635588060254636,
  0.4543311479298965,
  0.4460901112660118,
  0.46214729417925293,
  0.45038784457289655,
  0.45611301712367847,
  0.45036738851796027,
  0.4568336593068164