# 타이타닉 생존자 예측 (v1)

- 입력: 데이터의 독립변인(특성) 전부 (개수는 적을수록 좋음)
- 결과: 생존 여부를 예측 (0 = 사망, 1 = 생존)

## 과정

1. 데이터 불러오기
2. 데이터 전처리
3. 데이터 분할
4. 데이터 정규화
---
5. 학습/검증/테스트 데이터셋 구성 (텐서로 변환해야 함)
6. 모델 생성(a. 베이스라인, b. 개선)
7. 학습
8. 예측
9. 평가

> 주의: 5번부터는 PyTorch를 사용해서 코드 작성

In [25]:
import pandas as pd
import numpy as np
import torch

from torch.utils.data import Dataset, DataLoader

In [26]:
def load_data(file_path):
    """Read the CSV file at ``file_path`` into a pandas DataFrame.

    Args:
        file_path: Location of the CSV file; handed unchanged to
            ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    # Worth inspecting on the returned frame before preprocessing:
    #   - df.shape    (rows = observations, columns = variables)
    #   - df.info()   (dtypes and non-null counts)
    #   - df.columns  (column names)
    return pd.read_csv(file_path)

In [27]:
def preprocess_data(df):
    """Turn the raw Titanic frame into a numeric feature matrix and labels.

    Steps: drop rows without a label, drop identifier / free-text columns,
    encode ``Sex`` as 0/1, one-hot encode ``Embarked``, keep every numeric
    column except the label as a feature, and fill the remaining missing
    feature values with the column mean.

    Args:
        df: Raw DataFrame containing a ``Survived`` column. It is copied,
            so the caller's frame is not modified.

    Returns:
        Tuple ``(X, y, feature_cols)`` where ``X`` is the feature matrix as
        a NumPy array, ``y`` is the int64 label array and ``feature_cols``
        lists the column names of ``X`` in order.
    """
    df = df.copy()
    target_col = "Survived"
    columns_to_drop = ["PassengerId", "Name", "Ticket", "Cabin"]

    df = df.dropna(subset=[target_col])
    # errors="ignore": tolerate frames where some of these columns are
    # already absent, matching the optional handling of Sex/Embarked below.
    df = df.drop(columns=columns_to_drop, errors="ignore")

    if "Sex" in df.columns:
        df["Sex"] = df["Sex"].map({"male": 0, "female": 1})
    if "Embarked" in df.columns:
        # Ask for an integer dtype explicitly. pandas >= 2.0 emits bool
        # dummy columns by default, and bool is not matched by np.number in
        # the select_dtypes call below, which would silently discard every
        # Embarked_* feature.
        df = pd.get_dummies(
            df, columns=["Embarked"], prefix="Embarked", dtype=int
        )

    numeric_cols = df.select_dtypes(include=[np.number]).columns.to_list()
    feature_cols = [col for col in numeric_cols if col != target_col]

    X = df[feature_cols].copy()
    # A label column that contained missing values is read as float; after
    # the dropna above it is safe to cast so labels are always integers.
    y = df[target_col].astype(np.int64)

    # Impute remaining gaps (e.g. Age) with the per-column mean.
    X = X.fillna(X.mean())
    return X.values, y.values, feature_cols

In [28]:
def split_data(X, y, train_ratio=0.8, seed=None):
    """Randomly split features and labels into train and test subsets.

    Args:
        X: Feature array-like with one row per sample.
        y: Label array-like with the same number of entries as ``X``.
        train_ratio: Fraction of samples (between 0 and 1) assigned to the
            training subset; the count is truncated with ``int``.
        seed: Optional seed for a reproducible shuffle. When ``None`` the
            permutation is drawn from NumPy's global random state.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` of NumPy arrays.

    Raises:
        ValueError: If ``X`` and ``y`` differ in length or ``train_ratio``
            is outside ``[0, 1]``.
    """
    X = np.asarray(X)
    y = np.asarray(y)

    n_samples = len(X)
    if len(y) != n_samples:
        raise ValueError(
            f"X and y must have the same length, got {n_samples} and {len(y)}"
        )
    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError(f"train_ratio must be in [0, 1], got {train_ratio}")

    n_train = int(n_samples * train_ratio)

    if seed is None:
        # Keep the historical behaviour: draw from the global NumPy state.
        indices = np.random.permutation(n_samples)
    else:
        # A dedicated generator makes the split repeatable without touching
        # the global random state.
        indices = np.random.default_rng(seed).permutation(n_samples)

    train_indices = indices[:n_train]
    test_indices = indices[n_train:]
    return X[train_indices], X[test_indices], y[train_indices], y[test_indices]

In [29]:
def normalize_features(X_train, X_test):
    """Standardize both subsets using statistics of the training subset.

    Args:
        X_train: Training feature array; its per-column mean and standard
            deviation define the scaling.
        X_test: Test feature array, scaled with the training statistics.

    Returns:
        Tuple ``(X_train_norm, X_test_norm, mean, std)``. ``std`` already
        includes the small constant added to avoid division by zero.
    """
    mean = X_train.mean(axis=0)
    # The 1e-8 offset keeps constant columns from dividing by zero.
    std = X_train.std(axis=0) + 1e-8

    def standardize(features):
        return (features - mean) / std

    return standardize(X_train), standardize(X_test), mean, std

In [30]:
class TitanicDataset(Dataset):
    """Dataset pairing each feature row with its class label.

    Args:
        X: Feature array-like; stored as a ``float32`` tensor in ``self.X``.
        y: Label array-like; stored as an ``int64`` tensor in ``self.y``.

    Raises:
        ValueError: If ``X`` and ``y`` do not contain the same number of
            samples.
    """

    def __init__(self, X, y):
        self.X = torch.as_tensor(X, dtype=torch.float32)
        # Integer (long) labels because this is a classification problem.
        self.y = torch.as_tensor(y, dtype=torch.long)
        # Fail at construction time instead of part-way through an epoch
        # when an index exists in X but not in y.
        if len(self.X) != len(self.y):
            raise ValueError(
                "X and y must have the same number of samples, "
                f"got {len(self.X)} and {len(self.y)}"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair at ``index``."""
        return self.X[index], self.y[index]

In [31]:
import torch.nn as nn
class LinearClassificationModel(nn.Module):
    """Feed-forward classifier: Linear -> ReLU -> Dropout blocks, then a
    final Linear layer producing one raw score (logit) per class.

    Args:
        input_dim: Number of input features.
        num_classes: Number of output classes.
        hidden_dims: Widths of the hidden layers, in order. The default
            ``(64, 32, 16)`` reproduces the original fixed architecture;
            an empty sequence yields a single linear layer.
        dropout: Dropout probability applied after each hidden activation.
    """

    def __init__(self, input_dim, num_classes, hidden_dims=(64, 32, 16), dropout=0.2):
        super().__init__()
        layers = []
        in_features = input_dim
        # Layers are created in the same order as the original hand-written
        # stack, so state_dict keys (net.0, net.3, ...) stay unchanged.
        for out_features in hidden_dims:
            layers.append(nn.Linear(in_features, out_features))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout))
            in_features = out_features
        layers.append(nn.Linear(in_features, num_classes))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the class logits for the input batch ``x``."""
        return self.net(x)

In [32]:
def train_model(model, train_loader, criterion, optimizer, epochs=100):
    """Train ``model`` and record per-epoch loss and training accuracy.

    Args:
        model: Module called on each feature batch to produce class scores.
        train_loader: Iterable of ``(features, labels)`` batches; it must
            also support ``len``.
        criterion: Loss callable taking ``(scores, labels)``.
        optimizer: Optimizer stepped once per batch.
        epochs: Number of passes over ``train_loader``.

    Returns:
        Tuple ``(loss_history, acc_history)``: for each epoch, the mean of
        the batch losses and the training accuracy in percent.
    """
    model.train()
    loss_history, acc_history = [], []

    for epoch_number in range(1, epochs + 1):
        running_loss = 0.0
        n_correct = 0
        n_seen = 0

        for features, labels in train_loader:
            optimizer.zero_grad()
            scores = model(features)
            batch_loss = criterion(scores, labels)
            batch_loss.backward()
            optimizer.step()

            running_loss += batch_loss.item()
            # Predicted class = index of the highest score in each row.
            predicted = torch.max(scores.detach(), dim=1).indices
            n_seen += labels.size(0)
            n_correct += (predicted == labels).sum().item()

        # Loss is averaged over batches, accuracy over individual samples.
        mean_loss = running_loss / len(train_loader)
        accuracy = 100 * n_correct / n_seen
        loss_history.append(mean_loss)
        acc_history.append(accuracy)

        # Progress report every tenth epoch.
        if epoch_number % 10 == 0:
            print(f"{epoch_number} : {mean_loss}, {accuracy}")

    return loss_history, acc_history

In [33]:
def evaluate_model(model, test_loader):
    """Evaluate ``model`` on ``test_loader`` and print an accuracy summary.

    The model is switched to eval mode and gradients are disabled while
    the batches are scored.

    Args:
        model: Module called on each feature batch to produce class scores.
        test_loader: Iterable of ``(features, labels)`` batches.

    Returns:
        Tuple ``(predictions, actuals, accuracy)``: NumPy arrays of the
        predicted and true labels, and the accuracy in percent.
    """
    model.eval()
    predicted_batches = []
    actual_batches = []
    n_correct = 0
    n_seen = 0

    with torch.no_grad():
        for features, labels in test_loader:
            scores = model(features)
            # Predicted class = index of the highest score in each row.
            predicted = torch.max(scores, dim=1).indices
            predicted_batches.append(predicted.numpy())
            actual_batches.append(labels.numpy())

            n_seen += labels.size(0)
            n_correct += (predicted == labels).sum().item()

    accuracy = 100 * n_correct / n_seen

    predictions = np.concatenate(predicted_batches)
    actuals = np.concatenate(actual_batches)

    print("\n=== 테스트 세트 평가 ===")
    print(f"Accuracy: {accuracy:.2f}%")
    print(f"Correct: {n_correct}/{n_seen}")

    return predictions, actuals, accuracy



In [34]:
# [1] Load the data: read the Titanic training CSV into a DataFrame.
df = load_data("data/titanic/train.csv")
# [2] Preprocess: build the feature matrix X, the labels y and the feature names.
X, y, feature_cols = preprocess_data(df)
# [3] Split into train and test subsets (split_data defaults to train_ratio=0.8).
X_train, X_test, y_train, y_test = split_data(X, y)
# [4] Normalize: scale both subsets with the mean/std computed on the training subset.
X_train_norm, X_test_norm, mean, std = normalize_features(X_train, X_test)

In [35]:
# [5a] Wrap the normalized arrays and labels in Dataset objects (converted to tensors).
train_dataset = TitanicDataset(X_train_norm, y_train)
test_dataset = TitanicDataset(X_test_norm, y_test)
# [5b] Batch the datasets; only the training loader shuffles its samples.
batch_size = 32 # hyperparameter (a)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# [6] Build the model: one input per feature column, two output classes.
input_dim = X_train_norm.shape[1]
num_classes = 2
model = LinearClassificationModel(input_dim, num_classes)

# [7] Train with cross-entropy loss and Adam (train_model defaults to 100 epochs).
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_history, acc_history = train_model(model, train_loader, criterion, optimizer)

# [8] Predict on the test loader and evaluate (prints accuracy, returns predictions and labels).
predictions, actuals, accuracy = evaluate_model(model, test_loader)

10 : 0.4288137768921645, 81.60112359550561
20 : 0.432538909756619, 82.4438202247191
30 : 0.4322083760862765, 82.7247191011236
40 : 0.4027125291202379, 84.41011235955057
50 : 0.39380335678224976, 83.42696629213484
60 : 0.3870399704445963, 84.69101123595506
70 : 0.3984350173369698, 82.86516853932584
80 : 0.37969425255837647, 84.5505617977528
90 : 0.3675023038750109, 84.69101123595506
100 : 0.3830209847377694, 84.26966292134831

=== 테스트 세트 평가 ===
Accuracy: 81.01%
Correct: 145/179
