# 타이타닉 생존자 예측 (v1)
- 입력: 데이터의 독립변인 전부
- 출력: 생존 여부 예측(0 = 사망, 1 = 생존)

## 과정

1. 데이터 불러오기
2. 데이터 전처리
3. 데이터 분할
4. 데이터 정규화
---
5. 학습/검증/테스트 데이터셋 생성(텐서로 변환해야 함)
6. 모델 생성
7. 학습
8. 예측
9. 평가

> 주의: 5번부터는 PyTorch를 사용해서 코드 작성

In [1]:
import pandas as pd
import numpy as np
import torch

from torch.utils.data import Dataset, DataLoader

In [2]:
def load_data(file_path):
    """Read a CSV file into a pandas DataFrame.

    Args:
        file_path: Location of the CSV file; passed straight to
            ``pd.read_csv``.

    Returns:
        The parsed DataFrame, with no further processing applied.
    """
    return pd.read_csv(file_path)

In [3]:
def preprocess_data(df):
    """Convert the raw Titanic frame into a numeric feature matrix and labels.

    Processing steps:
      1. Drop rows whose ``Survived`` label is missing.
      2. Drop the identifier / free-text columns (``PassengerId``, ``Name``,
         ``Ticket``, ``Cabin``) when they are present.
      3. Encode ``Sex`` as 0 (male) / 1 (female).
      4. One-hot encode ``Embarked`` into ``Embarked_*`` columns.
      5. Keep every numeric column except the target as a feature and fill
         remaining missing values with the column mean.

    Args:
        df: DataFrame containing at least a ``Survived`` column. The caller's
            frame is not modified.

    Returns:
        A tuple ``(X, y, feature_cols)`` where ``X`` is a 2-D NumPy array of
        features, ``y`` is a 1-D NumPy array of labels, and ``feature_cols``
        lists the column name for each column of ``X``.
    """
    df = df.copy()
    target_col = "Survived"
    columns_to_drop = ["PassengerId", "Name", "Ticket", "Cabin"]

    df = df.dropna(subset=[target_col])
    # errors="ignore": these columns are optional, just like Sex/Embarked
    # below, so a frame that already lacks one of them must not raise.
    df = df.drop(columns=columns_to_drop, errors="ignore")

    if "Sex" in df.columns:
        # Values other than "male"/"female" become NaN and are mean-filled.
        df["Sex"] = df["Sex"].map({"male": 0, "female": 1})

    if "Embarked" in df.columns:
        # Request a numeric dtype explicitly: recent pandas versions emit
        # bool dummy columns, which the numeric-only selection below would
        # otherwise silently discard.
        df = pd.get_dummies(
            df, columns=["Embarked"], prefix="Embarked", dtype=float
        )

    numeric_cols = df.select_dtypes(include=[np.number]).columns.to_list()
    feature_cols = [col for col in numeric_cols if col != target_col]

    X = df[feature_cols]
    y = df[target_col]
    # NOTE(review): the fill value is the mean over all rows, computed before
    # any train/test split happens downstream.
    X = X.fillna(X.mean())
    return X.values, y.values, feature_cols

In [4]:
def split_data(X, y, train_ratio=0.8, random_state=None):
    """Randomly split features and labels into train and test subsets.

    Args:
        X: Array of samples, indexable by an integer index array along
            axis 0.
        y: Array of labels with the same length as ``X``.
        train_ratio: Fraction of samples assigned to the training subset;
            the count is truncated with ``int``. Must lie in ``[0, 1]``.
        random_state: Optional seed for a dedicated NumPy generator so the
            split is reproducible. When ``None`` the global ``np.random``
            state is used, as before.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)``.

    Raises:
        ValueError: If ``X`` and ``y`` differ in length or ``train_ratio``
            is outside ``[0, 1]``.
    """
    n_samples = len(X)
    if len(y) != n_samples:
        raise ValueError(
            f"X and y must have the same length, got {n_samples} and {len(y)}"
        )
    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError(f"train_ratio must be in [0, 1], got {train_ratio}")

    n_train = int(n_samples * train_ratio)

    if random_state is None:
        # Keep honouring np.random.seed(...) set by the caller.
        indices = np.random.permutation(n_samples)
    else:
        indices = np.random.default_rng(random_state).permutation(n_samples)

    train_indices = indices[:n_train]
    test_indices = indices[n_train:]
    # The same index arrays are applied to X and y so rows stay paired.
    return X[train_indices], X[test_indices], y[train_indices], y[test_indices]

In [5]:
def normalize_features(X_train, X_test):
    """Standardize both splits using statistics of the training split only.

    Args:
        X_train: Training feature array; per-column mean and standard
            deviation are computed from it (``axis=0``).
        X_test: Test feature array, scaled with the training statistics.

    Returns:
        A tuple ``(X_train_norm, X_test_norm, mean, std)`` where ``std``
        already includes the small epsilon added to avoid division by zero.
    """
    epsilon = 1e-8
    mean = X_train.mean(axis=0)
    std = X_train.std(axis=0) + epsilon
    scaled_train, scaled_test = (
        (split - mean) / std for split in (X_train, X_test)
    )
    return scaled_train, scaled_test, mean, std

In [8]:
class TitanicDataset(Dataset):
    """Map-style dataset pairing feature rows with integer class labels.

    Args:
        X: Array-like of features; stored as a ``float32`` tensor.
        y: Array-like of labels, one per row of ``X``; stored as an
            ``int64`` tensor because this is a classification problem.

    Raises:
        ValueError: If ``X`` and ``y`` contain a different number of samples.
    """

    def __init__(self, X, y):
        # Explicit dtypes instead of the legacy FloatTensor/LongTensor
        # constructors.
        self.X = torch.as_tensor(X, dtype=torch.float32)
        self.y = torch.as_tensor(y, dtype=torch.long)
        # Fail at construction time rather than with an index error (or
        # silently ignored labels) in the middle of an epoch.
        if len(self.X) != len(self.y):
            raise ValueError(
                "X and y must contain the same number of samples, got "
                f"{len(self.X)} and {len(self.y)}"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair at ``index``."""
        return self.X[index], self.y[index]

In [9]:
# [1] Load the raw data
df = load_data("data/titanic/train.csv")
# [2] Preprocess the data into features, labels and feature names
X, y, feature_cols = preprocess_data(df)
# [3] Split the data into train and test subsets
X_train, X_test, y_train, y_test = split_data(X, y)
# [4] Normalize the features
X_train_norm, X_test_norm, mean, std = normalize_features(X_train, X_test)
# [5] Wrap the normalized splits in TitanicDataset objects
train_dataset = TitanicDataset(X_train_norm, y_train)
test_dataset = TitanicDataset(X_test_norm, y_test)