In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from google.colab import drive
# Mount Google Drive into the Colab runtime so the CSV files below are readable.
drive.mount('/content/drive')
# Load the training and test CSV files from Drive into DataFrames.
df_train = pd.read_csv("/content/drive/My Drive/csv/train.csv")
df_test = pd.read_csv("/content/drive/My Drive/csv/test.csv")

Mounted at /content/drive


In [3]:
# --- Feature engineering shared by the training and test frames -------------
# Every statistic used for imputation or binning is computed from the training
# frame only and then applied unchanged to both frames, so a given encoded
# value means the same thing in train and test.
age_median = df_train['Age'].median()
embarked_mode = df_train['Embarked'].mode()[0]

# Derive the five equal-width age bins once, from the (imputed) training ages.
# Calling pd.cut(..., 5) separately on each frame would compute different edges
# for train and test, so identical AgeBand codes would cover different ages.
_, age_bin_edges = pd.cut(df_train['Age'].fillna(age_median), 5, retbins=True)
# Open the outer edges so ages outside the training range still fall into the
# first/last band instead of becoming NaN (which would break astype(int)).
age_bin_edges[0], age_bin_edges[-1] = -np.inf, np.inf

# Rare honorifics are collapsed into 'Other'; spelling variants are merged.
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Major', 'Sir', 'Jonkheer', 'Dona']
title_aliases = {**dict.fromkeys(rare_titles, 'Other'), 'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}

for df in [df_train, df_test]:
    # Assign results back instead of calling fillna(inplace=True) on a column
    # selection: that chained in-place form raises a FutureWarning in pandas 2.x
    # and does not modify the frame when Copy-on-Write is enabled.
    df['Age'] = df['Age'].fillna(age_median)
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    # 'U' marks an unknown cabin; Deck is the first character of the cabin code.
    df['Cabin'] = df['Cabin'].fillna('U')
    df['Deck'] = df['Cabin'].str[0]
    df['HasCabin'] = (df['Deck'] != 'U').astype(int)
    # Title is the word immediately preceding the first '.' in the name.
    # Raw string: '\.' in a normal string literal is an invalid escape sequence.
    df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    df['Title'] = df['Title'].replace(title_aliases)
    # Fill missing embarkation ports with the most frequent training value.
    df['Embarked'] = df['Embarked'].fillna(embarked_mode)
    df['AgeBand'] = pd.cut(df['Age'], bins=age_bin_edges, labels=[0, 1, 2, 3, 4]).astype(int)
    # 1 when the passenger travels alone (family size of one), else 0.
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

# Integer codes for the categorical features. Deck and Title codes follow the
# order in which each value first appears in the training frame; the Embarked
# codes are fixed by hand.
known_decks = df_train['Deck'].unique()
known_titles = df_train['Title'].unique()
deck_mapping = dict(zip(known_decks, range(len(known_decks))))
title_mapping = dict(zip(known_titles, range(len(known_titles))))
embarked_mapping = {'S': 0, 'C': 1, 'Q': 2}

# Training design matrix: one row per passenger, one float column per feature,
# in the column order listed below. Categorical values that are missing from
# their mapping are encoded as 0.
train_features = [
    df_train['AgeBand'],
    df_train['FamilySize'],
    df_train["Deck"].map(deck_mapping).astype(float),
    df_train['HasCabin'],
    df_train['Title'].map(title_mapping).fillna(0).astype(float),
    df_train['Embarked'].map(embarked_mapping).fillna(0).astype(float),
    df_train['IsAlone'],
    df_train['Pclass'],
]
x_train = np.zeros([len(df_train), 8])
for column_index, column_values in enumerate(train_features):
    x_train[:, column_index] = column_values

# Binary target column used for fitting.
y_train = df_train["Survived"]

def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)), evaluated element-wise.

    The value is computed from exp(-|x|), which always lies in (0, 1], so the
    exponential cannot overflow however large the magnitude of ``x`` is. The
    direct form ``1 / (1 + np.exp(-x))`` overflows (and emits a RuntimeWarning)
    for large negative inputs.

    Parameters
    ----------
    x : scalar or array-like of numbers

    Returns
    -------
    Value(s) in [0, 1] with the same shape as ``x``.
    """
    decay = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x). Both share the denominator.
    numerator = np.where(x >= 0, 1.0, decay)
    return numerator / (1.0 + decay)

def hypothesis(w, x, b):
    """Return the model output sigmoid(x.dot(w) + b).

    ``w`` holds the weights, ``x`` the inputs and ``b`` the bias added to the
    linear combination before it is passed through ``sigmoid``.
    """
    logits = x.dot(w) + b
    return sigmoid(logits)

def cost_function(h, y):
    """Return the mean binary cross-entropy between predictions ``h`` and labels ``y``.

    A constant of 1e-8 is added inside each logarithm so that predictions of
    exactly 0 or 1 yield a finite value rather than log(0).
    """
    eps = 1e-8
    positive_term = y * np.log(h + eps)
    negative_term = (1.0 - y) * np.log(1.0 - h + eps)
    return -np.mean(positive_term + negative_term)

# --- Fit logistic regression with batch gradient descent ---------------------
# Draw the initial parameters from a seeded generator so that a fresh
# "Restart & Run All" reproduces the same fitted model and predictions.
SEED = 42
rng = np.random.default_rng(SEED)
w = rng.standard_normal(x_train.shape[1])  # one weight per feature column
b = rng.standard_normal(1)                 # bias, kept as a length-1 array

epoch = 20000   # number of full-batch gradient steps
alpha = 6e-2    # learning rate
total_loss = [] # loss recorded before each update, one entry per epoch

# y_train is a pandas Series; convert it once so the loop below does plain
# NumPy arithmetic instead of index-aligned Series arithmetic every iteration.
y_true = np.asarray(y_train, dtype=float)
n_samples = len(y_true)

for i in range(epoch):
    h = hypothesis(w, x_train, b)
    loss = cost_function(h, y_true)
    # Gradients of the mean cross-entropy with respect to the weights and bias.
    g_w = np.dot(x_train.T, (h - y_true)) / n_samples
    g_b = np.mean(h - y_true)
    w = w - alpha * g_w
    b = b - alpha * g_b
    total_loss.append(loss)

# Test design matrix, built with the same column order and the same mappings
# as the training matrix. Categorical values absent from a mapping are
# encoded as -1.
test_features = [
    df_test['AgeBand'],
    df_test['FamilySize'],
    df_test["Deck"].map(deck_mapping).fillna(-1).astype(float),
    df_test['HasCabin'],
    df_test['Title'].map(title_mapping).fillna(-1).astype(float),
    df_test['Embarked'].map(embarked_mapping).fillna(-1).astype(float),
    df_test['IsAlone'],
    df_test['Pclass'],
]
x_test = np.zeros([len(df_test), 8])
for column_index, column_values in enumerate(test_features):
    x_test[:, column_index] = column_values

# Round the model outputs to the nearest integer to obtain 0/1 class labels.
y_pred = np.round(hypothesis(w, x_test, b))

In [4]:
# Assemble the two-column submission table (passenger id, predicted label)
# and write it to Drive without the DataFrame index.
passenger_ids = df_test["PassengerId"].astype(int)
survived_labels = y_pred.astype(int)
submission = pd.DataFrame({"PassengerId": passenger_ids, "Survived": survived_labels})
submission.to_csv("/content/drive/My Drive/csv/result.csv", index=False)