In [None]:
import pandas as pd
import torch.nn as nn
import torch
from torch.utils.data import Dataset, DataLoader

In [None]:
# Exploratory preprocessing of the training CSV (same steps as TitanicDataset).
df = pd.read_csv("train.csv")
features = ["Pclass", "Sex", "Age", "Survived"]

# Work on an explicit copy: mutating a column slice of `df` triggers
# SettingWithCopyWarning and may or may not write through to `df`.
train_df = df[features].copy()

# Encode sex as 0/1. The result must be assigned back -- the encoding is not
# applied in place. Any value other than "male"/"female" becomes NaN and makes
# astype(int) raise instead of slipping through silently.
train_df["Sex"] = train_df["Sex"].map({"male": 0, "female": 1}).astype(int)

# Shift Pclass from 1..3 to 0..2 so it can be used as an embedding index.
train_df["Pclass"] = train_df["Pclass"] - 1

# Record which ages were actually observed *before* imputing, then fill the
# gaps with the mean of the observed ages.
age_known = train_df["Age"].notna()
train_df["age_present"] = age_known.astype(float)
train_df["Age"] = train_df["Age"].fillna(train_df.loc[age_known, "Age"].mean())

# Standardise the (imputed) age column to zero mean / unit sample std.
train_df["age_normalized"] = (train_df["Age"] - train_df["Age"].mean()) / train_df["Age"].std()


In [None]:
class TitanicDataset(Dataset):
    """Map-style dataset over a Titanic-format CSV file.

    The CSV must contain the columns ``Pclass``, ``Sex``, ``Survived`` and
    ``Age``. After loading, ``self.df`` holds those columns plus two derived
    ones:

    * ``age_present``    -- 1.0 where ``Age`` was given in the file, else 0.0
    * ``age_normalized`` -- ``Age`` (missing values filled with the mean of the
      observed ages) standardised with the column's mean and sample std

    Each item is a ``(features, label)`` pair:

    * ``features`` -- float32 tensor of shape ``(4,)`` ordered as
      ``[Pclass, Sex, age_normalized, age_present]``. ``Pclass`` is shifted to
      start at 0 and ``Sex`` is encoded male=0 / female=1; both are stored as
      whole-number floats.
    * ``label`` -- 0-dim int64 tensor holding ``Survived``.
    """

    # Order of the feature vector returned by __getitem__.
    FEATURE_COLUMNS = ["Pclass", "Sex", "age_normalized", "age_present"]

    def __init__(self, path):
        """Load and preprocess the CSV at ``path``.

        Raises:
            ValueError: if ``Sex`` contains a value other than "male"/"female"
                (the unmapped entry becomes NaN and cannot be cast to int).
        """
        raw = pd.read_csv(path)

        # Copy so the column edits below act on an independent frame rather
        # than a slice of `raw` (avoids SettingWithCopyWarning).
        df = raw[["Pclass", "Sex", "Survived", "Age"]].copy()

        df["Sex"] = df["Sex"].map({"male": 0, "female": 1}).astype(int)
        df["Pclass"] = df["Pclass"] - 1

        # Flag observed ages before imputing, then fill gaps with their mean.
        age_known = df["Age"].notna()
        df["age_present"] = age_known.astype(float)
        df["Age"] = df["Age"].fillna(df.loc[age_known, "Age"].mean())

        df["age_normalized"] = (df["Age"] - df["Age"].mean()) / df["Age"].std()

        self.df = df

    def __len__(self):
        """Number of rows (passengers) in the dataset."""
        return len(self.df)

    def __getitem__(self, index):
        """Return ``(features, label)`` for the row at position ``index``."""
        row = self.df.iloc[index]
        # Keep the features as float32: casting the whole vector to an integer
        # dtype would truncate age_normalized toward zero and discard most of
        # the age signal.
        feature_values = row[self.FEATURE_COLUMNS].to_numpy(dtype="float32")
        label = torch.tensor(int(row["Survived"]), dtype=torch.long)
        return torch.tensor(feature_values), label

In [None]:
# Wrap the training CSV in the dataset, then serve it in shuffled batches of 64.
ds = TitanicDataset(path="train.csv")
dl = DataLoader(
    dataset=ds,
    batch_size=64,
    shuffle=True,
)

In [None]:
class Model(nn.Module):
    """Small classifier over ``[Pclass, Sex, continuous_1, continuous_2]`` rows.

    Columns 0 and 1 of the input are looked up in embedding tables (3 and 2
    entries respectively); columns 2 and 3 are appended unchanged. The
    concatenated vector goes through a two-layer MLP producing 2 logits.

    Args:
        embedding_dim: width of each embedding vector (default 60).
        hidden_dim: width of the hidden linear layer (default 10).
    """

    def __init__(self, embedding_dim=60, hidden_dim=10):
        super().__init__()
        self.p_class_embed = nn.Embedding(num_embeddings=3, embedding_dim=embedding_dim)
        self.sex_embed = nn.Embedding(num_embeddings=2, embedding_dim=embedding_dim)
        self.stack = nn.Sequential(
            # Two embeddings plus the two pass-through columns.
            nn.Linear(2 * embedding_dim + 2, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 2),
        )

    def forward(self, x):
        """Compute class logits.

        Args:
            x: 2-D tensor with at least 4 columns; columns 0 and 1 must hold
                whole-number category indices. Any numeric dtype is accepted.

        Returns:
            Tensor of shape ``(x.shape[0], 2)`` with unnormalised logits.
        """
        p_class = self.p_class_embed(x[:, 0].long())
        sex = self.sex_embed(x[:, 1].long())

        # Cast the pass-through columns to the embedding dtype explicitly so
        # integer or float64 inputs do not change the dtype fed to the linear
        # layers.
        col_2 = x[:, 2].unsqueeze(1).to(p_class.dtype)
        col_3 = x[:, 3].unsqueeze(1).to(p_class.dtype)

        combined = torch.cat((p_class, sex, col_2, col_3), dim=1)
        return self.stack(combined)

In [None]:
def train(dataloader, model, loss_fn, optimizer):
    """Run one optimisation pass over ``dataloader``.

    Puts ``model`` in training mode, then for every ``(inputs, targets)``
    batch computes the loss, backpropagates, applies one optimizer step and
    clears the gradients. The scalar loss of each batch is printed.

    Args:
        dataloader: iterable yielding ``(inputs, targets)`` pairs.
        model: module called as ``model(inputs)``.
        loss_fn: callable invoked as ``loss_fn(predictions, targets)``.
        optimizer: optimizer bound to the model's parameters.
    """
    model.train()
    for inputs, targets in dataloader:
        batch_loss = loss_fn(model(inputs), targets)

        # Gradients are cleared after the step, so each batch starts clean.
        batch_loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        print(batch_loss.item())

In [None]:
def test(dataloader, model):
    size = len(dataloader.dataset)
    model.eval()
    correct = 0
    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            correct += (pred.argmax(1) == y).to(torch.int).sum().item()
    print(correct, size)
    correct /= size
    print(f"Accuracy: {(100*correct):>0.1f}%\n")

In [None]:
# Seed PyTorch's global RNG before building the model so that the initial
# weights (and any later draws from the same RNG, such as DataLoader
# shuffling) are the same on every Restart & Run All.
torch.manual_seed(0)
model = Model()

In [None]:
# Cross-entropy over the model's two logits, optimised with plain SGD.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(params=model.parameters(), lr=1e-3)

# Each epoch is one full pass of `train` over the training loader.
epochs = 100
for t in range(epochs):
    epoch_header = f"Epoch {t+1}\n-------------------------------"
    print(epoch_header)
    train(dataloader=dl, model=model, loss_fn=loss_fn, optimizer=optimizer)
print("Done!")


In [None]:
# NOTE: `dl` wraps train.csv, so this reports accuracy on the same rows the
# model was trained on -- it is not a held-out estimate.
test(dl, model)

In [None]:
# Build the same four-column input the model was trained on and predict
# `Survived` for every row of test.csv.
test_df = pd.read_csv("test.csv")
features = ["Pclass", "Sex", "age_normalized", "age_present"]

test_df["Sex"] = test_df["Sex"].map({"male": 0, "female": 1}).astype(int)
test_df["Pclass"] = test_df["Pclass"] - 1

# Model.forward reads columns 2 and 3 as well, so the age features must be
# present. Reuse the training statistics held by `ds` (imputation value and
# scaling) so test rows are transformed exactly like the training rows.
train_age_mean = ds.df["Age"].mean()
train_age_std = ds.df["Age"].std()
test_df["age_present"] = test_df["Age"].notna().astype(float)
test_df["age_normalized"] = (test_df["Age"].fillna(train_age_mean) - train_age_mean) / train_age_std

# float32 matches the dtype of the model's weights.
X_test = torch.tensor(test_df[features].to_numpy(dtype="float32"))

# Inference only: eval mode, no gradient tracking.
model.eval()
with torch.no_grad():
    y_test = model(X_test).argmax(1)

# Store plain integers in the frame rather than a tensor object.
test_df["Survived"] = y_test.numpy()

In [None]:
# Write only the two submission columns, without the index.
test_df.to_csv("submission_neuralnet.csv", columns=["PassengerId", "Survived"], index=False)

# Show a short preview of what was written instead of the whole frame; a bare
# expression is only displayed when it is the last statement of the cell.
test_df[["PassengerId", "Survived"]].head()