In [428]:
import pandas as pd
import torch.nn as nn
import torch
from torch.utils.data import Dataset, DataLoader

In [498]:
df = pd.read_csv("train.csv")
features = ["Pclass", "Sex", "Age", "Survived"]

train_df = df[features]
train_df.loc[:, "Sex"].replace({"male": 0, "female": 1}).astype(int)
train_df.loc[:, "Pclass"] -= 1

train_df.loc[:, "age_present"] = False
train_df.loc[train_df["Age"].isnull() == False, "age_present"] = True
train_df.loc[train_df["age_present"] == False, "Age"] = train_df.loc[train_df["age_present"] == True, "Age"].mean()
train_df.loc[:, "age_present"] = train_df["age_present"].astype(float)

train_df.loc[:, "age_normalized"] = (train_df.loc[:, "Age"] - train_df["Age"].mean()) / train_df["Age"].std()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df.loc[:, "age_present"] = False
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df.loc[:, "age_normalized"] = (train_df.loc[:, "Age"] - train_df["Age"].mean()) / train_df["Age"].std()


Unnamed: 0,Pclass,Sex,Age,Survived,age_present,age_normalized
0,2,male,22.000000,0,1.0,-0.592148
1,0,female,38.000000,1,1.0,0.638430
2,2,female,26.000000,1,1.0,-0.284503
3,0,female,35.000000,1,1.0,0.407697
4,2,male,35.000000,0,1.0,0.407697
...,...,...,...,...,...,...
886,1,male,27.000000,0,1.0,-0.207592
887,0,female,19.000000,1,1.0,-0.822881
888,2,female,29.699118,0,0.0,0.000000
889,0,male,26.000000,1,1.0,-0.284503


In [516]:
class TitanicDataset(Dataset):
    def __init__(self, path):
        df = pd.read_csv(path)
        features = ["Pclass", "Sex", "Survived", "Age"]

        df = df[features]
        df["Sex"] = df["Sex"].replace({"male": 0, "female": 1}).astype(int)
        df.loc[:, "Pclass"] -= 1

        df.loc[:, "age_present"] = False
        df.loc[df["Age"].isnull() == False, "age_present"] = True
        df.loc[df["age_present"] == False, "Age"] = df.loc[df["age_present"] == True, "Age"].mean()
        df.loc[:, "age_present"] = df["age_present"].astype(float)

        df.loc[:, "age_normalized"] = (df.loc[:, "Age"] - df["Age"].mean()) / df["Age"].std()

        self.df = df

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, index):
        row = self.df.iloc[index]
        print(row[["Pclass", "Sex", "age_normalized", "age_present"]].values)
        return torch.tensor(row[["Pclass", "Sex", "age_normalized", "age_present"]].values), torch.tensor(row["Survived"])

In [514]:
ds = TitanicDataset("train.csv")
dl = DataLoader(ds, batch_size=64, shuffle=True)

In [508]:
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.p_class_embed = nn.Embedding(num_embeddings=3, embedding_dim=50)
        self.sex_embed = nn.Embedding(num_embeddings=2, embedding_dim=50)
        self.stack = nn.Sequential(
            nn.Linear(100, 50),
            nn.ReLU(),
            nn.Linear(50, 2)
        )

    def forward(self, x):
        x = torch.concat((self.p_class_embed(x[:, 0]), self.sex_embed(x[:, 1]), x[:, 2], x[:, 3]), dim=1)
        logits = self.stack(x)
        return logits

In [509]:
def train(dataloader, model, loss_fn, optimizer):
    model.train()
    for _, (X, y) in enumerate(dataloader):
        pred = model(X)
        loss = loss_fn(pred, y)

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        print(loss.item())

In [510]:
def test(dataloader, model):
    size = len(dataloader.dataset)
    model.eval()
    correct = 0
    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            correct += (pred.argmax(1) == y).to(torch.int).sum().item()
    print(correct, size)
    correct /= size
    print(f"Accuracy: {(100*correct):>0.1f}%\n")

In [511]:
model = Model()

In [515]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

epochs = 20
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(dl, model, loss_fn, optimizer)
print("Done!")


Epoch 1
-------------------------------
Pclass             2.000000
Sex                1.000000
Survived           1.000000
Age               19.000000
age_present        1.000000
age_normalized    -0.822881
Name: 192, dtype: float64
Pclass             2.000000
Sex                0.000000
Survived           0.000000
Age               44.000000
age_present        1.000000
age_normalized     1.099897
Name: 603, dtype: float64
Pclass             2.000000
Sex                0.000000
Survived           0.000000
Age               25.000000
age_present        1.000000
age_normalized    -0.361415
Name: 703, dtype: float64
Pclass             1.000000
Sex                1.000000
Survived           1.000000
Age               29.699118
age_present        0.000000
age_normalized     0.000000
Name: 303, dtype: float64
Pclass             2.00000
Sex                0.00000
Survived           1.00000
Age               20.00000
age_present        1.00000
age_normalized    -0.74597
Name: 622, dtype: floa

RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.DoubleTensor instead (while checking arguments for embedding)

In [464]:
test(dl, model)

701 891
Accuracy: 78.7%



In [425]:
test_df = pd.read_csv("test.csv")
features = ["Pclass", "Sex"]
# test_df = test_df[features]
test_df["Sex"] = test_df["Sex"].replace({"male": 0, "female": 1}).astype(int)
test_df.loc[:, "Pclass"] -= 1
X_test = torch.tensor(test_df[["Pclass", "Sex"]].values)
y_test = model(X_test).argmax(1)

test_df["Survived"] = y_test

In [427]:
test_df
test_df.to_csv("submission_neuralnet.csv", columns=["PassengerId", "Survived"], index=False)