In [10]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from sklearn.preprocessing import normalize
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch.nn.functional as F
from torch.utils.data import Dataset
import copy

device = torch.device('cuda')

In [2]:
def pclass_to_one_hot(x):
    hot = np.zeros((len(x), 3))
    for i, pclass in enumerate(x):
        hot[i,pclass-1] = 1
    return hot

def sex_to_isfemale(x):
    is_female = np.zeros(len(x))
    for i, sex in enumerate(x):
        if sex == "female":
            is_female[i] = 1
    return is_female

def embarked_to_one_hot(x):
    hot = np.zeros((len(x), 4))
    for i, embarked in enumerate(x):
        if embarked == 'S':
            embarked = 0
        elif embarked == 'C':
            embarked = 1
        elif embarked == 'Q':
            embarked = 2
        else:
            embarked = 3
        hot[i,embarked] = 1.
    return hot

def age_to_one_hot(x):
    hot = np.zeros((len(x), 4))
    
    for i, age in enumerate(x):
        index = -1
        if np.isnan(age):
            index = 0
        elif age < 7:
            index = 1
        elif age >= 7 and age < 64:
            index = 2
        elif age >= 64:
            index = 3
        else:
            print("age_to_one_hot encountered eroneous value.")
        hot[i,index] = 1.0

    return hot

def normalize_fare(x):
    np_x = np.array(x)
    mean = np_x.mean()
    np_x = np.nan_to_num(np_x, mean)
    np_x = np.log(np_x + 1)
    return (np_x / np_x.max()).reshape((-1,1))

def preprocess_df(df):
    np_survived = np.array(df["Survived"]).reshape((-1,1))
    np_pclass = pclass_to_one_hot(df['Pclass'])
    np_is_female = sex_to_isfemale(df['Sex']).reshape((-1,1))
    np_embarked = embarked_to_one_hot(df['Embarked'])
    #np_age = (np.array(df['Age']) / 80).reshape(-1,1) # Age as float
    np_age = age_to_one_hot(df['Age'])  # Age as one-hot [under_7, 7_or_older, unknown]
    np_fare = normalize_fare(df['Fare'])
    
    return np.hstack([np_survived, np_pclass, np_is_female, np_embarked, np_age, np_fare])



df = pd.read_csv('data/train.csv') 
df = df.loc[:, ["Survived", "Pclass", "Sex", "Age", "Fare", "Embarked"]]


val_split = 0.8
df.sample(frac=1).reset_index(drop=True)
train_df = df.iloc[:int(val_split * len(df))]
val_df = df.iloc[int(val_split * len(df)):]

# Balance train_df
train_df = train_df.groupby('Survived')
train_df = pd.DataFrame(train_df.apply(lambda x: x.sample(train_df.size().min()).reset_index(drop=True)))

train_np = preprocess_df(train_df)
val_np = preprocess_df(val_df)

batch_size = 4

class TitanicDataset(Dataset):
    def __init__(self, xy):
        self.x = torch.tensor(xy[:,1:], dtype=torch.float32)
        self.y = torch.tensor(xy[:,0], dtype=torch.int64)
        self.n_samples = len(self.x)
    
    def __getitem__(self, index):
        return self.x[index], self.y[index]
    
    def __len__(self):
        return self.n_samples

# val_split_loc = 0.8
# np.random.shuffle(numpy)

# train_dataset = TitanicDataset(numpy[:int(val_split_loc*len(numpy)),:])
# val_dataset = TitanicDataset(numpy[int(val_split_loc*len(numpy)):,:])
train_dataset = TitanicDataset(train_np)
val_dataset = TitanicDataset(val_np)

train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset, batch_size=batch_size)


samples, labels = iter(train_loader).next()
print(samples.shape, labels.shape)

torch.Size([4, 13]) torch.Size([4])


In [16]:
input_size = 13
hidden_size = 4
num_classes = 2

class NeuralNet(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(NeuralNet, self).__init__()
        self.l1 = nn.Linear(input_size, hidden_size)
        self.l2 = nn.Linear(hidden_size, num_classes)
        
    def forward(self, x):
        out = torch.tanh(self.l1(x))
        out = self.l2(out)
        return out
model = NeuralNet(input_size, hidden_size, num_classes).to(device)
    
    
# class NeuralNet(nn.Module):
#     def __init__(self, input_size, num_classes):
#         super(NeuralNet, self).__init__()
#         self.l1 = nn.Linear(input_size, num_classes)
        
#     def forward(self, x):
#         return self.l1(x)
    
# model = NeuralNet(input_size, num_classes).to(device)

In [17]:
learning_rate = 0.001
criterion = nn.CrossEntropyLoss()
optimiser = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-4)
#optimiser = torch.optim.SGD(model.parameters(), lr=learning_rate, weight_decay=1e-5)

In [18]:
def evaluate_validation(model):
    with torch.no_grad():
        n_correct = 0
        n_samples = 0
        for inputs, labels in val_loader:
            inputs = inputs.reshape(-1,input_size).to(device)
            labels = labels.to(device)

            outputs = model(inputs)
            _, predictions = torch.max(outputs, 1)
            n_samples += labels.shape[0]
            n_correct += (predictions == labels).sum().item()
        acc = 100.0 * (n_correct/n_samples)
        return acc
        
    

num_epochs = 50

n_total_steps = len(train_loader)

#opt_state_dict = model.state_dict().copy()
opt_model = copy.deepcopy(model)

max_val_acc = 0.
for epoch in range(num_epochs):
    print('----------------------')
    for i, (inputs, labels) in enumerate(train_loader):
        inputs = inputs.reshape(-1, input_size).to(device)
        labels = labels.to(device)
        
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        
        loss.backward()
        optimiser.step()
        optimiser.zero_grad()
        
        if (i+1) % 20 == 0:
            print(f'epoch {epoch+1}/{num_epochs}, step {i+1}/{n_total_steps}, loss = {loss.item():.4f}')
            acc = evaluate_validation(model)
            print(f'Val_Acc = {acc:.4f}\n')
            if acc > max_val_acc:
                max_val_acc = acc
                opt_model = copy.deepcopy(model)
                

                
print(f'max_val_acc: {max_val_acc:.4f}')
final_val_acc = evaluate_validation(opt_model)
print(f'final_val_acc: {final_val_acc:.4f}')

----------------------
epoch 1/50, step 20/139, loss = 0.7173
Val_Acc = 41.3408

epoch 1/50, step 40/139, loss = 0.6383
Val_Acc = 64.2458

epoch 1/50, step 60/139, loss = 0.5934
Val_Acc = 64.2458

epoch 1/50, step 80/139, loss = 0.8718
Val_Acc = 64.2458

epoch 1/50, step 100/139, loss = 0.7602
Val_Acc = 64.2458

epoch 1/50, step 120/139, loss = 0.7069
Val_Acc = 76.5363

----------------------
epoch 2/50, step 20/139, loss = 0.6939
Val_Acc = 75.9777

epoch 2/50, step 40/139, loss = 0.6419
Val_Acc = 77.6536

epoch 2/50, step 60/139, loss = 0.6141
Val_Acc = 64.8045

epoch 2/50, step 80/139, loss = 0.8388
Val_Acc = 64.2458

epoch 2/50, step 100/139, loss = 0.7369
Val_Acc = 70.3911

epoch 2/50, step 120/139, loss = 0.6781
Val_Acc = 82.6816

----------------------
epoch 3/50, step 20/139, loss = 0.6795
Val_Acc = 77.0950

epoch 3/50, step 40/139, loss = 0.6369
Val_Acc = 81.0056

epoch 3/50, step 60/139, loss = 0.6252
Val_Acc = 77.6536

epoch 3/50, step 80/139, loss = 0.8275
Val_Acc = 72.0670


epoch 22/50, step 120/139, loss = 0.3865
Val_Acc = 82.6816

----------------------
epoch 23/50, step 20/139, loss = 0.1798
Val_Acc = 81.5642

epoch 23/50, step 40/139, loss = 0.2448
Val_Acc = 81.5642

epoch 23/50, step 60/139, loss = 0.9125
Val_Acc = 81.5642

epoch 23/50, step 80/139, loss = 0.8458
Val_Acc = 82.1229

epoch 23/50, step 100/139, loss = 0.6038
Val_Acc = 81.5642

epoch 23/50, step 120/139, loss = 0.3870
Val_Acc = 82.6816

----------------------
epoch 24/50, step 20/139, loss = 0.1788
Val_Acc = 81.5642

epoch 24/50, step 40/139, loss = 0.2438
Val_Acc = 81.5642

epoch 24/50, step 60/139, loss = 0.9158
Val_Acc = 81.5642

epoch 24/50, step 80/139, loss = 0.8448
Val_Acc = 82.1229

epoch 24/50, step 100/139, loss = 0.6046
Val_Acc = 81.5642

epoch 24/50, step 120/139, loss = 0.3874
Val_Acc = 82.6816

----------------------
epoch 25/50, step 20/139, loss = 0.1780
Val_Acc = 81.5642

epoch 25/50, step 40/139, loss = 0.2429
Val_Acc = 81.5642

epoch 25/50, step 60/139, loss = 0.9188
V

epoch 44/50, step 80/139, loss = 0.8261
Val_Acc = 81.5642

epoch 44/50, step 100/139, loss = 0.6022
Val_Acc = 82.1229

epoch 44/50, step 120/139, loss = 0.3954
Val_Acc = 81.5642

----------------------
epoch 45/50, step 20/139, loss = 0.1827
Val_Acc = 81.0056

epoch 45/50, step 40/139, loss = 0.2333
Val_Acc = 81.5642

epoch 45/50, step 60/139, loss = 0.9618
Val_Acc = 81.0056

epoch 45/50, step 80/139, loss = 0.8249
Val_Acc = 81.5642

epoch 45/50, step 100/139, loss = 0.6015
Val_Acc = 82.1229

epoch 45/50, step 120/139, loss = 0.3958
Val_Acc = 81.5642

----------------------
epoch 46/50, step 20/139, loss = 0.1835
Val_Acc = 80.4469

epoch 46/50, step 40/139, loss = 0.2329
Val_Acc = 81.5642

epoch 46/50, step 60/139, loss = 0.9639
Val_Acc = 81.0056

epoch 46/50, step 80/139, loss = 0.8236
Val_Acc = 81.5642

epoch 46/50, step 100/139, loss = 0.6007
Val_Acc = 82.1229

epoch 46/50, step 120/139, loss = 0.3964
Val_Acc = 81.5642

----------------------
epoch 47/50, step 20/139, loss = 0.1844


In [20]:
torch.save(opt_model, 'PATH')

In [114]:
# def pclass_to_one_hot(x):
#     hot = np.zeros((len(x), 3))
#     for i, pclass in enumerate(x):
#         hot[i,pclass-1] = 1
#     return hot

# def sex_to_isfemale(x):
#     is_female = np.zeros(len(x))
#     for i, sex in enumerate(x):
#         if sex == "female":
#             is_female[i] = 1
#     return is_female

# def embarked_to_one_hot(x):
#     hot = np.zeros((len(x), 4))
#     for i, embarked in enumerate(x):
#         if embarked == 'S':
#             embarked = 0
#         elif embarked == 'C':
#             embarked = 1
#         elif embarked == 'Q':
#             embarked = 2
#         else:
#             embarked = 3
#         hot[i,embarked] = 1
#     return hot

def preprocess_test_df(df):
    np_pclass = pclass_to_one_hot(df['Pclass'])
    np_is_female = sex_to_isfemale(df['Sex']).reshape((-1,1))
    np_embarked = embarked_to_one_hot(df['Embarked'])
    #np_age = (np.array(df['Age']) / 80).reshape(-1,1) # Age as float
    np_age = age_to_one_hot(df['Age'])  # Age as one-hot [under_7, 7_or_older, unknown]
    np_fare = normalize_fare(df['Fare'])
    
    return np.hstack([np_pclass, np_is_female, np_embarked, np_age, np_fare])



test_df = pd.read_csv('data/test.csv') 
test_df = test_df.loc[:, ["Pclass", "Sex", "Age", "Fare", "Embarked"]]



test_np = preprocess_test_df(test_df)



_, preds = torch.max(model(torch.tensor(test_np, dtype=torch.float32).to(device)),1)
for pred in preds:
    print(pred.item())

0
1
0
0
1
0
1
0
1
0
0
1
1
0
1
1
0
0
1
1
1
0
1
1
1
0
1
0
1
0
0
0
1
1
1
0
1
1
0
0
0
1
0
1
1
0
1
0
1
1
1
0
1
1
0
0
0
0
0
1
0
0
0
1
1
1
1
1
1
1
1
0
1
1
1
1
0
1
0
1
1
1
1
0
1
0
1
1
1
1
1
0
1
0
1
0
1
0
1
0
1
0
0
0
1
0
0
0
0
0
0
1
1
1
1
0
0
1
1
1
1
0
1
0
0
1
0
1
0
0
0
1
1
0
0
0
0
0
1
0
1
1
1
0
1
0
1
0
1
0
1
0
0
1
0
0
1
1
1
1
1
0
1
0
0
1
1
0
1
1
0
0
0
0
0
1
1
1
1
1
0
1
1
0
1
0
1
0
1
0
0
1
0
0
0
0
1
1
0
1
1
1
1
1
0
1
1
0
1
0
0
0
0
1
1
1
1
1
1
0
1
0
1
0
1
1
0
1
0
0
0
1
0
0
1
0
1
0
1
1
1
1
1
0
0
1
1
0
1
1
1
0
1
0
0
0
0
0
1
0
0
0
1
1
0
0
1
0
1
0
1
0
1
1
0
1
0
0
0
0
1
1
1
1
1
0
0
1
0
0
1
1
0
1
0
0
1
0
1
0
0
0
0
0
1
1
1
1
1
1
0
0
0
1
1
1
1
0
0
0
0
0
0
1
1
0
1
1
0
0
1
1
0
1
0
1
0
0
0
0
0
0
0
1
0
1
0
1
0
1
1
0
0
0
1
1
1
0
0
1
0
1
1
0
1
1
0
1
1
0
0
1
0
0
1
1
1
0
1
1
0
0
1
1
0
1
0
0
0
1
1
1
0
0
0
1
0
1
0
0
1
0
1
1
1
0
0
1
1
1
1
1
1
0
1
0
0
0


In [45]:
print(numpy)

[[0.         0.         0.         ... 0.         0.         0.25828491]
 [1.         0.         0.         ... 0.         0.         0.34839018]
 [1.         0.         0.         ... 0.         0.         0.33600814]
 ...
 [0.         0.         0.         ... 0.         0.         0.34801082]
 [0.         1.         0.         ... 0.         0.         0.63575497]
 [0.         0.         0.         ... 0.         0.         0.33763919]]
