In [43]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import os

from sklearn.preprocessing import LabelEncoder 

import torch 
import torch.nn as nn 
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F


In [44]:
train = pd.read_csv("/home/hasan/Data Set/titanic/train.csv")
test = pd.read_csv("/home/hasan/Data Set/titanic/test.csv")
sub = pd.read_csv("/home/hasan/Data Set/titanic/gender_submission.csv")

In [45]:
train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [46]:
test.head(3)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q


In [47]:
print("Shape of train {} Shape of test {}".format(train.shape, test.shape))

Shape of train (891, 12) Shape of test (418, 11)


In [48]:
train_test = pd.concat([train, test], sort=False)
train_test.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [49]:
def preprocess(df, cat_cols):
    df = df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
  
    for cat_col in cat_cols:
        if cat_col in ['Embarked']:
            df[cat_col] = LabelEncoder().fit_transform(df[cat_col].astype(str))
        else:
            df[cat_col] = LabelEncoder().fit_transform(df[cat_col])
  
    df = df.fillna(df.mean())
    return df

In [50]:
cat_cols = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']
train_test = preprocess(train_test, cat_cols)
train_test.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0.0,2,1,22.0,1,0,7.25,2
1,1.0,0,0,38.0,1,0,71.2833,0
2,1.0,2,0,26.0,0,0,7.925,2
3,1.0,0,0,35.0,1,0,53.1,2
4,0.0,2,1,35.0,0,0,8.05,2


In [51]:
train_df = train_test.head(train.shape[0])
train_df.shape

(891, 8)

In [52]:
class TabularDataset(Dataset):
    
    def __init__(self, df, categorical_columns, output_column=None):
           
        super().__init__()
        self.len = df.shape[0]

        self.categorical_columns = categorical_columns
        self.continous_columns = [col for col in df.columns if col not in self.categorical_columns + [output_column]]

        if self.continous_columns:
            self.cont_X = df[self.continous_columns].astype(np.float32).values
        else:
            self.cont_X = np.zeros((self.len, 1))

        if self.categorical_columns:
            self.cat_X = df[self.categorical_columns].astype(np.int64).values
        else:
            self.cat_X = np.zeros((self.len, 1))

        if output_column != None:
            self.has_label = True
            self.label = df[output_column].astype(np.float32).values.reshape(-1, 1)
        else:
            self.has_label = False

    def __len__(self):
        return self.len
  
    def __getitem__(self, index):
        if self.has_label:
            return [self.label[index], self.cont_X[index], self.cat_X[index]]
        else:
            return [self.cont_X[index], self.cat_X[index]]
        

In [53]:
train_ds = TabularDataset(train_df, cat_cols, 'Survived')
train_dl = DataLoader(train_ds, 64, shuffle=True)

In [54]:
len(train_ds)

891

# Model

In [55]:
class TitanicNet(nn.Module):
    def __init__(self, emb_dims, n_cont, lin_layer_sizes, output_size):
        super().__init__()
        self.emb_layers = nn.ModuleList([nn.Embedding(x, y) for x, y in emb_dims])

        self.n_embs = sum([y for x, y in emb_dims])
        self.n_cont = n_cont

        # Linear Layers
        first_lin_layer = nn.Linear(self.n_embs + self.n_cont, lin_layer_sizes[0])

        self.lin_layers = nn.ModuleList(
            [first_lin_layer] + 
            [nn.Linear(lin_layer_sizes[i], lin_layer_sizes[i + 1]) for i in range(len(lin_layer_sizes) - 1)]
        )

    #     for lin_layer in self.lin_layers:
    #       nn.init.kaiming_normal_(lin_layer.weight.data)

        # Output Layer
        self.output_layer = nn.Linear(lin_layer_sizes[-1], output_size)
        nn.init.kaiming_normal_(self.output_layer.weight.data)

        # Batch Norm Layers
        self.first_bn_layer = nn.BatchNorm1d(self.n_cont)
        self.bn_layers = nn.ModuleList([nn.BatchNorm1d(size) for size in lin_layer_sizes])

    def forward(self, cont_data, cat_data):
        if self.n_embs != 0:
            x = [emb_layer(cat_data[:, i]) for i, emb_layer in enumerate(self.emb_layers)]
            x = torch.cat(x, 1)

        if self.n_cont != 0:
            normalized_cont_data = self.first_bn_layer(cont_data)

            if self.n_embs != 0:
                x = torch.cat([x, normalized_cont_data], 1) 
            else:
                x = cont_data

        for lin_layer, bn_layer in zip(self.lin_layers, self.bn_layers):
            x = torch.relu(lin_layer(x))
            x = bn_layer(x)

        x = self.output_layer(x)
        x = torch.sigmoid(x)
        return x
    

In [56]:
cat_dims = [int(train_test[col].nunique()) for col in cat_cols]
cat_dims


[3, 2, 7, 8, 4]

In [57]:
emb_dims = [(x, min(50, (x + 1) // 2)) for x in cat_dims]
emb_dims


[(3, 2), (2, 1), (7, 4), (8, 4), (4, 2)]

In [58]:
torch.manual_seed(2)

<torch._C.Generator at 0x7f7a2146d9b0>

In [59]:
model = TitanicNet(emb_dims, n_cont=2, lin_layer_sizes=[50, 100, 50], output_size=1)
optimizer = torch.optim.Adam(model.parameters(), lr=0.003)
no_of_epochs = 10
criterion = nn.BCELoss()

In [60]:
model = TitanicNet(emb_dims, n_cont=2, lin_layer_sizes=[50, 100, 50], output_size=1)
optimizer = torch.optim.Adam(model.parameters(), lr=0.003)
no_of_epochs = 10
criterion = nn.BCELoss()

for epoch in range(no_of_epochs):
    epoch_loss = 0
    epoch_accuracy = 0
    i = 0
    for y, cont_x, cat_x in train_dl:
        preds = model(cont_x, cat_x)
        loss = criterion(preds, y)
        epoch_loss += loss
    
        accuracy = ((preds > 0.5).float() == y).float().mean()
        epoch_accuracy += accuracy

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
  
    print("Epoch ", epoch, ", loss: ", epoch_loss.item()/len(train_dl), "accuracy: ", epoch_accuracy.item()/len(train_dl))

Epoch  0 , loss:  0.6176180158342633 accuracy:  0.7024440084184919
Epoch  1 , loss:  0.47217856134687153 accuracy:  0.7886652265276227
Epoch  2 , loss:  0.42706472533089773 accuracy:  0.8192721094403949
Epoch  3 , loss:  0.4166689600263323 accuracy:  0.8143348693847656
Epoch  4 , loss:  0.4038721833910261 accuracy:  0.8123865127563477
Epoch  5 , loss:  0.4123155048915318 accuracy:  0.8107974869864327
Epoch  6 , loss:  0.3864325114658901 accuracy:  0.8360131808689663
Epoch  7 , loss:  0.384969881602696 accuracy:  0.8317380632672992
Epoch  8 , loss:  0.3684449536459787 accuracy:  0.8428987775530133
Epoch  9 , loss:  0.3463811193193708 accuracy:  0.8517327308654785


In [36]:
test_df = all_df.tail(test.shape[0])
test_ds = TabularDataset(test_df, cat_cols, 'Survived') # The label is actually useless. But to keep our code consistent, we leave it here.
test_dl = DataLoader(test_ds, len(test_ds))


In [38]:
with torch.no_grad():
      for _, cont_x, cat_x in test_dl:
            
            preds = model(cont_x, cat_x)
            preds = (preds > 0.5)

In [39]:
preds.flatten().shape


torch.Size([418])

In [40]:
output_df = pd.DataFrame({'PassengerId':test['PassengerId'],'Survived':preds.flatten().numpy()})


In [41]:
output_df.head()

Unnamed: 0,PassengerId,Survived
0,892,False
1,893,False
2,894,False
3,895,False
4,896,False


In [42]:
output_df.to_csv('titanic_preds.csv', index=False)