# Titanic model using PyTorch

- https://www.kaggle.com/michaelabehsera/titanic-solution-a-beginner-s-guide
- https://www.kaggle.com/sashr07/kaggle-titanic-tutorial

## Loading

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("datasets/titanic_train.csv") # we assume we don't have test set.

In [3]:
np.random.seed(10)

## Split

- train set: 70%
- test set: 30%

Remove rows where target is not available

In [4]:
np.random.seed(10)
raw_train_data = df.sample(frac=0.7)
raw_test_data = df.drop(raw_train_data.index) # creates new pd.DataFrame object as always

## Transformation

- category to one-hot encoding
  - use some classes only which are more than 30 rows
- standardize fields
- fill not available values as its mean
- keep each transformation information and reuse them for testing
  - test set should not be analyzed to be fair.

In [5]:
all_column_names = set(df.columns)

df_categories = df.select_dtypes('object').astype('category')
category_column_names = set(df_categories.columns)

number_column_names = all_column_names - category_column_names

print(number_column_names)
print(category_column_names)

{'Passenger_Id', 'Passenger_Fare', 'Age', 'Number_of_Parents_or_Children', 'Number_of_Siblings_or_Spouses', 'Survived'}
{'Name', 'Cabin', 'Ticket_Number', 'Passenger_Class', 'Port_of_Embarkation', 'Sex'}


In [6]:
def get_valid_values(series, min_category_counts=30):
        all_val_counts = series.dropna().value_counts()
        all_val_counts = all_val_counts[all_val_counts >= min_category_counts]
        return all_val_counts.index

class BinaryEncoding():
    """binary-string to one-hot"""
    
    min_category_counts = 30
    
    def __init__(self):
        self.mapping = {} # {field: {class1: 0.0, class2: 1.0}

    def fit(self, df):
        for col_name in df.columns:
            if not pd.api.types.is_string_dtype(df[col_name].dtype):
                continue
            unique_values = get_valid_values(df[col_name], min_category_counts=self.min_category_counts)

            if len(unique_values) != 2:
                continue
            for val in unique_values:
                self.mapping[col_name] = {unique_values[i]: float(i) for i in range(len(unique_values))}

    def transform(self, df):
        for col_name in self.mapping:
            df[col_name] = df[col_name].map(self.mapping[col_name])

class OneHotEncoding():
    """multi-class-string to one-hot"""
    prefix_sep = '!'
    min_category_counts = 30

    def __init__(self):
        self.column_mapping = {} # {field: {class1, class2, class3, ...}}

    def fit(self, df):
        for col_name in df.columns:
            if not pd.api.types.is_string_dtype(df[col_name].dtype):
                continue
            unique_values = get_valid_values(df[col_name], min_category_counts=self.min_category_counts)

            if len(unique_values) <= 2:
                continue
            self.column_mapping[col_name] = set(unique_values)

    def transform(self, df):
        column_names = set(self.column_mapping)
        for col_name in column_names:
            values = self.column_mapping[col_name]
            for val in values:
                new_col_name = "{}{}{}".format(col_name, self.prefix_sep, val)
                df[new_col_name] = (df[col_name] == val).astype('float64')
        df.drop(columns=list(column_names), inplace=True)
    
class Standardization():
    def __init__(self):
        self.mean = None
        self.std = None

    def fit(self, df):
        self.mean = df.mean()
        self.std = df.std()

    def transform(self, df):
        transform_columns = self.mean.index
        input_columns = set(df.columns)
        for col_name in transform_columns:
            if col_name in input_columns:
                df[col_name] = (df[col_name] - self.mean[col_name])/self.std[col_name]

class Imputation():
    def __init__(self):
        self.mean = None

    def fit(self, df):
        self.mean = df.mean()

    def transform(self, df):
        for col_name in df.columns:
            if pd.api.types.is_numeric_dtype(df[col_name].dtype):
                df[col_name].fillna(self.mean[col_name], inplace=True)

class ColumnRemover():
    def __init__(self):
        self.column_names = set()

    def fit(self, df):
        for col_name in df.columns:
            if pd.api.types.is_numeric_dtype(df[col_name].dtype):
                self.column_names.add(col_name)
    def transform(self, df):
        for col_name in df.columns:
            if col_name not in self.column_names:
                df.drop(columns=col_name, inplace=True)
                
class FloatConverter():
    def __init__(self):
        self.column_names = set()

    def fit(self, df):
        for col_name in df.columns:
            if pd.api.types.is_numeric_dtype(df[col_name].dtype):
                self.column_names.add(col_name)
    def transform(self, df):
        for col_name in df.columns:
            if col_name in self.column_names:
                df[col_name] = df[col_name].astype('float64')

class Chain():
    def __init__(self, transforms):
        self.transforms = transforms

    def fit_transform(self, df):
        for tr in self.transforms:
            tr.fit(df)
            tr.transform(df)
    
    def transform(self, df):
        for tr in self.transforms:
            tr.transform(df)

target_column_name = 'Survived'
fields_to_ignore = ["Name", "Passenger_Id"] + [target_column_name]
input_transform_chain = Chain([BinaryEncoding(), OneHotEncoding(), Imputation(), Standardization(), ColumnRemover(), FloatConverter()])
train_data_x = raw_train_data.copy().drop(columns=fields_to_ignore)
input_transform_chain.fit_transform(train_data_x)
train_data_x

Unnamed: 0,Sex,Age,Number_of_Siblings_or_Spouses,Number_of_Parents_or_Children,Passenger_Fare,Passenger_Class!Third,Passenger_Class!Second,Passenger_Class!First,Port_of_Embarkation!Southampton,Port_of_Embarkation!Queenstown,Port_of_Embarkation!Cherbourg
590,-0.724451,0.425540,-0.496711,-0.470389,-0.500695,0.904394,-0.502600,-0.574422,0.618852,-0.298098,-0.490073
131,-0.724451,-0.718107,-0.496711,-0.470389,-0.502181,0.904394,-0.502600,-0.574422,0.618852,-0.298098,-0.490073
628,-0.724451,-0.260648,-0.496711,-0.470389,-0.485420,0.904394,-0.502600,-0.574422,0.618852,-0.298098,-0.490073
195,1.378143,2.179132,-0.496711,-0.470389,2.261676,-1.103940,-0.502600,1.738090,-1.613307,-0.298098,2.037244
230,1.378143,0.425540,0.474912,-0.470389,1.012314,-1.103940,-0.502600,1.738090,0.618852,-0.298098,-0.490073
...,...,...,...,...,...,...,...,...,...,...,...
620,-0.724451,-0.184405,0.474912,-0.470389,-0.355454,0.904394,-0.502600,-0.574422,-1.613307,-0.298098,2.037244
291,1.378143,-0.794350,0.474912,-0.470389,1.163004,-1.103940,-0.502600,1.738090,-1.613307,-0.298098,2.037244
800,-0.724451,0.349297,-0.496711,-0.470389,-0.384272,-1.103940,1.986466,-0.574422,0.618852,-0.298098,-0.490073
787,-0.724451,-1.633025,3.389782,0.773351,-0.064726,0.904394,-0.502600,-0.574422,-1.613307,3.349222,-0.490073


In [7]:
label_transform = FloatConverter()
train_data_y = raw_train_data[[target_column_name]].copy()
label_transform.fit(train_data_y)
label_transform.transform(train_data_y)
train_data_y

Unnamed: 0,Survived
590,0.0
131,0.0
628,0.0
195,1.0
230,1.0
...,...
620,0.0
291,1.0
800,0.0
787,0.0


In [8]:
test_data_x = raw_test_data.copy().drop(columns=fields_to_ignore)
input_transform_chain.transform(test_data_x)
test_data_x

Unnamed: 0,Sex,Age,Number_of_Siblings_or_Spouses,Number_of_Parents_or_Children,Passenger_Fare,Passenger_Class!Third,Passenger_Class!Second,Passenger_Class!First,Port_of_Embarkation!Southampton,Port_of_Embarkation!Queenstown,Port_of_Embarkation!Cherbourg
0,-0.724451,-5.656210e-01,0.474912,-0.470389,-0.498218,0.904394,-0.502600,-0.574422,0.618852,-0.298098,-0.490073
4,-0.724451,4.255397e-01,-0.496711,-0.470389,-0.482364,0.904394,-0.502600,-0.574422,0.618852,-0.298098,-0.490073
5,-0.724451,2.708700e-16,-0.496711,-0.470389,-0.474273,0.904394,-0.502600,-0.574422,-1.613307,3.349222,-0.490073
8,1.378143,-1.844053e-01,-0.496711,2.017092,-0.421263,0.904394,-0.502600,-0.574422,0.618852,-0.298098,-0.490073
9,1.378143,-1.175566e+00,0.474912,-0.470389,-0.045984,-1.103940,1.986466,-0.574422,-1.613307,-0.298098,2.037244
...,...,...,...,...,...,...,...,...,...,...,...
879,1.378143,2.026645e+00,-0.496711,0.773351,1.006038,-1.103940,-0.502600,1.738090,-1.613307,-0.298098,2.037244
881,-0.724451,2.730535e-01,-0.496711,-0.470389,-0.485420,0.904394,-0.502600,-0.574422,0.618852,-0.298098,-0.490073
882,1.378143,-5.656210e-01,-0.496711,-0.470389,-0.433482,0.904394,-0.502600,-0.574422,0.618852,-0.298098,-0.490073
883,-0.724451,-1.081622e-01,-0.496711,-0.470389,-0.433813,-1.103940,1.986466,-0.574422,0.618852,-0.298098,-0.490073


In [9]:
test_data_y = raw_test_data[[target_column_name]].copy()
label_transform.transform(test_data_y)
test_data_y

Unnamed: 0,Survived
0,0.0
4,0.0
5,0.0
8,1.0
9,1.0
...,...
879,1.0
881,0.0
882,0.0
883,0.0


## Training

### Vanilla NumPy

- logistic regression
- numerical differentiation

In [10]:
np.random.seed(10)

x_train = train_data_x.values
y_train = train_data_y.values
x_test = test_data_x.values
y_test = test_data_y.values

def mse(y_pred, y):
    return np.mean((y_pred - y)**2)

def sigmoid(x):
    return 1.0 / (1.0 + np.e**(-x))

class LogisticRegression:
    def __init__(self, n_input_fields):
        n_output_fields = 1
        shape = (n_input_fields + 1, n_output_fields)
        self.weight = np.random.normal(0, (1 / n_input_fields) ** 0.5 , shape)

    def forward(self, x, weight=None):
        if weight is None:
            weight = self.weight
        y_pred = sigmoid(x@weight[1:] + weight[0])
        return y_pred

model = LogisticRegression(x_train.shape[1])
lr = 0.01
epochs = 2000

for epoch in range(epochs):
    diff = np.zeros(model.weight.shape)
    weight = model.weight.copy()
    
    y_pred1 = model.forward(x_train)
    loss1 = mse(y_pred1, y_train)
    
    for wi in range(model.weight.shape[0]):
        h = 0.0000000000001
        weight[wi][0] += h
        y_pred2 = model.forward(x_train, weight)
        loss2 = mse(y_pred2, y_train)
        diff[wi][0] = (loss2 - loss1) / h
        weight[wi][0] = model.weight[wi][0]
    model.weight -= (lr * diff)

y_pred = model.forward(x_test)

print("predicted y values:")
for i in range(10):
    print(i, y_pred[i][0])

y_pred = (y_pred >= 0.5).astype(np.float64)
accuracy_test = np.mean(np.equal(y_pred, y_test))
print("accuracy:", accuracy_test)

print("weights:", model.weight)

predicted y values:
0 0.15736739097296065
1 0.13152104676554174
2 0.20552674158824866
3 0.5679862998019055
4 0.8534556771252904
5 0.1227336254387511
6 0.6186761460372673
7 0.49371394291825715
8 0.6286761413521978
9 0.4275160887060856
accuracy: 0.8089887640449438
weights: [[-0.41616862]
 [ 0.8941803 ]
 [-0.36339054]
 [-0.15325725]
 [ 0.02136428]
 [ 0.1035462 ]
 [-0.3428514 ]
 [ 0.13686198]
 [ 0.39111542]
 [ 0.08442419]
 [ 0.15585319]
 [ 0.17827974]]


### PyTorch - 1

- logistic regression

In [11]:
import torch
from torch.autograd import Variable
from torch.nn import functional as F

torch.manual_seed(10)
np.random.seed(10)

x_train = torch.Tensor(train_data_x.values)
y_train = torch.Tensor(train_data_y.values)
x_test = torch.Tensor(test_data_x.values)
y_test = test_data_y.values

class LogisticRegression(torch.nn.Module):
    def __init__(self, n_input_fields):
        super().__init__()
        n_output_fields = 1
        self.linear = torch.nn.Linear(n_input_fields, n_output_fields)

    def forward(self, x):
        y_pred = F.sigmoid(self.linear(x))
        return y_pred

model = LogisticRegression(x_train.shape[1])

criterion = torch.nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
for epoch in range(2000):
    model.train()
    optimizer.zero_grad()    # Forward pass
    y_pred = model(x_train)    # Compute Loss
    loss = criterion(y_pred, y_train)    # Backward pass
    loss.backward()
    optimizer.step()
    
y_pred = model(x_test)

print("predicted y values:")
for i in range(10):
    print(i, y_pred.data[i][0])

y_pred = y_pred.detach().numpy()
y_pred = (y_pred >= 0.5).astype(np.float64)
accuracy_test = np.mean(np.equal(y_pred, y_test))
print("accuracy:", accuracy_test)

print("weights:")
for name, param in model.named_parameters():
    if param.requires_grad:
        print (name, param.data)



predicted y values:
0 tensor(0.1504)
1 tensor(0.1285)
2 tensor(0.2032)
3 tensor(0.5333)
4 tensor(0.8248)
5 tensor(0.1193)
6 tensor(0.6188)
7 tensor(0.4734)
8 tensor(0.6106)
9 tensor(0.4168)
accuracy: 0.8127340823970037
weights:
linear.weight tensor([[ 0.8643, -0.3107, -0.1267,  0.0142,  0.0979, -0.2700,  0.1999,  0.4555,
         -0.0509,  0.0828,  0.0301]])
linear.bias tensor([-0.4880])


### PyTorch - 2

- neural net

In [12]:
import torch
from torch.autograd import Variable
from torch.nn import functional as F

torch.manual_seed(10)
np.random.seed(10)

x_train = torch.Tensor(train_data_x.values)
y_train = torch.Tensor(train_data_y.values)
x_test = torch.Tensor(test_data_x.values)
y_test = test_data_y.values

class MyNeuralNet(torch.nn.Module):
    def __init__(self, input_size):
        super().__init__()
        output_size = 1
        hidden1_size = 10
        hidden2_size = 10
        self.linear1 = torch.nn.Linear(input_size, hidden1_size)
        self.linear2 = torch.nn.Linear(hidden1_size, hidden2_size)
        self.linear3 = torch.nn.Linear(hidden2_size, output_size)

    def forward(self, x):
        hidden1 = F.relu(self.linear1(x))
        hidden2 = F.relu(self.linear2(hidden1))
        y_pred = F.sigmoid(self.linear3(hidden2))
        return y_pred

model = MyNeuralNet(x_train.shape[1])
criterion = torch.nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
for epoch in range(4000):
    model.train()
    optimizer.zero_grad()    # Forward pass
    y_pred = model(x_train)    # Compute Loss
    loss = criterion(y_pred, y_train)    # Backward pass
    loss.backward()
    optimizer.step()
    
y_pred = model(x_test)

print("predicted y values:")
for i in range(10):
    print(i, y_pred.data[i][0])

y_pred = y_pred.detach().numpy()
y_pred = (y_pred >= 0.5).astype(np.float64)
accuracy_test = np.mean(np.equal(y_pred, y_test))
print("accuracy:", accuracy_test)

print("weights:")
for name, param in model.named_parameters():
    if param.requires_grad:
        print (name, param.data)

predicted y values:
0 tensor(0.2084)
1 tensor(0.2015)
2 tensor(0.2219)
3 tensor(0.4938)
4 tensor(0.8356)
5 tensor(0.1982)
6 tensor(0.6546)
7 tensor(0.4009)
8 tensor(0.6275)
9 tensor(0.3369)
accuracy: 0.8089887640449438
weights:
linear1.weight tensor([[ 0.4212, -0.1824, -0.1009,  0.0351, -0.1524, -0.2346,  0.2414,  0.3762,
          0.0645, -0.1064,  0.0372],
        [ 0.0073,  0.1165,  0.0336,  0.0992,  0.3135, -0.1490, -0.0100,  0.0962,
          0.0121, -0.0452, -0.2729],
        [-0.4361, -0.0800, -0.3159, -0.3770, -0.2779, -0.2737, -0.2005,  0.0576,
          0.2095,  0.1355,  0.1822],
        [ 0.0468,  0.1730, -0.1771,  0.3046, -0.0351,  0.3026,  0.0604,  0.2169,
          0.2111, -0.0450, -0.0669],
        [ 0.3635, -0.0795, -0.0146,  0.0645, -0.2166, -0.1712, -0.1202,  0.3642,
         -0.2076,  0.0509, -0.0856],
        [ 0.1963, -0.0215, -0.0267,  0.1975,  0.2516, -0.2148, -0.2818,  0.0121,
          0.0771,  0.0689,  0.2996],
        [ 0.5542, -0.2863, -0.2288,  0.0971,  0.3