# Note on titanic binary classification project

**Objective**: Here I attempt to build a simple neural network from scratch

** Step 1 **: Load libraries

In [222]:
#importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
import keras as k

from sklearn import preprocessing

%matplotlib inline

** Step 2 **: Need to load data set from a csv file. Numpy's genfromtxt can be used, however, because ',' is present in string, use pandas's read_csv instead.

In [223]:
#load data sets
train_set = pd.read_csv('data/train.csv', delimiter=',', quotechar='"')
test_set = pd.read_csv('data/test.csv', delimiter=',', quotechar='"')

#have a peep at the training data
train_set.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [224]:
#data types in the training set
#train_set.dtypes

#missing data in 'Cabin'
print('Missing data in ''Cabin'' =', train_set['Cabin'].isnull().sum())

#total missing data
print('Missing data in each feature in training set:\n', train_set.isnull().sum())

#replacing missing values in 'Age' with median age
median_age = train_set['Age'].median()
train_set['Age'].fillna(median_age, inplace=True)
print('Median age =', median_age)
#print(train_set.isnull().sum())

#check missing values in test set
print('Missing data in each feature in test set:\n', test_set.isnull().sum())
#also replace mssing values in 'Age' with median age
median_age = test_set['Age'].median()
test_set['Age'].fillna(median_age, inplace=True)
print('Median age in test set =', median_age)

Missing data in Cabin = 687
Missing data in each feature in training set:
 PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
Median age = 28.0
Missing data in each feature in test set:
 PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64
Median age in test set = 27.0


The shape of the training set is correctly (891, 12), and that of the test set (418, 11). Note that the test set has one fewer input feature because we don't know the survival of each passenger.

Next, determine the number of training examples n_train to be 891 and  number of test examples n_test 418.

In [225]:
n_train = train_set.shape[0]
n_test = test_set.shape[0]
n_x = train_set.shape[1]-2 #passenger ID and survived

print("Number of training examples: n_train = " + str(n_train))
print("Number of testing examples: n_test = " + str(n_test))
print("Number of features for each example: n_x = " + str(n_x))

Number of training examples: n_train = 891
Number of testing examples: n_test = 418
Number of features for each example: n_x = 10


In [226]:
#Extract data from train_set
train_features = [train_set['Pclass'], train_set['Sex'], train_set['Age'], train_set['SibSp'], train_set['Parch'], train_set['Fare']]
X1 = pd.concat(train_features, axis=1)
#X1.head()
n_features = X1.shape[1]
print('Number of features to train = ', n_features)

#Extract data into X_train and Y_train, now numpy arrays -- not a good way!!!!
X_train = X1.iloc[:, :].values #matrix dimensions (n_train, n_x-4) excluded 'Name', 'Ticket', 'Cabin', 'Embarked'
Y_train = train_set.iloc[:, 1].values #vector dimension (n_train)
gender = preprocessing.LabelEncoder()
X_train[:, 1] = gender.fit_transform(X_train[:, 1])
print('X_train', X_train[:, :])

#Extract data from train_set
test_features = [test_set['Pclass'], test_set['Sex'], test_set['Age'], test_set['SibSp'], test_set['Parch'], test_set['Fare']]
X2 = pd.concat(test_features, axis=1)
X2.head()

X_test = X2.iloc[:, :].values
X_test[:, 1] = gender.fit_transform(X_test[:, 1])
print('X_test', X_test[0:5, :])

Number of features to train =  6
X_train [[3 1 22.0 1 0 7.25]
 [1 0 38.0 1 0 71.2833]
 [3 0 26.0 0 0 7.925]
 ...
 [3 0 28.0 1 2 23.45]
 [1 1 26.0 0 0 30.0]
 [3 1 32.0 0 0 7.75]]
X_test [[3 1 34.5 0 0 7.8292]
 [3 0 47.0 1 0 7.0]
 [2 1 62.0 0 0 9.6875]
 [3 1 27.0 0 0 8.6625]
 [3 0 22.0 1 1 12.2875]]


In [227]:
#normalizing inputs - appears to me so far using mean and variance 
sc = preprocessing.StandardScaler()
X_train = sc.fit_transform(X_train)
X_trainT = np.transpose(X_train)

X_test = sc.fit_transform(X_test)
X_testT = np.transpose(X_test)
print('X_train', X_train[:, :], X_trainT.shape)
print('X_test', X_test[:, :], X_testT.shape)

X_train [[ 0.82737724  0.73769513 -0.56573646  0.43279337 -0.47367361 -0.50244517]
 [-1.56610693 -1.35557354  0.66386103  0.43279337 -0.47367361  0.78684529]
 [ 0.82737724 -1.35557354 -0.25833709 -0.4745452  -0.47367361 -0.48885426]
 ...
 [ 0.82737724 -1.35557354 -0.1046374   0.43279337  2.00893337 -0.17626324]
 [-1.56610693  0.73769513 -0.25833709 -0.4745452  -0.47367361 -0.04438104]
 [ 0.82737724  0.73769513  0.20276197 -0.4745452  -0.47367361 -0.49237783]] (6, 891)
X_test [[ 0.87348191  0.75592895  0.38623105 -0.49947002 -0.4002477  -0.49781052]
 [ 0.87348191 -1.32287566  1.37137004  0.61699237 -0.4002477  -0.51265996]
 [-0.31581919  0.75592895  2.55353683 -0.49947002 -0.4002477  -0.46453181]
 ...
 [ 0.87348191  0.75592895  0.70147553 -0.49947002 -0.4002477  -0.50818292]
 [ 0.87348191  0.75592895 -0.20485235 -0.49947002 -0.4002477  -0.4938564 ]
 [ 0.87348191  0.75592895 -0.20485235  0.61699237  0.61989583 -0.23762123]] (6, 418)




### Strategy

This is a binary classification problem––a simple neural network using logistic regression may work. Let's review the theory.

We have a training set inputs represented by a matrix $X = [x^{(1)}, x^{(2)},\cdots, x^{(m)}]$, where $m$ is the number of training examples, `n_train`. The shape of $X$ is $(n_x, m)$, $n_x$ is the number of input features.

The output label vector for the training set $Y = [y^{(1)}, y^{(2)}, \cdots, y^{(m)}]$ has the dimension $(1, m)$, where $y^{(i)}=0$ or $1$. Our goal is basically to find a function that best describes the relationship between $X$ and $Y$, or best fitting function. 

$\hat{y}$ is an estimate of how likely $y=1$ given $x$, $\hat{y} = P(y=1 | x)$, and given the probability nature of the problem, the sigmoid function is used to map a predicted value into a probability, that is, a number between 0 and 1.

$$\hat{y} ^{(i)} = a^{(i)} = \mathrm{sigmoid}(z^{(i)}) = \sigma(z^{(i)}) = \frac{1}{1 + e^{-z^{(i)}}}$$

where $z^{(i)} = w^T x^{(i)} + b$ for a training example (or data point), $x^{(i)}$. Here $b$ is a bias vector, $a^{(i)}$ is called the activation function, the weight matrix $w$ can be understood as fitting coefficients, which give the "importance" or contribution of each input feature.

It follows that the loss function: $\mathcal{L}(a^{(i)}, y^{(i)}) = -y^{(i)}log(a^{(i)})-(1-y^{(i)})log(1-a^{(i)})$

This is a logistic loss function.

The cost function: 
$$J = \frac{1}{m}\sum_{i=1}^m \mathcal{L}(a^{(i)}, y^{(i)})$$

The basic structure of a neural network:
* Calculate the current loss (forward propagation)
* Calculate the current gradient (backward propagation)
* Update parameters (gradient descent)

Ideally, we can extract the training examples $X$ and output labels $Y$ from the training set and implement the different functions as shown below. However, there are different data types in the training set, making this not a trivial task.

In [228]:
# attempt to use Keras to train a fully connected network
model = k.models.Sequential()

#add input layer and the first hidden layer
model.add(k.layers.Dense(5, kernel_initializer='uniform', input_shape=(n_features,), activation='relu'))

#add second hidden layer
model.add(k.layers.Dense(5, kernel_initializer='uniform', activation='relu'))

#add output layer
model.add(k.layers.Dense(1, kernel_initializer='uniform', activation='sigmoid'))

#compile network
optimizer = k.optimizers.Adam(lr=0.005)
model.compile(optimizer, loss='binary_crossentropy', metrics=['accuracy'])

#run network
model.fit(X_train, Y_train, batch_size=20, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1a3afe2b00>

In [231]:
Y_pred = model.predict(X_test, batch_size=100)
Y_pred = (Y_pred > 0.5).astype(int)
print(Y_pred.shape)
Y = pd.DataFrame({'Survived' : Y_pred[:, 0]})
submission = pd.concat((test_set['PassengerId'], Y), axis=1)
submission.head(100)
print('Number of passengers predicted to survive =', Y.sum())
submission.to_csv('data/submission_AR.csv', index=False)

(418, 1)
Number of passengers predicted to survive = Survived    144
dtype: int64


  


### Functions to implement:
* `initialization` which initializes parameters
* `forward propagation` which computes the activations $A = \mathrm{sigmoid}(w^T X+b)$ where $A = (a^{(1)}, a^{(2)},\cdots, a^{(m)})$, $m$ is the number of training examples, and the cost function $J$
* `backward propagation` which computes the gradients
$$\frac{\partial J}{\partial w} = \frac{1}{m}X(A-Y)^T$$
$$\frac{\partial J}{\partial b} = \frac{1}{m}\sum_{i=1}^m (a^{(i)}-y^{(i)})$$
* `optimization` which minimizes the cost function $J$ using gradient descent. For any parameter $\theta$, the update rule is $\theta = \theta-\alpha d\theta$ where $\alpha$ is the learning rate.
* `prediction` which makes the prediction using $\hat{Y} = A = \sigma(W^T X + b)$. Values smaller than a threshold (let's say 0.5) are converted into 0's and values larger than 0.5 are converted into 1's. 