# Titanic dataset from Kaggle at:
https://www.kaggle.com/c/titanic/data
## We wish to train our algorithm to tell to what extent titanic passengers were or not doomed to die

In [108]:
import pandas as pd
import numpy as np
from prettytable import PrettyTable
from sklearn import preprocessing

In [3]:
titanic_data = pd.read_csv("./Datasets/titanic_train.csv")
titanic_data.head(20)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


## Drop columns id, name, ticket, and cabin
Because they contain information of no use *or hard use* for our algorithm

In [4]:
titanic_data = titanic_data.drop(['PassengerId','Name', 'Ticket', 'Cabin' ], axis=1)

## Change the way Embarked and sex are encoded

In [5]:
titanic_data['Sex'] = titanic_data['Sex'].map({'female': 1, 'male': 0})
titanic_data['Embarked'] = titanic_data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

In [6]:
titanic_data.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,22.0,1,0,7.25,0.0
1,1,1,1,38.0,1,0,71.2833,1.0
2,1,3,1,26.0,0,0,7.925,0.0
3,1,1,1,35.0,1,0,53.1,0.0
4,0,3,0,35.0,0,0,8.05,0.0
5,0,3,0,,0,0,8.4583,2.0
6,0,1,0,54.0,0,0,51.8625,0.0
7,0,3,0,2.0,3,1,21.075,0.0
8,1,3,1,27.0,0,2,11.1333,0.0
9,1,2,1,14.0,1,0,30.0708,1.0


In [96]:
# For now lets drop thos rows with missing values ):
titanic_data = titanic_data.dropna()

## Apply Gradient Descent
### Following Logistic Regression Heuristics
- Hypothesis Function
$$ h_\theta (X^{i}) = \frac{1}{1+e^{-\Theta ^{T}X^{i}}} $$

- Cost Function
$$ J(\theta) = - \frac{1}{m} \displaystyle \sum_{i=1}^m [Y^{i}\log (h_\theta (x^{i})) + (1 - Y^{i})\log (1 - h_\theta(X^{i}))] $$

- Gradient Descent
$$ \theta_{j}  = \theta_{j} - \alpha\frac{1}{m} \sum_{i=1}^{m} (h_\theta (X^{i}) - Y^{i})*X_j^{i} $$

In [112]:
dataMatrix = titanic_data.as_matrix()

NbPoints = titanic_data.shape[0]
NbVariables = titanic_data.shape[1] #because the result is not considered a variable, but we added a new clm with 1's

X = np.delete(dataMatrix, 0, 1) #only the parameters
X = np.insert(X, 0, 1, axis = 1) #Add a column with all 1 for independen parameter

Y = dataMatrix.T[0].reshape(NbPoints, 1) # Only the result which is the nb or rings as a column

In [113]:
# Define hypothesis function

# Th is the current Theta vector for our linear model
# X is the matrix with the training data
# i is the row of the data we want to know the prediction of 
def hyp (Th, X_i): 
    return 1 / (1 + ( np.exp( - np.dot( Th, X_i ) ) ) )

In [114]:
# Define the gradient correspondent to the logistic const function

# Th is the previous value of Th we had as a vector containing many thetas
# j is the index of Th we wish to upate, we will have to update all or some for stochastic gradient descent
# X is a matrix with colums being the variables used for learning
# Y is a column vecotr which gives the correct results for the parameters of X
def gradient (Th, j, X, Y):
    gradient = 0
    
    for i in range(NbPoints):
        gradient += ( hyp( Th, X[i] ) - Y[i] ) * X[i][j]
    
    return gradient

In [116]:
# Use the gradient of each parameter to update accordingly
Th = np.ones(NbVariables) # Initialize to random values like all 1's
Alph = 0.001
m = NbPoints

iterations = 70

### for printing the steps ###
lables = ['k', 'indep','Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
t = PrettyTable(lables)

for k in range(iterations):
    NewTh = Th #Define a new th as placeholder because we want to update all at once
    
    for j in range(NbVariables):
        NewTh[j] -= Alph/m * gradient(Th, j, X, Y)
    
    Th = NewTh
    t.add_row(np.insert( np.round(Th, decimals=3), 0, k+1 ) ) # Add to the printer table, prepend the step
    
print(t)

+------+-------+--------+-------+--------+-------+-------+-------+----------+
|  k   | indep | Pclass |  Sex  |  Age   | SibSp | Parch |  Fare | Embarked |
+------+-------+--------+-------+--------+-------+-------+-------+----------+
| 1.0  | 0.999 | 0.999  |  1.0  | 0.982  |  1.0  |  1.0  | 0.986 |   1.0    |
| 2.0  | 0.999 | 0.997  |  1.0  | 0.964  | 0.999 |  1.0  | 0.973 |   1.0    |
| 3.0  | 0.998 | 0.996  |  1.0  | 0.945  | 0.999 | 0.999 | 0.959 |   1.0    |
| 4.0  | 0.998 | 0.994  |  1.0  | 0.927  | 0.999 | 0.999 | 0.945 |  0.999   |
| 5.0  | 0.997 | 0.993  |  1.0  | 0.909  | 0.998 | 0.999 | 0.932 |  0.999   |
| 6.0  | 0.996 | 0.991  | 0.999 | 0.891  | 0.998 | 0.999 | 0.918 |  0.999   |
| 7.0  | 0.996 |  0.99  | 0.999 | 0.872  | 0.998 | 0.998 | 0.904 |  0.999   |
| 8.0  | 0.995 | 0.988  | 0.999 | 0.854  | 0.997 | 0.998 | 0.891 |  0.999   |
| 9.0  | 0.995 | 0.987  | 0.999 | 0.836  | 0.997 | 0.998 | 0.877 |  0.999   |
| 10.0 | 0.994 | 0.985  | 0.999 | 0.818  | 0.997 | 0.998 | 0.863

In [None]:
# Use the gradient of each parameter to update accordingly
Th = np.ones(NbVariables) # Initialize to random values like all 1's
Alph = 0.001
m = NbPoints #should be nb of points in set

iterations = 10000
memo = np.array([Th]) # Lets give the CPU a brake

for k in range(iterations):
    NewTh = Th #Define a new th as placeholder because we want to update all at once
    
    for j in range(NbVariables):
        NewTh[j] -= Alph/m * gradient(Th, j, X, Y)
    
    Th = NewTh
    if (k % 500 == 0):
        memo = np.append(memo ,[Th], axis=0)

In [174]:
memo

array([[ 1.        ,  1.        ,  1.        ,  1.        ,  1.        ,
         1.        ,  1.        ,  1.        ],
       [ 0.99940449,  0.99851966,  0.99991011,  0.98176194,  0.9996868 ,
         0.9997823 ,  0.98632394,  0.99987219],
       [ 0.99344944,  0.98371629,  0.99901124,  0.79938132,  0.99655478,
         0.99760534,  0.84956336,  0.9985941 ],
       [ 0.98749438,  0.96891292,  0.99811236,  0.6170007 ,  0.99342275,
         0.99542837,  0.71280278,  0.99731601],
       [ 0.98153933,  0.95410955,  0.99721348,  0.43462009,  0.99029073,
         0.9932514 ,  0.5760422 ,  0.99603792],
       [ 0.97558427,  0.93930619,  0.99631461,  0.2522395 ,  0.98715871,
         0.99107444,  0.43928163,  0.99475983],
       [ 0.96962944,  0.92450339,  0.99541575,  0.06986464,  0.9840267 ,
         0.98889748,  0.30252302,  0.99348176],
       [ 0.96379048,  0.90997   ,  0.99452571, -0.10761794,  0.98089754,
         0.9867217 ,  0.16759013,  0.99221841],
       [ 0.9604369 ,  0.90104773

In [166]:
def isAlive(Th, X, i):
    return 1. if int(np.dot(X[i], Th)) > 0 else 0.

In [167]:
t = PrettyTable(['Exp', 'Real'])

for i in range(NbPoints):
    t.add_row( [ int(100*(hyp(Th, X[i]))), Y[i][0]] )
    
print(t.get_string(start = 50, end = 100))

+-----+------+
| Exp | Real |
+-----+------+
|  99 | 1.0  |
|  73 | 0.0  |
|  15 | 0.0  |
|  99 | 0.0  |
|  51 | 0.0  |
|  77 | 0.0  |
|  29 | 1.0  |
|  38 | 0.0  |
|  97 | 1.0  |
|  50 | 1.0  |
|  46 | 0.0  |
|  28 | 1.0  |
|  19 | 0.0  |
|  72 | 1.0  |
|  88 | 1.0  |
|  98 | 0.0  |
|  99 | 1.0  |
|  41 | 0.0  |
|  28 | 0.0  |
|  52 | 0.0  |
|  7  | 0.0  |
|  87 | 0.0  |
|  1  | 0.0  |
|  0  | 0.0  |
|  74 | 1.0  |
|  50 | 1.0  |
|  26 | 0.0  |
|  55 | 0.0  |
|  61 | 0.0  |
|  20 | 0.0  |
|  44 | 0.0  |
|  30 | 0.0  |
|  72 | 1.0  |
|  12 | 0.0  |
|  2  | 0.0  |
|  97 | 0.0  |
|  46 | 0.0  |
|  87 | 0.0  |
|  92 | 0.0  |
|  49 | 0.0  |
|  2  | 0.0  |
|  37 | 0.0  |
|  92 | 0.0  |
|  99 | 0.0  |
|  83 | 0.0  |
|  53 | 0.0  |
|  32 | 1.0  |
|  3  | 0.0  |
|  94 | 1.0  |
|  40 | 1.0  |
+-----+------+


In [168]:
from sklearn import metrics

predictions = np.array([])
for i in range(NbPoints):
    predictions = np.append(predictions ,[isAlive(Th, X, i)])

print(str(int(100*metrics.accuracy_score(Y, predictions))) + "% accuracy on train data") #nice cast nesting btw

64% accuracy on train data
