## Import necessary packages

In [1]:
from __future__ import division,print_function

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from IPython.display import clear_output

import tensorflow as tf

clear_output()

## Read data description

In [2]:
with open("data_description.txt","r") as fp:
    for line in fp:
        print(line)

--------------------Data Dictionary--------------------

Variable|	Definition|	Key|

-------------------------------------------------------

survival| 	Survival | 	0 = No, 1 = Yes |

pclass |	Ticket class |	1 = 1st, 2 = 2nd, 3 = 3rd|

name|  Name of the passenger |-

sex |	Sex |	

Age |	Age in years |	-

sibsp| 	# of siblings / spouses aboard the Titanic |-

parch |	# of parents / children aboard the Titanic 	|-

ticket| 	Ticket number |	-

fare |	Passenger fare 	| -

cabin |	Cabin number |	-

embarked |	Port of Embarkation |	C = Cherbourg, Q = Queenstown, S = Southampton|





-----------------Variable Notes-----------------------



pclass: A proxy for socio-economic status (SES)

1st = Upper

2nd = Middle

3rd = Lower



age: Age is fractional if less than 1. If the age is estimated, it is in the form of xx.5



sibsp: The dataset defines family relations in this way...

Sibling = brother, sister, stepbrother, stepsister

Spouse = husband, wife (mistresses and fiancés were ignored)

## Loading the training data using pandas

In [3]:
data = pd.read_csv("./data/titanic.csv")
print(data.shape) #891 data points for training

(891, 10)


In [4]:
train = data.iloc[:700,:].reset_index()
val = data.iloc[701:,:].reset_index()

In [5]:
train.columns

Index(['index', 'survived', 'sex', 'age', 'n_siblings_spouses', 'parch',
       'fare', 'class', 'deck', 'embark_town', 'alone'],
      dtype='object')

## Feature Engeenering

In [6]:
#we are interested in predicting whether a passanger survived or not
#so Survived column becomes our labels
#pop the labels into ytrain
ytrain = train.pop("survived")
yval = val.pop("survived")
#after popping the labels train datafram contains the feature columns only and not
#Survived columns

In [7]:
ytrain.head()

0    0
1    1
2    1
3    1
4    0
Name: survived, dtype: int64

In [8]:
# CATEGORICAL_COLUMNS = ['sex', 'n_siblings_spouses', 'parch', 'class', 'deck',
#                        'embark_town', 'alone']
CATEGORICAL_COLUMNS = ['sex', 'class', 'deck','n_siblings_spouses','parch', 'alone']
NUMERIC_COLUMNS = ['age', 'fare']

feature_columns = []
for feature_name in CATEGORICAL_COLUMNS:
    vocabulary = train[feature_name].unique()  # gets a list of all unique values from given feature column
    feature_columns.append(tf.feature_column.categorical_column_with_vocabulary_list(feature_name, vocabulary))

for feature_name in NUMERIC_COLUMNS:
    feature_columns.append(tf.feature_column.numeric_column(feature_name, dtype=tf.float32))

#print(feature_columns)

In [9]:
#one case of the above loop
vocabulary = train["sex"].unique()
tf.feature_column.categorical_column_with_vocabulary_list("sex", vocabulary)

VocabularyListCategoricalColumn(key='sex', vocabulary_list=('male', 'female'), dtype=tf.string, default_value=-1, num_oov_buckets=0)

In [10]:
def make_input_fn(data_df, label_df, epochs=500, shuffle=True, batch_size=16):
    def input_function():  # inner function, this will be returned
        #creating tensorflow object for which we will train the model on
        ds = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))
        if shuffle:
            # randomize order of data
            ds = ds.shuffle(1000)  
        # split dataset into batches of 32 and repeat process for number of epochs
        ds = ds.batch(batch_size).repeat(epochs)  
        return ds  # return a batch of the dataset
    return input_function  # return a function object for use

train_input_fn = make_input_fn(train, ytrain)  # here we will call the input_function that was returned to us to get a dataset object we can feed to the model
eval_input_fn = make_input_fn(val, yval, epochs=1, shuffle=False)


## The model

In [11]:
# We create a linear estimtor by passing the feature columns we created earlier
linear_est = tf.estimator.LinearClassifier(feature_columns=feature_columns)
clear_output()

In [12]:
 # train the model
linear_est.train(train_input_fn) 
clear_output()

In [13]:
#evaluate the model on the train data
result_train = linear_est.evaluate(train_input_fn) 
clear_output() #Clear the console
print(result_train) #model perfomance on training data

{'accuracy': 0.81857145, 'accuracy_baseline': 0.61142856, 'auc': 0.8767481, 'auc_precision_recall': 0.8507003, 'average_loss': 0.4122788, 'label/mean': 0.38857144, 'loss': 6.558981, 'precision': 0.78431374, 'prediction/mean': 0.39817023, 'recall': 0.7352941, 'global_step': 22000}


In [14]:
#evaluate the model on the train data
result_val = linear_est.evaluate(eval_input_fn) 
clear_output() #Clear the console
print(result_val) #model perfomance on training data

{'accuracy': 0.77894735, 'accuracy_baseline': 0.63684213, 'auc': 0.83369267, 'auc_precision_recall': 0.74817693, 'average_loss': 0.4813557, 'label/mean': 0.3631579, 'loss': 7.621465, 'precision': 0.68, 'prediction/mean': 0.4087389, 'recall': 0.73913044, 'global_step': 22000}


In [15]:
#this is how one predictions will look like
pred_dicts = list(linear_est.predict(eval_input_fn))
clear_output()
pred_dicts[0]

{'logits': array([-2.4195952], dtype=float32),
 'logistic': array([0.08169058], dtype=float32),
 'probabilities': array([0.91830933, 0.08169061], dtype=float32),
 'class_ids': array([0]),
 'classes': array([b'0'], dtype=object),
 'all_class_ids': array([0, 1], dtype=int32),
 'all_classes': array([b'0', b'1'], dtype=object)}

In [16]:
#predictions on the validation data
pred_dicts = list(linear_est.predict(eval_input_fn))
clear_output()
probs = pd.DataFrame(pd.Series([pred['probabilities'][1] for pred in pred_dicts]),columns=["Prob of surviving"])
probs["prediction"] = pd.Series([int(pred["class_ids"]) for pred in pred_dicts])
probs["actual"] = pd.Series(yval)
probs.head(20)

Unnamed: 0,Prob of surviving,prediction,actual
0,0.081691,0,0
1,0.091583,0,1
2,0.120701,0,0
3,0.105153,0,1
4,0.740305,1,1
5,0.978332,1,1
6,0.726106,1,0
7,0.3476,0,0
8,0.114053,0,0
9,0.992481,1,0
