## Import necessary packages

In [3]:
from __future__ import division,print_function

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from IPython.display import clear_output

import tensorflow as tf

import seaborn

clear_output()

## Read data description

In [4]:
with open("data_description.txt","r") as fp:
    for line in fp:
        print(line)

--------------------Data Dictionary--------------------

Variable|	Definition|	Key|

-------------------------------------------------------

survival| 	Survival | 	0 = No, 1 = Yes |

pclass |	Ticket class |	1 = 1st, 2 = 2nd, 3 = 3rd|

name|  Name of the passenger |-

sex |	Sex |	

Age |	Age in years |	-

sibsp| 	# of siblings / spouses aboard the Titanic |-

parch |	# of parents / children aboard the Titanic 	|-

ticket| 	Ticket number |	-

fare |	Passenger fare 	| -

cabin |	Cabin number |	-

embarked |	Port of Embarkation |	C = Cherbourg, Q = Queenstown, S = Southampton|





-----------------Variable Notes-----------------------



pclass: A proxy for socio-economic status (SES)

1st = Upper

2nd = Middle

3rd = Lower



age: Age is fractional if less than 1. If the age is estimated, it is in the form of xx.5



sibsp: The dataset defines family relations in this way...

Sibling = brother, sister, stepbrother, stepsister

Spouse = husband, wife (mistresses and fiancés were ignored)

## Loading the training data using pandas

In [83]:
train = pd.read_csv("./data/titanic.csv")
print(train.shape) #891 data points for training

(891, 10)


In [84]:
train.columns

Index(['survived', 'sex', 'age', 'n_siblings_spouses', 'parch', 'fare',
       'class', 'deck', 'embark_town', 'alone'],
      dtype='object')

## Feature Engeenering

In [86]:
#we are interested in predicting whether a passanger survived or not
#so Survived column becomes our labels
#pop the labels into ytrain
ytrain = train.pop("survived")
#after popping the labels train datafram contains the feature columns only and not
#Survived columns

In [87]:
ytrain.head()

0    0
1    1
2    1
3    1
4    0
Name: survived, dtype: int64

In [90]:
CATEGORICAL_COLUMNS = ['sex', 'n_siblings_spouses', 'parch', 'class', 'deck',
                       'embark_town', 'alone']
NUMERIC_COLUMNS = ['age', 'fare']

feature_columns = []
for feature_name in CATEGORICAL_COLUMNS:
    vocabulary = train[feature_name].unique()  # gets a list of all unique values from given feature column
    feature_columns.append(tf.feature_column.categorical_column_with_vocabulary_list(feature_name, vocabulary))

for feature_name in NUMERIC_COLUMNS:
    feature_columns.append(tf.feature_column.numeric_column(feature_name, dtype=tf.float32))

#print(feature_columns)

In [91]:
#one case of the above loop
vocabulary = train["sex"].unique()
tf.feature_column.categorical_column_with_vocabulary_list("sex", vocabulary)

VocabularyListCategoricalColumn(key='sex', vocabulary_list=('male', 'female'), dtype=tf.string, default_value=-1, num_oov_buckets=0)

In [92]:
def make_input_fn(data_df, label_df, num_epochs=50, shuffle=True, batch_size=32):
    def input_function():  # inner function, this will be returned
        #creating tensorflow object for which we will train the model on
        ds = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))
        if shuffle:
            # randomize order of data
            ds = ds.shuffle(1000)  
        # split dataset into batches of 32 and repeat process for number of epochs
        ds = ds.batch(batch_size).repeat(num_epochs)  
        return ds  # return a batch of the dataset
    return input_function  # return a function object for use

train_input_fn = make_input_fn(train, ytrain)  # here we will call the input_function that was returned to us to get a dataset object we can feed to the model
#eval_input_fn = make_input_fn(dfeval, y_eval, num_epochs=1, shuffle=False)


In [93]:
# We create a linear estimtor by passing the feature columns we created earlier
linear_est = tf.estimator.LinearClassifier(feature_columns=feature_columns)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmp4y5h696i', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fe46b235748>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [94]:
 # train the model
linear_est.train(train_input_fn) 
#evaluate the model on the train data
result_train = linear_est.evaluate(train_input_fn) 
clear_output() #Clear the console
print(result_train) #model perfomance on training data

{'accuracy': 0.81705946, 'accuracy_baseline': 0.6161616, 'auc': 0.86898565, 'auc_precision_recall': 0.84195495, 'average_loss': 0.42431173, 'label/mean': 0.3838384, 'loss': 13.502206, 'precision': 0.7609329, 'prediction/mean': 0.40397578, 'recall': 0.7631579, 'global_step': 1400}


In [95]:
pred_dicts = list(linear_est.predict(eval_input_fn))
clear_output()
pred_dicts[0]

NameError: name 'eval_input_fn' is not defined