In [1]:
%tensorflow_version 2.x  

In [2]:
from __future__ import absolute_import, division, print_function, unicode_literals

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
from IPython.display import clear_output 

import tensorflow as tf 
from tensorflow import feature_column 

In [3]:
# Download the Titanic training and evaluation splits as pandas DataFrames.
dftrain = pd.read_csv(
    filepath_or_buffer='https://storage.googleapis.com/tf-datasets/titanic/train.csv'
)
dfeval = pd.read_csv(
    filepath_or_buffer='https://storage.googleapis.com/tf-datasets/titanic/eval.csv'
)

# Detach the 'survived' column from each frame: pop() removes it from the
# features in place and returns it, giving us the labels the model predicts.
y_train = dftrain.pop(item='survived')
y_eval = dfeval.pop(item='survived')

In [4]:
# Optional exploration step (not required to train the model): summary
# statistics (count, mean, std, min, quartiles, max) of the training features.
# The 'survived' label is not included because it was popped off when loading.
dftrain.describe()  # last expression of the cell, so the summary table is displayed


Unnamed: 0,age,n_siblings_spouses,parch,fare
count,627.0,627.0,627.0,627.0
mean,29.631308,0.545455,0.379585,34.385399
std,12.511818,1.15109,0.792999,54.59773
min,0.75,0.0,0.0,0.0
25%,23.0,0.0,0.0,7.8958
50%,28.0,0.0,0.0,15.0458
75%,35.0,1.0,0.0,31.3875
max,80.0,8.0,5.0,512.3292


In [None]:
# Optional exploration step (not required to train the model): bar chart of
# how many training rows fall into each value of the 'sex' column.
dftrain['sex'].value_counts().plot.bar()


In [None]:
# Optional exploration step (not required to train the model): bar chart of
# how many training rows fall into each value of the 'class' column.
dftrain['class'].value_counts().plot.bar()

In [7]:
# Describe every model input as a TensorFlow feature column.

CATEGORICAL_COLUMNS = ['sex', 'n_siblings_spouses', 'parch', 'class', 'deck', 'embark_town', 'alone']
NUMERICAL_COLUMNS = ['age', 'fare']

feature_columns = []

# Categorical features: the vocabulary of each column is the set of distinct
# values observed for it in the training frame.
for feature_name in CATEGORICAL_COLUMNS:
    vocabulary = dftrain[feature_name].unique()
    new_feature = tf.feature_column.categorical_column_with_vocabulary_list(feature_name, vocabulary)
    # Append the column built above rather than constructing an identical
    # second one (the previous code built it twice and discarded the first).
    feature_columns.append(new_feature)

# Numerical features: declared as float32 numeric columns.
for feature_name in NUMERICAL_COLUMNS:
    new_feature = tf.feature_column.numeric_column(feature_name, dtype=tf.float32)
    feature_columns.append(new_feature)

In [8]:
# Adapted from https://www.tensorflow.org/tutorials/estimator/linear
def make_input_fn(data_df, label_df, num_epochs=10, shuffle=True, batch_size=100,
                  shuffle_buffer_size=1000):
    """Create an input function that turns features and labels into a tf.data.Dataset.

    Args:
        data_df: Feature table; it is converted with ``dict(data_df)`` so each
            column becomes one named entry of the dataset's feature dict.
        label_df: Labels, sliced element-wise alongside the features.
        num_epochs: How many times the batched dataset is repeated.
        shuffle: Whether to shuffle the elements before batching.
        batch_size: Number of elements per batch.
        shuffle_buffer_size: Buffer size handed to ``Dataset.shuffle``. The
            default of 1000 is the value that used to be hard-coded here; pass
            a value at least as large as the dataset for a full shuffle.

    Returns:
        A zero-argument callable. Each call builds and returns a new dataset of
        ``(features, labels)`` batches; nothing is built until it is called.
    """
    def input_function():
        # Pair the feature dict with the labels, one dataset element per row.
        ds = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))
        if shuffle:
            ds = ds.shuffle(shuffle_buffer_size)  # randomize element order
        # Group elements into batches, then repeat the batched data per epoch.
        # repeat: [1, 2, 3].repeat(3) -> [1, 2, 3, 1, 2, 3, 1, 2, 3]
        # batch:  tf.data.Dataset.range(8).batch(3) -> [[0, 1, 2], [3, 4, 5], [6, 7]]
        ds = ds.batch(batch_size).repeat(num_epochs)
        return ds  # the batched, repeated dataset
    return input_function  # the callable, not a dataset

# Build the two input functions. make_input_fn only returns the callable here;
# the dataset itself is created later, when that callable is invoked.
train_input_fn = make_input_fn(data_df=dftrain, label_df=y_train)
# Evaluation uses a single, unshuffled pass over the evaluation data.
eval_input_fn = make_input_fn(data_df=dfeval, label_df=y_eval, shuffle=False, num_epochs=1)

In [None]:
# Build a linear classifier estimator over the feature columns assembled earlier.
# NOTE(review): tf.estimator is deprecated in recent TensorFlow releases —
# confirm the runtime's TensorFlow version still provides it.
linear_est = tf.estimator.LinearClassifier(feature_columns=feature_columns)

In [None]:
# Fit the estimator using the training input function, then compute its
# metrics on the evaluation (testing) data.
linear_est.train(input_fn=train_input_fn)
result = linear_est.evaluate(input_fn=eval_input_fn)

# Clear whatever the cell has displayed so far, then report only the accuracy.
clear_output()
print(result['accuracy'])