## TensorFlow: Logistic Regression

In [1]:
import tensorflow as tf
import numpy as np

import pandas as pd
from pandas import DataFrame as DF, Series

  from ._conv import register_converters as _register_converters


In [13]:
data = pd.read_csv("titanic.csv")

In [14]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [15]:
data.drop(['PassengerId', 'Name', 'Ticket'], axis=1, inplace=True)

In [16]:
# save both to csv
data.to_csv('titanic_data.csv', index=False)

In [17]:
del data
import gc
gc.collect()

0

In [18]:
# read data
data = pd.read_csv('titanic_data.csv')
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.925,,S
3,1,1,female,35.0,1,0,53.1,C123,S
4,0,3,male,35.0,0,0,8.05,,S


In [19]:
data.shape

(891, 9)

In [20]:
data.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [21]:
data.fillna({'Age': -1,
             'Cabin': 'Unk',
             'Embarked': 'Unk',
             'Fare': -1},
            inplace=True);

## Very Basic - Hand Coded Logistic Regression

In [22]:
# convert sex binary
data.loc[:, 'Sex'] = (data.Sex == 'female').astype(int)

# train/test split
Xtr = data.loc[:, ['Pclass','Sex','Age','SibSp','Parch','Fare']].sample(frac=0.75)
Xts = data[~data.index.isin(Xtr.index)].loc[:, ['Pclass','Sex','Age','SibSp','Parch','Fare']]

# one-hot-encode Ytr and Yts (quick method)
Ytr = pd.get_dummies(data[data.index.isin(Xtr.index)].Survived).values
Yts = pd.get_dummies(data[~data.index.isin(Xtr.index)].Survived).values

In [24]:
Ytr

array([[1, 0],
       [0, 1],
       [0, 1],
       ...,
       [1, 0],
       [0, 1],
       [1, 0]], dtype=uint8)

In [26]:
import tensorflow as tf

# data format is as usual:
# Xtr and test_X have shape (num_instances, num_features)
# Ytr and test_Y have shape (num_instances, num_classes)
num_features = Xtr.shape[1]
num_classes = 2

# shape=[None, num_features] tells the model to accept different numbers of datapoints
X = tf.placeholder('float', [None, num_features])
Y = tf.placeholder('float', [None, num_classes])

# W - weights array
W = tf.Variable(tf.zeros([num_features, num_classes]))
# B - bias array
B = tf.Variable(tf.zeros([num_classes]))

# define the logistic model
# y=wx+b as argument of softmax
yhat = tf.nn.softmax(tf.matmul(X, W) + B)

# define a loss function
loss_fn = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=yhat, labels=Y))

# define optimizer and minimize on loss_fn
opt = tf.train.AdamOptimizer(0.01).minimize(loss_fn)

# create session
sess = tf.Session()

# init vars
init = tf.global_variables_initializer()
sess.run(init)

num_epochs = 10
# loop over num_epochs and run optimization step on
# full data each time
for i in range(num_epochs):
    sess.run(opt, feed_dict={X: Xtr, Y: Ytr})

# accuracy function
accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(yhat, 1), tf.argmax(Y, 1)), 'float'))
# get the test accuracy
accuracy_value = sess.run(accuracy, feed_dict={X: Xts, Y: Yts})

In [27]:
accuracy_value

0.6367713

## Logistic Regression With Batching
### Input Function

In [29]:
# read data
data = pd.read_csv('titanic_data.csv')
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.925,,S
3,1,1,female,35.0,1,0,53.1,C123,S
4,0,3,male,35.0,0,0,8.05,,S


In [30]:
# define columns and default values
_csv_column_defaults = [[0],[-1],['Unk'],[-1.],[0],[0],[-1.],['Unk'],['Unk']]
_csv_columns = data.columns.tolist()

# define input function
def input_fn(csv_file, feature_names, batch_size=16, n_epochs=10, shuffle=False):
    def decode_csv(line):
        parsed_line = tf.decode_csv(line, _csv_column_defaults)
        features_dict = dict(zip(feature_names, parsed_line))
#         features_dict['Age'] = tf.to_int32(features_dict['Age'])
        labels = features_dict.pop('Survived') # removes this from dict
        return features_dict, labels
    
    if shuffle:
        dataset = dataset.shuffle(buffer_size=100*1024) # buffer 100KB

    dataset = (tf.data.TextLineDataset(csv_file) # Read text file
           .skip(1) # Skip header row
           .map(decode_csv, num_parallel_calls=3)) # Transform each elem by applying decode_csv fn

    dataset = dataset.batch(batch_size)  # create a batch of size `batch_size`
    dataset = dataset.repeat(n_epochs)
    iterator = dataset.make_one_shot_iterator()
    batch_features, batch_labels = iterator.get_next()
    
    return batch_features, batch_labels

## Handling Categorical Features
Using tf.feature_column is a way to map data to a model, as opposed to using feed dictionaries. It can be efficient and help with certain preprocessing tasks.

### Base Categorical Features

In [31]:
# pclass = tf.feature_column.categorical_column_with_identity(
#     'Pclass', num_buckets=3)

sex = tf.feature_column.categorical_column_with_vocabulary_list(
    'Sex', vocabulary_list=['female','male','Unk'])

embarked = tf.feature_column.categorical_column_with_vocabulary_list(
    'Embarked', vocabulary_list=['S','C','Q','Unk'])

### Base Continuous Features

In [33]:
age = tf.feature_column.numeric_column('Age')

# age_buckets = tf.feature_column.bucketized_column(
#     age, boundaries=[5.,10,18,25,35,45,55,65])

sib = tf.feature_column.numeric_column('SibSp')

parch = tf.feature_column.numeric_column('Parch')

fare = tf.feature_column.numeric_column('Fare')

### Define Model

In [34]:
columns = [age, sib, parch, fare, sex, embarked]

model_dir = 'lr_model'
model = tf.estimator.LinearClassifier(model_dir=model_dir,
                                      feature_columns=columns,
                                      optimizer=tf.train.AdamOptimizer())

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'lr_model', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x0000021F53943E48>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


## Train Model

In [36]:
model.train(input_fn=lambda: input_fn('titanic_data.csv', _csv_columns))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from lr_model\model.ckpt-0
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into lr_model\model.ckpt.
INFO:tensorflow:loss = 11.090355, step = 1
INFO:tensorflow:global_step/sec: 257.088
INFO:tensorflow:loss = 9.610938, step = 101 (0.395 sec)
INFO:tensorflow:global_step/sec: 396.286
INFO:tensorflow:loss = 12.629805, step = 201 (0.247 sec)
INFO:tensorflow:global_step/sec: 498.837
INFO:tensorflow:loss = 8.880047, step = 301 (0.202 sec)
INFO:tensorflow:global_step/sec: 583.015
INFO:tensorflow:loss = 9.075473, step = 401 (0.170 sec)
INFO:tensorflow:global_step/sec: 622.618
INFO:tensorflow:loss = 6.975941, step = 501 (0.160 sec)
INFO:tensorflow:Saving checkpoints for 560 into lr_model\model.ckpt.
INFO:tensorflow:Loss for final ste

<tensorflow.python.estimator.canned.linear.LinearClassifier at 0x21f53943dd8>

In [37]:
results = model.evaluate(input_fn=lambda: input_fn('titanic_data.csv', _csv_columns, n_epochs=1))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-06-11-08:48:07
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from lr_model\model.ckpt-560
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-06-11-08:48:07
INFO:tensorflow:Saving dict for global step 560: accuracy = 0.7665544, accuracy_baseline = 0.6161616, auc = 0.8267983, auc_precision_recall = 0.7445791, average_loss = 0.5473142, global_step = 560, label/mean = 0.3838384, loss = 8.70816, precision = 0.79385966, prediction/mean = 0.4256734, recall = 0.5292398
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 560: lr_model\model.ckpt-560


In [38]:
results

{'accuracy': 0.7665544,
 'accuracy_baseline': 0.6161616,
 'auc': 0.8267983,
 'auc_precision_recall': 0.7445791,
 'average_loss': 0.5473142,
 'label/mean': 0.3838384,
 'loss': 8.70816,
 'precision': 0.79385966,
 'prediction/mean': 0.4256734,
 'recall': 0.5292398,
 'global_step': 560}