# Working with data in Tensorflow

Taken from freecodecamp, tensorflow docs. Experimentational.

 Implementing to make notes + see data

In [1]:
# Import modules

import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow.compat.v2.feature_column as fc
import tensorflow as tf
import tensorflow_datasets as tfds

cur_dir = os.getcwd()
tf.__version__

'2.7.0'

In [27]:
# Testing import data from tfds
# This split means it loads 75% of our data as our training set
ds = tfds.load('Titanic', split='train[:75%]', shuffle_files=True, data_dir=cur_dir)

In [22]:
# Check size of our dataset
# It should be 75% of the size of our total dataset.
len(ds)

982

In [32]:
# alternative method of loading (.load is built around builder)
builder = tfds.builder('Titanic')
builder.download_and_prepare()
ds = builder.as_dataset(split='train', shuffle_files=True)

[1mDownloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to C:\Users\User\tensorflow_datasets\titanic\2.0.0...[0m


Dl Size...: 0 MiB [00:00, ? MiB/s]
Dl Completed...: 100%|██████████| 1/1 [00:00<00:00,  6.69 url/s]
                                                                        

[1mDataset titanic downloaded and prepared to C:\Users\User\tensorflow_datasets\titanic\2.0.0. Subsequent calls will reuse this data.[0m




In [33]:
# Since we didn't specify the % of the dataset to load for training this time, we have the full number of rows
len(ds)

1309

In [45]:
# iterating over the dataset:
# First two data points only (checking structure)
for ex in ds.take(2):
    # check what keys exist
    print(list(ex.keys()))

    features = ex['features'] # parameters
    survived = ex['survived'] # label

    print('features: ', features, '\n')
    print ('survived: ', survived)


['features', 'survived']
features:  {'age': <tf.Tensor: shape=(), dtype=float32, numpy=30.0>, 'boat': <tf.Tensor: shape=(), dtype=string, numpy=b'Unknown'>, 'body': <tf.Tensor: shape=(), dtype=int32, numpy=-1>, 'cabin': <tf.Tensor: shape=(), dtype=string, numpy=b'Unknown'>, 'embarked': <tf.Tensor: shape=(), dtype=int64, numpy=2>, 'fare': <tf.Tensor: shape=(), dtype=float32, numpy=13.0>, 'home.dest': <tf.Tensor: shape=(), dtype=string, numpy=b'Sarnia, ON'>, 'name': <tf.Tensor: shape=(), dtype=string, numpy=b'McCrie, Mr. James Matthew'>, 'parch': <tf.Tensor: shape=(), dtype=int32, numpy=0>, 'pclass': <tf.Tensor: shape=(), dtype=int64, numpy=1>, 'sex': <tf.Tensor: shape=(), dtype=int64, numpy=0>, 'sibsp': <tf.Tensor: shape=(), dtype=int32, numpy=0>, 'ticket': <tf.Tensor: shape=(), dtype=string, numpy=b'233478'>} 

survived:  tf.Tensor(0, shape=(), dtype=int64)
['features', 'survived']
features:  {'age': <tf.Tensor: shape=(), dtype=float32, numpy=37.0>, 'boat': <tf.Tensor: shape=(), dtype=

-------------------

# Using data from tfds to train a Linear Classifier

Load the data as test and train sets

In [83]:
train_ds, test_ds = tfds.load('Titanic', split=['train[:75%]', 'train[75%:]'], data_dir=cur_dir)

print('Size of training dataset: ', len(train_ds), '\n', 'Size of validation set: ', len(test_ds))

Size of training dataset:  982 
 Size of validation set:  327


In [84]:
# according to the documentation (https://www.tensorflow.org/datasets/overview)
# this should give me _OptionsDataset
# Not sure why it's still giving me PrefetchDataset

assert isinstance(train_ds, tf.data.Dataset)
assert isinstance(test_ds, tf.data.Dataset)
print(train_ds)



<PrefetchDataset shapes: {features: {age: (), boat: (), body: (), cabin: (), embarked: (), fare: (), home.dest: (), name: (), parch: (), pclass: (), sex: (), sibsp: (), ticket: ()}, survived: ()}, types: {features: {age: tf.float32, boat: tf.string, body: tf.int32, cabin: tf.string, embarked: tf.int64, fare: tf.float32, home.dest: tf.string, name: tf.string, parch: tf.int32, pclass: tf.int64, sex: tf.int64, sibsp: tf.int32, ticket: tf.string}, survived: tf.int64}>


Accessing the data inside PrefetchDataset

In [92]:
# Feature category
list(train_ds)[0]['features'].keys()

dict_keys(['age', 'boat', 'body', 'cabin', 'embarked', 'fare', 'home.dest', 'name', 'parch', 'pclass', 'sex', 'sibsp', 'ticket'])

In [94]:
# data type of each column + their value
list(train_ds)[0]['features'].values()

dict_values([<tf.Tensor: shape=(), dtype=float32, numpy=30.0>, <tf.Tensor: shape=(), dtype=string, numpy=b'Unknown'>, <tf.Tensor: shape=(), dtype=int32, numpy=-1>, <tf.Tensor: shape=(), dtype=string, numpy=b'Unknown'>, <tf.Tensor: shape=(), dtype=int64, numpy=2>, <tf.Tensor: shape=(), dtype=float32, numpy=13.0>, <tf.Tensor: shape=(), dtype=string, numpy=b'Sarnia, ON'>, <tf.Tensor: shape=(), dtype=string, numpy=b'McCrie, Mr. James Matthew'>, <tf.Tensor: shape=(), dtype=int32, numpy=0>, <tf.Tensor: shape=(), dtype=int64, numpy=1>, <tf.Tensor: shape=(), dtype=int64, numpy=0>, <tf.Tensor: shape=(), dtype=int32, numpy=0>, <tf.Tensor: shape=(), dtype=string, numpy=b'233478'>])

In [97]:
# label
list(train_ds)[0]['survived']

<tf.Tensor: shape=(), dtype=int64, numpy=0>

In [4]:
# I can turn the data into something more recognizable as a dataframe...
tfds.as_dataframe(train_ds).head()

Unnamed: 0,features/age,features/boat,features/body,features/cabin,features/embarked,features/fare,features/home.dest,features/name,features/parch,features/pclass,features/sex,features/sibsp,features/ticket,survived
0,30.0,b'Unknown',-1,b'Unknown',2,13.0,"b'Sarnia, ON'","b'McCrie, Mr. James Matthew'",0,1,0,0,b'233478',0
1,37.0,b'Unknown',98,b'Unknown',2,7.925,"b'Ruotsinphytaa, Finland New York, NY'","b'Gustafsson, Mr. Anders Vilhelm'",0,2,0,2,b'3101276',0
2,28.0,b'9',-1,b'Unknown',2,13.0,b'Spain',"b'Reynaldo, Ms. Encarnacion'",0,1,1,0,b'230434',1
3,18.0,b'Unknown',-1,b'Unknown',2,73.5,"b'Lyndhurst, England'","b'Davies, Mr. Charles Henry'",0,1,0,0,b'S.O.C. 14879',0
4,-1.0,b'Unknown',-1,b'Unknown',0,7.8958,b'Unknown',"b'Gheorgheff, Mr. Stanio'",0,2,0,0,b'349254',0


In [85]:
# Let's just define it as a dataframe since this is the only way I know how to handle data
train = tfds.as_dataframe(train_ds)

# Rename the columns to remove 'features/'
for col in train.columns:
    if 'features' in col:
        train.rename(columns={col: col.split('features/')[1]}, inplace=True)
       
print(train.columns)

# Do the same for test. to be honest i should probably just define a function for this
# but since i'm just learning tensorflow this is ok for now

test = tfds.as_dataframe(test_ds)

# Rename the columns to remove 'features/'
for col in test.columns:
    if 'features' in col:
        test.rename(columns={col: col.split('features/')[1]}, inplace=True)

Index(['age', 'boat', 'body', 'cabin', 'embarked', 'fare', 'home.dest', 'name',
       'parch', 'pclass', 'sex', 'sibsp', 'ticket', 'survived'],
      dtype='object')


In [73]:
# Now we define the features for tensorflow
# I'm not adding in features I don't understand. Unfortunately tensorflow doesn't have documentation for what's coming out
# I have no idea what the "body" feature means
# I don't think home.dest matters for whether you survive or not either.

CATEGORICAL_COLUMNS = ['sex', 'sibsp', 'parch', 'pclass', 'cabin', 'boat', 'ticket']
NUMERIC_COLUMNS = ['age', 'fare']

feature_columns = []
for feature_name in CATEGORICAL_COLUMNS:
    # Get unique values of each categorical column
    # If you downloaded the data from tfds then we already have this
    vocabulary = train[feature_name].unique() 
    # associate the vocabulary with each feature column
    # i.e. 'male' and 'female' are vocabularies associated with 'sex' feature column
    feature_columns.append(tf.feature_column.categorical_column_with_vocabulary_list(feature_name, vocabulary)) 

for feature_name in NUMERIC_COLUMNS:
    # define a data type for numeric columns
    feature_columns.append(tf.feature_column.numeric_column(feature_name, dtype=tf.float32))


---------------------

Testing various methods of getting input function

In [80]:
# For some reason this method of getting the input function doesn't work
features = CATEGORICAL_COLUMNS + NUMERIC_COLUMNS # Get colnames of features
label = 'survived'

# Source of input function definer: https://www.guru99.com/linear-classifier-tensorflow.html
# modified to suit my needs a little
def get_input_fn(data_set, features, label, num_epochs=None, n_batch = 32, shuffle=True):
    return tf.compat.v1.estimator.inputs.pandas_input_fn(
       x = pd.DataFrame({k: data_set[k].values for k in features}),
       y = pd.Series(data_set[label].values),
       batch_size=n_batch,   
       num_epochs=num_epochs,
       shuffle=shuffle)

train_input = get_input_fn(train, features, label, num_epochs=10, shuffle=True)

In [89]:
# A utility method to create a tf.data dataset from a Pandas Dataframe
# Taken from https://www.tensorflow.org/tutorials/structured_data/feature_columns
def df_to_dataset(df, label, shuffle=True, batch_size=32):
  def input_function():
    dataframe = df.copy()
    labels = dataframe.pop(label)
    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
    if shuffle:
      ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    return ds
  return input_function

# use the function to create our batches
train_input = df_to_dataset(train, label)
test_input = df_to_dataset(test, label)

In [87]:
# Method from FCC course
data_df = train.copy(deep=True)[CATEGORICAL_COLUMNS+NUMERIC_COLUMNS]
label_df = train.copy(deep=True).pop('survived')

eval_data = test.copy(deep=True)[CATEGORICAL_COLUMNS+NUMERIC_COLUMNS]
eval_label = test.copy(deep=True).pop('survived')

def make_input_fn(data_df, label_df, num_epochs=10, shuffle=True, batch_size=32):
    def input_function():
        # Create tf.data.Dataset object
        ds = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))
        if shuffle:
            ds = ds.shuffle(1000)
        ds = ds.batch(batch_size).repeat(num_epochs) # Split data into groups of batch_size, repeat process by num_epochs
        return ds
    return input_function

train_input = make_input_fn(data_df, label_df)
test_input = make_input_fn(eval_data, eval_label)

In [77]:
# instantiate our model
# Uhhh looks like i can't use the same model in the same folder
# Checkpoints name will be different
# I'll just let it go to temp for now instead of using cur_dir
model = tf.estimator.LinearClassifier(
    feature_columns = feature_columns
)

INFO:tensorflow:Using default config.


INFO:tensorflow:Using default config.






INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\User\\AppData\\Local\\Temp\\tmp9omeusc9', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\User\\AppData\\Local\\Temp\\tmp9omeusc9', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [79]:
model.train(train_input)

INFO:tensorflow:Calling model_fn.


INFO:tensorflow:Calling model_fn.
  self.bias = self.add_variable(


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Create CheckpointSaverHook.


INFO:tensorflow:Create CheckpointSaverHook.


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 0...


INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 0...


INFO:tensorflow:Saving checkpoints for 0 into C:\Users\User\AppData\Local\Temp\tmp9omeusc9\model.ckpt.


INFO:tensorflow:Saving checkpoints for 0 into C:\Users\User\AppData\Local\Temp\tmp9omeusc9\model.ckpt.


INFO:tensorflow:Calling checkpoint listeners after saving checkpoint 0...


INFO:tensorflow:Calling checkpoint listeners after saving checkpoint 0...


INFO:tensorflow:loss = 0.6931472, step = 0


INFO:tensorflow:loss = 0.6931472, step = 0


INFO:tensorflow:global_step/sec: 440.43


INFO:tensorflow:global_step/sec: 440.43


INFO:tensorflow:loss = 0.17148066, step = 100 (0.227 sec)


INFO:tensorflow:loss = 0.17148066, step = 100 (0.227 sec)


INFO:tensorflow:global_step/sec: 1162.52


INFO:tensorflow:global_step/sec: 1162.52


INFO:tensorflow:loss = 0.15737921, step = 200 (0.087 sec)


INFO:tensorflow:loss = 0.15737921, step = 200 (0.087 sec)


INFO:tensorflow:global_step/sec: 1176.21


INFO:tensorflow:global_step/sec: 1176.21


INFO:tensorflow:loss = 0.13819736, step = 300 (0.084 sec)


INFO:tensorflow:loss = 0.13819736, step = 300 (0.084 sec)


INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 310...


INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 310...


INFO:tensorflow:Saving checkpoints for 310 into C:\Users\User\AppData\Local\Temp\tmp9omeusc9\model.ckpt.


INFO:tensorflow:Saving checkpoints for 310 into C:\Users\User\AppData\Local\Temp\tmp9omeusc9\model.ckpt.


INFO:tensorflow:Calling checkpoint listeners after saving checkpoint 310...


INFO:tensorflow:Calling checkpoint listeners after saving checkpoint 310...


INFO:tensorflow:Loss for final step: 0.25818083.


INFO:tensorflow:Loss for final step: 0.25818083.


<tensorflow_estimator.python.estimator.canned.linear.LinearClassifierV2 at 0x20df5cb26a0>

In [88]:
model.evaluate(test_input)

INFO:tensorflow:Calling model_fn.


INFO:tensorflow:Calling model_fn.
  self.bias = self.add_variable(


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Starting evaluation at 2021-11-16T21:28:57


INFO:tensorflow:Starting evaluation at 2021-11-16T21:28:57


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Restoring parameters from C:\Users\User\AppData\Local\Temp\tmp9omeusc9\model.ckpt-310


INFO:tensorflow:Restoring parameters from C:\Users\User\AppData\Local\Temp\tmp9omeusc9\model.ckpt-310


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Inference Time : 0.64207s


INFO:tensorflow:Inference Time : 0.64207s


INFO:tensorflow:Finished evaluation at 2021-11-16-21:28:58


INFO:tensorflow:Finished evaluation at 2021-11-16-21:28:58


INFO:tensorflow:Saving dict for global step 310: accuracy = 0.97553515, accuracy_baseline = 0.58409786, auc = 0.99384046, auc_precision_recall = 0.9938363, average_loss = 0.12646903, global_step = 310, label/mean = 0.41590214, loss = 0.12457414, precision = 0.9848485, prediction/mean = 0.39291134, recall = 0.9558824


INFO:tensorflow:Saving dict for global step 310: accuracy = 0.97553515, accuracy_baseline = 0.58409786, auc = 0.99384046, auc_precision_recall = 0.9938363, average_loss = 0.12646903, global_step = 310, label/mean = 0.41590214, loss = 0.12457414, precision = 0.9848485, prediction/mean = 0.39291134, recall = 0.9558824


INFO:tensorflow:Saving 'checkpoint_path' summary for global step 310: C:\Users\User\AppData\Local\Temp\tmp9omeusc9\model.ckpt-310


INFO:tensorflow:Saving 'checkpoint_path' summary for global step 310: C:\Users\User\AppData\Local\Temp\tmp9omeusc9\model.ckpt-310


{'accuracy': 0.97553515,
 'accuracy_baseline': 0.58409786,
 'auc': 0.99384046,
 'auc_precision_recall': 0.9938363,
 'average_loss': 0.12646903,
 'label/mean': 0.41590214,
 'loss': 0.12457414,
 'precision': 0.9848485,
 'prediction/mean': 0.39291134,
 'recall': 0.9558824,
 'global_step': 310}

Holy crap. Looks like I was able to train a model with 97% accuracy. For reference, the model trained via the FCC tutorial was the following:

{'accuracy': 0.75757575,
 'accuracy_baseline': 0.625,
 'auc': 0.8289256,
 'auc_precision_recall': 0.8013525,
 'average_loss': 0.48232135,
 'label/mean': 0.375,
 'loss': 0.47344536,
 'precision': 0.6923077,
 'prediction/mean': 0.37914413,
 'recall': 0.6363636,
 'global_step': 200}

In [90]:
model.train(train_input)

INFO:tensorflow:Calling model_fn.


INFO:tensorflow:Calling model_fn.
  self.bias = self.add_variable(


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Create CheckpointSaverHook.


INFO:tensorflow:Create CheckpointSaverHook.


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Restoring parameters from C:\Users\User\AppData\Local\Temp\tmp9omeusc9\model.ckpt-310


INFO:tensorflow:Restoring parameters from C:\Users\User\AppData\Local\Temp\tmp9omeusc9\model.ckpt-310


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 310...


INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 310...


INFO:tensorflow:Saving checkpoints for 310 into C:\Users\User\AppData\Local\Temp\tmp9omeusc9\model.ckpt.


INFO:tensorflow:Saving checkpoints for 310 into C:\Users\User\AppData\Local\Temp\tmp9omeusc9\model.ckpt.


INFO:tensorflow:Calling checkpoint listeners after saving checkpoint 310...


INFO:tensorflow:Calling checkpoint listeners after saving checkpoint 310...


INFO:tensorflow:loss = 0.0839168, step = 310


INFO:tensorflow:loss = 0.0839168, step = 310


INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 341...


INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 341...


INFO:tensorflow:Saving checkpoints for 341 into C:\Users\User\AppData\Local\Temp\tmp9omeusc9\model.ckpt.


INFO:tensorflow:Saving checkpoints for 341 into C:\Users\User\AppData\Local\Temp\tmp9omeusc9\model.ckpt.


INFO:tensorflow:Calling checkpoint listeners after saving checkpoint 341...


INFO:tensorflow:Calling checkpoint listeners after saving checkpoint 341...


INFO:tensorflow:Loss for final step: 0.13718586.


INFO:tensorflow:Loss for final step: 0.13718586.


<tensorflow_estimator.python.estimator.canned.linear.LinearClassifierV2 at 0x20df5cb26a0>

In [91]:
model.evaluate(test_input)

INFO:tensorflow:Calling model_fn.


INFO:tensorflow:Calling model_fn.
  self.bias = self.add_variable(


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Starting evaluation at 2021-11-16T21:57:00


INFO:tensorflow:Starting evaluation at 2021-11-16T21:57:00


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Restoring parameters from C:\Users\User\AppData\Local\Temp\tmp9omeusc9\model.ckpt-341


INFO:tensorflow:Restoring parameters from C:\Users\User\AppData\Local\Temp\tmp9omeusc9\model.ckpt-341


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Inference Time : 0.68916s


INFO:tensorflow:Inference Time : 0.68916s


INFO:tensorflow:Finished evaluation at 2021-11-16-21:57:00


INFO:tensorflow:Finished evaluation at 2021-11-16-21:57:00


INFO:tensorflow:Saving dict for global step 341: accuracy = 0.9724771, accuracy_baseline = 0.58409786, auc = 0.9942062, auc_precision_recall = 0.99411845, average_loss = 0.12865344, global_step = 341, label/mean = 0.41590214, loss = 0.12611307, precision = 0.9847328, prediction/mean = 0.38225845, recall = 0.9485294


INFO:tensorflow:Saving dict for global step 341: accuracy = 0.9724771, accuracy_baseline = 0.58409786, auc = 0.9942062, auc_precision_recall = 0.99411845, average_loss = 0.12865344, global_step = 341, label/mean = 0.41590214, loss = 0.12611307, precision = 0.9847328, prediction/mean = 0.38225845, recall = 0.9485294


INFO:tensorflow:Saving 'checkpoint_path' summary for global step 341: C:\Users\User\AppData\Local\Temp\tmp9omeusc9\model.ckpt-341


INFO:tensorflow:Saving 'checkpoint_path' summary for global step 341: C:\Users\User\AppData\Local\Temp\tmp9omeusc9\model.ckpt-341


{'accuracy': 0.9724771,
 'accuracy_baseline': 0.58409786,
 'auc': 0.9942062,
 'auc_precision_recall': 0.99411845,
 'average_loss': 0.12865344,
 'label/mean': 0.41590214,
 'loss': 0.12611307,
 'precision': 0.9847328,
 'prediction/mean': 0.38225845,
 'recall': 0.9485294,
 'global_step': 341}

In [54]:
def df_to_dataset(df, label, shuffle=True, batch_size=32):
    dataframe = df.copy()
    labels = dataframe.pop(label)
    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
    if shuffle:
      ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    return ds

train_ds = df_to_dataset(train, label)

for feature_batch, label_batch in train_ds.take(1):
  print('Every feature:', list(feature_batch.keys()))
  print('A batch of ages:', feature_batch['age'])
  print('A batch of targets:', label_batch )


Every feature: ['age', 'boat', 'body', 'cabin', 'embarked', 'fare', 'home.dest', 'name', 'parch', 'pclass', 'sex', 'sibsp', 'ticket']
A batch of ages: tf.Tensor(
[28.   30.   55.   21.   -1.   41.   -1.    9.    0.75 41.   26.5  22.
 34.   33.   -1.   43.   30.   37.   25.   33.   38.   50.   22.   20.
 -1.   21.   30.   21.    3.   33.   32.   -1.  ], shape=(32,), dtype=float32)
A batch of targets: tf.Tensor([0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 1 0 1 0 1 0 0 0 0 0 1 1 1 0], shape=(32,), dtype=int64)


--------------------------

Below is code taken directly from the machine learning module at freecodecamp. It manually converts the csv files into tensors for tf to take in.

Source: https://www.tensorflow.org/tutorials/estimator/linear

In [55]:
# Load the datasets (already in test/train format)
dftrain = pd.read_csv("https://storage.googleapis.com/tf-datasets/titanic/train.csv")
dfeval = pd.read_csv("https://storage.googleapis.com/tf-datasets/titanic/eval.csv")

Example of what the dataset looks like

In [56]:
dftrain.head()

Unnamed: 0,survived,sex,age,n_siblings_spouses,parch,fare,class,deck,embark_town,alone
0,0,male,22.0,1,0,7.25,Third,unknown,Southampton,n
1,1,female,38.0,1,0,71.2833,First,C,Cherbourg,n
2,1,female,26.0,0,0,7.925,Third,unknown,Southampton,y
3,1,female,35.0,1,0,53.1,First,C,Southampton,n
4,0,male,28.0,0,0,8.4583,Third,unknown,Queenstown,y


In [57]:
# Remove the column for prediction and store elsewhere
# i.e. our labels
y_train = dftrain.pop('survived')
y_eval = dfeval.pop('survived')

'y' frame just contains rows of whether someone survived or not

In [7]:
y_train.head()

0    0
1    1
2    1
3    1
4    0
Name: survived, dtype: int64

In [58]:
# data manipulation to put csv files in a format that tensorflow can process
# manually define the feature columns
CATEGORICAL_COLUMNS = ['sex', 'n_siblings_spouses', 'parch', 'class', 'deck', 'embark_town', 'alone']
NUMERIC_COLUMNS = ['age', 'fare']

feature_columns = []
for feature_name in CATEGORICAL_COLUMNS:
    # Get unique values of each categorical column
    # If you downloaded the data from tfds then we already have this
    vocabulary = dftrain[feature_name].unique() 
    # associate the vocabulary with each feature column
    # i.e. 'male' and 'female' are vocabularies associated with 'sex' feature column
    feature_columns.append(tf.feature_column.categorical_column_with_vocabulary_list(feature_name, vocabulary)) 

for feature_name in NUMERIC_COLUMNS:
    # define a data type for numeric columns
    feature_columns.append(tf.feature_column.numeric_column(feature_name, dtype=tf.float32))

# Show the data type or list of categories for categorical variables
print(feature_columns)


[VocabularyListCategoricalColumn(key='sex', vocabulary_list=('male', 'female'), dtype=tf.string, default_value=-1, num_oov_buckets=0), VocabularyListCategoricalColumn(key='n_siblings_spouses', vocabulary_list=(1, 0, 3, 4, 2, 5, 8), dtype=tf.int64, default_value=-1, num_oov_buckets=0), VocabularyListCategoricalColumn(key='parch', vocabulary_list=(0, 1, 2, 5, 3, 4), dtype=tf.int64, default_value=-1, num_oov_buckets=0), VocabularyListCategoricalColumn(key='class', vocabulary_list=('Third', 'First', 'Second'), dtype=tf.string, default_value=-1, num_oov_buckets=0), VocabularyListCategoricalColumn(key='deck', vocabulary_list=('unknown', 'C', 'G', 'A', 'B', 'D', 'F', 'E'), dtype=tf.string, default_value=-1, num_oov_buckets=0), VocabularyListCategoricalColumn(key='embark_town', vocabulary_list=('Southampton', 'Cherbourg', 'Queenstown', 'unknown'), dtype=tf.string, default_value=-1, num_oov_buckets=0), VocabularyListCategoricalColumn(key='alone', vocabulary_list=('n', 'y'), dtype=tf.string, def

# Training a linear classification model

In [59]:
# Create a tf.data.Dataset object. 
# .from_tensor_slices creates a dataset where each input tensor is a column from the data
# not to be confused with .from_tensors where each input tensor is a row in our dataset
#  for more info: https://www.tensorflow.org/tutorials/load_data/pandas_dataframe
# https://www.tensorflow.org/api_docs/python/tf/data/Dataset#from_tensor_slices
# https://medium.com/when-i-work-data/converting-a-pandas-dataframe-into-a-tensorflow-dataset-752f3783c168

# TODO: Look up tf.estimator.inputs.numpy_input_fn (this seems like the way to go about making an input function for your model)
# NOTE: Batch = number of samples processed before the model is updated; epoch = number of passes the model makes through the entire dataset

def make_input_fn(data_df, label_df, num_epochs=10, shuffle=True, batch_size=32):
    def input_function():
        # Create tf.data.Dataset object
        ds = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))
        if shuffle:
            ds = ds.shuffle(1000)
        ds = ds.batch(batch_size).repeat(num_epochs) # Split data into groups of batch_size, repeat process by num_epochs
        return ds
    return input_function

In [60]:
train_input_fn = make_input_fn(dftrain, y_train)
train_input_fn

<function __main__.make_input_fn.<locals>.input_function()>

In [61]:
eval_input_function = make_input_fn(dfeval, y_eval, num_epochs=1, shuffle=False)

Begin training now that we have our inputs in place

In [62]:
# Instantiate our linear classifier
linear_est = tf.estimator.LinearClassifier(feature_columns=feature_columns, model_dir=cur_dir)
# Checking what's stored
linear_est

INFO:tensorflow:Using default config.


INFO:tensorflow:Using default config.


INFO:tensorflow:Using config: {'_model_dir': 'd:\\Users\\User\\OneDrive\\Documents\\Projects\\tensorflow-learning', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


INFO:tensorflow:Using config: {'_model_dir': 'd:\\Users\\User\\OneDrive\\Documents\\Projects\\tensorflow-learning', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


<tensorflow_estimator.python.estimator.canned.linear.LinearClassifierV2 at 0x20df95a0970>

In [63]:
# Train based on our input dataset
# https://www.tensorflow.org/api_docs/python/tf/estimator/LinearClassifier
linear_est.train(train_input_fn)


INFO:tensorflow:Calling model_fn.


INFO:tensorflow:Calling model_fn.
  self.bias = self.add_variable(


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Create CheckpointSaverHook.


INFO:tensorflow:Create CheckpointSaverHook.


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Restoring parameters from d:\Users\User\OneDrive\Documents\Projects\tensorflow-learning\model.ckpt-200


INFO:tensorflow:Restoring parameters from d:\Users\User\OneDrive\Documents\Projects\tensorflow-learning\model.ckpt-200


Instructions for updating:
Use standard file utilities to get mtimes.


Instructions for updating:
Use standard file utilities to get mtimes.


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 200...


INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 200...


INFO:tensorflow:Saving checkpoints for 200 into d:\Users\User\OneDrive\Documents\Projects\tensorflow-learning\model.ckpt.


INFO:tensorflow:Saving checkpoints for 200 into d:\Users\User\OneDrive\Documents\Projects\tensorflow-learning\model.ckpt.


INFO:tensorflow:Calling checkpoint listeners after saving checkpoint 200...


INFO:tensorflow:Calling checkpoint listeners after saving checkpoint 200...


INFO:tensorflow:loss = 0.405249, step = 200


INFO:tensorflow:loss = 0.405249, step = 200


INFO:tensorflow:global_step/sec: 327.795


INFO:tensorflow:global_step/sec: 327.795


INFO:tensorflow:loss = 0.5998769, step = 300 (0.307 sec)


INFO:tensorflow:loss = 0.5998769, step = 300 (0.307 sec)


INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 400...


INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 400...


INFO:tensorflow:Saving checkpoints for 400 into d:\Users\User\OneDrive\Documents\Projects\tensorflow-learning\model.ckpt.


INFO:tensorflow:Saving checkpoints for 400 into d:\Users\User\OneDrive\Documents\Projects\tensorflow-learning\model.ckpt.


INFO:tensorflow:Calling checkpoint listeners after saving checkpoint 400...


INFO:tensorflow:Calling checkpoint listeners after saving checkpoint 400...


INFO:tensorflow:Loss for final step: 0.43712386.


INFO:tensorflow:Loss for final step: 0.43712386.


<tensorflow_estimator.python.estimator.canned.linear.LinearClassifierV2 at 0x20df95a0970>

In [68]:
result = linear_est.evaluate(eval_input_function) # Evaluate our trained model and get the metrics

INFO:tensorflow:Calling model_fn.


INFO:tensorflow:Calling model_fn.
  self.bias = self.add_variable(


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Starting evaluation at 2021-11-14T18:59:00


INFO:tensorflow:Starting evaluation at 2021-11-14T18:59:00


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Restoring parameters from d:\Users\User\OneDrive\Documents\Projects\tensorflow-learning\model.ckpt-200


INFO:tensorflow:Restoring parameters from d:\Users\User\OneDrive\Documents\Projects\tensorflow-learning\model.ckpt-200


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Inference Time : 0.48812s


INFO:tensorflow:Inference Time : 0.48812s


INFO:tensorflow:Finished evaluation at 2021-11-14-18:59:00


INFO:tensorflow:Finished evaluation at 2021-11-14-18:59:00


INFO:tensorflow:Saving dict for global step 200: accuracy = 0.75757575, accuracy_baseline = 0.625, auc = 0.8289256, auc_precision_recall = 0.8013525, average_loss = 0.48232135, global_step = 200, label/mean = 0.375, loss = 0.47344536, precision = 0.6923077, prediction/mean = 0.37914413, recall = 0.6363636


INFO:tensorflow:Saving dict for global step 200: accuracy = 0.75757575, accuracy_baseline = 0.625, auc = 0.8289256, auc_precision_recall = 0.8013525, average_loss = 0.48232135, global_step = 200, label/mean = 0.375, loss = 0.47344536, precision = 0.6923077, prediction/mean = 0.37914413, recall = 0.6363636


INFO:tensorflow:Saving 'checkpoint_path' summary for global step 200: d:\Users\User\OneDrive\Documents\Projects\tensorflow-learning\model.ckpt-200


INFO:tensorflow:Saving 'checkpoint_path' summary for global step 200: d:\Users\User\OneDrive\Documents\Projects\tensorflow-learning\model.ckpt-200


In [70]:
# All the metrics from running our trained model against our evaluation data.
result

{'accuracy': 0.75757575,
 'accuracy_baseline': 0.625,
 'auc': 0.8289256,
 'auc_precision_recall': 0.8013525,
 'average_loss': 0.48232135,
 'label/mean': 0.375,
 'loss': 0.47344536,
 'precision': 0.6923077,
 'prediction/mean': 0.37914413,
 'recall': 0.6363636,
 'global_step': 200}

In [79]:
# Try predicting with our model (only 75% accurate)
result = list(linear_est.predict(eval_input_function))

# peek our first result
print(result[0])

INFO:tensorflow:Calling model_fn.


INFO:tensorflow:Calling model_fn.
  self.bias = self.add_variable(


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Restoring parameters from d:\Users\User\OneDrive\Documents\Projects\tensorflow-learning\model.ckpt-200


INFO:tensorflow:Restoring parameters from d:\Users\User\OneDrive\Documents\Projects\tensorflow-learning\model.ckpt-200


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Done running local_init_op.


{'logits': array([-2.2847672], dtype=float32), 'logistic': array([0.09239241], dtype=float32), 'probabilities': array([0.90760756, 0.09239241], dtype=float32), 'class_ids': array([0], dtype=int64), 'classes': array([b'0'], dtype=object), 'all_class_ids': array([0, 1]), 'all_classes': array([b'0', b'1'], dtype=object)}


In [80]:
# peek our first result
# 90% chance of not surviving vs 9.2% of surviving for the first person
print(result[0]['probabilities'])

[0.90760756 0.09239241]


In [78]:
# what does the profile of this person look like?
print(dfeval.loc[0])

sex                          male
age                          35.0
n_siblings_spouses              0
parch                           0
fare                         8.05
class                       Third
deck                      unknown
embark_town           Southampton
alone                           y
Name: 0, dtype: object


In [81]:
# Check if person survived or not in reality
print(y_eval.loc[0])

0
