In [7]:
import pandas as pd
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Draw edges around patches (bars, histogram bins); matplotlib leaves them off by default.
plt.rcParams["patch.force_edgecolor"] = True
# Suppress all warnings issued through Python's `warnings` module.
import warnings
warnings.filterwarnings('ignore')

## Using TensorFlow Estimators
### Analyzing the Iris Dataset

Estimators are the newer and easier way of working with TensorFlow, and they are much closer to what we've done with Scikit-Learn.

**Estimator Steps**
- Read in the data (normalize if necessary)
- Train/Test Split the data
- Create Estimator Feature Columns (a list of specialized feature columns)
- Create Input Estimator Function (a way of organizing your training data)
- Train Estimator Model 
- Predict with New Test Input Function



**Using the Iris CSV file with pandas get data into correct formats**

In [8]:
# Load the Iris data; the path is relative to the notebook's working directory.
df = pd.read_csv('iris.csv')

In [9]:
# Preview the first three rows to sanity-check the load.
df.head(3)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0


In [10]:
# Check column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
sepal length (cm)    150 non-null float64
sepal width (cm)     150 non-null float64
petal length (cm)    150 non-null float64
petal width (cm)     150 non-null float64
target               150 non-null float64
dtypes: float64(5)
memory usage: 5.9 KB


**To use TensorFlow we have to:**
- scrub the column names of spaces and special characters
- the target column (the class label: 0, 1 or 2) must be an integer


In [11]:
# List the raw column names (they contain spaces and parentheses).
df.columns

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)', 'target'],
      dtype='object')

In [12]:
# Scrub spaces and parentheses from the column names.
# The new names are derived from the existing labels instead of being assigned
# from a positional list, so a change in the CSV's column order cannot silently
# mislabel a column, and re-running this cell leaves the names unchanged.
df.columns = (
    df.columns
    .str.replace(r'\s*\(cm\)', '', regex=True)   # 'sepal length (cm)' -> 'sepal length'
    .str.replace(' ', '_', regex=False)          # 'sepal length'      -> 'sepal_length'
)

In [13]:
# Confirm the cleaned column names.
df.columns

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'target'], dtype='object')

In [14]:
# Preview the frame with the cleaned column names.
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0


In [15]:
# Recast target from float to integer class labels.
# astype is the vectorized cast; apply(int) would call Python's int() once per row.
df['target'] = df['target'].astype('int64')

In [16]:
# Confirm that target now holds integers.
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


#### Now grab the features and separate them from the target

target y = just 'target'  
features X = 'everything but 'target''

In [17]:
# Features X: every column except the label (columns= makes the axis explicit).
X = df.drop(columns=['target'])
# Target y: the class label column on its own.
y = df['target']

In [18]:
# y holds only the true class labels.
# The rows are still sorted by class at this point; shuffling happens later, during training.
y.head()

0    0
1    0
2    0
3    0
4    0
Name: target, dtype: int64

In [19]:
# Preview the feature columns.
X.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


**Train/Test Split**

In [20]:
# ML imports
import sklearn
from sklearn import metrics
# Linear Regression 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
# Logistic Regression
#
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [21]:
# The Train/Test Split
# Hold out 30% of the rows for testing; random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

**Using TensorFlow**

**Create Feature Columns** using tf.feature_column.numeric_column

In [22]:
# Feature names; one numeric feature column is built per name.
X.columns

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width'], dtype='object')

In [23]:
# Build one numeric feature column per input feature, keyed by the DataFrame column name.
feat_cols = [tf.feature_column.numeric_column(name) for name in X.columns]

In [24]:
# Inspect the resulting feature columns.
feat_cols

[NumericColumn(key='sepal_length', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='sepal_width', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='petal_length', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='petal_width', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)]

**Now create an Input Function**

need 2 - one for training and one for evaluation


A note about batch_size - play around with the value - too large a size will produce errors.  
Errors in TensorFlow often present themselves as *Predictions that are empty or None*

An *Epoch* is one full pass through your training data. So num_epochs=5 sets a limit: after the model has seen the training data 5 times, training is complete (even if it hasn't reached the number of steps requested from the estimator). We'll work with steps later on.

In [26]:
# Create the training input function.
# There are two main input-function builders: one for numpy arrays and one for pandas objects.
# pandas_input_fn parameters (abridged): x, y=None, batch_size=128, num_epochs=1, shuffle=None, ...
# shuffle=True compensates for the classes being sorted in the original DataFrame
# (train_test_split has already shuffled the rows once as well).
input_func = tf.compat.v1.estimator.inputs.pandas_input_fn(
    x=X_train,
    y=y_train,
    batch_size=10,
    num_epochs=5,
    shuffle=True,
)

W0828 13:22:23.515229 4421977536 module_wrapper.py:138] From /usr/local/lib/python3.7/site-packages/tensorflow/python/util/lazy_loader.py:63: The name tf.estimator.inputs is deprecated. Please use tf.compat.v1.estimator.inputs instead.



**create the classifier**


Output will look something like this:

INFO:tensorflow:Using default config.
WARNING:tensorflow:Using temporary folder as model directory: /var/folders/y7/npvsnnsx4fb77q7hsmzg0rn80000gn/T/tmpq97tjf54
INFO:tensorflow:Using config: {'_model_dir': '/var/folders/y7/npvsnnsx4fb77q7hsmzg0rn80000gn/T/tmpq97tjf54', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x136ea0ba8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}

In [27]:
# Use the Deep Neural Network Classifier/Estimator
# Hidden Layers
# hidden_units takes a list: each entry is one hidden layer, and its value is the number of neurons in that layer
# so [10,20,10] means 3 hidden layers: the first with 10 neurons, the second with 20, the third with 10
# data flows input layer -> hidden layers -> output layer
# n_classes is 3 (the three species of iris)
classifier = tf.estimator.DNNClassifier(hidden_units=[10,20,10], n_classes=3, feature_columns=feat_cols)

W0828 13:22:32.757361 4421977536 estimator.py:1846] Using temporary folder as model directory: /var/folders/y7/npvsnnsx4fb77q7hsmzg0rn80000gn/T/tmpy4idptdv


**Now train the Estimator/Classifier**

This step is very similar to running .fit() on a scikit-learn model.

This is where we drop in the input function we created.

classifier.train(input_fn, hooks=None, steps=None, max_steps=None, saving_listeners=None)
    
----

When this is run you'll actually see the training occur. Something like: 

WARNING:tensorflow:From /usr/local/lib/python3.7/site-packages/tensorflow_estimator/python/estimator/inputs/queues/feeding_queue_runner.py:62: QueueRunner.__init__ (from tensorflow.python.training.queue_runner_impl) is deprecated and will be removed in a future version.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
WARNING:tensorflow:From /usr/local/lib/python3.7/site-packages/tensorflow_estimator/python/estimator/inputs/queues/feeding_functions.py:500: add_queue_runner (from tensorflow.python.training.queue_runner_impl) is deprecated and will be removed in a future version.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
INFO:tensorflow:Calling model_fn.
WARNING:tensorflow:From /usr/local/lib/python3.7/site-packages/tensorflow/python/feature_column/feature_column_v2.py:2703: to_float (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.cast instead.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
WARNING:tensorflow:From /usr/local/lib/python3.7/site-packages/tensorflow/python/training/monitored_session.py:809: start_queue_runners (from tensorflow.python.training.queue_runner_impl) is deprecated and will be removed in a future version.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
INFO:tensorflow:Saving checkpoints for 0 into /var/folders/y7/npvsnnsx4fb77q7hsmzg0rn80000gn/T/tmpq97tjf54/model.ckpt.
INFO:tensorflow:loss = 12.018873, step = 1
INFO:tensorflow:Saving checkpoints for 50 into /var/folders/y7/npvsnnsx4fb77q7hsmzg0rn80000gn/T/tmpq97tjf54/model.ckpt.
INFO:tensorflow:Loss for final step: 2.7068002.

In [28]:
# Train on the batches produced by input_func; steps=50 stops training after at most 50 batches.
classifier.train(input_fn=input_func,steps=50)

W0828 13:22:47.365128 4421977536 deprecation.py:506] From /usr/local/lib/python3.7/site-packages/tensorflow/python/ops/resource_variable_ops.py:1666: calling BaseResourceVariable.__init__ (from tensorflow.python.ops.resource_variable_ops) with constraint is deprecated and will be removed in a future version.
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
W0828 13:22:47.366128 4421977536 deprecation.py:323] From /usr/local/lib/python3.7/site-packages/tensorflow/python/training/training_util.py:236: Variable.initialized_value (from tensorflow.python.ops.variables) is deprecated and will be removed in a future version.
Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.
W0828 13:22:47.390623 4421977536 deprecation.py:323] From /usr/local/lib/python3.7/site-packages/tensorflow_estimator/python/estimator/inputs/queues/feeding_queue_runner.py:65: QueueRunner.__

<tensorflow_estimator.python.estimator.canned.dnn.DNNClassifierV2 at 0x13d54eb90>

 **Now let's evaluate how well it performed**
 
 To do that we need to create another input function, but this one will pass test data

In [30]:
# Prediction input function.
# y is omitted because the labels are what we are predicting.
# shuffle=False keeps the predictions in the same row order as X_test, so they line up with y_test.
# batch_size=len(X_test) feeds the whole test set as a single batch.
pred_fn = tf.compat.v1.estimator.inputs.pandas_input_fn(x=X_test,batch_size=len(X_test),shuffle=False)

**now create predictions**

In [31]:
# classifier.predict returns a generator, not a list.
# Wrap it in list() to materialize every prediction when you are not going to iterate over it.
list(classifier.predict(input_fn=pred_fn))

W0828 13:23:27.247546 4421977536 base_layer_v1.py:1825] Layer dnn is casting an input tensor from dtype float64 to the layer's dtype of float32, which is new behavior in TensorFlow 2.  The layer has dtype float32 because it's dtype defaults to floatx.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



[{'logits': array([0.7330219 , 0.50403357, 0.894268  ], dtype=float32),
  'probabilities': array([0.336665  , 0.2677624 , 0.39557263], dtype=float32),
  'class_ids': array([2]),
  'classes': array([b'2'], dtype=object),
  'all_class_ids': array([0, 1, 2], dtype=int32),
  'all_classes': array([b'0', b'1', b'2'], dtype=object)},
 {'logits': array([0.15127988, 0.34906673, 0.28618482], dtype=float32),
  'probabilities': array([0.297342  , 0.36237147, 0.34028652], dtype=float32),
  'class_ids': array([1]),
  'classes': array([b'1'], dtype=object),
  'all_class_ids': array([0, 1, 2], dtype=int32),
  'all_classes': array([b'0', b'1', b'2'], dtype=object)},
 {'logits': array([1.1229699, 0.6428395, 1.5275804], dtype=float32),
  'probabilities': array([0.32077795, 0.1984662 , 0.48075593], dtype=float32),
  'class_ids': array([2]),
  'classes': array([b'2'], dtype=object),
  'all_class_ids': array([0, 1, 2], dtype=int32),
  'all_classes': array([b'0', b'1', b'2'], dtype=object)},
 {'logits': arra

In [32]:
# Materialize the predictions and keep them for the evaluation cells.
predictions = list(classifier.predict(input_fn=pred_fn))

W0828 13:23:30.777132 4421977536 base_layer_v1.py:1825] Layer dnn is casting an input tensor from dtype float64 to the layer's dtype of float32, which is new behavior in TensorFlow 2.  The layer has dtype float32 because it's dtype defaults to floatx.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



In [33]:
# predictions is a list of dictionaries, one per test row.
predictions

[{'logits': array([0.7330219 , 0.50403357, 0.894268  ], dtype=float32),
  'probabilities': array([0.336665  , 0.2677624 , 0.39557263], dtype=float32),
  'class_ids': array([2]),
  'classes': array([b'2'], dtype=object),
  'all_class_ids': array([0, 1, 2], dtype=int32),
  'all_classes': array([b'0', b'1', b'2'], dtype=object)},
 {'logits': array([0.15127988, 0.34906673, 0.28618482], dtype=float32),
  'probabilities': array([0.297342  , 0.36237147, 0.34028652], dtype=float32),
  'class_ids': array([1]),
  'classes': array([b'1'], dtype=object),
  'all_class_ids': array([0, 1, 2], dtype=int32),
  'all_classes': array([b'0', b'1', b'2'], dtype=object)},
 {'logits': array([1.1229699, 0.6428395, 1.5275804], dtype=float32),
  'probabilities': array([0.32077795, 0.1984662 , 0.48075593], dtype=float32),
  'class_ids': array([2]),
  'classes': array([b'2'], dtype=object),
  'all_class_ids': array([0, 1, 2], dtype=int32),
  'all_classes': array([b'0', b'1', b'2'], dtype=object)},
 {'logits': arra

#### Interpreting the output:

This object was predicted to be in Class 1:   
'class_ids': array([1])

This object had a probability of 0.01549307 (about 1.5%) of being Class 0, 0.7058529 (about 70.6%) of being Class 1, et al. 0.7058529 was the highest, so Class 1 is the prediction:    
'probabilities': array([0.01549307, 0.7058529 , 0.27865404])


Raw output:  
[{'logits': array([-3.9265225 , -0.10750813, -1.036944  ], dtype=float32),
  'probabilities': array([0.01549307, 0.7058529 , 0.27865404], dtype=float32),
  'class_ids': array([1]),
  'classes': array([b'1'], dtype=object)},
  

### Now to evaluating the predictions

In [34]:
# Keep only the predicted class id from each prediction dictionary.
final_preds = [entry['class_ids'][0] for entry in predictions]

In [35]:
# Predicted class id for each test row.
final_preds

[2,
 1,
 2,
 2,
 2,
 1,
 2,
 2,
 2,
 2,
 2,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 1,
 2,
 1,
 2,
 2,
 2,
 2,
 2,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 2,
 2,
 1,
 1,
 1,
 2,
 2,
 2,
 1,
 1]

**Now let's see how it performed using a Classification Report and a Confusion Matrix** 

In [36]:
# Confusion matrix: rows are the true classes, columns are the predicted classes.
print(confusion_matrix(y_test,final_preds))
# Per-class precision, recall and F1, plus the overall averages.
print(classification_report(y_test,final_preds))

[[ 0 19  0]
 [ 0  0 13]
 [ 0  0 13]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        19
           1       0.00      0.00      0.00        13
           2       0.50      1.00      0.67        13

   micro avg       0.29      0.29      0.29        45
   macro avg       0.17      0.33      0.22        45
weighted avg       0.14      0.29      0.19        45



### Iterating on the test to improve the scores

Here is where you can go back to the earlier steps and play around with the *batch_size* and *hidden_units* settings to get better results.

*Use larger batches and more hidden layers to tweak*  
examples- try batch_size=30 and hidden_units=[10,30,20,12,10,20,22]


**Original Functions**  
Here change **batch_size**  
input_func = tf.compat.v1.estimator.inputs.pandas_input_fn(x=X_train,y=y_train,batch_size=10,num_epochs=5,shuffle=True)  

Here change **hidden_units**  
classifier = tf.estimator.DNNClassifier(hidden_units=[10,20,10], n_classes=3, feature_columns=feat_cols)