# Structured Data Example: Automobile dataset

https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data

In [86]:
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import

In [87]:
# We're using pandas to read the CSV file. This is easy for small datasets, but for large and complex datasets,
# tensorflow parsing and processing functions are more powerful
import pandas as pd
import numpy as np

# TensorFlow
import tensorflow as tf
# Show the installed TensorFlow version next to the version this notebook expects.
print('please make sure that version >= 1.2:')
print(tf.__version__)

# Compatibility notes left by the contributor who back-ported the notebook.
for note in (
    '@monteirom: I made changes so it also works with 1.1.0 that is the current pip install version',
    '@monteirom: The lines that were changed have @1.2 as comment',
):
    print(note)

# Layers that will define the features
#
# real_valued_column: real values, float32
# sparse_column_with_hash_bucket: Use this when your sparse features are in string or integer format, 
#                                 but you don't have a vocab file that maps each value to an integer ID. 
#                                 output_id = Hash(input_feature_string) % bucket_size
# sparse_column_with_keys: Look up logic is as follows: 
#                          lookup_id = index_of_feature_in_keys if feature in keys else default_value.
#                          You should use this when you know the vocab file for the feature
# one_hot_column: Creates an _OneHotColumn for a one-hot or multi-hot repr in a DNN.
#                 The input can be a _SparseColumn which is created by `sparse_column_with_*`
#                 or crossed_column functions
from tensorflow.contrib.layers import real_valued_column, sparse_column_with_keys, sparse_column_with_hash_bucket
from tensorflow.contrib.layers import one_hot_column

please make sure that version >= 1.2:
1.2.0-rc1
@monteirom: I made changes so it also works with 1.1.0 that is the current pip install version
@monteirom: The lines that were changed have @1.2 as comment


# Please Download

**https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data
And move it to data/**

**So: data/imports-85.data is expected to exist!**

# Preparing the data

In [88]:
# The CSV file has no header row, so the column names are supplied here,
# in the same order as the columns appear in the file.
names = (
    'symboling normalized-losses make fuel-type aspiration num-of-doors '
    'body-style drive-wheels engine-location wheel-base length width height '
    'curb-weight engine-type num-of-cylinders engine-size fuel-system bore '
    'stroke compression-ratio horsepower peak-rpm city-mpg highway-mpg price'
).split()

# Columns that hold text; they are read as `str`.
_TEXT_COLUMNS = frozenset([
    'make',
    'fuel-type',
    'aspiration',
    'num-of-doors',
    'body-style',
    'drive-wheels',
    'engine-location',
    'engine-type',
    'num-of-cylinders',
    'fuel-system',
])

# pandas also needs an explicit dtype per column: 'symboling' is an int32,
# text columns are `str`, and every remaining column is read as float32.
dtypes = {}
for column in names:
    if column == 'symboling':
        dtypes[column] = np.int32
    elif column in _TEXT_COLUMNS:
        dtypes[column] = str
    else:
        dtypes[column] = np.float32

In [89]:
# Load the raw table. '?' entries are treated as missing values (NaN), and the
# column names / dtypes declared above are applied while parsing.
df = pd.read_csv(
    'data/imports-85.data',
    names=names,
    dtype=dtypes,
    na_values='?',
)

In [90]:
# 'price' is the value we want to predict, so rows where it is missing are
# unusable. dropna works row-wise with how='any' by default, so only the
# subset of columns to inspect has to be given.
df = df.dropna(subset=['price'])

## Dealing with NaN

There are many possible approaches for handling NaN values in the data. Here we simply replace them with `''` (an empty string) or `0`, depending on the data type. This is the simplest way, but it is certainly not the best in most cases, so in practice you should try other ways of dealing with NaN data. Some approaches are:

* use the mean of the row
* use the mean of the column
* if/else substitution (e.g. if there are a lot of NaNs do one thing, else do some other thing)
* ...
* google others


In [91]:
# Continuous (float32) columns: replace NaN with 0.
float_columns = [column for column in dtypes if dtypes[column] == np.float32]
df[float_columns] = df[float_columns].fillna(value=0., axis='columns')

# String columns: replace NaN with '' (NaN mixed with strings is very bad for us).
string_columns = [column for column in dtypes if dtypes[column] == str]
df[string_columns] = df[string_columns].fillna(value='', axis='columns')

## Standardize features

In [92]:
# The dataset has more variables than this example needs; keep only the
# label ('price') and the handful of features used below.
SELECTED_COLUMNS = [
    'num-of-doors',
    'num-of-cylinders',
    'horsepower',
    'make',
    'price',
    'length',
    'height',
    'width',
]
df = df[SELECTED_COLUMNS]

In [93]:
# We're possibly dealing with features measured in different units and scales, so we need to rescale our data.
# There are two main ways to do it:
# * Normalization, which scales all numeric variables into the range [0, 1].
#   Example: x' = (x - min(x)) / (max(x) - min(x))
# * Standardization, which transforms a variable to have zero mean and unit variance.
#   Example: x' = (x - mean(x)) / std(x)
# Which is better? It depends on your data and your features.
# But one disadvantage of normalization over standardization is that it loses
# some information in the data. Since normalization loses more info it can make it harder
# for gradient descent to converge, so we'll use standardization.
# In practice: please analyse your data and see what gives you better results.

def std(x):
    """Standardize `x` to zero mean and unit variance.

    Computes ``(x - x.mean()) / x.std()``. When the spread is zero or
    undefined (a constant column, or a single value), dividing would produce
    NaN/inf, so a scale of 1 is used instead and the result is all zeros.

    Args:
        x: an object exposing `.mean()` and `.std()`, e.g. a pandas Series
           (a DataFrame is standardized column by column).

    Returns:
        The centered and rescaled values, with the same shape as `x`.
    """
    spread = x.std()
    if np.ndim(spread) == 0:
        # `not spread > 0` is true for both 0 and NaN.
        if not spread > 0:
            spread = 1.0
    else:
        # Per-column spreads (DataFrame input): patch only the degenerate ones.
        spread = spread.where(spread > 0, 1.0)
    return (x - x.mean()) / spread

# Remember one raw value so the effect of the rescaling can be printed below.
before = df['length'][0]

# Standardize each numeric input feature; the label ('price') is left as is.
for column in ('length', 'width', 'height', 'horsepower'):
    df[column] = std(df[column])

after = df['length'][0]
print('before:', before, 'after:', after)

before: 168.8 after: -0.438314


## Separating training data from testing data

In [94]:
TRAINING_DATA_SIZE = 160
TEST_DATA_SIZE = 10  # used below as the number of rows in the eval split

LABEL = 'price'

# Positional boundaries of the three consecutive partitions. Rows are taken
# in their current order; nothing is shuffled here.
train_end = TRAINING_DATA_SIZE
eval_end = TRAINING_DATA_SIZE + TEST_DATA_SIZE

# Split the data into a training set, eval set and test set
training_data = df.iloc[:train_end]
eval_data = df.iloc[train_end:eval_end]
test_data = df.iloc[eval_end:]

# Separate input features from labels: pop() removes the label column from
# each partition and returns it.
training_label = training_data.pop(LABEL)
eval_label = eval_data.pop(LABEL)
test_label = test_data.pop(LABEL)

# Using Tensorflow

## Defining input function

In [95]:
BATCH_SIZE = 64

# Input function for training:
#   num_epochs=None -> will cycle through input data forever
#   shuffle=True -> randomize order of input data
training_input_fn = tf.estimator.inputs.pandas_input_fn(x=training_data,
                                                        y=training_label,
                                                        batch_size=BATCH_SIZE,
                                                        shuffle=True,
                                                        num_epochs=None)

# Input function over the eval split:
#   shuffle=False -> do not randomize input data
validation_input_fn = tf.estimator.inputs.pandas_input_fn(x=eval_data,
                                                          y=eval_label,
                                                          batch_size=BATCH_SIZE,
                                                          shuffle=False)

# Input function over the test split, one example per batch:
#   shuffle=False -> do not randomize input data
test_input_fn = tf.estimator.inputs.pandas_input_fn(x=test_data,
                                                    y=test_label,
                                                    batch_size=1,
                                                    shuffle=False)

# Previously `eval_input_fn` was assigned twice in this cell, so the function
# over the eval split was silently discarded. Each function now has its own
# name. `eval_input_fn` is kept pointing at the test split because the cells
# below pass it to evaluate()/predict() and compare the predictions with
# `test_label`.
eval_input_fn = test_input_fn

## Defining a Linear Estimator

In [100]:
# Feature columns tell the model how to interpret each input. Every column
# name must match the name of a series in the dataframe.

# @1.2.0 tf.feature_column.numeric_column -> tf.contrib.layers.real_valued_column
horsepower, width, height, length = [
    real_valued_column(column_name)
    for column_name in ('horsepower', 'width', 'height', 'length')
]

# @1.2.0 tf.feature_column.categorical_column_with_hash_bucket -> tf.contrib.layers.sparse_column_with_hash_bucket
make = sparse_column_with_hash_bucket('make', hash_bucket_size=50)

# @1.2.0 tf.feature_column.categorical_column_with_vocabulary_list -> tf.contrib.layers.sparse_column_with_keys
# NOTE: `fuel_type` is defined here but is not part of `linear_features` below.
fuel_type = sparse_column_with_keys('fuel-type', keys=['diesel', 'gas'])
num_of_doors = sparse_column_with_keys('num-of-doors', keys=['two', 'four'])
num_of_cylinders = sparse_column_with_keys(
    'num-of-cylinders',
    keys=['eight', 'five', 'four', 'six', 'three', 'twelve', 'two'],
)

linear_features = [
    horsepower,
    make,
    num_of_doors,
    num_of_cylinders,
    length,
    width,
    height,
]

In [101]:
# Linear model over `linear_features`; its files are written under
# tensorboard/linear_regressor/.
regressor = tf.contrib.learn.LinearRegressor(
    feature_columns=linear_features,
    model_dir='tensorboard/linear_regressor/',
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_steps': None, '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1
}
, '_task_id': 0, '_keep_checkpoint_max': 5, '_save_checkpoints_secs': 600, '_session_config': None, '_model_dir': 'tensorboard/linear_regressor/', '_master': '', '_tf_random_seed': None, '_environment': 'local', '_num_ps_replicas': 0, '_num_worker_replicas': 0, '_task_type': None, '_keep_checkpoint_every_n_hours': 10000, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7faf67544b70>, '_save_summary_steps': 100, '_evaluation_master': '', '_is_chief': True}


## Training

In [102]:
# Train the linear model for 10000 steps. The call is the cell's last
# expression, so the notebook displays the value it returns.
regressor.fit(
    input_fn=training_input_fn,
    steps=10000,
)

Instructions for updating:
Please switch to tf.summary.scalar. Note that tf.summary.scalar uses the node name instead of the tag. This means that TensorFlow will automatically de-duplicate summary names based on the scope they are created in. Also, passing a tensor or list of tags to a scalar summary op is no longer supported.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into tensorboard/linear_regressor/model.ckpt.
INFO:tensorflow:loss = 2.59493e+08, step = 1
INFO:tensorflow:global_step/sec: 513.702
INFO:tensorflow:loss = 2.42048e+08, step = 101 (0.198 sec)
INFO:tensorflow:global_step/sec: 744.834
INFO:tensorflow:loss = 2.35011e+08, step = 201 (0.135 sec)
INFO:tensorflow:global_step/sec: 687.575
INFO:tensorflow:loss = 2.44327e+08, step = 301 (0.144 sec)
INFO:tensorflow:global_step/sec: 823.258
INFO:tensorflow:loss = 1.79146e+08, step = 401 (0.121 sec)
INFO:tensorflow:global_step/sec: 824.485
INFO:tensorflow:loss = 2.55799e+08, step = 501 (0.120 

INFO:tensorflow:global_step/sec: 774.118
INFO:tensorflow:loss = 3.46903e+08, step = 6901 (0.129 sec)
INFO:tensorflow:global_step/sec: 742.067
INFO:tensorflow:loss = 3.03337e+08, step = 7001 (0.135 sec)
INFO:tensorflow:global_step/sec: 737.658
INFO:tensorflow:loss = 2.33171e+08, step = 7101 (0.136 sec)
INFO:tensorflow:global_step/sec: 746.162
INFO:tensorflow:loss = 2.02141e+08, step = 7201 (0.134 sec)
INFO:tensorflow:global_step/sec: 644.51
INFO:tensorflow:loss = 3.23866e+08, step = 7301 (0.156 sec)
INFO:tensorflow:global_step/sec: 748.617
INFO:tensorflow:loss = 2.31643e+08, step = 7401 (0.132 sec)
INFO:tensorflow:global_step/sec: 758.476
INFO:tensorflow:loss = 2.89519e+08, step = 7501 (0.132 sec)
INFO:tensorflow:global_step/sec: 802.441
INFO:tensorflow:loss = 2.23578e+08, step = 7601 (0.125 sec)
INFO:tensorflow:global_step/sec: 838.834
INFO:tensorflow:loss = 2.81537e+08, step = 7701 (0.119 sec)
INFO:tensorflow:global_step/sec: 841.294
INFO:tensorflow:loss = 1.60254e+08, step = 7801 (0.

LinearRegressor(params={'joint_weights': False, 'gradient_clip_norm': None, 'feature_columns': [_RealValuedColumn(column_name='horsepower', dimension=1, default_value=None, dtype=tf.float32, normalizer=None), _SparseColumnHashed(column_name='make', is_integerized=False, bucket_size=50, lookup_config=None, combiner='sum', dtype=tf.string), _SparseColumnKeys(column_name='num-of-doors', is_integerized=False, bucket_size=None, lookup_config=_SparseIdLookupConfig(vocabulary_file=None, keys=('two', 'four'), num_oov_buckets=0, vocab_size=2, default_value=-1), combiner='sum', dtype=tf.string), _SparseColumnKeys(column_name='num-of-cylinders', is_integerized=False, bucket_size=None, lookup_config=_SparseIdLookupConfig(vocabulary_file=None, keys=('eight', 'five', 'four', 'six', 'three', 'twelve', 'two'), num_oov_buckets=0, vocab_size=7, default_value=-1), combiner='sum', dtype=tf.string), _RealValuedColumn(column_name='length', dimension=1, default_value=None, dtype=tf.float32, normalizer=None),

## Evaluating

In [103]:
# Evaluate the trained linear model; the returned metrics dict is displayed
# as the cell output.
regressor.evaluate(
    input_fn=eval_input_fn,
)

Instructions for updating:
Please switch to tf.summary.scalar. Note that tf.summary.scalar uses the node name instead of the tag. This means that TensorFlow will automatically de-duplicate summary names based on the scope they are created in. Also, passing a tensor or list of tags to a scalar summary op is no longer supported.
INFO:tensorflow:Starting evaluation at 2017-06-15-14:01:39
INFO:tensorflow:Restoring parameters from tensorboard/linear_regressor/model.ckpt-10000
INFO:tensorflow:Finished evaluation at 2017-06-15-14:01:39
INFO:tensorflow:Saving dict for global step 10000: global_step = 10000, loss = 2.03146e+08


{'global_step': 10000, 'loss': 2.0314605e+08}

## Predicting

In [104]:
# predict() is wrapped in list() so the predictions can be indexed.
predictions = regressor.predict(input_fn=eval_input_fn)
preds = list(predictions)

# Show the first TEST_DATA_SIZE predictions next to the corresponding labels.
for row in range(TEST_DATA_SIZE):
    print('prediction:', preds[row], 'real value:', test_label.iloc[row])

Instructions for updating:
Please switch to predict_scores, or set `outputs` argument.
INFO:tensorflow:Restoring parameters from tensorboard/linear_regressor/model.ckpt-10000
prediction: 142.403 real value: 10698.0
prediction: 159.54 real value: 9988.0
prediction: 160.006 real value: 10898.0
prediction: 159.54 real value: 11248.0
prediction: 262.848 real value: 16558.0
prediction: 262.848 real value: 15998.0
prediction: 251.596 real value: 15690.0
prediction: 251.596 real value: 15750.0
prediction: 58.2873 real value: 7775.0
prediction: 88.861 real value: 7975.0


## Defining a DNN Estimator

In [105]:
# @1.2.0 tf.feature_column.indicator_column -> tf.contrib.layers.one_hot_column(tf.contrib.layers.sparse_column_with_keys(...))

# Numerical features are used directly.
numeric_features = [length, width, height, horsepower]

# Categorical (sparse) features are densified with a one-hot encoding.
categorical_features = [
    one_hot_column(make),
    one_hot_column(num_of_doors),
]

dnn_features = numeric_features + categorical_features

In [107]:
# Fully connected regressor over `dnn_features` with hidden layers of
# 50, 30 and 10 units; its files are written under tensorboard/DNN_regressor/.
dnnregressor = tf.contrib.learn.DNNRegressor(
    feature_columns=dnn_features,
    hidden_units=[50, 30, 10],
    model_dir='tensorboard/DNN_regressor/',
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_steps': None, '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1
}
, '_task_id': 0, '_keep_checkpoint_max': 5, '_save_checkpoints_secs': 600, '_session_config': None, '_model_dir': 'tensorboard/DNN_regressor/', '_master': '', '_tf_random_seed': None, '_environment': 'local', '_num_ps_replicas': 0, '_num_worker_replicas': 0, '_task_type': None, '_keep_checkpoint_every_n_hours': 10000, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7faf6408c9e8>, '_save_summary_steps': 100, '_evaluation_master': '', '_is_chief': True}


## Training

In [108]:
# Train the DNN for 10000 steps on the same training input as the linear model.
dnnregressor.fit(
    input_fn=training_input_fn,
    steps=10000,
)

Instructions for updating:
Please switch to tf.summary.scalar. Note that tf.summary.scalar uses the node name instead of the tag. This means that TensorFlow will automatically de-duplicate summary names based on the scope they are created in. Also, passing a tensor or list of tags to a scalar summary op is no longer supported.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into tensorboard/DNN_regressor/model.ckpt.
INFO:tensorflow:loss = 2.90802e+08, step = 1
INFO:tensorflow:global_step/sec: 643.468
INFO:tensorflow:loss = 1.1641e+07, step = 101 (0.156 sec)
INFO:tensorflow:global_step/sec: 694.261
INFO:tensorflow:loss = 7.98193e+06, step = 201 (0.144 sec)
INFO:tensorflow:global_step/sec: 747.178
INFO:tensorflow:loss = 3.06143e+06, step = 301 (0.134 sec)
INFO:tensorflow:global_step/sec: 746.46
INFO:tensorflow:loss = 4.55193e+06, step = 401 (0.135 sec)
INFO:tensorflow:global_step/sec: 804.846
INFO:tensorflow:loss = 4.13768e+06, step = 501 (0.124 sec)


KeyboardInterrupt: 

## Evaluating

In [80]:
# Evaluate the trained DNN; the returned metrics dict is displayed as the
# cell output.
dnnregressor.evaluate(
    input_fn=eval_input_fn,
)

Instructions for updating:
Please switch to tf.summary.scalar. Note that tf.summary.scalar uses the node name instead of the tag. This means that TensorFlow will automatically de-duplicate summary names based on the scope they are created in. Also, passing a tensor or list of tags to a scalar summary op is no longer supported.
INFO:tensorflow:Starting evaluation at 2017-06-15-00:11:52
INFO:tensorflow:Restoring parameters from /tmp/tmp5u7roxz9/model.ckpt-10000
INFO:tensorflow:Finished evaluation at 2017-06-15-00:11:52
INFO:tensorflow:Saving dict for global step 10000: global_step = 10000, loss = 1.54887e+07


{'global_step': 10000, 'loss': 15488682.0}

## Predicting

In [81]:
# predict() is wrapped in list() so the predictions can be indexed.
predictions = dnnregressor.predict(input_fn=eval_input_fn)
preds = list(predictions)

# Show the first TEST_DATA_SIZE predictions next to the corresponding labels.
for row in range(TEST_DATA_SIZE):
    print('prediction:', preds[row], 'real value:', test_label.iloc[row])

Instructions for updating:
Please switch to predict_scores, or set `outputs` argument.
INFO:tensorflow:Restoring parameters from /tmp/tmp5u7roxz9/model.ckpt-10000
prediction: 9597.67 real value: 10698.0
prediction: 11866.2 real value: 9988.0
prediction: 11514.2 real value: 10898.0
prediction: 11866.2 real value: 11248.0
prediction: 23688.1 real value: 16558.0
prediction: 23688.1 real value: 15998.0
prediction: 22791.1 real value: 15690.0
prediction: 22791.1 real value: 15750.0
prediction: 4272.87 real value: 7775.0
prediction: 6275.6 real value: 7975.0


### Creating an Experiment

In [82]:
# @1.2.0 experiment_fn(run_config, params) - > experiment_fn(output_dir)
def experiment_fn(output_dir):
    """Build the Experiment that bundles an estimator with its inputs.

    Args:
        output_dir: directory handed to the estimator as its `model_dir`.

    Returns:
        A `tf.contrib.learn.Experiment` holding a LinearRegressor over
        `linear_features`, trained for 10000 steps on `training_input_fn`
        and evaluated with `eval_input_fn`.
    """
    # The estimator could be customized here (e.g. for hyperparameter tuning).
    # @1.2.0 config=run_config -> ''
    estimator = tf.contrib.learn.LinearRegressor(
        feature_columns=linear_features,
        model_dir=output_dir,
    )

    # Collect everything needed for training and evaluation.
    return tf.contrib.learn.Experiment(
        estimator=estimator,
        train_input_fn=training_input_fn,
        train_steps=10000,
        eval_input_fn=eval_input_fn,
    )

In [83]:
import shutil

from tensorflow.contrib.learn.python.learn import learn_runner

# @1.2.0 tf.contrib.learn.learn_runner(exp, run_config=tf.contrib.learn.RunConfig(model_dir="/tmp/output_dir")
# -> tf.contrib.learn.python.learn.learn_runner.run(exp, output_dir='/tmp/output_dir')
OUTPUT_DIR = '/tmp/output_dir'

# Remove any previous output directory first (a missing directory is ignored).
shutil.rmtree(OUTPUT_DIR, ignore_errors=True)

# Build the experiment with `experiment_fn` and run it; the result is shown
# as the cell output.
learn_runner.run(experiment_fn, output_dir=OUTPUT_DIR)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_steps': None, '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1
}
, '_task_id': 0, '_keep_checkpoint_max': 5, '_save_checkpoints_secs': 600, '_session_config': None, '_model_dir': '/tmp/output_dir', '_master': '', '_tf_random_seed': None, '_environment': 'local', '_num_ps_replicas': 0, '_num_worker_replicas': 0, '_task_type': None, '_keep_checkpoint_every_n_hours': 10000, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7faf2c6dc5f8>, '_save_summary_steps': 100, '_evaluation_master': '', '_is_chief': True}
Instructions for updating:
Monitors are deprecated. Please use tf.train.SessionRunHook.
Instructions for updating:
Please switch to tf.summary.scalar. Note that tf.summary.scalar uses the node name instead of the tag. This means that TensorFlow will automatically de-duplicate summary names based on the scope they are created in. Also, passing a tensor 

INFO:tensorflow:loss = 2.04795e+08, step = 3301 (0.208 sec)
INFO:tensorflow:global_step/sec: 485.621
INFO:tensorflow:loss = 2.34377e+08, step = 3401 (0.206 sec)
INFO:tensorflow:global_step/sec: 486.705
INFO:tensorflow:loss = 2.8409e+08, step = 3501 (0.205 sec)
INFO:tensorflow:global_step/sec: 487.064
INFO:tensorflow:loss = 2.64078e+08, step = 3601 (0.205 sec)
INFO:tensorflow:global_step/sec: 479.578
INFO:tensorflow:loss = 1.93784e+08, step = 3701 (0.209 sec)
INFO:tensorflow:global_step/sec: 477.774
INFO:tensorflow:loss = 2.8074e+08, step = 3801 (0.209 sec)
INFO:tensorflow:global_step/sec: 485.669
INFO:tensorflow:loss = 2.33637e+08, step = 3901 (0.206 sec)
INFO:tensorflow:global_step/sec: 486.921
INFO:tensorflow:loss = 1.72349e+08, step = 4001 (0.205 sec)
INFO:tensorflow:global_step/sec: 485.449
INFO:tensorflow:loss = 2.2439e+08, step = 4101 (0.206 sec)
INFO:tensorflow:global_step/sec: 485.478
INFO:tensorflow:loss = 3.11015e+08, step = 4201 (0.206 sec)
INFO:tensorflow:global_step/sec: 4

INFO:tensorflow:Evaluation [3/100]
INFO:tensorflow:Evaluation [4/100]
INFO:tensorflow:Evaluation [5/100]
INFO:tensorflow:Evaluation [6/100]
INFO:tensorflow:Evaluation [7/100]
INFO:tensorflow:Evaluation [8/100]
INFO:tensorflow:Evaluation [9/100]
INFO:tensorflow:Evaluation [10/100]
INFO:tensorflow:Evaluation [11/100]
INFO:tensorflow:Evaluation [12/100]
INFO:tensorflow:Evaluation [13/100]
INFO:tensorflow:Evaluation [14/100]
INFO:tensorflow:Evaluation [15/100]
INFO:tensorflow:Evaluation [16/100]
INFO:tensorflow:Evaluation [17/100]
INFO:tensorflow:Evaluation [18/100]
INFO:tensorflow:Evaluation [19/100]
INFO:tensorflow:Evaluation [20/100]
INFO:tensorflow:Evaluation [21/100]
INFO:tensorflow:Evaluation [22/100]
INFO:tensorflow:Evaluation [23/100]
INFO:tensorflow:Evaluation [24/100]
INFO:tensorflow:Evaluation [25/100]
INFO:tensorflow:Evaluation [26/100]
INFO:tensorflow:Evaluation [27/100]
INFO:tensorflow:Evaluation [28/100]
INFO:tensorflow:Evaluation [29/100]
INFO:tensorflow:Evaluation [30/100]

({'global_step': 10000, 'loss': 2.0313659e+08}, [])