In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [2]:
import os
import collections
import itertools

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [4]:
from six.moves import urllib

In [6]:
URL_PATH = 'https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data'
DOWNLOADED_FILENAME = 'automobiles.csv'

def download_data():
  """Download the dataset to DOWNLOADED_FILENAME unless it already exists.

  The file is fetched from URL_PATH only when no file named
  DOWNLOADED_FILENAME is present in the working directory. No content
  verification is performed; the status messages are printed on every call,
  whether or not a download took place.

  Raises:
    Whatever ``urllib.request.urlretrieve`` raises on failure. Any partially
    written file is removed first so that a later call retries the download.
  """
  if not os.path.exists(DOWNLOADED_FILENAME):
    try:
      urllib.request.urlretrieve(URL_PATH, DOWNLOADED_FILENAME)
    except Exception:
      # A truncated file would satisfy the existence check above on the next
      # run and silently be used as if it were the complete dataset.
      if os.path.exists(DOWNLOADED_FILENAME):
        os.remove(DOWNLOADED_FILENAME)
      raise
  print('Found and verified file from this path: ', URL_PATH)
  print('Downloaded file: ', DOWNLOADED_FILENAME)

download_data()

Found and verified file from this path:  https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data
Downloaded file:  automobiles.csv


In [7]:
# Schema of the CSV file: column name -> dtype.
# The insertion order matters because the keys are also used as the column
# names when the file is read.
COLUMN_TYPES = collections.OrderedDict([
  # risk rating and make/body description
  ('symboling', int),
  ('normalized-losses', float),
  ('make', str),
  ('fuel-type', str),
  ('aspiration', str),
  ('num-of-doors', str),
  ('body-style', str),
  ('drive-wheels', str),
  ('engine-location', str),
  # dimensions and weight
  ('wheel-base', float),
  ('length', float),
  ('width', float),
  ('height', float),
  ('curb-weight', float),
  # engine
  ('engine-type', str),
  ('num-of-cylinders', str),
  ('engine-size', float),
  ('fuel-system', str),
  ('bore', float),
  ('stroke', float),
  ('compression-ratio', float),
  ('horsepower', float),
  ('peak-rpm', float),
  # consumption and price
  ('city-mpg', float),
  ('highway-mpg', float),
  ('price', float),
])

In [8]:
# Read the CSV, taking column names and dtypes from COLUMN_TYPES and treating
# '?' as a missing value.
# list(...) materialises the key view: on Python 3 some pandas releases reject
# a dict_keys object passed as `names`.
df = pd.read_csv(
  DOWNLOADED_FILENAME,
  names=list(COLUMN_TYPES.keys()),
  dtype=COLUMN_TYPES,
  na_values='?'
)

In [9]:
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130.0,mpfi,3.47,2.68,9.0,111.0,5000.0,21.0,27.0,13495.0
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130.0,mpfi,3.47,2.68,9.0,111.0,5000.0,21.0,27.0,16500.0
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152.0,mpfi,2.68,3.47,9.0,154.0,5000.0,19.0,26.0,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109.0,mpfi,3.19,3.4,10.0,102.0,5500.0,24.0,30.0,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136.0,mpfi,3.19,3.4,8.0,115.0,5500.0,18.0,22.0,17450.0


In [10]:
df.count()

symboling            205
normalized-losses    164
make                 205
fuel-type            205
aspiration           205
num-of-doors         203
body-style           205
drive-wheels         205
engine-location      205
wheel-base           205
length               205
width                205
height               205
curb-weight          205
engine-type          205
num-of-cylinders     205
engine-size          205
fuel-system          205
bore                 201
stroke               201
compression-ratio    205
horsepower           203
peak-rpm             203
city-mpg             205
highway-mpg          205
price                201
dtype: int64

In [11]:
# Keep only the rows that have no missing value in any column.
# NOTE: this rebinds `df`, so the unfiltered frame is no longer reachable
# after this cell has run.
df = df.dropna()
# Per-column non-null counts; after dropna they are all equal to the row count.
df.count()

symboling            159
normalized-losses    159
make                 159
fuel-type            159
aspiration           159
num-of-doors         159
body-style           159
drive-wheels         159
engine-location      159
wheel-base           159
length               159
width                159
height               159
curb-weight          159
engine-type          159
num-of-cylinders     159
engine-size          159
fuel-system          159
bore                 159
stroke               159
compression-ratio    159
horsepower           159
peak-rpm             159
city-mpg             159
highway-mpg          159
price                159
dtype: int64

In [12]:
# Subset of columns kept for modelling; 'price' (last entry) is the label.
TRIMMED_CSV_COLUMNS = [
  'make',
  'fuel-type',
  'aspiration',
  'num-of-doors',
  'body-style',
  'drive-wheels',
  'curb-weight',
  'engine-type',
  'num-of-cylinders',
  'engine-size',
  'fuel-system',
  'horsepower',
  'peak-rpm',
  'city-mpg',
  'highway-mpg',
  'price',
]

In [13]:
# Restrict the frame to the modelling columns listed in TRIMMED_CSV_COLUMNS.
# NOTE: rebinds `df` again; the dropped columns are not available afterwards.
df = df[TRIMMED_CSV_COLUMNS]
df.head()

Unnamed: 0,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,horsepower,peak-rpm,city-mpg,highway-mpg,price
3,audi,gas,std,four,sedan,fwd,2337.0,ohc,four,109.0,mpfi,102.0,5500.0,24.0,30.0,13950.0
4,audi,gas,std,four,sedan,4wd,2824.0,ohc,five,136.0,mpfi,115.0,5500.0,18.0,22.0,17450.0
6,audi,gas,std,four,sedan,fwd,2844.0,ohc,five,136.0,mpfi,110.0,5500.0,19.0,25.0,17710.0
8,audi,gas,turbo,four,sedan,fwd,3086.0,ohc,five,131.0,mpfi,140.0,5500.0,17.0,20.0,23875.0
10,bmw,gas,std,two,sedan,rwd,2395.0,ohc,four,108.0,mpfi,101.0,5800.0,23.0,29.0,16430.0


In [17]:
Y_NAME = 'price'
def get_training_test_prediction_data(df, seed=None):
  """Split ``df`` into training, test and prediction (features, labels) pairs.

  * training: a random 80% sample of the rows;
  * test: every row not selected for training;
  * prediction: a random 20% sample of the *test* rows (so it is a subset of
    the test split, not a third disjoint split).

  The label column ``Y_NAME`` is removed from each feature frame and returned
  separately. ``df`` itself is not modified. The test split is computed by
  dropping the training index labels, so ``df`` is expected to have a unique
  index.

  Args:
    df: DataFrame containing the feature columns and the ``Y_NAME`` column.
    seed: optional integer seed for a reproducible split. When ``None``
      (default) the sampling draws from NumPy's global random state, so a
      prior ``np.random.seed(...)`` call by the caller is honoured.

  Returns:
    ``(x_train, y_train), (x_test, y_test), (x_predict, y_predict)``.
  """
  # A private generator keeps a seeded split from touching the global RNG.
  # The previous implementation called np.random.seed(None), which re-seeded
  # the global generator from OS entropy and discarded any seed set earlier.
  rng = None if seed is None else np.random.RandomState(seed)

  x_train = df.sample(frac=0.8, random_state=rng)
  x_test = df.drop(x_train.index)
  x_predict = x_test.sample(frac=0.2, random_state=rng)

  # pop() removes the label column from the feature frame and returns it.
  y_train = x_train.pop(Y_NAME)
  y_test = x_test.pop(Y_NAME)
  y_predict = x_predict.pop(Y_NAME)

  return (x_train, y_train), (x_test, y_test), (x_predict, y_predict)

In [18]:
# Materialise the three splits; each one is a (features, labels) pair.
(x_train, y_train), (x_test, y_test), (x_predict, y_predict) = get_training_test_prediction_data(df)

In [20]:
x_train.head()

Unnamed: 0,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,horsepower,peak-rpm,city-mpg,highway-mpg
37,honda,gas,std,two,hatchback,fwd,2236.0,ohc,four,110.0,1bbl,86.0,5800.0,27.0,33.0
99,nissan,gas,std,four,hatchback,fwd,2324.0,ohc,four,120.0,2bbl,97.0,5200.0,27.0,34.0
10,bmw,gas,std,two,sedan,rwd,2395.0,ohc,four,108.0,mpfi,101.0,5800.0,23.0,29.0
33,honda,gas,std,two,hatchback,fwd,1940.0,ohc,four,92.0,1bbl,76.0,6000.0,30.0,34.0
32,honda,gas,std,two,hatchback,fwd,1837.0,ohc,four,79.0,1bbl,60.0,5500.0,38.0,42.0


In [21]:
# Scale the training and test price labels down by a constant factor so the
# regression targets are small-magnitude values; the model's predictions are
# multiplied back by the same factor further down. y_predict is left unscaled.
# NOTE: these are in-place divisions, so re-running this cell without
# re-creating the splits scales the labels a second time.
PRICE_SCALING_FACTOR = 10000
y_train /= PRICE_SCALING_FACTOR
y_test /= PRICE_SCALING_FACTOR

In [22]:
df['make'].unique()

array(['audi', 'bmw', 'chevrolet', 'dodge', 'honda', 'jaguar', 'mazda',
       'mercedes-benz', 'mitsubishi', 'nissan', 'peugot', 'plymouth',
       'porsche', 'saab', 'subaru', 'toyota', 'volkswagen', 'volvo'],
      dtype=object)

In [23]:
# Numeric features: one numeric_column per continuous-valued DataFrame column.
# The key passed to each call is the DataFrame column name.
curb_weight = tf.feature_column.numeric_column('curb-weight')
engine_size = tf.feature_column.numeric_column('engine-size')
horsepower = tf.feature_column.numeric_column('horsepower')
peak_rpm = tf.feature_column.numeric_column('peak-rpm')
city_mpg = tf.feature_column.numeric_column('city-mpg')
highway_mpg = tf.feature_column.numeric_column('highway-mpg')

In [24]:
def _vocabulary_column(key):
  """Categorical column for ``key`` whose vocabulary is the distinct values in ``df[key]``."""
  return tf.feature_column.categorical_column_with_vocabulary_list(
    key=key,
    vocabulary_list=df[key].unique()
  )

# Categorical features with a vocabulary taken from the values present in `df`.
body_style = _vocabulary_column('body-style')
fuel_type = _vocabulary_column('fuel-type')
aspiration = _vocabulary_column('aspiration')
num_of_doors = _vocabulary_column('num-of-doors')
drive_wheels = _vocabulary_column('drive-wheels')
engine_type = _vocabulary_column('engine-type')
num_of_cylinders = _vocabulary_column('num-of-cylinders')
fuel_system = _vocabulary_column('fuel-system')

In [25]:
# 'make' is hashed into 50 buckets instead of being given an explicit vocabulary.
make = tf.feature_column.categorical_column_with_hash_bucket(key='make', hash_bucket_size=50)

In [26]:
# Complete list of model inputs passed to the estimator.
feature_columns = [
  # Numeric columns are used as they are.
  curb_weight, engine_size, horsepower, peak_rpm, city_mpg, highway_mpg,
  # body-style is wrapped in an indicator column; every other categorical
  # column (including the hashed 'make') goes through a 3-dimensional embedding.
  tf.feature_column.indicator_column(body_style),
  tf.feature_column.embedding_column(fuel_type, dimension=3),
  tf.feature_column.embedding_column(aspiration, dimension=3),
  tf.feature_column.embedding_column(num_of_doors, dimension=3),
  tf.feature_column.embedding_column(drive_wheels, dimension=3),
  tf.feature_column.embedding_column(engine_type, dimension=3),
  tf.feature_column.embedding_column(num_of_cylinders, dimension=3),
  tf.feature_column.embedding_column(fuel_system, dimension=3),
  tf.feature_column.embedding_column(make, dimension=3),
]

In [27]:
def input_fn(x_data, y_data, num_epochs, shuffle, batch_size=64):
  """Build an Estimator input function backed by pandas objects.

  Args:
    x_data: feature data, forwarded as ``x`` to ``pandas_input_fn``.
    y_data: label data, forwarded as ``y`` to ``pandas_input_fn``.
    num_epochs: number of passes over the data, forwarded unchanged
      (callers in this notebook pass ``None`` for training and ``1`` otherwise).
    shuffle: whether the rows are shuffled, forwarded unchanged.
    batch_size: rows per batch. Defaults to 64, the value that used to be
      hard-coded, so existing four-argument calls behave as before.

  Returns:
    The callable returned by ``tf.estimator.inputs.pandas_input_fn``.
  """
  return tf.estimator.inputs.pandas_input_fn(
    x=x_data,
    y=y_data,
    batch_size=batch_size,
    num_epochs=num_epochs,
    shuffle=shuffle
  )

In [37]:
# DNN regressor with three hidden layers of 24, 16 and 24 units, fed by the
# feature columns defined above. No model_dir or config is passed, so the
# estimator's defaults are used.
model = tf.estimator.DNNRegressor(hidden_units=[24,16,24], feature_columns=feature_columns)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_task_type': 'worker', '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f442a7456d0>, '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_num_ps_replicas': 0, '_tf_random_seed': None, '_master': '', '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_model_dir': '/tmp/tmpNQZFk8', '_save_summary_steps': 100}


In [29]:
# Train on the shuffled training split. num_epochs=None places no epoch limit
# on the input, so the run length is set by steps=10000.
model.train(input_fn=input_fn(x_train,y_train,num_epochs=None,shuffle=True),steps=10000)

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmp7EMkNG/model.ckpt.
INFO:tensorflow:loss = 14579608.0, step = 1
INFO:tensorflow:global_step/sec: 134.634
INFO:tensorflow:loss = 167.11539, step = 101 (0.750 sec)
INFO:tensorflow:global_step/sec: 175.068
INFO:tensorflow:loss = 91.90479, step = 201 (0.569 sec)
INFO:tensorflow:global_step/sec: 182.113
INFO:tensorflow:loss = 74.39903, step = 301 (0.549 sec)
INFO:tensorflow:global_step/sec: 181.969
INFO:tensorflow:loss = 36.154457, step = 401 (0.552 sec)
INFO:tensorflow:global_step/sec: 162.519
INFO:tensorflow:loss = 32.81778, step = 501 (0.615 sec)
INFO:tensorflow:global_step/sec: 171.623
INFO:tensorflow:loss = 21.875633, step = 601 (0.580 sec)
INFO:tensorflow:global_step/sec: 168.288
INFO:tensorflow:loss = 34.18802, step = 701 (0.594 sec)
INFO:tensorflow:global_step/sec: 185.594
INFO:tensorflow:loss = 25.758102, step = 801 (0.539 sec)
INFO:tensorflow:global_step/sec: 183.294
INFO:tensorflow:lo

<tensorflow.python.estimator.canned.dnn.DNNRegressor at 0x7f442ae6bad0>

In [30]:
# Evaluate on the test split: a single unshuffled pass over the data. The test
# labels were scaled by PRICE_SCALING_FACTOR earlier, so the reported losses
# are in scaled units, not in raw prices.
results = model.evaluate(input_fn=input_fn(x_test,y_test,num_epochs=1,shuffle=False))

INFO:tensorflow:Starting evaluation at 2018-05-07-23:56:26
INFO:tensorflow:Restoring parameters from /tmp/tmp7EMkNG/model.ckpt-10000
INFO:tensorflow:Finished evaluation at 2018-05-07-23:56:26
INFO:tensorflow:Saving dict for global step 10000: average_loss = 0.15872158, global_step = 10000, loss = 5.0790906


In [31]:
# Run inference on the prediction split (single unshuffled pass).
predict_results = model.predict(input_fn=input_fn(x_predict,y_predict,num_epochs=1,shuffle=False))
# Materialise at most len(x_predict) results from the returned iterable.
predictions = list(itertools.islice(predict_results, len(x_predict)))

INFO:tensorflow:Restoring parameters from /tmp/tmp7EMkNG/model.ckpt-10000


In [32]:
predictions

[{'predictions': array([1.8309134], dtype=float32)},
 {'predictions': array([0.6049537], dtype=float32)},
 {'predictions': array([1.7558783], dtype=float32)},
 {'predictions': array([1.3926734], dtype=float32)},
 {'predictions': array([0.87404245], dtype=float32)},
 {'predictions': array([0.8393974], dtype=float32)}]

In [34]:
# Undo the label scaling: take the single value from each result's
# 'predictions' array and multiply it back by PRICE_SCALING_FACTOR.
predicted_prices = [obj['predictions'][0]*PRICE_SCALING_FACTOR for obj in predictions]

In [35]:
predicted_prices

[18309.134244918823,
 6049.5370626449585,
 17558.783292770386,
 13926.73373222351,
 8740.424513816833,
 8393.973708152771]

In [36]:
y_predict

169     9989.0
120     6229.0
171    11549.0
6      17710.0
158     7898.0
34      7129.0
Name: price, dtype: float64