In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [2]:
import os
import collections
import itertools

In [3]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [4]:
from six.moves import urllib

In [5]:
# Report the version of each numerical library imported above, one per line.
for library in (np, pd, tf):
    print(library.__version__)

1.13.3
0.20.1
1.4.1


In [6]:
URL_PATH = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"

DOWNLOADED_FILENAME = "automobiles.csv"

def download_data():
    """Fetch URL_PATH into DOWNLOADED_FILENAME unless that file already exists.

    The data is first written to ``DOWNLOADED_FILENAME + ".part"`` and only
    renamed to its final name once the transfer has finished. A failed or
    interrupted download therefore never leaves a truncated file that the
    existence check would accept as a complete dataset on the next run.

    Both status lines are printed on every successful call, whether or not a
    download actually took place.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises when the transfer
        fails; the partial file is removed before the error propagates.
    """
    if not os.path.exists(DOWNLOADED_FILENAME):
        partial_filename = DOWNLOADED_FILENAME + ".part"
        try:
            urllib.request.urlretrieve(URL_PATH, partial_filename)
            os.rename(partial_filename, DOWNLOADED_FILENAME)
        finally:
            # After a successful rename the partial file is gone and this is
            # a no-op; after a failure it discards the incomplete data.
            if os.path.exists(partial_filename):
                os.remove(partial_filename)

    print('Found and verified file from this path: ', URL_PATH)
    print('Downloaded file: ', DOWNLOADED_FILENAME)

In [7]:
download_data()

Found and verified file from this path:  https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data
Downloaded file:  automobiles.csv


In [8]:
# (column name, Python type) pairs. The order is significant: the keys of the
# resulting mapping are used as the positional column names when the CSV is
# read, and the mapping itself is passed as the dtype specification.
_COLUMN_SPECS = (
    ("symboling", int),
    ("normalized-losses", float),
    ("make", str),
    ("fuel-type", str),
    ("aspiration", str),
    ("num-of-doors", str),
    ("body-style", str),
    ("drive-wheels", str),
    ("engine-location", str),
    ("wheel-base", float),
    ("length", float),
    ("width", float),
    ("height", float),
    ("curb-weight", float),
    ("engine-type", str),
    ("num-of-cylinders", str),
    ("engine-size", float),
    ("fuel-system", str),
    ("bore", float),
    ("stroke", float),
    ("compression-ratio", float),
    ("horsepower", float),
    ("peak-rpm", float),
    ("city-mpg", float),
    ("highway-mpg", float),
    ("price", float),
)

COLUMN_TYPES = collections.OrderedDict(_COLUMN_SPECS)

In [9]:
# Read the downloaded CSV using the column names and dtypes declared in
# COLUMN_TYPES; "?" entries are parsed as missing values.
# The names are passed as a real list: on Python 3 `COLUMN_TYPES.keys()` is a
# set-like view, which some pandas releases reject with
# "Names should be an ordered collection." A list works everywhere.
df = pd.read_csv(DOWNLOADED_FILENAME, names=list(COLUMN_TYPES),
                 dtype=COLUMN_TYPES, na_values="?")

In [10]:
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130.0,mpfi,3.47,2.68,9.0,111.0,5000.0,21.0,27.0,13495.0
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130.0,mpfi,3.47,2.68,9.0,111.0,5000.0,21.0,27.0,16500.0
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152.0,mpfi,2.68,3.47,9.0,154.0,5000.0,19.0,26.0,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109.0,mpfi,3.19,3.4,10.0,102.0,5500.0,24.0,30.0,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136.0,mpfi,3.19,3.4,8.0,115.0,5500.0,18.0,22.0,17450.0


In [11]:
df.count()

symboling            205
normalized-losses    164
make                 205
fuel-type            205
aspiration           205
num-of-doors         203
body-style           205
drive-wheels         205
engine-location      205
wheel-base           205
length               205
width                205
height               205
curb-weight          205
engine-type          205
num-of-cylinders     205
engine-size          205
fuel-system          205
bore                 201
stroke               201
compression-ratio    205
horsepower           203
peak-rpm             203
city-mpg             205
highway-mpg          205
price                201
dtype: int64

In [12]:
# Discard every row that has a missing value in any column. `df` is rebound,
# so the unfiltered frame is no longer reachable under this name.
df = df.dropna()

In [13]:
df.count()

symboling            159
normalized-losses    159
make                 159
fuel-type            159
aspiration           159
num-of-doors         159
body-style           159
drive-wheels         159
engine-location      159
wheel-base           159
length               159
width                159
height               159
curb-weight          159
engine-type          159
num-of-cylinders     159
engine-size          159
fuel-system          159
bore                 159
stroke               159
compression-ratio    159
horsepower           159
peak-rpm             159
city-mpg             159
highway-mpg          159
price                159
dtype: int64

In [14]:
# Subset of columns kept for modelling, in the order they should appear.
# "price" comes last.
TRIMMED_CSV_COLUMNS = [
    "make",
    "fuel-type",
    "aspiration",
    "num-of-doors",
    "body-style",
    "drive-wheels",
    "curb-weight",
    "engine-type",
    "num-of-cylinders",
    "engine-size",
    "fuel-system",
    "horsepower",
    "peak-rpm",
    "city-mpg",
    "highway-mpg",
    "price",
]

In [15]:
# Keep only the columns listed in TRIMMED_CSV_COLUMNS (in that order).
df = df[TRIMMED_CSV_COLUMNS]

In [16]:
df.head()

Unnamed: 0,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,horsepower,peak-rpm,city-mpg,highway-mpg,price
3,audi,gas,std,four,sedan,fwd,2337.0,ohc,four,109.0,mpfi,102.0,5500.0,24.0,30.0,13950.0
4,audi,gas,std,four,sedan,4wd,2824.0,ohc,five,136.0,mpfi,115.0,5500.0,18.0,22.0,17450.0
6,audi,gas,std,four,sedan,fwd,2844.0,ohc,five,136.0,mpfi,110.0,5500.0,19.0,25.0,17710.0
8,audi,gas,turbo,four,sedan,fwd,3086.0,ohc,five,131.0,mpfi,140.0,5500.0,17.0,20.0,23875.0
10,bmw,gas,std,two,sedan,rwd,2395.0,ohc,four,108.0,mpfi,101.0,5800.0,23.0,29.0,16430.0


In [17]:
Y_NAME = "price"

def get_training_test_prediction_data(df, seed=None):
    """Split ``df`` into training, test and prediction (features, labels) pairs.

    80% of the rows are sampled as the training set and the remaining rows
    form the test set. 20% of the test rows are additionally sampled as a
    small prediction set, so the prediction rows are a subset of the test
    rows. The ``Y_NAME`` column is popped out of each subset and returned as
    that subset's label Series. ``df`` itself is not modified.

    Args:
        df: DataFrame holding the feature columns plus the ``Y_NAME`` column.
        seed: Optional seed for NumPy's global random number generator. With
            the default ``None`` the generator is reseeded unpredictably and
            each call produces a different split (the original behaviour).
            Pass an integer to get the same split on every call.

    Returns:
        ``((x_train, y_train), (x_test, y_test), (x_predict, y_predict))``.
    """
    # Both sample() calls below use random_state=None, i.e. NumPy's global
    # generator, so seeding it here controls the whole split. Note that this
    # mutates global NumPy random state.
    np.random.seed(seed)

    # Split the data into train/test subsets.
    x_train = df.sample(frac=0.8, random_state=None)

    # Everything that was not drawn for training becomes the test set.
    x_test = df.drop(x_train.index)

    # Choose a small sample from the test data for prediction. This is taken
    # before the label is popped so the sample still carries Y_NAME.
    x_predict = x_test.sample(frac=0.2, random_state=None)

    # Extract the label from each features DataFrame.
    y_train = x_train.pop(Y_NAME)
    y_test = x_test.pop(Y_NAME)
    y_predict = x_predict.pop(Y_NAME)

    return (x_train, y_train), (x_test, y_test), (x_predict, y_predict)

In [19]:
# Split once, then unpack each (features, labels) pair into its own names.
train_pair, test_pair, predict_pair = get_training_test_prediction_data(df)
x_train, y_train = train_pair
x_test, y_test = test_pair
x_predict, y_predict = predict_pair

In [20]:
x_train.head()

Unnamed: 0,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,horsepower,peak-rpm,city-mpg,highway-mpg
53,mazda,gas,std,four,sedan,fwd,1945.0,ohc,four,91.0,2bbl,68.0,5000.0,31.0,38.0
133,saab,gas,std,four,sedan,fwd,2695.0,ohc,four,121.0,mpfi,110.0,5250.0,21.0,28.0
64,mazda,gas,std,four,hatchback,fwd,2425.0,ohc,four,122.0,2bbl,84.0,4800.0,26.0,32.0
187,volkswagen,diesel,turbo,four,sedan,fwd,2319.0,ohc,four,97.0,idi,68.0,4500.0,37.0,42.0
12,bmw,gas,std,two,sedan,rwd,2710.0,ohc,six,164.0,mpfi,121.0,4250.0,21.0,28.0


In [21]:
y_train.head()

53      6695.0
133    12170.0
64     11245.0
187     9495.0
12     20970.0
Name: price, dtype: float64

In [22]:
# Divisor applied to the price labels before training. The same factor is
# multiplied back in later when errors and predictions are reported.
PRICE_SCALING_FACTOR = 10000

# NOTE: these are in-place divisions, so this cell is not idempotent.
# Re-running it without re-creating the split divides the already-scaled
# labels a second time.
# y_predict is not scaled here; it stays in the original price units.
y_train /= PRICE_SCALING_FACTOR
y_test /= PRICE_SCALING_FACTOR

In [23]:
y_train.head()

53     0.6695
133    1.2170
64     1.1245
187    0.9495
12     2.0970
Name: price, dtype: float64

In [24]:
df['make'].unique()

array(['audi', 'bmw', 'chevrolet', 'dodge', 'honda', 'jaguar', 'mazda',
       'mercedes-benz', 'mitsubishi', 'nissan', 'peugot', 'plymouth',
       'porsche', 'saab', 'subaru', 'toyota', 'volkswagen', 'volvo'], dtype=object)

In [25]:
df['fuel-type'].unique()

array(['gas', 'diesel'], dtype=object)

In [26]:
df['aspiration'].unique()

array(['std', 'turbo'], dtype=object)

In [27]:
df['num-of-doors'].unique()

array(['four', 'two'], dtype=object)

In [28]:
df['body-style'].unique()

array(['sedan', 'hatchback', 'wagon', 'hardtop', 'convertible'], dtype=object)

In [29]:
df['drive-wheels'].unique()

array(['fwd', '4wd', 'rwd'], dtype=object)

In [30]:
df['engine-type'].unique()

array(['ohc', 'l', 'dohc', 'ohcv', 'ohcf'], dtype=object)

In [31]:
df['num-of-cylinders'].unique()

array(['four', 'five', 'six', 'three', 'eight'], dtype=object)

In [32]:
df['fuel-system'].unique()

array(['mpfi', '2bbl', 'mfi', '1bbl', 'idi', 'spdi'], dtype=object)

In [33]:
# One numeric feature column per continuous input, keyed by the DataFrame
# column name. The target names and the keys are listed in matching order.
(curb_weight, engine_size, horsepower,
 peak_rpm, city_mpg, highway_mpg) = map(
    tf.feature_column.numeric_column,
    ("curb-weight", "engine-size", "horsepower",
     "peak-rpm", "city-mpg", "highway-mpg"))

In [34]:
def _vocabulary_column(key):
    """Return a categorical column for ``key`` whose vocabulary is the set of
    distinct values found in ``df[key]``."""
    return tf.feature_column.categorical_column_with_vocabulary_list(
        key=key, vocabulary_list=df[key].unique())


# Categorical inputs with a small, known set of values.
body_style = _vocabulary_column("body-style")
fuel_type = _vocabulary_column("fuel-type")
aspiration = _vocabulary_column("aspiration")
num_of_doors = _vocabulary_column("num-of-doors")
drive_wheels = _vocabulary_column("drive-wheels")
engine_type = _vocabulary_column("engine-type")
num_of_cylinders = _vocabulary_column("num-of-cylinders")
fuel_system = _vocabulary_column("fuel-system")

In [35]:
# Unlike the vocabulary-based columns, "make" is mapped into 50 hash buckets
# instead of an explicit list of values.
make = tf.feature_column.categorical_column_with_hash_bucket(
      key="make", hash_bucket_size=50)

In [36]:
# Continuous inputs are used as they are.
_numeric_inputs = [
    curb_weight, engine_size, horsepower, peak_rpm, city_mpg, highway_mpg,
]

# Categorical inputs that each get a 3-dimensional embedding.
_embedded_inputs = (
    fuel_type, aspiration, num_of_doors, drive_wheels,
    engine_type, num_of_cylinders, fuel_system,
)

# Final column order: numeric inputs, the body-style indicator column, the
# 3-dimensional embeddings, then a 4-dimensional embedding for the hashed make.
feature_columns = (
    _numeric_inputs
    + [tf.feature_column.indicator_column(body_style)]
    + [tf.feature_column.embedding_column(_column, dimension=3)
       for _column in _embedded_inputs]
    + [tf.feature_column.embedding_column(make, dimension=4)]
)

In [37]:
def input_fn(x_data, y_data, num_epochs, shuffle, batch_size=64):
    """Build an estimator input function over pandas data.

    Thin wrapper around ``tf.estimator.inputs.pandas_input_fn``. The result
    is what this notebook passes as ``input_fn=`` to the model's ``train``,
    ``evaluate`` and ``predict`` calls.

    Args:
        x_data: Features, forwarded as ``x``.
        y_data: Labels, forwarded as ``y``.
        num_epochs: Forwarded unchanged as ``num_epochs``.
        shuffle: Forwarded unchanged as ``shuffle``.
        batch_size: Forwarded as ``batch_size``. Defaults to 64, the value
            that used to be hard-coded, so existing calls behave as before.

    Returns:
        The input function created by ``pandas_input_fn``.
    """
    return tf.estimator.inputs.pandas_input_fn(
          x=x_data,
          y=y_data,
          batch_size=batch_size,
          num_epochs=num_epochs,
          shuffle=shuffle)

In [69]:
# Regressor with hidden layer sizes 24, 16 and 24 over the feature columns
# assembled above. No model_dir is given; the log output below shows
# checkpoints going to a temporary directory.
model = tf.estimator.DNNRegressor(
      hidden_units=[24, 16, 24], feature_columns=feature_columns)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_task_type': 'worker', '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x11e580d90>, '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_num_ps_replicas': 0, '_tf_random_seed': None, '_master': '', '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_model_dir': '/var/folders/yd/1rlyjfk975d3bb98d7_nyt740000gn/T/tmp6FrfSF', '_save_summary_steps': 100}


In [70]:
# Train on the training split with shuffling for 20000 steps. num_epochs is
# None, so the run length is set by `steps` rather than by an epoch count.
model.train(input_fn=input_fn(x_train, y_train, num_epochs=None, shuffle=True), steps=20000)

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /var/folders/yd/1rlyjfk975d3bb98d7_nyt740000gn/T/tmp6FrfSF/model.ckpt.
INFO:tensorflow:loss = 5.61059e+07, step = 1
INFO:tensorflow:global_step/sec: 158.964
INFO:tensorflow:loss = 1627.55, step = 101 (0.633 sec)
INFO:tensorflow:global_step/sec: 239.004
INFO:tensorflow:loss = 372.439, step = 201 (0.417 sec)
INFO:tensorflow:global_step/sec: 237.694
INFO:tensorflow:loss = 301.739, step = 301 (0.429 sec)
INFO:tensorflow:global_step/sec: 247.267
INFO:tensorflow:loss = 96.4529, step = 401 (0.402 sec)
INFO:tensorflow:global_step/sec: 233.867
INFO:tensorflow:loss = 89.8155, step = 501 (0.426 sec)
INFO:tensorflow:global_step/sec: 248.212
INFO:tensorflow:loss = 82.6057, step = 601 (0.403 sec)
INFO:tensorflow:global_step/sec: 260.098
INFO:tensorflow:loss = 40.5743, step = 701 (0.382 sec)
INFO:tensorflow:global_step/sec: 228.873
INFO:tensorflow:loss = 51.2595, step = 801 (0.432 sec)
INFO:tensorflow:global_ste

<tensorflow.python.estimator.canned.dnn.DNNRegressor at 0x1218dfcd0>

In [71]:
# Evaluate on the held-out test split: one epoch, no shuffling. The labels
# were divided by PRICE_SCALING_FACTOR earlier, so the reported losses are in
# scaled units.
results = model.evaluate(input_fn=input_fn(x_test, y_test, num_epochs=1, shuffle=False))

INFO:tensorflow:Starting evaluation at 2017-12-21-04:26:07
INFO:tensorflow:Restoring parameters from /var/folders/yd/1rlyjfk975d3bb98d7_nyt740000gn/T/tmp6FrfSF/model.ckpt-20000
INFO:tensorflow:Finished evaluation at 2017-12-21-04:26:08
INFO:tensorflow:Saving dict for global step 20000: average_loss = 0.145189, global_step = 20000, loss = 4.64606


In [72]:
# Print every evaluation metric as "name: value", ordered by metric name.
for metric_name, metric_value in sorted(results.items()):
    print("%s: %s" % (metric_name, metric_value))


average_loss: 0.145189
global_step: 20000
loss: 4.64606


In [73]:
# Pull out the per-example average loss reported by the evaluation.
average_loss = results["average_loss"]

In [74]:
# Take the square root of the average loss and multiply the price scaling
# back in, so the error is reported in original price units.
rms_error = PRICE_SCALING_FACTOR * average_loss**0.5
print("\nRMS error for the test set: ${:.0f}".format(rms_error))


RMS error for the test set: $3810


In [75]:
len(x_predict), len(y_predict)

(6, 6)

In [76]:
# Request predictions for the small prediction sample: one epoch, no
# shuffling. The log output in the next cell suggests nothing is computed
# until the result is iterated there.
predict_results = model.predict(input_fn=input_fn(x_predict, y_predict, num_epochs=1, shuffle=False))

In [77]:
# Materialise at most len(x_predict) results from the prediction iterable.
predictions = list(itertools.islice(predict_results, len(x_predict)))

INFO:tensorflow:Restoring parameters from /var/folders/yd/1rlyjfk975d3bb98d7_nyt740000gn/T/tmp6FrfSF/model.ckpt-20000


In [78]:
predictions

[{'predictions': array([ 1.44325781], dtype=float32)},
 {'predictions': array([ 3.06986761], dtype=float32)},
 {'predictions': array([ 0.87775177], dtype=float32)},
 {'predictions': array([ 0.98317474], dtype=float32)},
 {'predictions': array([ 0.42458099], dtype=float32)},
 {'predictions': array([ 1.03918219], dtype=float32)}]

In [79]:
# Each result is a dict whose 'predictions' entry is a one-element array.
# Take that element and multiply the scaling factor back in to get a price in
# the original units.
predicted_prices = [obj['predictions'][0] * PRICE_SCALING_FACTOR for obj in predictions]

In [80]:
predicted_prices

[14432.578086853027,
 30698.676109313965,
 8777.5176763534546,
 9831.7474126815796,
 4245.8099126815796,
 10391.82186126709]

In [81]:
# Work on a copy so the columns added below do not end up in x_predict.
compare_df = x_predict.copy()

In [82]:
compare_df

Unnamed: 0,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,horsepower,peak-rpm,city-mpg,highway-mpg
200,volvo,gas,std,four,sedan,rwd,2952.0,ohc,four,141.0,mpfi,114.0,5400.0,23.0,28.0
68,mercedes-benz,diesel,turbo,four,wagon,rwd,3750.0,ohc,five,183.0,idi,123.0,4350.0,22.0,25.0
42,honda,gas,std,two,sedan,fwd,2293.0,ohc,four,110.0,2bbl,100.0,5500.0,25.0,31.0
186,volkswagen,gas,std,four,sedan,fwd,2275.0,ohc,four,109.0,mpfi,85.0,5250.0,27.0,34.0
165,toyota,gas,std,two,sedan,rwd,2265.0,dohc,four,98.0,mpfi,112.0,6600.0,26.0,29.0
30,honda,gas,std,two,hatchback,fwd,1713.0,ohc,four,92.0,1bbl,58.0,4800.0,49.0,54.0


In [83]:
# y_predict was never divided by PRICE_SCALING_FACTOR, and predicted_prices
# has been scaled back up, so both columns are in the original price units.
# predicted_prices is a plain list, so it is assigned by position. This
# relies on the predictions having been produced with shuffle=False, in the
# same row order as x_predict.
compare_df['actual-price'] = y_predict
compare_df['predicted-price'] = predicted_prices

In [84]:
compare_df

Unnamed: 0,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,horsepower,peak-rpm,city-mpg,highway-mpg,actual-price,predicted-price
200,volvo,gas,std,four,sedan,rwd,2952.0,ohc,four,141.0,mpfi,114.0,5400.0,23.0,28.0,16845.0,14432.578087
68,mercedes-benz,diesel,turbo,four,wagon,rwd,3750.0,ohc,five,183.0,idi,123.0,4350.0,22.0,25.0,28248.0,30698.676109
42,honda,gas,std,two,sedan,fwd,2293.0,ohc,four,110.0,2bbl,100.0,5500.0,25.0,31.0,10345.0,8777.517676
186,volkswagen,gas,std,four,sedan,fwd,2275.0,ohc,four,109.0,mpfi,85.0,5250.0,27.0,34.0,8495.0,9831.747413
165,toyota,gas,std,two,sedan,rwd,2265.0,dohc,four,98.0,mpfi,112.0,6600.0,26.0,29.0,9298.0,4245.809913
30,honda,gas,std,two,hatchback,fwd,1713.0,ohc,four,92.0,1bbl,58.0,4800.0,49.0,54.0,6479.0,10391.821861
