In [32]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [33]:
import os
import collections
import itertools

In [34]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [35]:
from six.moves import urllib

In [36]:
# Report the library versions this notebook is running against.
for _lib in (np, pd, tf):
    print(_lib.__version__)

1.14.0
0.22.0
1.8.0


In [37]:
URL_PATH = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"

DOWNLOADED_FILENAME = "automobiles.csv"

def download_data(url=None, filename=None):
    """Fetch the dataset to a local file unless that file already exists.

    Args:
        url: Address to download from. Defaults to the module-level URL_PATH.
        filename: Local path to save to. Defaults to the module-level
            DOWNLOADED_FILENAME.

    Returns:
        None. The source URL and the local filename are printed.
    """
    # Resolve defaults at call time so later edits to the constants are honoured.
    if url is None:
        url = URL_PATH
    if filename is None:
        filename = DOWNLOADED_FILENAME

    # An existing local copy is reused as-is; its contents are not inspected.
    if not os.path.exists(filename):
        urllib.request.urlretrieve(url, filename)

    print('Found and verified file from this path: ', url)
    print('Downloaded file: ', filename)

In [38]:
download_data()

Found and verified file from this path:  https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data
Downloaded file:  automobiles.csv


In [39]:
# Column name -> dtype, used both as the `names` and the `dtype` argument of
# the read_csv call. The entry order is the order in which names are assigned
# to the file's columns, so it must not be rearranged.
COLUMN_TYPES = collections.OrderedDict([
    ("symboling", int),
    ("normalized-losses", float),
    ("make", str),
    ("fuel-type", str),
    ("aspiration", str),
    ("num-of-doors", str),
    ("body-style", str),
    ("drive-wheels", str),
    ("engine-location", str),
    ("wheel-base", float),
    ("length", float),
    ("width", float),
    ("height", float),
    ("curb-weight", float),
    ("engine-type", str),
    ("num-of-cylinders", str),
    ("engine-size", float),
    ("fuel-system", str),
    ("bore", float),
    ("stroke", float),
    ("compression-ratio", float),
    ("horsepower", float),
    ("peak-rpm", float),
    ("city-mpg", float),
    ("highway-mpg", float),
    ("price", float)
])

In [40]:
# Column names and dtypes both come from COLUMN_TYPES; "?" entries are read
# as missing values. list(...) hands pandas a concrete, ordered list of names
# rather than a dict keys view, which is not a sequence on Python 3.
df = pd.read_csv(DOWNLOADED_FILENAME, names=list(COLUMN_TYPES),
                 dtype=COLUMN_TYPES, na_values="?")

In [41]:
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130.0,mpfi,3.47,2.68,9.0,111.0,5000.0,21.0,27.0,13495.0
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130.0,mpfi,3.47,2.68,9.0,111.0,5000.0,21.0,27.0,16500.0
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152.0,mpfi,2.68,3.47,9.0,154.0,5000.0,19.0,26.0,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109.0,mpfi,3.19,3.4,10.0,102.0,5500.0,24.0,30.0,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136.0,mpfi,3.19,3.4,8.0,115.0,5500.0,18.0,22.0,17450.0


In [42]:
df.count()

symboling            205
normalized-losses    164
make                 205
fuel-type            205
aspiration           205
num-of-doors         203
body-style           205
drive-wheels         205
engine-location      205
wheel-base           205
length               205
width                205
height               205
curb-weight          205
engine-type          205
num-of-cylinders     205
engine-size          205
fuel-system          205
bore                 201
stroke               201
compression-ratio    205
horsepower           203
peak-rpm             203
city-mpg             205
highway-mpg          205
price                201
dtype: int64

In [43]:
# Drop every row that has a missing value in any column.
# NOTE(review): this runs before the column trim further down, so rows are also
# discarded for gaps in columns that are not kept afterwards (for example
# "normalized-losses", which has 164 of 205 values present). Consider
# dropna(subset=...) restricted to the kept columns; confirm what is intended.
df = df.dropna()

In [44]:
df.count()

symboling            159
normalized-losses    159
make                 159
fuel-type            159
aspiration           159
num-of-doors         159
body-style           159
drive-wheels         159
engine-location      159
wheel-base           159
length               159
width                159
height               159
curb-weight          159
engine-type          159
num-of-cylinders     159
engine-size          159
fuel-system          159
bore                 159
stroke               159
compression-ratio    159
horsepower           159
peak-rpm             159
city-mpg             159
highway-mpg          159
price                159
dtype: int64

In [45]:
# Columns kept for modelling: the input features plus the "price" label.
TRIMMED_CSV_COLUMNS = [
    "make", "fuel-type", "aspiration", "num-of-doors", "body-style",
    "drive-wheels", "curb-weight", "engine-type", "num-of-cylinders", "engine-size",
    "fuel-system", "horsepower", "peak-rpm", "city-mpg", "highway-mpg", "price"
]

In [46]:
df = df[TRIMMED_CSV_COLUMNS]

In [47]:
df.head()

Unnamed: 0,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,horsepower,peak-rpm,city-mpg,highway-mpg,price
3,audi,gas,std,four,sedan,fwd,2337.0,ohc,four,109.0,mpfi,102.0,5500.0,24.0,30.0,13950.0
4,audi,gas,std,four,sedan,4wd,2824.0,ohc,five,136.0,mpfi,115.0,5500.0,18.0,22.0,17450.0
6,audi,gas,std,four,sedan,fwd,2844.0,ohc,five,136.0,mpfi,110.0,5500.0,19.0,25.0,17710.0
8,audi,gas,turbo,four,sedan,fwd,3086.0,ohc,five,131.0,mpfi,140.0,5500.0,17.0,20.0,23875.0
10,bmw,gas,std,two,sedan,rwd,2395.0,ohc,four,108.0,mpfi,101.0,5800.0,23.0,29.0,16430.0


In [48]:
# Name of the label column that is split off from the features.
Y_NAME = "price"

def get_training_test_prediction_data(df, seed=None, train_frac=0.8,
                                      predict_frac=0.2):
    """Split a DataFrame into training, test and prediction subsets.

    The training rows are a random sample of ``df``; the test rows are all
    remaining rows; the prediction rows are a random sample drawn from the
    test rows (so they are also part of the test set). ``df`` itself is not
    modified.

    Args:
        df: DataFrame containing the feature columns and the Y_NAME column.
        seed: Optional integer seed. With the default of None a different
            shuffle is produced on every call; pass an integer to make the
            split repeatable.
        train_frac: Fraction of ``df`` sampled into the training set.
        predict_frac: Fraction of the test set sampled for prediction.

    Returns:
        Three ``(features, labels)`` pairs, in the order training, test,
        prediction. Each ``features`` is a DataFrame without the Y_NAME
        column and each ``labels`` is the matching Y_NAME Series.
    """
    # A private generator is used instead of reseeding NumPy's global RNG, so
    # calling this function does not disturb random state set elsewhere.
    rng = np.random.RandomState(seed)

    # Split the data into train/test subsets.
    x_train = df.sample(frac=train_frac, random_state=rng)

    # Everything that was not sampled for training becomes the test set.
    x_test = df.drop(x_train.index)

    # Choose a small sample from the test data for prediction.
    x_predict = x_test.sample(frac=predict_frac, random_state=rng)

    # Extract the label from each features DataFrame (pop removes the column).
    y_train = x_train.pop(Y_NAME)
    y_test = x_test.pop(Y_NAME)
    y_predict = x_predict.pop(Y_NAME)

    return (x_train, y_train), (x_test, y_test), (x_predict, y_predict)

In [49]:
(x_train, y_train), (x_test, y_test), (x_predict, y_predict) = \
    get_training_test_prediction_data(df)

In [50]:
x_train.head()

Unnamed: 0,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,horsepower,peak-rpm,city-mpg,highway-mpg
173,toyota,gas,std,four,sedan,fwd,2326.0,ohc,four,122.0,mpfi,92.0,4200.0,29.0,34.0
176,toyota,gas,std,four,sedan,fwd,2414.0,ohc,four,122.0,mpfi,92.0,4200.0,27.0,32.0
28,dodge,gas,std,four,wagon,fwd,2535.0,ohc,four,122.0,2bbl,88.0,5000.0,24.0,30.0
18,chevrolet,gas,std,two,hatchback,fwd,1488.0,l,three,61.0,2bbl,48.0,5100.0,47.0,53.0
108,peugot,diesel,turbo,four,sedan,rwd,3197.0,l,four,152.0,idi,95.0,4150.0,28.0,33.0


In [51]:
y_train.head()

173     8948.0
176    10898.0
28      8921.0
18      5151.0
108    13200.0
Name: price, dtype: float64

In [52]:
# Divisor applied to the price labels; the RMS error and the predictions are
# multiplied by the same factor later to get back to dollars.
PRICE_SCALING_FACTOR = 10000

# The division is done in place, so re-running this cell scales the labels a
# second time. y_predict is not scaled here.
y_train /= PRICE_SCALING_FACTOR
y_test /= PRICE_SCALING_FACTOR

In [53]:
y_train.head()

173    0.8948
176    1.0898
28     0.8921
18     0.5151
108    1.3200
Name: price, dtype: float64

In [54]:
df['make'].unique()

array(['audi', 'bmw', 'chevrolet', 'dodge', 'honda', 'jaguar', 'mazda',
       'mercedes-benz', 'mitsubishi', 'nissan', 'peugot', 'plymouth',
       'porsche', 'saab', 'subaru', 'toyota', 'volkswagen', 'volvo'],
      dtype=object)

In [55]:
df['fuel-type'].unique()

array(['gas', 'diesel'], dtype=object)

In [56]:
df['aspiration'].unique()

array(['std', 'turbo'], dtype=object)

In [57]:
df['num-of-doors'].unique()

array(['four', 'two'], dtype=object)

In [58]:
df['body-style'].unique()

array(['sedan', 'hatchback', 'wagon', 'hardtop', 'convertible'],
      dtype=object)

In [59]:
df['drive-wheels'].unique()

array(['fwd', '4wd', 'rwd'], dtype=object)

In [60]:
df['engine-type'].unique()

array(['ohc', 'l', 'dohc', 'ohcv', 'ohcf'], dtype=object)

In [61]:
df['num-of-cylinders'].unique()

array(['four', 'five', 'six', 'three', 'eight'], dtype=object)

In [62]:
df['fuel-system'].unique()

array(['mpfi', '2bbl', 'mfi', '1bbl', 'idi', 'spdi'], dtype=object)

In [63]:
# Continuous inputs, one numeric feature column per DataFrame column.
(curb_weight, engine_size, horsepower,
 peak_rpm, city_mpg, highway_mpg) = [
    tf.feature_column.numeric_column(column_key)
    for column_key in ("curb-weight", "engine-size", "horsepower",
                       "peak-rpm", "city-mpg", "highway-mpg")
]

In [64]:
def _vocabulary_column(key):
    """Build a categorical column whose vocabulary is the distinct values of df[key]."""
    return tf.feature_column.categorical_column_with_vocabulary_list(
        key=key, vocabulary_list=df[key].unique())

body_style = _vocabulary_column("body-style")
fuel_type = _vocabulary_column("fuel-type")
aspiration = _vocabulary_column("aspiration")
num_of_doors = _vocabulary_column("num-of-doors")
drive_wheels = _vocabulary_column("drive-wheels")
engine_type = _vocabulary_column("engine-type")
num_of_cylinders = _vocabulary_column("num-of-cylinders")
fuel_system = _vocabulary_column("fuel-system")

In [65]:
# "make" is hashed into 50 buckets instead of being given an explicit
# vocabulary list like the other categorical columns.
make = tf.feature_column.categorical_column_with_hash_bucket(
      key="make", hash_bucket_size=50)

In [66]:
# Model inputs, in order: the numeric columns, body style as an indicator
# column, seven vocabulary columns embedded in 3 dimensions each, and the
# hashed "make" column embedded in 4 dimensions.
feature_columns = (
    [curb_weight, engine_size, horsepower, peak_rpm, city_mpg, highway_mpg]
    + [tf.feature_column.indicator_column(body_style)]
    + [tf.feature_column.embedding_column(categorical, dimension=3)
       for categorical in (fuel_type, aspiration, num_of_doors, drive_wheels,
                           engine_type, num_of_cylinders, fuel_system)]
    + [tf.feature_column.embedding_column(make, dimension=4)]
)

In [67]:
def input_fn(x_data, y_data, num_epochs, shuffle, batch_size=64):
    """Wrap pandas features and labels in an Estimator input function.

    Args:
        x_data: DataFrame of feature columns.
        y_data: Series of labels aligned with ``x_data``.
        num_epochs: Forwarded to pandas_input_fn as ``num_epochs``.
        shuffle: Forwarded to pandas_input_fn as ``shuffle``.
        batch_size: Forwarded to pandas_input_fn as ``batch_size``; defaults
            to 64, the value that was previously hard-coded.

    Returns:
        The callable produced by tf.estimator.inputs.pandas_input_fn.
    """
    return tf.estimator.inputs.pandas_input_fn(
          x=x_data,
          y=y_data,
          batch_size=batch_size,
          num_epochs=num_epochs,
          shuffle=shuffle)

In [68]:
# DNN regressor with hidden layers of 24, 16 and 24 units. No model_dir is
# passed; the captured log shows checkpoints being written under /tmp.
model = tf.estimator.DNNRegressor(
      hidden_units=[24, 16, 24], feature_columns=feature_columns)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_task_type': 'worker', '_train_distribute': None, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fdafe9aa490>, '_evaluation_master': '', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_num_ps_replicas': 0, '_tf_random_seed': None, '_master': '', '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_model_dir': '/tmp/tmpefBpc7', '_global_id_in_cluster': 0, '_save_summary_steps': 100}


In [69]:
model.train(input_fn=input_fn(x_train, y_train, num_epochs=None, shuffle=True), steps=20000)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmpefBpc7/model.ckpt.
INFO:tensorflow:loss = 7023431.5, step = 1
INFO:tensorflow:global_step/sec: 89.3645
INFO:tensorflow:loss = 162.15143, step = 101 (1.126 sec)
INFO:tensorflow:global_step/sec: 148.114
INFO:tensorflow:loss = 195.44962, step = 201 (0.677 sec)
INFO:tensorflow:global_step/sec: 129.519
INFO:tensorflow:loss = 174.20764, step = 301 (0.772 sec)
INFO:tensorflow:global_step/sec: 140.667
INFO:tensorflow:loss = 136.19968, step = 401 (0.711 sec)
INFO:tensorflow:global_step/sec: 149.467
INFO:tensorflow:loss = 167.02893, step = 501 (0.666 sec)
INFO:tensorflow:global_step/sec: 157.818
INFO:tensorflow:loss = 180.99887, step = 601 (0.636 sec)
INFO:tensorflow:global_step/sec: 159.531
INFO:tensorflow:lo

INFO:tensorflow:global_step/sec: 169.223
INFO:tensorflow:loss = 17.091925, step = 8101 (0.592 sec)
INFO:tensorflow:global_step/sec: 164.126
INFO:tensorflow:loss = 22.557049, step = 8201 (0.610 sec)
INFO:tensorflow:global_step/sec: 158.255
INFO:tensorflow:loss = 14.825172, step = 8301 (0.629 sec)
INFO:tensorflow:global_step/sec: 159.836
INFO:tensorflow:loss = 18.476715, step = 8401 (0.627 sec)
INFO:tensorflow:global_step/sec: 154.646
INFO:tensorflow:loss = 17.499966, step = 8501 (0.645 sec)
INFO:tensorflow:global_step/sec: 159.02
INFO:tensorflow:loss = 13.51836, step = 8601 (0.631 sec)
INFO:tensorflow:global_step/sec: 167.485
INFO:tensorflow:loss = 12.827346, step = 8701 (0.598 sec)
INFO:tensorflow:global_step/sec: 156.609
INFO:tensorflow:loss = 21.148472, step = 8801 (0.638 sec)
INFO:tensorflow:global_step/sec: 163.607
INFO:tensorflow:loss = 16.270123, step = 8901 (0.609 sec)
INFO:tensorflow:global_step/sec: 164.691
INFO:tensorflow:loss = 21.091248, step = 9001 (0.607 sec)
INFO:tensorf

INFO:tensorflow:global_step/sec: 52.6824
INFO:tensorflow:loss = 7.8170853, step = 16401 (1.899 sec)
INFO:tensorflow:global_step/sec: 76.9951
INFO:tensorflow:loss = 4.772986, step = 16501 (1.297 sec)
INFO:tensorflow:global_step/sec: 66.9292
INFO:tensorflow:loss = 8.542789, step = 16601 (1.494 sec)
INFO:tensorflow:global_step/sec: 167.9
INFO:tensorflow:loss = 8.713685, step = 16701 (0.597 sec)
INFO:tensorflow:global_step/sec: 84.7173
INFO:tensorflow:loss = 10.334414, step = 16801 (1.180 sec)
INFO:tensorflow:global_step/sec: 67.7567
INFO:tensorflow:loss = 4.8626575, step = 16901 (1.475 sec)
INFO:tensorflow:global_step/sec: 77.5905
INFO:tensorflow:loss = 7.70811, step = 17001 (1.288 sec)
INFO:tensorflow:global_step/sec: 84.1007
INFO:tensorflow:loss = 7.6790276, step = 17101 (1.191 sec)
INFO:tensorflow:global_step/sec: 162.223
INFO:tensorflow:loss = 5.550972, step = 17201 (0.613 sec)
INFO:tensorflow:global_step/sec: 180.159
INFO:tensorflow:loss = 8.396481, step = 17301 (0.560 sec)
INFO:tens

<tensorflow.python.estimator.canned.dnn.DNNRegressor at 0x7fdafe9aa290>

In [70]:
results = model.evaluate(input_fn=input_fn(x_test, y_test, num_epochs=1, shuffle=False))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-01-24-14:22:18
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmpefBpc7/model.ckpt-20000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-01-24-14:22:19
INFO:tensorflow:Saving dict for global step 20000: average_loss = 0.092076205, global_step = 20000, loss = 2.9464386


In [71]:
# Print every evaluation metric, ordered by metric name.
for metric_name, metric_value in sorted(results.items()):
    print("%s: %s" % (metric_name, metric_value))


average_loss: 0.092076205
global_step: 20000
loss: 2.9464386


In [72]:
average_loss = results["average_loss"]

In [73]:
# average_loss is in scaled price units; take the root and scale back up.
rms_error_dollars = PRICE_SCALING_FACTOR * average_loss**0.5
print("\nRMS error for the test set: ${:.0f}".format(rms_error_dollars))


RMS error for the test set: $3034


In [74]:
len(x_predict), len(y_predict)

(6, 6)

In [75]:
predict_results = model.predict(input_fn=input_fn(x_predict, y_predict, num_epochs=1, shuffle=False))

In [76]:
predictions = list(itertools.islice(predict_results, len(x_predict)))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmpefBpc7/model.ckpt-20000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [77]:
predictions

[{'predictions': array([0.8276684], dtype=float32)},
 {'predictions': array([0.6876538], dtype=float32)},
 {'predictions': array([1.52543], dtype=float32)},
 {'predictions': array([0.81132627], dtype=float32)},
 {'predictions': array([0.82526517], dtype=float32)},
 {'predictions': array([1.6929028], dtype=float32)}]

In [78]:
predicted_prices = [obj['predictions'][0] * PRICE_SCALING_FACTOR for obj in predictions]

In [79]:
predicted_prices

[8276.684284210205,
 6876.537799835205,
 15254.299640655518,
 8113.26265335083,
 8252.651691436768,
 16929.028034210205]

In [80]:
compare_df = x_predict.copy()

In [81]:
compare_df

Unnamed: 0,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,horsepower,peak-rpm,city-mpg,highway-mpg
52,mazda,gas,std,two,hatchback,fwd,1905.0,ohc,four,91.0,2bbl,68.0,5000.0,31.0,38.0
164,toyota,gas,std,two,hatchback,rwd,2204.0,ohc,four,98.0,2bbl,70.0,4800.0,29.0,34.0
133,saab,gas,std,four,sedan,fwd,2695.0,ohc,four,121.0,mpfi,110.0,5250.0,21.0,28.0
149,subaru,gas,turbo,four,wagon,4wd,2650.0,ohcf,four,108.0,mpfi,111.0,4800.0,23.0,23.0
51,mazda,gas,std,two,hatchback,fwd,1900.0,ohc,four,91.0,2bbl,68.0,5000.0,31.0,38.0
195,volvo,gas,std,four,wagon,rwd,3034.0,ohc,four,141.0,mpfi,114.0,5400.0,23.0,28.0


In [82]:
compare_df['actual-price'] = y_predict
compare_df['predicted-price'] = predicted_prices

In [83]:
compare_df

Unnamed: 0,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,horsepower,peak-rpm,city-mpg,highway-mpg,actual-price,predicted-price
52,mazda,gas,std,two,hatchback,fwd,1905.0,ohc,four,91.0,2bbl,68.0,5000.0,31.0,38.0,6795.0,8276.684284
164,toyota,gas,std,two,hatchback,rwd,2204.0,ohc,four,98.0,2bbl,70.0,4800.0,29.0,34.0,8238.0,6876.5378
133,saab,gas,std,four,sedan,fwd,2695.0,ohc,four,121.0,mpfi,110.0,5250.0,21.0,28.0,12170.0,15254.299641
149,subaru,gas,turbo,four,wagon,4wd,2650.0,ohcf,four,108.0,mpfi,111.0,4800.0,23.0,23.0,11694.0,8113.262653
51,mazda,gas,std,two,hatchback,fwd,1900.0,ohc,four,91.0,2bbl,68.0,5000.0,31.0,38.0,6095.0,8252.651691
195,volvo,gas,std,four,wagon,rwd,3034.0,ohc,four,141.0,mpfi,114.0,5400.0,23.0,28.0,13415.0,16929.028034
