In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [2]:
import os
import collections
import itertools

In [3]:
import numpy as np
import pandas as pd
import tensorflow as tf

  from ._conv import register_converters as _register_converters


In [4]:
from six.moves import urllib

In [5]:
print(np.__version__)
print(pd.__version__)
print(tf.__version__)

1.14.3
0.23.0
1.9.0


In [6]:
URL_PATH = 'https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data'
DOWNLOAD_FILENAME = 'automobiles.csv'


def download_data(url=URL_PATH, filename=DOWNLOAD_FILENAME):
    """Download the dataset to a local file unless that file already exists.

    Args:
        url: Location to fetch from. Defaults to URL_PATH.
        filename: Local path to write to. Defaults to DOWNLOAD_FILENAME.

    Returns:
        None. Two status lines naming the URL and the local file are printed
        on every call, whether or not a download was actually performed.
    """
    if not os.path.exists(filename):
        # Fetch into a side file first and rename only on success, so that an
        # interrupted transfer never leaves a truncated file that later calls
        # would mistake for a finished download.
        partial = filename + '.part'
        try:
            urllib.request.urlretrieve(url, partial)
            os.rename(partial, filename)
        finally:
            # Only present here if the transfer or the rename failed.
            if os.path.exists(partial):
                os.remove(partial)
    print("File Downloaded from ", url)
    print("File names on local ", filename)

In [7]:
download_data()

File Downloaded from  https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data
File names on local  automobiles.csv


In [8]:
# Column name -> dtype for the raw file, listed in the order the columns are
# handed to the CSV reader. An ordered mapping is used so that iteration
# order is guaranteed to match this listing.
COLUMN_NAMES_TYPES = collections.OrderedDict(
    [
        # Risk rating and make.
        ("symboling", int),
        ("normalized-losses", float),
        ("make", str),
        # Categorical body / drivetrain descriptors.
        ("fuel-type", str),
        ("aspiration", str),
        ("num-of-doors", str),
        ("body-style", str),
        ("drive-wheels", str),
        ("engine-location", str),
        # Dimensions and weight.
        ("wheel-base", float),
        ("length", float),
        ("width", float),
        ("height", float),
        ("curb-weight", float),
        # Engine description.
        ("engine-type", str),
        ("num-of-cylinders", str),
        ("engine-size", float),
        ("fuel-system", str),
        ("bore", float),
        ("stroke", float),
        ("compression-ratio", float),
        ("horsepower", float),
        ("peak-rpm", float),
        # Fuel economy and the price column.
        ("city-mpg", float),
        ("highway-mpg", float),
        ("price", float),
    ]
)

In [9]:
# Column names are supplied explicitly from COLUMN_NAMES_TYPES. They are
# passed as a concrete list rather than a dict keys view, because some pandas
# releases reject a keys view as "not an ordered collection".
# '?' entries in the file are read as missing values (NaN).
df = pd.read_csv(
    DOWNLOAD_FILENAME,
    names=list(COLUMN_NAMES_TYPES),
    dtype=COLUMN_NAMES_TYPES,
    na_values='?',
)

In [10]:
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130.0,mpfi,3.47,2.68,9.0,111.0,5000.0,21.0,27.0,13495.0
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130.0,mpfi,3.47,2.68,9.0,111.0,5000.0,21.0,27.0,16500.0
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152.0,mpfi,2.68,3.47,9.0,154.0,5000.0,19.0,26.0,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109.0,mpfi,3.19,3.4,10.0,102.0,5500.0,24.0,30.0,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136.0,mpfi,3.19,3.4,8.0,115.0,5500.0,18.0,22.0,17450.0


In [11]:
df.shape

(205, 26)

In [12]:
df.count()

symboling            205
normalized-losses    164
make                 205
fuel-type            205
aspiration           205
num-of-doors         203
body-style           205
drive-wheels         205
engine-location      205
wheel-base           205
length               205
width                205
height               205
curb-weight          205
engine-type          205
num-of-cylinders     205
engine-size          205
fuel-system          205
bore                 201
stroke               201
compression-ratio    205
horsepower           203
peak-rpm             203
city-mpg             205
highway-mpg          205
price                201
dtype: int64

In [13]:
# Keep only fully populated rows: a row with a missing value in any column
# is discarded. Note this rebinds `df`, so the unfiltered frame is no longer
# available after this cell runs.
df = df.dropna()

In [14]:
df.count()

symboling            159
normalized-losses    159
make                 159
fuel-type            159
aspiration           159
num-of-doors         159
body-style           159
drive-wheels         159
engine-location      159
wheel-base           159
length               159
width                159
height               159
curb-weight          159
engine-type          159
num-of-cylinders     159
engine-size          159
fuel-system          159
bore                 159
stroke               159
compression-ratio    159
horsepower           159
peak-rpm             159
city-mpg             159
highway-mpg          159
price                159
dtype: int64

In [15]:
# Columns kept for modelling; the last entry, 'price', is the value being
# predicted and is split off from the inputs later on.
features_of_importance = [
    'make',
    'fuel-type',
    'aspiration',
    'num-of-doors',
    'body-style',
    'drive-wheels',
    'curb-weight',
    'engine-location',
    'engine-type',
    'num-of-cylinders',
    'fuel-system',
    'engine-size',
    'horsepower',
    'peak-rpm',
    'city-mpg',
    'highway-mpg',
    'price',
]

In [16]:
df = df[features_of_importance]

In [17]:
df.head()

Unnamed: 0,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,curb-weight,engine-location,engine-type,num-of-cylinders,fuel-system,engine-size,horsepower,peak-rpm,city-mpg,highway-mpg,price
3,audi,gas,std,four,sedan,fwd,2337.0,front,ohc,four,mpfi,109.0,102.0,5500.0,24.0,30.0,13950.0
4,audi,gas,std,four,sedan,4wd,2824.0,front,ohc,five,mpfi,136.0,115.0,5500.0,18.0,22.0,17450.0
6,audi,gas,std,four,sedan,fwd,2844.0,front,ohc,five,mpfi,136.0,110.0,5500.0,19.0,25.0,17710.0
8,audi,gas,turbo,four,sedan,fwd,3086.0,front,ohc,five,mpfi,131.0,140.0,5500.0,17.0,20.0,23875.0
10,bmw,gas,std,two,sedan,rwd,2395.0,front,ohc,four,mpfi,108.0,101.0,5800.0,23.0,29.0,16430.0


In [18]:
TARGET_LABEL = 'price'


def prepare_train_test_predicted_data(data=None, train_frac=0.8, predict_frac=0.2, seed=None):
    """Split a frame into train, test and prediction sets with separate targets.

    The training rows are a random sample of ``data``; the test rows are all
    remaining rows; the prediction rows are a random sample drawn from the
    test rows (so they are a subset of the test set, not a third disjoint
    partition). The TARGET_LABEL column is removed from each feature frame
    and returned alongside it. ``data`` itself is not modified.

    Args:
        data: DataFrame containing the TARGET_LABEL column. Defaults to the
            notebook-level ``df``.
        train_frac: Fraction of ``data`` sampled into the training set.
        predict_frac: Fraction of the test set sampled into the prediction set.
        seed: Passed as ``random_state`` to both sampling calls. ``None``
            (the default) keeps the split random; pass an int to make the
            split reproducible across runs.

    Returns:
        ``(X_train, Y_train), (X_test, Y_test), (X_predict, Y_predict)``.
    """
    if data is None:
        data = df

    # The global NumPy RNG is deliberately NOT reseeded here: reseeding it
    # from entropy would undo any seeding the caller did beforehand and make
    # the split impossible to reproduce.
    X_train = data.sample(frac=train_frac, random_state=seed)

    # Dropping by index label relies on the index being unique.
    X_test = data.drop(X_train.index)

    X_predict = X_test.sample(frac=predict_frac, random_state=seed)

    # pop() removes the target column from each feature frame in place; all
    # three frames are fresh objects, so the source frame keeps its column.
    Y_train = X_train.pop(TARGET_LABEL)
    Y_test = X_test.pop(TARGET_LABEL)
    Y_predict = X_predict.pop(TARGET_LABEL)

    return (X_train, Y_train), (X_test, Y_test), (X_predict, Y_predict)

In [19]:
(X_train,Y_train) ,(X_test,Y_test) ,(X_predict,Y_predict) = prepare_train_test_predicted_data()

In [20]:
X_train.head()

Unnamed: 0,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,curb-weight,engine-location,engine-type,num-of-cylinders,fuel-system,engine-size,horsepower,peak-rpm,city-mpg,highway-mpg
103,nissan,gas,std,four,sedan,fwd,3060.0,front,ohcv,six,mpfi,181.0,152.0,5200.0,19.0,25.0
21,dodge,gas,std,two,hatchback,fwd,1876.0,front,ohc,four,2bbl,90.0,68.0,5500.0,37.0,41.0
77,mitsubishi,gas,std,two,hatchback,fwd,1944.0,front,ohc,four,2bbl,92.0,68.0,5500.0,31.0,38.0
70,mercedes-benz,diesel,turbo,four,sedan,rwd,3770.0,front,ohc,five,idi,183.0,123.0,4350.0,22.0,25.0
196,volvo,gas,std,four,sedan,rwd,2935.0,front,ohc,four,mpfi,141.0,114.0,5400.0,24.0,28.0


In [21]:
Y_train.head()

103    13499.0
21      5572.0
77      6189.0
70     31600.0
196    15985.0
Name: price, dtype: float64

In [22]:
PRICE_SCALING_FACTOR = 10000

# Scale the training and test targets down by PRICE_SCALING_FACTOR.
# The values are recomputed from the untouched price column of `df` (looked
# up by the row labels of each split) instead of dividing Y_train / Y_test in
# place, so re-running this cell gives the same result rather than dividing
# the targets a second time.
# Y_predict is intentionally left in original price units.
Y_train = df.loc[X_train.index, TARGET_LABEL] / PRICE_SCALING_FACTOR
Y_test = df.loc[X_test.index, TARGET_LABEL] / PRICE_SCALING_FACTOR

In [23]:
df.make.unique()

array(['audi', 'bmw', 'chevrolet', 'dodge', 'honda', 'jaguar', 'mazda',
       'mercedes-benz', 'mitsubishi', 'nissan', 'peugot', 'plymouth',
       'porsche', 'saab', 'subaru', 'toyota', 'volkswagen', 'volvo'],
      dtype=object)

In [24]:
df['fuel-type'].unique()

array(['gas', 'diesel'], dtype=object)

In [25]:
df.aspiration.unique()

array(['std', 'turbo'], dtype=object)

In [26]:
df['num-of-doors'].unique()

array(['four', 'two'], dtype=object)

In [27]:
df['body-style'].unique()

array(['sedan', 'hatchback', 'wagon', 'hardtop', 'convertible'],
      dtype=object)

In [28]:
df['drive-wheels'].unique()

array(['fwd', '4wd', 'rwd'], dtype=object)

In [29]:
df['engine-type'].unique()

array(['ohc', 'l', 'dohc', 'ohcv', 'ohcf'], dtype=object)

In [30]:
df['num-of-cylinders'].unique()

array(['four', 'five', 'six', 'three', 'eight'], dtype=object)

In [31]:
df['fuel-system'].unique()

array(['mpfi', '2bbl', 'mfi', '1bbl', 'idi', 'spdi'], dtype=object)

In [32]:
curb_weight  = tf.feature_column.numeric_column("curb-weight")

In [33]:
engine_size  = tf.feature_column.numeric_column("engine-size")

In [34]:
horsepower = tf.feature_column.numeric_column("horsepower")

In [35]:
peak_rpm = tf.feature_column.numeric_column("peak-rpm")

In [36]:
city_mpg = tf.feature_column.numeric_column("city-mpg")

In [37]:
highway_mpg = tf.feature_column.numeric_column("highway-mpg")

In [38]:
body_style = tf.feature_column.categorical_column_with_vocabulary_list(key="body-style",vocabulary_list=df['body-style'].unique())

In [39]:
fuel_type = tf.feature_column.categorical_column_with_vocabulary_list(key="fuel-type",vocabulary_list=df['fuel-type'].unique())

In [40]:
aspiration = tf.feature_column.categorical_column_with_vocabulary_list(key="aspiration",vocabulary_list=df['aspiration'].unique())

In [41]:
num_of_doors = tf.feature_column.categorical_column_with_vocabulary_list(key="num-of-doors",vocabulary_list=df['num-of-doors'].unique())

In [42]:
drive_wheels = tf.feature_column.categorical_column_with_vocabulary_list(key="drive-wheels",vocabulary_list=df['drive-wheels'].unique())

In [43]:
engine_type = tf.feature_column.categorical_column_with_vocabulary_list(key="engine-type",vocabulary_list=df['engine-type'].unique())

In [44]:
num_of_cylinders = tf.feature_column.categorical_column_with_vocabulary_list(key="num-of-cylinders",vocabulary_list=df['num-of-cylinders'].unique())

In [45]:
fuel_system = tf.feature_column.categorical_column_with_vocabulary_list(key="fuel-system",vocabulary_list=df['fuel-system'].unique())

In [46]:
make = tf.feature_column.categorical_column_with_hash_bucket(key='make',hash_bucket_size=50)

In [47]:
# Model inputs, in order: the six numeric columns, body style as an indicator
# column, seven categorical columns each as a 3-dimensional embedding, and
# finally the hashed `make` column as a 4-dimensional embedding.
feature_columns = (
    [curb_weight, engine_size, horsepower, peak_rpm, city_mpg, highway_mpg]
    + [tf.feature_column.indicator_column(body_style)]
    + [
        tf.feature_column.embedding_column(categorical, dimension=3)
        for categorical in (
            fuel_type,
            aspiration,
            num_of_cylinders,
            num_of_doors,
            drive_wheels,
            engine_type,
            fuel_system,
        )
    ]
    + [tf.feature_column.embedding_column(make, dimension=4)]
)

In [48]:
def input_dn(X_data, Y_data, num_epochs, shuffle, batch_size=64):
    """Build an estimator input function from pandas data.

    Args:
        X_data: Feature data, forwarded as ``x``.
        Y_data: Target data, forwarded as ``y``.
        num_epochs: Forwarded as ``num_epochs``; callers in this notebook
            pass ``None`` for training and ``1`` for evaluation/prediction.
        shuffle: Forwarded as ``shuffle``.
        batch_size: Forwarded as ``batch_size``. Defaults to 64, the value
            previously fixed inside this function.

    Returns:
        Whatever ``tf.estimator.inputs.pandas_input_fn`` returns for these
        arguments.
    """
    return tf.estimator.inputs.pandas_input_fn(
        x=X_data,
        y=Y_data,
        batch_size=batch_size,
        num_epochs=num_epochs,
        shuffle=shuffle,
    )

In [49]:
model = tf.estimator.DNNRegressor(hidden_units=[20,20,20],feature_columns=feature_columns)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/var/folders/nr/yyxxzlcj1gqfwgbp7mn9l610q3vvqq/T/tmp2ziygdh9', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0xb2fb46ba8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [69]:
model.train(input_fn=input_dn(X_train,Y_train,num_epochs=None,shuffle=True),steps=20000)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /var/folders/nr/yyxxzlcj1gqfwgbp7mn9l610q3vvqq/T/tmp2ziygdh9/model.ckpt-30000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 30000 into /var/folders/nr/yyxxzlcj1gqfwgbp7mn9l610q3vvqq/T/tmp2ziygdh9/model.ckpt.
INFO:tensorflow:loss = 2.0986538, step = 30001
INFO:tensorflow:global_step/sec: 69.2084
INFO:tensorflow:loss = 3.2428455, step = 30101 (1.446 sec)
INFO:tensorflow:global_step/sec: 146.716
INFO:tensorflow:loss = 7.1147375, step = 30201 (0.682 sec)
INFO:tensorflow:global_step/sec: 146.092
INFO:tensorflow:loss = 1.801501, step = 30301 (0.685 sec)
INFO:tensorflow:global_step/sec: 138.548
INFO:tensorflow:loss = 3.151345, step = 30401 (0.722 sec)
INFO:tensorflow:global_step/sec: 148.66
INFO:tensorflow:loss = 1.4418291, s

<tensorflow.python.estimator.canned.dnn.DNNRegressor at 0xb2fb46198>

In [83]:
# Evaluation makes a single pass over the test set. Row order does not affect
# the aggregated metrics, so the data is not shuffled; this keeps the
# evaluation input deterministic.
results = model.evaluate(input_fn=input_dn(X_test, Y_test, num_epochs=1, shuffle=False))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-08-12-16:22:11
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /var/folders/nr/yyxxzlcj1gqfwgbp7mn9l610q3vvqq/T/tmp2ziygdh9/model.ckpt-50000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-08-12-16:22:12
INFO:tensorflow:Saving dict for global step 50000: average_loss = 0.047263432, global_step = 50000, loss = 1.5124298
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 50000: /var/folders/nr/yyxxzlcj1gqfwgbp7mn9l610q3vvqq/T/tmp2ziygdh9/model.ckpt-50000


In [84]:
# Print each evaluation metric on its own line, ordered by metric name.
for metric_name, metric_value in sorted(results.items()):
    print("%s: %s" % (metric_name, metric_value))

average_loss: 0.047263432
global_step: 50000
loss: 1.5124298


In [85]:
avg_loss = results['average_loss']

In [86]:
# The model was trained on prices divided by PRICE_SCALING_FACTOR, so the
# square root of the loss is multiplied by that factor to express the error
# in original price units.
# NOTE(review): presumes `average_loss` is a mean squared error — confirm
# against the estimator's documentation.
print("RMS error : ${:.0f}".format(PRICE_SCALING_FACTOR*avg_loss**0.5))

RMS error : $2174


In [87]:
len(X_predict) , len(Y_predict)

(6, 6)

In [88]:
predicted_results = model.predict(input_fn=input_dn(X_predict,Y_predict,num_epochs=1,shuffle=False))

In [89]:
predictions = list(itertools.islice(predicted_results,len(X_predict)))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /var/folders/nr/yyxxzlcj1gqfwgbp7mn9l610q3vvqq/T/tmp2ziygdh9/model.ckpt-50000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [90]:
predictions

[{'predictions': array([0.98185146], dtype=float32)},
 {'predictions': array([0.80950344], dtype=float32)},
 {'predictions': array([1.7152117], dtype=float32)},
 {'predictions': array([0.621271], dtype=float32)},
 {'predictions': array([0.840227], dtype=float32)},
 {'predictions': array([1.1238955], dtype=float32)}]

In [91]:
predicted_prices = [obj['predictions'][0]*PRICE_SCALING_FACTOR for obj in predictions]

In [92]:
predicted_prices

[9818.514585494995,
 8095.03436088562,
 17152.117490768433,
 6212.71014213562,
 8402.270078659058,
 11238.95525932312]

In [93]:
compare_df = X_predict.copy()

In [94]:
compare_df['Actual'] = Y_predict
compare_df['predicted'] = predicted_prices

In [95]:
compare_df

Unnamed: 0,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,curb-weight,engine-location,engine-type,num-of-cylinders,fuel-system,engine-size,horsepower,peak-rpm,city-mpg,highway-mpg,Actual,predicted
143,subaru,gas,std,four,sedan,fwd,2340.0,front,ohcf,four,mpfi,108.0,94.0,5200.0,26.0,32.0,9960.0,9818.514585
119,plymouth,gas,turbo,two,hatchback,fwd,2128.0,front,ohc,four,spdi,98.0,102.0,5500.0,24.0,30.0,7957.0,8095.034361
180,toyota,gas,std,four,sedan,rwd,3131.0,front,dohc,six,mpfi,171.0,156.0,5200.0,20.0,24.0,15690.0,17152.117491
92,nissan,gas,std,four,sedan,fwd,1938.0,front,ohc,four,2bbl,97.0,69.0,5200.0,31.0,37.0,6849.0,6212.710142
86,mitsubishi,gas,std,four,sedan,fwd,2405.0,front,ohc,four,2bbl,122.0,88.0,5000.0,25.0,32.0,8189.0,8402.270079
60,mazda,gas,std,four,sedan,fwd,2410.0,front,ohc,four,2bbl,122.0,84.0,4800.0,26.0,32.0,8495.0,11238.955259
