In [56]:
import collections
import pandas as pd
import tensorflow as tf
import numpy as np
# Location of the raw "imports-85" data file; its last path component is
# reused as the local file name by raw_dataframe().
URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"

# Column name -> Python type used as the pandas dtype when parsing the file.
# Insertion order matters: the keys are handed to pandas as the column names,
# so they must stay in the same order as the columns in the file.
COLUMN_TYPES = collections.OrderedDict()
COLUMN_TYPES["symboling"] = int
COLUMN_TYPES["normalized-losses"] = float
COLUMN_TYPES["make"] = str
COLUMN_TYPES["fuel-type"] = str
COLUMN_TYPES["aspiration"] = str
COLUMN_TYPES["num-of-doors"] = str
COLUMN_TYPES["body-style"] = str
COLUMN_TYPES["drive-wheels"] = str
COLUMN_TYPES["engine-location"] = str
COLUMN_TYPES["wheel-base"] = float
COLUMN_TYPES["length"] = float
COLUMN_TYPES["width"] = float
COLUMN_TYPES["height"] = float
COLUMN_TYPES["curb-weight"] = float
COLUMN_TYPES["engine-type"] = str
COLUMN_TYPES["num-of-cylinders"] = str
COLUMN_TYPES["engine-size"] = float
COLUMN_TYPES["fuel-system"] = str
COLUMN_TYPES["bore"] = float
COLUMN_TYPES["stroke"] = float
COLUMN_TYPES["compression-ratio"] = float
COLUMN_TYPES["horsepower"] = float
COLUMN_TYPES["peak-rpm"] = float
COLUMN_TYPES["city-mpg"] = float
COLUMN_TYPES["highway-mpg"] = float
COLUMN_TYPES["price"] = float

def raw_dataframe():
    """Fetch the data file and parse it into a pandas DataFrame.

    The file is obtained through ``tf.keras.utils.get_file`` (download and
    cache) under the last path component of ``URL``. Columns are named and
    typed according to ``COLUMN_TYPES``, and "?" entries are read as missing
    values.

    Returns:
        pandas.DataFrame with one column per key of ``COLUMN_TYPES``.
    """
    file_name = URL.split("/")[-1]
    local_path = tf.keras.utils.get_file(file_name, URL)
    return pd.read_csv(
        local_path,
        names=COLUMN_TYPES.keys(),
        dtype=COLUMN_TYPES,
        na_values="?",
    )

def load_data(y_name='price', train_fraction=0.7, seed=None):
    """Load the data set and split it into training and test parts.

    Rows containing any missing value are dropped before the split.

    Args:
        y_name: name of the label column; it is removed from the feature
            frames and returned separately.
        train_fraction: fraction of the rows sampled into the training set.
        seed: value used to seed NumPy's global RNG and passed as
            ``random_state`` to the sampling.

    Returns:
        ``(x_train, y_train), (x_test, y_test)`` where the ``x_*`` items are
        feature DataFrames and the ``y_*`` items are the label Series.
    """
    complete_rows = raw_dataframe().dropna()

    np.random.seed(seed)
    train_features = complete_rows.sample(frac=train_fraction, random_state=seed)
    # Every row that was not sampled for training ends up in the test set.
    test_features = complete_rows.drop(train_features.index)

    train_labels = train_features.pop(y_name)
    test_labels = test_features.pop(y_name)
    return (train_features, train_labels), (test_features, test_labels)

def make_dataset(batch_sz, x, y=None, shuffle=None, shuffle_buffer_size = 1000):
    """Create an input function that builds a batched ``tf.data.Dataset``.

    Nothing is built until the returned function is called.

    Args:
        batch_sz: number of examples per batch.
        x: mapping-like features (anything accepted by ``dict(x)``).
        y: optional labels; when given, the dataset yields
            ``(features, label)`` pairs, otherwise features only.
        shuffle: when truthy, the dataset is shuffled, batched and repeated
            indefinitely; otherwise it is only batched.
        shuffle_buffer_size: buffer size handed to ``Dataset.shuffle``.

    Returns:
        A zero-argument callable returning the dataset.
    """
    def input_fn():
        features = dict(x)
        slices = features if y is None else (features, y)
        dataset = tf.data.Dataset.from_tensor_slices(slices)
        if not shuffle:
            return dataset.batch(batch_sz)
        shuffled = dataset.shuffle(shuffle_buffer_size)
        return shuffled.batch(batch_sz).repeat()
    return input_fn

In [64]:
#linear regression 
def linear_regression(price_norm_factor=1000., batch_size=100, train_steps=1000):
    """Train, evaluate and demo a linear price regressor on two numeric features.

    The model uses only "curb-weight" and "highway-mpg". Labels are divided by
    ``price_norm_factor`` before training, and every reported figure is
    multiplied back by it before printing.

    Args:
        price_norm_factor: divisor applied to the price labels.
        batch_size: batch size for the training and evaluation input functions.
        train_steps: number of training steps.

    Prints the training column index, the test-set RMS error (square root of
    the evaluation's ``average_loss``, rescaled) and one line per predicted
    example. Returns nothing.
    """
    (train_x, train_y), (test_x, test_y) = load_data()
    # Scale the labels down for training; results are scaled back up below.
    train_y /= price_norm_factor
    test_y /= price_norm_factor
    print(train_x.columns)
    train_input_fn = make_dataset(batch_size, train_x, train_y, True)
    test_input_fn = make_dataset(batch_size, test_x, test_y)

    feature_columns = [
        tf.feature_column.numeric_column(key="curb-weight"),
        tf.feature_column.numeric_column(key="highway-mpg")
    ]
    # Build the estimator, then train and evaluate it.
    model = tf.estimator.LinearRegressor(feature_columns=feature_columns)
    model.train(input_fn=train_input_fn, steps=train_steps)
    eval_result = model.evaluate(input_fn=test_input_fn)
    average_loss = eval_result["average_loss"]
    print("\n" + 80 * "*")
    print("\nRMS error for the test set: ${:.0f}"
          .format(price_norm_factor * average_loss**0.5))

    # Prediction mode: two hand-made examples, fed one per batch.
    input_dict = {
        "curb-weight": np.array([2000,3000]),
        "highway-mpg": np.array([30,40])
    }
    predict_input_fn = make_dataset(1, input_dict)
    predict_results = model.predict(input_fn=predict_input_fn)
    print("\nPrediction results:")
    # The template does not change between iterations, so build it once.
    template = ("Curb weight: {: 4d}lbs, "
                "Highway: {: 0d}mpg, "
                "Prediction: ${: 9.2f}")
    for i, prediction in enumerate(predict_results):
        # Print inside the loop so every prediction is reported, not just the
        # last one (and so an empty result stream cannot hit an unbound name).
        msg = template.format(input_dict["curb-weight"][i],
                              input_dict["highway-mpg"][i],
                              price_norm_factor * prediction["predictions"][0])
        print("    " + msg)
    print()
    
#linear_regression()

In [71]:
def linear_regression_with_categorical_features(price_norm_factor=1000., batch_size=100, train_steps=1000):
    """Train and evaluate a linear price regressor with categorical features.

    Besides the numeric "curb-weight" and "highway-mpg" columns, the model
    uses "body-style" (fixed vocabulary list) and "make" (hashed into 50
    buckets). Labels are divided by ``price_norm_factor`` before training.

    Args:
        price_norm_factor: divisor applied to the price labels; the reported
            error is multiplied back by it.
        batch_size: batch size for the training and evaluation input functions.
        train_steps: number of training steps.

    Prints the test-set RMS error (square root of the evaluation's
    ``average_loss``, rescaled). Returns nothing.
    """
    (features_train, prices_train), (features_test, prices_test) = load_data()
    prices_train /= price_norm_factor
    prices_test /= price_norm_factor

    fit_input_fn = make_dataset(batch_size, features_train, prices_train, True)
    eval_input_fn = make_dataset(batch_size, features_test, prices_test)

    # Categorical inputs: a known vocabulary for body style, hashing for make.
    body_style_column = tf.feature_column.categorical_column_with_vocabulary_list(
        key="body-style",
        vocabulary_list=["hardtop", "wagon", "sedan", "hatchback", "convertible"])
    make_column = tf.feature_column.categorical_column_with_hash_bucket(
        key='make', hash_bucket_size=50)

    numeric_columns = [
        tf.feature_column.numeric_column(key=column_name)
        for column_name in ("curb-weight", "highway-mpg")
    ]
    feature_columns = numeric_columns + [body_style_column, make_column]

    regressor = tf.estimator.LinearRegressor(feature_columns=feature_columns)
    regressor.train(input_fn=fit_input_fn, steps=train_steps)
    metrics = regressor.evaluate(input_fn=eval_input_fn)

    rms_error = price_norm_factor * metrics["average_loss"]**0.5
    print("\n" + "*" * 80)
    print("\nRMS error for the test set: ${:.0f}".format(rms_error))

    
# Run the categorical-feature experiment with its default arguments; it
# prints the test-set RMS error.
linear_regression_with_categorical_features()

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_eval_distribute': None, '_task_id': 0, '_tf_random_seed': None, '_experimental_distribute': None, '_log_step_count_steps': 100, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_device_fn': None, '_keep_checkpoint_max': 5, '_global_id_in_cluster': 0, '_train_distribute': None, '_service': None, '_evaluation_master': '', '_master': '', '_keep_checkpoint_every_n_hours': 10000, '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_save_checkpoints_secs': 600, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f93c261ae80>, '_model_dir': '/tmp/tmpsi0vejdb', '_task_type': 'worker', '_save_checkpoints_steps': None, '_save_summary_steps': 100, '_protocol': None}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph wa