<a href="https://colab.research.google.com/github/janShi1105/science/blob/main/ML14_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
import pandas as pd

# Fetch the Auto MPG data file; get_file skips the download when the file is already cached.
dataset_path = tf.keras.utils.get_file(
    'auto-mpg.data',
    'http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data',
)
column_names = [
    'MPG', 'Cylinders', 'Displacement', 'Horsepower',
    'Weight', 'Acceleration', 'ModelYear', 'Origin',
]
# '?' is read as a missing value; a tab starts a "comment", so whatever follows
# it on a line is discarded instead of being parsed as a column.
df = pd.read_csv(
    dataset_path,
    names=column_names,
    na_values='?',
    comment='\t',
    sep=' ',
    skipinitialspace=True,
)


Downloading data from http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data


In [2]:
import sklearn
import sklearn.model_selection

# Drop rows that contain missing values and renumber the index from 0.
df = df.dropna()
df = df.reset_index(drop=True)

# A fixed random_state makes the 80/20 split -- and therefore every statistic
# and metric derived from it -- identical on each run of the notebook.
df_train, df_test = sklearn.model_selection.train_test_split(
    df, train_size=0.8, random_state=1)
train_stats = df_train.describe().transpose()

numeric_column_names = ['Cylinders', 'Displacement', 'Horsepower', 'Weight', 'Acceleration']

# Standardize both splits with the *training* mean/std so that no information
# from the test rows leaks into the preprocessing.
train_mean = train_stats.loc[numeric_column_names, 'mean']
train_std = train_stats.loc[numeric_column_names, 'std']

df_train_norm, df_test_norm = df_train.copy(), df_test.copy()
# Replace the columns wholesale instead of writing through .loc[:, col]:
# the z-scores are floats, and an in-place write into a column that was parsed
# as integer is deprecated by pandas as an incompatible-dtype assignment.
df_train_norm[numeric_column_names] = (df_train[numeric_column_names] - train_mean) / train_std
df_test_norm[numeric_column_names] = (df_test[numeric_column_names] - train_mean) / train_std

df_train_norm.tail()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,ModelYear,Origin
311,28.0,-0.855938,-0.413103,-0.372364,-0.347654,0.340372,80,1
226,18.5,0.313968,0.527789,-0.168516,0.65631,1.226404,77,1
350,33.7,-0.855938,-0.831278,-0.754579,-0.902383,-0.403895,81,3
230,16.0,1.483874,1.487688,1.131013,1.616417,-0.368454,77,1
6,14.0,1.483874,2.466596,2.940163,1.638938,-2.317724,70,1


In [3]:
# One numeric feature column for each name in numeric_column_names.
numeric_features = [
    tf.feature_column.numeric_column(key=name)
    for name in numeric_column_names
]

In [4]:
# ModelYear enters the model bucketized at the boundaries 73, 76 and 79
# rather than as a raw number.
feature_year = tf.feature_column.numeric_column(key='ModelYear')
bucketized_features = [
    tf.feature_column.bucketized_column(
        source_column=feature_year, boundaries=[73, 76, 79]),
]

In [5]:
# Origin is treated as a categorical feature whose vocabulary is the integer codes 1, 2 and 3.
feature_origin = tf.feature_column.categorical_column_with_vocabulary_list(key='Origin', vocabulary_list=[1,2,3])


In [6]:
# Inspect how the ModelYear numeric column is configured.
print(feature_year)

NumericColumn(key='ModelYear', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)


In [7]:
# Bare expression: the notebook displays the column definition as the cell output.
feature_origin

VocabularyListCategoricalColumn(key='Origin', vocabulary_list=(1, 2, 3), dtype=tf.int64, default_value=-1, num_oov_buckets=0)

In [8]:
# Wrap the categorical Origin column in an indicator column.
categorical_indicator_features = [
    tf.feature_column.indicator_column(feature_origin),
]

In [9]:
def train_input_fn(df_train, batch_size=8):
  """Build the training dataset from a DataFrame.

  The 'MPG' column is split off as the label and the remaining columns become
  a dict of features. The caller's frame is not modified (a copy is used).

  Args:
    df_train: DataFrame holding the feature columns plus 'MPG'.
    batch_size: Number of examples per batch.

  Returns:
    A tf.data.Dataset of (features dict, label) batches that is shuffled with
    a buffer of 1000 and repeated indefinitely.
  """
  features = df_train.copy()
  labels = features.pop('MPG')
  examples = tf.data.Dataset.from_tensor_slices((dict(features), labels))
  examples = examples.shuffle(1000)
  examples = examples.repeat()
  return examples.batch(batch_size)

In [10]:
# Smoke-test the input pipeline by pulling a single batch from it.
ds = train_input_fn(df_train_norm)
batch = next(iter(ds))
# batch is a (features, labels) pair; element 0 is the dict of feature tensors.
print('Keys: ', batch[0].keys())

Keys:  dict_keys(['Cylinders', 'Displacement', 'Horsepower', 'Weight', 'Acceleration', 'ModelYear', 'Origin'])


In [11]:
# ModelYear values of the batch drawn above.
print('Batch Model Years: ', batch[0]['ModelYear'])

Batch Model Years:  tf.Tensor([79 78 81 70 76 79 75 82], shape=(8,), dtype=int64)


In [12]:
def eval_input_fn(df_test, batch_size=8):
  """Build the evaluation dataset from a DataFrame.

  The 'MPG' column is split off as the label and the remaining columns become
  a dict of features. The caller's frame is not modified (a copy is used).

  Args:
    df_test: DataFrame holding the feature columns plus 'MPG'.
    batch_size: Number of examples per batch.

  Returns:
    A tf.data.Dataset of (features dict, label) batches, in the original row
    order and making a single pass over the data.
  """
  features = df_test.copy()
  labels = features.pop('MPG')
  examples = tf.data.Dataset.from_tensor_slices((dict(features), labels))
  return examples.batch(batch_size)

In [13]:
# Full feature set handed to the estimators: numeric + bucketized + indicator columns.
all_feature_columns = (numeric_features + bucketized_features + categorical_indicator_features)

In [14]:
# DNN regressor with hidden layers of 32 and 10 units; model_dir is where the estimator keeps its checkpoints.
regressor = tf.estimator.DNNRegressor(feature_columns=all_feature_columns, hidden_units = [32,10], model_dir='models/autompg-dnnregressor/')

In [15]:
import numpy as np
EPOCHS = 1000
BATCH_SIZE = 8
# Number of BATCH_SIZE-sized batches needed to cover the training rows once, rounded up.
steps_per_epoch = int(np.ceil(len(df_train) / BATCH_SIZE))
total_steps = EPOCHS * steps_per_epoch
print('Training Steps: ', total_steps)

Training Steps:  40000


In [16]:
# Train for total_steps batches. The estimator takes a callable as input_fn,
# so the dataset builder is wrapped in a lambda that binds its arguments.
regressor.train(input_fn=lambda: train_input_fn(df_train_norm, batch_size=BATCH_SIZE), steps=total_steps)

Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


<tensorflow_estimator.python.estimator.canned.dnn.DNNRegressorV2 at 0x7f36e5daa8d0>

In [20]:
# Second estimator with the same feature columns and hidden layout as `regressor`.
# The directory 'models/autompg-dnnregressor/' is used both as the warm-start
# source and as this estimator's own model_dir.
reloaded_regressor = tf.estimator.DNNRegressor(
    feature_columns=all_feature_columns, hidden_units=[32,10], warm_start_from='models/autompg-dnnregressor/', model_dir='models/autompg-dnnregressor/'
)

In [23]:
# Evaluate the reloaded estimator on the normalized test split and report its average loss.
eval_results = reloaded_regressor.evaluate(input_fn=lambda:eval_input_fn(df_test_norm, batch_size=8))
print('Average-Loss: {:.4f}'.format(eval_results['average_loss']))

Average-Loss: 11.2417


In [24]:
# Predict on the test split with `regressor` (not `reloaded_regressor`; both
# are configured with the same model_dir).
pred_res = regressor.predict(input_fn=lambda: eval_input_fn(df_test_norm, batch_size=8))
# Only the first prediction is pulled from the result and printed.
print(next(iter(pred_res)))

{'predictions': array([14.7343025], dtype=float32)}


In [26]:
# Boosted-trees regressor on the same feature columns, evaluated on the same test split as the DNN.
boosted_tree = tf.estimator.BoostedTreesRegressor(feature_columns = all_feature_columns, n_batches_per_layer=20, n_trees=200)
# NOTE(review): no steps/max_steps is passed although train_input_fn repeats
# indefinitely; training presumably stops once n_trees trees are built -- confirm.
boosted_tree.train(input_fn = lambda: train_input_fn(df_train_norm, batch_size=BATCH_SIZE))
# eval_results is reassigned here, replacing the value from the DNN evaluation.
eval_results = boosted_tree.evaluate(
    input_fn=lambda: eval_input_fn(df_test_norm, batch_size=8))
print('Average-Loss: {:.4f}'.format(eval_results['average_loss']))

Instructions for updating:
Boosted Trees kernels in TF are deprecated. Please use TensorFlow Decision Forests instead (https://github.com/tensorflow/decision-forests)
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
'_Resource' object has no attribute 'name'
'_Resource' object has no attribute 'name'
'_Resource' object has no attribute 'name'
'_Resource' object has no attribute 'name'
'_Resource' object has no attribute 'name'


Average-Loss: 7.1586
