In [4]:
import math
import tensorflow as tf
from tensorflow.python.data import Dataset
from sklearn import metrics
import numpy as np
import pandas as pd

In [5]:
# Pandas display settings: show at most 10 rows, one decimal place for floats.
pd.set_option("display.max_rows", 10)
pd.set_option("display.float_format", '{:.1f}'.format)

# Load the California housing training data.
california_housing_dataframe = pd.read_csv(
    "https://download.mlcc.google.com/mledu-datasets/california_housing_train.csv",
    sep=",",
)

# Randomly reorder the rows by reindexing on a permutation of the index.
# The permutation is unseeded, so the row order differs on every run.
shuffled_index = np.random.permutation(california_housing_dataframe.index)
california_housing_dataframe = california_housing_dataframe.reindex(shuffled_index)

# Divide the target column by 1000 so the values are smaller.
california_housing_dataframe["median_house_value"] = (
    california_housing_dataframe["median_house_value"] / 1000.0
)
california_housing_dataframe

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
3945,-118.0,33.9,10.0,2423.0,356.0,1213.0,347.0,6.6,346.9
10187,-119.9,36.8,12.0,1488.0,253.0,675.0,223.0,4.8,89.3
9402,-119.2,34.1,15.0,5778.0,1285.0,1722.0,829.0,4.3,305.8
802,-117.1,33.1,33.0,555.0,165.0,612.0,176.0,2.2,137.5
15521,-122.3,39.1,10.0,266.0,62.0,154.0,49.0,2.2,75.0
...,...,...,...,...,...,...,...,...,...
9018,-119.0,34.2,21.0,2953.0,419.0,1397.0,410.0,6.5,291.5
12789,-121.8,37.3,14.0,4412.0,924.0,2698.0,891.0,4.7,227.6
15793,-122.4,37.8,52.0,2524.0,559.0,1430.0,476.0,3.4,254.7
11107,-121.0,37.7,15.0,1232.0,180.0,408.0,196.0,7.0,182.4


In [6]:
"""Define Features, Configure Feature Columns and Target

Args:
      Define the input feature: total_rooms.
      Configure a numeric feature column for total_rooms.
      Configure a target or label

Return:
      feature_column: total_rooms
      target:         median_house_value
"""
# Label: the median_house_value column of the loaded frame.
targets = california_housing_dataframe["median_house_value"]

# Input feature: total_rooms, kept as a one-column DataFrame (double brackets).
my_feature = california_housing_dataframe[["total_rooms"]]

# Describe total_rooms to TensorFlow as a numeric feature column.
feature_columns = [tf.feature_column.numeric_column("total_rooms")]

In [7]:
# Build the optimizer: gradient descent with a learning rate of 0.0000001,
# wrapped so that gradients are clipped by norm at 5.0.
my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(
    tf.train.GradientDescentOptimizer(learning_rate=1e-7),
    5.0,
)

# Create the linear regression estimator from the feature columns and the
# clipped optimizer defined above.
linear_regressor = tf.estimator.LinearRegressor(
    optimizer=my_optimizer,
    feature_columns=feature_columns,
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmp42fj8z2m', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f3477691ba8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [8]:
# Define the input function
def input_fn(features, targets, batch_size=1, shuffle=True, num_epochs=None):
    """Build a tf.data input pipeline and return its next batch.

    This function does not train anything itself; it only produces the
    (features, labels) tensors that an estimator consumes.

    Args:
        features: pandas DataFrame of features.
        targets: pandas Series (or DataFrame) of targets, row-aligned with
            `features`.
        batch_size: Size of batches to be passed to the model.
        shuffle: True or False. Whether to shuffle the data.
        num_epochs: Number of epochs for which data should be repeated.
            None = repeat indefinitely.

    Returns:
        Tuple of (features, labels) for the next data batch.
    """
    # Convert pandas data into a dict of np arrays keyed by column name.
    features = {key: np.array(value) for key, value in dict(features).items()}

    # Construct a dataset of individual (features, target) examples.
    ds = Dataset.from_tensor_slices((features, targets))  # warning: 2GB limit

    # Shuffle individual examples BEFORE batching and repeating. Shuffling
    # after batch()/repeat() would only reorder whole batches (each batch
    # always holding the same consecutive rows) and would mix examples
    # across epoch boundaries.
    if shuffle:
        ds = ds.shuffle(buffer_size=10000)

    # Group examples into batches, then repeat for the requested epochs.
    ds = ds.batch(batch_size).repeat(num_epochs)

    # Return the next batch of data.
    features, labels = ds.make_one_shot_iterator().get_next()
    return features, labels

In [9]:

def _training_input_fn():
    """Zero-argument input function for training; uses input_fn's defaults."""
    return input_fn(my_feature, targets)


# Train the model for 100 steps; the return value is discarded.
_ = linear_regressor.train(input_fn=_training_input_fn, steps=100)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into /tmp/tmp42fj8z2m/model.ckpt.
INFO:tensorflow:loss = 136087.2, step = 0
INFO:tensorflow:Saving checkpoints for 100 into /tmp/tmp42fj8z2m/model.ckpt.
INFO:tensorflow:Loss for final step: 4034.093.


In [10]:
"""Make predictions
Create an input function for predictions.
Note: Since we're making just one prediction for each example, we don't 
need to repeat or shuffle the data here.
"""
# Input function for prediction: a single pass over the data, in order.
prediction_input_fn = lambda: input_fn(my_feature, targets, num_epochs=1, shuffle=False)

# Call predict() on the linear_regressor to make predictions.
predictions = linear_regressor.predict(input_fn=prediction_input_fn)

# Format predictions as a NumPy array, so we can calculate error metrics.
predictions = np.array([item['predictions'][0] for item in predictions])

# Print Mean Squared Error and Root Mean Squared Error.
# sklearn's signature is mean_squared_error(y_true, y_pred): pass the true
# targets first. MSE is symmetric, so the value is unchanged, but the correct
# order keeps this cell right if the metric is ever swapped for an asymmetric one.
mean_squared_error = metrics.mean_squared_error(targets, predictions)
root_mean_squared_error = math.sqrt(mean_squared_error)
print("Mean Squared Error (on training data): %0.3f" % mean_squared_error)
print("Root Mean Squared Error (on training data): %0.3f" % root_mean_squared_error)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmp42fj8z2m/model.ckpt-100
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
Mean Squared Error (on training data): 56367.025
Root Mean Squared Error (on training data): 237.417
