Linear regression using TensorFlow estimators

Using the immigrant skills and salaries dataset sourced from the UFL website.
Description - http://users.stat.ufl.edu/~winner/data/immwork.txt
Data - http://users.stat.ufl.edu/~winner/data/immwork.dat
The dataset is manually split into a training set and a test set.
The feature used (X) is '% speak English' (column 3) and the output (Y) is average weekly salary (column 2)

This code uses the high-level `tf.estimator` API (a pre-made `LinearRegressor`) instead of the low-level TensorFlow API.

In [405]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

In [406]:
# Column names, in the order the fields appear on each CSV line.
# They contain no spaces because the LinearRegressor estimator doesn't like
# column names with spaces.
COLS = [
    'PercentEnglish',
    'AvgSalary',
]
# Step size handed to the gradient descent optimizer.
learning_rate = 1e-4
# Despite the name, this is passed as the `steps` argument of `est.train`,
# so it counts training steps (batches), not full passes over the data.
num_epochs = 20000
# How often (in steps) the estimator logs progress.
progress_step = 1000

In [407]:
def _parse_line(line):
    """Parse one CSV line into the (features, labels) pair the estimator expects.

    `line` is decoded as two float fields (each with a default of 0.0) which
    are paired, in order, with the names in COLS.

    Returns a tuple `(features, labels)` where `features` is a dict mapping
    the remaining column names to their decoded values and `labels` is the
    value decoded for the 'AvgSalary' column.
    """
    record_defaults = [[0.0], [0.0]]
    decoded = tf.decode_csv(line, record_defaults)
    # The estimator wants features as a {column name: value} dict.
    features = {name: value for name, value in zip(COLS, decoded)}
    # The label must not remain among the features, so pull it out.
    labels = features.pop('AvgSalary')
    return (features, labels)

In [408]:
def train_input_fn():
    """Input function handed to the estimator for training.

    Reads the training CSV line by line, parses each line with `_parse_line`
    and returns the `get_next()` op of a one-shot iterator over the result.
    """
    train_files = ['/home/harini/tensorflow/notebooks/immdata-train.csv']
    parsed = tf.data.TextLineDataset(train_files).map(_parse_line)
    # A batch size must be given: without it LinearRegressor raises an error
    # about something having rank 0.
    batched = parsed.batch(1)
    # Repeating lets the iterator start over from the beginning once a pass
    # over the file has completed.
    endless = batched.repeat()
    iterator = endless.make_one_shot_iterator()
    return iterator.get_next()

def eval_input_fn():
    """Input function handed to the estimator during the test phase.

    Reads the test CSV line by line, parses each line with `_parse_line`
    and returns the `get_next()` op of a one-shot iterator over the result.
    """
    test_files = ['/home/harini/tensorflow/notebooks/immdata-test.csv']
    parsed = tf.data.TextLineDataset(test_files).map(_parse_line)
    # Batched like the training input, but deliberately NOT repeated here.
    batched = parsed.batch(1)
    iterator = batched.make_one_shot_iterator()
    return iterator.get_next()



In [409]:
# The feature columns that the LinearRegressor should use: a single numeric
# column whose key matches the feature name produced by _parse_line.
feature_columns = [
    tf.feature_column.numeric_column(key='PercentEnglish')
]
# The optimizer that the LinearRegressor should use. The default is the FTRL
# optimizer; plain gradient descent is used here instead.
opt = tf.train.GradientDescentOptimizer(learning_rate)

# specify in the run config how frequently (in steps) to log progress.
rc = tf.estimator.RunConfig(log_step_count_steps=progress_step)

# create the LinearRegressor with the above params.
est = tf.estimator.LinearRegressor(feature_columns=feature_columns, optimizer=opt, config=rc)

# train the model. Note that `num_epochs` is passed as the number of training
# steps, not as a number of passes over the data.
est.train(input_fn=train_input_fn, steps=num_epochs)

# evaluate with test data and print the result.
eval_res = est.evaluate(input_fn=eval_input_fn)
# Call form of print: the bare `print x` statement is a SyntaxError on
# Python 3, while print(x) with a single argument prints the same text on
# both Python 2 and Python 3.
print(eval_res)


INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_task_type': 'worker', '_train_distribute': None, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f137eb6b810>, '_evaluation_master': '', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_num_ps_replicas': 0, '_tf_random_seed': None, '_master': '', '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 1000, '_model_dir': '/tmp/tmpF0M3Fy', '_global_id_in_cluster': 0, '_save_summary_steps': 100}
{'PercentEnglish': <tf.Tensor 'DecodeCSV:0' shape=() dtype=float32>, 'AvgSalary': <tf.Tensor 'DecodeCSV:1' shape=() dtype=float32>}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tenso