In [None]:
import tensorflow as tf
import pandas as pd
import shutil
# Show the installed TensorFlow version in the notebook output.
# NOTE(review): the cells below call tf.contrib.* and tf.logging.*, which look
# like TensorFlow 1.x-only APIs — confirm the kernel's version matches.
print(tf.__version__)

In [None]:
# Paths of the train / validation / test CSV splits, relative to the
# notebook's working directory.
train_data_file, valid_data_file, test_data_file = (
    'data/train-data.csv',
    'data/valid-data.csv',
    'data/test-data.csv',
)

## Steps to use the TF Estimator APIs

1. Define dataset metadata

2. Create TF feature columns based on metadata

3. Define data input function to populate the features from the data source

4. Create experiment: Initialise the estimator

5. Run experiment: Supply train data, evaluation data, config, and params

6. Evaluate the trained model on the test set

## Define dataset metadata

In [None]:
# Each column name is listed next to the entry that sits at the same position
# in DEFAULTS (presumably that column's default value — confirm if DEFAULTS is
# ever consumed; it is not referenced by the other cells visible here).
_COLUMN_SPECS = [
    ('pickup_datetime',   ['NULL']),
    ('pickup_dayofweek',  ['NULL']),
    ('pickup_hour',       [-1]),
    ('pickup_longitude',  [-74.0]),
    ('pickup_latitude',   [40.0]),
    ('dropoff_longitude', [-74.0]),
    ('dropoff_latitude',  [40.7]),
    ('passenger_count',   [-1]),
    ('fare_amount',       [-.1]),
]

# HEADER: the column names handed to pd.read_csv(names=HEADER) in later cells.
# DEFAULTS: the single-element lists, in the same order as HEADER.
HEADER, DEFAULTS = (list(column) for column in zip(*_COLUMN_SPECS))

# Columns used as numeric model inputs.
NUMERIC_FEATURE_NAMES = [
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
    'passenger_count',
]

# No categorical inputs are declared in this notebook.
CATEGORICAL_FEATURE_NAMES = []

FEATURE_NAMES = NUMERIC_FEATURE_NAMES + CATEGORICAL_FEATURE_NAMES

TARGET_NAME = 'fare_amount'

# Header columns that are neither an input feature nor the target.
UNUSED_FEATURE_NAMES = set(HEADER) - set(FEATURE_NAMES) - {TARGET_NAME}

print("Numeric features: {}".format(NUMERIC_FEATURE_NAMES))
print("Categorical features: {}".format(CATEGORICAL_FEATURE_NAMES))
print("Target: {}".format(TARGET_NAME))
print("Unused features: {}".format(UNUSED_FEATURE_NAMES))

## Define input features

In [None]:
def create_feature_columns():
    """Build one float32 numeric feature column per name in NUMERIC_FEATURE_NAMES.

    Returns:
        list: the numeric columns, in the same order as NUMERIC_FEATURE_NAMES.
    """
    return [
        tf.feature_column.numeric_column(name, dtype=tf.float32)
        for name in NUMERIC_FEATURE_NAMES
    ]


# Smoke-test: build the columns once; the estimator cells below read this
# module-level `feature_columns`.
feature_columns = create_feature_columns()
print(feature_columns)

## Define a data input function

This function builds the TensorFlow input structures (a dictionary of feature tensors plus a target tensor) from a pandas DataFrame. Note on scalability: the entire DataFrame must fit in memory.

In [None]:
def pandas_input_fn(data_frame):
    """Turn a pandas DataFrame into the (features, target) pair fed to the estimators.

    Args:
        data_frame: DataFrame containing every column named in
            NUMERIC_FEATURE_NAMES plus the TARGET_NAME column.

    Returns:
        tuple: (features, target), where `features` maps each numeric feature
        name to a float32 tf.constant holding that column's values and
        `target` is a float32 tf.constant holding the TARGET_NAME column.
    """
    features = {}
    for column_name in NUMERIC_FEATURE_NAMES:
        column_values = data_frame[column_name].values
        features[column_name] = tf.constant(column_values, dtype=tf.float32)

    target = tf.constant(data_frame[TARGET_NAME].values, dtype=tf.float32)

    return features, target


# Smoke-test pandas_input_fn() on the training split (read without a header
# row; column names come from HEADER).
df_train = pd.read_csv(train_data_file, header=None, names=HEADER)
features, target = pandas_input_fn(df_train)
feature_colum_names = list(features)
print(feature_colum_names)
print(target)

## Define the evaluation metric

In [None]:
# Extra metric requested in every evaluation: root mean squared error.
evaluation_metrics = {
    'rmse': tf.contrib.learn.MetricSpec(metric_fn=tf.metrics.root_mean_squared_error),
}


def print_evaluation(estimator):
    """Evaluate `estimator` on the train, validation and test frames and print the results.

    Reads the module-level df_train, df_valid and df_test DataFrames and the
    module-level evaluation_metrics. Also sets the TensorFlow logging verbosity
    to ERROR; the function does not restore the previous level.

    Args:
        estimator: object exposing evaluate(input_fn=..., steps=..., metrics=...).
    """
    tf.logging.set_verbosity(tf.logging.ERROR)

    def evaluate_on(data_frame):
        # One evaluation step over whatever pandas_input_fn yields for the frame.
        return estimator.evaluate(
            input_fn=lambda: pandas_input_fn(data_frame),
            steps=1,
            metrics=evaluation_metrics,
        )

    train_metric = evaluate_on(df_train)
    valid_metric = evaluate_on(df_valid)
    test_metric = evaluate_on(df_test)

    print("")
    print("train metric:{}".format(train_metric))
    print("valid metric:{}".format(valid_metric))
    print("test metric:{}".format(test_metric))

## Create an experiment with Linear Regression Estimator

In [None]:
def experiment_linear_regression(run_config, hparams):
    """Create the Experiment that trains and evaluates a LinearRegressor.

    Uses the module-level feature_columns, evaluation_metrics, df_train and
    df_valid.

    Args:
        run_config: run configuration passed to the estimator.
        hparams: object providing `learning_rate` and `training_steps`.

    Returns:
        tf.contrib.learn.Experiment that trains on df_train and evaluates on
        df_valid.
    """
    sgd = tf.train.GradientDescentOptimizer(learning_rate=hparams.learning_rate)

    linear_regressor = tf.contrib.learn.LinearRegressor(
        feature_columns=feature_columns,
        optimizer=sgd,
        config=run_config,
    )

    def train_input_fn():
        return pandas_input_fn(df_train)

    def eval_input_fn():
        return pandas_input_fn(df_valid)

    return tf.contrib.learn.Experiment(
        estimator=linear_regressor,
        train_steps=hparams.training_steps,
        train_input_fn=train_input_fn,
        eval_input_fn=eval_input_fn,
        eval_metrics=evaluation_metrics,
    )

## Set params and run experiment - Linear Regression

In [None]:
# Load the three splits from the path variables declared near the top of the
# notebook, so the file locations are defined in exactly one place.
df_train = pd.read_csv(train_data_file, header=None, names=HEADER)
df_valid = pd.read_csv(valid_data_file, header=None, names=HEADER)
df_test = pd.read_csv(test_data_file, header=None, names=HEADER)

# Define algorithm and experiment parameters
hparams = tf.contrib.training.HParams(training_steps=10000, learning_rate=0.00001)

# Set trained model location
model_dir = "trained_models/linear_regression_model"

# Remove output left by a previous run; errors during removal (e.g. the
# directory does not exist yet) are ignored.
shutil.rmtree(model_dir, ignore_errors=True)

run_config = tf.contrib.learn.RunConfig(
    model_dir=model_dir
)

# Run experiment
tf.logging.set_verbosity(tf.logging.INFO)
tf.contrib.learn.learn_runner.run(experiment_fn=experiment_linear_regression,
                                  run_config=run_config,
                                  schedule="train_and_evaluate",
                                  hparams=hparams)

## Evaluate the trained Model - Linear Regression

In [None]:
# Instantiate a LinearRegressor with the same run_config (and so the same
# model_dir) as the training cell, then print its train/valid/test metrics.
# Unlike experiment_linear_regression, no optimizer is passed here.
linear_model = tf.contrib.learn.LinearRegressor(
    feature_columns=feature_columns,
    config=run_config,
)

print_evaluation(linear_model)

## Create another experiment using Deep Neural Networks (DNN)

In [None]:
def experiment_dnn_regression(run_config, hparams):
    """Create the Experiment that trains and evaluates a DNNRegressor.

    Uses the module-level feature_columns, evaluation_metrics, df_train and
    df_valid.

    Args:
        run_config: run configuration passed to the estimator.
        hparams: object providing `hidden_units` and `training_steps`.

    Returns:
        tf.contrib.learn.Experiment that trains on df_train and evaluates on
        df_valid.
    """
    dnn_regressor = tf.contrib.learn.DNNRegressor(
        feature_columns=feature_columns,
        hidden_units=hparams.hidden_units,
        config=run_config,
    )

    def train_input_fn():
        return pandas_input_fn(df_train)

    def eval_input_fn():
        return pandas_input_fn(df_valid)

    return tf.contrib.learn.Experiment(
        estimator=dnn_regressor,
        train_steps=hparams.training_steps,
        train_input_fn=train_input_fn,
        eval_input_fn=eval_input_fn,
        eval_metrics=evaluation_metrics,
    )

## Set params and run experiment - DNN

In [None]:
# Load the three splits from the path variables declared near the top of the
# notebook, so the file locations are defined in exactly one place.
df_train = pd.read_csv(train_data_file, header=None, names=HEADER)
df_valid = pd.read_csv(valid_data_file, header=None, names=HEADER)
df_test = pd.read_csv(test_data_file, header=None, names=HEADER)

# Set params (this rebinds the module-level `hparams`; the DNN evaluation cell
# reads hparams.hidden_units from it)
hparams = tf.contrib.training.HParams(training_steps=10000,
                                      hidden_units=[32, 8, 2])
model_dir = "trained_models/dnn_regression_model"

# Remove output left by a previous run; errors during removal (e.g. the
# directory does not exist yet) are ignored.
shutil.rmtree(model_dir, ignore_errors=True)

run_config = tf.contrib.learn.RunConfig(
    model_dir=model_dir
)

# Run the experiment
tf.logging.set_verbosity(tf.logging.WARN)
tf.contrib.learn.learn_runner.run(experiment_fn=experiment_dnn_regression,
                                  run_config=run_config,
                                  schedule="train_and_evaluate",
                                  hparams=hparams)

## Evaluate the trained Model - DNN

In [None]:
# Instantiate a DNNRegressor with the same hidden_units and run_config (and so
# the same model_dir) as the training cell, then print its train/valid/test
# metrics.
dnn_model = tf.contrib.learn.DNNRegressor(
    feature_columns=feature_columns,
    hidden_units=hparams.hidden_units,
    config=run_config,
)

print_evaluation(dnn_model)

## Results so far...

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

df = pd.DataFrame({
              'Method' : pd.Series(['Basline', 'Linear Reg', 'DNN', ' ---', '----', '-----']),
              'RMSE': pd.Series([8.89, 11.15, 14.94, 0, 0, 0.0]) })

plt.figure(figsize=(15, 8))
ax = sns.barplot(data=df, x='Method', y='RMSE')
ax.set_ylabel('RMSE (dollars)')
ax.set_xlabel('Method')