In [None]:
# Check that TensorFlow 2.5 is installed: the matching `pip freeze` line is printed,
# and nothing is printed when no installed package matches the pattern.
!pip freeze | grep tensorflow==2.5

In [None]:
# Import libraries
import tensorflow as tf
import pandas as pd
import numpy as np
import shutil

# Show the version of the TensorFlow module that was imported above.
print(tf.__version__)

In [None]:
# CSV layout: the label is the first column, the features follow, and the key is last.
CSV_COLUMNS = [
    'fare_amount',
    'pickuplon',
    'pickuplat',
    'dropofflon',
    'dropofflat',
    'passengers',
    'key',
]
# The label is the leading column; the features are everything between it
# and the trailing key column.
LABEL = CSV_COLUMNS[0]
FEATURES = CSV_COLUMNS[1:-1]

# The CSV files are read without a header row, so the column names are
# supplied explicitly from CSV_COLUMNS. Training data is read first, then validation.
df_train, df_valid = (
    pd.read_csv(csv_path, header=None, names=CSV_COLUMNS)
    for csv_path in ('./taxi-train.csv', './taxi-valid.csv')
)

In [None]:
# Full column layout of the taxi CSV files: label, features, then key.
# Kept identical to the list used when the CSVs are loaded, so that re-running
# cells out of order cannot leave CSV_COLUMNS holding a partial or misspelled schema.
CSV_COLUMNS = ['fare_amount',
               'pickuplon',
               'pickuplat',
               'dropofflon',
               'dropofflat',
               'passengers',
               'key']

In [None]:
# Input function that feeds a Pandas DataFrame to an Estimator
def make_input_fn(df, num_epochs, shuffle = True, batch_size = 128):
    """Builds an Estimator input function backed by a Pandas DataFrame.

    Args:
        df: DataFrame containing the column named by LABEL. The whole frame
            is passed as the feature source and df[LABEL] as the target.
        num_epochs: number of passes to make over `df`.
        shuffle: whether rows are shuffled. Defaults to True, which was the
            previously hard-coded behaviour; pass False when the row order
            of the output matters.
        batch_size: rows per batch. Defaults to 128, the previously
            hard-coded value.

    Returns:
        The input function produced by
        `tf.compat.v1.estimator.inputs.pandas_input_fn`.
    """
    return tf.compat.v1.estimator.inputs.pandas_input_fn(
        x = df,
        y = df[LABEL],
        batch_size = batch_size,
        num_epochs = num_epochs,
        shuffle = shuffle,
        queue_capacity = 1000,
        num_threads = 1)

In [None]:
# Feature columns handed to the estimators
def make_feature_cols():
    """Returns one numeric feature column per name in FEATURES, in FEATURES order."""
    return list(map(tf.feature_column.numeric_column, FEATURES))

In [None]:
# Fit a linear regression model through the tf.estimator API
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)

OUTDIR = 'taxi_trained'
# Remove any existing model directory so every run starts fresh.
shutil.rmtree(OUTDIR, ignore_errors=True)

model = tf.estimator.LinearRegressor(
    feature_columns=make_feature_cols(),
    model_dir=OUTDIR,
)

# Ten passes over the training frame.
model.train(input_fn=make_input_fn(df_train, num_epochs=10))

In [None]:
def print_rmse(model, name, df):
    """Evaluates `model` on a single pass over `df` and prints the RMSE.

    Args:
        model: object exposing `evaluate(input_fn=...)` whose returned
            metrics contain an 'average_loss' entry.
        name: dataset label inserted into the printed message.
        df: DataFrame handed to `make_input_fn` for one epoch.
    """
    eval_input_fn = make_input_fn(df, 1)
    average_loss = model.evaluate(input_fn = eval_input_fn)['average_loss']
    # The reported RMSE is the square root of the 'average_loss' metric.
    rmse = np.sqrt(average_loss)
    print('RMSE on {} dataset = {}'.format(name, rmse))

print_rmse(model, 'validation', df_valid)

In [None]:
import itertools
# Read saved model and use it for prediction
model = tf.estimator.LinearRegressor(
    feature_columns = make_feature_cols(), model_dir = OUTDIR)
# Prediction needs no labels and must not shuffle: with shuffling enabled the
# printed values would belong to arbitrary rows and change from run to run.
# With shuffle = False they follow the row order of df_valid.
predict_input_fn = tf.compat.v1.estimator.inputs.pandas_input_fn(
    x = df_valid,
    y = None,
    batch_size = 128,
    num_epochs = 1,
    shuffle = False,
    queue_capacity = 1000,
    num_threads = 1)
preds_iter = model.predict(input_fn = predict_input_fn)
# Only the first five predictions are pulled from the generator.
print([pred['predictions'][0] for pred in itertools.islice(preds_iter, 5)])

In [None]:
# Train a deep neural network regressor on the same feature columns
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
# Remove any existing model directory so every run starts fresh.
shutil.rmtree(OUTDIR, ignore_errors=True)
model = tf.estimator.DNNRegressor(
    hidden_units=[32, 8, 2],
    feature_columns=make_feature_cols(),
    model_dir=OUTDIR,
)
# One hundred passes over the training frame, then report validation RMSE.
model.train(input_fn=make_input_fn(df_train, num_epochs=100))
print_rmse(model, 'validation', df_valid)

In [None]:
# Benchmark dataset
from google.cloud import bigquery
import numpy as np
import pandas as pd

def create_query(phase, EVERY_N):
    """Creates a query with the proper splits.

    The split is made by hashing `pickup_datetime` with FARM_FINGERPRINT and
    filtering on the hash modulo a bucket count, so a given row always lands
    in the same split.

    Args:
        phase: int, 1=train, 2=valid.
        EVERY_N: int or None. When an int, rows whose hash modulo EVERY_N
            equals `phase` are kept. When None, the hash is taken modulo 4:
            a `phase` below 2 keeps buckets 0 and 1, any other `phase`
            keeps the bucket equal to `phase`.

    Returns:
        Query string with the proper splits.
    """
    base_query = """
    WITH daynames AS
    (SELECT ['Sun', 'Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Sat'] AS daysofweek)
    SELECT
    (tolls_amount + fare_amount) AS fare_amount,
    daysofweek[ORDINAL(EXTRACT(DAYOFWEEK FROM pickup_datetime))] AS dayofweek,
    EXTRACT(HOUR FROM pickup_datetime) AS hourofday,
    pickup_longitude AS pickuplon,
    pickup_latitude AS pickuplat,
    dropoff_longitude AS dropofflon,
    dropoff_latitude AS dropofflat,
    passenger_count AS passengers,
    'notneeded' AS key
    FROM
    `nyc-tlc.yellow.trips`, daynames
    WHERE
    trip_distance > 0 AND fare_amount > 0
    """
    # Built once so every branch closes FARM_FINGERPRINT(...) before MOD's
    # second argument; the None branches previously emitted unbalanced
    # parentheses and therefore invalid SQL.
    fingerprint = "FARM_FINGERPRINT(CAST(pickup_datetime AS STRING))"

    if EVERY_N is None:
        if phase < 2:
            # training
            split_filter = "ABS(MOD({0}, 4)) < 2".format(fingerprint)
        else:
            split_filter = "ABS(MOD({0}, 4)) = {1}".format(fingerprint, phase)
    else:
        split_filter = "ABS(MOD({0}, {1})) = {2}".format(
            fingerprint, EVERY_N, phase)

    return "{0} AND {1}".format(base_query, split_filter)

# NOTE(review): the service-account key file name is hard-coded here; consider
# supplying it through configuration instead of the notebook source.
CREDS = 'arboreal-parser-228610-683598fe8b4a.json'
client = bigquery.Client.from_service_account_json(
    json_credentials_path=CREDS,
)
# phase=2 with EVERY_N=100000 keeps rows whose hash modulo 100000 equals 2.
query = create_query(phase=2, EVERY_N=100000)
df = (
    client
    .query(query)
    .to_dataframe()
)

In [None]:
# Report the RMSE of whichever estimator `model` currently refers to on the
# BigQuery sample held in `df`.
print_rmse(model, 'benchmark', df)