# Forecasting Taxi Fare via Neural Networks


Imagine that you work for a taxi company, and that one of your customers' biggest complaints is that they don't know how much a ride will cost until it's over. That's because distance is just one of several factors from which taxi fares are calculated. You decide to do something about it by building a mobile app that customers can use when they climb into a taxi to estimate what the fare will be. To provide the smarts for the app, you intend to use the massive amounts of fare data the company has collected over the years to train a neural network. Let's use a portion of a larger taxi-fare dataset from New York City to train the network to predict a fare amount given the time of day, the pickup and dropoff locations, and other information.

<div style="text-align: center;"> <img src = "res/regression/taxi_icon.jpg" width="25%"/> </div>

In the process, you'll see <strong> that neural networks can also be used for regression in addition to classification</strong>, which represents the primary learning goal of this practicum. This is accomplished by using a linear activation function for the output layer rather than a sigmoid.

# 0 | Google Colab Setup

In [1]:
import os
import shutil
import stat

In [2]:
def copy_safe(src, dst, max_len=200):
    """Recursively copy the tree at ``src`` into ``dst``, skipping over-long paths.

    Walks ``src`` with ``os.walk`` and mirrors it under ``dst``. A directory
    whose destination path is ``max_len`` characters or longer is not created,
    and the files directly inside it are counted as skipped. The same applies
    to an individual file whose destination path is too long. A file that
    fails to copy with an ``OSError`` is counted as skipped instead of
    aborting the whole copy.

    Args:
        src: Root directory to copy from.
        dst: Root directory to copy into (created if missing).
        max_len: Exclusive upper bound on the length of a destination path.

    Returns:
        The number of files that were not copied.
    """
    skipped = 0
    for root, _dirs, files in os.walk(src):
        rel_path = os.path.relpath(root, src)
        dst_root = dst if rel_path == '.' else os.path.join(dst, rel_path)

        # Directory path too long: none of its files can be placed
        if len(dst_root) >= max_len:
            skipped += len(files)
            continue

        os.makedirs(dst_root, exist_ok=True)
        for file in files:
            dst_file = os.path.join(dst_root, file)
            if len(dst_file) >= max_len:
                skipped += 1
                continue
            try:
                shutil.copy2(os.path.join(root, file), dst_file)
            except OSError:
                # Only filesystem failures are tolerated; a bare `except`
                # would also swallow KeyboardInterrupt and SystemExit.
                skipped += 1
    return skipped

In [3]:
# Setup resources if needed
setup_ran = False
if not os.path.exists('res'):
    print("Setting up resources...")
    setup_ran = True
    
    # Cleanup, clone, copy
    repo = 'deep_learning_resources'
    if os.path.exists(repo):
        shutil.rmtree(repo, onerror=lambda f,p,e: os.chmod(p, stat.S_IWRITE) or f(p))
    
    !git clone --depth=1 https://github.com/jjv31/deep_learning_resources
    
    if os.path.exists(f'{repo}/res'):
        skipped = copy_safe(f'{repo}/res', 'res')
        print(f"Setup complete! {'(' + str(skipped) + ' long filenames skipped)' if skipped else ''}")
    
    shutil.rmtree(repo, onerror=lambda f,p,e: os.chmod(p, stat.S_IWRITE) or f(p))

In [4]:
# Refresh the page only when the setup cell actually downloaded resources
if not setup_ran:
    print("Resources already exist, skipping setup.")
else:
    import time
    from IPython.display import Javascript, display

    print("Refreshing images...")

    # Schedule a full page reload after 2 s and, in the meantime, re-request every
    # <img> whose src contains 'res/' with a timestamped query string appended.
    reload_script = f'''
    try {{ setTimeout(() => window.location.reload(true), 2000); }} catch(e) {{}}
    
    const t = {int(time.time())};
    document.querySelectorAll('img').forEach((img, i) => {{
        if (img.src.includes('res/')) {{
            const src = img.src.split('?')[0];
            setTimeout(() => img.src = src + '?v=' + t + '_' + i, i * 50);
        }}
    }});
    '''
    display(Javascript(reload_script))

    print("If images don't appear, press Ctrl+Shift+R to hard refresh!")

Resources already exist, skipping setup.


# 1 | Introduction

### 1.1 | Imports

In [None]:
%pip install tqdm datetime

In [None]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm


# Model evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Neural network
from keras import metrics
from keras.models import Sequential
from keras.layers import Dense, Input
from keras.optimizers import Adam

# Other
import datetime
from math import sqrt
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()

In [None]:
def print_univariates_metric(data, nameToPrint=None):
    """Print descriptive statistics for a pandas Series.

    Reports the mean, median, mode, min, max, standard deviation and the
    25th/75th percentiles. When several values tie for the mode, the first
    entry returned by ``Series.mode()`` is shown; ``None`` is shown when
    ``mode()`` returns nothing.

    Args:
        data: Series of values to summarise.
        nameToPrint: Label used in the header line.
    """
    # mode() returns a Series because more than one value can tie
    modes = data.mode()
    mode_value = None if modes.empty else modes[0]

    print(f"Descriptives for {nameToPrint}")

    centre = f"Mean = {round(data.mean(),2)} | Median = {round(data.median(),2)} | Mode = {mode_value} | "
    spread = f"Min = {data.min()} | Max = {data.max()} | SD = {round(data.std(),2)} | "
    quartiles = f"IQR(25) = {data.quantile(0.25)} | IQR(75) = {data.quantile(0.75)}"
    print(centre + spread + quartiles)

In [None]:
# Plots the performance of the neural network
def plot_performance(training_values, validation_values, metric_name = "Recall"):
    """Plot a per-epoch training curve against its validation curve.

    Args:
        training_values: One metric value per epoch, measured on the training data.
        validation_values: One metric value per epoch, measured on the validation
            data; plotted against the same epoch axis as ``training_values``.
        metric_name: Label used in the title, the y-axis and the legend.
    """
    epochs = range(1, len(training_values) + 1)  # number epochs from 1

    sns.set()
    plt.plot(epochs, training_values, '-', label=f'Training {metric_name}')
    plt.plot(epochs, validation_values, ':', label=f'Validation {metric_name}')

    plt.title(f'Training and Validation {metric_name}')
    plt.xlabel('Epoch')
    plt.ylabel(metric_name)
    plt.legend(loc='lower right')
    # Display the finished figure; an argument-less plt.plot() draws nothing.
    plt.show()

### 1.2 | Loads & Explores Data

Start by loading the dataset and shaping it so that it's suitable for use in machine learning. The data requires a fair amount of prep work before it's of any use at all — something that is not uncommon in machine learning. Data scientists often find that collecting and preparing data accounts for 90% or more of their time.

In [None]:
# Read the taxi-fare CSV from the res/ folder into a DataFrame and preview the first rows
df = pd.read_csv('res/regression/taxi-fares.csv')
df.head()

In [None]:
# How many rows and columns does the dataset contain?
# df.shape is a (row_count, column_count) tuple.
df.shape

In [None]:
# Are any of the columns missing values?
# isnull().sum() counts the missing entries in each column; 0 means none are missing.
df.isnull().sum()

In [None]:
# Bar chart of how many rides were recorded for each passenger count
sns.countplot(x=df['passenger_count'])

In [None]:
# Rides with a single passenger make up most of the dataset, so only those are kept.
# After the filter, passenger_count is the same for every row; it is dropped together with key.
df = df.loc[df['passenger_count'] == 1].drop(columns=['key', 'passenger_count'])
df.head()

# 2 | Preprocessing

### 2.1 | Feature Engineering

In [None]:
# Preview the remaining columns before engineering new features from them
df.head(3)

These features are terrible for creating a neural network.

First, neural networks cannot process a datetime, so we need to turn that into meaningful ordinal features. In this case, we are going to create an ordinal feature that flags the day of the week the ride took place, as well as one that specifies the hour they were picked up, as both likely affect fare prices. For example, getting picked up at 5 PM on a weekday will likely incur a different fare than being picked up at 11 PM on a Sunday!

Second, longitude and latitude are large numbers that may confound a neural network, and they're not inherently meaningful on their own. Thus, we're going to extract distance from them.

In [None]:

# Builds the three model inputs (day_of_week, pickup_time, distance) with
# column-wise operations instead of visiting the rows one at a time.

# From pickup_datetime, turn it into something meaningful. That is, extract day_of_week and pickup_time from it.
pickup = pd.to_datetime(df['pickup_datetime'], format='%Y-%m-%d %H:%M:%S UTC')
df['day_of_week'] = pickup.dt.weekday  # Monday = 0 ... Sunday = 6
df['pickup_time'] = pickup.dt.hour     # hour of the day, 0-23

# Calculates distance: straight-line distance in miles between pickup and dropoff,
# after scaling each coordinate difference from degrees to miles.
dx = (df['dropoff_longitude'] - df['pickup_longitude']) * 54.6  # 1 degree = 54.6 miles
dy = (df['dropoff_latitude'] - df['pickup_latitude']) * 69.0    # 1 degree = 69 miles
df['distance'] = np.hypot(dx, dy)

In [None]:
# The raw timestamp and coordinate columns have served their purpose: the
# engineered features were derived from them, so they are removed here.
df = df.drop(
    [
        'pickup_datetime',
        'pickup_longitude',
        'pickup_latitude',
        'dropoff_longitude',
        'dropoff_latitude',
    ],
    axis=1,
)
df.head()

### 2.2 | Check for anomalous values

In [None]:
# Let's inspect fare amount
# (prints mean, median, mode, min, max, SD and quartiles of the fare_amount column)
print_univariates_metric(df["fare_amount"], "Fare")

In [None]:
# There are negative fares, which doesn't make sense. Let's remove them.
# Let's also remove really extreme fares, i.e. fares more than roughly 2 SDs above the mean (11.21 + 9.7 * 2 = 30.61).
# Each comparison needs its own parentheses because `&` binds more tightly than `>` and `<`.

df = df[(df['fare_amount'] > 0.0) & (df['fare_amount'] < 30.61)]

In [None]:
# Let's inspect distances.
# (prints mean, median, mode, min, max, SD and quartiles of the engineered distance column)
print_univariates_metric(df["distance"], "Distance")

In [None]:
# Yikes! The outliers are numerous enough to massively inflate the mean and SD, so they are removed.
# The 75th percentile is 2.4, so only rides strictly longer than 1 mile and strictly shorter than 10 miles are kept.
df = df[df['distance'].between(1.0, 10.0, inclusive='neither')]

# 3 | Trains Initial Neural Network

Now it's time to build a neural network and train it with the data prepared in the previous section. We'll create two hidden layers with 512 neurons each and an input layer that accepts three values: the day of the week, the time of day, and the distance. Since the model is designed to predict a fare amount, the output layer will have one neuron.

### 3.1 | Train Test Split

In [None]:
# Separate the target (fare_amount) from the feature columns, then hold out 20%
# of the rows for testing; the fixed random_state makes the split repeatable.
y = df["fare_amount"]
X = df.drop(columns=["fare_amount"])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Size of training set = {len(X_train)} fares")
print(f"Size of test set = {len(X_test)} fares")

### 3.2 | Compile & Train Neural Network

In [None]:
# Fully connected regression network: 3 inputs -> 512 -> 512 -> 1 output.
# The final Dense layer is given no activation, so its output is not squashed.
model = Sequential([
    Input(shape=(3,)),  # Input layer
    Dense(512, activation='relu'),
    Dense(512, activation='relu'),
    Dense(1),
])

# Mean squared error serves both as the training loss and as the reported metric
model.compile(
    optimizer=Adam(learning_rate=.001),
    loss='mse',
    metrics=[metrics.MeanSquaredError(name='mse')],
)
model.summary()

In [None]:
# Train for 100 epochs in batches of 1028 rows. The held-out test split is passed
# as validation data, so the per-epoch val_loss kept in `hist` is measured on it.
hist = model.fit(X_train, y_train, validation_data=(X_test, y_test), 
                 epochs=100, batch_size=1028)

### 3.3 | Evaluate Neural Network

In [None]:
# Training loss vs. validation loss for each epoch of the initial model
plot_performance(hist.history["loss"], hist.history["val_loss"], metric_name = "Loss")

In [None]:
# The r2 score is one way to assess how well a model performs.
# It is reported for the training split first and then for the testing split.
for heading, features, target in (
    ("R2 score for training set", X_train, y_train),
    ("R2 score for testing set", X_test, y_test),
):
    print(heading)
    print(r2_score(target, model.predict(features)))

In [None]:
# Another way to evaluate our model is to assess how 'off' our predicted fare values were from our true values.
# We'll evaluate this through median & IQR to avoid outliers

# predict() yields one row per test fare with a single output column. Flatten it to
# 1-D so that every prediction is compared with its own true fare; indexing with [0]
# would keep only the first prediction and compare it against every fare.
prediction_error = model.predict(X_test).ravel() - y_test
IQR25, median, IQR75 = (round(prediction_error.quantile(q), 2) for q in (0.25, 0.5, 0.75))
print(f"Your fare estimation is usually ${median} off. 50% of the time, your fare is between ${IQR25} and ${IQR75} off.")

# 4 | Forecasting Fares through Our Model.

In [None]:
# Estimate what it will cost to hire a taxi for a 2-mile trip at 5:00 p.m. on Friday afternoon.
# Inputs follow the order of the training columns: [day_of_week, pickup_time, distance].
# day_of_week was produced by datetime.weekday(), where Monday is 0, so Friday is 4.
predicted_fare = model.predict(np.array([[4, 17, 2.0]]))[0][0]
print(f"Estimated fare = ${predicted_fare}")

In [None]:
# Now predict the fare amount for a 2-mile trip taken at 5:00 p.m. one day later (on Saturday).
# Inputs are [day_of_week, pickup_time, distance]; with Monday = 0, Saturday is 5.
# Only the day changes relative to the Friday estimate, so the two fares are directly comparable.
predicted_fare = model.predict(np.array([[5, 17, 2.0]]))[0][0]
print(f"Estimated fare = ${predicted_fare}")

# 5 | Your Turn

### 5.0 | Section Overview

Your task is to improve the neural network so that it predicts fares better than the original model. The original model is defined in §3.2; modify the copy below so that it achieves a better r2 score on the testing set than the original model did in §3.3.

### 5.1 | Compile & Train Neural Network

In [None]:
# *****************************************
# EXERCISE
# *****************************************

# The code for the original model appears below.
# Modify that code so the model performs better.
# As given, it has the same layers, optimizer and loss as the initial model. It is
# built under the separate name `your_model`, so the original `model` stays untouched.

your_model = Sequential()
your_model.add( Input( shape=(3,) ) ) 
your_model.add(Dense(512, activation='relu',))
your_model.add(Dense(512, activation='relu'))
your_model.add(Dense(1))

your_model.compile(optimizer=Adam(learning_rate=.001), loss='mse', metrics=[metrics.MeanSquaredError(name='mse')] )
your_model.summary()

In [None]:
# Train your model on the same split and with the same epochs and batch size as the
# initial model; the training history is kept in `your_hist` for plotting.
your_hist = your_model.fit(X_train, y_train, validation_data=(X_test, y_test), 
                 epochs=100, batch_size=1028)

### 5.2 | Evaluation

In [None]:
# Training loss vs. validation loss for each epoch of your model
plot_performance(your_hist.history["loss"], your_hist.history["val_loss"], metric_name = "Loss")

In [None]:
# Run this code to determine if your model out/underperformed the original model.

# Score each model once and reuse the values: every predict() call re-runs
# inference over the whole split, so repeating it for each print is wasted work.
your_train_r2 = r2_score(y_train, your_model.predict(X_train))
your_test_r2 = r2_score(y_test, your_model.predict(X_test))
original_test_r2 = r2_score(y_test, model.predict(X_test))

print("R2 score for training set")
print(your_train_r2)

print("R2 score for testing set")
print(your_test_r2)

print("Did your model outperform the original model? Let's compare r2 scores on the testing set.")
print(f"Original r2 score = {original_test_r2} | Your r2 score = {your_test_r2}")