# Grab Challenge - Traffic Management

This notebook preprocesses the test dataset and load the pretrained XGBoost model to predict traffic demand

In [1]:
import geohash
import numpy as np
import pandas as pd
import random
import xgboost
import pickle

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from annoy import AnnoyIndex

## Data Preprocessing

#### Load Test Dataset

In [3]:
df = pd.read_csv("data/testing.csv")

#### Split timestamp to hour and minute

In [4]:
df['hour'] = df['timestamp'].apply(lambda x: int(x.split(':')[0]))
df['minute'] = df['timestamp'].apply(lambda x: int(x.split(':')[1]))

#### Convert geohash to latitude and longitude

In [5]:
def geohash2LatLong(gh):
    lat_long = geohash.decode_exactly(gh)
    return float(lat_long[0]), float(lat_long[1])

lats = []
longs = []
for index, row in df.iterrows():
    lat, long = geohash2LatLong(row['geohash6'])
    lats.append(lat)
    longs.append(long)

df['lat'] = lats
df['long'] = longs

### Additional Feature 1: Past Days Demand Value

This feature looks up the previous demand values of each geohash location for past few days. This is to incorporate time-series feature of the dataset into the prediction.

In [6]:
# Number of past days to look up as feature
no_days_lookup = 3

In [7]:
day_mean = df.groupby(['geohash6', 'day'])['demand'].mean()

In [8]:
# Insert additional feature columns
for i in range(no_days_lookup):
    column = 'D-' + str(i+1) + ' mean'
    df.insert(len(df.columns), column, value=0.0)

In [9]:
for i, row in df.iterrows():
#     print(i)
    geohash_value = row['geohash6']
    day = row['day']
    for j in range(no_days_lookup):
        column = 'D-' + str(j+1) + ' mean'
        insert_tuple = (geohash_value, day-(j+1))
        if insert_tuple in day_mean.index:
            df.set_value(i, column, float(day_mean.loc[[insert_tuple]]))
        else:
            df.set_value(i, column, 0.0)

  # This is added back by InteractiveShellApp.init_path()
  if __name__ == '__main__':


### Additional Feature 2: Nearest Neighbours Demand Value

This feature uses Spotify's Annoy library to find the nearest neighbours of each data entries at given timeframe, and uses product of neighbours demand value and distance to the neigbhour as the additional feature to incorporate spatial element to the prediction.

In [10]:
# Number of closest neighbours to use as feature
no_neighbours = 5

In [11]:
# Sort dataframe to find nearest neighbours in each day and time
df = df.sort_values(by=['day', 'timestamp'])

In [12]:
# Insert additional feature columns
for i in range(no_neighbours):
    column = str(i+1) + '_neighbour'
    df.insert(len(df.columns), column, value=0.0)

In [13]:
# Data in same timeframe (date and time) belongs to the same 'window'

windows = []
window_indices = []
day = None
timestamp = None
for i, row in df.iterrows():
    if day == None and timestamp == None:
        day = row['day']
        timestamp = row['timestamp']
    elif row['day'] != day or row['timestamp'] != timestamp:
        windows.append(window_indices)
        window_indices = []
        day = None
        timestamp = None
    window_indices.append(i)

In [14]:
# Insert (demand * distance) value of nearest neighbours into dataframe 

for window in windows:
    t = AnnoyIndex(2, metric='euclidean')
    for index in window:
        t.add_item(index, [df['lat'][index], df['long'][index]])
    t.build(10)
    
    for index in window:
        indices, distances = t.get_nns_by_item(i=index, n=no_neighbours+1, include_distances=True)
        neighbours = list(zip(indices, distances))[1:] # Exclusde first neighbour => itself
        for i, neighbour in enumerate(neighbours):
            neighbour_index = neighbour[0]
            neighbour_distance = neighbour[1]
            neighbour_demand = df['demand'][neighbour_index]
            column = str(i+1) + '_neighbour'
            df[column][index] = neighbour_distance * neighbour_demand

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [15]:
# Reshuffle dataframe
df = df.sample(frac=1)

### Load Pretrained XGBoost Regressor Model

In [19]:
model = pickle.load(open("traffic_xgboost.dat", "rb"))



In [20]:
y = df['demand']
X = df.drop(['geohash6', 'timestamp', 'demand'], axis=1)

In [21]:
# Make Prediction
preds = model.predict(X)

In [22]:
# Calculate Root Mean Square Error
rmse = np.sqrt(mean_squared_error(y, preds))
print("RMSE: %f" % (rmse))

RMSE: 0.132396
