In [2]:
%matplotlib inline

import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import cross_val_score
import geohash
from sklearn.metrics import mean_squared_error
import random
import math
import datetime




### Goal: Explore the possiblity of predicting a destination (lat/long) based on the pickup location, time & day

## 1. Load the aggregated data from the CSV file

In [3]:
# Each line is of the format:

#pickupGeohash, dropOffGeohash,time_num,day_of_week, count
# File is also available here: https://s3.amazonaws.com/testsetu/nyc/final/groupbydestn/singlefile/part-00000
names = ["pickup_geohash","dropoff_geohash","time_num","day_of_week", "count"]
df=pd.read_csv("./data/jason_destination1/part-00000", header=None, names = names)
#df = df.sample(n=10000000,weights=df['count'], replace=True)
print df.shape

(6475, 5)


## 2. Feature Extraction

In [4]:
# Get the longitude and latitude from the geohash
def decodegeo(geo, which):
    if len(geo) >= 6:
        geodecoded = geohash.decode(geo)
        return geodecoded[which]
    else:
        return 0
    
def further_data_prep(df):
  
    df['time_sin'] = (df['time_num'] * 2 * math.pi).apply(math.sin)
    df['time_cos'] = (df['time_num'] * 2 * math.pi).apply(math.cos)
    df['pickup_lat'] = df['pickup_geohash'].apply(lambda geo: decodegeo(geo, 0))
    df['pickup_long'] = df['pickup_geohash'].apply(lambda geo: decodegeo(geo, 1))
    df['dropoff_lat'] = df['dropoff_geohash'].apply(lambda geo: decodegeo(geo, 0))
    df['dropoff_long'] = df['dropoff_geohash'].apply(lambda geo: decodegeo(geo, 1))
    
    return df

In [5]:
df = further_data_prep(df)

## 3. Train-test split

In [7]:
trainSetSampleSize = 1000
testSetSize = 2000
testSetPosns = np.random.choice(df.shape[0],testSetSize, replace = False)

In [8]:
testSet = df.iloc[testSetPosns]
trainSet = df.drop(testSetPosns)

In [9]:
print df.shape
print trainSet.shape
print testSet.shape

(6475, 11)
(4475, 11)
(2000, 11)


In [10]:
#sample with replacement
trainSet = trainSet.sample(n=trainSetSampleSize,weights=trainSet['count'], replace=True)

In [11]:
X_train = trainSet[['time_num', 'time_sin', 'time_cos','day_of_week', 'pickup_lat', 'pickup_long', 'count']]
y_train = trainSet[['dropoff_lat', 'dropoff_long']]

X_test = testSet[['time_num', 'time_sin', 'time_cos','day_of_week', 'pickup_lat', 'pickup_long', 'count']]
y_test = testSet[['dropoff_lat', 'dropoff_long']]

In [12]:
pickup_count_train = X_train[['count']]
X_train.drop('count', axis=1, inplace=True)

pickup_count_test = X_test[['count']]
X_test.drop('count', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


## 4. Machine learning

In [13]:
reg = RandomForestRegressor(n_estimators=1, max_depth=20, n_jobs=-1, verbose=4, warm_start=True)
for n in range(1,20):
   reg.set_params(n_estimators=n)
   reg.fit(X_train,y_train) #, sample_weight=pickup_count_train.values)
   training_accuracy = reg.score(X_train, y_train)
   valid_accuracy = reg.score(X_test, y_test)
   rmsetrain = np.sqrt(mean_squared_error(reg.predict(X_train),y_train))
   rmsevalid = np.sqrt(mean_squared_error(reg.predict(X_test),y_test))
   print " R^2 (train) = %0.3f, R^2 (valid) = %0.3f, RMSE (train) = %0.3f, RMSE (valid) = %0.3f" % (training_accuracy, valid_accuracy, rmsetrain, rmsevalid)

building tree 1 of 1
 R^2 (train) = 0.271, R^2 (valid) = 0.109, RMSE (train) = 0.082, RMSE (valid) = 0.055
building tree 1 of 1


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done   2 out of 

 R^2 (train) = 0.289, R^2 (valid) = 0.302, RMSE (train) = 0.081, RMSE (valid) = 0.048
building tree 1 of 1


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished


 R^2 (train) = 0.311, R^2 (valid) = 0.334, RMSE (train) = 0.080, RMSE (valid) = 0.047
building tree 1 of 1


[Parallel(n_jobs=2)]: Done   4 out of   4 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done   4 out of   4 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done   4 out of   4 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done   4 out of   4 | elapsed:    0.0s finished


 R^2 (train) = 0.595, R^2 (valid) = 0.350, RMSE (train) = 0.061, RMSE (valid) = 0.047
building tree 1 of 1


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished


 R^2 (train) = 0.554, R^2 (valid) = 0.365, RMSE (train) = 0.064, RMSE (valid) = 0.046
building tree 1 of 1


[Parallel(n_jobs=2)]: Done   6 out of   6 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done   6 out of   6 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done   6 out of   6 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done   6 out of   6 | elapsed:    0.0s finished


 R^2 (train) = 0.679, R^2 (valid) = 0.374, RMSE (train) = 0.055, RMSE (valid) = 0.046
building tree 1 of 1


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done   7 out of   7 | elapsed:    0.1s finished
[Parallel(n_jobs=2)]: Done   7 out of   7 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done   7 out of   7 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done   7 out of   7 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished


 R^2 (train) = 0.643, R^2 (valid) = 0.383, RMSE (train) = 0.058, RMSE (valid) = 0.045
building tree 1 of 1


[Parallel(n_jobs=2)]: Done   8 out of   8 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   8 out of   8 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done   8 out of   8 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   8 out of   8 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done   8 out of   8 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   8 out of   8 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done   8 out of   8 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   8 out of   8 | elapsed:    0.0s finished


 R^2 (train) = 0.718, R^2 (valid) = 0.390, RMSE (train) = 0.051, RMSE (valid) = 0.045
building tree 1 of 1


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done   9 out of   9 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   9 out of   9 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done   9 out of   9 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   9 out of   9 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done   9 out of   9 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   9 out of   9 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done   9 out of   9 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   9 out of   9 | elapsed:    0.0s finished


 R^2 (train) = 0.769, R^2 (valid) = 0.394, RMSE (train) = 0.046, RMSE (valid) = 0.045
building tree 1 of 1


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done  10 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=2)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done  10 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=2)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done  10 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=2)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done  10 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=2)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished


 R^2 (train) = 0.737, R^2 (valid) = 0.396, RMSE (train) = 0.049, RMSE (valid) = 0.045
building tree 1 of 1


[Parallel(n_jobs=2)]: Done  11 out of  11 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=2)]: Done  11 out of  11 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done  11 out of  11 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=2)]: Done  11 out of  11 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done  11 out of  11 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=2)]: Done  11 out of  11 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done  11 out of  11 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=2)]: Done  11 out of  11 | elapsed:    0.0s finished


 R^2 (train) = 0.777, R^2 (valid) = 0.399, RMSE (train) = 0.045, RMSE (valid) = 0.045
building tree 1 of 1


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.1s finished
[Parallel(n_jobs=2)]: Done  12 out of  12 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done  12 out of  12 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done  12 out of  12 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done  12 out of  12 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished


 R^2 (train) = 0.751, R^2 (valid) = 0.397, RMSE (train) = 0.048, RMSE (valid) = 0.045
building tree 1 of 1


[Parallel(n_jobs=2)]: Done  13 out of  13 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done  13 out of  13 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done  13 out of  13 | elapsed:    0.1s finished
[Parallel(n_jobs=2)]: Done  13 out of  13 | elapsed:    0.0s finished


 R^2 (train) = 0.783, R^2 (valid) = 0.400, RMSE (train) = 0.045, RMSE (valid) = 0.045
building tree 1 of 1


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done  14 out of  14 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done  14 out of  14 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done  14 out of  14 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done  14 out of  14 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished


 R^2 (train) = 0.761, R^2 (valid) = 0.402, RMSE (train) = 0.047, RMSE (valid) = 0.045
building tree 1 of 1


[Parallel(n_jobs=2)]: Done  15 out of  15 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done  15 out of  15 | elapsed:    0.1s finished
[Parallel(n_jobs=2)]: Done  15 out of  15 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done  15 out of  15 | elapsed:    0.0s finished


 R^2 (train) = 0.787, R^2 (valid) = 0.408, RMSE (train) = 0.044, RMSE (valid) = 0.044
building tree 1 of 1


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done  16 out of  16 | elapsed:    0.1s finished
[Parallel(n_jobs=2)]: Done  16 out of  16 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done  16 out of  16 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done  16 out of  16 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished


 R^2 (train) = 0.809, R^2 (valid) = 0.411, RMSE (train) = 0.042, RMSE (valid) = 0.044
building tree 1 of 1


[Parallel(n_jobs=2)]: Done  17 out of  17 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done  17 out of  17 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done  17 out of  17 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done  17 out of  17 | elapsed:    0.0s finished


 R^2 (train) = 0.827, R^2 (valid) = 0.408, RMSE (train) = 0.040, RMSE (valid) = 0.044
building tree 1 of 1


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done  18 out of  18 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done  18 out of  18 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done  18 out of  18 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done  18 out of  18 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished


 R^2 (train) = 0.810, R^2 (valid) = 0.405, RMSE (train) = 0.042, RMSE (valid) = 0.045
building tree 1 of 1


[Parallel(n_jobs=2)]: Done  19 out of  19 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done  19 out of  19 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done  19 out of  19 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done  19 out of  19 | elapsed:    0.0s finished


 R^2 (train) = 0.826, R^2 (valid) = 0.404, RMSE (train) = 0.040, RMSE (valid) = 0.045


### Outcome

The best RMSE value that we got was 0.120. In NYC each longitude is approx 53 miles & latitude is approx 69 miles (see reference below). This gives an **error range of 6.36 x 8.28 square miles**. So we do not have a great predictor here (but a great learning experience in modeling this problem)

note: 1. We have not standardized lat & long in same scale - ideally this shd be done part of data prep. This would have given us a RMSE that we can apply properly to find exact error in distance.

#### Reference [link](http://geography.about.com/library/faq/blqzdistancedegree.htm)
Each degree of latitude is approximately 69 miles (111 kilometers) apart. The range varies (due to the earth's slightly ellipsoid shape) from 68.703 miles (110.567 km) at the equator to 69.407 (111.699 km) at the poles. 
A degree of longitude is widest at the equator at 69.172 miles (111.321) and gradually shrinks to zero at the poles. At 40° north or south the distance between a degree of longitude is 53 miles (85 km).