In [1]:
from __future__ import print_function, division
%matplotlib inline
import os
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import geopandas as gpd
import geohash_hilbert as ghh

In [2]:
# Read the trip sample, taking the first CSV column as the index, then
# report the frame's dimensions and preview its first rows.
df = pd.read_csv('data/yellow_tripdata_2015_short.csv', index_col=0)
rows_by_cols = df.shape
print("Shape of data\n{}".format(rows_by_cols))
df.head()

Shape of data
(770806, 18)


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,total_amount
61599709,1,2015-05-12 17:46:38,2015-05-12 18:18:09,1,9.1,-73.863525,40.770012,1,N,-73.982628,40.736942,1,30.0,1.0,0.5,7.45,5.54,44.79
24871662,1,2015-02-27 06:03:22,2015-02-27 06:09:52,1,2.6,-74.008308,40.714573,1,N,-73.988518,40.748821,1,9.0,0.5,0.5,1.5,0.0,11.8
19662635,2,2015-02-15 11:23:13,2015-02-15 11:29:37,1,1.28,-73.956635,40.775738,1,N,-73.967468,40.761639,2,7.0,0.0,0.5,0.0,0.0,7.8
15273088,2,2015-02-05 18:36:07,2015-02-05 18:36:40,1,0.0,-73.97419,40.754951,5,N,-73.974167,40.754822,1,52.0,0.0,0.5,11.47,5.33,69.6
67295899,2,2015-06-06 22:18:20,2015-06-06 22:40:32,3,4.55,-73.97345,40.784496,1,N,-73.993279,40.732471,2,17.5,0.5,0.5,0.0,0.0,18.8


In [3]:
# List the column labels of the loaded frame.
df.columns

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'pickup_longitude',
       'pickup_latitude', 'RateCodeID', 'store_and_fwd_flag',
       'dropoff_longitude', 'dropoff_latitude', 'payment_type', 'fare_amount',
       'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'total_amount'],
      dtype='object')

In [4]:
# Discard the index labels that were read from the CSV's first column and
# renumber the rows 0..n-1. The result is rebound to `df` rather than using
# inplace=True, so the cell does not rely on hidden in-place mutation.
df = df.reset_index(drop=True)

In [5]:
# Preview the first five rows after the index reset.
df.head(n=5)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,total_amount
0,1,2015-05-12 17:46:38,2015-05-12 18:18:09,1,9.1,-73.863525,40.770012,1,N,-73.982628,40.736942,1,30.0,1.0,0.5,7.45,5.54,44.79
1,1,2015-02-27 06:03:22,2015-02-27 06:09:52,1,2.6,-74.008308,40.714573,1,N,-73.988518,40.748821,1,9.0,0.5,0.5,1.5,0.0,11.8
2,2,2015-02-15 11:23:13,2015-02-15 11:29:37,1,1.28,-73.956635,40.775738,1,N,-73.967468,40.761639,2,7.0,0.0,0.5,0.0,0.0,7.8
3,2,2015-02-05 18:36:07,2015-02-05 18:36:40,1,0.0,-73.97419,40.754951,5,N,-73.974167,40.754822,1,52.0,0.0,0.5,11.47,5.33,69.6
4,2,2015-06-06 22:18:20,2015-06-06 22:40:32,3,4.55,-73.97345,40.784496,1,N,-73.993279,40.732471,2,17.5,0.5,0.5,0.0,0.0,18.8


In [6]:
# Add a `geohash` column with the precision-5 geohash of each pickup point.
# Coordinates are passed to ghh.encode as (longitude, latitude).
#
# The two coordinate columns are walked in lockstep instead of doing a
# positional .iloc lookup per row, and the values are attached by position
# with .assign instead of an index merge. This keeps every row regardless of
# the current index labels, and re-running the cell simply replaces the
# column rather than producing geohash_x / geohash_y duplicates.
pickup_geohash = [
    ghh.encode(lon, lat, precision=5)
    for lon, lat in zip(df.pickup_longitude, df.pickup_latitude)
]
df = df.assign(geohash=pickup_geohash)
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,total_amount,geohash
0,1,2015-05-12 17:46:38,2015-05-12 18:18:09,1,9.1,-73.863525,40.770012,1,N,-73.982628,40.736942,1,30.0,1.0,0.5,7.45,5.54,44.79,SHG09
1,1,2015-02-27 06:03:22,2015-02-27 06:09:52,1,2.6,-74.008308,40.714573,1,N,-73.988518,40.748821,1,9.0,0.5,0.5,1.5,0.0,11.8,SHGCP
2,2,2015-02-15 11:23:13,2015-02-15 11:29:37,1,1.28,-73.956635,40.775738,1,N,-73.967468,40.761639,2,7.0,0.0,0.5,0.0,0.0,7.8,SHG1B
3,2,2015-02-05 18:36:07,2015-02-05 18:36:40,1,0.0,-73.97419,40.754951,5,N,-73.974167,40.754822,1,52.0,0.0,0.5,11.47,5.33,69.6,SHG1Y
4,2,2015-06-06 22:18:20,2015-06-06 22:40:32,3,4.55,-73.97345,40.784496,1,N,-73.993279,40.732471,2,17.5,0.5,0.5,0.0,0.0,18.8,SHBJ4


In [7]:
# Distinct geohash values present in the pickup data.
df['geohash'].unique()

array(['SHG09', 'SHGCP', 'SHG1B', ..., 'SHGuJ', 'SHG3t', 'SHGEG'], dtype=object)

In [8]:
# Number of distinct geohash values; dropna=False keeps the count equal to
# the length of the array returned by .unique().
df['geohash'].nunique(dropna=False)

1109

## Geohash of the JFK airport location

In [9]:
# Latitude and longitude of the JFK airport reference point.
lat_center, lon_center = 40.647537, -73.787931

In [10]:
# Display the reference point as a (latitude, longitude) pair.
(lat_center, lon_center)

(40.647537, -73.787931)

In [11]:
# Encode the JFK reference point with the same precision (5) and the same
# longitude-then-latitude argument order used when building df.geohash, so
# the result can be compared directly against that column.
center = ghh.encode(lon_center, lat_center, precision=5)
# Display the resulting geohash string.
center

'SHFkk'

In [12]:
# Trips whose pickup geohash equals the geohash of the JFK reference point.
df.loc[df['geohash'] == center]

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,total_amount,geohash
121,1,2015-05-29 16:43:56,2015-05-29 17:39:03,1,19.40,-73.786278,40.644199,1,N,-73.904037,40.820633,2,57.5,1.0,0.5,0.00,5.54,64.84,SHFkk
251,1,2015-04-26 20:55:00,2015-04-26 21:29:30,1,15.10,-73.789810,40.643921,1,N,-73.955750,40.617531,2,44.5,0.5,0.5,0.00,0.00,45.80,SHFkk
267,2,2015-04-19 07:07:44,2015-04-19 07:34:21,1,18.49,-73.784630,40.648617,2,N,-73.994781,40.725262,1,52.0,0.0,0.5,10.56,0.00,63.36,SHFkk
487,2,2015-04-06 11:28:58,2015-04-06 12:14:54,5,16.81,-73.791245,40.645691,2,N,-73.978142,40.753277,1,52.0,0.0,0.5,11.67,5.54,70.01,SHFkk
817,2,2015-04-30 16:48:54,2015-04-30 18:00:28,1,17.20,-73.789879,40.647209,2,N,-73.988708,40.722420,2,52.0,0.0,0.5,0.00,0.00,52.80,SHFkk
1040,2,2015-04-12 20:15:45,2015-04-12 20:50:41,1,17.73,-73.789787,40.647034,2,N,-73.946709,40.800278,1,52.0,0.0,0.5,11.67,5.54,70.01,SHFkk
1172,2,2015-05-05 09:59:01,2015-05-05 10:56:56,5,21.98,-73.786713,40.644276,2,N,-73.981888,40.773224,1,52.0,0.0,0.5,6.00,5.54,64.34,SHFkk
1507,2,2015-05-27 18:03:13,2015-05-27 18:48:28,1,10.67,-73.789497,40.647060,1,N,-73.949928,40.675598,1,39.0,1.0,0.5,8.16,0.00,48.96,SHFkk
1734,2,2015-03-13 12:03:21,2015-03-13 12:04:44,1,0.48,-73.789703,40.646233,1,N,-73.801231,40.646652,2,3.5,0.0,0.5,0.00,0.00,4.30,SHFkk
1803,1,2015-05-26 01:28:42,2015-05-26 01:59:00,1,12.20,-73.789948,40.646500,1,N,-73.975586,40.679039,1,35.5,0.5,0.5,4.00,0.00,40.80,SHFkk


#### LaGuardia Airport

In [13]:
# Latitude and longitude of the LaGuardia airport reference point.
lat_center, lon_center = 40.773063, -73.873289

In [14]:
# Display the reference point as a (latitude, longitude) pair.
(lat_center, lon_center)

(40.773063, -73.873289)

In [15]:
# Encode the LaGuardia reference point with the same precision (5) and the
# same longitude-then-latitude argument order used when building df.geohash,
# so the result can be compared directly against that column.
center = ghh.encode(lon_center, lat_center, precision=5)
# Display the resulting geohash string.
center

'SHG0t'

In [16]:
# Trips whose pickup geohash equals the geohash of the LaGuardia reference point.
df.loc[df['geohash'] == center]

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,total_amount,geohash
97,2,2015-01-25 14:49:08,2015-01-25 15:18:08,1,15.75,-73.872849,40.774136,1,N,-74.014114,40.709396,1,43.0,0.0,0.5,9.83,5.33,58.96,SHG0t
388,1,2015-04-08 23:55:28,2015-04-09 00:16:40,3,9.20,-73.874260,40.773876,1,N,-73.997246,40.755661,1,27.5,0.5,0.5,5.00,5.54,39.34,SHG0t
518,2,2015-03-12 17:48:34,2015-03-12 18:48:47,1,9.16,-73.874367,40.774029,1,N,-73.993408,40.745621,1,38.0,1.0,0.5,5.00,5.33,50.13,SHG0t
725,2,2015-04-21 17:36:31,2015-04-21 18:24:31,1,8.65,-73.873138,40.774120,1,N,-73.991379,40.750050,1,33.0,1.0,0.5,8.07,5.54,48.41,SHG0t
835,2,2015-02-19 22:19:09,2015-02-19 22:38:11,5,8.83,-73.873108,40.774021,1,N,-73.982857,40.756310,2,26.5,0.5,0.5,0.00,5.33,33.13,SHG0t
847,1,2015-03-19 11:54:58,2015-03-19 12:35:36,1,10.70,-73.872826,40.773998,1,N,-73.984413,40.759201,2,37.5,0.0,0.5,0.00,5.33,43.63,SHG0t
1043,1,2015-02-16 18:00:06,2015-02-16 18:20:02,1,8.80,-73.874527,40.774242,1,N,-73.973671,40.752998,1,25.5,0.0,0.5,6.30,5.33,37.93,SHG0t
1173,2,2015-04-29 15:32:21,2015-04-29 17:19:21,1,18.40,-73.875214,40.773888,2,N,-73.989433,40.759064,2,52.0,0.0,0.5,0.00,5.54,58.34,SHG0t
1222,2,2015-04-23 23:25:16,2015-04-23 23:46:10,1,10.56,-73.874695,40.774033,1,N,-74.006477,40.730915,1,31.5,0.5,0.5,5.00,0.00,37.80,SHG0t
1228,2,2015-01-23 16:41:04,2015-01-23 17:07:25,1,8.50,-73.873474,40.773884,1,N,-73.984245,40.745407,1,28.0,1.0,0.5,5.96,0.00,35.76,SHG0t


#### Newark Liberty International Airport

In [17]:
# Latitude and longitude of the Newark Liberty airport reference point.
lat_center, lon_center = 40.691942, -74.183390

In [18]:
# Display the reference point as a (latitude, longitude) pair.
(lat_center, lon_center)

(40.691942, -74.18339)

In [19]:
# Encode the Newark reference point with the same precision (5) and the same
# longitude-then-latitude argument order used when building df.geohash, so
# the result can be compared directly against that column.
center = ghh.encode(lon_center, lat_center, precision=5)
# Display the resulting geohash string.
center

'SHGTe'

In [20]:
# Trips whose pickup geohash equals the geohash of the Newark reference point.
df.loc[df['geohash'] == center]

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,total_amount,geohash
19172,1,2015-01-04 08:36:59,2015-01-04 09:23:23,3,24.7,-74.183372,40.69315,3,N,-73.872551,40.774536,2,85.5,0.0,0.0,0.0,15.08,100.88,SHGTe
32031,1,2015-01-14 11:18:26,2015-01-14 11:32:02,1,2.4,-74.186302,40.693142,1,N,-74.186302,40.693142,1,11.5,0.0,0.5,1.75,0.0,14.05,SHGTe
58773,2,2015-05-09 16:19:42,2015-05-09 16:20:00,1,0.0,-74.18293,40.692989,5,N,-74.18293,40.692989,1,73.0,0.0,0.0,0.0,17.55,90.85,SHGTe
112013,1,2015-01-22 07:31:26,2015-01-22 07:41:59,2,1.9,-74.186302,40.693142,1,N,-74.186302,40.693142,2,9.5,0.0,0.5,0.0,0.0,10.3,SHGTe
114432,1,2015-01-24 20:55:01,2015-01-24 20:59:40,2,0.9,-74.186302,40.693142,1,N,-74.186302,40.693142,1,5.5,0.5,0.5,1.36,0.0,8.16,SHGTe
129670,1,2015-02-06 13:16:59,2015-02-06 13:20:52,1,0.9,-74.186302,40.693142,1,N,-74.186302,40.693142,1,5.0,0.0,0.5,1.0,0.0,6.8,SHGTe
140054,1,2015-01-25 00:08:55,2015-01-25 00:20:19,3,1.0,-74.186302,40.693142,1,N,-74.186302,40.693142,1,8.5,0.5,0.5,1.96,0.0,11.76,SHGTe
239472,1,2015-01-13 13:26:55,2015-01-13 13:47:11,1,1.6,-74.186302,40.693142,1,N,-74.186302,40.693142,1,13.0,0.0,0.5,2.76,0.0,16.56,SHGTe
291601,1,2015-02-10 04:18:21,2015-02-10 04:29:19,2,3.7,-74.186302,40.693142,1,N,-74.186302,40.693142,1,12.5,0.5,0.5,1.0,0.0,14.8,SHGTe
319525,1,2015-01-15 08:49:55,2015-01-15 09:16:04,1,2.8,-74.186302,40.693142,1,N,-74.186302,40.693142,1,17.0,0.0,0.5,3.56,0.0,21.36,SHGTe
