In [1]:
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
import cPickle as pickle
plt.style.use('ggplot')

In [2]:
df = pd.read_csv('temp/data/taxi_tripdata_2016-05.csv',
                 usecols=[
                     'pickup_latitude',
                     'pickup_longitude']
                )
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12985767 entries, 0 to 12985766
Data columns (total 2 columns):
pickup_latitude     float64
pickup_longitude    float64
dtypes: float64(2)
memory usage: 297.2 MB


In [4]:
import Geohash

Geohash.encode(42.6, -5.6, precision=5)

'ezs42'

In [98]:
%%time
geostrings = [Geohash.encode(lat, lon, precision=7) for lat, lon in df.values]
print geostrings[:5]

['dr5rsge', 'dr5ru6k', 'dr5ruxb', 'dr5rgbx', 'dr5rue3']
CPU times: user 4min 43s, sys: 790 ms, total: 4min 44s
Wall time: 4min 44s


In [99]:
len(set(geostrings))

20917

In [107]:
l = 6
len(set(s[:l] for s in geostrings))

1738

In [5]:
df = pd.read_csv('temp/data/taxi_tripdata_2016-05.csv',
                 usecols=[
                     'dropoff_latitude',
                     'dropoff_longitude']
                )
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12985767 entries, 0 to 12985766
Data columns (total 2 columns):
dropoff_latitude     float64
dropoff_longitude    float64
dtypes: float64(2)
memory usage: 297.2 MB


In [6]:
%%time
geostrings = [Geohash.encode(lat, lon, precision=7) for lat, lon in df.values]
print geostrings[:5]

['dr5rtce', 'dr5rut8', 'dr72mt2', 'dr5ru0h', 'dr5ru7g']
CPU times: user 4min 59s, sys: 2.58 s, total: 5min 2s
Wall time: 5min 7s


In [2]:
df = pd.read_csv('temp/data/taxi_tripdata_2016-05.csv',
                 dtype={
                     'date': np.int16,
                     'dayofweek':np.int16,
                     'hour': np.int16,
                     'minute': np.int16
                 })
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12933070 entries, 0 to 12933069
Data columns (total 16 columns):
Unnamed: 0               int64
request_id               int64
date                     int16
dayofweek                int16
dropoff_latitude         float64
dropoff_longitude        float64
hour                     int16
minute                   int16
pickup_latitude          float64
pickup_longitude         float64
second                   float64
trip_distance            float64
trip_time                float64
great_circle_distance    int64
pickup_geohash           object
dropoff_geohash          object
dtypes: float64(7), int16(4), int64(3), object(2)
memory usage: 1.3+ GB


In [4]:
df['pickup_zone'] = pd.Series(geostrings)
df['dropoff_zone'] = pd.Series(geostrings)
df.head()

Unnamed: 0,request_id,date,dayofweek,dropoff_latitude,dropoff_longitude,hour,minute,pickup_latitude,pickup_longitude,second,trip_distance,trip_time,great_circle_distance,pickup_zone,dropoff_zone
0,0,1,6,40.702751,-73.921951,0,0,40.712791,-73.965874,0,2.93,15.216667,3866,dr5rsge,dr5rtce
1,1,1,6,40.767826,-73.980797,0,0,40.750507,-73.987198,0,1.48,6.516667,1999,dr5ru6k,dr5rut8
2,2,1,6,40.855343,-73.937805,0,0,40.780739,-73.981544,0,6.68,15.716667,9075,dr5ruxb,dr72mt2
3,3,1,6,40.737564,-73.997498,0,0,40.740192,-74.00528,0,0.56,6.65,717,dr5rgbx,dr5ru0h
4,4,1,6,40.758469,-73.988014,0,0,40.755764,-73.979294,0,0.63,5.316667,793,dr5rue3,dr5ru7g


In [5]:
df.to_csv('temp/data/taxi_tripdata_2016-05.csv', index=False)

In [6]:
del df

## Visualization with bokeh

In [87]:
geolats = []
geolons = []
l = 7
for s in geostrings:
    lat, lon, _, _ = Geohash.decode_exactly(s[:l])
    geolats.append(float(lat))
    geolons.append(float(lon))
len(set(zip(geolats, geolons)))

2401

In [66]:
df = pd.read_csv('temp/data/taxi_tripdata_2016-05.csv',
                 usecols=[
                     'dayofweek',
                     'hour',
                     'pickup_latitude',
                     'pickup_longitude', 
                     'pickup_geohash',
                     'dropoff_latitude',
                     'dropoff_longitude', 
                     'dropoff_geohash'
                 ],
                 dtype={
                     'dayofweek':np.int16,
                     'hour': np.int16
                 }
                )
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12985767 entries, 0 to 12985766
Data columns (total 8 columns):
dayofweek            int16
dropoff_latitude     float64
dropoff_longitude    float64
hour                 int16
pickup_latitude      float64
pickup_longitude     float64
pickup_geohash       object
dropoff_geohash      object
dtypes: float64(4), int16(2), object(2)
memory usage: 743.1+ MB


In [67]:
df['pickup_geohash'] = df.pickup_geohash.str[:-2]
df['dropoff_geohash'] = df.dropoff_geohash.str[:-2]

In [69]:
len(df)

12985767

In [71]:
left, right = -74.05, -73.75
top, bottom = 40.9, 40.6
df = df[(df.pickup_latitude>bottom)&
               (df.pickup_latitude<top)&
               (df.pickup_longitude>left)&
               (df.pickup_longitude<right)]
df = df[(df.dropoff_latitude>bottom)&
               (df.dropoff_latitude<top)&
               (df.dropoff_longitude>left)&
               (df.dropoff_longitude<right)]
len(df)

12912556

In [72]:
pickup = df.groupby(['pickup_geohash']).agg({
    'pickup_latitude' : 'mean',
    'pickup_longitude' : 'mean',
    'pickup_geohash': 'count'})
pickup.head()

Unnamed: 0_level_0,pickup_geohash,pickup_latitude,pickup_longitude
pickup_geohash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
dr5qg,13,40.602865,-74.009984
dr5qu,135,40.60284,-73.980888
dr5qv,45,40.60254,-73.947815
dr5qy,9,40.601988,-73.911591
dr5r4,1,40.635216,-74.048225


In [98]:
dropoff = df.groupby(['dropoff_geohash']).agg({
    'dropoff_latitude' : 'mean',
    'dropoff_longitude' : 'mean',
    'dropoff_geohash': 'count'})
dropoff.head()

Unnamed: 0_level_0,dropoff_longitude,dropoff_latitude,dropoff_geohash
dropoff_geohash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
dr5qg,-74.009876,40.603477,274
dr5qu,-73.981022,40.602833,1717
dr5qv,-73.947011,40.602813,751
dr5qy,-73.912054,40.602832,59
dr5r5,-74.022134,40.630564,25320


In [99]:
import geopandas as gpd

nyc = gpd.read_file("temp/data/taxi_zones/taxi_zones.shp").to_crs('+proj=latlon')

In [121]:
import shapely
from bokeh.models import Range1d
from bokeh import plotting

def plot_overmap(gdf):
    world_xs = []
    world_ys = []

    for i, row in gdf.iterrows():
        polygons = row['geometry']
        if isinstance(polygons, shapely.geometry.multipolygon.MultiPolygon):
            polygons = [x for x in polygons]
        elif isinstance(polygons, shapely.geometry.polygon.Polygon):
            polygons = [polygons]
        else:
            raise ValueError

        for p in polygons: 
            lons, lats = zip(*list(p.exterior.coords))
            world_xs.append(lons)
            world_ys.append(lats)

    plotting.output_notebook()
    p = plotting.figure(toolbar_location="left", plot_width=600, plot_height=500)
    p.patches(world_xs, world_ys, fill_color='white',
              line_color="black", line_width=0.2)    
    left, right = -74.05, -73.75
    top, bottom = 40.9, 40.6
    p.set(x_range=Range1d(left, right), y_range=Range1d(bottom, top))
#     plotting.show(p)
    return p

In [114]:
demand = pd.concat([pickup, dropoff], axis=1)
demand.index.names = ['zone']
demand = demand.rename(index=str, columns={
    'pickup_geohash':'total_pickup',
    'dropoff_geohash':'total_dropoff'
})
demand = demand[demand.total_pickup>30]
geolats = []
geolons = []
l = 5
for s in demand.index:
    lat, lon, _, _ = Geohash.decode_exactly(s[:l])
    geolats.append(float(lat))
    geolons.append(float(lon))
demand['decoded_latitude'] = geolats
demand['decoded_longitude'] = geolons
demand.head()

Unnamed: 0_level_0,total_pickup,pickup_latitude,pickup_longitude,dropoff_longitude,dropoff_latitude,total_dropoff,decoded_latitude,decoded_longitude
geohash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
dr5qg,13,40.602865,-74.009984,-74.009876,40.603477,274,40.583496,-74.025879
dr5qu,135,40.60284,-73.980888,-73.981022,40.602833,1717,40.583496,-73.981934
dr5qv,45,40.60254,-73.947815,-73.947011,40.602813,751,40.583496,-73.937988
dr5r5,2992,40.632575,-74.021492,-74.022134,40.630564,25320,40.627441,-74.025879
dr5r7,7906,40.670743,-74.00977,-74.009931,40.671404,18610,40.671387,-74.025879


In [8]:
demand.to_csv('data/geohash.csv')

In [1]:
demand = pd.read_csv('data/geohash.csv')

In [144]:
# demand = demand[demand.total_pickup>30]
# p = plot_overmap(nyc)
# p.scatter(
#     demand.pickup_longitude,
#     demand.pickup_latitude,
#     radius=np.sqrt(demand.total_pickup)/100000,
#     fill_alpha=0.2, color='red'
# )
# p.scatter(
#     demand.dropoff_longitude,
#     demand.dropoff_latitude,
#     radius=np.sqrt(demand.total_dropoff)/100000,
#     fill_alpha=0.2, color='blue')
# plotting.show(p)

In [203]:
pattern = '|'.join([s[2:] for s in demand.geohash])
pattern = 'dr(?:' + pattern + ')'
pattern

'dr(?:5qu|5qv|5r5|5r7|5re|5rg|5rh|5rj|5rk|5rm|5rn|5rq|5rr|5rs|5rt|5ru|5rv|5rw|5rx|5ry|5rz|5x0|5x1|5x2|5x3|5x8|5x9|5xb|5xc|72h|72j|72m|72n|72p|72q|72r|72t|72w|72x|780|782|788)'

In [166]:
len(df)

12912556

In [219]:
df = df[df.pickup_geohash.str.match(pattern)]
len(df.pickup_geohash.unique())

42

In [209]:
df = df[df.dropoff_geohash.str.match(pattern)]
len(df)

12909602

In [210]:
len(df.dropoff_geohash.unique())

42