In [2]:
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [3]:
# Step up one directory so the relative 'data/...' paths used below resolve.
# NOTE(review): the change is relative, so running this cell again moves up
# one more level — run it exactly once per kernel session.
cd ..

/Users/odatakuma/cmu/FleetAI


In [3]:
# Locations of the May 2016 trip files, relative to the current working directory.
# GREEN_PATH and YELLOW_PATH are the inputs handed to create_dataset;
# OUTPUT_PATH is where the combined dataset is written and later re-read.
GREEN_PATH = 'data/nyc_taxi/trips/green_tripdata_2016-05.csv'
YELLOW_PATH = 'data/nyc_taxi/trips/yellow_tripdata_2016-05.csv'
OUTPUT_PATH = 'data/nyc_taxi/trips/tripdata_2016-05.csv'

In [8]:
from analytics.preprocess import create_dataset

# Build one dataset from the green and yellow trip files and persist it as a
# CSV without the index column.
# NOTE(review): the recorded run log mentions loading, cleaning and geohashing;
# confirm the exact steps in analytics.preprocess.create_dataset.
df = create_dataset(GREEN_PATH, YELLOW_PATH)
df.to_csv(OUTPUT_PATH, index=False)

Loading data
Cleaning data
Geohashing


In [5]:
# Count how many trips start ('phash') and end ('dhash') in every geohash,
# then line the two counts up side by side in a single table.
df = pd.read_csv(OUTPUT_PATH, usecols=['phash', 'dhash'])
pickup, dropoff = [
    df.groupby([hash_col])[hash_col].count()
    for hash_col in ('phash', 'dhash')
]

count_labels = {'phash': 'total_pickup', 'dhash': 'total_dropoff'}
geohash_table = (
    pd.concat([pickup, dropoff], axis=1)
    .fillna(0)  # a geohash that appears on one side only gets a count of 0
    .rename(columns=count_labels)
)
geohash_table.head()

Unnamed: 0,total_pickup,total_dropoff
dr5qgru,0,1
dr5qgrv,0,2
dr5qgry,0,1
dr5qgxm,0,1
dr5qgxs,0,2


In [6]:
# Display (number of distinct geohashes, number of count columns).
geohash_table.shape

(28655, 2)

In [7]:
import Geohash

# Decode every geohash in the index; only the first two values returned by
# decode_exactly (latitude, longitude) are kept.
decoded = [Geohash.decode_exactly(g) for g in geohash_table.index]
geohash_table['lat'] = [float(point[0]) for point in decoded]
geohash_table['lon'] = [float(point[1]) for point in decoded]

# Turn the geohash index into an ordinary 'geohash' column.
geohash_table = geohash_table.reset_index()
geohash_table = geohash_table.rename(columns={'index': 'geohash'})
geohash_table.head()

Unnamed: 0,geohash,total_pickup,total_dropoff,lat,lon
0,dr5qgru,0,1,40.604782,-74.030685
1,dr5qgrv,0,2,40.604782,-74.029312
2,dr5qgry,0,1,40.604782,-74.027939
3,dr5qgxm,0,1,40.602036,-74.018326
4,dr5qgxs,0,2,40.603409,-74.019699


In [8]:
import cPickle as pickle
# import engine.mapper.geohelper as gh
from engine.mapper.pathgenerator import PathGenerator

# Load the pickled road-network graph and hand it to PathGenerator.
graph_path = 'data/pickle/nyc_network_graph.pkl'
# A pickle stream is binary data, so the file has to be opened with 'rb'.
# Text mode ('r') breaks on platforms that translate newlines and on Python 3.
# SECURITY: pickle.load can execute arbitrary code — only load files you trust.
with open(graph_path, 'rb') as f:
    G = pickle.load(f)

path_generator = PathGenerator(G)

In [9]:
import Geohash

# Convert every (lat, lon) with path_generator.mm_convert (presumably a
# map-matching step onto the road graph — confirm in PathGenerator) and record
# the converted coordinates plus the 7-character geohash they encode to.
# Rows whose conversion fails keep the placeholders mlat=0, mlon=0, mgeohash=''.
r = 0.003  # passed to mm_convert as `georange`
matched_lats = []
matched_lons = []
matched_hashes = []
for _, (lat, lon) in geohash_table[['lat', 'lon']].iterrows():
    mlat, mlon, mgeohash = 0, 0, ''
    try:
        mlat, mlon = path_generator.mm_convert((lat, lon), georange=r)
        mgeohash = Geohash.encode(mlat, mlon, 7)
    except Exception:
        # Best effort: a point that cannot be converted is left on its
        # placeholders. Catching Exception (not a bare except) keeps
        # KeyboardInterrupt and SystemExit working during this long loop.
        pass
    matched_lats.append(mlat)
    matched_lons.append(mlon)
    matched_hashes.append(mgeohash)

# Assign whole columns once instead of writing cell by cell through .loc.
geohash_table['mlat'] = matched_lats
geohash_table['mlon'] = matched_lons
geohash_table['mgeohash'] = matched_hashes

# Number of rows that received converted coordinates.
len(geohash_table[geohash_table.mlat!=0])

28242

In [10]:
# Persist the table, including the mlat/mlon/mgeohash columns, without the index.
geohash_table.to_csv("data/table/g2mm.csv", index=False)

## Create state management table

In [12]:
# Keep only the rows whose converted coordinates encode back to the same geohash.
stays_in_cell = geohash_table['geohash'] == geohash_table['mgeohash']
geohash_table = geohash_table[stays_in_cell]
len(geohash_table)

27321

In [38]:
# Keep only geohashes with more than 5 pickups and drop-offs combined.
trip_total = geohash_table['total_pickup'] + geohash_table['total_dropoff']
geohash_table = geohash_table[trip_total > 5]
len(geohash_table)

21458

In [17]:
# Replace the decoded lat/lon with the converted mlat/mlon values and discard
# the helper and count columns, leaving geohash, lat, lon.
obsolete_columns = ['lat', 'lon', 'mgeohash', 'total_pickup', 'total_dropoff']
geohash_table = (
    geohash_table
    .drop(obsolete_columns, axis=1)
    .rename(columns={'mlat': 'lat', 'mlon': 'lon'})
)
geohash_table.head()

Unnamed: 0,geohash,lat,lon
7,dr5qgxx,40.603358,-74.015682
8,dr5qgxy,40.604832,-74.017029
9,dr5qgxz,40.604637,-74.015335
12,dr5qgz5,40.600984,-74.009745
14,dr5qgz7,40.602178,-74.010322


## Tag Taxi Zone

In [21]:
import geopandas as gpd

# Read the taxi-zone shapefile and reproject it using the '+proj=latlon' CRS string.
nyc = gpd.read_file("data/nyc_taxi/taxi_zones/taxi_zones.shp")
nyc = nyc.to_crs('+proj=latlon')

# Discard the zones whose borough is 'EWR' or 'Staten Island'.
excluded_boroughs = ['EWR', 'Staten Island']
nyc = nyc[~nyc.borough.isin(excluded_boroughs)]

In [22]:
import shapely
import rtree

def assign_zone(loc, gdf):
    """Find, for each location, the row of ``gdf`` whose geometry contains it.

    loc: DataFrame whose first two columns are read in (lat, lon) order.
    gdf: GeoDataFrame providing the "geometry" column to join against.

    Returns the ``index_right`` column of an inner "within" spatial join, so
    locations that lie inside no geometry do not appear in the result.
    """
    points = gpd.GeoDataFrame(loc.copy(), crs="+proj=latlon")
    points["geometry"] = [
        shapely.geometry.Point(row[1], row[0])  # Point is built as (lon, lat)
        for row in loc.values
    ]
    joined = gpd.sjoin(points, gdf[["geometry"]], how="inner", op="within")
    return joined["index_right"]

In [26]:
# Tag every geohash with the index of the taxi zone that contains it. Rows
# without a zone come back as missing values and are removed by dropna().
zone_ids = assign_zone(geohash_table[['lat', 'lon']], nyc)
geohash_table['taxi_zone'] = zone_ids
geohash_table = geohash_table.dropna()
geohash_table = geohash_table.set_index('geohash')
geohash_table.head()

Unnamed: 0_level_0,lat,lon,taxi_zone
geohash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
dr5qgxx,40.603358,-74.015682,10
dr5qgxy,40.604832,-74.017029,10
dr5qgxz,40.604637,-74.015335,10
dr5qgz5,40.600984,-74.009745,10
dr5qgz7,40.602178,-74.010322,10


In [36]:
# Number of distinct taxi zones that received at least one geohash.
zone_ids = geohash_table['taxi_zone'].unique()
len(zone_ids)

226

## Tag XY coordinates

In [77]:
import Geohash

# Lay an Nx-by-Ny lattice over a 0.3 x 0.3 box (in lat/lon units) anchored at
# the smallest lat/lon in the table, geohash every lattice point, and map each
# geohash to a coarse (x, y) cell made of num_g lattice steps per axis.
lat0 = geohash_table.lat.min()
lon0 = geohash_table.lon.min()
Nx = 219
Ny = 219
dx = 0.3/(Nx-1)
dy = 0.3/(Ny-1)
num_g = 7  # lattice steps merged into one coarse cell
g2xy = {}
for x in range(Nx):
    for y in range(Ny):
        g = Geohash.encode(lat0+y*dy, lon0+x*dx, 7)
        # Floor division keeps the cell indices integers on every Python
        # version; plain `/` only truncated under Python 2 semantics.
        g2xy[g] = (x // num_g, y // num_g)
# Largest cell index along each axis (x and y still hold the last loop values).
print('%d %d' % (x // num_g, y // num_g))

31 31


In [78]:
# Look up the coarse cell of every geohash and store it as integer x / y columns.
cells = [g2xy[g] for g in geohash_table.index]
geohash_table['x'] = [cell[0] for cell in cells]
geohash_table['y'] = [cell[1] for cell in cells]

# Number of distinct cell indices in use along each axis.
for axis in ('x', 'y'):
    print(len(geohash_table[axis].unique()))

31
32


## Compute road density

In [84]:
import json
from analytics.osmloader import OsmLoader

# Load the OSM extract through the project's OsmLoader.
# NOTE(review): the recorded run printed node and highway counts at this point,
# presumably from the loader itself — confirm in analytics.osmloader.
osm_path = 'data/osm/osm_nyc.json'
osm = OsmLoader(osm_path)

# of nodes: 5092898
# of highways: 57803


In [90]:
from engine.mapper import geohelper as gh

# Build a graph from the OSM data (drive=True, segments capped at length 10)
# and restrict it to the nodes that are present in G.
dense_G = osm.get_graph(drive=True, seg_max_length=10)
dense_G = dense_G.subgraph(G.nodes())

# Coordinate sequence of every edge that carries a non-empty 'lat' attribute.
road_locs = []
for _, _, attrs in dense_G.edges_iter(data=True):
    if attrs.get('lat'):
        road_locs.append(zip(attrs['lat'], attrs['lon']))
road_density = gh.road_density(road_locs)

# Coordinates of every node, using the same 'lat' filter.
node_locs = []
for _, attrs in dense_G.nodes_iter(data=True):
    if attrs.get('lat'):
        node_locs.append((attrs['lat'], attrs['lon']))
intxn_density = gh.intxn_density(node_locs)

In [93]:
# Attach both density measures to the table, looked up by geohash.
density_sources = (
    ('road_density', road_density),
    ('intxn_density', intxn_density),
)
for column, lookup in density_sources:
    geohash_table[column] = [lookup[g] for g in geohash_table.index]
geohash_table.head()

Unnamed: 0_level_0,lat,lon,taxi_zone,x,y,road_density,intxn_density
geohash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
dr5qgxx,40.603358,-74.015682,10,2,0,32,35
dr5qgxy,40.604832,-74.017029,10,2,0,34,8
dr5qgxz,40.604637,-74.015335,10,2,0,0,0
dr5qgz5,40.600984,-74.009745,10,3,0,13,5
dr5qgz7,40.602178,-74.010322,10,3,0,28,9


In [94]:
# Save the finished zone table; the geohash index is written as a column.
geohash_table.to_csv("data/table/zones.csv")

In [5]:
# Reload the saved zone table, restoring 'geohash' as the index.
geohash_table = pd.read_csv("data/table/zones.csv", index_col='geohash')

In [6]:
# Reload the full combined trip dataset (all columns) and report its size.
df = pd.read_csv(OUTPUT_PATH)
print(df.shape)

(12869846, 15)


In [7]:
from analytics.preprocess import map_matching

# Run the project's map_matching step on the trips, using the zone table.
# NOTE(review): the recorded run shows fewer rows afterwards (12869846 ->
# 12806249), so it appears to discard some trips — confirm in analytics.preprocess.
df = map_matching(df, geohash_table)
df.shape

(12806249, 15)

In [8]:
# Persist the map-matched trips without the index column.
df.to_csv('data/nyc_taxi/trips_2016-05.csv', index=False)

In [9]:
# Number of distinct pickup and drop-off geohashes left after map matching.
for hash_column in ('phash', 'dhash'):
    print(len(df[hash_column].unique()))

16971
21449


## Tag granular XY coordinates

In [4]:
# Reload the saved zone table, restoring 'geohash' as the index.
geohash_table = pd.read_csv("data/table/zones.csv", index_col='geohash')

In [5]:
import Geohash

# Same lattice construction as the coarse XY tagging, but with a finer grouping:
# an Nx-by-Ny lattice over a 0.3 x 0.3 box (in lat/lon units) anchored at the
# smallest lat/lon in the table, with num_g lattice steps per (x, y) cell.
lat0 = geohash_table.lat.min()
lon0 = geohash_table.lon.min()
Nx = 219
Ny = 219
dx = 0.3/(Nx-1)
dy = 0.3/(Ny-1)
num_g = 3  # lattice steps merged into one cell
g2xy = {}
for x in range(Nx):
    for y in range(Ny):
        g = Geohash.encode(lat0+y*dy, lon0+x*dx, 7)
        # Floor division keeps the cell indices integers on every Python
        # version; plain `/` only truncated under Python 2 semantics.
        g2xy[g] = (x // num_g, y // num_g)
# Largest cell index along each axis (x and y still hold the last loop values).
print('%d %d' % (x // num_g, y // num_g))

72 72


In [6]:
# Look up the granular cell of every geohash and store it as integer x / y columns.
cells = [g2xy[g] for g in geohash_table.index]
geohash_table['x'] = [cell[0] for cell in cells]
geohash_table['y'] = [cell[1] for cell in cells]

# Number of distinct cell indices in use along each axis.
for axis in ('x', 'y'):
    print(len(geohash_table[axis].unique()))

71
73


In [12]:
# Save only the coordinates and the granular cell indices; the geohash index
# is written as a column.
geohash_table[['lat', 'lon', 'x', 'y']].to_csv("data/table/zones_granular.csv")