In [69]:
import pandas as pd
import numpy as np
import pygeohash as pgh

In [None]:
# Example call: pgh.encode needs a latitude and a longitude (in that order);
# the coordinates are the pickup point of the first row of the trip data.
pgh.encode(40.734695, -73.990372, precision=6)

## Data Dictionary
- VendorID A code indicating the TPEP provider that provided the record.
- 1= Creative Mobile Technologies, LLC; 2= VeriFone Inc.
- tpep_pickup_datetime The date and time when the meter was engaged.
- tpep_dropoff_datetime The date and time when the meter was disengaged.
- Passenger_count The number of passengers in the vehicle.
 This is a driver-entered value.
- Trip_distance The elapsed trip distance in miles reported by the taximeter.
- Pickup_longitude Longitude where the meter was engaged.
- Pickup_latitude Latitude where the meter was engaged.
- RateCodeID The final rate code in effect at the end of the trip.
 1= Standard rate 
 2=JFK
 3=Newark
 4=Nassau or Westchester
 5=Negotiated fare
 6=Group ride
- Store_and_fwd_flag This flag indicates whether the trip record was held in vehicle
 memory before sending to the vendor, aka “store and forward,”
 because the vehicle did not have a connection to the server.
- Y= store and forward trip
- N= not a store and forward trip
- Dropoff_longitude Longitude where the meter was disengaged.
- Dropoff_latitude Latitude where the meter was disengaged.
- Payment_type A numeric code signifying how the passenger paid for the trip.
 1= Credit card
 2= Cash
 3= No charge
 4= Dispute
 5= Unknown
 6= Voided trip
- Fare_amount The time-and-distance fare calculated by the meter.
- Extra Miscellaneous extras and surcharges. Currently, this only includes
 the 0.50 and 1 rush hour and overnight charges.
- MTA_tax 0.50 MTA tax that is automatically triggered based on the metered
 rate in use.
- Improvement_surcharge 0.30 improvement surcharge assessed on trips at the flag drop. The
 improvement surcharge began being levied in 2015.
- Tip_amount Tip amount – This field is automatically populated for credit card
 tips. Cash tips are not included.
- Tolls_amount Total amount of all tolls paid in trip.
- Total_amount The total amount charged to passengers. Does not include cash tips.

# Geohash precision dictionary: approximate distance (in meters) between two geohashes, keyed by the number of matching leading characters.
_PRECISION = {
    0: 20000000,
    1: 5003530,
    2: 625441,
    3: 123264,
    4: 19545,
    5: 3803,
    6: 610,
    7: 118,
    8: 19,
    9: 3.71,
    10: 0.6,
}

# Average speed of taxis
- In 2014, it dropped to 8.51 MPH. http://www.wnyc.org/story/traffic-speeds-slow-nyc-wants-curb-car-service-growth/
- 8.51 MPH is 3.80431 meters / second

# Geohash units are meters
- http://stackoverflow.com/questions/13448595/geohash-string-length-and-accuracy

In [2]:
# Yellow-cab trip records for 2016-01 (one row per trip).
taxi_yellowcab_df = pd.read_csv(filepath_or_buffer="data/yellow_tripdata_2016-01.csv")

In [3]:
# Taxi zone lookup table (LocationID -> Borough / Zone / service_zone).
taxi_lookup = pd.read_csv(filepath_or_buffer="data/taxi+_zone_lookup.csv")

In [13]:
# Column labels are supplied explicitly, so pandas treats every line of the file as data.
# The meaning of the last two columns is unknown, hence the placeholder names.
names_ = ['zipcode', 'city', 'state', 'lat', 'long', 'radius?', '?']
# Keep zip codes as strings instead of letting pandas infer a numeric dtype.
zip_codes = pd.read_csv(
    "data/zipcode.csv",
    names=names_,
    converters={'zipcode': str},
)

In [14]:
# Peek at the first rows of the zip-code table.
zip_codes.head()

Unnamed: 0,zipcode,city,state,lat,long,radius?,?
0,210,Portsmouth,NH,43.005895,-71.013202,-5,1
1,211,Portsmouth,NH,43.005895,-71.013202,-5,1
2,212,Portsmouth,NH,43.005895,-71.013202,-5,1
3,213,Portsmouth,NH,43.005895,-71.013202,-5,1
4,214,Portsmouth,NH,43.005895,-71.013202,-5,1


In [22]:
taxi_lookup.head()

Unnamed: 0,LocationID,Borough,Zone,service_zone
0,1,EWR,Newark Airport,EWR
1,2,Queens,Jamaica Bay,Boro Zone
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone
3,4,Manhattan,Alphabet City,Yellow Zone
4,5,Staten Island,Arden Heights,Boro Zone


In [19]:
zip_codes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43191 entries, 0 to 43190
Data columns (total 7 columns):
zipcode    43191 non-null object
city       43191 non-null object
state      43191 non-null object
lat        43191 non-null float64
long       43191 non-null float64
radius?    43191 non-null int64
?          43191 non-null int64
dtypes: float64(2), int64(2), object(3)
memory usage: 2.3+ MB


In [20]:
# Truncate zip codes to only be NY

In [21]:
# Keep only the rows whose state is New York.
is_new_york = zip_codes["state"] == "NY"
zip_codes_ny = zip_codes.loc[is_new_york]

In [25]:
zip_codes_ny.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2281 entries, 6 to 5721
Data columns (total 7 columns):
zipcode    2281 non-null object
city       2281 non-null object
state      2281 non-null object
lat        2281 non-null float64
long       2281 non-null float64
radius?    2281 non-null int64
?          2281 non-null int64
dtypes: float64(2), int64(2), object(3)
memory usage: 142.6+ KB


In [16]:
taxi_yellowcab_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10906858 entries, 0 to 10906857
Data columns (total 19 columns):
VendorID                 int64
tpep_pickup_datetime     object
tpep_dropoff_datetime    object
passenger_count          int64
trip_distance            float64
pickup_longitude         float64
pickup_latitude          float64
RatecodeID               int64
store_and_fwd_flag       object
dropoff_longitude        float64
dropoff_latitude         float64
payment_type             int64
fare_amount              float64
extra                    float64
mta_tax                  float64
tip_amount               float64
tolls_amount             float64
improvement_surcharge    float64
total_amount             float64
dtypes: float64(12), int64(4), object(3)
memory usage: 1.5+ GB


In [17]:
taxi_yellowcab_df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RatecodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,2,2016-01-01 00:00:00,2016-01-01 00:00:00,2,1.1,-73.990372,40.734695,1,N,-73.981842,40.732407,2,7.5,0.5,0.5,0.0,0.0,0.3,8.8
1,2,2016-01-01 00:00:00,2016-01-01 00:00:00,5,4.9,-73.980782,40.729912,1,N,-73.944473,40.716679,1,18.0,0.5,0.5,0.0,0.0,0.3,19.3
2,2,2016-01-01 00:00:00,2016-01-01 00:00:00,1,10.54,-73.98455,40.679565,1,N,-73.950272,40.788925,1,33.0,0.5,0.5,0.0,0.0,0.3,34.3
3,2,2016-01-01 00:00:00,2016-01-01 00:00:00,1,4.75,-73.993469,40.71899,1,N,-73.962242,40.657333,2,16.5,0.0,0.5,0.0,0.0,0.3,17.3
4,2,2016-01-01 00:00:00,2016-01-01 00:00:00,3,1.76,-73.960625,40.78133,1,N,-73.977264,40.758514,2,8.0,0.0,0.5,0.0,0.0,0.3,8.8


In [16]:
taxi_yellowcab_df.tail()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RatecodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
10906853,2,2016-01-31 23:30:32,2016-01-31 23:38:18,1,2.2,-74.003578,40.751011,1,N,-73.982651,40.767509,2,8.5,0.5,0.5,0.0,0.0,0.3,9.8
10906854,1,2016-01-05 00:15:55,2016-01-05 00:16:06,1,0.0,-73.945488,40.75153,1,N,-73.945457,40.75153,2,2.5,0.5,0.5,0.0,0.0,0.3,3.8
10906855,1,2016-01-05 06:12:46,2016-03-19 20:45:50,3,1.4,-73.99424,40.766586,1,N,-73.984428,40.753922,2,7.5,0.5,0.5,0.0,0.0,0.3,8.8
10906856,1,2016-01-05 06:21:44,2016-03-28 12:54:26,1,2.1,-73.948067,40.776531,1,N,-73.978188,40.777435,1,11.5,0.0,0.5,2.45,0.0,0.3,14.75
10906857,1,2016-01-05 06:15:21,2016-01-05 06:15:36,3,0.0,-73.960938,40.758595,2,N,-73.961006,40.758583,2,52.0,0.0,0.5,0.0,5.54,0.3,58.34


In [17]:
taxi_yellowcab_df[taxi_yellowcab_df.VendorID==2].head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RatecodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,2,2016-01-01 00:00:00,2016-01-01 00:00:00,2,1.1,-73.990372,40.734695,1,N,-73.981842,40.732407,2,7.5,0.5,0.5,0.0,0.0,0.3,8.8
1,2,2016-01-01 00:00:00,2016-01-01 00:00:00,5,4.9,-73.980782,40.729912,1,N,-73.944473,40.716679,1,18.0,0.5,0.5,0.0,0.0,0.3,19.3
2,2,2016-01-01 00:00:00,2016-01-01 00:00:00,1,10.54,-73.98455,40.679565,1,N,-73.950272,40.788925,1,33.0,0.5,0.5,0.0,0.0,0.3,34.3
3,2,2016-01-01 00:00:00,2016-01-01 00:00:00,1,4.75,-73.993469,40.71899,1,N,-73.962242,40.657333,2,16.5,0.0,0.5,0.0,0.0,0.3,17.3
4,2,2016-01-01 00:00:00,2016-01-01 00:00:00,3,1.76,-73.960625,40.78133,1,N,-73.977264,40.758514,2,8.0,0.0,0.5,0.0,0.0,0.3,8.8


- Transform lat and long to zip code

>zip_code_data from: https://github.com/EverythingMe/geodis


In [34]:
# Draw a small 50-trip sample for prototyping. The seed is fixed so the sample
# (and every output derived from it) is identical on each Restart & Run All.
taxi_sample = taxi_yellowcab_df.sample(50, random_state=42)

In [27]:
zip_codes_ny.head()

Unnamed: 0,zipcode,city,state,lat,long,radius?,?
6,501,Holtsville,NY,40.922326,-72.637078,-5,1
7,544,Holtsville,NY,40.922326,-72.637078,-5,1
2446,6390,Fishers Island,NY,41.261936,-72.00708,-5,1
3443,10001,New York,NY,40.750742,-73.99653,-5,1
3444,10002,New York,NY,40.71704,-73.987,-5,1


In [71]:
taxi_sample.head()


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RatecodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
248133,2,2016-01-01 13:57:29,2016-01-01 14:10:50,1,2.64,-73.99189,40.74926,1,N,-73.982674,40.723141,1,12.0,0.0,0.5,2.56,0.0,0.3,15.36
10346448,2,2016-01-08 13:36:25,2016-01-08 13:42:44,1,0.75,-73.986359,40.75523,1,N,-73.985291,40.75211,2,6.0,0.0,0.5,0.0,0.0,0.3,6.8
6177573,2,2016-01-15 08:52:53,2016-01-15 09:00:29,5,1.32,-73.986053,40.734699,1,N,-73.978767,40.750702,1,7.5,0.0,0.5,1.66,0.0,0.3,9.96
2612091,2,2016-01-16 09:56:47,2016-01-16 10:04:13,1,1.05,-73.954895,40.767284,1,N,-73.968422,40.76503,1,7.0,0.0,0.5,1.17,0.0,0.3,8.97
8041976,1,2016-01-31 23:59:05,2016-02-01 00:06:01,1,0.7,-73.981216,40.755569,1,N,-73.969368,40.752079,1,6.0,0.5,0.5,1.45,0.0,0.3,8.75


In [104]:
def geohash_encoding(taxi_df, precision_=6):
    """Append pickup and dropoff geohash columns to a copy of a taxi trip frame.

    Every trip's (latitude, longitude) pair is passed to ``pgh.encode`` with
    ``precision_`` characters. The input frame is not modified.

    Approximate distance covered by a geohash, by number of characters
    (all distances in meters):
        0: 20000000, 1: 5003530, 2: 625441, 3: 123264, 4: 19545,
        5: 3803, 6: 610, 7: 118, 8: 19, 9: 3.71, 10: 0.6

    Rule of thumb for picking a precision, using the 2014 average taxi speed
    of 8.51 miles per hour (about 3.8 meters per second):
        precision 5 -> a taxi crosses the cell in ~16 minutes
        precision 6 -> a taxi crosses the cell in ~3 minutes
        precision 7 -> a taxi crosses the cell in ~1 minute

    Parameters
    ----------
    taxi_df : pandas.DataFrame
        Must contain the columns ``pickup_latitude``, ``pickup_longitude``,
        ``dropoff_latitude`` and ``dropoff_longitude``.
    precision_ : int, default 6
        Number of geohash characters to generate.

    Returns
    -------
    pandas.DataFrame
        A copy of ``taxi_df`` with two extra columns: ``geohash_pickup`` and
        ``geohas_dropoff``. The second name is misspelled (sic); it is kept
        unchanged so existing consumers of the column keep working.

    Raises
    ------
    KeyError
        If one of the four coordinate columns is missing.
    """
    new_taxidf = taxi_df.copy()

    def _encode_points(lat_col, lon_col):
        # pgh.encode expects latitude first, then longitude.
        # Zipping the two columns avoids building a Series per row (iterrows),
        # which matters on the multi-million-row trip table.
        return [
            pgh.encode(lat, lon, precision=precision_)
            for lat, lon in zip(new_taxidf[lat_col], new_taxidf[lon_col])
        ]

    new_taxidf['geohash_pickup'] = _encode_points('pickup_latitude', 'pickup_longitude')
    new_taxidf['geohas_dropoff'] = _encode_points('dropoff_latitude', 'dropoff_longitude')
    return new_taxidf
        
    
    

In [105]:
geohash_encoding(taxi_sample)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RatecodeID,store_and_fwd_flag,dropoff_longitude,...,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,geohash_pickup,geohas_dropoff
248133,2,2016-01-01 13:57:29,2016-01-01 14:10:50,1,2.64,-73.99189,40.74926,1,N,-73.982674,...,1,12.0,0.0,0.5,2.56,0.0,0.3,15.36,dr5ru6,dr5rsm
10346448,2,2016-01-08 13:36:25,2016-01-08 13:42:44,1,0.75,-73.986359,40.75523,1,N,-73.985291,...,2,6.0,0.0,0.5,0.0,0.0,0.3,6.8,dr5ru7,dr5ru6
6177573,2,2016-01-15 08:52:53,2016-01-15 09:00:29,5,1.32,-73.986053,40.734699,1,N,-73.978767,...,1,7.5,0.0,0.5,1.66,0.0,0.3,9.96,dr5rsr,dr5rud
2612091,2,2016-01-16 09:56:47,2016-01-16 10:04:13,1,1.05,-73.954895,40.767284,1,N,-73.968422,...,1,7.0,0.0,0.5,1.17,0.0,0.3,8.97,dr5rvj,dr5ruv
8041976,1,2016-01-31 23:59:05,2016-02-01 00:06:01,1,0.7,-73.981216,40.755569,1,N,-73.969368,...,1,6.0,0.5,0.5,1.45,0.0,0.3,8.75,dr5rue,dr5ruf
7445225,2,2016-01-30 13:54:16,2016-01-30 13:58:34,6,0.76,-73.994583,40.727825,1,N,-73.998734,...,2,5.0,0.0,0.5,0.0,0.0,0.3,5.8,dr5rsn,dr5rsp
8649250,1,2016-01-21 19:01:09,2016-01-21 19:16:40,1,2.0,-73.982094,40.778587,1,N,-73.958,...,1,11.5,1.0,0.5,1.5,0.0,0.3,14.8,dr5rur,dr5rvj
1185066,1,2016-01-05 15:01:00,2016-01-05 15:08:00,1,0.9,-73.980827,40.77475,1,N,-73.98056,...,1,6.5,0.0,0.5,1.0,0.0,0.3,8.3,dr5ruw,dr72h8
8854508,1,2016-01-22 08:36:21,2016-01-22 08:38:56,1,0.4,-74.002991,40.72776,1,N,-73.995193,...,2,4.0,0.0,0.5,0.0,0.0,0.3,4.8,dr5rsn,dr5rsj
9074688,2,2016-01-22 19:07:35,2016-01-22 19:14:42,1,1.1,-73.942513,40.797443,1,N,-73.956291,...,2,6.5,1.0,0.5,0.0,0.0,0.3,8.3,dr72j6,dr72j5


TypeError: 'module' object is not callable

In [None]:
# NOTE(review): `geodis` is not imported or defined in any visible cell, so this
# call cannot succeed as written — TODO confirm what was intended here.
# The recorded "TypeError: 'module' object is not callable" suggests `geodis`
# was bound to a module (not a function) when this was last tried.
geodis()