In [1]:
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
import datetime

In [43]:
def load_trip_data(path, cols, month):
    df = pd.read_csv(path, usecols=cols, nrows=None)
    newcols = ['pickup_datetime', 'dropoff_datetime', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'trip_distance']
    df.rename(columns=dict(zip(cols, newcols)), inplace=True)
    df['pickup_datetime'] = pd.to_datetime(pd.Series(df.pickup_datetime))
    df['trip_time'] = pd.to_datetime(pd.Series(df.dropoff_datetime)) - df.pickup_datetime
    df['trip_time'] = df.trip_time.map(lambda x: x/np.timedelta64(1, 'm'))
    dates = pd.DatetimeIndex(df.pickup_datetime)
    df['date'] = dates.day
    df['hour'] = dates.hour
    df['minute'] = dates.minute
    df['dayofweek'] = df.pickup_datetime.dt.dayofweek
    df['second'] = df.pickup_datetime - datetime.datetime(2016,month,1)
    df['second'] = df.second.map(lambda x: x/np.timedelta64(1, 's'))
    df = df.drop(['pickup_datetime', 'dropoff_datetime'], axis=1)
#     df['pickup_datetime'] = df.pickup_datetime.map(lambda t: t.strftime('%Y/%m/%d %H:%M'))
    return df

green_cols = ['lpep_pickup_datetime', 'Lpep_dropoff_datetime', 'Pickup_longitude', 'Pickup_latitude', 'Dropoff_longitude', 'Dropoff_latitude', 'Trip_distance']
yellow_cols = ['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'trip_distance']

In [44]:
df = load_trip_data('data/green_tripdata_2016-05.csv', green_cols, 5)
df = df.append(load_trip_data('data/yellow_tripdata_2016-05.csv', yellow_cols, 5))
df.reset_index(drop=True, inplace=True)
df.sort_values(by='second', ascending=True, inplace=True)
df.head()

Unnamed: 0,date,dayofweek,dropoff_latitude,dropoff_longitude,hour,minute,pickup_latitude,pickup_longitude,second,trip_distance,trip_time
9329,1,6,40.81395,-73.936691,0,0,40.727169,-73.952789,0,8.91,0.0
1536988,1,6,40.761452,-73.994049,0,0,40.714119,-74.013855,0,3.52,0.0
9414,1,6,40.669178,-73.950546,0,0,40.668556,-73.980225,0,1.73,0.0
11731,1,6,40.702751,-73.921951,0,0,40.712791,-73.965874,0,2.93,15.216667
1536987,1,6,40.730675,-73.981079,0,0,40.74099,-74.00161,0,1.59,0.0


In [45]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13373832 entries, 9329 to 13373812
Data columns (total 11 columns):
date                 int32
dayofweek            int64
dropoff_latitude     float64
dropoff_longitude    float64
hour                 int32
minute               int32
pickup_latitude      float64
pickup_longitude     float64
second               float64
trip_distance        float64
trip_time            float64
dtypes: float64(7), int32(3), int64(1)
memory usage: 1.0 GB


In [46]:
df.to_csv('data/taxi_tripdata_2016-05.csv', index=False)
df = pd.read_csv('data/taxi_tripdata_2016-05.csv', nrows=5)
df.head()

Unnamed: 0,date,dayofweek,dropoff_latitude,dropoff_longitude,hour,minute,pickup_latitude,pickup_longitude,second,trip_distance,trip_time
0,1,6,40.81395,-73.936691,0,0,40.727169,-73.952789,0,8.91,0.0
1,1,6,40.761452,-73.994049,0,0,40.714119,-74.013855,0,3.52,0.0
2,1,6,40.669178,-73.950546,0,0,40.668556,-73.980225,0,1.73,0.0
3,1,6,40.702751,-73.921951,0,0,40.712791,-73.965874,0,2.93,15.216667
4,1,6,40.730675,-73.981079,0,0,40.74099,-74.00161,0,1.59,0.0


In [47]:
df = load_trip_data('data/green_tripdata_2016-06.csv', green_cols, 6)
df = df.append(load_trip_data('data/yellow_tripdata_2016-06.csv', yellow_cols, 6))
df.reset_index(drop=True, inplace=True)
df.sort_values(by='second', ascending=True, inplace=True)
df.to_csv('data/taxi_tripdata_2016-06.csv', index=False)

In [35]:
from mapping_utils import distance_in_meters

df = pd.read_csv('data/taxi_tripdata_2016-05.csv')
df['great_circle_distance'] = distance_in_meters(df.pickup_latitude, df.pickup_longitude,
                                                df.dropoff_latitude, df.dropoff_longitude).astype(int)
df.shape

(13373832, 12)

In [36]:
lon_max = -73.700165 + 0.1
lon_min = -74.259094 - 0.1
lat_max = 40.91758 + 0.1
lat_min = 40.477398 - 0.1

def remove_outliers(df):
    df = df[(df.trip_time>1.0) & (df.trip_time<60*3)]
    df = df[(df.trip_distance>0.1) & (df.trip_distance<100)]
    df = df[(df.pickup_latitude>lat_min) & (df.pickup_latitude<lat_max)]
    df = df[(df.pickup_longitude>lon_min) & (df.pickup_longitude<lon_max)]
    df = df[(df.dropoff_latitude>lat_min) & (df.dropoff_latitude<lat_max)]
    df = df[(df.dropoff_longitude>lon_min) & (df.dropoff_longitude<lon_max)]
    df = df[(df.great_circle_distance>100) & (df.great_circle_distance<100000)]
    return df
df = remove_outliers(df)
df.describe()

Unnamed: 0,date,dayofweek,dropoff_latitude,dropoff_longitude,hour,minute,pickup_latitude,pickup_longitude,second,trip_distance,trip_time,great_circle_distance
count,12985767.0,12985767.0,12985767.0,12985767.0,12985767.0,12985767.0,12985767.0,12985767.0,12985767.0,12985767.0,12985767.0,12985767.0
mean,15.551703,3.023752,40.751,-73.968797,13.546419,29.559436,40.750431,-73.969044,1307837.3199,3.062361,14.695932,3549.836233
std,8.742482,2.036507,0.036485,0.039766,6.440091,17.32767,0.032925,0.040586,755782.564231,3.641287,11.766379,3926.941992
min,1.0,0.0,40.382389,-74.359001,0.0,0.0,40.429726,-74.348396,0.0,0.11,1.016667,100.008304
25%,8.0,1.0,40.733055,-73.990433,9.0,15.0,40.734226,-73.991051,641351.5,1.04,6.85,1288.550837
50%,15.0,3.0,40.754272,-73.977852,14.0,30.0,40.753502,-73.979538,1285817.0,1.79,11.45,2185.680432
75%,23.0,5.0,40.771145,-73.957649,19.0,45.0,40.76965,-73.960724,1948126.0,3.33,18.766667,4070.785295
max,31.0,6.0,41.017578,-73.600319,23.0,59.0,41.015667,-73.602371,2678399.0,91.65,179.95,54726.316499


In [37]:
df.reset_index(drop=True, inplace=True)
df = df.reset_index().rename(columns={'index':'request_id'})
df.head()

Unnamed: 0,request_id,date,dayofweek,dropoff_latitude,dropoff_longitude,hour,minute,pickup_latitude,pickup_longitude,second,trip_distance,trip_time,great_circle_distance
0,0,1,6,40.702751,-73.921951,0,0,40.712791,-73.965874,0,2.93,15.216667,3866.928064
1,1,1,6,40.767826,-73.980797,0,0,40.750507,-73.987198,0,1.48,6.516667,1999.798981
2,2,1,6,40.855343,-73.937805,0,0,40.780739,-73.981544,0,6.68,15.716667,9075.486978
3,3,1,6,40.737564,-73.997498,0,0,40.740192,-74.00528,0,0.56,6.65,717.831628
4,4,1,6,40.758469,-73.988014,0,0,40.755764,-73.979294,0,0.63,5.316667,793.68886


In [39]:
df.to_csv('data/taxi_tripdata_2016-05.csv', index=False)

In [40]:
df = pd.read_csv('data/taxi_tripdata_2016-06.csv')
df['great_circle_distance'] = distance_in_meters(df.pickup_latitude, df.pickup_longitude,
                                                df.dropoff_latitude, df.dropoff_longitude).astype(int)
df = remove_outliers(df)
df.reset_index(drop=True, inplace=True)
df = df.reset_index().rename(columns={'index':'request_id'})
df.to_csv('data/taxi_tripdata_2016-06.csv', index=False)

In [44]:
df = pd.read_csv('temp/data/taxi_tripdata_2016-06.csv', nrows=10000)

In [45]:
df[df.second<60*60][[
    'request_id', 'trip_time',
    'pickup_longitude', 'pickup_latitude',
    'dropoff_longitude', 'dropoff_latitude']].to_csv(
    'data/requests_sample.csv', index=False)

In [1]:
df = pd.read_csv('data/requests_sample.csv')
df.shape

(8984, 6)

In [7]:
df.head()

Unnamed: 0,request_id,trip_time,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude
0,0,7.683333,-74.008446,40.706024,-74.01339,40.709644
1,1,15.416667,-73.962227,40.760635,-73.922287,40.827213
2,2,14.183333,-73.972916,40.754993,-73.992264,40.725243
3,3,3.716667,-74.002426,40.750156,-73.991066,40.755154
4,4,6.266667,-73.987991,40.754166,-74.001465,40.749119


In [11]:
df.loc[0].pickup_longitude

-74.008445739700008