In [1]:
from __future__ import print_function, division
%matplotlib inline
import os
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import geopandas as gpd
import geohash_hilbert as ghh

In [2]:
# Load the April 2015 FHV trip records (monthly file downloaded from the TLC).
df = pd.read_csv('data/fhv_tripdata_2015-04.csv')
# Report (rows, columns) before any cleaning so later steps can be compared with it.
print ("Shape of data\n{}".format(df.shape))
df.head()

Shape of data
(3917789, 3)


Unnamed: 0,Dispatching_base_num,Pickup_date,locationID
0,B00001,2015-04-01 04:30:00,
1,B00001,2015-04-01 06:00:00,
2,B00001,2015-04-01 06:00:00,
3,B00001,2015-04-01 06:00:00,
4,B00001,2015-04-01 06:15:00,


In [3]:
# Inspect the raw column names before deciding which ones to keep.
df.columns

Index(['Dispatching_base_num', 'Pickup_date', 'locationID'], dtype='object')

In [4]:
# Discard the locationID column first, then remove any row that still
# contains a missing value in the remaining columns.
df = df.drop(columns=['locationID']).dropna()
df.head()

Unnamed: 0,Dispatching_base_num,Pickup_date
0,B00001,2015-04-01 04:30:00
1,B00001,2015-04-01 06:00:00
2,B00001,2015-04-01 06:00:00
3,B00001,2015-04-01 06:00:00
4,B00001,2015-04-01 06:15:00


In [5]:
# Shape after the column drop and dropna; compare the row count with the load-time shape.
df.shape

(3917789, 2)

## Aggregate by hour

In [6]:
# Parse the pickup timestamps into datetime64 values.
# The raw strings are dash-separated ('2015-04-01 04:30:00'), so the format
# must use '-' as the date separator as well; a '/'-separated format does not
# describe this data and is rejected by strict format parsing.
df['Pickup_date'] = pd.to_datetime(df['Pickup_date'], format='%Y-%m-%d %H:%M:%S')

In [7]:
# Check that Pickup_date is now a datetime column; the .dt accessors used later require it.
df.dtypes

Dispatching_base_num            object
Pickup_date             datetime64[ns]
dtype: object

In [8]:
# List the distinct dispatching base numbers.
# NOTE(review): the recorded output mixes upper- and lower-case prefixes
# (e.g. both 'B01177' and 'b01177'), so these values would need case
# normalisation before being used as a grouping key.
df['Dispatching_base_num'].unique()

array(['B00001', 'B00008', 'B00009', 'B00013', 'B00014', 'B00035',
       'B00037', 'B00078', 'B00160', 'B00202', 'B00210', 'B00221',
       'B00225', 'B00227', 'B00235', 'B00236', 'B00248', 'B00254',
       'B00255', 'B00256', 'B00272', 'B00277', 'B00280', 'B00281',
       'B00310', 'B00346', 'B00373', 'B00381', 'B00411', 'B00412',
       'B00419', 'B00448', 'B00475', 'B00477', 'B00492', 'B00552',
       'B00608', 'B00619', 'B00623', 'B00628', 'b00639', 'B00652',
       'B00692', 'B00756', 'B00789', 'B00821', 'B00837', 'B00856',
       'b00881', 'B00882', 'B00888', 'b00906', 'B00911', 'B00932',
       'b00941', 'b00983', 'B00984', 'b01013', 'B01020', 'b01048',
       'B01069', 'B01087', 'B01129', 'B01148', 'B01176', 'B01177',
       'b01177', 'B01190', 'B01197', 'B01222', 'B01231', 'B01265',
       'B01280', 'B01292', 'B01306', 'B01308', 'B01311', 'B01313',
       'B01315', 'B01332', 'B01336', 'B01339', 'B01340', 'B01351',
       'B01362', 'B01367', 'B01386', 'B01391', 'B01392', 'B013

In [9]:
# Derive the two calendar components needed for hourly aggregation
# from the parsed pickup timestamp: hour of day and day of year.
df = df.assign(
    hour=df['Pickup_date'].dt.hour,
    dayofyear=df['Pickup_date'].dt.dayofyear,
)

In [10]:
# Build a 'dayofyear-hour' string key (e.g. '91-4') identifying each hourly bucket.
# Because the key is a string it sorts lexicographically ('100-10' before '100-2').
df['day_and_hour'] = df['dayofyear'].astype(str) + '-' + df['hour'].astype(str)
# One unit per trip. A scalar is broadcast to every row, whereas a freshly
# built Series carries its own 0..n-1 index and is aligned on index labels,
# leaving NaN in any row whose label falls outside that range (for example
# after rows have been dropped). 1.0 keeps the column's float dtype.
df['FHV_count'] = 1.0

In [11]:
# Spot-check the derived hour, dayofyear, day_and_hour and FHV_count columns.
df.head()

Unnamed: 0,Dispatching_base_num,Pickup_date,hour,dayofyear,day_and_hour,FHV_count
0,B00001,2015-04-01 04:30:00,4,91,91-4,1.0
1,B00001,2015-04-01 06:00:00,6,91,91-6,1.0
2,B00001,2015-04-01 06:00:00,6,91,91-6,1.0
3,B00001,2015-04-01 06:00:00,6,91,91-6,1.0
4,B00001,2015-04-01 06:15:00,6,91,91-6,1.0


In [12]:
# The dispatching base is not used in the hourly aggregation: remove that
# column, then drop any row that still has a missing value.
df = df.drop(columns=['Dispatching_base_num']).dropna()
df.head()

Unnamed: 0,Pickup_date,hour,dayofyear,day_and_hour,FHV_count
0,2015-04-01 04:30:00,4,91,91-4,1.0
1,2015-04-01 06:00:00,6,91,91-6,1.0
2,2015-04-01 06:00:00,6,91,91-6,1.0
3,2015-04-01 06:00:00,6,91,91-6,1.0
4,2015-04-01 06:15:00,6,91,91-6,1.0


In [13]:
# Group trips by their day_and_hour key; each remaining column then holds
# the number of non-null entries in that group.
dfsum = df.groupby('day_and_hour').count()
dfsum.head()

Unnamed: 0_level_0,Pickup_date,hour,dayofyear,FHV_count
day_and_hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100-0,4814,4814,4814,4814
100-1,2815,2815,2815,2815
100-10,5674,5674,5674,5674
100-11,5120,5120,5120,5120
100-12,5328,5328,5328,5328


In [14]:
# Move the day_and_hour group labels out of the index into an ordinary column.
dfsum = dfsum.reset_index()
dfsum.head()

Unnamed: 0,day_and_hour,Pickup_date,hour,dayofyear,FHV_count
0,100-0,4814,4814,4814,4814
1,100-1,2815,2815,2815,2815
2,100-10,5674,5674,5674,5674
3,100-11,5120,5120,5120,5120
4,100-12,5328,5328,5328,5328


In [15]:
# Only FHV_count is needed next to the key; discard the other count columns.
dfsum = dfsum.drop(columns=['Pickup_date', 'hour', 'dayofyear'])
dfsum.shape

(720, 2)

In [16]:
# Preview the final table: one row per day_and_hour key with its trip count.
dfsum.head()

Unnamed: 0,day_and_hour,FHV_count
0,100-0,4814
1,100-1,2815
2,100-10,5674
3,100-11,5120
4,100-12,5328


In [17]:
# Persist the hourly counts next to the notebook. to_csv writes the row index
# by default, so the file gets an unnamed leading column in addition to
# day_and_hour and FHV_count.
dfsum.to_csv('clean_FHV_data_04.csv')