In [19]:
import pandas as pd
import numpy as np

import geopandas as gpd
from shapely.geometry import Point
import rtree
import pickle

import matplotlib.pyplot as plt

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

## Event Listing

In [2]:
event_list_df = pd.read_csv("data/NYC_Parks_Events_Listing___Event_Listing.csv", parse_dates=True)

In [3]:
event_list_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68689 entries, 0 to 68688
Data columns (total 15 columns):
event_id                68689 non-null int64
title                   68689 non-null object
date                    68689 non-null object
start_time              68689 non-null object
end_time                68689 non-null object
location_description    13059 non-null object
description             68685 non-null object
snippet                 68385 non-null object
phone                   53482 non-null object
email                   47694 non-null object
cost_free               68689 non-null int64
cost_description        13807 non-null object
must_see                68689 non-null int64
url                     68687 non-null object
notice                  68689 non-null int64
dtypes: int64(4), object(11)
memory usage: 7.9+ MB


In [4]:
event_list_df['start_time'] = pd.to_datetime(event_list_df['start_time'], format='%H:%M')

In [5]:
event_list_df[event_list_df.end_time == '24:00']

Unnamed: 0,event_id,title,date,start_time,end_time,location_description,description,snippet,phone,email,cost_free,cost_description,must_see,url,notice
11088,79666,New Year's Eve Fireworks in Prospect Park,12/31/2013,1900-01-01 23:00:00,24:00,,<p>\n\tStart the celebrations off around 11 p....,Start the celebrations off around 11 p.m. with...,,,1,,1,new-years-eve-fireworks1,0
23359,102431,New Year's Eve Fireworks,12/31/2014,1900-01-01 23:00:00,24:00,,<p>Kick off the New Year&#39;s with this belov...,Kick off the New Year's with this beloved Broo...,,,1,,1,new-years-eve-fireworks,0


In [6]:
event_list_df = event_list_df.drop(event_list_df.index[23359])
event_list_df = event_list_df.drop(event_list_df.index[11088])

In [7]:
event_list_df[event_list_df.end_time == '24:00']

Unnamed: 0,event_id,title,date,start_time,end_time,location_description,description,snippet,phone,email,cost_free,cost_description,must_see,url,notice


In [8]:
event_list_df['end_time'] = pd.to_datetime(event_list_df['end_time'], format='%H:%M')

In [9]:
event_list_df['date'] = pd.to_datetime(event_list_df.date)

In [10]:
event_list_df['is_event'] = 1

In [11]:
event_list_df.end_time[40]

Timestamp('1900-01-01 15:00:00')

In [12]:
event_list_df.start_time[40]

Timestamp('1900-01-01 13:00:00')

In [13]:
event_list_df['event_duration'] = ((event_list_df.end_time-event_list_df.start_time).dt.total_seconds())/60

In [14]:
str(event_list_df.start_time.dt.time[0]).split(":")[0]

'11'

In [16]:
for index, event in event_list_df.iterrows():
    start = int(str(event_list_df.start_time.dt.time[index]).split(":")[0])
    end = int(str(event_list_df.end_time.dt.time[index]).split(":")[0])
    
    if(((start >= 3) & (start < 13)) | ((end >= 3) & (end < 13))):
        event_list_df.loc[[index],'morning'] = 1
    else:
        event_list_df.loc[[index],'morning'] = 0
        
    if(((start >= 13) & (start < 18)) | ((end >= 13) & (end < 18))):
        event_list_df.loc[[index],'afternoon'] = 1
    else:
        event_list_df.loc[[index],'afternoon'] = 0
        
    if(((start >= 18) & (start < 3)) | ((end >= 18) & (end < 3))):
        event_list_df.loc[[index],'night'] = 1
    else:
        event_list_df.loc[[index],'night'] = 0

In [20]:
f = open('storage/event_list_df.pckl', 'wb')
pickle.dump(event_list_df, f)
f.close()

In [22]:
f = open('storage/event_list_df.pckl', 'rb')
event_list_df = pickle.load(f)
f.close()

In [23]:
agg_df = event_list_df[event_list_df.date.dt.year == 2017]

In [24]:
agg_df.head()

Unnamed: 0,event_id,title,date,start_time,end_time,location_description,description,snippet,phone,email,cost_free,cost_description,must_see,url,notice,is_event,event_duration,morning,afternoon,night
26877,107941,Conference House Art Expo 2017,2017-05-20,1900-01-01 11:00:00,1900-01-01 17:00:00,,<p>Come view original artworks and tree instal...,Come view original artworks and tree installat...,(718) 984-6046,admin@conferencehouse.org,1,,0,resiliency-and-elements,0,1,360.0,1.0,1.0,0.0
27651,108867,Thursday Evening Hours at the Merchant’s House...,2017-01-05,1900-01-01 12:00:00,1900-01-01 20:00:00,,"<p>Beginning May 7, 2015, the Merchant’s House...","Beginning May 7, 2015, the Merchant’s House Mu...",(212) 777-1089,emily@merchantshouse.org,0,"Admission is $10, $5 seniors & students, Free ...",0,thursday-evening-hours-at-the-merchants-house-...,0,1,480.0,1.0,0.0,0.0
27652,108868,Thursday Evening Hours at the Merchant’s House...,2017-01-12,1900-01-01 12:00:00,1900-01-01 20:00:00,,"<p>Beginning May 7, 2015, the Merchant’s House...","Beginning May 7, 2015, the Merchant’s House Mu...",(212) 777-1089,emily@merchantshouse.org,0,"Admission is $10, $5 seniors & students, Free ...",0,thursday-evening-hours-at-the-merchants-house-...,0,1,480.0,1.0,0.0,0.0
27653,108869,Thursday Evening Hours at the Merchant’s House...,2017-01-19,1900-01-01 12:00:00,1900-01-01 20:00:00,,"<p>Beginning May 7, 2015, the Merchant’s House...","Beginning May 7, 2015, the Merchant’s House Mu...",(212) 777-1089,emily@merchantshouse.org,0,"Admission is $10, $5 seniors & students, Free ...",0,thursday-evening-hours-at-the-merchants-house-...,0,1,480.0,1.0,0.0,0.0
27654,108870,Thursday Evening Hours at the Merchant’s House...,2017-01-26,1900-01-01 12:00:00,1900-01-01 20:00:00,,"<p>Beginning May 7, 2015, the Merchant’s House...","Beginning May 7, 2015, the Merchant’s House Mu...",(212) 777-1089,emily@merchantshouse.org,0,"Admission is $10, $5 seniors & students, Free ...",0,thursday-evening-hours-at-the-merchants-house-...,0,1,480.0,1.0,0.0,0.0


In [None]:
plt.hist(agg_df.event_duration, bins=50);

In [None]:
plt.plot(agg_df.event_id);

## Event Location

In [None]:
loc_df = pd.read_csv('data/NYC_Parks_Events_Listing___Event_Locations.csv')

In [None]:
loc_df.head()

In [None]:
loc_df.shape

In [None]:
plt.plot(loc_df.event_id)

In [None]:
loc_df[loc_df.event_id == 141581]

In [None]:
loc_df[loc_df.event_id == 62223]

In [None]:
plt.scatter(loc_df.lat, loc_df.long)

https://towardsdatascience.com/geospatial-operations-at-scale-with-dask-and-geopandas-4d92d00eb7e8

In [None]:
df = gpd.read_file('data/taxizones/taxi_zones.shp').to_crs({'init': 'epsg:4326'})
df = df.drop(['Shape_Area', 'Shape_Leng', 'OBJECTID'], axis=1)
gpd.sjoin(gpd.GeoDataFrame(crs={'init': 'epsg:4326'},
    geometry=[Point(-73.9431, 40.776001)]), 
    df, how='left', op='within')

In [None]:
def assign_taxi_zones(df, lon_var, lat_var, locid_var):
    """Joins DataFrame with Taxi Zones shapefile.
    This function takes longitude values provided by `lon_var`, and latitude
    values provided by `lat_var` in DataFrame `df`, and performs a spatial join
    with the NYC taxi_zones shapefile. 
    The shapefile is hard coded in, as this function makes a hard assumption of
    latitude and longitude coordinates. It also assumes latitude=0 and 
    longitude=0 is not a datapoint that can exist in your dataset. Which is 
    reasonable for a dataset of New York, but bad for a global dataset.
    Only rows where `df.lon_var`, `df.lat_var` are reasonably near New York,
    and `df.locid_var` is set to np.nan are updated. 
    Parameters
    ----------
    df : pandas.DataFrame or dask.DataFrame
        DataFrame containing latitudes, longitudes, and location_id columns.
    lon_var : string
        Name of column in `df` containing longitude values. Invalid values 
        should be np.nan.
    lat_var : string
        Name of column in `df` containing latitude values. Invalid values 
        should be np.nan
    locid_var : string
        Name of series to return. 
    """

    import geopandas
    from shapely.geometry import Point


    # make a copy since we will modify lats and lons
    localdf = df[[lon_var, lat_var]].copy()
    
    # missing lat lon info is indicated by nan. Fill with zero
    # which is outside New York shapefile. 
    localdf[lon_var] = localdf[lon_var].fillna(value=0.)
    localdf[lat_var] = localdf[lat_var].fillna(value=0.)
    

    shape_df = geopandas.read_file('data/taxizones/taxi_zones.shp')
    shape_df.drop(['OBJECTID', "Shape_Area", "Shape_Leng", "borough", "zone"],
                  axis=1, inplace=True)
    shape_df = shape_df.to_crs({'init': 'epsg:4326'})

    try:
        local_gdf = geopandas.GeoDataFrame(
            localdf, crs={'init': 'epsg:4326'},
            geometry=[Point(xy) for xy in
                      zip(localdf[lon_var], localdf[lat_var])])

        local_gdf = geopandas.sjoin(
            local_gdf, shape_df, how='left', op='within')
#         return local_gdf
        return local_gdf.LocationID.rename(locid_var)
    except ValueError as ve:
        print(ve)
        print(ve.stacktrace())
        series = localdf[lon_var]
        series = np.nan
    return series

In [None]:
loc_df['taxi_zone'] = assign_taxi_zones(loc_df, 'long', 'lat', 'taxi_zones')

In [None]:
loc_df[loc_df.taxi_zone == np.inf]

In [None]:
loc_df['taxi_zone'] = loc_df.taxi_zone.fillna(0).astype(int)

In [None]:
loc_df.head()

## Merge Event and Event Locations and Taxi

In [None]:
merged_df = pd.merge(agg_df, loc_df, on=['event_id'])

In [None]:
merged_df.head()

In [None]:
merged_df['date'] = pd.to_datetime(merged_df.date).dt.date

In [None]:
merged_df.info()

In [None]:
data = merged_df.merge(taxi_df, on=['date', 'taxi_zone'], how='right')

In [None]:
data.isnull().sum()

In [None]:
data.is_event.isnull().sum()

In [None]:
data['is_event'] = data.is_event.fillna(0)

In [None]:
data.head()

In [None]:
# data.groupby(['date', 'taxi_zone', 'is_event'])[['trip_distance']].mean()

In [None]:
# data.groupby(['date', 'taxi_zone']).count()

In [None]:
data.groupby(['is_event', 'taxi_zone'])[['VendorID']].count()

In [None]:
# map_df = gpd.read_file('data/newyorkzonemap/geo_export_83da3a04-bfe1-4338-bd66-aa8fff31dec0.shp')

In [None]:
# map_df.plot()

## Taxi Trip Data

In [None]:
import random

filename = "data/2017_Green_Taxi_Trip_Data.csv"
n = sum(1 for line in open(filename)) - 1 #number of records in file (excludes header)
s = 10000 #desired sample size
skip = sorted(random.sample(range(1,n+1),n-s)) #the 0-indexed header will not be included in the skip list
taxi_df = pd.read_csv(filename, skiprows=skip)

In [None]:
# taxi_df = pd.read_csv('2017_Green_Taxi_Trip_Data.csv', nrows=10000)

In [None]:
taxi_df.info()

In [None]:
taxi_df.head()

In [None]:
taxi_df['lpep_pickup_datetime'] = pd.to_datetime(taxi_df.lpep_pickup_datetime)

In [None]:
taxi_df['lpep_dropoff_datetime'] = pd.to_datetime(taxi_df.lpep_dropoff_datetime)

In [None]:
taxi_df['date'] = taxi_df.lpep_pickup_datetime.dt.date

In [None]:
taxi_df['taxi_duration'] = ((taxi_df.lpep_dropoff_datetime - taxi_df.lpep_pickup_datetime).dt.total_seconds())//60

In [None]:
taxi_df = taxi_df.rename(columns={'DOLocationID' : 'taxi_zone'})

In [None]:
taxi_df.head()

In [None]:
taxi_df.info()

In [None]:
type(pd.to_datetime(taxi_df.date)[0])