In [15]:
import uuid
import shapefile
import json
import pandas as pd
from shapely.geometry import shape, Point
from pyproj import Proj, transform

from google.cloud import bigquery
from google.cloud.bigquery import SchemaField

In [3]:
# Create the BigQuery client shared by the export cells below.
# The project ID is hard-coded here; change it when running against another project.
bigquery_client = bigquery.Client(project="taxianalysis-176617")

In [4]:
# Load the taxi-zone shapefile (path is relative to the notebook's working directory)
# and read every shape record into a list that later cells index by position.
sf = shapefile.Reader("taxi_zones/taxi_zones")
shapes = sf.shapes()

In [5]:
# Zone identifiers for the three airports. Later cells look up each zone with
# shapes[ID - 1], so these appear to be 1-based IDs from the taxi-zone
# shapefile -- TODO confirm against the shapefile's attribute table.
NEWARK_ID = 1
LAGUARDIA_ID = 138
JFK_ID = 132

In [6]:
def get_bounds(shape):
    """Reproject a shape's bounding box from EPSG:2263 to EPSG:4326.

    Args:
        shape: An object exposing a ``bbox`` sequence laid out as
            ``[x1, y1, x2, y2]`` in EPSG:2263 coordinates (the shapes read
            from the taxi-zone shapefile are passed in by the caller).

    Returns:
        A list ``[x1, y1, x2, y2]`` holding the two reprojected corners, in
        the same order as the input bbox. The export query in this notebook
        treats indices 0/2 as longitudes and 1/3 as latitudes.
    """
    # preserve_units=True keeps the source projection's native units instead
    # of letting pyproj convert them to metres.
    source_crs = Proj(init='epsg:2263', preserve_units = True)
    target_crs = Proj(init='epsg:4326')

    bbox = shape.bbox
    corners = ((bbox[0], bbox[1]), (bbox[2], bbox[3]))

    # Reproject each corner independently and flatten the pairs into one list.
    bounds = []
    for corner_x, corner_y in corners:
        bounds.extend(transform(source_crs, target_crs, corner_x, corner_y))

    return bounds

In [7]:
# Reprojected bounding box for each airport zone. The `- 1` converts the
# airport ID constants into 0-based positions in the `shapes` list.
newark_bounds = get_bounds(shapes[NEWARK_ID - 1])
laguardia_bounds = get_bounds(shapes[LAGUARDIA_ID - 1])
jfk_bounds = get_bounds(shapes[JFK_ID - 1])

In [220]:
def check_bounds(shapefile, coords):
    """Report whether a coordinate pair lies inside a zone geometry.

    Args:
        shapefile: A geometry-like object accepted by
            ``shapely.geometry.shape`` (callers pass a shape read from the
            taxi-zone shapefile). Note that inside this function the
            parameter name shadows the imported ``shapefile`` module.
        coords: A sequence whose first two items are the x and y values in
            EPSG:4326; the call in this notebook passes
            ``[longitude, latitude]``.

    Returns:
        The result of ``polygon.contains(point)`` -- True when the
        reprojected point is inside the geometry.
    """
    # The zone geometry is in EPSG:2263, so move the point into that CRS
    # (keeping its native units) before testing containment.
    wgs84 = Proj(init='epsg:4326')
    zone_crs = Proj(init='epsg:2263', preserve_units = True)

    projected_xy = transform(wgs84, zone_crs, coords[0], coords[1])
    location = Point(projected_xy)
    zone = shape(shapefile)

    return zone.contains(location)

In [221]:
# Sanity check: the point is passed as [longitude, latitude] and tested against
# shapes[0], the same shape used for newark_bounds above.
check_bounds(shapes[0], [-74.182861328125, 40.6878776550293])

True

In [212]:
def export_data_helper(client, table, destinationUri):
    """Export a BigQuery table to Google Cloud Storage and block until done.

    Args:
        client: BigQuery client exposing ``extract_table_to_storage``.
        table: The table object to export.
        destinationUri: ``gs://`` URI (optionally containing a ``*``
            wildcard) that the extract job writes to.

    Returns:
        None. Progress is reported via ``print``.
    """
    # Each job needs its own name; a random UUID keeps repeated runs from colliding.
    job_name = str(uuid.uuid4())
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print(job_name)

    # Export the table that was passed in. The previous body referenced an
    # undefined `destination_table` name and raised NameError on every call.
    job = client.extract_table_to_storage(job_name, table, destinationUri)

    job.begin()
    job.result()  # Wait for job to complete

    print('Exported {} to {}'.format(table, destinationUri))

In [223]:
def export_data(client, query_bounds, dataset, destinationUri):
    """Query trips dropped off inside a bounding box and export them to GCS.

    Runs a query selecting every row of ``dataset`` whose dropoff point lies
    strictly inside ``query_bounds``, then extracts the query's destination
    table to ``destinationUri``. Both jobs are awaited before returning.

    Args:
        client: BigQuery client exposing ``run_async_query`` and
            ``extract_table_to_storage``.
        query_bounds: Sequence ``[lon_min, lat_min, lon_max, lat_max]``, the
            layout produced by ``get_bounds``.
        dataset: Name of the table to query; it is placed inside the
            ``[...]`` brackets of the FROM clause.
        destinationUri: ``gs://`` URI (optionally containing a ``*``
            wildcard) that the extract job writes to.

    Returns:
        None. Completion is reported via ``print``.
    """
    # Named placeholders tie each value to its slot. The previous positional
    # format passed five arguments to four `{}` slots, so `dataset` landed in
    # the first latitude comparison, every bound shifted by one, and the
    # table name stayed hard-coded to the yellow-trips table.
    query_string = """
    SELECT
        *
    FROM [{table}]
    WHERE
        /* Return values between a pair of */
        /* latitude and longitude coordinates */
        dropoff_latitude > {lat_min} AND
        dropoff_latitude < {lat_max} AND
        dropoff_longitude > {lon_min} AND
        dropoff_longitude < {lon_max}
    """.format(
        table=dataset,
        lat_min=query_bounds[1],
        lat_max=query_bounds[3],
        lon_min=query_bounds[0],
        lon_max=query_bounds[2],
    )

    query_job = client.run_async_query(str(uuid.uuid4()), query_string)
    query_job.begin()
    query_job.result()  # Wait for the query to finish

    # The query results live in the job's destination table; refresh its
    # metadata before handing it to the extract job.
    destination_table = query_job.destination
    destination_table.reload()

    # Export data to Google Storage
    export_job_name = str(uuid.uuid4())
    export_job = client.extract_table_to_storage(export_job_name, destination_table, destinationUri)

    export_job.begin()
    export_job.result()  # Wait for job to complete

    print('Exported {} to {}'.format(destination_table, destinationUri))

In [231]:
# Export trips dropped off inside the LaGuardia bounding box to Cloud Storage.
# The `*` in the URI is a wildcard, presumably so the extract can be split
# across several output files -- confirm against the BigQuery export docs.
export_data(bigquery_client, laguardia_bounds, 'tlc_green_trips_2016', 'gs://farallon-taxi-analysis/tlc-green-laguardia-dropoffs-2016-*.json')

Exported <google.cloud.bigquery.table.Table object at 0x112913c10> to gs://farallon-taxi-analysis/tlc-green-laguardia-dropoffs-2016-*.json


In [62]:
# Terminal reference points for each airport, keyed by terminal name.
# NOTE(review): values look like [latitude, longitude] pairs, which is the
# opposite order from the [longitude, latitude] list passed to check_bounds
# earlier in this notebook -- swap the items before using them with it.
newark_coordinates = {
    'terminal_a': [40.687794, -74.182307],
    'terminal_b': [40.690627, -74.177544],
    'terminal_c': [40.695558, -74.178063]
}

laguardia_coordinates = {
    'terminal_a': [40.772375, -73.885976],
    'terminal_b': [40.774444, -73.872006],
    'terminal_c': [40.770674, -73.865307],
    'terminal_d': [40.768628, -73.862134]
}

# JFK terminal numbers are not contiguous here (no terminal_3 or terminal_6 entries).
jfk_coordinates = {
    'terminal_1': [40.643325, -73.789939],
    'terminal_2': [40.641595, -73.788767],
    'terminal_4': [40.644193, -73.782554],
    'terminal_5': [40.645807, -73.776774],
    'terminal_7': [40.648798, -73.782922],
    'terminal_8': [40.646934, -73.789874]
}