In [2]:
import pandas as pd
import os
import dask.dataframe as dd
import rasterio
from rasterio.windows import from_bounds
from rasterio.enums import Resampling

Keep only subsets of the original and interim refined data for structure, demonstration, and example purposes (due to GitHub file size limitations).
Note: This notebook cannot be executed because the original files are not provided.
File paths in all other locations have been updated to point to the example files.

In [8]:
# Reduce every monthly 2012 yellow-taxi parquet file to a 100-row CSV example
# and then delete the full-size parquet file, so only the example stays on disk.
for month_str in (f"{m:02}" for m in range(1, 13)):
    source_path = f'../original_data/yellow_tripdata_2012_{month_str}.parquet'
    sample_path = f'../original_data/taxi_data_original_examples/yellow_tripdata_2012_{month_str}_example.csv'

    # The whole month is loaded, only the first 100 rows are written out
    # (the row index is written too, since to_csv is called with its defaults).
    pd.read_parquet(source_path).head(100).to_csv(sample_path)

    # Destructive step: the original monthly file is removed once the example exists.
    os.remove(source_path)

In [7]:
# Reduce every monthly 2012 refined-taxi parquet file to a 100-row CSV example
# and then delete the full-size parquet file, so only the example stays on disk.
for month_str in (f"{m:02}" for m in range(1, 13)):
    refined_path = f'../refined_data/refined_taxi_data_2012_{month_str}.parquet'
    sample_path = f'../refined_data/refined_taxi_data_examples/refined_taxi_data_2012_{month_str}_example.csv'

    # The whole month is loaded, only the first 100 rows are written out
    # (the row index is written too, since to_csv is called with its defaults).
    pd.read_parquet(refined_path).head(100).to_csv(sample_path)

    # Destructive step: the refined monthly file is removed once the example exists.
    os.remove(refined_path)

The dataset contains information on accidents, construction, road maintenance, and other activities that affect traffic.
Complete dataset available at: https://catalog.data.gov/dataset/511-ny-events-beginning-2010

In [3]:
# Export the first 200 rows of the drive-events CSV as a small example file.
drive_events = dd.read_csv('../original_data/new_york_drive_events.csv')

# compute=True makes head() return the materialised rows rather than a lazy
# dask object; they are written straight to CSV without the row index.
drive_events.head(200, compute=True).to_csv(
    '../original_data/drive_events_org_example.csv',
    index=False,
)

The dataset contains population counts for the US on a grid with 100 m spacing.
Example TIFF file contains 1 window that intersects NYC coordinate boundaries.
Complete dataset available at: https://hub.worldpop.org/geodata/summary?id=4553

In [4]:
# Extract one block window of the US population raster that overlaps NYC and
# save it as a small example GeoTIFF.
with rasterio.open('../original_data/usa_ppp_2012.tif') as src:
    # Geographical coordinate bounds of NYC
    nyc_min_lat, nyc_max_lat = 40.52763504199989, 40.91037152011096
    nyc_min_long, nyc_max_long = -74.21220034099993, -73.70134715908382

    # Scan the block windows of band 1 and stop at the first one that overlaps NYC.
    window = None
    for _, block_window in src.block_windows(1):
        # Bounds of current window read from complete US TIFF file, unpacked as
        # (left, bottom, right, top).
        # NOTE(review): comparing them with lat/long values assumes the raster
        # uses a geographic CRS in degrees - confirm for other inputs.
        min_long_window, min_lat_window, max_long_window, max_lat_window = src.window_bounds(block_window)

        # Two axis-aligned boxes overlap only if they overlap on both axes.
        # (The earlier check required the window to lie inside the NYC box padded
        # by one degree, which could select a tile that does not touch NYC at all.)
        overlaps_long = min_long_window < nyc_max_long and max_long_window > nyc_min_long
        overlaps_lat = min_lat_window < nyc_max_lat and max_lat_window > nyc_min_lat
        if overlaps_long and overlaps_lat:
            window = block_window
            break

    # Fail with a clear message instead of an IndexError on an empty list.
    if window is None:
        raise ValueError('No block window of the raster overlaps the NYC bounds')

    # Read the block window directly; rebuilding it from its bounds would go
    # through a float round trip for no benefit.
    window_data = src.read(window=window, resampling=Resampling.nearest)

    # Copy metadata from the original dataset for example window file
    out_meta = src.meta.copy()
    out_meta.update({
        "driver": "GTiff",
        "height": window_data.shape[1],
        "width": window_data.shape[2],
        # Georeference the subset: affine transform anchored at the window's origin
        "transform": rasterio.windows.transform(window, src.transform)
    })

    with rasterio.open('../original_data/geospatial_pop_data_example.tif', 'w', **out_meta) as out_dst:
        out_dst.write(window_data)

The original datasets (not truncated to examples) are available at:
taxi_zones: https://data.cityofnewyork.us/Transportation/NYC-Taxi-Zones/d3c5-ddgc
new_york_special_events: https://data.ny.gov/Transportation/511-NY-Sporting-Concert-and-Special-Events-Beginni/3ha4-4nfg/about_data
ny_hourly_weather_data: https://home.openweathermap.org/history_bulks/new (must be purchased)
