In [1]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import gc
import numpy as np

In [2]:
# Load the three cleaned inputs from ../clean_data.
# Reference tables (CSV): hourly weather and population counts.
hourly_weather = pd.read_csv('../clean_data/clean_weather.csv')
pop_count = pd.read_csv('../clean_data/clean_pop_count.csv')

# Taxi trip sample (Parquet).
taxi_trips = pd.read_parquet('../clean_data/clean_taxi_trips_sample.parquet')

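Merges the cleaned taxi trip data with two other cleaned datasets: (1) hourly weather statistics, matched on the hour in which each trip starts, and (2) population counts at 100 m resolution, matched to the grid point nearest to each trip's pickup and drop-off coordinates.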

In [None]:
def _nearest_pop_count(trips, lon_col, lat_col, gdf_pop_metric, max_distance_m):
    """Look up the population count of the grid point nearest to each trip.

    Parameters
    ----------
    trips : pandas.DataFrame
        Trips holding WGS84 longitude/latitude in ``lon_col`` / ``lat_col``.
    lon_col, lat_col : str
        Names of the longitude and latitude columns to use.
    gdf_pop_metric : geopandas.GeoDataFrame
        Population points with a ``pop_count`` column, already projected to a
        CRS whose unit is metres.
    max_distance_m : float
        Maximum match distance, in metres.

    Returns
    -------
    pandas.Series
        ``pop_count`` of the nearest point, indexed like ``trips``; NaN where
        no population point lies within ``max_distance_m``.
    """
    # Only the coordinate columns are carried along so that no trip column can
    # collide with a population column during the spatial join.
    trip_points = gpd.GeoDataFrame(
        trips[[lon_col, lat_col]],
        geometry=gpd.points_from_xy(trips[lon_col], trips[lat_col]),
        crs=4326,
    ).to_crs(gdf_pop_metric.crs)

    nearest = gpd.sjoin_nearest(
        trip_points, gdf_pop_metric, how='inner', max_distance=max_distance_m
    )
    # sjoin_nearest emits one row per equidistant neighbour; keep a single
    # match per trip so the result never multiplies trip rows.
    nearest = nearest[~nearest.index.duplicated(keep='first')]
    return nearest['pop_count'].reindex(trips.index)


def process_taxi_weather_population_in_batches(taxi_trips, hourly_weather, pop_count, batch_size=2000,
                                               max_distance_m=100,
                                               output_path='../partial_merge/weather_taxi_pop_merge.parquet'):
    """Attach hourly weather and nearby population counts to taxi trips.

    Trips are processed in batches of ``batch_size`` rows. Each batch is
    left-merged with ``hourly_weather`` on the hour of ``start_datetime`` and
    then given ``PU_pop_count`` / ``DO_pop_count`` columns holding the
    ``pop_count`` of the population point nearest to the pickup / drop-off
    coordinates. The inputs are not modified.

    Parameters
    ----------
    taxi_trips : pandas.DataFrame
        Needs ``start_datetime``, ``end_datetime``, ``PU_long``, ``PU_lat``,
        ``DO_long`` and ``DO_lat`` columns.
    hourly_weather : pandas.DataFrame
        Needs ``start_datetime`` and ``end_datetime`` columns; both are
        dropped from the output after the merge.
    pop_count : pandas.DataFrame
        Needs ``longitude``, ``latitude`` and ``pop_count`` columns.
    batch_size : int, default 2000
        Number of trips per batch.
    max_distance_m : float, default 100
        Maximum distance in metres between a trip endpoint and the population
        point it is matched to; endpoints with no point in range get NaN.
    output_path : str or None, default '../partial_merge/weather_taxi_pop_merge.parquet'
        Where the full merged frame is written as Parquet. ``None`` skips
        writing.

    Returns
    -------
    pandas.DataFrame
        All batches concatenated, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1 or ``taxi_trips`` has no rows.
    """
    from pathlib import Path

    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")
    if len(taxi_trips) == 0:
        raise ValueError("taxi_trips is empty; there is nothing to merge")

    # Work on a copy so the caller's weather frame keeps its original columns.
    weather = hourly_weather.copy()
    weather['start_datetime'] = pd.to_datetime(weather['start_datetime'])
    weather['datetime_interval'] = weather['start_datetime'].dt.strftime('%Y-%m-%d-%H')

    gdf_pop = gpd.GeoDataFrame(
        pop_count[['pop_count']],
        geometry=gpd.points_from_xy(pop_count.longitude, pop_count.latitude),
        crs=4326,
    )
    # EPSG:4326 distances are in degrees, so a "100" cutoff would never filter
    # anything. Project once to a metre-based UTM CRS and reuse it per batch.
    gdf_pop = gdf_pop.to_crs(gdf_pop.estimate_utm_crs())

    results = []
    for start in range(0, len(taxi_trips), batch_size):
        # .copy() so the new columns are not written onto a slice of taxi_trips.
        batch = taxi_trips.iloc[start:start + batch_size].copy()
        batch['start_datetime'] = pd.to_datetime(batch['start_datetime'])
        batch['datetime_interval'] = batch['start_datetime'].dt.strftime('%Y-%m-%d-%H')

        # Merge with weather data on hourly intervals, keeping the trip's own
        # start/end timestamps and discarding the weather ones.
        taxi_weather = pd.merge(batch, weather, on='datetime_interval', how='left')
        taxi_weather = taxi_weather.drop(columns=['datetime_interval', 'start_datetime_y', 'end_datetime_y'])
        taxi_weather = taxi_weather.rename(columns={'start_datetime_x': 'start_datetime', 'end_datetime_x': 'end_datetime'})

        # Nearest population point within max_distance_m of pickup / drop-off.
        taxi_weather['PU_pop_count'] = _nearest_pop_count(
            taxi_weather, 'PU_long', 'PU_lat', gdf_pop, max_distance_m
        )
        taxi_weather['DO_pop_count'] = _nearest_pop_count(
            taxi_weather, 'DO_long', 'DO_lat', gdf_pop, max_distance_m
        )
        results.append(taxi_weather)

        # Drop per-batch references and collect garbage to keep peak RAM down.
        del batch, taxi_weather
        gc.collect()

    merged = pd.concat(results, ignore_index=True)

    if output_path is not None:
        # Persist the complete merge (not just the last batch).
        Path(output_path).parent.mkdir(parents=True, exist_ok=True)
        merged.to_parquet(output_path)
    return merged

# Run the batched merge over the frames loaded above.
processed_data = process_taxi_weather_population_in_batches(
    taxi_trips=taxi_trips,
    hourly_weather=hourly_weather,
    pop_count=pop_count,
)