##  Adding weather features to the dataset

In [None]:
import dask.dataframe as dd
import pandas as pd
from dask.diagnostics import ProgressBar
# Input/output locations on the shared project storage.
WEATHER_DATA_PATH = "/d/hpc/projects/FRI/bigdata/students/in7357/weather_data.csv"
OUTPUT_DIR = '/d/hpc/projects/FRI/bigdata/students/in7357/out'
CLEANED_DATA = '/d/hpc/projects/FRI/bigdata/students/in7357/cleaned_parquet'


def _load_trips(subdir):
    """Lazily read one cleaned trip dataset and add an hour-floored pickup time.

    Args:
        subdir: Name of the dataset folder below ``CLEANED_DATA``.

    Returns:
        A dask DataFrame with an extra ``pickup_hour`` column, which is the
        join key used for the weather merge.
    """
    trips = dd.read_parquet(f"{CLEANED_DATA}/{subdir}/", engine="pyarrow")
    # "h" is the hourly alias; the upper-case "H" spelling is deprecated
    # since pandas 2.2 and emits a FutureWarning there.
    trips["pickup_hour"] = trips["pickup_datetime"].dt.floor("h")
    return trips


# Load your taxi data
yellow_taxi_df = _load_trips("YELLOW")
green_taxi_df = _load_trips("GREEN")
fhvhv_df = _load_trips("FHVH")
fhv_df = _load_trips("FHV")

def _normalise_column_name(name):
    """Return a lower-case column name with spaces, brackets and '°' rewritten."""
    cleaned = name.strip().lower()
    for old, new in ((" ", "_"), ("(", ""), (")", ""), ("°", "deg")):
        cleaned = cleaned.replace(old, new)
    return cleaned


# Weather observations; the "time" column is parsed as a datetime.
weather_df = dd.read_csv(WEATHER_DATA_PATH, parse_dates=["time"])

# The timestamp becomes the join key shared with the trip tables ...
weather_df = weather_df.rename(columns={"time": "pickup_hour"})

# ... and every column name is normalised afterwards.
weather_df = weather_df.rename(columns=_normalise_column_name)

# Every trip table gets the weather columns through the same left join on
# the hour-floored pickup time, so trips without a weather row are kept.
_weather_join = {"on": "pickup_hour", "how": "left"}

augmented_yellow_df = yellow_taxi_df.merge(weather_df, **_weather_join)
augmented_green_df = green_taxi_df.merge(weather_df, **_weather_join)
augmented_fhvhv_df = fhvhv_df.merge(weather_df, **_weather_join)
augmented_fhv_df = fhv_df.merge(weather_df, **_weather_join)


# Persist each weather-augmented table below its own dataset folder.
for _subdir, _augmented in (
    ("YELLOW", augmented_yellow_df),
    ("GREEN", augmented_green_df),
    ("FHVH", augmented_fhvhv_df),
    ("FHV", augmented_fhv_df),
):
    _augmented.to_parquet(
        f"{OUTPUT_DIR}/{_subdir}/augmented_with_weather",
        write_index=False,
    )

## Making new columns from school locations data

In [None]:
import zipfile
import geopandas as gpd
import os

zip_path = "/d/hpc/projects/FRI/bigdata/students/in7357/taxi_zones.zip"  
extract_dir = "/d/hpc/projects/FRI/bigdata/students/in7357/taxi_zones_shapefile"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

# sorted() keeps the choice deterministic should the folder hold several
# shapefiles; fail with a clear message instead of a bare IndexError.
shp_files = sorted(f for f in os.listdir(extract_dir) if f.endswith(".shp"))
if not shp_files:
    raise FileNotFoundError(f"No .shp file found in {extract_dir}")
shp_file = shp_files[0]
gdf = gpd.read_file(os.path.join(extract_dir, shp_file)).to_crs("EPSG:4326")

# Compute centroids in a projected CRS (the same UTM zone the later spatial
# joins use) and convert the points back to lon/lat. Centroids taken directly
# on EPSG:4326 geometries are computed on degrees and geopandas warns about it.
centroids = gdf.geometry.to_crs("EPSG:32618").centroid.to_crs("EPSG:4326")
gdf["latitude"] = centroids.y
gdf["longitude"] = centroids.x

# One row per LocationID: a repeated id would duplicate every matching trip
# in the left merges that use this lookup.
zone_coords = (
    gdf[["LocationID", "latitude", "longitude"]]
    .drop_duplicates(subset="LocationID")
    .copy()
)


def _zone_lookup(id_column, prefix):
    """Return the zone centroid table renamed for one end of the trip."""
    return zone_coords.rename(columns={
        "LocationID": id_column,
        "latitude": f"{prefix}_latitude",
        "longitude": f"{prefix}_longitude",
    })


# Same centroid table twice, keyed for the pickup and the dropoff zone id.
pickup_zones = _zone_lookup("pulocationid", "pickup")
dropoff_zones = _zone_lookup("dolocationid", "dropoff")


def _attach_zone_coordinates(trips):
    """Add pickup/dropoff centroid coordinates to one trip table.

    The zone ids are cast to int32 first, then the pickup and dropoff lookup
    tables are left-merged so trips with an unknown zone are kept (their
    coordinates stay missing).
    """
    trips = trips.assign(
        pulocationid=trips['pulocationid'].astype('int32'),
        dolocationid=trips['dolocationid'].astype('int32'),
    )
    # Merge pickup and dropoff coordinates
    trips = trips.merge(pickup_zones, on="pulocationid", how="left")
    return trips.merge(dropoff_zones, on="dolocationid", how="left")


# The original green-taxi cast was cut off mid-statement (a syntax error);
# routing every dataset through the same helper gives all four the same casts.
yellow_taxi_df = _attach_zone_coordinates(yellow_taxi_df)
green_taxi_df = _attach_zone_coordinates(green_taxi_df)
fhvhv_df = _attach_zone_coordinates(fhvhv_df)
fhv_df = _attach_zone_coordinates(fhv_df)

### Load school locations data

In [None]:
# School locations table; the coordinates are parsed from its 'Location 1' column.
SCHOOL_LOCATIONS_PATH = "/d/hpc/projects/FRI/bigdata/students/in7357/school_locations.csv"
schools_df = pd.read_csv(SCHOOL_LOCATIONS_PATH)

def extract_coords(location_str):
    """Parse the trailing ``(lat, lon)`` line of a multi-line location string.

    Args:
        location_str: Text whose last line looks like ``(40.69, -73.99)``.

    Returns:
        A two-element ``pd.Series`` ``[lat, lon]`` of floats. When the value
        is missing or cannot be parsed, both entries are NaN so the resulting
        columns stay float64 instead of falling back to object dtype.
    """
    missing = pd.Series([float("nan"), float("nan")])
    try:
        # Surrounding whitespace is removed first so a trailing blank does
        # not keep the closing bracket attached to the longitude.
        last_line = location_str.split('\n')[-1].strip()
        lat, lon = map(float, last_line.strip('()').split(','))
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: not a string (e.g. NaN);
        # ValueError: not exactly two numeric fields.
        return missing
    return pd.Series([lat, lon])
    

# def extract_coords(location_str):
#     import re
#     match = re.search(r'\\(([-\\d.]+), ([-\\d.]+)\\)', str(location_str))
#     return pd.Series((float(match[1]), float(match[2]))) if match else pd.Series((None, None))

# Parse every 'Location 1' entry into a latitude/longitude pair.
_parsed_school_coords = schools_df['Location 1'].apply(extract_coords)
schools_df[['school_lat', 'school_lon']] = _parsed_school_coords
# Rows without parsed coordinates are kept (the dropna below stays disabled).
# schools_df = schools_df.dropna(subset=['school_lat', 'school_lon'])

# Plain arrays of the school coordinates for the distance computation.
school_lat, school_lon = (
    schools_df['school_lat'].values,
    schools_df['school_lon'].values,
)

### Define Haversine distance function and feature logic

In [None]:
import numpy as np
from numba import jit

@jit(nopython=True)
def haversine_numba(lat1, lon1, lat2, lon2):
    """
    Great-circle (haversine) distance, compiled with numba.

    Intended for an array of points against a single point. All four inputs
    are converted with np.radians, i.e. they are expected in degrees.
    Returns distances in km (sphere radius 6371.0).
    """
    earth_radius_km = 6371.0

    phi1 = np.radians(lat1)
    lam1 = np.radians(lon1)
    phi2 = np.radians(lat2)
    lam2 = np.radians(lon2)

    half_dphi = (phi2 - phi1) / 2
    half_dlam = (lam2 - lam1) / 2

    # Haversine term; the distance is radius * central angle.
    a = np.sin(half_dphi)**2 + np.cos(phi1)*np.cos(phi2)*np.sin(half_dlam)**2
    return earth_radius_km * 2 * np.arcsin(np.sqrt(a))

def enrich_with_school_features(df, school_lat, school_lon):
    """Add school-proximity features to a pandas DataFrame of trips.

    Args:
        df: Trips with ``pickup_latitude``, ``pickup_longitude``,
            ``dropoff_latitude`` and ``dropoff_longitude`` columns.
        school_lat: School latitudes (array-like, same length as school_lon).
        school_lon: School longitudes.

    Returns:
        A copy of ``df`` with five extra columns:
        ``pickup_nearest_school_distance_km`` / ``dropoff_nearest_school_distance_km``
        (float, NaN when no distance could be computed),
        ``pickup_near_school`` / ``dropoff_near_school`` (float 1.0/0.0 flag for
        a school within 0.2 km) and ``pickup_school_count_500m`` (int32 number
        of schools within 0.5 km of the pickup).

    The same columns and dtypes are returned for an empty ``df`` or an empty
    school list, so every partition of a partitioned frame shares one schema.
    """
    # Coerce to float arrays so missing entries (None) become NaN.
    school_lat = np.asarray(school_lat, dtype=np.float64)
    school_lon = np.asarray(school_lon, dtype=np.float64)

    n = len(df)
    pickup_min_dist = np.full(n, np.inf)
    dropoff_min_dist = np.full(n, np.inf)
    pickup_count_500m = np.zeros(n, dtype=np.int32)

    if n and len(school_lat):
        lat_pick = df['pickup_latitude'].to_numpy(dtype=np.float64, na_value=np.nan)
        lon_pick = df['pickup_longitude'].to_numpy(dtype=np.float64, na_value=np.nan)
        lat_drop = df['dropoff_latitude'].to_numpy(dtype=np.float64, na_value=np.nan)
        lon_drop = df['dropoff_longitude'].to_numpy(dtype=np.float64, na_value=np.nan)

        # One school at a time keeps memory at O(number of trips).
        for s_lat, s_lon in zip(school_lat, school_lon):
            dists = haversine_numba(lat_pick, lon_pick, s_lat, s_lon)
            # np.fmin ignores NaN distances (missing trip or school coordinates)
            pickup_min_dist = np.fmin(pickup_min_dist, dists)
            pickup_count_500m += (dists <= 0.5)

            dists = haversine_numba(lat_drop, lon_drop, s_lat, s_lon)
            dropoff_min_dist = np.fmin(dropoff_min_dist, dists)

    # Rows that never received a finite distance are reported as NaN
    # (done once here instead of after every school).
    pickup_min_dist = np.where(pickup_min_dist == np.inf, np.nan, pickup_min_dist)
    dropoff_min_dist = np.where(dropoff_min_dist == np.inf, np.nan, dropoff_min_dist)

    return df.assign(
        pickup_nearest_school_distance_km=pickup_min_dist,
        pickup_near_school=(pickup_min_dist <= 0.2).astype(float),
        pickup_school_count_500m=pickup_count_500m,
        dropoff_nearest_school_distance_km=dropoff_min_dist,
        dropoff_near_school=(dropoff_min_dist <= 0.2).astype(float)
    )

def _with_school_features(trips):
    """Apply the school enrichment to every partition of a dask DataFrame.

    ``enrich_with_school_features`` works on in-memory pandas data (it reads
    ``df.empty`` and ``.values`` and fills NumPy arrays), so it cannot be
    called on the lazy dask frame itself. ``map_partitions`` hands it one
    pandas partition at a time; the school coordinate arrays are passed
    unchanged to every call.
    """
    return trips.map_partitions(enrich_with_school_features, school_lat, school_lon)


result_yellow = _with_school_features(yellow_taxi_df)
result_green = _with_school_features(green_taxi_df)
result_fhvhv = _with_school_features(fhvhv_df)
result_fhv = _with_school_features(fhv_df)
# Persist each school-enriched table below its own dataset folder.
for _subdir, _enriched in (
    ("YELLOW", result_yellow),
    ("GREEN", result_green),
    ("FHVH", result_fhvhv),
    ("FHV", result_fhv),
):
    _enriched.to_parquet(
        f"{OUTPUT_DIR}/{_subdir}/augmented_with_school_features",
        write_index=False,
    )



# Enrich with events

In [None]:
# Event records, read lazily; assume_missing=True is passed so integer-looking
# columns are read as floats and can hold missing values.
EVENTS_DATA_PATH = '/d/hpc/projects/FRI/bigdata/students/in7357/nyc_event_data'
events = dd.read_csv(EVENTS_DATA_PATH, assume_missing=True)

In [None]:
# Both timestamp columns use the same 12-hour "MM/DD/YYYY hh:mm:ss AM/PM" layout.
_EVENT_TIME_FORMAT = '%m/%d/%Y %I:%M:%S %p'
events['Start Date/Time'] = dd.to_datetime(
    events['Start Date/Time'], format=_EVENT_TIME_FORMAT
)
events['End Date/Time'] = dd.to_datetime(
    events['End Date/Time'], format=_EVENT_TIME_FORMAT
)

In [None]:
# Keep events that start strictly inside the 2012-01-01 .. 2025-05-01 window.
_event_start = events['Start Date/Time']
selected_iterval = (_event_start > '2012-01-01') & (_event_start < '2025-05-01')
events = events[selected_iterval]

# Event types kept for the enrichment ("Parade" and "Plaza Event" are
# deliberately left out).
important_event_types = [
    "Marathon",
    "BID Multi-Block",
    "Rally",
    "Festival",
    "Health Fair",
]
# Filter rows on the event type
_is_important = events['Event Type'].isin(important_event_types)
events = events[_is_important]

# Show how many events of each kept type remain.
events['Event Type'].value_counts().compute()

In [None]:
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
from geopy.exc import GeocoderTimedOut, GeocoderUnavailable

# Nominatim client with a generous request timeout, wrapped in a rate limiter
# so consecutive lookups are spaced out.
# NOTE(review): 120 s between requests looks very conservative — confirm it is intended.
geolocator = Nominatim(timeout=100, user_agent="event_location_geocoder")
geocode = RateLimiter(
    geolocator.geocode,
    min_delay_seconds=120,
)

def preprocess_location(location):
    """Rewrite an event location string into a geocoder-friendly address.

    The substitutions are applied in order and are case-sensitive; the
    city/state suffix is appended for better geocoding accuracy.
    """
    substitutions = (
        ("between", "&"),
        (" and ", " & "),
        ("STREET", "St"),
        ("BOULEVARD", "Blvd"),
        ("BROADWAY", "Broadway"),
    )
    cleaned = location
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return cleaned + ", New York, NY"

def get_coordinates(location):
    """Geocode a location string to a ``(latitude, longitude)`` tuple.

    Args:
        location: Free-text location; it is cleaned with
            ``preprocess_location`` before being sent to the geocoder.

    Returns:
        ``(latitude, longitude)`` of the geocoder result, or ``(None, None)``
        when the input is missing/blank, nothing is found, or the geocoder
        times out or is unavailable.
    """
    # Missing values (None/NaN) have no string methods, and a blank string
    # would otherwise be geocoded as just the appended city suffix.
    if not isinstance(location, str) or not location.strip():
        return (None, None)

    try:
        result = geocode(preprocess_location(location))
    except (GeocoderTimedOut, GeocoderUnavailable):
        return (None, None)

    if result:
        return (result.latitude, result.longitude)
    return (None, None)


# Quick manual check of the geocoding helper on a single street name.
location = "GOUVERNEUR STREET"
sample_coordinates = get_coordinates(location)
print(sample_coordinates)

In [None]:
# `events` is a lazy dask DataFrame, so the distinct locations have to be
# computed before they can be put into a pandas DataFrame. Missing locations
# are dropped: they cannot be geocoded.
unique_locations = events['Event Location'].dropna().unique().compute()

locations_df = pd.DataFrame({'Event Location': unique_locations.to_numpy()})

# Geocode each distinct location once; failed lookups yield (None, None),
# which the explicit float dtype stores as NaN.
_location_coords = [get_coordinates(loc) for loc in locations_df['Event Location']]
locations_df['lat'] = pd.Series(
    [lat for lat, _ in _location_coords], index=locations_df.index, dtype='float64'
)
locations_df['lon'] = pd.Series(
    [lon for _, lon in _location_coords], index=locations_df.index, dtype='float64'
)

In [None]:
# merge events with the locations

events_merged = events.merge(locations_df, on='Event Location', how='left')
# Events without coordinates cannot be placed on the map. The result is
# materialised with compute() because GeoDataFrame / points_from_xy need
# in-memory pandas data, not a lazy dask frame.
events_merged = events_merged.dropna(subset=['lat', 'lon']).compute()

# Points are built in lon/lat and projected to UTM zone 18N (EPSG:32618) so
# that buffer radii can be given in metres.
events_merged_gdf = gpd.GeoDataFrame(
    events_merged,
    geometry=gpd.points_from_xy(events_merged['lon'], events_merged['lat']),
    crs="EPSG:4326"
).to_crs("EPSG:32618")

In [None]:
# Merge with taxi data
from curses import meta


def _first_present_column(df, candidates):
    """Return the first name in ``candidates`` that is a column of ``df``."""
    for name in candidates:
        if name in df.columns:
            return name
    raise KeyError(f"None of the columns {list(candidates)} are present")


def _count_active_events(df, lon_col, lat_col, time_col, events_gdf, radius_m=1000):
    """Count, per trip row, the events within ``radius_m`` that are running at ``time_col``.

    Returns an int64 array with one entry per row of ``df``, in row order.
    """
    n_rows = len(df)
    if n_rows == 0:
        return np.zeros(0, dtype="int64")

    # Only the trip time and location go into the spatial join; a fresh
    # positional index keeps the per-row counts unambiguous.
    trip_points = gpd.GeoDataFrame(
        {"trip_time": df[time_col].to_numpy()},
        geometry=gpd.points_from_xy(df[lon_col], df[lat_col]),
        crs="EPSG:4326",
    ).to_crs("EPSG:32618")
    trip_points["geometry"] = trip_points.geometry.buffer(radius_m)

    event_columns = ["Start Date/Time", "End Date/Time", events_gdf.geometry.name]
    joined = gpd.sjoin(
        trip_points,
        events_gdf[event_columns],
        how="inner",
        predicate="intersects",
    )
    in_window = joined["trip_time"].between(
        joined["Start Date/Time"],
        joined["End Date/Time"],
    )
    counts = joined.loc[in_window.to_numpy()].groupby(level=0).size()
    return counts.reindex(pd.RangeIndex(n_rows), fill_value=0).to_numpy(dtype="int64")


def process_partition(df_partition, events_gdf):
    """Add event counts around the pickup and dropoff of every trip.

    Args:
        df_partition: pandas DataFrame of trips with pickup/dropoff
            longitude/latitude columns and pickup/dropoff timestamps.
        events_gdf: GeoDataFrame of events in EPSG:32618 with
            'Start Date/Time' and 'End Date/Time' columns.

    Returns:
        A copy of the partition with int64 columns ``pickup_events``,
        ``dropoff_events`` and ``total_events`` (their sum). An event is
        counted when it lies within 1000 m of the point and the trip
        timestamp falls between its start and end time.

    Raises:
        KeyError: If no pickup or dropoff timestamp column is found.
    """
    df = df_partition.copy()

    # The loaded datasets expose `pickup_datetime`; the tpep_/lpep_ spellings
    # are accepted as well so differently named data keeps working.
    # NOTE(review): `dropoff_datetime` is assumed by analogy — confirm the name.
    pickup_time = _first_present_column(
        df, ("pickup_datetime", "tpep_pickup_datetime", "lpep_pickup_datetime")
    )
    dropoff_time = _first_present_column(
        df, ("dropoff_datetime", "tpep_dropoff_datetime", "lpep_dropoff_datetime")
    )

    pickup_events = _count_active_events(
        df, "pickup_longitude", "pickup_latitude", pickup_time, events_gdf
    )
    dropoff_events = _count_active_events(
        df, "dropoff_longitude", "dropoff_latitude", dropoff_time, events_gdf
    )

    # Plain int64 arrays match the dtypes declared in the dask meta.
    return df.assign(
        pickup_events=pickup_events,
        dropoff_events=dropoff_events,
        total_events=pickup_events + dropoff_events,
    )
    
# Apply processing to all partitions
def _events_meta(trips):
    """Copy the frame's empty meta and append the three int64 event-count columns."""
    meta = trips._meta.copy()
    for column in ('pickup_events', 'dropoff_events', 'total_events'):
        meta[column] = pd.Series(dtype='int64')
    return meta


# Output schema of the event enrichment for every dataset.
meta_yellow = _events_meta(yellow_taxi_df)
meta_green = _events_meta(green_taxi_df)
meta_fhv = _events_meta(fhv_df)
meta_fhvgh = _events_meta(fhvhv_df)

def _attach_event_counts(trips, meta):
    """Schedule the event enrichment on every partition of one trip table."""
    return trips.map_partitions(
        process_partition,
        events_gdf=events_merged_gdf,
        meta=meta,
    )


yellow_taxi_df_location_enriched = _attach_event_counts(yellow_taxi_df, meta_yellow)
green_taxi_df_location_enriched = _attach_event_counts(green_taxi_df, meta_green)
fhvhv_df_location_enriched = _attach_event_counts(fhvhv_df, meta_fhvgh)
fhv_df_location_enriched = _attach_event_counts(fhv_df, meta_fhv)


# Persist each event-enriched table below its own dataset folder.
for _subdir, _enriched in (
    ("YELLOW", yellow_taxi_df_location_enriched),
    ("GREEN", green_taxi_df_location_enriched),
    ("FHVH", fhvhv_df_location_enriched),
    ("FHV", fhv_df_location_enriched),
):
    _enriched.to_parquet(
        f"{OUTPUT_DIR}/{_subdir}/augmented_with_events",
        write_index=False,
    )


# Vicinity of major businesses and attractions (based on pickup/dropoff date-time).

In [None]:
# Geocoded business licence records, loaded fully into memory with pandas.
BUSINESSES_DATA_PATH = '/d/hpc/projects/FRI/bigdata/students/in7357/DCA_Legally_Operating_Businesses_geocoded_.csv'
businesses = pd.read_csv(BUSINESSES_DATA_PATH)

In [None]:
# Compare real timestamps: the column is read as text, and a string
# comparison against '2012-01-01' is only meaningful for ISO-formatted dates.
# Values that cannot be parsed become NaT and are filtered out.
_license_expiry = pd.to_datetime(businesses['License Expiration Date'], errors='coerce')
businesses = businesses.loc[_license_expiry > pd.Timestamp('2012-01-01')]
attractions = pd.read_excel('/d/hpc/projects/FRI/bigdata/students/in7357/New_York_Tourist_Locations.xlsx')


attractions[['lat', 'lon']] = attractions['Address'].apply(
    lambda x: pd.Series(get_coordinates(x)))

# Rows without coordinates cannot be turned into points, so they are left
# out of the GeoDataFrames (the events are filtered the same way).
_located_businesses = businesses.dropna(subset=['Longitude', 'Latitude'])
_located_attractions = attractions.dropna(subset=['lat', 'lon'])

# Points are built in lon/lat and projected to UTM zone 18N (EPSG:32618) so
# that buffer radii can be given in metres.
businesses_gdf = gpd.GeoDataFrame(
        _located_businesses,
        geometry=gpd.points_from_xy(_located_businesses.Longitude, _located_businesses.Latitude),
        crs="EPSG:4326"
    ).to_crs("EPSG:32618") 

attractions_gdf = gpd.GeoDataFrame(
        _located_attractions,
        geometry=gpd.points_from_xy(_located_attractions.lon, _located_attractions.lat),
        crs="EPSG:4326"
    ).to_crs("EPSG:32618")


def find_nearby_pois(trip_points_gdf, pois_gdf, radius=500):
    """Count points of interest within radius (meters) of trip locations.

    Args:
        trip_points_gdf: GeoDataFrame of trip points in a metric CRS.
        pois_gdf: GeoDataFrame of points of interest in the same CRS.
        radius: Buffer radius around every trip point.

    Returns:
        A Series named ``nearby_count`` indexed like ``trip_points_gdf``,
        holding the number of POIs whose geometry intersects the buffer;
        trips with no POI nearby get 0.
    """
    # Join on a buffer-only frame: the caller's frame is left untouched and
    # no attribute columns are dragged through the spatial join.
    buffers = gpd.GeoDataFrame(
        geometry=trip_points_gdf.geometry.buffer(radius),
        index=trip_points_gdf.index,
    )
    joined = gpd.sjoin(
        buffers,
        pois_gdf[[pois_gdf.geometry.name]],
        how='inner',
        predicate='intersects'
    )
    # An inner join keeps matches only. With a left join every unmatched trip
    # still contributes one row, so size() would report 1 instead of 0.
    counts = joined.groupby(level=0).size()
    return counts.reindex(trip_points_gdf.index, fill_value=0).rename('nearby_count')


def process_partition(df_partition, businesses_gdf, attractions_gdf):
    """Add nearby business/attraction counts for the pickup and dropoff points.

    Args:
        df_partition: pandas DataFrame of trips with pickup/dropoff
            longitude/latitude columns.
        businesses_gdf: GeoDataFrame of businesses in EPSG:32618.
        attractions_gdf: GeoDataFrame of attractions in EPSG:32618.

    Returns:
        A copy of the partition with int64 columns
        ``pickup_nearby_businesses``, ``pickup_nearby_attractions``,
        ``dropoff_nearby_businesses`` and ``dropoff_nearby_attractions``.
        The input partition is not modified.
    """
    df = df_partition.copy()
    poi_layers = (("businesses", businesses_gdf), ("attractions", attractions_gdf))
    count_columns = [
        f"{prefix}_nearby_{label}"
        for prefix in ("pickup", "dropoff")
        for label, _ in poi_layers
    ]

    # Empty partitions only need the right schema.
    if len(df) == 0:
        return df.assign(**{col: np.zeros(0, dtype="int64") for col in count_columns})

    for prefix in ("pickup", "dropoff"):
        # Geometry-only frame that shares the partition's index, projected
        # to metres for the buffer radius used by find_nearby_pois.
        trip_points = gpd.GeoDataFrame(
            geometry=gpd.points_from_xy(
                df[f"{prefix}_longitude"],
                df[f"{prefix}_latitude"]
            ),
            index=df.index,
            crs="EPSG:4326"
        ).to_crs("EPSG:32618")

        for label, pois_gdf in poi_layers:
            column = f"{prefix}_nearby_{label}"
            df[column] = find_nearby_pois(trip_points, pois_gdf)
            # Trips without a count get 0; int64 matches the dask meta.
            df[column] = df[column].fillna(0).astype("int64")

    return df


def _pois_meta(trips):
    """Copy the frame's empty meta and append the four int64 POI-count columns."""
    meta = trips._meta.copy()
    for column in (
        'pickup_nearby_businesses',
        'pickup_nearby_attractions',
        'dropoff_nearby_businesses',
        'dropoff_nearby_attractions',
    ):
        meta[column] = pd.Series(dtype='int64')
    return meta


# Every meta is rebuilt from the trip table itself. Previously meta_yellow
# was extended in place and so still carried the three event-count columns,
# which the POI enrichment does not produce.
meta_yellow = _pois_meta(yellow_taxi_df)
meta_green = _pois_meta(green_taxi_df)
meta_fhv = _pois_meta(fhv_df)
meta_fhvgh = _pois_meta(fhvhv_df)


def _attach_poi_counts(trips, meta):
    """Schedule the POI enrichment on every partition of one trip table."""
    return trips.map_partitions(
        process_partition,
        businesses_gdf=businesses_gdf,
        attractions_gdf=attractions_gdf,
        meta=meta,
    )


yellow_taxi_df_location_enriched = _attach_poi_counts(yellow_taxi_df, meta_yellow)
green_taxi_df_location_enriched = _attach_poi_counts(green_taxi_df, meta_green)
fhvhv_df_location_enriched = _attach_poi_counts(fhvhv_df, meta_fhvgh)
fhv_df_location_enriched = _attach_poi_counts(fhv_df, meta_fhv)


# Persist each POI-enriched table below its own dataset folder.
for _subdir, _enriched in (
    ("YELLOW", yellow_taxi_df_location_enriched),
    ("GREEN", green_taxi_df_location_enriched),
    ("FHVH", fhvhv_df_location_enriched),
    ("FHV", fhv_df_location_enriched),
):
    _enriched.to_parquet(
        f"{OUTPUT_DIR}/{_subdir}/augmented_with_pois",
        write_index=False,
    )