In [None]:
import pandas as pd
from dask.distributed import Client, LocalCluster
import dask.dataframe as dd
from pathlib import Path
import pandas as pd
import os
import numpy as np
from sklearn.neighbors import BallTree
import pyarrow.dataset as ds
import pyarrow as pa
import pyarrow.compute as pc

In [None]:
# Directory that holds the schools / colleges / major_ba / hotels CSVs read below;
# the enriched zone table is also written here at the end of the notebook.
add_data_path = Path("/d/hpc/home/jv8043/BD/project/T5/add_data")

In [None]:
def _read_table(filename):
    """Load one reference CSV that lives in ``add_data_path``."""
    return pd.read_csv(add_data_path / filename)


# One DataFrame per reference table; each is indexed by a BallTree further down.
schools_df = _read_table("schools.csv")
colleges_df = _read_table("colleges.csv")
major_ba_df = _read_table("major_ba.csv")
hotels_df = _read_table("hotels.csv")

In [None]:
# Radius of the Earth in miles; used to turn haversine distances (radians) into miles.
EARTH_RADIUS_MILES = 3959


def _coords_and_tree(places_df):
    """Return the (lat, lon) array of ``places_df`` in radians and a haversine BallTree over it."""
    radians = np.radians(places_df[['lat', 'lon']].values)
    return radians, BallTree(radians, metric='haversine')


# One coordinate array + tree per reference table (schools, colleges, major_ba, hotels).
schools_coords, school_tree = _coords_and_tree(schools_df)
college_coords, college_tree = _coords_and_tree(colleges_df)
major_ba_coords, major_ba_tree = _coords_and_tree(major_ba_df)
hotels_coords, hotels_tree = _coords_and_tree(hotels_df)

def find_nearest(df_, tree_, lat, lon, max_distance_miles=0.2):
    """Return the name of the place nearest to (lat, lon), or NaN if none is close enough.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Table with a 'name' column. Its row order must match the order of the
        coordinates the tree was built from, because the tree returns
        positional indices.
    tree_ : object with a ``query(X, k)`` method
        Expected to be a haversine BallTree built from the radians coordinates
        of ``df_``.
    lat, lon : float
        Query point in degrees.
    max_distance_miles : float, default 0.2
        Maximum accepted distance in miles (0.2 mi is roughly 320 m).

    Returns
    -------
    object
        ``df_['name']`` of the nearest row when it lies within
        ``max_distance_miles``; ``np.nan`` otherwise, including when ``lat`` or
        ``lon`` is missing.
    """
    # A missing coordinate cannot have a neighbour: report "no match" instead of
    # handing NaN to the tree query.
    if pd.isna(lat) or pd.isna(lon):
        return np.nan

    query_point = np.radians([[lat, lon]])
    dist, ind = tree_.query(query_point, k=1)
    # The haversine metric yields an angle in radians; scale by the Earth radius for miles.
    distance_miles = dist[0][0] * EARTH_RADIUS_MILES
    if distance_miles <= max_distance_miles:
        # Tree indices are positional, hence iloc; read only the 'name' cell.
        return df_['name'].iloc[ind[0][0]]
    return np.nan

In [None]:
# Taxi zone lookup table; later cells rely on its "latitude" / "longitude" columns.
# NOTE(review): this file is read from .../project/add_data, not from add_data_path
# (.../project/T5/add_data) — confirm the different directory is intentional.
tlc_zones = pd.read_csv("/d/hpc/home/jv8043/BD/project/add_data/taxi_zone_lookup.csv")

In [None]:
# Preview the first rows to sanity-check the loaded columns.
display(tlc_zones.head())

In [None]:
# Drop zones where latitude or longitude is missing, so that only rows with both
# coordinates reach the nearest-place lookups in the next cell.
tlc_zones = tlc_zones.dropna(subset=["latitude", "longitude"])

In [None]:
# Zone coordinates, renamed to the 'lat' / 'lon' convention used by the reference tables.
coords = tlc_zones[["latitude", "longitude"]].rename(
    columns={"latitude": "lat", "longitude": "lon"}
)

# Lookups always run on the two coordinate columns only, regardless of what has
# already been added to `coords`.
zone_points = coords[["lat", "lon"]]


def _closest_names(places_df, places_tree):
    """For every zone, the name of the nearest place within find_nearest's default radius (NaN if none)."""
    return zone_points.apply(
        lambda row: find_nearest(places_df, places_tree, row["lat"], row["lon"]),
        axis=1,
    )


# Schools take precedence; a college is used only where no school was found.
closest_school = _closest_names(schools_df, school_tree)
closest_college = _closest_names(colleges_df, college_tree)
coords["closest_school_college"] = closest_school.where(
    closest_school.notna(), closest_college
)

coords["closest_ba"] = _closest_names(major_ba_df, major_ba_tree)
coords["closest_hotel"] = _closest_names(hotels_df, hotels_tree)

In [None]:
# Persist the enriched coordinates next to the input CSVs; index=False keeps the
# DataFrame index out of the file.
coords.to_csv(add_data_path / "tlc_zones_with_schools_colleges_ba_hotels_strict.csv", index=False)

In [None]:
# Share of missing values in each column of the result, in percent.
na_percentage = coords.isna().mean().mul(100)
print("N/A percentage per column:", na_percentage, sep="\n")