# Vicinity of major businesses and attractions (based on pickup/dropoff date-time).

In [None]:
import dask.dataframe as dd
import pandas as pd
import geopandas as gpd
import pyarrow as pa

OUTPUT_DIR = "home/ivan/FRI/2024-2025/sem2/bd/hw3/data/out"

In [10]:
taxi_df = dd.read_csv('/home/ivan/FRI/2024-2025/sem2/bd/hw3/data/out/sample_taxi_data.csv')

In [12]:
buissines = pd.read_csv('../../data/DCA_Legally_Operating_Businesses_geocoded_.csv')

In [13]:
len(buissines)

72635

In [15]:
buissines = buissines.loc[buissines['License Expiration Date'] > '2020-01-01']
len(buissines)

23866

In [19]:
attractions = pd.read_excel('/home/ivan/FRI/2024-2025/sem2/bd/hw3/data/New_York_Tourist_Locations.xlsx')

  warn(msg)


In [None]:
# attractions = attractions.sample(frac=0.1)

In [22]:
attractions

Unnamed: 0,Tourist_Spot,Address,Zipcode
210,Sunset Park,Bet. Fifth and Seventh Aves. and 41st and 44th...,11220
309,City Ice Pavilion,"47-32 32nd Place Queens, NY 11101, Long Island...",11101
208,National Park Service,c/o Statue of Liberty National Monument New Yo...,10004
244,Bronx Zoo Treetop Adventure,"Bronx, NY 10460, Fordham",10460
150,The Ride,box office: inside The Gift Shop at 584 Eighth...,10018
7,American Museum of Natural History,"Central Park West at 79th St. Manhattan, NY 10...",10024
162,United Nations,"First Ave. at 46th St. Manhattan, NY 10017, Mi...",10017
261,Saint Augustine's Episcopal Church,"290 Henry St Manhattan, NY 10002, Lower East Side",10002
219,Bank of America Winter Village at Bryant Park,"Bryant Park, 42nd St. and 6th Ave. Manhattan, ...",10018
337,Czech Center New York,"321 E 73rd St Manhattan, NY, Upper East Side",10021


In [24]:
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
from geopy.exc import GeocoderTimedOut, GeocoderUnavailable

# Initialize Nominatim geocoder
geolocator = Nominatim(user_agent="event_location_geocoder", timeout=10)
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

def preprocess_location(location):
    # Clean and format the location string
    location = (
        location.replace("between", "&")
        .replace(" and ", " & ")
        .replace("STREET", "St")
        .replace("BOULEVARD", "Blvd")
        .replace("BROADWAY", "Broadway")
        + ", New York, NY"  # Add city/state for better accuracy
    )
    return location

def get_coordinates(location):
    try:
        location = preprocess_location(location)
        result = geocode(location)
        if result:
            return (result.latitude, result.longitude)
        else:
            return (None, None)
    except (GeocoderTimedOut, GeocoderUnavailable):
        return (None, None)


location = "Bronx, NY 10460, Fordham"
print(get_coordinates(location)) 

(40.863092800000004, -73.89451320075729)


In [25]:


# Create a DataFrame to store coordinates

attractions[['lat', 'lon']] = attractions['Address'].apply(
    lambda x: pd.Series(get_coordinates(x)))

In [26]:
attractions

Unnamed: 0,Tourist_Spot,Address,Zipcode,lat,lon
210,Sunset Park,Bet. Fifth and Seventh Aves. and 41st and 44th...,11220,,
309,City Ice Pavilion,"47-32 32nd Place Queens, NY 11101, Long Island...",11101,40.741485,-73.933709
208,National Park Service,c/o Statue of Liberty National Monument New Yo...,10004,,
244,Bronx Zoo Treetop Adventure,"Bronx, NY 10460, Fordham",10460,40.863093,-73.894513
150,The Ride,box office: inside The Gift Shop at 584 Eighth...,10018,,
7,American Museum of Natural History,"Central Park West at 79th St. Manhattan, NY 10...",10024,,
162,United Nations,"First Ave. at 46th St. Manhattan, NY 10017, Mi...",10017,,
261,Saint Augustine's Episcopal Church,"290 Henry St Manhattan, NY 10002, Lower East Side",10002,40.713275,-73.991392
219,Bank of America Winter Village at Bryant Park,"Bryant Park, 42nd St. and 6th Ave. Manhattan, ...",10018,,
337,Czech Center New York,"321 E 73rd St Manhattan, NY, Upper East Side",10021,40.770344,-73.95989


In [28]:
buissines.head(2)

Unnamed: 0,DCA License Number,License Type,License Expiration Date,License Category,Business Name,Business Name 2,Address Building,Address Street Name,Secondary Address Street Name,Address City,...,Detail,Longitude,Latitude,Borough,Comminity Board,Census Tract,Council District,NTA,BBL,BIN
6,1460055-DCA,Business,3/31/2018,Stoop Line Stand,"LIBERTY PRODUCE, INC.",,12716,111TH AVE,,SOUTH OZONE PARK,...,"Product Category: Fruits, Vegetables, Soft Dri...",-73.81406,40.682565,QUEENS,410.0,172.0,28.0,South Ozone Park,4116320000.0,4250792.0
12,1280923-DCA,Individual,9/30/2017,General Vendor,"OLIJNYK, STEVEN",,9,MIDLAND GDNS,,BRONXVILLE,...,,,,,,,,,,


In [32]:
businesses_gdf = gpd.GeoDataFrame(
        buissines,
        geometry=gpd.points_from_xy(buissines.Longitude, buissines.Latitude),
        crs="EPSG:4326"
    ).to_crs("EPSG:32618") 

attractions_gdf = gpd.GeoDataFrame(
        attractions,
        geometry=gpd.points_from_xy(attractions.lon, attractions.lat),
        crs="EPSG:4326"
    ).to_crs("EPSG:32618")

In [38]:
def find_nearby_pois(trip_points_gdf, pois_gdf, radius=500):
    """Find points of interest within radius (meters) of trip locations"""
    trip_points_gdf['buffer'] = trip_points_gdf.geometry.buffer(radius)
    joined = gpd.sjoin(
        trip_points_gdf.set_geometry('buffer'),
        pois_gdf,
        how='left',
        predicate='intersects'
    )
    return joined.groupby(level=0).size().rename('nearby_count')


In [47]:
def process_partition(df_partition, businesses_gdf, attractions_gdf):
    # Convert to GeoDataFrame for pickup locations
    pickup_gdf = gpd.GeoDataFrame(
        df_partition,
        geometry=gpd.points_from_xy(
            df_partition.pickup_longitude, 
            df_partition.pickup_latitude
        ),
        crs="EPSG:4326"
    ).to_crs("EPSG:32618")
    
    # Find nearby businesses and attractions for pickup
    df_partition['pickup_nearby_businesses'] = find_nearby_pois(pickup_gdf, businesses_gdf)
    df_partition['pickup_nearby_attractions'] = find_nearby_pois(pickup_gdf, attractions_gdf)
    
    # Convert to GeoDataFrame for dropoff locations
    dropoff_gdf = gpd.GeoDataFrame(
        df_partition,
        geometry=gpd.points_from_xy(
            df_partition.dropoff_longitude, 
            df_partition.dropoff_latitude
        ),
        crs="EPSG:4326"
    ).to_crs("EPSG:32618")
    
    # Find nearby businesses and attractions for dropoff
    df_partition['dropoff_nearby_businesses'] = find_nearby_pois(dropoff_gdf, businesses_gdf)
    df_partition['dropoff_nearby_attractions'] = find_nearby_pois(dropoff_gdf, attractions_gdf)
    # fill na only these columns
    df_partition['pickup_nearby_businesses'] = df_partition['pickup_nearby_businesses'].fillna(0)
    df_partition['pickup_nearby_attractions'] = df_partition['pickup_nearby_attractions'].fillna(0)
    df_partition['dropoff_nearby_businesses'] = df_partition['dropoff_nearby_businesses'].fillna(0)
    df_partition['dropoff_nearby_attractions'] = df_partition['dropoff_nearby_attractions'].fillna(0)
    
    return df_partition

In [49]:
# df_taxi = taxi_df.compute()
# process_partition(df_taxi, businesses_gdf, attractions_gdf)

In [50]:
meta = taxi_df._meta.copy()
meta['pickup_nearby_businesses'] = pd.Series(dtype='int64')
meta['pickup_nearby_attractions'] = pd.Series(dtype='int64')
meta['dropoff_nearby_businesses'] = pd.Series(dtype='int64')
meta['dropoff_nearby_attractions'] = pd.Series(dtype='int64')

In [54]:
taxi_enriched = taxi_df.map_partitions(
    process_partition,
    businesses_gdf=businesses_gdf,
    attractions_gdf=attractions_gdf,
    meta=meta
)

In [55]:
taxi_enriched.to_parquet(
    f"{OUTPUT_DIR}/sample_taxi_data_enriched.parquet",
    engine='pyarrow',
    compression='snappy',
    # write_options=pa.parquet.WriteOptions(compression="snappy"),
    # partition_on=['pickup_nearby_businesses', 'pickup_nearby_attractions', 'dropoff_nearby_businesses', 'dropoff_nearby_attractions']
)
taxi_enriched = taxi_enriched.compute()

In [56]:
taxi_enriched

Unnamed: 0,vendorid,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,ratecodeid,store_and_fwd_flag,pulocationid,dolocationid,payment_type,...,airport_fee,year,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude,pickup_nearby_businesses,pickup_nearby_attractions,dropoff_nearby_businesses,dropoff_nearby_attractions
0,1.0,2020-01-22 20:10:13,2020-01-22 20:28:03,1.0,2.20,1.0,N,100,113,1.0,...,,2020,40.753513,-73.988787,40.732579,-73.994305,193,1,111,1
1,2.0,2020-01-18 23:54:57,2020-01-19 00:04:55,6.0,1.96,1.0,N,264,264,1.0,...,,2020,,,,,1,1,1,1
2,1.0,2020-01-04 11:10:51,2020-01-04 11:18:37,3.0,1.30,1.0,N,161,237,2.0,...,,2020,40.758028,-73.977698,40.768615,-73.965635,454,1,101,1
3,2.0,2020-01-23 08:05:22,2020-01-23 08:17:23,5.0,1.73,1.0,N,141,162,1.0,...,,2020,40.766948,-73.959635,40.756688,-73.972356,123,1,130,1
4,2.0,2020-01-16 17:09:03,2020-01-16 17:12:50,1.0,0.93,1.0,N,263,236,1.0,...,,2020,40.778766,-73.951010,40.780436,-73.957012,120,1,89,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3280,1.0,2024-02-07 19:08:26,2024-02-07 19:08:26,1.0,0.00,1.0,N,162,264,2.0,...,0.0,2024,40.756688,-73.972356,,,130,1,1,1
3281,2.0,2024-02-05 11:08:46,2024-02-05 11:30:10,3.0,1.79,1.0,N,230,233,2.0,...,0.0,2024,40.759818,-73.984196,40.749914,-73.970443,388,1,68,1
3282,2.0,2024-02-06 14:40:16,2024-02-06 15:15:56,5.0,3.47,1.0,N,140,151,1.0,...,0.0,2024,40.765484,-73.954739,40.797962,-73.968168,74,1,39,1
3283,2.0,2024-02-09 18:38:17,2024-02-09 18:41:30,1.0,0.44,1.0,N,236,263,1.0,...,0.0,2024,40.780436,-73.957012,40.778766,-73.951010,89,1,120,1


In [None]:
taxi