# Data Setup

In [1]:
from sklearn.neighbors import BallTree
from sklearn.preprocessing import MinMaxScaler

import numpy as np
import pandas as pd
import geopandas as gpd
import re
import unicodedata
from collections import Counter

import duckdb
import aiohttp
import asyncio
import nest_asyncio
nest_asyncio.apply()
import re
from shapely import wkt
from shapely.geometry import Polygon
from shapely.ops import unary_union


## Generating FSQ Places Dataset

### Using Bounding Box

In [8]:
def generate_df_from_bb(min_lon, max_lon, min_lat, max_lat, fsq_release_date = "2025-06-10", file_name = 'gdf_fsq_pois'):
    """
    Produces a parquet file of the FSQ POI dataset within a given bounding box using the version published on [fsq_release_date].
    The file is saved at '/share/garg/accessgaps2024/fsq_dedup_pipeline/data/fsq_data/{file_name}_by_bb.parquet'.
    Parameters:
        min_lon (float): southmost longitude
        max_lon (float): northmost longitude
        min_lat (float): westmost latitude
        max_lat (float): eastmost latitude
        fsq_release_date (string): FSQ release date, in the form "YYYY-MM-DD".
        file_name (string): name of file that the resulting fsq dataset will be saved as.
    ______________________________
    Ex Usage: 
    To generate (one time only):
        US_MIN_LAT = 24.396308   # Southern tip of Florida
        US_MAX_LAT = 49.384358   # Northern border (Minnesota)
        US_MIN_LON = -124.848974 # Western edge (California coast)
        US_MAX_LON = -66.93457   # Eastern edge (Maine)
        us_df = generate_df_from_bb(US_MIN_LON, US_MAX_LON, US_MIN_LAT, US_MAX_LAT, fsq_release_date = "2025-06-10", file_name = 'us_by_bb')
    To retrieve parquet on future runs:
        us_df = pd.read_parquet('/share/garg/accessgaps2024/fsq_dedup_pipeline/data/fsq_data/{file_name}_by_bb.parquet')
    """
    s3_path = f"s3://fsq-os-places-us-east-1/release/dt={fsq_release_date}/places/parquet/*.parquet"

    # Connect to DuckDB
    conn = duckdb.connect()

    # Query POIs within Manhattan bounds
    copy_query = f"""
        COPY (
            SELECT *
            FROM read_parquet('{s3_path}')
            WHERE 
                longitude BETWEEN {min_lon} AND {max_lon}
                AND latitude BETWEEN {min_lat} AND {max_lat}
        ) TO '/share/garg/accessgaps2024/fsq_dedup_pipeline/data/fsq_data/{file_name}_by_bb.parquet' (FORMAT PARQUET)
    """

    conn.execute(copy_query)

    # Load the result into a dataframe
    q = duckdb.connect()
    q.execute(f"SELECT * FROM '/share/garg/accessgaps2024/fsq_dedup_pipeline/data/fsq_data/{file_name}_by_bb.parquet'")
    res_df = q.fetch_df()

    conn.close()
    q.close()
    return res_df

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 34)

### Per Region

In [9]:
def generate_df_from_fsq_by_region(selected_regions_lst, fsq_release_date = "2025-06-10", file_name = 'gdf_fsq_pois'):
    """Produces a parquet file of the FSQ POI dataset within the regions of [selected_regions_lst], 
    using the version published on [fsq_release_date].
    The file is saved at '/share/garg/accessgaps2024/fsq_dedup_pipeline/data/fsq_data/{file_name}_by_region.parquet'.
    Parameters:
        selected_regions_lst (list): list of states, provinces, territories, etc. 
                                    Abbreviations are used in the countries: US, CA, AU, BR.
                                    Please check the FourSquare Places Dataset documentation for more specific details.
        fsq_release_date (string): FSQ release date, in the form "YYYY-MM-DD".
        file_name (string): name of file that the resulting fsq dataset will be saved as.

    Ex Usage:
    To generate (one time only):
    ny_df = generate_df_from_fsq_by_region(['NY'], fsq_release_date = "2025-06-10", file_name = 'ny_fsq_pois')
    To retrieve parquet on future runs:
    ny_df = pd.read_parquet('/share/garg/accessgaps2024/fsq_dedup_pipeline/data/fsq_data/{file_name}_by_region.parquet')
    """
    assert isinstance(selected_regions_lst, list) and selected_regions_lst != []

    s3_path = f"s3://fsq-os-places-us-east-1/release/dt={fsq_release_date}/places/parquet/*.parquet"

    # Connect to DuckDB
    conn = duckdb.connect()

    # Query POIs within Manhattan bounds
    copy_query = f"""
        COPY (
            SELECT *
            FROM read_parquet('{s3_path}')
            WHERE 
                region in {selected_regions_lst}
        ) TO '/share/garg/accessgaps2024/fsq_dedup_pipeline/data/fsq_data/{file_name}_by_region.parquet' (FORMAT PARQUET)
    """

    conn.execute(copy_query)

    # Load the result into a dataframe
    q = duckdb.connect()
    q.execute(f"SELECT * FROM '/share/garg/accessgaps2024/fsq_dedup_pipeline/data/fsq_data/{file_name}_by_region.parquet'")
    res = q.fetch_df()
    conn.close()
    q.close()
    return res


#### Per County

In [None]:
counties = gpd.read_file('/share/garg/accessgaps2024/fsq_dedup_pipeline/data/tl_2024_us_county/tl_2024_us_county.shp')
def generate_df_from_fsq_by_us_county(selected_county_geoids, fsq_release_date = "2025-06-10", file_name = 'gdf_fsq_pois'):
    """Produces a parquet file of the FSQ POI dataset within the US counties of [selected_county_geoids], 
    using the version published on [fsq_release_date].
    The file is saved at '/share/garg/accessgaps2024/fsq_dedup_pipeline/data/fsq_data/{file_name}_by_county.parquet'.
    Parameters:
        selected_us_county_geoids (list): list of US county geoids (5-digit strings). Please check Census.Gov documentation for more specific details.
        fsq_release_date (string): FSQ release date, in the form "YYYY-MM-DD".
        file_name (string): name of file that the resulting fsq dataset will be saved as.

    Ex Usage:
    To generate (one time only):
    nyc_df = generate_df_from_fsq_by_us_county(["36005", "36047", "36061", "36081", "36085"], fsq_release_date = "2025-06-10", file_name = 'nyc_fsq_pois')
    To retrieve parquet on future runs:
    nyc_df = pd.read_parquet('/share/garg/accessgaps2024/fsq_dedup_pipeline/data/fsq_data/{file_name}_by_county.parquet')
    """
    assert isinstance(selected_county_geoids, list) and selected_county_geoids != []
    # Filter to the specified counties by GEOID
    gdf = counties[counties["GEOID"].isin(selected_county_geoids)].to_crs("EPSG:4326")
    gdf_geom = unary_union(gdf['geometry'])

    # Get WKT (Well-Known Text) for DuckDB
    gdf_wkt = gdf_geom.wkt

    s3_path = f"s3://fsq-os-places-us-east-1/release/dt={fsq_release_date}/places/parquet/*.parquet"

    # Connect to DuckDB
    conn = duckdb.connect()
    conn.execute("INSTALL spatial;")
    conn.execute("LOAD spatial;")

    # Register the WKT as a geometry in DuckDB
    conn.execute("CREATE TEMP TABLE gdf_bounds AS SELECT ST_GeomFromText(?) AS geom", [gdf_wkt])

    # Now do the spatial filter using the geometry column
    query = f"""
    COPY (
        SELECT *
        FROM read_parquet('{s3_path}'),
            gdf_bounds
        WHERE ST_Contains(gdf_bounds.geom, ST_Point(longitude, latitude))
    ) TO '/share/garg/accessgaps2024/fsq_dedup_pipeline/data/{file_name}_by_county' (FORMAT PARQUET)
    """
    conn.execute(query)

    # Load the result into a dataframe
    q = duckdb.connect()
    q.execute(f"SELECT * FROM '/share/garg/accessgaps2024/fsq_dedup_pipeline/data/{file_name}_by_county.parquet'")
    res =  q.fetch_df()
    conn.close()
    q.close()
    return res

In [3]:
ne_df = pd.read_parquet("/share/garg/accessgaps2024/fsq_dedup_pipeline/data/fsq_data/ne_fsq_pois_by_region.parquet")
ri_df = ne_df[ne_df['region'] == "RI"]
ri_df

Unnamed: 0,fsq_place_id,name,latitude,longitude,address,locality,region,postcode,admin_region,post_town,...,facebook_id,instagram,twitter,fsq_category_ids,fsq_category_labels,placemaker_url,unresolved_flags,geom,bbox,dt
93,59f9bdc6419a9e1a5972c186,Vanderbilt Grill,,,41 Mary St,Newport,RI,02840,,,...,,,,,,https://foursquare.com/placemakers/review-plac...,[closed],,,2025-06-10
114,56306758498e711813e7aa0c,Ocean State Cesspool Inc,36.297775,-119.137405,,Exeter,RI,02822,,,...,,,,[545419b1498ea6ccd0202f58],[Business and Professional Services > Home Imp...,https://foursquare.com/placemakers/review-plac...,,b'\x00\x00\x00\x00\x01\xc0]\xc8\xcb?\x00\x00\x...,"{'xmin': -119.13740515708923, 'ymin': 36.29777...",2025-06-10
165,570f2f69cd10c8b767b23ee5,Bristol Painting,32.261207,-110.962965,,Bristol,RI,02809,,,...,,,,[5453de49498eade8af355881],[Business and Professional Services > Business...,https://foursquare.com/placemakers/review-plac...,,b'\x00\x00\x00\x00\x01\xc0[\xbd\xa1:\x00\x00\x...,"{'xmin': -110.96296548843384, 'ymin': 32.26120...",2025-06-10
200,68328ded7c29ab6d46642f13,"Regal Roofing (West Warwick, RI)",39.865506,-97.614599,294 Cowesett Ave #4,West Warwick,RI,02893,,,...,,,,[63be6904847c3692a84b9b61],[Business and Professional Services > Home Imp...,https://foursquare.com/placemakers/review-plac...,,b'\x00\x00\x00\x00\x01\xc0XgU\x95h\x00\x00@C\x...,"{'xmin': -97.61459860950708, 'ymin': 39.865505...",2025-06-10
201,53993521498ec5085901e4f2,Proper Cleaning LLC,40.145289,-98.217773,,Cranston,RI,02910,,,...,,,,[4bf58dd8d48988d124941735],[Business and Professional Services > Office],https://foursquare.com/placemakers/review-plac...,,b'\x00\x00\x00\x00\x01\xc0X\x8d\xf0\x00\x00\x0...,"{'xmin': -98.2177734375, 'ymin': 40.1452892956...",2025-06-10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1233297,51ebe4ea454a1e96acd3b0e8,Garage Door Rhode Island,52.268431,-113.770574,162 Carpenter St,Providence,RI,02903,,,...,,,,[4d954b06a243a5684965b473],[Community and Government > Residential Buildi...,https://foursquare.com/placemakers/review-plac...,,"b'\x00\x00\x00\x00\x01\xc0\\qQ\x15\xcd\x08%@J""...","{'xmin': -113.77057404539717, 'ymin': 52.26843...",2025-06-10
1233298,51efa882ccda066c9a4af45a,Rhode Island Garage Door Repair,56.762734,-111.432257,218 Chalkstone Ave,Providence,RI,02908,,,...,,,,[4d954b06a243a5684965b473],[Community and Government > Residential Buildi...,https://foursquare.com/placemakers/review-plac...,,b'\x00\x00\x00\x00\x01\xc0[\xdb\xaa\x1b\r\t @L...,"{'xmin': -111.43225742600453, 'ymin': 56.76273...",2025-06-10
1233316,569e005e498e49dfe28e1b94,Modern Home Improvements,45.676453,-118.755447,2811 NE Riverside,Pendleton,RI,97801,,,...,,,,[545419b1498ea6ccd0202f58],[Business and Professional Services > Home Imp...,https://foursquare.com/placemakers/review-plac...,,b'\x00\x00\x00\x00\x01\xc0]\xb0Y>f\xaf\xcf@F\x...,"{'xmin': -118.75544700649264, 'ymin': 45.67645...",2025-06-10
1233322,568d0c1e498e88f9764cd3c5,Sherry Dress LDM CPM,44.331131,-118.948799,25198 Hwy. 395 S,Canyon City,RI,97820,,,...,7.959868e+14,,,[4f4532974b9074f6e4fb0104],[Business and Professional Services > Child Ca...,https://foursquare.com/placemakers/review-plac...,,b'\x00\x00\x00\x00\x01\xc0]\xbc\xb9 \x00\x00\x...,"{'xmin': -118.94879913330078, 'ymin': 44.33113...",2025-06-10


In [None]:
def token_distribution_by_geohash(df, geohash_precision=5):
    df = df.copy()
    
    # Vectorized geohash encoding using zip and list comprehension
    df['geohash'] = [geohash.encode(lat, lon, precision=geohash_precision) 
                     for lat, lon in zip(df['latitude'], df['longitude'])]
    
    # Tokenize names
    df['tokens'] = df['name'].apply(tokenize)  # assumes list of strings

    print(f"Total POIs: {len(df)}")
    
    # Flatten all tokens for global count
    global_token_counts = Counter(token for tokens in df['tokens'] for token in tokens)
    
    # Group by geohash
    geo_token_counts = defaultdict(Counter)
    geo_token_sets = defaultdict(set)

    for geo, tokens in zip(df['geohash'], df['tokens']):
        geo_token_counts[geo].update(tokens)
        geo_token_sets[geo].update(tokens)

    # Now compile final stats
    rows = []
    for token, global_count in global_token_counts.items():
        n_regions = sum(1 for geo in geo_token_counts if token in geo_token_counts[geo])
        # max_local_count = max(
        #     geo_token_counts[geo][token] for geo in geo_token_counts if token in geo_token_counts[geo]
        # )
        rows.append({
            'token': token,
            'global_count': global_count,
            'n_regions': n_regions
            # 'max_local_count': max_local_count
        })

    token_df = pd.DataFrame(rows)
    n_unique_regions = df['geohash'].nunique()
    
    return n_unique_regions, token_df.sort_values(by='global_count', ascending=False)

In [20]:
ny_df = pd.read_parquet("/share/garg/accessgaps2024/fsq_dedup_pipeline/data/fsq_data/ny_fsq_pois_by_region.parquet")
print("og len: " + str(len(ny_df)))
ny_df = ny_df[(ny_df['latitude'].notna()) & (ny_df['longitude'].notna()) & (ny_df['region'] == 'NY')]
n_regions, token_df = token_distribution_by_geohash(ny_df, geohash_precision=7)
token_df

og len: 1499730


NameError: name 'geohash' is not defined

In [None]:
lbled_10005 = pd.read_parquet("/share/garg/accessgaps2024/fsq_dedup_pipeline/results/fqs_labelled_10005")
dups = lbled_10005[lbled_10005['isdup'] == True].sort_values(by = 'name')
groups = dups.groupby("resolved_fsq_id")[['name']].transform(lambda x: ", ".join(x)).reset_index()
for name in groups['name']:
    print(name)

10 Hanover Rooftop, 10 Hanover Square 9th Floor, 11 Hanover Square
10 Hanover Rooftop, 10 Hanover Square 9th Floor, 11 Hanover Square
10 Hanover Rooftop, 10 Hanover Square 9th Floor, 11 Hanover Square
15 William Basketball Court, 15 William Gym, 15 William Pool
15 William Basketball Court, 15 William Gym, 15 William Pool
15 William Basketball Court, 15 William Gym, 15 William Pool
18 Broad Street, 20 Broad Street
18 Broad Street, 20 Broad Street
20 Exchange Place Lounge, Twenty Exchange
20 Pine Gym, 20 Pine Pool / Sauna, 20 Pine Rooftop, 20 Pine Steam Room, Athletic Center At 20 Pine St
20 Pine Gym, 20 Pine Pool / Sauna, 20 Pine Rooftop, 20 Pine Steam Room, Athletic Center At 20 Pine St
20 Pine Gym, 20 Pine Pool / Sauna, 20 Pine Rooftop, 20 Pine Steam Room, Athletic Center At 20 Pine St
20 Pine Gym, 20 Pine Pool / Sauna, 20 Pine Rooftop, 20 Pine Steam Room, Athletic Center At 20 Pine St
28 Liberty Street, The Liberty Plaza Apartments
37 Wall Street Roof Deck, 45 Wall Roof Deck
37 Wall 

In [None]:
MIN_LAT = 42.2279
MAX_LAT = 42.4008
MIN_LON = -71.1912
MAX_LON = -70.9860

boston_df = generate_df_from_bb(MIN_LON, MAX_LON, MIN_LAT, MAX_LAT, fsq_release_date = "2025-06-10", file_name = 'ma_fsq_pois')
boston_gdf = gpd.GeoDataFrame(boston_df, geometry='geometry', crs="EPSG:4326")
boston_gdf['date_created'] = pd.to_datetime(boston_gdf['date_created'], errors='coerce')
boston_gdf['date_closed'] = pd.to_datetime(boston_gdf['date_closed'], errors='coerce')
boston_dedup, boston_lbled = await deduplicate(nyc_child, max_distance = 100, name_similarity_threshold = 90, precision = 7, blacklist = NYC_BLACKLIST, file_name = '/share/garg/accessgaps2024/fsq_dedup_pipeline/results/childcare_test')


: 