# Script for Generating Automatic Blacklist Based on a FSQ Dataset

In [1]:
from sklearn.neighbors import BallTree
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import re
import unicodedata
from collections import defaultdict, Counter
from shapely import wkt
from shapely.geometry import Polygon
from shapely.ops import unary_union
import pandas as pd
import geohash
import nltk
from nltk.corpus import stopwords
from shapely.geometry import Point

# Download stopwords once
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to /home/kl877/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Helper Functions

In [2]:
def tokenize(text):
    """Lowercase, remove punctuation, and split into tokens."""
    words = re.findall(r'\b[a-z]{2,}\b', text.lower())
    return words

def token_distribution_by_geohash(df, geohash_precision=5):
    """ 
    Helper function for [generate_blacklist].
    Produces a DataFrame storing the tokens of all names in [df], 
    along with the global count for each token and the number of regions it appears in.

    Returns the total number of regions the df is partitioned into and the token dataframe
    as a tuple, n_regions, token_df

    Parameters:
        df (FSQ POI DataFrame/GeoDataFrame): must have 'name', 'latitude' and 'longitude' columns.
        geohash_precision (int, optional): must be between 1 (inclusive) and 12 (inclusive). 
        Larger precisions mean smaller partitioning of grids in [df]. The default param is 5.
    """
    df = df.copy()
    
    # Vectorized geohash encoding using zip and list comprehension
    df['geohash'] = [geohash.encode(lat, lon, precision=geohash_precision) 
                     for lat, lon in zip(df['latitude'], df['longitude'])]
    
    # Tokenize names
    df['tokens'] = df['name'].apply(tokenize)  # assumes list of strings

    print(f"Total POIs: {len(df)}")
    
    # Flatten all tokens for global count
    global_token_counts = Counter(token for tokens in df['tokens'] for token in tokens)
    
    # Group by geohash
    geo_token_counts = defaultdict(Counter)
    geo_token_sets = defaultdict(set)

    for geo, tokens in zip(df['geohash'], df['tokens']):
        geo_token_counts[geo].update(tokens)
        geo_token_sets[geo].update(tokens)

    # Now compile final stats
    rows = []
    for token, global_count in global_token_counts.items():
        n_regions = sum(1 for geo in geo_token_counts if token in geo_token_counts[geo])
        # max_local_count = max(
        #     geo_token_counts[geo][token] for geo in geo_token_counts if token in geo_token_counts[geo]
        # )
        rows.append({
            'token': token,
            'global_count': global_count,
            'n_regions': n_regions
            # 'max_local_count': max_local_count
        })

    token_df = pd.DataFrame(rows)
    n_unique_regions = df['geohash'].nunique()
    
    return n_unique_regions, token_df.sort_values(by='global_count', ascending=False)

def generate_blacklist(gdf, geohash_precision=5, file_path = None):
    """ 
    Creates a list of globally common and widespread words found from names in [gdf].

    Parameters:
        gdf (GeoDataFrame): contains tokens and their global counts and number of regions appeared in 
        (look at token_distribution_by_geohash). Must have 'token', 'global_count', and 'region_coverage' columns.
        geohas_precision (int, optional): must be between 1 (inclusive) and 12 (inclusive). 
        Larger precisions mean smaller partitioning of grids in [df]. The default param is 5.
        file_path (string, optional): path to a txt file storing the blacklist, where each line is a single token.
    """
    n_regions, token_df = token_distribution_by_geohash(gdf, geohash_precision)
    print(f"Total unique geohashes: {n_regions}")
    token_df['global_pct'] = token_df['global_count'] / token_df['global_count'].max()
    token_df['region_coverage'] = token_df['n_regions'] / n_regions
    blacklist = token_df[(token_df['global_pct'] > 0.01) & (token_df['region_coverage'] > 0.005)]['token'].to_list()
    if file_path:
        # Open the file in write mode ('w')
        with open(file_path, 'w') as f:  
            for item in blacklist:
                # Write each item followed by a newline
                f.write(f"{item}\n") 
    return blacklist 

### Example Usage

In [None]:
ne_df = pd.read_parquet("/share/garg/accessgaps2024/fsq_dedup_pipeline/data/fsq_data/ne_fsq_pois_by_region.parquet")
ma_df = ne_df[(ne_df['latitude'].notna()) & (ne_df['longitude'].notna()) & (ne_df['region'] == "MA")]
ma_df['geometry'] = ma_df.apply(lambda row: Point(row['longitude'], row['latitude']), axis=1)
ma_gdf = gpd.GeoDataFrame(ma_df, geometry='geometry', crs="EPSG:4326")
ma_gdf['date_created'] = pd.to_datetime(ma_gdf['date_created'], errors='coerce')
ma_gdf['date_closed'] = pd.to_datetime(ma_gdf['date_closed'], errors='coerce')
print("ma len: " + str(len(ma_df)))

MIN_LAT = 42.2279
MAX_LAT = 42.4008
MIN_LON = -71.1912
MAX_LON = -70.9860

boston_df = ne_df[(ne_df['latitude'] >= MIN_LAT) & (ne_df['latitude'] <= MAX_LAT) & (ne_df['longitude'] >= MIN_LON) & (ma_df['longitude'] <= MAX_LON)]
print("bos len: " + str(len(boston_df)))

ri_df = ne_df[(ne_df['latitude'].notna()) & (ne_df['longitude'].notna()) & (ne_df['region'] == "RI")]
print("ri len: " + str(len(ri_df)))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ma_df['geometry'] = ma_df.apply(lambda row: Point(row['longitude'], row['latitude']), axis=1)


ma len: 545701
bos len: 112892
ri len: 79723


In [19]:
ri_df['geometry'] = ri_df.apply(lambda row: Point(row['longitude'], row['latitude']), axis=1)
ri_gdf = gpd.GeoDataFrame(ri_df, geometry='geometry', crs="EPSG:4326")
ri_gdf['date_created'] = pd.to_datetime(ri_gdf['date_created'], errors='coerce')
ri_gdf['date_closed'] = pd.to_datetime(ri_gdf['date_closed'], errors='coerce')
ri_gdf.to_parquet("/share/garg/accessgaps2024/fsq_dedup_pipeline/data/fsq_data/ri_fsq_pois")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ri_df['geometry'] = ri_df.apply(lambda row: Point(row['longitude'], row['latitude']), axis=1)


In [11]:
ri_blacklist = generate_blacklist(ri_df, geohash_precision = 7, file_name = '/share/garg/accessgaps2024/fsq_dedup_pipeline/ri_blklist_test')
ri_blacklist

Total POIs: 79723
Total unique geohashes: 23359


['the',
 'of',
 'and',
 'center',
 'at',
 'island',
 'co',
 'services',
 'service',
 'rhode',
 'auto',
 'school',
 'providence',
 'ri',
 'voting',
 'house',
 'inc',
 'newport',
 'salon',
 'office',
 'group',
 'home',
 'east',
 'restaurant',
 'st',
 'pizza',
 'new',
 'construction',
 'church',
 'park',
 'financial',
 'shop',
 'dr',
 'bank',
 'associates',
 'club',
 'bar',
 'company',
 'care',
 'law',
 'cafe',
 'room',
 'hall',
 'state',
 'llc',
 'street',
 'studio',
 'hill',
 'repair',
 'north',
 'beach',
 'farm',
 'ocean',
 'station',
 'insurance',
 'family',
 'warwick',
 'design',
 'store',
 'hair',
 'market',
 'england',
 'union',
 'south',
 'electric',
 'for',
 'blue',
 'fire',
 'atm',
 'on',
 'sales',
 'by',
 'health',
 'in',
 'john',
 'fitness',
 'west',
 'american',
 'medical',
 'bay',
 'inn',
 'america',
 'management',
 'community',
 'field',
 'building',
 'solutions',
 'grill',
 'kitchen',
 'car',
 'united',
 'bus',
 'ave',
 'town',
 'body',
 'spa',
 'department',
 'place',
 'u

In [51]:
# ne_df = pd.read_parquet("/share/garg/accessgaps2024/fsq_dedup_pipeline/data/fsq_data/ne_fsq_pois_by_region.parquet")
# print("og len: " + str(len(ne_df)))
# # ne_regions = ['MA', 'ME', 'NH', 'CT', 'VT', 'RI']
# ne_regions = ['MA']
# ne_df = ne_df[(ne_df['latitude'].notna()) & (ne_df['longitude'].notna()) & (ne_df['region'].isin(ne_regions))]
n_regions, token_df = token_distribution_by_geohash(boston_df, geohash_precision=7)
token_df

Total POIs: 112892


Unnamed: 0,token,global_count,n_regions
182,boston,6519,2372
103,the,5663,2802
312,of,3376,1921
251,center,2839,1574
1308,md,2668,183
...,...,...,...
42897,asure,1,1
42886,tcaa,1,1
42885,celerity,1,1
42884,adfiliare,1,1


In [28]:
token_df = pd.read_csv("/share/garg/accessgaps2024/fsq_dedup_pipeline/results/ma_blacklist_tokens")

In [56]:
token_df['global_pct'] = token_df['global_count'] / token_df['global_count'].max()
token_df['region_coverage'] = token_df['n_regions'] / n_regions
blacklist_tokens = token_df[(token_df['global_pct'] > 0.01) & (token_df['region_coverage'] > 0.005)]
# blacklist_tokens = blacklist_tokens.drop('Unnamed: 0', axis = 1)
blacklist_tokens

Unnamed: 0,token,global_count,n_regions,global_pct,region_coverage
182,boston,6519,2372,1.000000,0.219305
103,the,5663,2802,0.868692,0.259061
312,of,3376,1921,0.517871,0.177607
251,center,2839,1574,0.435496,0.145525
1308,md,2668,183,0.409265,0.016919
...,...,...,...,...,...
1311,cpa,66,59,0.010124,0.005455
7698,language,66,61,0.010124,0.005640
2254,anthony,66,61,0.010124,0.005640
862,coach,66,59,0.010124,0.005455


In [57]:
for i, row in blacklist_tokens.iterrows():
    print(row['token'] + ", " + str(row['global_count']) + ", " + str(row['n_regions']))

boston, 6519, 2372
the, 5663, 2802
of, 3376, 1921
center, 2839, 1574
md, 2668, 183
and, 2410, 1607
st, 1890, 1240
at, 1879, 1163
street, 1822, 1236
services, 1313, 916
group, 1302, 811
flight, 1269, 180
office, 1243, 765
co, 1223, 945
park, 1191, 905
school, 1182, 829
bos, 1126, 170
cafe, 1085, 773
house, 1084, 846
mbta, 1044, 738
service, 1025, 862
cambridge, 1023, 490
bank, 935, 575
inc, 932, 638
bus, 891, 692
square, 882, 543
for, 869, 628
room, 860, 546
new, 853, 682
associates, 834, 575
to, 778, 409
law, 777, 380
salon, 775, 537
market, 733, 565
hall, 732, 424
shop, 730, 579
bar, 720, 508
ave, 713, 503
club, 712, 575
restaurant, 710, 502
harvard, 679, 288
company, 673, 540
auto, 669, 525
health, 660, 481
church, 652, 518
studio, 651, 507
financial, 631, 389
care, 622, 491
hill, 614, 425
building, 614, 401
station, 613, 503
united, 602, 332
design, 588, 469
management, 574, 408
stop, 573, 479
pizza, 569, 456
university, 556, 250
hair, 556, 418
home, 553, 456
city, 551, 450
college,

In [9]:
ne_global_blacklist = ne_global_rows['token'].to_list
file_name = "output.txt"
with open("/share/garg/accessgaps2024/fsq_dedup_pipeline/data/ne_blacklist.txt", 'w') as f:  # Open the file in write mode ('w')
    for item in ne_global_blacklist:
        f.write(f"{item}\n")  # Write each item followed by a newline


KeyboardInterrupt: 

In [29]:
man_pois = pd.read_parquet("/share/garg/accessgaps2024/fsq_dedup_pipeline/data/fsq_data/man_fsq_pois")
blacklist = generate_blacklist(man_pois, geohash_precision = 7, file_path = "/share/garg/accessgaps2024/fsq_dedup_pipeline/data/blacklists/manhattan_blacklist")
blacklist

Total POIs: 396948
Total unique geohashes: 8190


['the',
 'nyc',
 'taxi',
 'new',
 'of',
 'md',
 'street',
 'york',
 'and',
 'st',
 'center',
 'at',
 'park',
 'group',
 'inc',
 'office',
 'dr',
 'west',
 'studio',
 'ave',
 'ny',
 'east',
 'bus',
 'bar',
 'cafe',
 'co',
 'hq',
 'restaurant',
 'shop',
 'city',
 'avenue',
 'services',
 'for',
 'room',
 'salon',
 'deli',
 'manhattan',
 'house',
 'mta',
 'in',
 'food',
 'club',
 'school',
 'on',
 'party',
 'to',
 'spa',
 'gallery',
 'by',
 'service',
 'associates',
 'pizza',
 'lounge',
 'media',
 'design',
 'law',
 'llc',
 'management',
 'broadway',
 'bank',
 'coffee',
 'cab',
 'international',
 'la',
 'market',
 'store',
 'art',
 'health',
 'parking',
 'day',
 'studios',
 'up',
 'company',
 'central',
 'world',
 'care',
 'station',
 'square',
 'hair',
 'capital',
 'kitchen',
 'gym',
 'american',
 'cleaners',
 'hotel',
 'brooklyn',
 'medical',
 'financial',
 'church',
 'beauty',
 'usa',
 'cart',
 'america',
 'repair',
 'com',
 'grill',
 'express',
 'fitness',
 'dental',
 'de',
 'grocery',

In [34]:
GLOBAL_FOOD = {"pizza", "pizzeria", "cafe", "caffe", "coffee", "grill", "restaurant", "bar", "deli", "bakery", "express",
                  "market", "shop", "mart", "store", "grocery", "supermarket", "food", "gourmet", "cart", "fresh", "kitchen", 
                  "diner", "pub", "bistro", "tavern", "farm", "chicken", "burger", "sandwich", "taco", "tacos", "sushi", "noodle", "noodles"
                  "salad", "sub", "ice cream", "dessert", "breakfast", "brunch", "lunch", "dinner", "takeout", "delivery", 
                  "snack", "chocolate", "tea", "juice", "smoothie", "wine", "beer", "cocktail", "brewery", "distillery", 
                  "winery", "patisserie", "pastry", "bagel", "donut", "pancake", "waffle", "crepe", "treat", "cuisine", "truck",
                  "italian", "mexican", "chinese", "japanese", "korean", "indian", "thai", "vietnamese", "greek", "spanish",
                  "french", "american", "cuban", "cajun", "creole", "soul food", "bbq", "steakhouse", "seafood",
                  "vegetarian", "vegan", "gluten-free", "organic", "local", "artisan", "handmade", "craft", "homemade",
                  "family-owned", "authentic", "traditional", "fusion", "gastro", "gastropub", "halal", "kosher"}
GLOBAL_TRANS_ADDR = {"east", "west", "north", "south", "st", "street", "ave", "avenue", 
                 "blvd", "road", "rd", "rd.", "drive", "cab", "car", "truck", "van", "taxi", "metro", "sub", "subway", "mta", "station", "apt", "apartment", "station", "corner", "bus", "express", "line", "line", "plaza", "plz", "square", "sq", "lane", "ln", "way", "wy", "court", "ct",
                 "park", "pl", "pkwy", "parkway", "circle", "cir", "highway", "hwy", "route", "rte", "exit", "exit", "bridge", "bridges", "crossing", "crossings",
                 "crossings", "intersection", "intersections", "boulevard", "boulevards", "roadway", "roadways", "driveway", "driveways","avenue", "avenues", "streetway"}
GLOBAL_LOC = {"city", "village", "town", "museum", "group", "house", "center", "ctr", "art", "shop", "show", "theatre", "theater", "office", "service", "services", "bank", "jewelry", "club", 
              "community", "garden", "park", "field", "beach", "ocean", "river", "playground", "school", "college", "university", "library", "gallery", "studio", "hall", "auditorium", "venue", "church"
              "cleaner", "cleaners", "laundry", "laundromt", "pharmacy", "church", "clinic", "gym", "hospital", "fitness", "nail", "nails", "salon", "spa", "barber", "project", "projects"}
GLOBAL_WORDS = {"the", "a", "an", "and", "&", "of", "in", "for", "to", "at", "@", "on", "out", "with", "by", "from", "as", "that", "this", "it", "is", "was", "be", "are", "day", "care", "co."}
GLOBAL_SCHOOL_WRDS = {"high", "middle", "elementary", "school", "academy", "charter", "magnet", "daycare", "day", "care"}
GLOBAL_FIRE_STATION = {"fdny", "engine", "ems", "rescue", "group"}
GLOBAL_POLICE_STATION = {"nypd"}
LOCAL_PHRASES_NYC = {"new", "york", "nyc", "manhattan", "brooklyn", "queens", "bronx", "staten", "island", "ny", "city", "upper", "lower", "side", "marks"}
COMMON_PHRASES = GLOBAL_FOOD.union(GLOBAL_TRANS_ADDR).union(GLOBAL_LOC).union(GLOBAL_WORDS).union(LOCAL_PHRASES_NYC).union(GLOBAL_SCHOOL_WRDS).union(GLOBAL_FIRE_STATION).union(GLOBAL_POLICE_STATION)

# Neighborhood and local area terms to ignore during name normalization
LOCAL_AREAS_NYC = [

    # Lower Manhattan / East Side
    "alphabet", "lower", "west", "north", "east", "south", "street", "side", "les",
    "two", "bridges", "chinatown", "nolita", "soho", "noho", "little", "italy",
    "bowery", "seaport", "civic", "center", "marks", "wall", "financial", "tribeca", "fidi", "delancey",
    "clinton", "canal",
    
    # Midtown / Gramercy / Chelsea
    "gramercy", "flatiron", "murray", "midtown",
    "koreatown", "garment", "district", "nomad", "chelsea", "hell", "hells",
    "hudson", "yards", "theater", "times", "square", "rockefeller", "kips", "turtle", "bay", "herald", "penn", "station", "empire",
    
    # Upper Manhattan
    "upper", "harlem", "spanish", "heights",
    "morningside", "hamilton", "inwood", "washington","manhattenville", "sugar", "hill", "dyckman", "fort", "george", "columbia",
    
    # Downtown / River Areas / Parks
    "tompkins", "river", "park",
    "stuytown", "stuyvesant", "oval", "union",
    "madison", "bryant", "central", "battery", "riverside", "fdr", "drive",
    
    # Outer Boroughs or Bordering Areas
    "brooklyn", "queens", "bushwick", "greenpoint", "williamsburg", "bed-stuy",
    "dumbo", "long island city", "astoria", "ridgewood",
]

NYC_BLACKLIST = COMMON_PHRASES.union(LOCAL_AREAS_NYC)

print(set(blacklist) - set(LOCAL_AREAS_NYC))
print(set(LOCAL_AREAS_NYC).intersection(set(blacklist)))
# print(set(blacklist) - set(NYC_BLACKLIST))

{'pain', 'state', 'digital', 'cuisine', 'tavern', 'american', 'party', 'gallery', 'spot', 'network', 'starbucks', 'academy', 'college', 'clinic', 'voting', 'fund', 'casa', 'cornell', 'nj', 'architects', 'dental', 'of', 'diamond', 'downtown', 'nicholas', 'off', 'tree', 'pizza', 'gym', 'brown', 'cpa', 'international', 'repair', 'art', 'joseph', 'group', 'great', 'metro', 'studio', 'tech', 'house', 'laundry', 'ny', 'manhattan', 'space', 'partners', 'unisex', 'hotel', 'village', 'anthony', 'library', 'headquarters', 'limo', 'technology', 'production', 'one', 'nyc', 'ii', 'technologies', 'richard', 'roof', 'professional', 'stanley', 'liquors', 'conference', 'lincoln', 'sports', 'david', 'laundromat', 'news', 'candy', 'ps', 'dance', 'association', 'inc', 'program', 'phd', 'wine', 'lounge', 'cohen', 'third', 'beach', 'in', 'citi', 'pnc', 'french', 'magazine', 'construction', 'lexington', 'training', 'nations', 'water', 'museum', 'mortgage', 'farm', 'records', 'parade', 'amsterdam', 'john', 's