In [None]:
from datasets import load_dataset, load_dataset_builder
import pandas as pd

import os
import time
import numpy as np
from pathlib import Path # makes paths more readable

In [3]:
# load foursquare open places dataset builder to inspect it
ds_builder = load_dataset_builder("foursquare/fsq-os-places", "places")

Resolving data files:   0%|          | 0/100 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/100 [00:00<?, ?it/s]

In [4]:
# inspect dataset features
ds_builder.info.features

{'fsq_place_id': Value('string'),
 'name': Value('string'),
 'latitude': Value('float64'),
 'longitude': Value('float64'),
 'address': Value('string'),
 'locality': Value('string'),
 'region': Value('string'),
 'postcode': Value('string'),
 'admin_region': Value('string'),
 'post_town': Value('string'),
 'po_box': Value('string'),
 'country': Value('string'),
 'date_created': Value('string'),
 'date_refreshed': Value('string'),
 'date_closed': Value('string'),
 'tel': Value('string'),
 'website': Value('string'),
 'email': Value('string'),
 'facebook_id': Value('int64'),
 'instagram': Value('string'),
 'twitter': Value('string'),
 'fsq_category_ids': List(Value('string')),
 'fsq_category_labels': List(Value('string')),
 'placemaker_url': Value('string'),
 'unresolved_flags': List(Value('string')),
 'geom': Value('binary'),
 'bbox': {'xmin': Value('float64'),
  'ymin': Value('float64'),
  'xmax': Value('float64'),
  'ymax': Value('float64')}}

In [5]:
# us and canada ('country' column)
countries_include = ['US', 'CA']

# countries to exclude ('country' column)
countries_exclude = [
    "AF", "AX", "AL", "DZ", "AS", "AD", "AO", "AI", "AQ", "AG", "AR", "AM", "AW", "AU", "AT", "AZ",
    "BS", "BH", "BD", "BB", "BY", "BE", "BZ", "BJ", "BM", "BT", "BO", "BQ", "BA", "BW", "BV", "BR",
    "IO", "BN", "BG", "BF", "BI", "CV", "KH", "CM", "KY", "CF", "TD", "CL", "CN", "CX", "CC", "CO",
    "KM", "CG", "CD", "CK", "CR", "CI", "HR", "CU", "CW", "CY", "CZ", "DK", "DJ", "DM", "DO", "EC",
    "EG", "SV", "GQ", "ER", "EE", "SZ", "ET", "FK", "FO", "FJ", "FI", "FR", "GF", "PF", "TF", "GA",
    "GM", "GE", "DE", "GH", "GI", "GR", "GL", "GD", "GP", "GU", "GT", "GG", "GN", "GW", "GY", "HT",
    "HM", "VA", "HN", "HK", "HU", "IS", "IN", "ID", "IR", "IQ", "IE", "IM", "IL", "IT", "JM", "JP",
    "JE", "JO", "KZ", "KE", "KI", "KP", "KR", "KW", "KG", "LA", "LV", "LB", "LS", "LR", "LY", "LI",
    "LT", "LU", "MO", "MG", "MW", "MY", "MV", "ML", "MT", "MH", "MQ", "MR", "MU", "YT", "MX", "FM",
    "MD", "MC", "MN", "ME", "MS", "MA", "MZ", "MM", "NA", "NR", "NP", "NL", "NC", "NZ", "NI", "NE",
    "NG", "NU", "NF", "MK", "MP", "NO", "OM", "PK", "PW", "PS", "PA", "PG", "PY", "PE", "PH", "PN",
    "PL", "PT", "PR", "QA", "RE", "RO", "RU", "RW", "BL", "SH", "KN", "LC", "MF", "PM", "VC", "WS",
    "SM", "ST", "SA", "SN", "RS", "SC", "SL", "SG", "SX", "SK", "SI", "SB", "SO", "ZA", "GS", "SS",
    "ES", "LK", "SD", "SR", "SJ", "SE", "CH", "SY", "TW", "TJ", "TZ", "TH", "TL", "TG", "TK", "TO",
    "TT", "TN", "TR", "TM", "TC", "TV", "UG", "UA", "AE", "GB", "UY", "UZ", "VU", "VE", "VN", "VG",
    "VI", "WF", "EH", "YE", "ZM", "ZW", "UM",
]

# us states ('region' column)
us_states_include = [
    "AL", "AK", "AZ", "AR", "AS", "CA", "CO", "CT", "DE", "DC", "FL", "GA", "GU", "HI", "ID", "IL", "IN",
    "IA", "KS", "KY", "LA", "ME", "MD", "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", "NM", 
    "NY", "NC", "ND", "MP", "OH", "OK", "OR", "PA", "PR", "RI", "SC", "SD", "TN", "TX", "TT", "UT", "VT", 
    "VA", "VI", "WA", "WV", "WI", "WY",
]

# canadian provinces ('region' column)
ca_provinces_include = [
    "NL", "PE", "NS", "NB", "QC", "ON", "MB", "SK", "AB", "BC", "YT", "NT", "NU",
]



In [12]:
# fsq chinese restaurant category ids
# wondering if some restaurants are just labeled "chinese" with no subcategory


chinese_category_ids = [
    "4bf58dd8d48988d145941735",  # Chinese Restaurant (main category)
    "52af3a723cf9994f4e043bec",  # Beijing Restaurant
    "52af3a7c3cf9994f4e043bed",  # Cantonese Restaurant
    "58daa1558bbb0b01f18ec1d3",  # Cha Chaan Teng
    "52af3a673cf9994f4e043beb",  # Chinese Aristocrat Restaurant
    "52af3a903cf9994f4e043bee",  # Chinese Breakfast Restaurant
    "4bf58dd8d48988d1f5931735",  # Dim Sum Restaurant
    "52af3a9f3cf9994f4e043bef",  # Dongbei Restaurant
    "52af3aaa3cf9994f4e043bf0",  # Fujian Restaurant
    "52af3ab53cf9994f4e043bf1",  # Guizhou Restaurant
    "52af3abe3cf9994f4e043bf2",  # Hainan Restaurant
    "52af3ac83cf9994f4e043bf3",  # Hakka Restaurant
    "52af3ad23cf9994f4e043bf4",  # Henan Restaurant
    "52af3add3cf9994f4e043bf5",  # Hong Kong Restaurant
    "52af3af23cf9994f4e043bf7",  # Huaiyang Restaurant
    "52af3ae63cf9994f4e043bf6",  # Hubei Restaurant
    "52af3afc3cf9994f4e043bf8",  # Hunan Restaurant
    "52af3b053cf9994f4e043bf9",  # Imperial Restaurant
    "52af3b213cf9994f4e043bfa",  # Jiangsu Restaurant
    "52af3b293cf9994f4e043bfb",  # Jiangxi Restaurant
    "52af3b343cf9994f4e043bfc",  # Macanese Restaurant
    "52af3b3b3cf9994f4e043bfd",  # Manchu Restaurant
    "52af3b463cf9994f4e043bfe",  # Peking Duck Restaurant
    "52af3b633cf9994f4e043c01",  # Shaanxi Restaurant
    "52af3b513cf9994f4e043bff",  # Shandong Restaurant
    "52af3b593cf9994f4e043c00",  # Shanghai Restaurant
    "52af3b6e3cf9994f4e043c02",  # Shanxi Restaurant
    "52af3b773cf9994f4e043c03",  # Szechuan Restaurant
    "52af3b813cf9994f4e043c04",  # Taiwanese Restaurant
    "52af3b893cf9994f4e043c05",  # Tianjin Restaurant
    "52af3b913cf9994f4e043c06",  # Xinjiang Restaurant
    "52af3b9a3cf9994f4e043c07"   # Yunnan Restaurant
]


In [None]:
# load full(!!!!!) dataset (11GB)
dataset = load_dataset("foursquare/fsq-os-places", "places", split="train")

Resolving data files:   0%|          | 0/100 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/100 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/88 [00:00<?, ?it/s]

In [None]:
# extract all the chinese restaurants in us and canada

# based on country (country code) or region (state/province) variables
# include chinese restaurants with null country or region just in case

# Create output directory
output_dir = Path("chinese_restaurants_filtered")
output_dir.mkdir(exist_ok=True)

# Clear any existing files
for f in output_dir.glob("*.csv"):
    f.unlink()

batch_size = 500000
total_chinese_restaurants = 0

print("Starting batch processing with filtering...")

def is_chinese_restaurant(category_ids):
    """Check if any category ID matches Chinese restaurant categories"""
    try:
        # Check for null values
        if pd.isna(category_ids) or category_ids is None:
            return False
        
        # apparently 'fsq-category-ids' holds numpy arrays, not python lists
        # Convert numpy arrays to lists
        if isinstance(category_ids, np.ndarray):
            category_ids = category_ids.tolist()
        
        # Check if it's a list and contains Chinese restaurant categories
        if isinstance(category_ids, list):
            return any(cat in chinese_category_ids for cat in category_ids)
        return False
    except:
        # Fallback: if anything goes wrong, return False
        return False

def is_us_ca_location(country, region):
    # Check if location is in US/Canada, handling nulls properly
    # If both country and region are null, return True (we'll include these)
    if pd.isna(country) and pd.isna(region):
        return True
    
    # If country is in our include list
    if not pd.isna(country) and country in countries_include:
        return True
    
    # If country is in exclude list, return False
    if not pd.isna(country) and country in countries_exclude:
        return False
    
    # If region is a US state or Canadian province
    if not pd.isna(region):
        if region in us_states_include or region in ca_provinces_include:
            return True
    
    # If we have country but it's not in include/exclude lists, be conservative
    if not pd.isna(country):
        return False
    
    # If we only have region and it's not US/CA, return False
    if not pd.isna(region):
        return False
    
    # Default case: include if we can't determine (null country/region)
    return True

try:
    start_time = time.time()
    
    for i, df_batch in enumerate(dataset.to_pandas(batch_size=batch_size, batched=True)):
        print(f"Processing batch {i+1} with {len(df_batch)} rows...")
        
        if 'country' in df_batch.columns and 'fsq_category_ids' in df_batch.columns:
            # Get region column if it exists
            region_col = 'region' if 'region' in df_batch.columns else None
            
            # Apply location filtering
            if region_col:
                location_mask = df_batch.apply(
                    lambda row: is_us_ca_location(row['country'], row[region_col]), 
                    axis=1
                )
            else:
                location_mask = df_batch['country'].apply(
                    lambda x: is_us_ca_location(x, None)
                )
            
            location_filtered = df_batch[location_mask]
            print(f"  → {len(location_filtered)} rows after location filtering")
            
            # Apply Chinese restaurant filtering
            chinese_mask = location_filtered['fsq_category_ids'].apply(is_chinese_restaurant)
            filtered_batch = location_filtered[chinese_mask]
            
            if len(filtered_batch) > 0:
                output_file = output_dir / f"chinese_restaurants_batch_{i}.csv"
                filtered_batch.to_csv(output_file, index=False)
                total_chinese_restaurants += len(filtered_batch)
                print(f"  → Found {len(filtered_batch)} Chinese restaurants")
            else:
                print(f"  → No Chinese restaurants in this batch")
        else:
            print(f"  → Skipping batch - missing required columns")
            print(f"  → Available columns: {list(df_batch.columns)}")
        
        if (i + 1) % 10 == 0:
            elapsed = time.time() - start_time
            print(f"Completed {i + 1} batches in {elapsed:.1f} seconds")
            print(f"Total Chinese restaurants found so far: {total_chinese_restaurants}")
            
except Exception as e:
    print(f"Error processing batch {i + 1}: {e}")
    import traceback
    traceback.print_exc()

print(f"Processing complete. Total Chinese restaurants found: {total_chinese_restaurants}")


Starting corrected batch processing with proper filtering...
Processing batch 1 with 500000 rows...
  → 431893 rows after location filtering
  → Found 1469 Chinese restaurants
Processing batch 2 with 500000 rows...
  → 499999 rows after location filtering
  → Found 2064 Chinese restaurants
Processing batch 3 with 500000 rows...
  → 499998 rows after location filtering
  → Found 1015 Chinese restaurants
Processing batch 4 with 500000 rows...
  → 500000 rows after location filtering
  → Found 2495 Chinese restaurants
Processing batch 5 with 500000 rows...
  → 397211 rows after location filtering
  → Found 901 Chinese restaurants
Processing batch 6 with 500000 rows...
  → 499967 rows after location filtering
  → Found 1227 Chinese restaurants
Processing batch 7 with 500000 rows...
  → 67232 rows after location filtering
  → Found 109 Chinese restaurants
Processing batch 8 with 500000 rows...
  → 0 rows after location filtering
  → No Chinese restaurants in this batch
Processing batch 9 wi

In [19]:
# Combine all the CSV files into one

csv_files = sorted(Path(output_dir).glob("chinese_restaurants_batch_*.csv"))
dfs = []
for csv_file in csv_files:
    df = pd.read_csv(csv_file)
    dfs.append(df)

if dfs:
    combined_df = pd.concat(dfs, ignore_index=True)
    combined_output_file = Path(output_dir) / "chinese_restaurants_combined.csv"
    combined_df.to_csv(combined_output_file, index=False)
    print(f"Combined {len(csv_files)} CSV files into {combined_output_file}")
    print(f"Total rows in combined file: {len(combined_df)}")
else:
    print("No batch CSV files found to combine.")


Combined 61 CSV files into chinese_restaurants_filtered/chinese_restaurants_combined.csv
Total rows in combined file: 64361


In [22]:
chinese_restaurants = pd.read_csv(output_dir / "chinese_restaurants_combined.csv")
chinese_restaurants.head()

Unnamed: 0,fsq_place_id,name,latitude,longitude,address,locality,region,postcode,admin_region,post_town,...,email,facebook_id,instagram,twitter,fsq_category_ids,fsq_category_labels,placemaker_url,unresolved_flags,geom,bbox
0,4c8c4e92cf3ea1434a7af451,Lee's Chinese,39.444009,-123.804738,154 E Redwood Ave,Fort Bragg,CA,95437.0,,,...,info@bbbemail.org,162608400000000.0,,,['4bf58dd8d48988d145941735'],['Dining and Drinking > Restaurant > Asian Res...,https://foursquare.com/placemakers/review-plac...,,b'\x00\x00\x00\x00\x01\xc0^\xf3\x80\xd5\xb0\xd...,"{'xmin': -123.80473844785541, 'ymin': 39.44400..."
1,4f442d5819836ed00192b620,China Express,39.433506,-123.805474,660 S Main St,Fort Bragg,CA,95437.0,,,...,,,,,['4bf58dd8d48988d145941735'],['Dining and Drinking > Restaurant > Asian Res...,https://foursquare.com/placemakers/review-plac...,,b'\x00\x00\x00\x00\x01\xc0^\xf3\x8c\xe3\xdd\xf...,"{'xmin': -123.80547424960108, 'ymin': 39.43350..."
2,a194a71384e241d278559b60,Hill House Restaurant,39.310533,-123.79836,10701 Palette Dr,Mendocino,CA,95460.0,,,...,frontdesk@mendocinohotels.com,,,,['4bf58dd8d48988d145941735'],['Dining and Drinking > Restaurant > Asian Res...,https://foursquare.com/placemakers/review-plac...,,b'\x00\x00\x00\x00\x01\xc0^\xf3\x18T\x8a\x9b\x...,"{'xmin': -123.79836, 'ymin': 39.310533, 'xmax'..."
3,5dffd431f4a5f60008cfa1ce,Panda Express,39.130484,-123.198543,1236 Airport Park Blvd,Ukiah,CA,95482.0,,,...,,,,,['4bf58dd8d48988d145941735'],['Dining and Drinking > Restaurant > Asian Res...,https://foursquare.com/placemakers/review-plac...,,b'\x00\x00\x00\x00\x01\xc0^\xcc\xb4\xed|X;@C\x...,"{'xmin': -123.19854294913323, 'ymin': 39.13048..."
4,4f32482419836c91c7c8471b,New Dragon,39.140123,-123.205708,765 S State St,Ukiah,CA,95482.0,,,...,,111568100000000.0,,,['4bf58dd8d48988d145941735'],['Dining and Drinking > Restaurant > Asian Res...,https://foursquare.com/placemakers/review-plac...,,b'\x00\x00\x00\x00\x01\xc0^\xcd*Pt\xa0@@C\x91\...,"{'xmin': -123.20570765866523, 'ymin': 39.14012..."


In [28]:
chinese_restaurants.shape

(64361, 27)

In [26]:
chinese_restaurants.columns

Index(['fsq_place_id', 'name', 'latitude', 'longitude', 'address', 'locality',
       'region', 'postcode', 'admin_region', 'post_town', 'po_box', 'country',
       'date_created', 'date_refreshed', 'date_closed', 'tel', 'website',
       'email', 'facebook_id', 'instagram', 'twitter', 'fsq_category_ids',
       'fsq_category_labels', 'placemaker_url', 'unresolved_flags', 'geom',
       'bbox'],
      dtype='object')

In [None]:
# find all the rows with no country listed
country_null = chinese_restaurants[chinese_restaurants['country'].isna()]
print(country_null.shape)
country_null.head()

(13, 27)


Unnamed: 0,fsq_place_id,name,latitude,longitude,address,locality,region,postcode,admin_region,post_town,...,email,facebook_id,instagram,twitter,fsq_category_ids,fsq_category_labels,placemaker_url,unresolved_flags,geom,bbox
5180,57ba8b46498ed8856ca33475,Great Wei Restaurant,2.195557,102.237099,,,,,,,...,,,,,['4bf58dd8d48988d145941735'],['Dining and Drinking > Restaurant > Asian Res...,https://foursquare.com/placemakers/review-plac...,,"b'\x00\x00\x00\x00\x01@Y\x8f,\xa1H\xba\x84@\x0...","{'xmin': 102.237099, 'ymin': 2.195557, 'xmax':..."
7661,57c5e9ca498eea2cadb12bc1,RoomFree,43.45938,39.900918,,,,,,,...,,,,,['4bf58dd8d48988d145941735'],['Dining and Drinking > Restaurant > Asian Res...,https://foursquare.com/placemakers/review-plac...,,b'\x00\x00\x00\x00\x01@C\xf3QG\xf10Y@E\xba\xcc...,"{'xmin': 39.900918, 'ymin': 43.45938, 'xmax': ..."
7662,57c5e96d498ecbc36caea471,бар вок,43.458568,39.901039,,,,,,,...,,,,,['4bf58dd8d48988d145941735'],['Dining and Drinking > Restaurant > Asian Res...,https://foursquare.com/placemakers/review-plac...,,b'\x00\x00\x00\x00\x01@C\xf3U>\xf6\xb5\xd4@E\x...,"{'xmin': 39.901039, 'ymin': 43.458568, 'xmax':..."
8001,57bada85498e61797111af4a,эээ,59.87001,29.855366,,,,,,,...,,,,,['4bf58dd8d48988d145941735'],['Dining and Drinking > Restaurant > Asian Res...,https://foursquare.com/placemakers/review-plac...,,b'\x00\x00\x00\x00\x01@=\xda\xf9D$\x1c?@M\xef\...,"{'xmin': 29.855366, 'ymin': 59.87001, 'xmax': ..."
8002,57badab2498e4fd6c6a3d773,Япония,59.869533,29.854272,,,,,,,...,,,,,['4bf58dd8d48988d145941735'],['Dining and Drinking > Restaurant > Asian Res...,https://foursquare.com/placemakers/review-plac...,,b'\x00\x00\x00\x00\x01@=\xda\xb1\x91\xdd\xe3v@...,"{'xmin': 29.854272, 'ymin': 59.869533, 'xmax':..."


In [None]:
# remove rows with no country listed (13 rows, not gonna bother looking into them)
chinese_restaurants = chinese_restaurants[chinese_restaurants['country'].notna()]
chinese_restaurants.shape

# save new csv
chinese_restaurants.to_csv(output_dir / "chinese_restaurants_cleaned.csv", index=False)


In [31]:
# find all the rows with no region (state or province) listed
region_null = chinese_restaurants[chinese_restaurants['region'].isna()]
print(region_null.shape)
region_null.head()

(1640, 27)


Unnamed: 0,fsq_place_id,name,latitude,longitude,address,locality,region,postcode,admin_region,post_town,...,email,facebook_id,instagram,twitter,fsq_category_ids,fsq_category_labels,placemaker_url,unresolved_flags,geom,bbox
26,4bd881f40b779c74b95507a0,Fu Hing,39.520284,-122.193704,,,,,,,...,,,,,['4bf58dd8d48988d145941735'],['Dining and Drinking > Restaurant > Asian Res...,https://foursquare.com/placemakers/review-plac...,,b'\x00\x00\x00\x00\x01\xc0^\x8ce\xa5\x17\nO@C\...,"{'xmin': -122.19370391130472, 'ymin': 39.52028..."
43,4f5bb94ee4b01f8f4c30222c,Rice Wok,39.761965,-121.82338,,,,,,,...,,,,,['4bf58dd8d48988d145941735'],['Dining and Drinking > Restaurant > Asian Res...,https://foursquare.com/placemakers/review-plac...,,b'\x00\x00\x00\x00\x01\xc0^t\xb2@\x9c\xde\xb0@...,"{'xmin': -121.82337966269802, 'ymin': 39.76196..."
237,4f6b7272e4b0d8154c2fb5ef,Manchu Wok,38.269847,-121.948936,,,,,,,...,,,,,['4bf58dd8d48988d145941735'],['Dining and Drinking > Restaurant > Asian Res...,https://foursquare.com/placemakers/review-plac...,,"b'\x00\x00\x00\x00\x01\xc0^|\xbb^\xce[\xfb@C""\...","{'xmin': -121.94893617775226, 'ymin': 38.26984..."
391,4e79745852b1fc6a4ae6a494,MR. NOODLE,37.973042,-122.33957,,,,,,,...,,,,,['4bf58dd8d48988d145941735'],['Dining and Drinking > Restaurant > Asian Res...,https://foursquare.com/placemakers/review-plac...,,"b'\x00\x00\x00\x00\x01\xc0^\x95\xbb\x83\x03*,@...","{'xmin': -122.33956981, 'ymin': 37.97304232, '..."
488,4d55bdd1a05c3704ca8dbc87,Emmo's Kitchen,37.851761,-122.273421,,,,,,,...,,,,,['4bf58dd8d48988d145941735'],['Dining and Drinking > Restaurant > Asian Res...,https://foursquare.com/placemakers/review-plac...,,b'\x00\x00\x00\x00\x01\xc0^\x91\x7f\xbb\xa8DB@...,"{'xmin': -122.27342120582856, 'ymin': 37.85176..."


In [None]:
# count rows with no lat/long coords
print(len(chinese_restaurants[chinese_restaurants['latitude'].isna() | chinese_restaurants['longitude'].isna()]))

# answer is 0 rows, yay!

0


In [43]:
# count rows where 'date_closed' is null
open_restaurants = len(chinese_restaurants[chinese_restaurants['date_closed'].isna()])
print(f'open restaurants: {open_restaurants}')

open restaurants: 50712


In [42]:
total_restaurants = len(chinese_restaurants)
open_restaurants = len(chinese_restaurants[chinese_restaurants['date_closed'].isna()])

print(f'closed restaurants: {total_restaurants - open_restaurants}')

closed restaurants: 13636
