In [None]:
import pandas as pd
import re
import requests
import time
import os

# Credentials: the Yelp API key is read from the environment rather than
# being stored in the source; the script refuses to start without it.
API_KEY = os.environ.get("YELP_API_KEY")
if API_KEY is None or API_KEY == "":
    raise ValueError("YELP_API_KEY environment variable is not set.")

# Request configuration shared by every API call.
HEADERS = {"Authorization": "Bearer " + API_KEY}

# Search scope: one query per cuisine, all within a single location.
CUISINES = [
    "Chinese",
    "Italian",
    "Mexican",
    "Indian",
    "Japanese",
    "Thai",
]
LOCATION = "Manhattan, NY"

# Destination file for the collected rows.
OUTPUT_CSV = "manhattan_restaurants.csv"

def fetch_restaurants(cuisine, offset):
    """Fetch one page of restaurants for a cuisine from the Yelp search API.

    Args:
        cuisine: Cuisine name; the search term sent is "<cuisine> restaurant".
        offset: Number of results to skip (pagination offset).

    Returns:
        The list stored under the response's "businesses" key. An empty list
        is returned when the request cannot be completed, the server answers
        with a non-200 status, or the body is not valid JSON; in each of
        those cases a one-line message is printed.
    """
    url = "https://api.yelp.com/v3/businesses/search"
    params = {
        "term": f"{cuisine} restaurant",
        "location": LOCATION,
        "limit": 50,  # max per request
        "offset": offset,
    }

    try:
        # Without a timeout a stalled connection would hang the whole run.
        response = requests.get(url, headers=HEADERS, params=params, timeout=10)
    except requests.RequestException as exc:
        # Treat transport failures like HTTP errors: report and return an
        # empty batch so the caller can keep what it has collected so far.
        print(f"Request failed for {cuisine} offset {offset}: {exc}")
        return []

    if response.status_code != 200:
        print(f"Error {response.status_code} for {cuisine} offset {offset}")
        return []

    try:
        payload = response.json()
    except ValueError:
        # requests raises a ValueError subclass when the body is not JSON.
        print(f"Invalid JSON for {cuisine} offset {offset}")
        return []

    # "or []" also covers an explicit null under the "businesses" key.
    return payload.get("businesses") or []

def extract_zip(address):
    """Extract the 5-digit ZIP code from a US-style address string.

    The ZIP code sits at the end of an address, so the last standalone
    5-digit number is returned; this avoids mistaking a 5-digit house number
    at the start of the address for the ZIP code.

    Args:
        address: Address text, e.g. "40 W 56th St, New York, NY 10019".
            May be empty or None.

    Returns:
        The ZIP code as a string, or "" when the address is empty/None or
        contains no standalone 5-digit number.
    """
    if not address:
        return ""
    candidates = re.findall(r'\b\d{5}\b', address)
    return candidates[-1] if candidates else ""

# Main Script
# For every cuisine, page through the search results until enough unique
# restaurants are collected, then write all rows to OUTPUT_CSV and print a
# short preview.
PAGE_SIZE = 50  # must match the "limit" sent by fetch_restaurants
TARGET_PER_CUISINE = 200  # rows kept per cuisine
COLUMNS = [
    "id", "name", "cuisine", "rating", "review_count", "price",
    "address", "zip_code", "latitude", "longitude", "phone", "url",
]

all_data = []
seen_ids = set()  # IDs of businesses already stored, shared across cuisines

for cuisine in CUISINES:
    print(f"\nFetching {cuisine} restaurants...")
    collected = []
    offset = 0

    while len(collected) < TARGET_PER_CUISINE:
        businesses = fetch_restaurants(cuisine, offset)
        if not businesses:
            # An empty batch means either no more results or a failed
            # request (fetch_restaurants reports failures as an empty list).
            break

        for b in businesses:
            # Stop at the target so that only businesses that are actually
            # kept get marked as seen; anything left over stays available
            # to the cuisines processed later.
            if len(collected) >= TARGET_PER_CUISINE:
                break

            business_id = b.get("id")
            if business_id is None or business_id in seen_ids:
                continue

            # These sub-objects may be missing or null in a response;
            # fall back to empty dicts instead of raising.
            location = b.get("location") or {}
            coordinates = b.get("coordinates") or {}
            address_str = ", ".join(location.get("display_address") or [])

            collected.append({
                "id": business_id,
                "name": b.get("name"),
                "cuisine": cuisine,
                "rating": b.get("rating"),
                "review_count": b.get("review_count"),
                "price": b.get("price"),
                "address": address_str,
                "zip_code": extract_zip(address_str),
                "latitude": coordinates.get("latitude"),
                "longitude": coordinates.get("longitude"),
                "phone": b.get("display_phone"),
                "url": b.get("url"),
            })
            seen_ids.add(business_id)

        offset += PAGE_SIZE
        time.sleep(0.3)  # to avoid rate limits

    all_data.extend(collected)
    print(f"Collected {len(collected)} {cuisine} restaurants.")

# Save to CSV
# Passing the column list keeps the header (and the preview below) valid
# even when no rows were collected.
df = pd.DataFrame(all_data, columns=COLUMNS)
df.to_csv(OUTPUT_CSV, index=False)
print(f"Data saved to {OUTPUT_CSV}")

print(df[['name', 'cuisine', 'address', 'zip_code']].head())



Fetching Chinese restaurants...
Error 400 for Chinese offset 200
Collected 200 Chinese restaurants.

Fetching Italian restaurants...
Error 400 for Italian offset 200
Collected 200 Italian restaurants.

Fetching Mexican restaurants...
Error 400 for Mexican offset 200
Collected 199 Mexican restaurants.

Fetching Indian restaurants...
Error 400 for Indian offset 200
Collected 198 Indian restaurants.

Fetching Japanese restaurants...
Error 400 for Japanese offset 200
Collected 198 Japanese restaurants.

Fetching Thai restaurants...
Error 400 for Thai offset 200
Collected 184 Thai restaurants.
✅ Data saved to manhattan_restaurants.csv
                              name  cuisine  \
0                  Blue Willow 夜来湘  Chinese   
1             Chi Restaurant & Bar  Chinese   
2                   Mountain House  Chinese   
3  Easy Joy Dim Sum & AYCE Hot Pot  Chinese   
4                   Dim Sum Palace  Chinese   

                               address zip_code  
0     40 W 56th St, New York