In [3]:
import pandas as pd
from geopy.geocoders import GoogleV3
import geopandas as gpd
from shapely.geometry import Point, shape
import json
import requests

# NYC Open Data API

In [4]:


#Base URL for the API
url = 'https://data.cityofnewyork.us/resource/9nt8-h7nd.json'
limit = 5000  # Set a limit per page
offset = 0    # Start at the beginning
total_records_geo = []

while True:
    # Construct URL with limit and offset
    paginated_url = f"{url}?$limit={limit}&$offset={offset}"
    response = requests.get(paginated_url)
    if response.status_code == 200:
        data = response.json()
        if not data:  # Break the loop if no more data is returned
            break
        total_records_geo.extend(data)
        offset += limit  # Move the offset
    else:
        print("Failed to retrieve data. Status code:", response.status_code)
        break

# Output the data

neighborhoods_gdf = pd.DataFrame(total_records_geo)
neighborhoods_gdf.to_csv('/Users/skareti/Desktop/Independent Study/NYTRestaurantReviews/data/nyc_neighborhood_boundaries.csv')
print(neighborhoods_gdf.shape)
neighborhoods_gdf.head()

(262, 12)


Unnamed: 0,the_geom,borocode,boroname,countyfips,nta2020,ntaname,ntaabbrev,ntatype,cdta2020,cdtaname,shape_leng,shape_area
0,"{'type': 'MultiPolygon', 'coordinates': [[[[-7...",3,Brooklyn,47,BK0101,Greenpoint,Grnpt,0,BK01,BK01 Williamsburg-Greenpoint (CD 1 Equivalent),28912.5653122,35321204.8204
1,"{'type': 'MultiPolygon', 'coordinates': [[[[-7...",3,Brooklyn,47,BK0102,Williamsburg,Wllmsbrg,0,BK01,BK01 Williamsburg-Greenpoint (CD 1 Equivalent),28098.0267744,28854314.555
2,"{'type': 'MultiPolygon', 'coordinates': [[[[-7...",3,Brooklyn,47,BK0103,South Williamsburg,SWllmsbrg,0,BK01,BK01 Williamsburg-Greenpoint (CD 1 Equivalent),18250.2804159,15208960.44
3,"{'type': 'MultiPolygon', 'coordinates': [[[[-7...",3,Brooklyn,47,BK0104,East Williamsburg,EWllmsbrg,0,BK01,BK01 Williamsburg-Greenpoint (CD 1 Equivalent),43184.773814,52266209.4439
4,"{'type': 'MultiPolygon', 'coordinates': [[[[-7...",3,Brooklyn,47,BK0201,Brooklyn Heights,BkHts,0,BK02,BK02 Downtown Brooklyn-Fort Greene (CD 2 Appro...,14312.506134,9982321.73877


# Spatial Join with NYT Restaurants
here we will use the latitude/longitude points for each NYT resaturant used in the previous notebook to perform a spatial join to determine if a restaurant is in NYC or not.

In [5]:
ntas_data = gpd.read_file('/Users/skareti/Desktop/Independent Study/NYTRestaurantReviews/data/nyc_neighborhood_boundaries.csv')


# Convert 'the_geom' column from GeoJSON-like format to actual geometry
ntas_data['geometry'] = ntas_data['the_geom'].apply(lambda x: shape(json.loads(x.replace("'", '"'))))
gdf_ntas = gpd.GeoDataFrame(ntas_data, geometry='geometry')

# Read in nyt restaurants with coordainates
nyt_restaurants = pd.read_csv('/Users/skareti/Desktop/Independent Study/NYTRestaurantReviews/data/NYTRestaurants_Coords.csv', index_col=0)

# Convert restaurants dataframe to a GeoDataFrame
geometry = [Point(xy) for xy in zip(nyt_restaurants['Longitude'], nyt_restaurants['Latitude'])]
gdf_restaurants = gpd.GeoDataFrame(nyt_restaurants, geometry=geometry)

# Ensure both GeoDataFrames use the same coordinate reference system (CRS)
gdf_restaurants.crs = "EPSG:4326"
gdf_ntas.crs = "EPSG:4326"


In [6]:

# Perform spatial join to determine if each restaurant is in NYC
gdf_restaurants = gpd.sjoin(gdf_restaurants, gdf_ntas, how='left', op='within')

# Create 'isNYC' column: 1 if restaurant is in NYC, 0 otherwise
gdf_restaurants['isNYC'] = gdf_restaurants['index_right'].notnull().astype(int)

# Drop the spatial join helper columns and reset index
gdf_restaurants.drop(columns=['index_right'], inplace=True)
gdf_restaurants.reset_index(drop=True, inplace=True)

# Save the updated GeoDataFrame back to a CSV if needed
# gdf_restaurants.to_csv('/path/to/Updated_NYTRestaurants_Coords.csv')

# Display the first few rows to verify
print(gdf_restaurants.head())

  if (await self.run_code(code, result,  async_=asy)):
  if not (lk == lk.astype(rk.dtype))[~np.isnan(lk)].all():


  Restaurant_name          Author Publishing_Date     Rating Price  \
0       '21' Club      PETE WELLS   Oct. 23, 2012  0.75 star  $$$$   
1          'Cesca  WILLIAM GRIMES   Dec. 31, 2003     2 star   $$$   
2      'Wichcraft    PETER MEEHAN   Sept. 7, 2005        NaN     $   
3       12 Chairs     ERIC ASIMOV   Oct. 18, 1996        NaN    $$   
4         15 East     FRANK BRUNI   July 11, 2007     2 star  $$$$   

                Cuisine     Neighborhood  CriticsPick  \
0              American          Midtown            0   
1               Italian  Upper West Side            0   
2  American, Sandwiches    Midtown South            0   
3               Russian             SoHo            0   
4       Japanese, Sushi     Union Square            0   

                                         Description  Rank  ...  countyfips  \
0  There are only a few things at “21” that one c...   0.0  ...         061   
1  Cesca, on the Upper West Side, isn't the scene...   0.0  ...         061   

In [7]:
#quick investigation for verification
gdf_restaurants[gdf_restaurants['isNYC']!=1][['Restaurant_name', 'Neighborhood']]

Unnamed: 0,Restaurant_name,Neighborhood
57,Amácita,Culver City
105,Babajan,Carlton North
110,Bad Saint,Washington D.C.
126,Bar Patrón by Rockpool,Circular Quay
210,Bread and Salt,Jersey City
249,Cafe Ohlone,Berkeley
292,Cassia,"Santa Monica, Calif."
444,El Jardín,San Diego
482,Federal Doma Cafe,"Federal, New South Wales, Australia"
512,Franklin Barbecue,"Austin, Tex."


it looks like all these restaurants are in fact, not a part of NYC so we will drop these restaurants from the master dataset

In [None]:
#mamak house, paradou, 

In [8]:
gdf_restaurants.loc[gdf_restaurants['Restaurant_name']=='Mamak House', 'isNYC']=1
gdf_restaurants.loc[gdf_restaurants['Restaurant_name']=='Paradou', 'isNYC']=1


In [12]:
gdf_restaurants_final = gdf_restaurants[gdf_restaurants['isNYC']==1]
gdf_restaurants_final.to_csv('/Users/skareti/Desktop/Independent Study/NYTRestaurantReviews/data/PreprocessedData_Final.csv')

In [13]:
gdf_restaurants_final.shape

(1507, 42)

In [14]:
gdf_restaurants.columns.tolist()

['Restaurant_name',
 'Author',
 'Publishing_Date',
 'Rating',
 'Price',
 'Cuisine',
 'Neighborhood',
 'CriticsPick',
 'Description',
 'Rank',
 'top100_2024',
 'Year',
 'Month',
 'Rating_Categorical',
 'Price_Categorical',
 'Rating_Categorical_Excellent',
 'Rating_Categorical_Extraordinary',
 'Rating_Categorical_Good',
 'Rating_Categorical_NR',
 'Rating_Categorical_Very Good',
 'Price_Categorical_Expensive',
 'Price_Categorical_Inexpensive',
 'Price_Categorical_Moderate',
 'Price_Categorical_Very Expensive',
 'Coordinates',
 'Latitude',
 'Longitude',
 'geometry',
 'field_1',
 'the_geom',
 'borocode',
 'boroname',
 'countyfips',
 'nta2020',
 'ntaname',
 'ntaabbrev',
 'ntatype',
 'cdta2020',
 'cdtaname',
 'shape_leng',
 'shape_area',
 'isNYC']