In [5]:
import pandas as pd
import requests
import json
import datetime
import csv
import time
import matplotlib.pyplot as plt
import ast
from dotenv import load_dotenv
import os 
from sqlalchemy import create_engine


In [2]:
load_dotenv()

True

In [3]:
DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv("DB_PASSWORD")
DB_HOST = os.getenv("DB_HOST")
DB_PORT = os.getenv("DB_PORT")
DB_NAME = os.getenv("DB_NAME")

In [4]:
TOKEN = os.getenv("NYC_open_data_token")

base_url = "https://data.cityofnewyork.us/resource/9nt8-h7nd.json"

headers = {"X-App-Token": TOKEN}

limit = 1000    
offset = 0
all_records = []

while True:
    url = f"{base_url}?$limit={limit}&$offset={offset}"
    try:
        response = requests.get(url, headers=headers, timeout=20)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print("Request failed, retrying in 2 seconds...", e)
        time.sleep(2)
        continue

    batch = response.json()

    if not batch:
        print("No more data returned. Stopping.")
        break

    all_records.extend(batch)
    print(f"Fetched {len(batch)} rows (offset={offset})")

    # Stop if fewer than the limit means end of dataset
    if len(batch) < limit:
        break

    offset += limit
    time.sleep(0.2)  # polite rate-limit protection


# dataframe:

nta = pd.DataFrame(all_records)

print("Done! Total rows:", len(nta))
nta.head()

Fetched 262 rows (offset=0)
Done! Total rows: 262


Unnamed: 0,borocode,boroname,countyfips,nta2020,ntaname,ntaabbrev,ntatype,cdta2020,cdtaname,shape_leng,shape_area,the_geom
0,3,Brooklyn,47,BK0101,Greenpoint,Grnpt,0,BK01,BK01 Williamsburg-Greenpoint (CD 1 Equivalent),28919.5611508,35321808.3909,"{'type': 'MultiPolygon', 'coordinates': [[[[-7..."
1,3,Brooklyn,47,BK0102,Williamsburg,Wllmsbrg,0,BK01,BK01 Williamsburg-Greenpoint (CD 1 Equivalent),28134.0826611,28852852.9133,"{'type': 'MultiPolygon', 'coordinates': [[[[-7..."
2,3,Brooklyn,47,BK0103,South Williamsburg,SWllmsbrg,0,BK01,BK01 Williamsburg-Greenpoint (CD 1 Equivalent),18250.2800908,15208960.645,"{'type': 'MultiPolygon', 'coordinates': [[[[-7..."
3,3,Brooklyn,47,BK0104,East Williamsburg,EWllmsbrg,0,BK01,BK01 Williamsburg-Greenpoint (CD 1 Equivalent),43184.8003755,52267406.735,"{'type': 'MultiPolygon', 'coordinates': [[[[-7..."
4,3,Brooklyn,47,BK0201,Brooklyn Heights,BkHts,0,BK02,BK02 Downtown Brooklyn-Fort Greene (CD 2 Appro...,14312.1922849,9982022.78755,"{'type': 'MultiPolygon', 'coordinates': [[[[-7..."


In [6]:
cols_to_keep = ["nta2020", "ntaname", "the_geom"]

In [7]:
nta = nta[cols_to_keep]

In [8]:
def compute_centroid_safe(multipolygon):
    # If it's a string, convert to dict
    if isinstance(multipolygon, str):
        import ast
        multipolygon = ast.literal_eval(multipolygon)
    
    coords = multipolygon['coordinates']
    
    # Flatten all coordinate points (handles multiple polygons and rings)
    all_points = []
    for polygon in coords:
        for ring in polygon:
            all_points.extend(ring)  # ring is a list of [lon, lat] pairs
    
    # Compute average longitude and latitude
    avg_lon = sum(pt[0] for pt in all_points) / len(all_points)
    avg_lat = sum(pt[1] for pt in all_points) / len(all_points)
    
    return avg_lon, avg_lat

# Apply to DataFrame
nta['centroid'] = nta['the_geom'].apply(compute_centroid_safe)

# Optionally split into lon/lat columns
nta[['centroid_lon', 'centroid_lat']] = pd.DataFrame(nta['centroid'].tolist(), index=nta.index)



In [9]:
nta.head()

Unnamed: 0,nta2020,ntaname,the_geom,centroid,centroid_lon,centroid_lat
0,BK0101,Greenpoint,"{'type': 'MultiPolygon', 'coordinates': [[[[-7...","(-73.94906387560083, 40.734549353088134)",-73.949064,40.734549
1,BK0102,Williamsburg,"{'type': 'MultiPolygon', 'coordinates': [[[[-7...","(-73.96313948192903, 40.7149205418108)",-73.963139,40.714921
2,BK0103,South Williamsburg,"{'type': 'MultiPolygon', 'coordinates': [[[[-7...","(-73.95564940186196, 40.70327076717946)",-73.955649,40.703271
3,BK0104,East Williamsburg,"{'type': 'MultiPolygon', 'coordinates': [[[[-7...","(-73.93029120726106, 40.715353005312814)",-73.930291,40.715353
4,BK0201,Brooklyn Heights,"{'type': 'MultiPolygon', 'coordinates': [[[[-7...","(-73.99467364738847, 40.69658101337548)",-73.994674,40.696581


In [10]:
nta = nta.drop(["the_geom", "centroid"], axis=1)

In [11]:
nta.head()

Unnamed: 0,nta2020,ntaname,centroid_lon,centroid_lat
0,BK0101,Greenpoint,-73.949064,40.734549
1,BK0102,Williamsburg,-73.963139,40.714921
2,BK0103,South Williamsburg,-73.955649,40.703271
3,BK0104,East Williamsburg,-73.930291,40.715353
4,BK0201,Brooklyn Heights,-73.994674,40.696581


In [12]:
nta.nunique()

nta2020         262
ntaname         262
centroid_lon    262
centroid_lat    262
dtype: int64

In [None]:
nta.drop_duplicates

<bound method DataFrame.drop_duplicates of     nta2020                      ntaname  centroid_lon  centroid_lat
0    BK0101                   Greenpoint    -73.949064     40.734549
1    BK0102                 Williamsburg    -73.963139     40.714921
2    BK0103           South Williamsburg    -73.955649     40.703271
3    BK0104            East Williamsburg    -73.930291     40.715353
4    BK0201             Brooklyn Heights    -73.994674     40.696581
..      ...                          ...           ...           ...
257  SI0391      Freshkills Park (South)    -74.207287     40.564524
258  SI9561               Fort Wadsworth    -74.056313     40.602079
259  SI9591  Hoffman & Swinburne Islands    -74.051223     40.568345
260  SI9592                 Miller Field    -74.094762     40.569592
261  SI9593             Great Kills Park    -74.129730     40.540695

[262 rows x 4 columns]>

In [14]:
# Send to db:
engine = create_engine(f"postgresql+psycopg2://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}")

In [15]:
# Send df to PostgreSQL
nta.to_sql('nta_bridge', engine, if_exists='replace', index=False)


262