In [27]:
import pandas as pd
import requests
import json
import datetime
import csv
import time
import matplotlib.pyplot as plt
import ast
from dotenv import load_dotenv
import os 
from sqlalchemy import create_engine

In [28]:
load_dotenv()

True

In [29]:
DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv("DB_PASSWORD")
DB_HOST = os.getenv("DB_HOST")
DB_PORT = os.getenv("DB_PORT")
DB_NAME = os.getenv("DB_NAME")

In [30]:
TOKEN = os.getenv("NYC_open_data_token")

base_url = "https://data.cityofnewyork.us/resource/feuq-due4.json"

headers = {"X-App-Token": TOKEN}

limit = 1000    
offset = 0
all_records = []

while True:
    url = f"{base_url}?$limit={limit}&$offset={offset}"
    try:
        response = requests.get(url, headers=headers, timeout=20)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print("Request failed, retrying in 2 seconds...", e)
        time.sleep(2)
        continue

    batch = response.json()

    if not batch:
        print("No more data returned. Stopping.")
        break

    all_records.extend(batch)
    print(f"Fetched {len(batch)} rows (offset={offset})")

    # Stop if fewer than the limit means end of dataset
    if len(batch) < limit:
        break

    offset += limit
    time.sleep(0.2)  # polite rate-limit protection


# dataframe:

libraries = pd.DataFrame(all_records)

print("Done! Total rows:", len(libraries))
print(libraries.head())

Fetched 216 rows (offset=0)
Done! Total rows: 216
                                            the_geom          name  \
0  {'type': 'Point', 'coordinates': [-73.95353074...  115th Street   
1  {'type': 'Point', 'coordinates': [-73.93484756...  125th Street   
2  {'type': 'Point', 'coordinates': [-73.97736329...   53rd Street   
3  {'type': 'Point', 'coordinates': [-73.96938170...   58th Street   
4  {'type': 'Point', 'coordinates': [-73.95954994...   67th Street   

          streetname housenum      city    zip  \
0  West 115th Street      203  New York  10026   
1  East 125th Street      224  New York  10035   
2   West 53rd Street       18  New York  10019   
3   East 58th Street      127  New York  10022   
4   East 67th Street      328  New York  10065   

                                          url      bin         bbl  \
0  http://www.nypl.org/locations/115th-street  1055236  1018310026   
1  http://www.nypl.org/locations/125th-street  1054674  1017890037   
2   http://www.nyp

In [31]:
columns_to_keep = ["name", "the_geom"]

In [32]:
libraries_final = libraries[columns_to_keep]

In [33]:
libraries_final.head()

Unnamed: 0,name,the_geom
0,115th Street,"{'type': 'Point', 'coordinates': [-73.95353074..."
1,125th Street,"{'type': 'Point', 'coordinates': [-73.93484756..."
2,53rd Street,"{'type': 'Point', 'coordinates': [-73.97736329..."
3,58th Street,"{'type': 'Point', 'coordinates': [-73.96938170..."
4,67th Street,"{'type': 'Point', 'coordinates': [-73.95954994..."


In [34]:
# Convert string representation of dict to actual dict

#Check what type the_geom actually is
print(libraries_final['the_geom'].dtype)
print(type(libraries_final['the_geom'].iloc[0]))

#extract longitude and latitude
libraries_final['longitude'] = libraries_final['the_geom'].apply(lambda x: x['coordinates'][0])
libraries_final['latitude'] = libraries_final['the_geom'].apply(lambda x: x['coordinates'][1])

object
<class 'dict'>


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  libraries_final['longitude'] = libraries_final['the_geom'].apply(lambda x: x['coordinates'][0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  libraries_final['latitude'] = libraries_final['the_geom'].apply(lambda x: x['coordinates'][1])


In [35]:
libraries_final.head()

Unnamed: 0,name,the_geom,longitude,latitude
0,115th Street,"{'type': 'Point', 'coordinates': [-73.95353074...",-73.953531,40.80298
1,125th Street,"{'type': 'Point', 'coordinates': [-73.93484756...",-73.934848,40.803018
2,53rd Street,"{'type': 'Point', 'coordinates': [-73.97736329...",-73.977363,40.760807
3,58th Street,"{'type': 'Point', 'coordinates': [-73.96938170...",-73.969382,40.762186
4,67th Street,"{'type': 'Point', 'coordinates': [-73.95954994...",-73.95955,40.764915


In [36]:
libraries_final = libraries_final.drop('the_geom', axis=1)

In [37]:
libraries_final.head()

Unnamed: 0,name,longitude,latitude
0,115th Street,-73.953531,40.80298
1,125th Street,-73.934848,40.803018
2,53rd Street,-73.977363,40.760807
3,58th Street,-73.969382,40.762186
4,67th Street,-73.95955,40.764915


In [38]:
# Send to db:
engine = create_engine(f"postgresql+psycopg2://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}")

In [39]:
# Send df to PostgreSQL
libraries_final.to_sql('libraries', engine, if_exists='replace', index=False)


216