In [1]:
import subprocess
import sys

# Install the get_opendata helper straight from GitHub.
# pip is invoked as `<kernel python> -m pip` so the package is installed into
# the interpreter that is running this notebook; a bare `pip` found on PATH
# may belong to a different Python installation.
result = subprocess.run(
    [sys.executable, "-m", "pip", "install", "git+https://github.com/kcinbk/get_opendata.git"],
    stdout=subprocess.DEVNULL,  # pip's progress chatter is not useful here
    stderr=subprocess.PIPE,     # keep the error text so a failure can be diagnosed
    text=True,
)

if result.returncode == 0:
    print("Module installed")
else:
    # Surface pip's own error output instead of failing without any detail.
    raise RuntimeError(f"Module install failed:\n{result.stderr.strip()}")


Module installed


In [2]:
from get_opendata import get_data

In [13]:
from datetime import datetime

# Current local time (naive, no timezone) as an ISO-8601 string. It is taken
# once here and reused later as the end of the query window.
iso_datetime_now = datetime.now().isoformat()
iso_datetime_now

'2026-01-27T01:59:20.274558'

In [9]:
# Scratch check: display the current local time as an ISO-8601 string.
# Relies on `datetime` already being imported in the running kernel.
datetime.now().isoformat()

'2026-01-27T01:58:30.159344'

In [14]:
# Arguments for the NYC Open Data request (dataset rmhc-afj9).
# NOTE(review): presumably `datetime_field`, `start_date` and `end_date`
# restrict the records to a time window on `last_visited`, and `limit` is the
# page size — confirm against the get_opendata documentation.
request_params = {
    'endpoint': 'data.cityofnewyork.us',
    'dataset_id': 'rmhc-afj9',
    'datetime_field': 'last_visited',
    'start_date': '2026-01-25T00:00:00',
    'end_date': iso_datetime_now,
    'limit': 500000,
}

data = get_data(**request_params)



Getting page 1 with offset 0...
Obtained page 1, records in this page: 500000, total records: 500000
Sleeping for 5 seconds...
Getting page 2 with offset 500000...
[Retry 1/3] HTTPSConnectionPool(host='data.cityofnewyork.us', port=443): Read timed out. (read timeout=10)
Retrying in 5 seconds...
Getting page 2 with offset 500000...
Obtained page 2, records in this page: 500000, total records: 1000000
Sleeping for 8 seconds...
Getting page 3 with offset 1000000...
Obtained page 3, records in this page: 500000, total records: 1500000
Sleeping for 8 seconds...
Getting page 4 with offset 1500000...
Obtained page 4, records in this page: 500000, total records: 2000000
Sleeping for 9 seconds...
Getting page 5 with offset 2000000...
Obtained page 5, records in this page: 500000, total records: 2500000
Sleeping for 7 seconds...
Getting page 6 with offset 2500000...
[Retry 1/3] HTTPSConnectionPool(host='data.cityofnewyork.us', port=443): Read timed out. (read timeout=10)
Retrying in 5 seconds...

In [15]:
# Number of records held in `data` (the value returned by get_data).
len(data)

15254367

In [None]:
import os
from pathlib import Path
# Output folder: a `raw_data` directory located in the parent of the current
# working directory.
current_dir = Path.cwd()
data_dir = current_dir.parent.joinpath('raw_data')

# Create the folder together with any missing parents; nothing happens when it
# already exists.
data_dir.mkdir(parents=True, exist_ok=True)

In [20]:
import json

# Write the records in `data` to disk as a single JSON document.
raw_json_path = f'{data_dir}/plownyc_raw.json'
with open(raw_json_path, 'w') as raw_out:
    json.dump(data, fp=raw_out)

In [21]:
import pandas as pd
from tqdm import tqdm
import gc

chunk_size = 1_000_000
frames = []

# Convert the record list into DataFrames one slice at a time, then stitch the
# pieces together into a single frame.
for start in tqdm(range(0, len(data), chunk_size), desc="Chunk-size"):
    stop = start + chunk_size
    records = data[start:stop]
    frames.append(pd.DataFrame.from_records(records))

    # Release the temporary slice and force a collection before the next pass.
    del records
    gc.collect()

df = pd.concat(frames, ignore_index=True)

# The per-chunk frames are no longer needed once concatenated. As the last
# expression of the cell, the return value of gc.collect() is displayed.
del frames
gc.collect()


Chunk-size: 100%|██████████| 16/16 [00:08<00:00,  1.93it/s]


0

In [22]:
# (rows, columns) of the DataFrame assembled from `data`.
df.shape

(15254367, 3)

In [24]:
# Keep a single row per (physical_id, last_visited) pair; with the pandas
# default (keep='first') the first occurrence is the one retained.
dedup_keys = ['physical_id', 'last_visited']
cleaned_df = df.drop_duplicates(subset=dedup_keys)
cleaned_df.shape

(3351332, 3)

In [28]:
# Write the de-duplicated frame to the raw_data directory as a Parquet file,
# using the fastparquet engine.
parquet_path = f'{data_dir}/plownyc_dropduplicated.parquet'
cleaned_df.to_parquet(parquet_path, engine='fastparquet')