In [7]:
import glob
import os
from pathlib import Path
# The notebook is expected to run from a sibling of `raw_data`, so the data
# directory is resolved relative to the current working directory's parent.
current_dir = Path.cwd()
data_dir = current_dir.parent / 'raw_data'
# glob returns matches in arbitrary, filesystem-dependent order. Sorting makes
# the load order (and therefore which row wins when duplicates are dropped
# with keep-first semantics later on) reproducible between runs and machines.
existing_data = sorted(glob.glob(f'{data_dir}/plownyc*.parquet'))

In [8]:
import pandas as pd
# Read every previously saved parquet file and stack them into one frame.
# Row labels from the individual files are kept as-is (no re-indexing here).
existing_df = pd.concat(list(map(pd.read_parquet, existing_data)))
# Column-by-column summary with non-null counts, to sanity-check the load.
existing_df.info(show_counts=True, verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 6885924 entries, 0 to 8238007
Data columns (total 3 columns):
 #   Column        Non-Null Count    Dtype 
---  ------        --------------    ----- 
 0   physical_id   6885924 non-null  object
 1   last_visited  6885924 non-null  object
 2   snapshot      6885924 non-null  object
dtypes: object(3)
memory usage: 210.1+ MB


In [9]:
from datetime import datetime

# Lower bound for the next fetch: the largest `last_visited` value already on
# disk. The column holds text (object dtype), so this is a string maximum.
last_scraped_date = existing_df['last_visited'].max()
print(last_scraped_date)

# Upper bound: the current wall-clock time of this machine as an ISO-8601
# string. NOTE(review): naive local time, no timezone attached; confirm it
# matches the timezone the dataset's timestamps are recorded in.
right_now = datetime.now()
iso_datetime_now = right_now.isoformat()
print(iso_datetime_now)

2026-02-24T10:15:00.000
2026-02-25T09:24:12.694877


In [10]:
import subprocess

import sys

# Install via "<this interpreter> -m pip" rather than a bare `pip`: whichever
# `pip` is first on PATH may belong to a different Python environment than the
# one running this kernel, in which case the later import would still fail.
install_command = [
    sys.executable, "-m", "pip", "install",
    "git+https://github.com/kcinbk/get_opendata.git",
]

# pip's normal progress output is discarded to keep the notebook clean, but
# stderr is captured so a failed install can explain itself.
result = subprocess.run(
    install_command,
    stdout=subprocess.DEVNULL,
    stderr=subprocess.PIPE,
    text=True,
)

if result.returncode == 0:
    print("Module installed")
else:
    # Surface pip's own error text instead of an opaque failure message.
    raise RuntimeError(f"Module install failed:\n{result.stderr}")

Module installed


In [11]:
# `get_data` comes from the get_opendata package installed from GitHub.
from get_opendata import get_data
# Print the function's signature so its keyword arguments and defaults
# (paging limit, retry/back-off settings, filter fields) are visible here.
help(get_data)

Help on function get_data in module get_opendata.get_opendata:

get_data(dataset_id, endpoint, datetime_field=None, start_date=None, end_date=None, order=None, first_filter_field=None, first_filter_value=None, second_filter_field=None, second_filter_value=None, third_filter_field=None, third_filter_value=None, app_token=None, base_sleep=5, limit=100000, max_retries=3, backoff_factor=2)



In [12]:
# Fetch the records added since the last saved scrape.
# Presumably `datetime_field` + `start_date`/`end_date` restrict the query to
# rows whose `last_visited` lies in that window -- TODO confirm against the
# get_opendata documentation. `limit` overrides the default of 100000 shown in
# the function signature; the progress log reports 500000 records per page.
data = get_data(
    dataset_id='rmhc-afj9',
    endpoint='data.cityofnewyork.us',
    datetime_field='last_visited',
    start_date=last_scraped_date,
    end_date=iso_datetime_now,
    limit=500_000,
)



Getting page 1 with offset 0...
Obtained page 1, records in this page: 500000, total records: 500000
Sleeping for 5 seconds...
Getting page 2 with offset 500000...
Obtained page 2, records in this page: 500000, total records: 1000000
Sleeping for 7 seconds...
Getting page 3 with offset 1000000...
Obtained page 3, records in this page: 500000, total records: 1500000
Sleeping for 5 seconds...
Getting page 4 with offset 1500000...
Obtained page 4, records in this page: 500000, total records: 2000000
Sleeping for 7 seconds...
Getting page 5 with offset 2000000...
Obtained page 5, records in this page: 500000, total records: 2500000
Sleeping for 5 seconds...
Getting page 6 with offset 2500000...
Obtained page 6, records in this page: 500000, total records: 3000000
Sleeping for 10 seconds...
Getting page 7 with offset 3000000...
Obtained page 7, records in this page: 500000, total records: 3500000
Sleeping for 5 seconds...
Getting page 8 with offset 3500000...
Obtained page 8, records in thi

In [18]:
# Turn the fetched records into a DataFrame so they can be merged with the
# history already on disk.
new_data_df = pd.DataFrame(data=data)
# Same summary as for the existing data, to compare columns and row counts.
new_data_df.info(show_counts=True, verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6307102 entries, 0 to 6307101
Data columns (total 3 columns):
 #   Column        Non-Null Count    Dtype 
---  ------        --------------    ----- 
 0   physical_id   6307102 non-null  object
 1   last_visited  6307102 non-null  object
 2   snapshot      6307102 non-null  object
dtypes: object(3)
memory usage: 144.4+ MB


In [19]:
# (rows, columns) of the data loaded from disk, as a baseline before merging.
existing_df.shape

(6885924, 3)

In [20]:
# Largest `last_visited` string in the data loaded from disk.
existing_df['last_visited'].max()

'2026-02-24T10:15:00.000'

In [21]:
# Largest `snapshot` string in the data loaded from disk.
existing_df['snapshot'].max()

'2026-02-24T08:17:18.000'

In [22]:
# Stack the stored history on top of the fresh pull, then keep one row per
# (physical_id, last_visited) pair. existing_df is listed first so that, with
# drop_duplicates' default keep='first', the row already on disk wins over a
# re-fetched copy of the same pair.
#
# ignore_index=True relabels the result 0..n-1. Without it the two inputs'
# row labels overlap, leaving a non-unique index with gaps that would then be
# written to disk and carried into every later run.
merged_df = (
    pd.concat([existing_df, new_data_df])
    .drop_duplicates(subset=['physical_id', 'last_visited'], ignore_index=True)
)
merged_df.shape

(8028286, 3)

In [23]:
# Largest `last_visited` string after merging; it should be at or beyond the
# maximum seen in the data loaded from disk.
merged_df['last_visited'].max()

'2026-02-25T05:15:00.000'

In [24]:
# Save merged_df (stored history plus the new pull, de-duplicated) into the
# raw_data dir.
#
# The target file name matches the `plownyc*.parquet` pattern this notebook
# loads its history from, so it may be replacing the only copy of that
# history. Write to a temporary file first and swap it into place afterwards:
# an interrupted write then leaves the previous file untouched. The temporary
# name ends in `.tmp`, so a leftover is not picked up by that pattern.
# Relies on the notebook's top-level `import os`.
output_path = f'{data_dir}/plownyc_dropduplicated.parquet'
temp_path = f'{output_path}.tmp'
merged_df.to_parquet(temp_path, engine='fastparquet')
os.replace(temp_path, output_path)