In [13]:
import glob
import os
from pathlib import Path
current_dir = Path.cwd()
data_dir = current_dir.parent / 'raw_data'
existing_data = glob.glob(f'{data_dir}/plownyc*.parquet')

In [14]:
import pandas as pd
existing_df = pd.concat([pd.read_parquet(path) for path in existing_data])
existing_df.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 5211590 entries, 0 to 119485
Data columns (total 3 columns):
 #   Column        Non-Null Count    Dtype 
---  ------        --------------    ----- 
 0   physical_id   5211590 non-null  object
 1   last_visited  5211590 non-null  object
 2   snapshot      5211590 non-null  object
dtypes: object(3)
memory usage: 159.0+ MB


In [15]:
from datetime import datetime

last_scraped_date = existing_df.last_visited.max()
print(last_scraped_date)

iso_datetime_now = datetime.now().isoformat()
print(iso_datetime_now)

2026-02-23T09:15:00.000
2026-02-23T12:10:55.524693


In [16]:
import subprocess

result = subprocess.run(
    ["pip", "install", "git+https://github.com/kcinbk/get_opendata.git"],
    stdout=subprocess.DEVNULL,
    stderr=subprocess.DEVNULL,
)

if result.returncode == 0:
    print("Module installed")
else:
    raise RuntimeError("Module install failed")

Module installed


In [17]:
from get_opendata import get_data
help(get_data)

Help on function get_data in module get_opendata.get_opendata:

get_data(dataset_id, endpoint, datetime_field=None, start_date=None, end_date=None, order=None, first_filter_field=None, first_filter_value=None, second_filter_field=None, second_filter_value=None, third_filter_field=None, third_filter_value=None, app_token=None, base_sleep=5, limit=100000, max_retries=3, backoff_factor=2)



In [18]:
data = get_data(
    endpoint = 'data.cityofnewyork.us', 
    dataset_id = 'rmhc-afj9', 
    datetime_field = 'last_visited',
    start_date = last_scraped_date, 
    end_date = iso_datetime_now, 
    limit = 500000         
)



Getting page 1 with offset 0...
Obtained page 1, records in this page: 69, total records: 69
All data obtained.
Finished fetching all data!


In [19]:
new_data_df = pd.DataFrame(data)
new_data_df.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69 entries, 0 to 68
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   physical_id   69 non-null     object
 1   last_visited  69 non-null     object
 2   snapshot      69 non-null     object
dtypes: object(3)
memory usage: 1.7+ KB


In [20]:
existing_df.shape

(5211590, 3)

In [21]:
existing_df.last_visited.max()

'2026-02-23T09:15:00.000'

In [22]:
existing_df.snapshot.max()

'2026-02-23T09:15:13.000'

In [23]:
merged_df = pd.concat([existing_df, new_data_df])
merged_df = merged_df.drop_duplicates(subset=['physical_id', 'last_visited'])
merged_df.shape

(5211590, 3)

In [24]:
# Save cleaned_df into raw_data dir
merged_df.to_parquet(f'{data_dir}/plownyc_dropduplicated.parquet', engine='fastparquet')