In [None]:
# Import standard libraries
import os
import re
import json
import dataclasses
import codecs
import requests
from urllib.request import urlopen
import datetime


# Import third-party libraries
import geopandas as gpd
from geoalchemy2 import Geometry
import pandas as pd
import numpy as np
import pyogrio
import sqlalchemy
from sqlalchemy import create_engine, Column, Integer, Float, String, Date, MetaData, event, Table, text, LargeBinary, ForeignKey
from sqlalchemy.dialects.sqlite import insert
from sqlalchemy.orm import sessionmaker, declarative_base
from sqlalchemy.sql.sqltypes import Boolean
from sqlalchemy.event import listen
from sqlalchemy.engine import Engine
# from sqlalchemy.ext.declarative import declarative_base

import sqlite3
import fiona
from fiona.crs import from_epsg



In [None]:
%%bash

# Download the NYC Department of City Planning geodata archives listed below
# and unpack each one into $DATADIR/intermediate_files/.
#
# DATADIR must be present in the environment of this bash subprocess; fail
# early with a clear message instead of writing to paths rooted at "/".
: "${DATADIR:?DATADIR must be set in the environment before running this cell}"

# The body of a %%bash cell is handed to bash as-is, so a Python-style
# {DATADIR} placeholder would create a directory literally named "{DATADIR}".
# Use the shell variable and create both working subdirectories used below.
mkdir -p "$DATADIR/Downloads" "$DATADIR/intermediate_files"

declare -A geodata

geodata["census_blocks"]="https://s-media.nyc.gov/agencies/dcp/assets/files/zip/data-tools/bytes/nycb2020_24d.zip"
geodata["NTA"]="https://s-media.nyc.gov/agencies/dcp/assets/files/zip/data-tools/bytes/nynta2020_24d.zip"
geodata["MapPLUTO"]="https://s-media.nyc.gov/agencies/dcp/assets/files/zip/data-tools/bytes/nyc_mappluto_24v3_1_fgdb.zip"

for dataset in "${!geodata[@]}"; do
  url="${geodata[${dataset}]}"
  archive="$DATADIR/Downloads/${dataset}.zip"
  echo "${url}"
  # Only unpack when the download actually succeeded.
  if wget -nv -O "$archive" "$url"; then
    # -o overwrites previously extracted files without prompting, so the cell
    # can be re-run non-interactively.
    unzip -o "$archive" -d "$DATADIR/intermediate_files/"
  else
    echo "Download failed for ${dataset}: ${url}" >&2
  fi
  # Remove the archive (or any partial download) in either case.
  rm -f "$archive"
done

## Download the main datasets of interest:
* Lien sales
* Tax assessments
* Applications for changes in property assessments or classification

In [None]:
%%bash

# DATADIR must be present in the environment of this bash subprocess.
: "${DATADIR:?DATADIR must be set in the environment before running this cell}"

# Make sure the directories written to below exist.
mkdir -p "$DATADIR/Downloads" "$DATADIR/intermediate_files"

declare -A datasets

datasets["lien_data"]="https://data.cityofnewyork.us/api/views/9rz4-mjek/rows.json?accessType=DOWNLOAD"
datasets["assessment_data"]="https://data.cityofnewyork.us/api/views/yjxr-fw8i/rows.json?accessType=DOWNLOAD"
datasets["assessment_actions"]="https://data.cityofnewyork.us/api/views/4nft-bihw/rows.json?accessType=DOWNLOAD"

# Download one NYC Open Data "rows.json" export and split it into its parts.
# This seems to work with many NYC Open Data datasets.
#
# Arguments:
#   $1  name     basename used for every file written
#   $2  url      download URL of the JSON export
#   $3  datadir  directory that contains Downloads/ and intermediate_files/
#
# Files written:
#   <datadir>/Downloads/<name>.json                       raw download
#   <datadir>/intermediate_files/<name>_colnames.txt      .meta.view.columns[].name
#   <datadir>/intermediate_files/<name>_data_types.txt    .meta.view.columns[].dataTypeName
#   <datadir>/intermediate_files/<name>_rows.json         .data
# A line is appended to <datadir>/Downloads/data_download.log after a
# successful download.
#
# Returns non-zero (and skips the extraction steps) if the download fails.
get_nyc_open_dataset () {
    local name=$1
    local url=$2
    local datadir=$3
    local raw_json="$datadir/Downloads/${name}.json"
    local outdir="$datadir/intermediate_files"

    # Do not run jq against a missing or truncated file.
    if ! wget -nv -O "$raw_json" "$url"; then
        echo "Download of ${name} from ${url} failed; skipping extraction" >&2
        return 1
    fi

    # printf interprets \n; plain echo would write a literal backslash-n.
    printf 'The file %s.json, was dowloaded from %s at %s\n\n' \
        "$name" "$url" "$(date -u)" >> "$datadir/Downloads/data_download.log"

    # `.columns[]` (no dot before the brackets) is accepted by every jq release.
    # Extract the column names
    jq '.meta.view.columns[].name' "$raw_json" > "$outdir/${name}_colnames.txt"
    # Extract data types
    jq '.meta.view.columns[].dataTypeName' "$raw_json" > "$outdir/${name}_data_types.txt"
    # Extract the data (written under the datadir argument, like the other outputs)
    jq '.data' "$raw_json" > "$outdir/${name}_rows.json"
}


# Start every run with a fresh log file
echo '' > "$DATADIR/Downloads/data_download.log"

for dataset in "${!datasets[@]}"; do
  echo "${dataset} => ${datasets[${dataset}]}"
  get_nyc_open_dataset "$dataset" "${datasets[${dataset}]}" "$DATADIR"
done




### Rename a column in one of the datasets for consistency later on; this works around a schema difference between the datasets
* This is a good place for any other fixes that are specific to the dataset at hand.

In [None]:
%%bash

# Replace every occurrence of "Borough" with "borough" in the extracted
# lien-data column-name list, editing the file in place.
# The path is quoted so a DATADIR containing spaces is not word-split.
sed -i 's/Borough/borough/g' "$DATADIR/intermediate_files/lien_data_colnames.txt"
