In [1]:
# Import standard libraries
import os
import re
import json
import codecs
import requests
import dill
from urllib.request import urlopen

# Import third-party libraries
import geopandas as gpd
from geoalchemy2 import Geometry
import pandas as pd
from sqlalchemy import create_engine, Column, Integer, String, Date, MetaData, event, Table, text, LargeBinary
from sqlalchemy.dialects.sqlite import insert
from sqlalchemy.orm import sessionmaker
from sqlalchemy.event import listen
from sqlalchemy.engine import Engine
from sqlalchemy.ext.declarative import declarative_base

import sqlite3
from sqlite3 import dbapi2 as sqlite

import fiona
from fiona.crs import from_epsg
# import pdfplumber

from src.helpers import *
from src.dbutils import *
from src.ORMutils import *
from src.models import *
from src.geo import *
from src.pdfutils import *

In [2]:

# Load the environment
with open("environment_data/select.pkl", "rb") as f:
    env = dill.load(f)

# Restore the environment
globals().update(env)

In [3]:
for name, dataset in datasets.items():
    print(dataset.short_name)

mapPLUTO
assessments
tax_liens
housing_violations
assessment_actions
housing_database
NTAs2020
NTA_population_2020
NTA_demographics_2020
census_blocks2020
CDTAs2020
puma2020
cert_of_occupancy


### Set up a dictionaries that define things specific to each dataset.
* `prefix` is the directory where the dataset in question is expected to be found.
*`cols_to_drop` are columns that I am sure I will not need.
* `cols_to_rename` are key-value pairs where keys are column names found in the dataset and values are the names they are to be changed to. This is sometimes necessary to standardize column names that have the same type of data but different spellings between datasets. I also choose to rename some columns to standard names that are not abbreviated if I think the abbreviation meanings are not obvious. 
* `dtype_exceptions` are key-value pairs where the keys are the column names and the values are the datatype that I want to specify for the column when it is put in the database. For example date could appear in the json dataset as an integer or a string. When inserted to a table, I want it to be a proper date type.
* `lookup_columns` are categorical variables (usually strings, but in some cases like zip code they can be numbers.). For these, I replace them with an integer foreign key and put the actual name (as well as any additional information about the category) into a lookup (join) table.

In [None]:
# Define shared dataset configurations
SHARED_DATASET_CONFIGS = {
    "prefix": PROJECT_DATA,
    "cols_to_drop": [ # These columns are dropped from all datasets because I decided I won't need them
        "id",
        "sid",
        "position",
        "created_at",
        "created_meta",
        "updated_at",
        "updated_meta",
        "borough",
        "meta",
    ],
    "cols_to_rename": {}, # These are columns that are to be renamed in all datasets
    "lookup_columns": [],
    "dtype_mappings": { # These are columns that are to be cast to a specific data type
        "zip_code": String,
        "meta_data": String,
        "postcode": String,
        "calendar_date": Date,
        "number": Integer,
        "text": String,
        "point": String,
    },
}

# Define specific dataset configurations
SPECIFIC_DATASET_CONFIGS = {
    "lien_data": {
        "prefix": f"{PROJECT_DATA}/intermediate_files",
        "cols_to_drop": [],
        "cols_to_rename": {"BORO": "borough"},
        "lookup_columns": [],
        "dtype_mappings": {},
    },
    "assessment_data": {
        "prefix": f"{PROJECT_DATA}/intermediate_files",
        "cols_to_drop": [],
        "cols_to_rename": {
            "BLDGCL": "building_class",
            "TAXCLASS": "tax_class_code",
            "Zip Codes": "zip_code",
        },
        "lookup_columns": ["building_class", "street_name", "owner", "zip_code"],
        "dtype_mappings": {},
    },
    "PLUTO": {
        "prefix": None,
        "cols_to_drop": [],
        "cols_to_rename": {
            "BldgClass": "building_class",
            "ZipCode": "zip_code",
            "SchoolDist": "school_district",
            "PolicePrct": "police_precinct",
            "Council": "council_district",
            "OwnerName": "owner",
            "HistDist": "historic_district",
            "SanitDistrict": "sanitation_district",
            "SanitSub": "sanitation_subdistrict",
            "Borough" : "borough"
        },
        "lookup_columns": [
            "borough"
            "building_class",
            "street_name",
            "owner",
            "zip_code",
            "school_district",
            "council_district",
            "police_precinct",
        ],
        "dtype_mappings" : {
            "school_district": Integer,
            "council_district": Integer,
            "police_precinct": Integer,
            "YearBuilt": Date,
            "YearAlter1": Date,
            "YearAlter2": Date,
            "APPDate": Date,
            "geometry": LargeBinary,
        },
    },
}

* ### Create the database engine that will be used throughout the rest of the notebook.

In [5]:
engine = create_engine(f'{SQLITE_PATH}?check_same_thread=False', echo=False)

SessionLocal = sessionmaker(bind=engine, autoflush=False, autocommit=False)

* ### Configure the database

In [6]:
@event.listens_for(engine, "connect")
def load_spatialite(dbapi_conn, connection_record):
    print("Loading SpatiaLite extension")
    dbapi_conn.enable_load_extension(True)
    dbapi_conn.load_extension("mod_spatialite")
    dbapi_conn.enable_load_extension(False)


with engine.connect() as conn:
    print("Connection established")
    result = conn.execute(text("SELECT spatialite_version()"))
    spatialite_version = result.fetchone()
    print(f"SpatiaLite version: {spatialite_version[0]}")

# Enable WAL mode
with engine.connect() as conn:
    conn.execute(text("PRAGMA journal_mode=WAL"))

# Initialize spatial metadata if not already present
with engine.connect() as conn:
    conn.execute(text("SELECT InitSpatialMetaData(1)"))

Loading SpatiaLite extension
Connection established
SpatiaLite version: 5.1.0


### Manually create the borough codes lookup table
* These are standardized and available many places, however I could not find a single official source of record to programatically get them from, since there are only five of them, I enter them manually.

In [7]:
borough_codes = {'Manhattan' : 1,
'Bronx' : 2,
'Brooklyn' : 3,
'Queens' : 4,
'Staten Island' : 5}

* Create the lookup table.

In [8]:
metadata = MetaData()
metadata.reflect(bind=engine)

def create_lookup_table_simple(engine=engine, metadata=metadata, lookup_table_name='new_lookup_table', lookup_column_name='name'):
    lookup_table = Table(
        lookup_table_name,
        metadata,
        Column('id', Integer, primary_key=True, autoincrement=False),
        Column(lookup_column_name, String, unique=True, nullable=False, default="NO DATA"),
        extend_existing = True
    )
    if table_exists(engine, lookup_table_name):
        print("Table exists")
    else:
        lookup_table.create(engine)
    return lookup_table

* Populate the table with the borough codes.

In [9]:
borough_lookup_table = create_lookup_table_simple(engine=engine, metadata=metadata, lookup_table_name='boroughs', lookup_column_name='borough')

with engine.connect() as connection:
    for key,value in borough_codes.items():
        stmt = insert(borough_lookup_table).values(id = value, borough = key).on_conflict_do_nothing()
        connection.execute(stmt)
    connection.commit()

In [10]:
# This file uses some non-standard borough codes, so we need to replace them
replacement_dict = {"MN": 1, "BX": 2, "BN": 3, "QN": 4, "SI": 5}


gdb_path = f"{PROJECT_DATA}/files_to_use/MapPLUTO24v4.gdb"
layers = fiona.listlayers(gdb_path)
layer_dict = process_layers(gdb_path, layers)

In [11]:
for k in layer_dict.keys():
    layer_dict[k] = layer_dict[k]
    layer_dict[k].rename(columns=SPECIFIC_DATASET_CONFIGS['PLUTO']['cols_to_rename'], inplace=True)
    print(layer_dict[k].columns)
    layer_dict[k]['borough'] = layer_dict[k]['borough'].replace(replacement_dict)
    # Convert geometries to WKT
    if 'geometry' in layer_dict:
        layer_dict[k]['wkb'] = layer_dict[k]['geometry'].apply(lambda geom: geom.wkb if geom else None)

# Create ORM classes
orm_classes = create_orm_classes(layer_dict, Base, SPECIFIC_DATASET_CONFIGS['PLUTO']['dtype_exceptions'])

# Create tables in the database
Base.metadata.create_all(engine)

Index(['borough', 'Block', 'Lot', 'CD', 'BCT2020', 'BCTCB2020', 'CT2010',
       'CB2010', 'school_district', 'council_district', 'zip_code', 'FireComp',
       'police_precinct', 'HealthCenterDistrict', 'HealthArea', 'Sanitboro',
       'sanitation_district', 'sanitation_subdistrict', 'Address', 'ZoneDist1',
       'ZoneDist2', 'ZoneDist3', 'ZoneDist4', 'Overlay1', 'Overlay2',
       'SPDist1', 'SPDist2', 'SPDist3', 'LtdHeight', 'SplitZone',
       'building_class', 'LandUse', 'Easements', 'OwnerType', 'owner',
       'LotArea', 'BldgArea', 'ComArea', 'ResArea', 'OfficeArea', 'RetailArea',
       'GarageArea', 'StrgeArea', 'FactryArea', 'OtherArea', 'AreaSource',
       'NumBldgs', 'NumFloors', 'UnitsRes', 'UnitsTotal', 'LotFront',
       'LotDepth', 'BldgFront', 'BldgDepth', 'Ext', 'ProxCode', 'IrrLotCode',
       'LotType', 'BsmtCode', 'AssessLand', 'AssessTot', 'ExemptTot',
       'YearBuilt', 'YearAlter1', 'YearAlter2', 'historic_district',
       'Landmark', 'BuiltFAR', 'ResidFAR

In [12]:
with SessionLocal() as session:
    for layer in layer_dict.keys():
        write_layer_to_db(layer_dict, layer, orm_classes, session)

In [13]:
for name, dataset in datasets.items():
    if 'the_geom' in dataset.columns:
        print(dataset.name, dataset.columns)

2020 Neighborhood Tabulation Areas (NTAs) {'the_geom': 'multipolygon', 'BoroCode': 'number', 'BoroName': 'text', 'CountyFIPS': 'text', 'NTA2020': 'text', 'NTAName': 'text', 'NTAAbbrev': 'text', 'NTAType': 'text', 'CDTA2020': 'text', 'CDTAName': 'text', 'Shape_Leng': 'number', 'Shape_Area': 'number'}
2020 Census Blocks {'the_geom': 'multipolygon', 'CB2020': 'text', 'BoroCode': 'text', 'BoroName': 'text', 'CT2020': 'text', 'BCTCB2020': 'text', 'GEOID': 'text', 'Shape_Leng': 'number', 'Shape_Area': 'number'}
2020 Community District Tabulation Areas (CDTAs) {'the_geom': 'multipolygon', 'BoroCode': 'number', 'BoroName': 'text', 'CountyFIPS': 'text', 'CDTA2020': 'text', 'CDTAName': 'text', 'CDTAType': 'text', 'Shape_Leng': 'number', 'Shape_Area': 'number'}
2020 Public Use Microdata Areas (PUMAs) {'the_geom': 'multipolygon', 'PUMA': 'text', 'Shape_Length': 'number', 'Shape_Area': 'number'}


# **Create lookup tables**

## Get lot zoning info
* The main datasets used in this project all reference specific lots, so I thought it would be good to have the zoning for those lots.
* The most official source I could find for this was a pdf attached to the NYC Open Data dataset listing zoning for property lots in NYC.
* The extraction code is by necessity unique and unlikely to be reusable. If using this notebook as a template for future analyses, I would put here any tasks that are unique to the project and which I don't expect to be reusable.

* Download an additional data dictionary I found for another dataset that explains some of the codes used in PLUTO

In [14]:
url  = "https://data.cityofnewyork.us/api/views/fdkv-4t4z/files/997a4707-2e53-48bc-9652-3d69badca007?download=true&filename=zoningtaxlotdatabase_datadictionary"
filename = "zoning_definition_dict.pdf"

In [15]:
downloader(
            url=url,
            download_path=f"{PROJECT_DATA}/dictionaries/",
            outfile_name=filename,
            bigfile=False,
        )

[32mSuccess downloading [36mhttps://data.cityofnewyork.us/api/views/fdkv-4t4z/files/997a4707-2e53-48bc-9652-3d69badca007?download=true&filename=zoningtaxlotdatabase_datadictionary[0m


* Extract the tables from the zoning definition dictionary.

In [16]:
all_tables = parse_zoning_def_dict(f"{PROJECT_DATA}/dictionaries/{filename}")

* Create lookup table for zoning code definitions.

In [18]:
with engine.connect() as connection:
    for key in all_tables.keys():
        zoning_lookup_table = create_lookup_table(engine, lookup_table_name=key, text_column_name='code')
        for value in all_tables[key]:
            stmt = insert(zoning_lookup_table).values(code = value[0], info = value[1]).on_conflict_do_nothing()
            connection.execute(stmt)
        connection.commit()

Loading SpatiaLite extension


### Get explanations of zoning codes.
* I could only find this information in pdf form.
* I discovered how hard PDFs can be to parse.
* I had to do a lot of customization for just this specific pdf. I could have just manually cut and pasted the data from the pdf in the amount of time it took me to do that.
* I still think it was good to do for reproducibility reasons, but in the future I will try to avoid working with datasets that have important information only in PDF format.
* The following functions extract the tables from the pdf, detecting footnotes, and then subsitute the foonote number for the footnote text within the dataframe (so that it will end up as part of the relevant record in the databasee).

In [19]:
url = "https://www.nyc.gov/assets/bronxcb8/pdf/zoning_table_all.pdf"
filename = "zoning_table_all.pdf"  # Path to save the pdf containing the info we need

downloader(
            url=url,
            download_path=f"{PROJECT_DATA}/dictionaries/",
            outfile_name=filename,
            bigfile=False,
        )

[32mSuccess downloading [36mhttps://www.nyc.gov/assets/bronxcb8/pdf/zoning_table_all.pdf[0m


* Run the above functions to extract the data from the pdf.

In [20]:
tables_and_footnotes = parse_zoning_details(f"{PROJECT_DATA}/dictionaries/{filename}")

Assuming that <re.Match object; span=(1, 3), match='12'> represents two different footnotes separated by whitespaces. Also assuming only two footnotes
Assuming that <re.Match object; span=(0, 5), match='\n1\n5\n'> represents two columns of footnotes


In [21]:
# # Create a MetaData instance
# metadata = MetaData()
metadata.reflect(bind=engine)

# # Reflect the table
zoning_districts_lookup = Table("zoning_districts", metadata, autoload_with=engine)

for tablename in tables_and_footnotes.keys():
    print(tablename)
    df = tables_and_footnotes[tablename]['df']
    df.name = df.index.name
    with engine.connect() as conn:
        for series_name, series in df.items():
            tdf = pd.DataFrame(series)
            tdf.reset_index(inplace=True)
            jstring = pd.DataFrame(tdf).to_json()
            stmt = insert(zoning_districts_lookup).values(code=series_name, info=jstring).prefix_with("OR IGNORE")
            conn.execute(stmt)
            conn.commit()

ZONING DATA TABLE 1
ZONING DATA TABLE 2
ZONING DATA TABLE 3
ZONING DATA TABLE 4
ZONING DATA TABLE 5
ZONING DATA TABLE 6
ZONING DATA TABLE 7


### The PDF parsed above still has some definitions that are in text outside the tables. From `zoning_table_all.pdf`:

>C1-1 through C1-5 and C2-1 through C2-5 are commercial districts which are mapped as overlays within residential districts. When a commercial overlay is mapped within an R1 through R5 district, except an R5D district, the commercial FAR is 1.0; within an R5D district or an R6 through R10 district, the commercial FAR is 2.0. The residential FAR for a commercial overlay district is determined by the residential district regulations.

* I need to manually create the object to hold this information and put it in the database

In [22]:
more_zones = {}
info = "Commercial districts which are mapped as overlays within residential districts. When a commercial overlay is mapped within an R1 through R5 district, except an R5D district, the commercial FAR is 1.0; within an R5D district or an R6 through R10 district, the commercial FAR is 2.0. The residential FAR for a commercial overlay district is determined by the residential district regulations."
for i in range(1,6):
    more_zones[f'C1-{i}'] = info
    more_zones[f'C2-{i}'] = info

In [23]:
with engine.connect() as conn:
    for key in more_zones.keys():
        print(more_zones[key])
        stmt = insert(zoning_districts_lookup).values(code=key, info=more_zones[key]).prefix_with("OR IGNORE")
        conn.execute(stmt)
        conn.commit()

Commercial districts which are mapped as overlays within residential districts. When a commercial overlay is mapped within an R1 through R5 district, except an R5D district, the commercial FAR is 1.0; within an R5D district or an R6 through R10 district, the commercial FAR is 2.0. The residential FAR for a commercial overlay district is determined by the residential district regulations.
Commercial districts which are mapped as overlays within residential districts. When a commercial overlay is mapped within an R1 through R5 district, except an R5D district, the commercial FAR is 1.0; within an R5D district or an R6 through R10 district, the commercial FAR is 2.0. The residential FAR for a commercial overlay district is determined by the residential district regulations.
Commercial districts which are mapped as overlays within residential districts. When a commercial overlay is mapped within an R1 through R5 district, except an R5D district, the commercial FAR is 1.0; within an R5D dis

### Get a few more code meanings 
* From [NYC Department of Tax and Finance Data Dictionary](https://www.nyc.gov/assets/finance/downloads/tar/tarfieldcodes.pdf):
    * LandUse
    * OwnerType
    * Easment code
* Additional information about commercial zoning that I have not included can be [found here](https://www.nyc.gov/assets/planning/download/pdf/zoning/districts-tools/commercial_zoning_data_tables.pdf).
* Additional information about residential zoning that I have not included can be [found here](https://www.nyc.gov/assets/planning/download/pdf/zoning/districts-tools/residence_zoning_data_tables.pdf)

### Download and add to the database the zoning information for parcels of land in NYC.

In [24]:

dataset = "fdkv-4t4z"
url = f"https://data.cityofnewyork.us/resource/{dataset}.json" 
count = getDatasetRowCount(url)
metadata_url = f"https://data.cityofnewyork.us/api/views/{dataset}.json"
reader = codecs.getreader("utf-8")
metadata = reader(urlopen(metadata_url))
metadata = json.load(metadata)


In [29]:
title = "NYC Zoning Tax Lot Database"
tabname = re.sub(' ', '_', title)
datatype_mappings = SHARED_DATASET_CONFIGS['dtype_mappings']

column_info =  [{'fieldName': entry['fieldName'], 'dataType' : datatype_mappings.get(entry['dataTypeName'])} for entry in metadata['columns']]

allvals = []

for info in column_info:
    if info['dataType'] is String:
        url = f'https://data.cityofnewyork.us/resource/{dataset}.json?$select=distinct({info['fieldName']})'
        unique_vals = json.load(reader(urlopen(url)))
        allvals.append(unique_vals)

vals_list = [val for val in allvals if len(val) > 0]

newlist = []

for vals_list in allvals:
    vals_list = [val for val in vals_list if len(val) > 0]
    if len(vals_list) > 0:
        vals_list = {list(vals_list[0].keys())[0]: [d[list(vals_list[0].keys())[0]] for d in vals_list  if len(d) > 0]}
        newlist.append(vals_list)

KeyError: 'dtype_mappings'

* Add to the database. I had to set journling mode to WAL to avoid concurrency issues, merely waiting between commits was not sufficient.

In [None]:
with engine.connect() as conn:
    conn.execute(text("PRAGMA journal_mode=WAL"))

print(dataset_names)
for name in dataset_names:
    preprocess_dataset(engine, shared_dataset_configs, specific_dataset_configs[name], name)
