# Create and populate the database

In [66]:
# # Import standard libraries
# import os
# import re
import json
import codecs
# import requests
import time
import dill
time.sleep(3)

from urllib.request import urlopen

# # Import third-party libraries
# import geopandas as gpd
# from geoalchemy2 import Geometry
# import pandas as pd
from sqlalchemy import create_engine, Column, Integer, String, Date, MetaData, event, Table, text, LargeBinary, ForeignKey
from sqlalchemy.dialects.sqlite import insert
from sqlalchemy.orm import sessionmaker
# from sqlalchemy.event import listen
# from sqlalchemy.engine import Engine
# from sqlalchemy.ext.declarative import declarative_base

# import sqlite3
# from sqlite3 import dbapi2 as sqlite

# import fiona
# from fiona.crs import from_epsg

from src.helpers import *
from src.dbutils import *
from src.ORMutils import *
from src.models import *
from src.geo import *
from src.pdfutils import *

* #### Load the objects created in previous notebooks

In [67]:
# Load the environment
with open("environment_data/table_dicts.pkl", "rb") as f:
    env = dill.load(f)

# Restore the environment
globals().update(env)

In [68]:
# Load the environment
with open("environment_data/select.pkl", "rb") as f:
    env = dill.load(f)

# Restore the environment
globals().update(env)

In [69]:
for name, dataset in datasets.items():
    print(dataset.short_name)

mapPLUTO
assessments
tax_liens
housing_violations
assessment_actions
housing_database
NTAs2020
NTA_population_2020
NTA_demographics_2020
census_blocks2020
CDTAs2020
puma2020
cert_of_occupancy


* ### Create the database engine that will be used throughout the rest of the notebook.

In [70]:
engine = create_engine(f'{SQLITE_PATH}?check_same_thread=False', echo=False)

SessionLocal = sessionmaker(bind=engine, autoflush=False, autocommit=False)

* ### Configure the database

In [71]:
@event.listens_for(engine, "connect")
def load_spatialite(dbapi_conn, connection_record):
    print("Loading SpatiaLite extension")
    dbapi_conn.enable_load_extension(True)
    dbapi_conn.load_extension("mod_spatialite")
    dbapi_conn.enable_load_extension(False)


with engine.connect() as conn:
    print("Connection established")
    result = conn.execute(text("SELECT spatialite_version()"))
    spatialite_version = result.fetchone()
    print(f"SpatiaLite version: {spatialite_version[0]}")

# Enable WAL mode
with engine.connect() as conn:
    conn.execute(text("PRAGMA journal_mode=WAL"))

# Initialize spatial metadata if not already present
with engine.connect() as conn:
    conn.execute(text("SELECT InitSpatialMetaData(1)"))

Loading SpatiaLite extension
Connection established
SpatiaLite version: 5.1.0


In [72]:
Base = declarative_base()
metadata = MetaData()
Base.metadata.reflect(bind=engine) 

  Base = declarative_base()


* ### Create lookup tables variables identified as categorical and for which definitions were extracted from the metadata in the previous notebook.

* There are borough codes in the PLUTO dataset, but annyoingly, in contrast to most other datasets, the borough code is a two letter inital like "BK" or "BX". Also in the PLUTO dataset, "Sanitation Borough" does use the standard numeric codes that most other NYC OpenData datasets use. All this is is to say that it requires special handling separate from my system to extract categories and create lookup tables for them programatically.

In [73]:
lookups = {k:v for k,v in col_customization_dict.items() if col_customization_dict[k].is_category == True and 'overlay' not in col_customization_dict[k].new_name} # Codes for overlays are to go in the same table as other zoning codes

completed_tables = [] # This is for tracking the names of tables that have been created, which will be used to avoid creating redundant tables for columns that are same-kind repeats (such as "district_1" and "district_2"), and thus will use the same lookups.

lookup_tables = {}

for name,table in lookups.items():
    print(f"processing {table}")
    lookup_table_name= re.sub('_[0-9]+$', '', table.new_name)
    if any([table.new_name.startswith(prefix) and table.new_name[-1].isdigit() for prefix in completed_tables]):
    # if table.new_name[0:len(table)*75] in completed_tables:
        print(f"Lookup table {lookup_table_name} already created, continuing...")
        continue
    with engine.connect() as connection:
        print(f"Creating lookup table {lookup_table_name}...")
        # lookup_table = create_lookup_table(engine=engine, lookup_table_name=lookup_table_name, text_column_name='name_or_code')
        lookup_table = create_lookup_table(Base.metadata, lookup_table_name=lookup_table_name, text_column_name='name_or_code')
        # name_prefix = table.new_name[0:round(len(table.new_name)*.75)] # Hopefully this is a safe threshold to identify when columns are repeats of the same type
        name_prefix = lookup_table_name
        completed_tables.append(name_prefix)
        lookup_tables[lookup_table_name] = lookup_table

# metadata.create_all(engine)
Base.metadata.create_all(engine)



processing ColCustomization(short_name='Borough', new_name='borough', dtype='String', synonyms=[], definitions=[['BX', 'Bronx'], ['BK', 'Brooklyn'], ['MN', 'Manhattan'], ['QN', 'Queens'], ['SI', 'Staten Island']], drop=False, is_category=True, is_fk=False)
Creating lookup table borough...
processing ColCustomization(short_name='CD', new_name='community_district', dtype='Integer', synonyms=[], definitions={}, drop=False, is_category=True, is_fk=False)
Creating lookup table community_district...
processing ColCustomization(short_name='SchoolDist', new_name='school_district', dtype='String', synonyms=[], definitions={}, drop=False, is_category=True, is_fk=False)
Creating lookup table school_district...
processing ColCustomization(short_name='Council', new_name='city_council_district', dtype='Integer', synonyms=[], definitions={}, drop=False, is_category=True, is_fk=False)
Creating lookup table city_council_district...
processing ColCustomization(short_name='ZipCode', new_name='zip_code', 

In [74]:
completed_tables

['borough',
 'community_district',
 'school_district',
 'city_council_district',
 'zip_code',
 'fire_company',
 'police_precinct',
 'health_center_district',
 'health_area',
 'sanitation_district_boro',
 'sanitation_district_number',
 'zoning_district',
 'special_purpose_district',
 'limited_height_district',
 'building_class',
 'land_use_category',
 'type_of_ownership_code',
 'owner_name',
 'total_building_floor_area_source_code',
 'extension_code',
 'proximity_code',
 'irregular_lot_code',
 'lot_type',
 'basement_type_or_grade',
 'historic_district_name',
 'borough_code',
 'borough_tax_block_and_lot',
 'zoning_map_code',
 'pluto_dtm_base_map_indicator',
 'notes']

In [75]:
for name,table in lookups.items():
    with engine.connect() as connection:
        for definition in table.definitions:
            if len(definition) == 2:
                try:
                    stmt = insert(lookup_table).values(id=int(definition[0]), name_or_code=definition[1]).on_conflict_do_nothing()
                except ValueError:
                    stmt = insert(lookup_table).values(name_or_code=definition[0], info=definition[1]).on_conflict_do_nothing()
            elif len(definition) == 3:
                try:
                    stmt = insert(lookup_table).values(id=int(definition[0]), name_or_code=definition[1], info=definition[2]).on_conflict_do_nothing()
                except Exception as e:
                    print(e)
                    print(definition)
                    # stmt = insert(lookup_table).values(id=definition[0], name_or_code=definition[1], info=definition[2]).on_conflict_do_nothing()
            else:
                print(definition)
                raise ValueError("Was only expecting two or three columns")
            connection.execute(stmt)
        connection.commit()
    name_prefix = table.new_name[0:round(len(table.new_name)*.75)] # Hopefully this is a safe threshold to identify when columns are repeats of the same type
    completed_tables.append(name_prefix)

In [76]:
# lookups = {k:v for k,v in col_customization_dict.items() if col_customization_dict[k].definitions}

# completed_tables = [] # This is for tracking the names of tables that have been created, which will be used to avoid creating redundant tables for columns that are same-kind repeats (such as "district_1" and "district_2"), and thus will use the same lookups.

# for name,table in lookups.items():
#     lookup_table_name= re.sub('_[0-9]+$', '', table.new_name)
#     if any([table.new_name.startswith(prefix) for prefix in completed_tables]):
#     # if table.new_name[0:len(table)*75] in completed_tables:
#         print("Lookup table already created, continuing...")
#         continue
#     with engine.connect() as connection:
#         lookup_table = create_lookup_table(engine=engine, lookup_table_name=lookup_table_name, text_column_name='name_or_code')
#         if lookup_table is None:
#             continue
#         for definition in table.definitions:
#             if len(definition) == 2:
#                 try:
#                     stmt = insert(lookup_table).values(id=int(definition[0]), name_or_code=definition[1]).on_conflict_do_nothing()
#                 except ValueError:
#                     stmt = insert(lookup_table).values(name_or_code=definition[0], info=definition[1]).on_conflict_do_nothing()
#             elif len(definition) == 3:
#                 try:
#                     stmt = insert(lookup_table).values(id=int(definition[0]), name_or_code=definition[1], info=definition[2]).on_conflict_do_nothing()
#                 except Exception as e:
#                     print(e)
#                     print(definition)
#                     # stmt = insert(lookup_table).values(id=definition[0], name_or_code=definition[1], info=definition[2]).on_conflict_do_nothing()
#             else:
#                 print(definition)
#                 raise ValueError("Was only expecting two or three columns")
#             connection.execute(stmt)
#         connection.commit()
#     name_prefix = table.new_name[0:round(len(table.new_name)*.75)] # Hopefully this is a safe threshold to identify when columns are repeats of the same type
#     completed_tables.append(name_prefix)


### Get explanations of zoning codes.
* I could only find this information in pdf form.
* I discovered how hard PDFs can be to parse.
* I had to do a lot of customization for just this specific pdf. I could have just manually cut and pasted the data from the pdf in the amount of time it took me to do that.
* I still think it was good to do for reproducibility reasons, but in the future I will try to avoid working with datasets that have important information only in PDF format.
* The following functions extract the tables from the pdf, detecting footnotes, and then subsitute the foonote number for the footnote text within the dataframe (so that it will end up as part of the relevant record in the databasee).

In [77]:
col_customization_dict

{'Borough': ColCustomization(short_name='Borough', new_name='borough', dtype='String', synonyms=[], definitions=[['BX', 'Bronx'], ['BK', 'Brooklyn'], ['MN', 'Manhattan'], ['QN', 'Queens'], ['SI', 'Staten Island']], drop=False, is_category=True, is_fk=False),
 'Block': ColCustomization(short_name='Block', new_name='tax_block', dtype='Float', synonyms=[], definitions={}, drop=False, is_category=False, is_fk=False),
 'Lot': ColCustomization(short_name='Lot', new_name='tax_lot', dtype='Float', synonyms=[], definitions={}, drop=False, is_category=False, is_fk=False),
 'CD': ColCustomization(short_name='CD', new_name='community_district', dtype='Integer', synonyms=[], definitions={}, drop=False, is_category=True, is_fk=False),
 'BCT2020': ColCustomization(short_name='BCT2020', new_name='census_tract_2020', dtype='String', synonyms=[], definitions={}, drop=False, is_category=False, is_fk=False),
 'BCTCB2020': ColCustomization(short_name='BCTCB2020', new_name='census_block_2020', dtype='String

In [78]:
url = "https://www.nyc.gov/assets/bronxcb8/pdf/zoning_table_all.pdf"
filename = "zoning_table_all.pdf"  # Path to save the pdf containing the info we need

downloader(
            url=url,
            download_path=f"{PROJECT_DATA}/dictionaries/",
            outfile_name=filename,
            bigfile=False,
        )

[32mSuccess downloading [36mhttps://www.nyc.gov/assets/bronxcb8/pdf/zoning_table_all.pdf[0m


* Run the above functions to extract the data from the pdf.

In [79]:
tables_and_footnotes = parse_zoning_details(f"{PROJECT_DATA}/dictionaries/{filename}")

Assuming that <re.Match object; span=(1, 3), match='12'> represents two different footnotes separated by whitespaces. Also assuming only two footnotes
Assuming that <re.Match object; span=(0, 5), match='\n1\n5\n'> represents two columns of footnotes


In [80]:
lookup_tables.keys()

dict_keys(['borough', 'community_district', 'school_district', 'city_council_district', 'zip_code', 'fire_company', 'police_precinct', 'health_center_district', 'health_area', 'sanitation_district_boro', 'sanitation_district_number', 'zoning_district', 'special_purpose_district', 'limited_height_district', 'building_class', 'land_use_category', 'type_of_ownership_code', 'owner_name', 'total_building_floor_area_source_code', 'extension_code', 'proximity_code', 'irregular_lot_code', 'lot_type', 'basement_type_or_grade', 'historic_district_name', 'borough_code', 'borough_tax_block_and_lot', 'zoning_map_code', 'pluto_dtm_base_map_indicator', 'notes'])

In [81]:
import time
from sqlalchemy.exc import OperationalError

def populate_lookup_table(engine, lookup_table, source_table_name, lookup_table_name, text_column_name, chunk_size=100, max_retries=5):
    """
    Populate a lookup table in chunks with retries for database lock issues.
    """
    def retry(func, *args, **kwargs):
        """Retry function with backoff for SQLite locks."""
        for attempt in range(max_retries):
            try:
                return func(*args, **kwargs)
            except OperationalError as e:
                if "database is locked" in str(e):
                    print(f"Database is locked. Retrying ({attempt + 1}/{max_retries})...")
                    time.sleep(0.2 * (attempt + 1))  # Gradual backoff
                else:
                    raise
        raise Exception("Exceeded maximum retries due to database locks.")
    
    with engine.connect() as connection:
        # Ensure the new column exists
        try:
            retry(connection.execute, text(f"ALTER TABLE {source_table_name} ADD COLUMN {text_column_name}_id INTEGER"))
        except Exception as e:
            print(f"Column creation skipped or failed: {e}")

        # Process unique values in chunks
        unique_query = f"SELECT DISTINCT {text_column_name} FROM {source_table_name}"
        unique_values_iter = pd.read_sql(unique_query, engine, chunksize=chunk_size)
        
        for chunk in unique_values_iter:
            unique_values = chunk[text_column_name].dropna().tolist()

            # Insert into the lookup table in small batches
            for value in unique_values:
                stmt = insert(lookup_table).values({text_column_name: value}).on_conflict_do_nothing()
                try:
                    retry(connection.execute, stmt)
                except Exception as e:
                    print(f"Error inserting value '{value}': {e}")

        # Update the source table with foreign key references
        update_stmt = text(f"""
        UPDATE {source_table_name}
        SET {text_column_name}_id = (
            SELECT id 
            FROM {lookup_table_name}
            WHERE {text_column_name} = {source_table_name}.{text_column_name}
        )
        """)
        try:
            retry(connection.execute, update_stmt)
        except Exception as e:
            print(f"Error updating foreign keys: {e}")
        connection.commit()
        # Remove the original text column (optional)
        connection.execute(text(f"ALTER TABLE {source_table_name} DROP COLUMN {text_column_name}"))
        connection.commit()


In [82]:
# # Create a MetaData instance
# metadata = MetaData()
metadata.reflect(bind=engine)

# zoning_district_lookup = create_lookup_table(engine, "zoning_district", "code")
# # Reflect the table
# zoning_district_lookup = Table("zoning_district", metadata, autoload_with=engine)

for tablename in tables_and_footnotes.keys():
    print(tablename)
    df = tables_and_footnotes[tablename]['df']
    df.name = df.index.name
    with engine.connect() as conn:
        for series_name, series in df.items():
            tdf = pd.DataFrame(series)
            tdf.reset_index(inplace=True)
            jstring = pd.DataFrame(tdf).to_json()
            stmt = insert(lookup_tables['zoning_district']).values(name_or_code=series_name, info=jstring).prefix_with("OR IGNORE")
            conn.execute(stmt)
            conn.commit()

ZONING DATA TABLE 1
ZONING DATA TABLE 2
ZONING DATA TABLE 3
ZONING DATA TABLE 4
ZONING DATA TABLE 5
ZONING DATA TABLE 6
ZONING DATA TABLE 7


### The PDF parsed above still has some definitions that are in text outside the tables. From `zoning_table_all.pdf`:

>C1-1 through C1-5 and C2-1 through C2-5 are commercial districts which are mapped as overlays within residential districts. When a commercial overlay is mapped within an R1 through R5 district, except an R5D district, the commercial FAR is 1.0; within an R5D district or an R6 through R10 district, the commercial FAR is 2.0. The residential FAR for a commercial overlay district is determined by the residential district regulations.

* I need to manually create the object to hold this information and put it in the database

In [83]:
more_zones = {}
info = "Commercial districts which are mapped as overlays within residential districts. When a commercial overlay is mapped within an R1 through R5 district, except an R5D district, the commercial FAR is 1.0; within an R5D district or an R6 through R10 district, the commercial FAR is 2.0. The residential FAR for a commercial overlay district is determined by the residential district regulations."
for i in range(1,6):
    more_zones[f'C1-{i}'] = info
    more_zones[f'C2-{i}'] = info

In [84]:
with engine.connect() as conn:
    for key in more_zones.keys():
        print(more_zones[key])
        stmt = insert(lookup_tables['zoning_district']).values(name_or_code=key, info=more_zones[key]).prefix_with("OR IGNORE")
        conn.execute(stmt)
        conn.commit()

Commercial districts which are mapped as overlays within residential districts. When a commercial overlay is mapped within an R1 through R5 district, except an R5D district, the commercial FAR is 1.0; within an R5D district or an R6 through R10 district, the commercial FAR is 2.0. The residential FAR for a commercial overlay district is determined by the residential district regulations.
Commercial districts which are mapped as overlays within residential districts. When a commercial overlay is mapped within an R1 through R5 district, except an R5D district, the commercial FAR is 1.0; within an R5D district or an R6 through R10 district, the commercial FAR is 2.0. The residential FAR for a commercial overlay district is determined by the residential district regulations.
Commercial districts which are mapped as overlays within residential districts. When a commercial overlay is mapped within an R1 through R5 district, except an R5D district, the commercial FAR is 1.0; within an R5D dis

### Get a few more code meanings 
* From [NYC Department of Tax and Finance Data Dictionary](https://www.nyc.gov/assets/finance/downloads/tar/tarfieldcodes.pdf):
    * LandUse
    * OwnerType
    * Easment code
* Additional information about commercial zoning that I have not included can be [found here](https://www.nyc.gov/assets/planning/download/pdf/zoning/districts-tools/commercial_zoning_data_tables.pdf).
* Additional information about residential zoning that I have not included can be [found here](https://www.nyc.gov/assets/planning/download/pdf/zoning/districts-tools/residence_zoning_data_tables.pdf)

## Get the meanings of the building classification codes from the City of New York website.

In [85]:
import urllib.request #, urllib.parse, urllib.error
from bs4 import BeautifulSoup

webpage = "https://www.nyc.gov/assets/finance/jump/hlpbldgcode.html"

def get_table_rows(url):
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser')
    return soup('tr')


trs = get_table_rows(webpage)

class_codes = []
d = None
for tr in trs:    
    # Check if 'a' with 'name' exists
    a = tr.find('a', attrs={'name': True})
    if a:
        if d:
            class_codes.append(d)
        supercategory = tr.find_all('th')[1].text.capitalize()
        d = {"supercategory": supercategory}
    
    # Check if 'td' exists and update 'd'
    cells = tr.find_all('td')
    if cells:
        d = {}
        code, name = cells[:2]
        d['code'] = code.text.strip()
        d['name'] = name.text.capitalize().strip()
        class_codes.append(d)


In [86]:
class_codes

[{'code': 'A0', 'name': 'Cape cod'},
 {'code': 'A1', 'name': 'Two stories - detached sm or mid'},
 {'code': 'A2', 'name': 'One story - permanent living quarter'},
 {'code': 'A3', 'name': 'Large suburban residence'},
 {'code': 'A4', 'name': 'City residence one family'},
 {'code': 'A5', 'name': 'One family attached or semi-detached'},
 {'code': 'A6', 'name': 'Summer cottage'},
 {'code': 'A7', 'name': 'Mansion type or town house'},
 {'code': 'A8', 'name': 'Bungalow colony - cooperatively owned land'},
 {'code': 'A9', 'name': 'Miscellaneous one family'},
 {'code': 'A9', 'name': 'Miscellaneous one family'},
 {'code': 'B1', 'name': 'Two family brick'},
 {'code': 'B2', 'name': 'Two family frame'},
 {'code': 'B3', 'name': 'Two family converted from one family'},
 {'code': 'B9', 'name': 'Miscellaneous two family'},
 {'code': 'B9', 'name': 'Miscellaneous two family'},
 {'code': 'C0', 'name': 'Three families'},
 {'code': 'C1', 'name': 'Over six families without stores'},
 {'code': 'C2', 'name': '

In [87]:
with engine.connect() as conn:
    for row in class_codes:
        print(row)
        stmt = insert(lookup_tables['building_class']).values(name_or_code=row['code'], info=row['name']).prefix_with("OR IGNORE")
        conn.execute(stmt)
        conn.commit()

{'code': 'A0', 'name': 'Cape cod'}
{'code': 'A1', 'name': 'Two stories - detached sm or mid'}
{'code': 'A2', 'name': 'One story - permanent living quarter'}
{'code': 'A3', 'name': 'Large suburban residence'}
{'code': 'A4', 'name': 'City residence one family'}
{'code': 'A5', 'name': 'One family attached or semi-detached'}
{'code': 'A6', 'name': 'Summer cottage'}
{'code': 'A7', 'name': 'Mansion type or town house'}
{'code': 'A8', 'name': 'Bungalow colony - cooperatively owned land'}
{'code': 'A9', 'name': 'Miscellaneous one family'}
{'code': 'A9', 'name': 'Miscellaneous one family'}
{'code': 'B1', 'name': 'Two family brick'}
{'code': 'B2', 'name': 'Two family frame'}
{'code': 'B3', 'name': 'Two family converted from one family'}
{'code': 'B9', 'name': 'Miscellaneous two family'}
{'code': 'B9', 'name': 'Miscellaneous two family'}
{'code': 'C0', 'name': 'Three families'}
{'code': 'C1', 'name': 'Over six families without stores'}
{'code': 'C2', 'name': 'Five to six families'}
{'code': 'C3',

## Import the MaPLUTO data:
* List the layers in the file
* In this case there is only one layer, so it isn't necessary to know and specify which one to import, but including anyway for future reference.

In [88]:
# Import the MapPLUTO data from geo database file (.gdb)
gdb_path = f"{PROJECT_DATA}/files_to_use/MapPLUTO24v4.gdb"


* Import the geodatabase (.gdb) file.

In [89]:

geodata = {}
# List layers in the GDB file
layers = fiona.listlayers(gdb_path)
print("Layers in the GDB file:")
for layer in layers:
    print(layer)
    gdf = gpd.read_file(gdb_path, layer=layer)
    # gdf['borough'] = gdf['Borough'].replace(replacement_dict)
    try:
        gdf['wkb'] = gdf['geometry'].apply(lambda geom: geom.wkb if geom else None)
    except KeyError:
        pass
    geodata[layer] = gdf


Layers in the GDB file:
MapPLUTO_24v4_clipped
NOT_MAPPED_LOTS


In [90]:
geodata.keys()


dict_keys(['MapPLUTO_24v4_clipped', 'NOT_MAPPED_LOTS'])

In [91]:
col_customization_dict

{'Borough': ColCustomization(short_name='Borough', new_name='borough', dtype='String', synonyms=[], definitions=[['BX', 'Bronx'], ['BK', 'Brooklyn'], ['MN', 'Manhattan'], ['QN', 'Queens'], ['SI', 'Staten Island']], drop=False, is_category=True, is_fk=False),
 'Block': ColCustomization(short_name='Block', new_name='tax_block', dtype='Float', synonyms=[], definitions={}, drop=False, is_category=False, is_fk=False),
 'Lot': ColCustomization(short_name='Lot', new_name='tax_lot', dtype='Float', synonyms=[], definitions={}, drop=False, is_category=False, is_fk=False),
 'CD': ColCustomization(short_name='CD', new_name='community_district', dtype='Integer', synonyms=[], definitions={}, drop=False, is_category=True, is_fk=False),
 'BCT2020': ColCustomization(short_name='BCT2020', new_name='census_tract_2020', dtype='String', synonyms=[], definitions={}, drop=False, is_category=False, is_fk=False),
 'BCTCB2020': ColCustomization(short_name='BCTCB2020', new_name='census_block_2020', dtype='String

* Create the table in the Sqlite database and insert the (modified) data from the gdb file.

In [92]:
gdf = geodata['MapPLUTO_24v4_clipped']
gdf.columns

Index(['Borough', 'Block', 'Lot', 'CD', 'BCT2020', 'BCTCB2020', 'CT2010',
       'CB2010', 'SchoolDist', 'Council', 'ZipCode', 'FireComp', 'PolicePrct',
       'HealthCenterDistrict', 'HealthArea', 'Sanitboro', 'SanitDistrict',
       'SanitSub', 'Address', 'ZoneDist1', 'ZoneDist2', 'ZoneDist3',
       'ZoneDist4', 'Overlay1', 'Overlay2', 'SPDist1', 'SPDist2', 'SPDist3',
       'LtdHeight', 'SplitZone', 'BldgClass', 'LandUse', 'Easements',
       'OwnerType', 'OwnerName', 'LotArea', 'BldgArea', 'ComArea', 'ResArea',
       'OfficeArea', 'RetailArea', 'GarageArea', 'StrgeArea', 'FactryArea',
       'OtherArea', 'AreaSource', 'NumBldgs', 'NumFloors', 'UnitsRes',
       'UnitsTotal', 'LotFront', 'LotDepth', 'BldgFront', 'BldgDepth', 'Ext',
       'ProxCode', 'IrrLotCode', 'LotType', 'BsmtCode', 'AssessLand',
       'AssessTot', 'ExemptTot', 'YearBuilt', 'YearAlter1', 'YearAlter2',
       'HistDist', 'Landmark', 'BuiltFAR', 'ResidFAR', 'CommFAR', 'FacilFAR',
       'BoroCode', 'BBL', 'Cond

In [93]:
[col for col in gdf.columns if col not in col_customization_dict.keys()]

['HealthCenterDistrict',
 'Sanitboro',
 'SanitDistrict',
 'FIRM07_FLAG',
 'PFIRM15_FLAG',
 'Shape_Leng',
 'Shape_Area',
 'geometry',
 'wkb']

In [94]:
from sqlalchemy import inspect
inspector = inspect(engine)
print(inspector.get_table_names())  # Ensure "basement_type_or_grade_lookup" is listed


['ElementaryGeometries', 'KNN2', 'SpatialIndex', 'basement_type_or_grade_lookup', 'borough_code_lookup', 'borough_lookup', 'borough_tax_block_and_lot_lookup', 'building_class_lookup', 'city_council_district_lookup', 'community_district_lookup', 'data_licenses', 'extension_code_lookup', 'fire_company_lookup', 'geometry_columns', 'geometry_columns_auth', 'geometry_columns_field_infos', 'geometry_columns_statistics', 'geometry_columns_time', 'health_area_lookup', 'health_center_district_lookup', 'historic_district_name_lookup', 'irregular_lot_code_lookup', 'land_use_category_lookup', 'limited_height_district_lookup', 'lot_type_lookup', 'notes_lookup', 'owner_name_lookup', 'pluto_dtm_base_map_indicator_lookup', 'police_precinct_lookup', 'proximity_code_lookup', 'sanitation_district_boro_lookup', 'sanitation_district_number_lookup', 'school_district_lookup', 'spatial_ref_sys', 'spatial_ref_sys_aux', 'spatialite_history', 'special_purpose_district_lookup', 'sql_statements_log', 'total_buildi

In [95]:
lookup_tables

{'borough': Table('borough_lookup', MetaData(), Column('id', Integer(), table=<borough_lookup>, primary_key=True, nullable=False), Column('name_or_code', String(), table=<borough_lookup>, nullable=False, default=ScalarElementColumnDefault('NO DATA')), Column('info', String(), table=<borough_lookup>), schema=None),
 'community_district': Table('community_district_lookup', MetaData(), Column('id', Integer(), table=<community_district_lookup>, primary_key=True, nullable=False), Column('name_or_code', String(), table=<community_district_lookup>, nullable=False, default=ScalarElementColumnDefault('NO DATA')), Column('info', String(), table=<community_district_lookup>), schema=None),
 'school_district': Table('school_district_lookup', MetaData(), Column('id', Integer(), table=<school_district_lookup>, primary_key=True, nullable=False), Column('name_or_code', String(), table=<school_district_lookup>, nullable=False, default=ScalarElementColumnDefault('NO DATA')), Column('info', String(), tabl

In [96]:
rename_mappings = {v.short_name: v.new_name for v in col_customization_dict.values()}
rename_mappings

{'Borough': 'borough',
 'Block': 'tax_block',
 'Lot': 'tax_lot',
 'CD': 'community_district',
 'BCT2020': 'census_tract_2020',
 'BCTCB2020': 'census_block_2020',
 'CT2010': 'census_tract_2010',
 'CB2010': 'census_block_2010',
 'SchoolDist': 'school_district',
 'Council': 'city_council_district',
 'ZipCode': 'zip_code',
 'FireComp': 'fire_company',
 'PolicePrct': 'police_precinct',
 'HealthCenterDist': 'health_center_district',
 'HealthArea': 'health_area',
 'SanitBoro': 'sanitation_district_boro',
 'SanitDist': 'sanitation_district_number',
 'SanitSub': 'sanitation_subsection',
 'Address': 'address',
 'ZoneDist1': 'zoning_district_1',
 'ZoneDist2': 'zoning_district_2',
 'ZoneDist3': 'zoning_district_3',
 'ZoneDist4': 'zoning_district_4',
 'Overlay1': 'commercial_overlay_1',
 'Overlay2': 'commercial_overlay_2',
 'SPDist1': 'special_purpose_district_1',
 'SPDist2': 'special_purpose_district_2',
 'SPDist3': 'special_purpose_district_3',
 'LtdHeight': 'limited_height_district',
 'SplitZone

In [97]:
gdf = gdf.rename(columns=rename_mappings)

In [98]:
print(gdf.columns)

Index(['borough', 'tax_block', 'tax_lot', 'community_district',
       'census_tract_2020', 'census_block_2020', 'census_tract_2010',
       'census_block_2010', 'school_district', 'city_council_district',
       'zip_code', 'fire_company', 'police_precinct', 'HealthCenterDistrict',
       'health_area', 'Sanitboro', 'SanitDistrict', 'sanitation_subsection',
       'address', 'zoning_district_1', 'zoning_district_2',
       'zoning_district_3', 'zoning_district_4', 'commercial_overlay_1',
       'commercial_overlay_2', 'special_purpose_district_1',
       'special_purpose_district_2', 'special_purpose_district_3',
       'limited_height_district', 'split_boundary_indicator', 'building_class',
       'land_use_category', 'number_of_easements', 'type_of_ownership_code',
       'owner_name', 'lot_area', 'total_building_floor_area',
       'commercial_floor_area', 'residential_floor_area', 'office_floor_area',
       'retail_floor_area', 'garage_floor_area', 'storage_floor_area',
       'f

In [99]:
# A few of the column names did not exactly match up due to slightly different field names than specified in the data dictionary, so these need to be renamed manually:

more_mappings = {
    "HealthCenterDistrict": "health_center_district",
    "SanitDistrict": "sanitation_district_number",
    "Sanitboro": "sanitation_district_boro",
    "FIRM07_FLAG": "2007_flood_insurance_rate_map_indicator",
    "PFIRM15_FLAG": "2015_preliminary_flood_insurance_rate_map",
}
gdf = gdf.rename(columns=more_mappings)

In [100]:
print(gdf.columns)

Index(['borough', 'tax_block', 'tax_lot', 'community_district',
       'census_tract_2020', 'census_block_2020', 'census_tract_2010',
       'census_block_2010', 'school_district', 'city_council_district',
       'zip_code', 'fire_company', 'police_precinct', 'health_center_district',
       'health_area', 'sanitation_district_boro', 'sanitation_district_number',
       'sanitation_subsection', 'address', 'zoning_district_1',
       'zoning_district_2', 'zoning_district_3', 'zoning_district_4',
       'commercial_overlay_1', 'commercial_overlay_2',
       'special_purpose_district_1', 'special_purpose_district_2',
       'special_purpose_district_3', 'limited_height_district',
       'split_boundary_indicator', 'building_class', 'land_use_category',
       'number_of_easements', 'type_of_ownership_code', 'owner_name',
       'lot_area', 'total_building_floor_area', 'commercial_floor_area',
       'residential_floor_area', 'office_floor_area', 'retail_floor_area',
       'garage_floor_a

In [101]:
[col for col in gdf.columns if col not in [i.new_name for i in col_customization_dict.values()]]

['Shape_Leng', 'Shape_Area', 'geometry', 'wkb']

In [102]:
from sqlalchemy import Table, MetaData, Column, Integer, String, ForeignKey, LargeBinary, Float, Date
from sqlalchemy.orm import declarative_base

# Base = declarative_base()
# metadata = MetaData()

# Reflect the existing database tables once
metadata.reflect(bind=engine)

# Function to map custom dtype to SQLAlchemy types
def map_custom_dtype(dtype):
    if dtype == 'Integer':
        return Integer
    elif dtype == 'String':
        return String
    elif dtype == 'Float':
        return Float
    elif dtype == 'Date':
        return Date
    elif dtype == 'LargeBinary':
        return LargeBinary
    else:
        raise ValueError(f"Unsupported dtype: {dtype}")

# Function to dynamically create the table class
def create_dynamic_table_class(table_name, col_customization_dict):
    attrs = {
        '__tablename__': table_name,
        'id': Column(Integer, primary_key=True, autoincrement=True),
        'geometry': Column(String),  
        'wkb': Column(LargeBinary),  # Use LargeBinary for WKB
        'Shape_Leng' : Column(Float), # Add columns not listed in the data dictionary
        'Shape_Area' : Column(Float),
    }
    
    for k, v in col_customization_dict.items():
        col_type = map_custom_dtype(v.dtype)
        if v.is_fk:
            attrs[v.new_name] = Column(col_type, ForeignKey(f'{v.new_name}_lookup.id'))
        else:
            attrs[v.new_name] = Column(col_type)
    
    return type(table_name, (Base,), attrs)

# Create the MapPLUTO_24v4_clipped table class
MapPLUTO24v4Clipped = create_dynamic_table_class('MapPLUTO_24v4_clipped', col_customization_dict)

# Reflect the metadata again to ensure it includes the new table class
metadata.reflect(bind=engine)

# Create all tables in the database
Base.metadata.create_all(engine)

# from sqlalchemy import Table, MetaData, Column, Integer, String, ForeignKey, LargeBinary
# from sqlalchemy.orm import declarative_base

# Base = declarative_base()
# metadata = MetaData()
# metadata.reflect(bind=engine)

# # Create the table dynamically
# def create_dynamic_table(table_name, col_customization_dict):
#     columns = [Column('id', Integer, primary_key=True, autoincrement=True)]
#     for _, col_info in col_customization_dict.items():
#         col_name = col_info.new_name
#         col_type = getattr(sqlalchemy, col_info.dtype)
#         if col_info.is_fk:
#             columns.append(Column(col_name, col_type))
#             foreign_key = ForeignKey(f'{col_info.new_name}_lookup.id')
#             columns.append(Column(f'{col_name}_lookup', Integer, foreign_key))
#         else:
#             columns.append(Column(col_name, col_type))
    
#     return Table(table_name, metadata, *columns)

# # Create the MapPLUTO_24v4_clipped table
# map_pluto_table = create_dynamic_table('MapPLUTO_24v4_clipped', col_customization_dict)

# # Create all tables in the database
# metadata.create_all(engine)

# metadata.reflect(bind=engine)


In [103]:
gdf.columns

Index(['borough', 'tax_block', 'tax_lot', 'community_district',
       'census_tract_2020', 'census_block_2020', 'census_tract_2010',
       'census_block_2010', 'school_district', 'city_council_district',
       'zip_code', 'fire_company', 'police_precinct', 'health_center_district',
       'health_area', 'sanitation_district_boro', 'sanitation_district_number',
       'sanitation_subsection', 'address', 'zoning_district_1',
       'zoning_district_2', 'zoning_district_3', 'zoning_district_4',
       'commercial_overlay_1', 'commercial_overlay_2',
       'special_purpose_district_1', 'special_purpose_district_2',
       'special_purpose_district_3', 'limited_height_district',
       'split_boundary_indicator', 'building_class', 'land_use_category',
       'number_of_easements', 'type_of_ownership_code', 'owner_name',
       'lot_area', 'total_building_floor_area', 'commercial_floor_area',
       'residential_floor_area', 'office_floor_area', 'retail_floor_area',
       'garage_floor_a

In [104]:
for col in MapPLUTO24v4Clipped.__table__.columns:
    print(col.name)

id
geometry
wkb
Shape_Leng
Shape_Area
borough
tax_block
tax_lot
community_district
census_tract_2020
census_block_2020
census_tract_2010
census_block_2010
school_district
city_council_district
zip_code
fire_company
police_precinct
health_center_district
health_area
sanitation_district_boro
sanitation_district_number
sanitation_subsection
address
zoning_district_1
zoning_district_2
zoning_district_3
zoning_district_4
commercial_overlay_1
commercial_overlay_2
special_purpose_district_1
special_purpose_district_2
special_purpose_district_3
limited_height_district
split_boundary_indicator
building_class
land_use_category
number_of_easements
type_of_ownership_code
owner_name
lot_area
total_building_floor_area
commercial_floor_area
residential_floor_area
office_floor_area
retail_floor_area
garage_floor_area
storage_floor_area
factory_floor_area
other_floor_area
total_building_floor_area_source_code
number_of_buildings
number_of_floors
residential_units
total_units
lot_frontage
lot_depth
buil

In [105]:
inspect(MapPLUTO24v4Clipped).columns.items()

[('id',
  Column('id', Integer(), table=<MapPLUTO_24v4_clipped>, primary_key=True, nullable=False)),
 ('geometry', Column('geometry', String(), table=<MapPLUTO_24v4_clipped>)),
 ('wkb', Column('wkb', LargeBinary(), table=<MapPLUTO_24v4_clipped>)),
 ('Shape_Leng', Column('Shape_Leng', Float(), table=<MapPLUTO_24v4_clipped>)),
 ('Shape_Area', Column('Shape_Area', Float(), table=<MapPLUTO_24v4_clipped>)),
 ('borough', Column('borough', String(), table=<MapPLUTO_24v4_clipped>)),
 ('tax_block', Column('tax_block', Float(), table=<MapPLUTO_24v4_clipped>)),
 ('tax_lot', Column('tax_lot', Float(), table=<MapPLUTO_24v4_clipped>)),
 ('community_district',
  Column('community_district', Integer(), table=<MapPLUTO_24v4_clipped>)),
 ('census_tract_2020',
  Column('census_tract_2020', String(), table=<MapPLUTO_24v4_clipped>)),
 ('census_block_2020',
  Column('census_block_2020', String(), table=<MapPLUTO_24v4_clipped>)),
 ('census_tract_2010',
  Column('census_tract_2010', String(), table=<MapPLUTO_

In [106]:
gdf.columns

Index(['borough', 'tax_block', 'tax_lot', 'community_district',
       'census_tract_2020', 'census_block_2020', 'census_tract_2010',
       'census_block_2010', 'school_district', 'city_council_district',
       'zip_code', 'fire_company', 'police_precinct', 'health_center_district',
       'health_area', 'sanitation_district_boro', 'sanitation_district_number',
       'sanitation_subsection', 'address', 'zoning_district_1',
       'zoning_district_2', 'zoning_district_3', 'zoning_district_4',
       'commercial_overlay_1', 'commercial_overlay_2',
       'special_purpose_district_1', 'special_purpose_district_2',
       'special_purpose_district_3', 'limited_height_district',
       'split_boundary_indicator', 'building_class', 'land_use_category',
       'number_of_easements', 'type_of_ownership_code', 'owner_name',
       'lot_area', 'total_building_floor_area', 'commercial_floor_area',
       'residential_floor_area', 'office_floor_area', 'retail_floor_area',
       'garage_floor_a

In [107]:
col_customization_dict.values()

dict_values([ColCustomization(short_name='Borough', new_name='borough', dtype='String', synonyms=[], definitions=[['BX', 'Bronx'], ['BK', 'Brooklyn'], ['MN', 'Manhattan'], ['QN', 'Queens'], ['SI', 'Staten Island']], drop=False, is_category=True, is_fk=False), ColCustomization(short_name='Block', new_name='tax_block', dtype='Float', synonyms=[], definitions={}, drop=False, is_category=False, is_fk=False), ColCustomization(short_name='Lot', new_name='tax_lot', dtype='Float', synonyms=[], definitions={}, drop=False, is_category=False, is_fk=False), ColCustomization(short_name='CD', new_name='community_district', dtype='Integer', synonyms=[], definitions={}, drop=False, is_category=True, is_fk=False), ColCustomization(short_name='BCT2020', new_name='census_tract_2020', dtype='String', synonyms=[], definitions={}, drop=False, is_category=False, is_fk=False), ColCustomization(short_name='BCTCB2020', new_name='census_block_2020', dtype='String', synonyms=[], definitions={}, drop=False, is_cat

In [108]:
datetime_cols = [col.new_name for col in col_customization_dict.values() if col.dtype == 'Date']
# datetime_cols = [col for col in datetime_cols if col is not None]
datetime_cols

['apportionment_date']

In [109]:
from sqlalchemy.orm import sessionmaker
from shapely import wkb




# Create a session
session = SessionLocal()

# gdf = geodata['MapPLUTO_24v4_clipped']

# Prepare the data for insertion
batch_size = 1000
for start in range(0, len(gdf), batch_size):
    batch = gdf.iloc[start:start + batch_size]
    for _, row in batch.iterrows():
        if row['apportionment_date']:
            row['apportionment_date'] = parseDateString(row['apportionment_date'])
        geometry_wkb = row['geometry'].wkb if row['geometry'] else None
        pluto_entry = MapPLUTO24v4Clipped(
            geometry=geometry_wkb,
            **{col: row[col] for col in gdf.columns if col not in ['geometry']}
        )
        session.add(pluto_entry)
    session.commit()

# Close the session
session.close()

In [110]:
del gdf

In [111]:
import time
from sqlalchemy.exc import OperationalError

def populate_lookup_table(engine, lookup_table, source_table_name, lookup_table_name, text_column_name, chunk_size=100, max_retries=5):
    """
    Populate a lookup table in chunks with retries for database lock issues.
    """
    def retry(func, *args, **kwargs):
        """Retry function with backoff for SQLite locks."""
        for attempt in range(max_retries):
            try:
                return func(*args, **kwargs)
            except OperationalError as e:
                if "database is locked" in str(e):
                    print(f"Database is locked. Retrying ({attempt + 1}/{max_retries})...")
                    time.sleep(0.2 * (attempt + 1))  # Gradual backoff
                else:
                    raise
        raise Exception("Exceeded maximum retries due to database locks.")
    
    with engine.connect() as connection:
        # Ensure the new column exists
        try:
            retry(connection.execute, text(f"ALTER TABLE {source_table_name} ADD COLUMN {text_column_name}_id INTEGER"))
        except Exception as e:
            print(f"Column creation skipped or failed: {e}")

        # Process unique values in chunks
        unique_query = f"SELECT DISTINCT {text_column_name} FROM {source_table_name}"
        unique_values_iter = pd.read_sql(unique_query, engine, chunksize=chunk_size)
        
        for chunk in unique_values_iter:
            unique_values = chunk[text_column_name].dropna().tolist()

            # Insert into the lookup table in small batches
            for value in unique_values:
                stmt = insert(lookup_table).values({text_column_name: value}).on_conflict_do_nothing()
                try:
                    retry(connection.execute, stmt)
                except Exception as e:
                    print(f"Error inserting value '{value}': {e}")

        # Update the source table with foreign key references
        update_stmt = text(f"""
        UPDATE {source_table_name}
        SET {text_column_name}_id = (
            SELECT id 
            FROM {lookup_table_name}
            WHERE {text_column_name} = {source_table_name}.{text_column_name}
        )
        """)
        try:
            retry(connection.execute, update_stmt)
        except Exception as e:
            print(f"Error updating foreign keys: {e}")
        connection.commit()
        # Remove the original text column (optional)
        connection.execute(text(f"ALTER TABLE {source_table_name} DROP COLUMN {text_column_name}"))
        connection.commit()


In [112]:
print(SQLITE_PATH)

sqlite:////home/james/Massive/PROJECTDATA/nyc_real_estate_data/nyc_real_estate_db.sqlite


* Make a test plot to verify that the geodata was stored correctly

In [None]:
import geopandas as gpd
import pandas as pd
from shapely import wkb
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import networkx as nx
from sqlalchemy import create_engine, event, text

# Read the data from the database
query = "SELECT zip_code, geometry FROM MapPLUTO_24v4_clipped"
df = pd.read_sql(query, engine)

# Debug: Print the DataFrame columns
print("DataFrame columns:", df.columns)

# Convert the geometry column from WKB to Shapely geometries
df['geometry'] = df['geometry'].apply(lambda x: wkb.loads(x) if x else None)

# Convert the DataFrame to a GeoDataFrame
gdf = gpd.GeoDataFrame(df, geometry='geometry')

# Print the GeoDataFrame
print(gdf.head())

# Ensure that zip_code is preserved during the dissolve process
merged_gdf = gdf.dissolve(by='zip_code', aggfunc={'zip_code': 'first'})  # Explicit aggregation of zip_code

# Check if zip_code is now present after dissolving
print(merged_gdf.columns)  # Should include 'zip_code'

# Create a new adjacency graph based on the merged geometries
G = nx.Graph()

# Add nodes and edges based on adjacency of merged shapes
for i, shape1 in merged_gdf.iterrows():
    for j, shape2 in merged_gdf.iterrows():
        if i != j and shape1.geometry.touches(shape2.geometry):
            G.add_edge(i, j)

# Perform graph coloring to ensure adjacent shapes don't share the same color
color_map = nx.coloring.greedy_color(G, strategy="largest_first")

# Plot the map with the colors assigned
fig, ax = plt.subplots(1, 1, figsize=(10, 10))

# Normalize the color map to cover the full range of the node indices
norm = mcolors.Normalize(vmin=min(color_map.values()), vmax=max(color_map.values()))
sm = plt.cm.ScalarMappable(cmap=plt.cm.tab20, norm=norm)

# Color the merged geometries based on the graph coloring using the full palette
merged_gdf['color'] = merged_gdf.index.map(color_map)
merged_gdf.plot(ax=ax, color=[sm.to_rgba(i) for i in merged_gdf['color']], edgecolor='black', linewidth=0, legend=False)

# Add labels at the center of each merged shape
for _, row in merged_gdf.iterrows():
    centroid = row.geometry.centroid
    ax.text(centroid.x, centroid.y, str(row['zip_code']), fontsize=2, ha='center', va='center')

# Add a colorbar to visualize the full range of colors
cbar = fig.colorbar(sm, ax=ax)
cbar.set_label('Color Range (Graph Coloring)', rotation=270, labelpad=20)

plt.savefig("/home/james/Massive/PROJECTDATA/map_output_zip_shuffled2.pdf", format="pdf")

plt.show()

OperationalError: (sqlite3.OperationalError) no such column: ZipCode
[SQL: SELECT ZipCode, geometry FROM MapPLUTO_24v4_clipped]
(Background on this error at: https://sqlalche.me/e/20/e3q8)