# Create and populate the database

In [None]:
# # Import standard libraries
# import os
import re
import json
import codecs
# import requests
import time
import dill
time.sleep(3)

from urllib.request import urlopen

# # Import third-party libraries
# import geopandas as gpd
# from geoalchemy2 import Geometry
import pandas as pd
from sqlalchemy import create_engine, Column, Integer, String, Date, MetaData, event, Table, text, LargeBinary, ForeignKey
from sqlalchemy.dialects.sqlite import insert
from sqlalchemy.orm import sessionmaker
# from sqlalchemy.event import listen
# from sqlalchemy.engine import Engine
# from sqlalchemy.ext.declarative import declarative_base

# import sqlite3
# from sqlite3 import dbapi2 as sqlite

# import fiona
# from fiona.crs import from_epsg

from src.helpers import *
from src.dbutils import *
from src.ORMutils import *
from src.models import *
from src.geo import *
from src.pdfutils import *

* #### Load the objects created in previous notebooks

In [2]:
# Load the environment
with open("environment_data/table_dicts.pkl", "rb") as f:
    env = dill.load(f)

# Restore the environment
globals().update(env)

In [3]:
# Load the environment
with open("environment_data/select.pkl", "rb") as f:
    env = dill.load(f)

# Restore the environment
globals().update(env)

In [4]:
for name, dataset in datasets.items():
    print(dataset.short_name)

mapPLUTO
assessments
tax_liens
housing_violations
assessment_actions
housing_database
NTAs2020
NTA_population_2020
NTA_demographics_2020
census_blocks2020
CDTAs2020
puma2020
cert_of_occupancy


* ### Create the database engine that will be used throughout the rest of the notebook.

In [5]:
engine = create_engine(f'{SQLITE_PATH}?check_same_thread=False', echo=False)

SessionLocal = sessionmaker(bind=engine, autoflush=False, autocommit=False)

* ### Configure the database

In [6]:
@event.listens_for(engine, "connect")
def load_spatialite(dbapi_conn, connection_record):
    print("Loading SpatiaLite extension")
    dbapi_conn.enable_load_extension(True)
    dbapi_conn.load_extension("mod_spatialite")
    dbapi_conn.enable_load_extension(False)


with engine.connect() as conn:
    print("Connection established")
    result = conn.execute(text("SELECT spatialite_version()"))
    spatialite_version = result.fetchone()
    print(f"SpatiaLite version: {spatialite_version[0]}")

# Enable WAL mode
with engine.connect() as conn:
    conn.execute(text("PRAGMA journal_mode=WAL"))

# Initialize spatial metadata if not already present
with engine.connect() as conn:
    conn.execute(text("SELECT InitSpatialMetaData(1)"))

Loading SpatiaLite extension
Connection established
SpatiaLite version: 5.1.0


In [7]:
Base = declarative_base()
metadata = MetaData()
Base.metadata.reflect(bind=engine) 

  Base = declarative_base()


* ### Create lookup tables variables identified as categorical and for which definitions were extracted from the metadata in the previous notebook.

* There are borough codes in the PLUTO dataset, but annyoingly, in contrast to most other datasets, the borough code is a two letter inital like "BK" or "BX". Also in the PLUTO dataset, "Sanitation Borough" does use the standard numeric codes that most other NYC OpenData datasets use. All this is is to say that it requires special handling separate from my system to extract categories and create lookup tables for them programatically.

In [8]:
lookups = {k:v for k,v in col_customization_dict.items() if col_customization_dict[k].is_category == True} # Codes for overlays are to go in the same table as other zoning codes

completed_tables = [] # This is for tracking the names of tables that have been created, which will be used to avoid creating redundant tables for columns that are same-kind repeats (such as "district_1" and "district_2"), and thus will use the same lookups.

lookup_tables = {}

for name,table in lookups.items():
    print(f"processing {table}")
    lookup_table_name= re.sub('_[0-9]+$', '', table.new_name)
    print(f"lookup_table_name: {lookup_table_name}")
    if any([table.new_name.startswith(prefix) and table.new_name[-1].isdigit() for prefix in completed_tables]):
    # if table.new_name[0:len(table)*75] in completed_tables:
        print(f"Lookup table {lookup_table_name} already created, continuing...")
        continue
    with engine.connect() as connection:
        print(f"Creating lookup table {lookup_table_name}...")
        # lookup_table = create_lookup_table(engine=engine, lookup_table_name=lookup_table_name, text_column_name='name_or_code')
        lookup_table = create_lookup_table(Base.metadata, lookup_table_name=lookup_table_name, text_column_name='name_or_code')
        print(f"Created lookup table: {lookup_table}")
        # name_prefix = table.new_name[0:round(len(table.new_name)*.75)] # Hopefully this is a safe threshold to identify when columns are repeats of the same type
        name_prefix = lookup_table_name
        completed_tables.append(name_prefix)
        # lookup_tables[lookup_table_name] = lookup_table
        lookups[name].orm = lookup_table

# metadata.create_all(engine)
Base.metadata.create_all(engine)



processing ColCustomization(short_name='Borough', new_name='borough', dtype='String', synonyms=[], definitions=[['BX', 'Bronx'], ['BK', 'Brooklyn'], ['MN', 'Manhattan'], ['QN', 'Queens'], ['SI', 'Staten Island']], drop=False, is_category=True, is_fk=False, orm=None)
lookup_table_name: borough
Creating lookup table borough...
Created lookup table: borough_lookup
processing ColCustomization(short_name='CD', new_name='community_district', dtype='Integer', synonyms=[], definitions=[], drop=False, is_category=True, is_fk=False, orm=None)
lookup_table_name: community_district
Creating lookup table community_district...
Created lookup table: community_district_lookup
processing ColCustomization(short_name='SchoolDist', new_name='school_district', dtype='String', synonyms=[], definitions=[], drop=False, is_category=True, is_fk=False, orm=None)
lookup_table_name: school_district
Creating lookup table school_district...
Created lookup table: school_district_lookup
processing ColCustomization(sho

In [9]:
lookups

{'borough': ColCustomization(short_name='Borough', new_name='borough', dtype='String', synonyms=[], definitions=[['BX', 'Bronx'], ['BK', 'Brooklyn'], ['MN', 'Manhattan'], ['QN', 'Queens'], ['SI', 'Staten Island']], drop=False, is_category=True, is_fk=False, orm=Table('borough_lookup', MetaData(), Column('id', Integer(), table=<borough_lookup>, primary_key=True, nullable=False), Column('name_or_code', String(), table=<borough_lookup>, nullable=False, default=ScalarElementColumnDefault('NO DATA')), Column('info', String(), table=<borough_lookup>), schema=None)),
 'community_district': ColCustomization(short_name='CD', new_name='community_district', dtype='Integer', synonyms=[], definitions=[], drop=False, is_category=True, is_fk=False, orm=Table('community_district_lookup', MetaData(), Column('id', Integer(), table=<community_district_lookup>, primary_key=True, nullable=False), Column('name_or_code', String(), table=<community_district_lookup>, nullable=False, default=ScalarElementColumn

In [10]:
multicolumns = {'zoning_district': 4, 'commercial_overlay': 2, 'special_purpose_district': 3}
for name,repetitions in multicolumns.items():
    print(f"processing {name}")
    cols = {k:v for k,v in col_customization_dict.items() if col_customization_dict[k].new_name.startswith(name)}
    main_col = [v for k,v in col_customization_dict.items() if col_customization_dict[k].new_name.endswith("_1")][0]
    print(cols.keys())
    print(f'main col: {main_col}')
    print(f'main col ORM: {main_col.orm}')
    for key in cols.keys():
        lookups[key].orm = main_col.orm

processing zoning_district
dict_keys(['zoning_district_1', 'zoning_district_2', 'zoning_district_3', 'zoning_district_4'])
main col: ColCustomization(short_name='ZoneDist1', new_name='zoning_district_1', dtype='String', synonyms=[], definitions=[['R1–1', '{"R1\\u2013R3 Lower-Density Residence Districts, ":{"0":"Single-family detached residences","1":"Two-family detached residences","2":"Semi-detached residences","3":"All residences","4":"Residential FAR (max)","5":"with attic allowance","6":"Community facility FAR (max)","7":"Lot width (min), Detached","8":"Lot width (min), Other","9":"Lot area (min), Detached","10":"Lot area (min), Other","11":"Open space ratio (min)","12":"Lot coverage (max)","13":"Front yard depth (min)","14":"Side yards   (number), Detached","15":"Side yards   (number), Semi-detached","16":"Total width of side yards (min), Detached","17":"Total width of side yards (min), Semi-detached","18":"Each side yard (min), Detached","19":"Each side yard (min), Semi-detached"

In [11]:
lookups

{'borough': ColCustomization(short_name='Borough', new_name='borough', dtype='String', synonyms=[], definitions=[['BX', 'Bronx'], ['BK', 'Brooklyn'], ['MN', 'Manhattan'], ['QN', 'Queens'], ['SI', 'Staten Island']], drop=False, is_category=True, is_fk=False, orm=Table('borough_lookup', MetaData(), Column('id', Integer(), table=<borough_lookup>, primary_key=True, nullable=False), Column('name_or_code', String(), table=<borough_lookup>, nullable=False, default=ScalarElementColumnDefault('NO DATA')), Column('info', String(), table=<borough_lookup>), schema=None)),
 'community_district': ColCustomization(short_name='CD', new_name='community_district', dtype='Integer', synonyms=[], definitions=[], drop=False, is_category=True, is_fk=False, orm=Table('community_district_lookup', MetaData(), Column('id', Integer(), table=<community_district_lookup>, primary_key=True, nullable=False), Column('name_or_code', String(), table=<community_district_lookup>, nullable=False, default=ScalarElementColumn

In [12]:
lookup_tables

{}

In [13]:
for name,table in lookups.items():
    lookup_table = table.orm
    if lookup_table is None:
        print(f"Skipping {name}...")
        continue
    print(lookup_table)
    with engine.connect() as connection:
        for definition in table.definitions:
            if len(definition) == 2:
                try:
                    stmt = insert(lookup_table).values(id=int(definition[0]), name_or_code=definition[1]).on_conflict_do_nothing()
                except ValueError:
                    stmt = insert(lookup_table).values(name_or_code=definition[0], info=definition[1]).on_conflict_do_nothing()
            elif len(definition) == 3:
                try:
                    stmt = insert(lookup_table).values(id=int(definition[0]), name_or_code=definition[1], info=definition[2]).on_conflict_do_nothing()
                except Exception as e:
                    print(e)
                    print(definition)
                    # stmt = insert(lookup_table).values(id=definition[0], name_or_code=definition[1], info=definition[2]).on_conflict_do_nothing()
            else:
                print(definition)
                raise ValueError("Was only expecting two or three columns")
            connection.execute(stmt)
        connection.commit()
    name_prefix = table.new_name[0:round(len(table.new_name)*.75)] # Hopefully this is a safe threshold to identify when columns are repeats of the same type
    # completed_tables.append(name_prefix)

borough_lookup
community_district_lookup
school_district_lookup
city_council_district_lookup
zip_code_lookup
fire_company_lookup
police_precinct_lookup
health_center_district_lookup
health_area_lookup
sanitation_district_boro_lookup
sanitation_district_number_lookup
zoning_district_lookup
zoning_district_lookup
zoning_district_lookup
zoning_district_lookup
zoning_district_lookup
zoning_district_lookup
zoning_district_lookup
zoning_district_lookup
zoning_district_lookup
limited_height_district_lookup
building_class_lookup
land_use_category_lookup
type_of_ownership_code_lookup
owner_name_lookup
total_building_floor_area_source_code_lookup
extension_code_lookup
proximity_code_lookup
irregular_lot_code_lookup
lot_type_lookup
basement_type_or_grade_lookup
historic_district_name_lookup
borough_code_lookup
zoning_map_code_lookup
pluto_dtm_base_map_indicator_lookup
notes_lookup


In [14]:
col_customization_dict

{'borough': ColCustomization(short_name='Borough', new_name='borough', dtype='String', synonyms=[], definitions=[['BX', 'Bronx'], ['BK', 'Brooklyn'], ['MN', 'Manhattan'], ['QN', 'Queens'], ['SI', 'Staten Island']], drop=False, is_category=True, is_fk=False, orm=Table('borough_lookup', MetaData(), Column('id', Integer(), table=<borough_lookup>, primary_key=True, nullable=False), Column('name_or_code', String(), table=<borough_lookup>, nullable=False, default=ScalarElementColumnDefault('NO DATA')), Column('info', String(), table=<borough_lookup>), schema=None)),
 'tax_block': ColCustomization(short_name='Block', new_name='tax_block', dtype='Float', synonyms=[], definitions=[], drop=False, is_category=False, is_fk=False, orm=None),
 'tax_lot': ColCustomization(short_name='Lot', new_name='tax_lot', dtype='Float', synonyms=[], definitions=[], drop=False, is_category=False, is_fk=False, orm=None),
 'community_district': ColCustomization(short_name='CD', new_name='community_district', dtype='

In [15]:
# with engine.connect() as conn:
#     for row in class_codes:
#         print(row)
#         stmt = insert(lookup_tables['building_class']).values(name_or_code=row['code'], info=row['name']).prefix_with("OR IGNORE")
#         conn.execute(stmt)
#         conn.commit()

## Import the MaPLUTO data:
* List the layers in the file
* In this case there is only one layer, so it isn't necessary to know and specify which one to import, but including anyway for future reference.

In [16]:
# Import the MapPLUTO data from geo database file (.gdb)
gdb_path = f"{PROJECT_DATA}/files_to_use/MapPLUTO24v4.gdb"


* Import the geodatabase (.gdb) file.

In [17]:

geodata = {}
# List layers in the GDB file
layers = fiona.listlayers(gdb_path)
print("Layers in the GDB file:")
for layer in layers:
    print(layer)
    gdf = gpd.read_file(gdb_path, layer=layer)
    # gdf['borough'] = gdf['Borough'].replace(replacement_dict)
    try:
        gdf['wkb'] = gdf['geometry'].apply(lambda geom: geom.wkb if geom else None)
    except KeyError:
        pass
    geodata[layer] = gdf


Layers in the GDB file:
MapPLUTO_24v4_clipped
NOT_MAPPED_LOTS


In [18]:
geodata.keys()


dict_keys(['MapPLUTO_24v4_clipped', 'NOT_MAPPED_LOTS'])

In [19]:
col_customization_dict

{'borough': ColCustomization(short_name='Borough', new_name='borough', dtype='String', synonyms=[], definitions=[['BX', 'Bronx'], ['BK', 'Brooklyn'], ['MN', 'Manhattan'], ['QN', 'Queens'], ['SI', 'Staten Island']], drop=False, is_category=True, is_fk=False, orm=Table('borough_lookup', MetaData(), Column('id', Integer(), table=<borough_lookup>, primary_key=True, nullable=False), Column('name_or_code', String(), table=<borough_lookup>, nullable=False, default=ScalarElementColumnDefault('NO DATA')), Column('info', String(), table=<borough_lookup>), schema=None)),
 'tax_block': ColCustomization(short_name='Block', new_name='tax_block', dtype='Float', synonyms=[], definitions=[], drop=False, is_category=False, is_fk=False, orm=None),
 'tax_lot': ColCustomization(short_name='Lot', new_name='tax_lot', dtype='Float', synonyms=[], definitions=[], drop=False, is_category=False, is_fk=False, orm=None),
 'community_district': ColCustomization(short_name='CD', new_name='community_district', dtype='

* Create the table in the Sqlite database and insert the (modified) data from the gdb file.

In [20]:
gdf = geodata['MapPLUTO_24v4_clipped']
# is_whole_number = [(gdf[col] % 1 == 0).all() for col in gdf.columns if gdf[col].dtype == 'float']
is_whole_number = {(gdf[col].notna() & (gdf[col] % 1 == 0)).all() for col in gdf.columns if gdf[col].dtype == 'float'}
# is_whole_number = {col : (gdf[col] % 1 == 0).all() for col in gdf.columns if gdf[col].dtype == 'float'}
gdf.columns

Index(['Borough', 'Block', 'Lot', 'CD', 'BCT2020', 'BCTCB2020', 'CT2010',
       'CB2010', 'SchoolDist', 'Council', 'ZipCode', 'FireComp', 'PolicePrct',
       'HealthCenterDistrict', 'HealthArea', 'Sanitboro', 'SanitDistrict',
       'SanitSub', 'Address', 'ZoneDist1', 'ZoneDist2', 'ZoneDist3',
       'ZoneDist4', 'Overlay1', 'Overlay2', 'SPDist1', 'SPDist2', 'SPDist3',
       'LtdHeight', 'SplitZone', 'BldgClass', 'LandUse', 'Easements',
       'OwnerType', 'OwnerName', 'LotArea', 'BldgArea', 'ComArea', 'ResArea',
       'OfficeArea', 'RetailArea', 'GarageArea', 'StrgeArea', 'FactryArea',
       'OtherArea', 'AreaSource', 'NumBldgs', 'NumFloors', 'UnitsRes',
       'UnitsTotal', 'LotFront', 'LotDepth', 'BldgFront', 'BldgDepth', 'Ext',
       'ProxCode', 'IrrLotCode', 'LotType', 'BsmtCode', 'AssessLand',
       'AssessTot', 'ExemptTot', 'YearBuilt', 'YearAlter1', 'YearAlter2',
       'HistDist', 'Landmark', 'BuiltFAR', 'ResidFAR', 'CommFAR', 'FacilFAR',
       'BoroCode', 'BBL', 'Cond

In [21]:
gdf['ZipCode'].dtype

dtype('float64')

In [22]:
# Function to check if all non-NaN values are whole numbers
# Function to check if all non-NaN values are whole numbers
def is_whole_number_series(s):
    return (s.dropna() % 1 == 0).all() 

# Iterate over columns and change dtype to int where applicable
for col in gdf.columns:
    if  gdf[col].dtype == float and is_whole_number_series(gdf[col]):
        print(f'Column {col} is {is_whole_number_series(gdf[col])}')
        print(f'Converting {col} to integer')
        gdf[col] = gdf[col].astype('Int64')  # 'Int64' for nullable integer type in Pandas
    else:
        print(f"Skipping {col}")


Skipping Borough
Skipping Block
Skipping Lot
Column CD is True
Converting CD to integer
Skipping BCT2020
Skipping BCTCB2020
Skipping CT2010
Skipping CB2010
Skipping SchoolDist
Column Council is True
Converting Council to integer
Column ZipCode is True
Converting ZipCode to integer
Skipping FireComp
Column PolicePrct is True
Converting PolicePrct to integer
Column HealthCenterDistrict is True
Converting HealthCenterDistrict to integer
Column HealthArea is True
Converting HealthArea to integer
Skipping Sanitboro
Skipping SanitDistrict
Skipping SanitSub
Skipping Address
Skipping ZoneDist1
Skipping ZoneDist2
Skipping ZoneDist3
Skipping ZoneDist4
Skipping Overlay1
Skipping Overlay2
Skipping SPDist1
Skipping SPDist2
Skipping SPDist3
Skipping LtdHeight
Skipping SplitZone
Skipping BldgClass
Skipping LandUse
Column Easements is True
Converting Easements to integer
Skipping OwnerType
Skipping OwnerName
Column LotArea is True
Converting LotArea to integer
Column BldgArea is True
Converting BldgAr

In [23]:
from sqlalchemy import inspect
inspector = inspect(engine)
print(inspector.get_table_names())  # Ensure "basement_type_or_grade_lookup" is listed


['ElementaryGeometries', 'KNN2', 'SpatialIndex', 'basement_type_or_grade_lookup', 'borough_code_lookup', 'borough_lookup', 'building_class_lookup', 'city_council_district_lookup', 'commercial_overlay_lookup', 'community_district_lookup', 'data_licenses', 'extension_code_lookup', 'fire_company_lookup', 'geometry_columns', 'geometry_columns_auth', 'geometry_columns_field_infos', 'geometry_columns_statistics', 'geometry_columns_time', 'health_area_lookup', 'health_center_district_lookup', 'historic_district_name_lookup', 'irregular_lot_code_lookup', 'land_use_category_lookup', 'limited_height_district_lookup', 'lot_type_lookup', 'notes_lookup', 'owner_name_lookup', 'pluto_dtm_base_map_indicator_lookup', 'police_precinct_lookup', 'proximity_code_lookup', 'sanitation_district_boro_lookup', 'sanitation_district_number_lookup', 'school_district_lookup', 'spatial_ref_sys', 'spatial_ref_sys_aux', 'spatialite_history', 'special_purpose_district_lookup', 'sql_statements_log', 'total_building_floo

In [24]:
rename_mappings = {v.short_name: v.new_name for v in col_customization_dict.values()}

In [25]:
gdf = gdf.rename(columns=rename_mappings)

In [26]:
print(gdf.columns)

Index(['borough', 'tax_block', 'tax_lot', 'community_district',
       'census_tract_2020', 'census_block_2020', 'census_tract_2010',
       'census_block_2010', 'school_district', 'city_council_district',
       'zip_code', 'fire_company', 'police_precinct', 'HealthCenterDistrict',
       'health_area', 'Sanitboro', 'SanitDistrict', 'sanitation_subsection',
       'address', 'zoning_district_1', 'zoning_district_2',
       'zoning_district_3', 'zoning_district_4', 'commercial_overlay_1',
       'commercial_overlay_2', 'special_purpose_district_1',
       'special_purpose_district_2', 'special_purpose_district_3',
       'limited_height_district', 'split_boundary_indicator', 'building_class',
       'land_use_category', 'number_of_easements', 'type_of_ownership_code',
       'owner_name', 'lot_area', 'total_building_floor_area',
       'commercial_floor_area', 'residential_floor_area', 'office_floor_area',
       'retail_floor_area', 'garage_floor_area', 'storage_floor_area',
       'f

In [27]:
# A few of the column names did not exactly match up due to slightly different field names than specified in the data dictionary, so these need to be renamed manually:

more_mappings = {
    "HealthCenterDistrict": "health_center_district",
    "SanitDistrict": "sanitation_district_number",
    "Sanitboro": "sanitation_district_boro",
    "FIRM07_FLAG": "2007_flood_insurance_rate_map_indicator",
    "PFIRM15_FLAG": "2015_preliminary_flood_insurance_rate_map",
}
gdf = gdf.rename(columns=more_mappings)

In [28]:
print(gdf.columns)

Index(['borough', 'tax_block', 'tax_lot', 'community_district',
       'census_tract_2020', 'census_block_2020', 'census_tract_2010',
       'census_block_2010', 'school_district', 'city_council_district',
       'zip_code', 'fire_company', 'police_precinct', 'health_center_district',
       'health_area', 'sanitation_district_boro', 'sanitation_district_number',
       'sanitation_subsection', 'address', 'zoning_district_1',
       'zoning_district_2', 'zoning_district_3', 'zoning_district_4',
       'commercial_overlay_1', 'commercial_overlay_2',
       'special_purpose_district_1', 'special_purpose_district_2',
       'special_purpose_district_3', 'limited_height_district',
       'split_boundary_indicator', 'building_class', 'land_use_category',
       'number_of_easements', 'type_of_ownership_code', 'owner_name',
       'lot_area', 'total_building_floor_area', 'commercial_floor_area',
       'residential_floor_area', 'office_floor_area', 'retail_floor_area',
       'garage_floor_a

In [29]:
# [col for col in gdf.columns if col not in [i.new_name for i in col_customization_dict.values()]]

In [30]:
# multicolumns = {'zoning_district': 4, 'commercial_overlay': 2, 'special_purpose_district': 3}

In [31]:
from sqlalchemy import Table, MetaData, Column, Integer, String, ForeignKey, LargeBinary, Float, Date
from sqlalchemy.orm import declarative_base

# Base = declarative_base()
# metadata = MetaData()

# Reflect the existing database tables once
metadata.reflect(bind=engine)

# Function to map custom dtype to SQLAlchemy types
def map_custom_dtype(dtype):
    if dtype == 'Integer':
        return Integer
    elif dtype == 'String':
        return String
    elif dtype == 'Float':
        return Float
    elif dtype == 'Date':
        return Date
    elif dtype == 'LargeBinary':
        return LargeBinary
    else:
        raise ValueError(f"Unsupported dtype: {dtype}")

# Function to dynamically create the table class
def create_dynamic_table_class(table_name, col_customization_dict):
    attrs = {
        '__tablename__': table_name,
        'id': Column(Integer, primary_key=True, autoincrement=True),
        'geometry': Column(String),  
        'wkb': Column(LargeBinary),  # Use LargeBinary for WKB
        'Shape_Leng' : Column(Float), # Add columns not listed in the data dictionary
        'Shape_Area' : Column(Float),
    }
    
    for k, v in col_customization_dict.items():
        if any([name for name in multicolumns.keys() if name in k]):
            k = re.sub('_[0-9]$', '', k)
        col_type = map_custom_dtype(v.dtype)
        if v.is_fk:
            attrs[k] = Column(Integer, ForeignKey(f'{v.new_name}_lookup.id'))
        elif v.is_category:
            print(f'Creating id column for {v.new_name}')
            attrs[v.new_name] = Column(col_type)
            attrs[f"{v.new_name}_id"] = Column(Integer, ForeignKey(f'{k}_lookup.id'))
        else:
            attrs[v.new_name] = Column(col_type)
    
    return type(table_name, (Base,), attrs)

# Create the MapPLUTO_24v4_clipped table class
MapPLUTO24v4Clipped = create_dynamic_table_class('MapPLUTO_24v4_clipped', col_customization_dict)

# Reflect the metadata again to ensure it includes the new table class
metadata.reflect(bind=engine)

# Create all tables in the database
Base.metadata.create_all(engine)


Creating id column for borough
Creating id column for community_district
Creating id column for school_district
Creating id column for city_council_district
Creating id column for zip_code
Creating id column for fire_company
Creating id column for police_precinct
Creating id column for health_center_district
Creating id column for health_area
Creating id column for sanitation_district_number
Creating id column for zoning_district_1
Creating id column for zoning_district_2
Creating id column for zoning_district_3
Creating id column for zoning_district_4
Creating id column for commercial_overlay_1
Creating id column for commercial_overlay_2
Creating id column for special_purpose_district_1
Creating id column for special_purpose_district_2
Creating id column for special_purpose_district_3
Creating id column for limited_height_district
Creating id column for building_class
Creating id column for land_use_category
Creating id column for type_of_ownership_code
Creating id column for owner_na

In [32]:
datetime_cols = [col.new_name for col in col_customization_dict.values() if col.dtype == 'Date']
# datetime_cols = [col for col in datetime_cols if col is not None]
datetime_cols

['apportionment_date']

In [33]:
# batch_size = 1000
# for start in range(0, len(gdf), batch_size):
#     batch = gdf.iloc[start:start + batch_size]
#     for _, row in batch.iterrows():
#         print(row['zoning_district'][0])

In [34]:
def split_address(row_data):
    if 'staddr' in row_data.keys():
        if row_data['staddr'] is None:
            row_data['building_num'], row_data['street'] = None, None
            row_data.pop('staddr')
            return row_data
        else:
            if row_data['staddr'][0].isdigit():
                try:
                    addr = row_data['staddr'].split(' ', 1)
                    if len(addr) == 1:
                        addr = [None] + addr
                    row_data['building_num'], row_data['street'] = addr
                except Exception as e:
                    print(e)
                    print(row_data['staddr'])
            else:
                row_data['building_num'], row_data['street_name'] = None, row_data['staddr']
        row_data.pop('staddr')
    return row_data


In [36]:
import pandas as pd

In [None]:
from sqlalchemy.orm import sessionmaker
from shapely import wkb

# Create a session
session = SessionLocal()

# gdf = geodata['MapPLUTO_24v4_clipped']
def format_float(value):
    return str(value).rstrip('0').rstrip('.') if '.' in str(value) else str(value)


batch_size = 500
for start in range(0, len(gdf), batch_size):
    batch = gdf.iloc[start:start + batch_size]
    for idx, row in batch.iterrows():
        try:
            if row['apportionment_date']:
                row['apportionment_date'] = parseDateString(row['apportionment_date'])
            if row['address']:
                row = split_address(row)
            for col in gdf.columns:
                val = row[col]
                if isinstance(val, pd.Series):
                    try:
                        first_value = row[col].iloc[0]
                        row[col] = first_value
                    except Exception as e:
                        print(f"Error processing Series in column {col} at row {idx}: {e}")
                # Replace NA values with None so that SQLAlchemy inserts them as NULL:
                if pd.isna(val):
                    # print(f"Found NA value in column {col} at row {idx}, inserting as NULL")
                    row[col] = None
            # Prepare the geometry and entry object
            geometry_wkb = row['geometry'].wkb if row['geometry'] else None
            pluto_entry = MapPLUTO24v4Clipped(
                geometry=geometry_wkb,
                **{col: row[col] for col in gdf.columns if col not in ['geometry']}
            )
            session.add(pluto_entry)
        except Exception as e:
            print(f"Error at row index {idx}")
            for col in gdf.columns:
                try:
                    print(f"Column: {col}, Value: {row[col]}, Type: {type(row[col])}")
                except Exception as sub_e:
                    print(f"Error printing column {col}: {sub_e}")
            raise e  # re-raise after logging for further debugging
    session.commit()


# # Prepare the data for insertion
# batch_size = 1000
# for start in range(0, len(gdf), batch_size):
#     batch = gdf.iloc[start:start + batch_size]
#     for _, row in batch.iterrows():
#         print(row)
#         if row['apportionment_date']:
#             row['apportionment_date'] = parseDateString(row['apportionment_date'])
#         # if row['address']:
#         #     row = split_address(row)
#         for col in gdf.columns:
#             val = row[col]
#             # if type(val) == str and val.isdigit() and col_customization_dict[col].dtype == 'String':
#             #     row[col] = int(val)
#             if isinstance(val, pd.Series):
#                 print(f"length: {len(val)}")
#                 print(f"Column {col} is a Series: first value is {val.iloc[0]} of length {len(val)}")
#                 try:
#                     first_value = row[col].iloc[0]
#                     new = first_value
#                     row[col] = new
#                 except Exception as e:
#                     print(e)
#                     print('Printing:')
#                     for i in row[col]:
#                         print(i, type(i))
#                 print("Before type is", type(row[col]))
#                 print("Type is", type(row[col]))
#         # rest of your code...
#         geometry_wkb = row['geometry'].wkb if row['geometry'] else None
#         pluto_entry = MapPLUTO24v4Clipped(
#             geometry=geometry_wkb,
#             **{col: row[col] for col in gdf.columns if col not in ['geometry']}
#         )
#         session.add(pluto_entry)
#     # for _, row in batch.iterrows():
#     #     if row['apportionment_date']:
#     #         row['apportionment_date'] = parseDateString(row['apportionment_date'])
#     #     geometry_wkb = row['geometry'].wkb if row['geometry'] else None
#     #     pluto_entry = MapPLUTO24v4Clipped(
#     #         geometry=geometry_wkb,
#     #         **{col: row[col] for col in gdf.columns if col not in ['geometry']}
#     #     )
#     #     session.add(pluto_entry)
#     session.commit()

# # Close the session
# session.close()

In [None]:
gdf.iloc[0]

In [None]:
del gdf

In [None]:
print(SQLITE_PATH)

* Make a test plot to verify that the geodata was stored correctly

In [None]:
import geopandas as gpd
import pandas as pd
from shapely import wkb
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import networkx as nx
from sqlalchemy import create_engine, event, text

# Read the data from the database
query = "SELECT zip_code, geometry FROM MapPLUTO_24v4_clipped"
df = pd.read_sql(query, engine)

# Debug: Print the DataFrame columns
print("DataFrame columns:", df.columns)

# Convert the geometry column from WKB to Shapely geometries
df['geometry'] = df['geometry'].apply(lambda x: wkb.loads(x) if x else None)

# Convert the DataFrame to a GeoDataFrame
gdf = gpd.GeoDataFrame(df, geometry='geometry')

# Print the GeoDataFrame
print(gdf.head())

# Ensure that zip_code is preserved during the dissolve process
merged_gdf = gdf.dissolve(by='zip_code', aggfunc={'zip_code': 'first'})  # Explicit aggregation of zip_code

# Check if zip_code is now present after dissolving
print(merged_gdf.columns)  # Should include 'zip_code'

# Create a new adjacency graph based on the merged geometries
G = nx.Graph()

# Add nodes and edges based on adjacency of merged shapes
for i, shape1 in merged_gdf.iterrows():
    for j, shape2 in merged_gdf.iterrows():
        if i != j and shape1.geometry.touches(shape2.geometry):
            G.add_edge(i, j)

# Perform graph coloring to ensure adjacent shapes don't share the same color
color_map = nx.coloring.greedy_color(G, strategy="largest_first")

# Plot the map with the colors assigned
fig, ax = plt.subplots(1, 1, figsize=(10, 10))

# Normalize the color map to cover the full range of the node indices
norm = mcolors.Normalize(vmin=min(color_map.values()), vmax=max(color_map.values()))
sm = plt.cm.ScalarMappable(cmap=plt.cm.tab20, norm=norm)

# Color the merged geometries based on the graph coloring using the full palette
merged_gdf['color'] = merged_gdf.index.map(color_map)
merged_gdf.plot(ax=ax, color=[sm.to_rgba(i) for i in merged_gdf['color']], edgecolor='black', linewidth=0, legend=False)

# Add labels at the center of each merged shape
for _, row in merged_gdf.iterrows():
    centroid = row.geometry.centroid
    ax.text(centroid.x, centroid.y, str(row['zip_code']), fontsize=2, ha='center', va='center')

# Add a colorbar to visualize the full range of colors
cbar = fig.colorbar(sm, ax=ax)
cbar.set_label('Color Range (Graph Coloring)', rotation=270, labelpad=20)

plt.savefig("/home/james/Massive/PROJECTDATA/map_output_zip_shuffled2.pdf", format="pdf")

plt.show()

In [None]:
import time
from sqlalchemy.exc import OperationalError

def populate_lookup_table(engine, lookup_table, source_table_name, lookup_table_name, text_column_name, chunk_size=1000, max_retries=5, drop_old_columns=False):
    """
    Populate a lookup table in chunks with retries for database lock issues.
    """
    def retry(func, *args, **kwargs):
        """Retry function with backoff for SQLite locks."""
        for attempt in range(max_retries):
            try:
                return func(*args, **kwargs)
            except OperationalError as e:
                if "database is locked" in str(e):
                    print(f"Database is locked. Retrying ({attempt + 1}/{max_retries})...")
                    time.sleep(0.2 * (attempt + 1))  # Gradual backoff
                else:
                    raise
        raise Exception("Exceeded maximum retries due to database locks.")
    
    with engine.connect() as connection:
        # Ensure the new column exists
        try:
            retry(connection.execute, text(f"ALTER TABLE {source_table_name} ADD COLUMN {text_column_name}_id INTEGER"))
        except Exception as e:
            print(f"Column creation skipped or failed: {e}")

        # Process unique values in chunks
        unique_query = f"SELECT DISTINCT {text_column_name} FROM {source_table_name}"
        unique_values_iter = pd.read_sql(unique_query, engine, chunksize=chunk_size)
        
        all_unique_values = []
        for chunk in unique_values_iter:
            all_unique_values.extend(chunk[text_column_name].dropna().tolist())

        # Insert into the lookup table in batches
        unique_values = list(set(all_unique_values))
        for i in range(0, len(unique_values), chunk_size):
            batch_values = unique_values[i:i + chunk_size]
            stmt = insert(lookup_table).values([{'name_or_code': value} for value in batch_values]).on_conflict_do_nothing()
            try:
                retry(connection.execute, stmt)
            except Exception as e:
                print(f"Error inserting batch: {e}")

        # Update the source table with foreign key references
        update_stmt = text(f"""
        UPDATE {source_table_name}
        SET {text_column_name}_id = (
            SELECT id 
            FROM {lookup_table_name}
            WHERE name_or_code = {source_table_name}.{text_column_name}
        )
        """)
        try:
            retry(connection.execute, update_stmt)
        except Exception as e:
            print(f"Error updating foreign keys: {e}")
        connection.commit()
        # Remove the original text column (optional)
        if drop_old_columns:
            print(f"Dropping old column {text_column_name} from {source_table_name}...")
            connection.execute(text(f"ALTER TABLE {source_table_name} DROP COLUMN {text_column_name}"))
        connection.commit()

In [None]:
# import time
# from sqlalchemy.exc import OperationalError

# def populate_lookup_table(engine, lookup_table, source_table_name, lookup_table_name, text_column_name, chunk_size=100, max_retries=5):
#     """
#     Populate a lookup table in chunks with retries for database lock issues.
#     """
#     def retry(func, *args, **kwargs):
#         """Retry function with backoff for SQLite locks."""
#         for attempt in range(max_retries):
#             try:
#                 return func(*args, **kwargs)
#             except OperationalError as e:
#                 if "database is locked" in str(e):
#                     print(f"Database is locked. Retrying ({attempt + 1}/{max_retries})...")
#                     time.sleep(0.2 * (attempt + 1))  # Gradual backoff
#                 else:
#                     raise
#         raise Exception("Exceeded maximum retries due to database locks.")
    
#     with engine.connect() as connection:
#         # Ensure the new column exists
#         try:
#             retry(connection.execute, text(f"ALTER TABLE {source_table_name} ADD COLUMN {text_column_name}_id INTEGER"))
#         except Exception as e:
#             print(f"Column creation skipped or failed: {e}")

#         # Process unique values in chunks
#         unique_query = f"SELECT DISTINCT {text_column_name} FROM {source_table_name}"
#         unique_values_iter = pd.read_sql(unique_query, engine, chunksize=chunk_size)
        
#         for chunk in unique_values_iter:
#             unique_values = chunk[text_column_name].dropna().tolist()

#             # Insert into the lookup table in small batches
#             for value in unique_values:
#                 stmt = insert(lookup_table).values({'name_or_code': value}).on_conflict_do_nothing()
#                 try:
#                     retry(connection.execute, stmt)
#                 except Exception as e:
#                     print(f"Error inserting value '{value}': {e}")

#         # Update the source table with foreign key references
#         update_stmt = text(f"""
#         UPDATE {source_table_name}
#         SET {text_column_name}_id = (
#             SELECT id 
#             FROM {lookup_table_name}
#             WHERE name_or_code = {source_table_name}.{text_column_name}
#         )
#         """)
#         try:
#             retry(connection.execute, update_stmt)
#         except Exception as e:
#             print(f"Error updating foreign keys: {e}")
#         connection.commit()
#         # Remove the original text column (optional)
#         # connection.execute(text(f"ALTER TABLE {source_table_name} DROP COLUMN {text_column_name}"))
#         connection.commit()


In [None]:
# to_recode = [col for col in col_customization_dict.values() if not col.is_fk and col.is_category]
to_recode = {k:v for k,v in lookups.items() if not v.is_fk and v.is_category}
to_recode

In [None]:
for colname in to_recode.keys():
    print(f"Processing {colname}...")
    print( to_recode[colname])
    lookup_table = to_recode[colname].orm
    source_table_name = 'MapPLUTO_24v4_clipped'
    lookup_table_name = lookup_table.name
    text_column_name = colname
    s = time.process_time() # start timer
    populate_lookup_table(engine, lookup_table, source_table_name, lookup_table_name, text_column_name)
    e = time.process_time() # end time
    print(f"Populating {colname} took {e-s} seconds")


In [None]:
to_recode[colname]