# Create and populate the database

In [13]:
# # Import standard libraries
# import os
# import re
# import json
# import codecs
# import requests
import dill
# from urllib.request import urlopen

# # Import third-party libraries
# import geopandas as gpd
# from geoalchemy2 import Geometry
# import pandas as pd
from sqlalchemy import create_engine, Column, Integer, String, Date, MetaData, event, Table, text, LargeBinary
from sqlalchemy.dialects.sqlite import insert
from sqlalchemy.orm import sessionmaker
# from sqlalchemy.event import listen
# from sqlalchemy.engine import Engine
# from sqlalchemy.ext.declarative import declarative_base

# import sqlite3
# from sqlite3 import dbapi2 as sqlite

# import fiona
# from fiona.crs import from_epsg

from src.helpers import *
from src.dbutils import *
from src.ORMutils import *
from src.models import *
from src.geo import *
from src.pdfutils import *

In [14]:
# Load the saved environment from environment_data/table_dicts.pkl.
# NOTE(review): dill/pickle deserialization can execute arbitrary code — only
# load files this project produced itself.
with open("environment_data/table_dicts.pkl", "rb") as f:
    env = dill.load(f)

# Restore the environment: every key of `env` becomes a notebook-level global.
# NOTE(review): later cells use names such as `datasets`, `SQLITE_PATH` and
# `col_customization_dict` that are not defined in this notebook; presumably
# they come from these pickles or the star imports — confirm.
globals().update(env)

In [15]:
# Load the second saved environment from environment_data/select.pkl.
# `env` is rebound here, so the previous cell's dict is no longer referenced by that name.
# NOTE(review): dill/pickle deserialization can execute arbitrary code — only
# load files this project produced itself.
with open("environment_data/select.pkl", "rb") as f:
    env = dill.load(f)

# Restore the environment: keys in this file overwrite same-named globals
# set by the previous cell.
globals().update(env)

In [16]:
# List the short name of every dataset as a quick check of what was restored.
# `name` (the dict key) is not used in the loop body.
# NOTE(review): `datasets` is presumably one of the globals restored from the pickles above — confirm.
for name, dataset in datasets.items():
    print(dataset.short_name)

mapPLUTO
assessments
tax_liens
housing_violations
assessment_actions
housing_database
NTAs2020
NTA_population_2020
NTA_demographics_2020
census_blocks2020
CDTAs2020
puma2020
cert_of_occupancy


* ### Create the database engine that will be used throughout the rest of the notebook.

In [17]:
# Build the engine from SQLITE_PATH, appending check_same_thread=False as a
# query parameter; echo=False keeps SQL statements out of the cell output.
# NOTE(review): SQLITE_PATH is not defined in this notebook; presumably a
# full SQLAlchemy URL (e.g. "sqlite:///...") from the restored environment — confirm.
engine = create_engine(f'{SQLITE_PATH}?check_same_thread=False', echo=False)

# Session factory bound to the engine, with automatic flushing disabled.
SessionLocal = sessionmaker(bind=engine, autoflush=False, autocommit=False)

* ### Configure the database

In [18]:
@event.listens_for(engine, "connect")
def load_spatialite(dbapi_conn, connection_record):
    """Load the SpatiaLite extension into each new DBAPI connection.

    Registered as a "connect" listener on ``engine``, so it runs whenever the
    engine opens a new underlying connection.

    Args:
        dbapi_conn: The raw DBAPI connection that was just created.
        connection_record: Pool record for the connection (unused here).
    """
    print("Loading SpatiaLite extension")
    dbapi_conn.enable_load_extension(True)
    try:
        dbapi_conn.load_extension("mod_spatialite")
    finally:
        # Always switch extension loading back off, even if mod_spatialite
        # could not be loaded, so the connection is not left able to load
        # arbitrary extensions.
        dbapi_conn.enable_load_extension(False)


# Open a connection and query the SpatiaLite version, which confirms that the
# extension's SQL functions are available on connections from this engine.
with engine.connect() as conn:
    print("Connection established")
    result = conn.execute(text("SELECT spatialite_version()"))
    spatialite_version = result.fetchone()
    print(f"SpatiaLite version: {spatialite_version[0]}")

# Enable WAL mode
# NOTE(review): no explicit commit is issued on this connection; presumably
# the PRAGMA takes effect without one — confirm.
with engine.connect() as conn:
    conn.execute(text("PRAGMA journal_mode=WAL"))

# Initialize spatial metadata.
# NOTE(review): this call is unconditional — nothing in this cell checks
# whether the metadata is already present. Presumably SpatiaLite treats a
# repeated call as a harmless no-op — confirm.
with engine.connect() as conn:
    conn.execute(text("SELECT InitSpatialMetaData(1)"))

Loading SpatiaLite extension
Connection established
SpatiaLite version: 5.1.0


### Manually create the borough codes lookup table
* These codes are standardized and available in many places. However, I could not find a single official source of record from which to retrieve them programmatically; since there are only five of them, I enter them manually.

In [19]:
# Borough name -> numeric borough code, entered by hand.
borough_codes = {
    'Manhattan': 1,
    'Bronx': 2,
    'Brooklyn': 3,
    'Queens': 4,
    'Staten Island': 5,
}

* Create the lookup table.

In [20]:
# Create a MetaData registry and reflect into it the tables that already
# exist in the database behind `engine`.
metadata = MetaData()
metadata.reflect(bind=engine)

def create_lookup_table_simple(engine=engine, metadata=metadata, lookup_table_name='new_lookup_table', lookup_column_name='name'):
    """Define a two-column lookup table and create it if it does not exist.

    The table has an integer primary key ``id`` (no autoincrement) and one
    unique, non-nullable string column whose default is "NO DATA".

    Note that the ``engine`` and ``metadata`` defaults are bound when this
    function is defined, i.e. to the objects that exist at that moment.

    Args:
        engine: Engine used for the existence check and for table creation.
        metadata: MetaData registry the table definition is attached to;
            an existing definition of the same name is extended.
        lookup_table_name: Name of the lookup table.
        lookup_column_name: Name of the string column holding the values.

    Returns:
        The ``Table`` object, whether it was newly created or already existed.
    """
    columns = (
        Column('id', Integer, primary_key=True, autoincrement=False),
        Column(lookup_column_name, String, unique=True, nullable=False, default="NO DATA"),
    )
    table_definition = Table(lookup_table_name, metadata, *columns, extend_existing=True)

    # Only issue CREATE TABLE when the database does not have the table yet.
    if not table_exists(engine, lookup_table_name):
        table_definition.create(engine)
    else:
        print("Table exists")
    return table_definition

* Populate the table with the borough codes.

In [21]:
# Define (and, if needed, create) the `boroughs` lookup table.
borough_lookup_table = create_lookup_table_simple(
    engine=engine,
    metadata=metadata,
    lookup_table_name='boroughs',
    lookup_column_name='borough',
)

# Insert one row per borough; rows that conflict with existing ones are
# skipped, so re-running the cell does not fail or create duplicates.
with engine.connect() as connection:
    for key, value in borough_codes.items():
        stmt = (
            insert(borough_lookup_table)
            .values(id=value, borough=key)
            .on_conflict_do_nothing()
        )
        connection.execute(stmt)
    connection.commit()

* ### Create lookup tables for variables identified as categorical and for which definitions were extracted from the metadata in the previous notebook.

In [None]:
# Keep only the column customizations that carry value definitions; each of
# these gets (or shares) a lookup table.
lookups = {k: v for k, v in col_customization_dict.items() if v.definitions}

# Name prefixes of columns whose lookup table has already been populated.
# Used to avoid creating redundant tables for columns that are same-kind
# repeats (such as "district_1" and "district_2") and thus share one lookup.
completed_tables = []

# NOTE(review): `re` is not imported explicitly in the imports cell;
# presumably it arrives through one of the `from src... import *` lines — confirm.
for name, table in lookups.items():
    print('table is', table)
    # Title-case the column name and strip a trailing "_<digits>" suffix so
    # that numbered repeats map to the same lookup table name.
    lookup_table_name = re.sub('_[0-9]+$', '', table.new_name.title())

    # Compare this column's name against every prefix recorded so far.
    prefix_matches = [table.new_name.startswith(prefix) for prefix in completed_tables]
    print("prefixes are", completed_tables)
    print("table.new_name is", table.new_name)
    print(prefix_matches)
    print(any(prefix_matches))
    if any(prefix_matches):
        print("Lookup table already created, continuing...")
        continue

    with engine.connect() as connection:
        lookup_table = create_lookup_table(engine=engine, lookup_table_name=lookup_table_name, text_column_name='name_or_code')
        if lookup_table is None:
            print(f"Table {lookup_table_name} was not properly retrieved or created. Perhaps it exists? Skipping for now..")
            continue
        for definition in table.definitions:
            print(definition)
            if len(definition) == 2:
                try:
                    # [integer code, label]: the code becomes the row id.
                    stmt = insert(lookup_table).values(id=int(definition[0]), name_or_code=definition[1]).on_conflict_do_nothing()
                except ValueError:
                    # [text code, description]: the code is not an integer, so
                    # it is stored as the text value and the second element as info.
                    stmt = insert(lookup_table).values(name_or_code=definition[0], info=definition[1]).on_conflict_do_nothing()
            elif len(definition) == 3:
                try:
                    stmt = insert(lookup_table).values(id=int(definition[0]), name_or_code=definition[1], info=definition[2]).on_conflict_do_nothing()
                except Exception as e:
                    print(e)
                    print(definition)
                    # No statement was built for this definition. Skip it so
                    # the execute below does not re-run the previous row's
                    # statement (or hit an undefined `stmt` on the first row).
                    continue
            else:
                print(definition)
                raise ValueError("Was only expecting two or three columns")
            connection.execute(stmt)
        connection.commit()

    # Record the first ~75% of the column name as the "already done" prefix.
    # Hopefully this is a safe threshold to identify when columns are repeats of the same type.
    name_prefix = table.new_name[0:round(len(table.new_name)*.75)]
    completed_tables.append(name_prefix)


table is ColCustomization(short_name='Borough', new_name='borough', dtype='String', synonyms=[], definitions=[['BX', 'Bronx'], ['BK', 'Brooklyn'], ['MN', 'Manhattan'], ['QN', 'Queens']], drop=False, is_fk=True)
shortened: Borough
prefixes are []
table.new_name is borough
[]
False
Table exists
Table Borough was not properly retrieved or created. Perhaps it exists? Skipping for now..
table is ColCustomization(short_name='SanitBoro', new_name='sanitation_district_boro', dtype='Float', synonyms=[], definitions=[['1', 'Manhattan'], ['2', 'Bronx'], ['3', 'Brooklyn'], ['4', 'Queens']], drop=False, is_fk=True)
shortened: Sanitation_District_Boro
prefixes are []
table.new_name is sanitation_district_boro
[]
False
Table exists
Table Sanitation_District_Boro was not properly retrieved or created. Perhaps it exists? Skipping for now..
table is ColCustomization(short_name='SPDist1', new_name='special_purpose_district_1', dtype='String', synonyms=[], definitions=[['125th', 'Special 125th Street Dist

In [23]:
# Display the filtered column customizations for inspection.
lookups

{'borough': ColCustomization(short_name='Borough', new_name='borough', dtype='String', synonyms=[], definitions=[['BX', 'Bronx'], ['BK', 'Brooklyn'], ['MN', 'Manhattan'], ['QN', 'Queens']], drop=False, is_fk=True),
 'sanitation_district_boro': ColCustomization(short_name='SanitBoro', new_name='sanitation_district_boro', dtype='Float', synonyms=[], definitions=[['1', 'Manhattan'], ['2', 'Bronx'], ['3', 'Brooklyn'], ['4', 'Queens']], drop=False, is_fk=True),
 'special_purpose_district_1': ColCustomization(short_name='SPDist1', new_name='special_purpose_district_1', dtype='String', synonyms=[], definitions=[['125th', 'Special 125th Street District'], ['BNY', 'Special Brooklyn Navy Yard District'], ['BPC', 'Special Battery Park City District'], ['BR', 'Special Bay Ridge District'], ['BSC', 'Special Bay Street Corridor District'], ['C', 'Special Grand Concourse Preservation District'], ['CD', 'Special City Island District'], ['CI', 'Special Coney Island District'], ['CL', 'Special Clinton D

In [24]:
# Re-evaluate the same filter expression that was used to build `lookups`, to display its result directly.
{k:v for k,v in col_customization_dict.items() if col_customization_dict[k].definitions}

{'borough': ColCustomization(short_name='Borough', new_name='borough', dtype='String', synonyms=[], definitions=[['BX', 'Bronx'], ['BK', 'Brooklyn'], ['MN', 'Manhattan'], ['QN', 'Queens']], drop=False, is_fk=True),
 'sanitation_district_boro': ColCustomization(short_name='SanitBoro', new_name='sanitation_district_boro', dtype='Float', synonyms=[], definitions=[['1', 'Manhattan'], ['2', 'Bronx'], ['3', 'Brooklyn'], ['4', 'Queens']], drop=False, is_fk=True),
 'special_purpose_district_1': ColCustomization(short_name='SPDist1', new_name='special_purpose_district_1', dtype='String', synonyms=[], definitions=[['125th', 'Special 125th Street District'], ['BNY', 'Special Brooklyn Navy Yard District'], ['BPC', 'Special Battery Park City District'], ['BR', 'Special Bay Ridge District'], ['BSC', 'Special Bay Street Corridor District'], ['C', 'Special Grand Concourse Preservation District'], ['CD', 'Special City Island District'], ['CI', 'Special Coney Island District'], ['CL', 'Special Clinton D