# Create and populate the database

In [None]:
# # Import standard libraries
# import os
import re
import json
import codecs
# import requests
import time
import dill
time.sleep(3)

from urllib.request import urlopen

# # Import third-party libraries
# import geopandas as gpd
# from geoalchemy2 import Geometry
import pandas as pd
from sqlalchemy import create_engine, Column, Integer, String, Date, MetaData, event, Table, text, LargeBinary, ForeignKey
from sqlalchemy.dialects.sqlite import insert
from sqlalchemy.orm import sessionmaker
# from sqlalchemy.event import listen
# from sqlalchemy.engine import Engine
# from sqlalchemy.ext.declarative import declarative_base

# import sqlite3
# from sqlite3 import dbapi2 as sqlite

# import fiona
# from fiona.crs import from_epsg

from src.helpers import *
from src.dbutils import *
from src.ORMutils import *
from src.models import *
from src.geo import *
from src.pdfutils import *

* #### Load the objects created in previous notebooks

In [None]:
# Load the environment: deserialize the objects saved with dill by an earlier
# notebook.
# NOTE(review): dill.load can execute arbitrary code embedded in the file, so
# this pickle must only ever come from a trusted source.
with open("environment_data/table_dicts.pkl", "rb") as f:
    env = dill.load(f)

# Restore the environment: copy every entry of `env` into this module's global
# namespace. Names used later but never assigned in this file (presumably
# col_customization_dict, dataset_info_dict, ...) are expected to arrive
# here -- TODO confirm against the notebook that writes the pickle.
globals().update(env)

In [None]:
# for name, dataset in dataset_info_dict.items():
#     if dataset.col_customizations is not None:
#         for key in dataset.col_customizations.keys():
#             print(dataset.col_customizations[key].short_name)
#     else:
#         print(f'No column info set for {dataset.column_metadata}')

* ### Create the database engine that will be used throughout the rest of the notebook.

In [None]:
# Single shared engine for the SQLite database at SQLITE_PATH (defined outside
# the visible code). check_same_thread=False is passed as a URL query
# argument; echo=False keeps SQL statement logging off.
engine = create_engine(f'{SQLITE_PATH}?check_same_thread=False', echo=False)

# Session factory bound to that engine, with autoflush disabled.
SessionLocal = sessionmaker(bind=engine, autoflush=False, autocommit=False)

    

* ### Configure the database

In [None]:
@event.listens_for(engine, "connect")
def load_spatialite(dbapi_conn, connection_record):
    """Load the SpatiaLite extension into a newly opened DBAPI connection.

    Registered as a "connect" listener on ``engine``, so it is invoked with
    the raw DBAPI connection each time the engine opens one.

    Args:
        dbapi_conn: The raw DBAPI (sqlite3) connection that was just opened.
        connection_record: Pool bookkeeping object supplied by SQLAlchemy;
            unused here.
    """
    print("Loading SpatiaLite extension")
    dbapi_conn.enable_load_extension(True)
    try:
        dbapi_conn.load_extension("mod_spatialite")
    finally:
        # Always switch extension loading back off, even when the load fails,
        # so the connection is never left able to load arbitrary libraries.
        dbapi_conn.enable_load_extension(False)


# Smoke test: open a connection and ask SpatiaLite for its version, which only
# succeeds if the extension's SQL functions are available on the connection.
with engine.connect() as conn:
    print("Connection established")
    result = conn.execute(text("SELECT spatialite_version()"))
    spatialite_version = result.fetchone()
    print(f"SpatiaLite version: {spatialite_version[0]}")

# Enable WAL mode
# NOTE(review): synchronous and temp_store are per-connection settings in
# SQLite, so they presumably only affect the connection this session happened
# to use; consider issuing them from a "connect" listener -- TODO confirm.
with SessionLocal() as session:
    session.execute(text("PRAGMA journal_mode=WAL"))
    session.execute(text("PRAGMA synchronous = NORMAL"))
    session.execute(text("PRAGMA temp_store = MEMORY"))

# Initialize spatial metadata if not already present
# with engine.connect() as conn:
#     conn.execute(text("SELECT InitSpatialMetaData(1)"))
# Same version check as above, this time through a Session.
with SessionLocal() as session:
    result = session.execute(text("SELECT spatialite_version()"))
    spatialite_version = result.fetchone()
    print(f"SpatiaLite version: {spatialite_version[0]}")



* Initialize spatialite metadata for better performance later (Thanks Copilot)

In [None]:
# Ask SpatiaLite to create its spatial metadata tables in this database.
# NOTE(review): there is no explicit conn.commit() here; this presumably
# relies on InitSpatialMetaData(1) running in its own transaction -- TODO
# confirm the metadata tables persist after the connection closes.
with engine.connect() as conn:
    conn.execute(text("SELECT InitSpatialMetaData(1)"))

In [None]:
# Standalone MetaData object (separate from Base.metadata); reflected later.
metadata = MetaData()
# Load the definitions of tables that already exist in the database into
# Base.metadata (Base comes from one of the src.* star-imports).
Base.metadata.reflect(bind=engine)

* ### Create lookup tables for the variables identified as categorical and for which definitions were extracted from the metadata in the previous notebook.

* There are borough codes in the PLUTO dataset but, annoyingly, in contrast to most other datasets, the borough code is a two-letter abbreviation like "BK" or "BX". Also in the PLUTO dataset, "Sanitation Borough" does use the standard numeric codes that most other NYC OpenData datasets use. All this is to say that the borough codes require special handling, separate from my system for extracting categories and creating lookup tables for them programmatically.

In [None]:
# Every column flagged as categorical gets a lookup table.
# Codes for overlays are to go in the same table as other zoning codes
lookups = {k: v for k, v in col_customization_dict.items() if v.is_category}

# Names of the lookup tables created so far. Used to avoid creating redundant
# tables for columns that are same-kind repeats (such as "district_1" and
# "district_2"), which share one lookup table.
completed_tables = []

for name, table in lookups.items():
    print(f"processing {table}")
    # Strip a trailing numeric suffix so repeats map to one table name.
    lookup_table_name = re.sub(r'_[0-9]+$', '', table.new_name)
    print(f"lookup_table_name: {lookup_table_name}")
    already_created = any(
        table.new_name.startswith(prefix) and table.new_name[-1].isdigit()
        for prefix in completed_tables
    )
    if already_created:
        print(f"Lookup table {lookup_table_name} already created, continuing...")
        continue
    # Only the table definition is registered on Base.metadata here, so no
    # database connection is needed until create_all() below.
    print(f"Creating lookup table {lookup_table_name}...")
    lookup_table = create_lookup_table(Base.metadata, lookup_table_name=lookup_table_name, text_column_name='name_or_code')
    print(f"Created lookup table: {lookup_table}")
    name_prefix = lookup_table_name
    completed_tables.append(name_prefix)
    lookups[name].orm = lookup_table

# Emit CREATE TABLE for everything registered on Base.metadata.
Base.metadata.create_all(engine)



In [None]:
# Column families that repeat with a numeric suffix (name_1, name_2, ...),
# mapped to the number of repeats. All members of a family share the lookup
# table that was created for the family's "_1" column.
multicolumns = {'zoning_district': 4, 'commercial_overlay': 2, 'special_purpose_district': 3}
for name in multicolumns:
    print(f"Setting {name} columns")
    cols = {k: v for k, v in col_customization_dict.items() if v.new_name.startswith(name)}
    # Pick the primary column from THIS family only. Searching the whole
    # dictionary would return the first "_1" column of any family and attach
    # the wrong lookup table to the other families.
    main_col = [v for v in cols.values() if v.new_name.endswith("_1")][0]
    for key in cols:
        lookups[key].orm = main_col.orm

In [None]:
# Fill each lookup table with the definitions extracted for its column.
for name, table in lookups.items():
    lookup_table = table.orm
    if lookup_table is None:
        print(f"Skipping {name}...")
        continue
    print(lookup_table)
    with engine.connect() as connection:
        for definition in table.definitions:
            if len(definition) == 2:
                try:
                    # (numeric id, label)
                    values = {'id': int(definition[0]), 'name_or_code': definition[1]}
                except ValueError:
                    # First item is not numeric: treat the pair as (code, info).
                    values = {'name_or_code': definition[0], 'info': definition[1]}
            elif len(definition) == 3:
                try:
                    # (numeric id, label, info)
                    values = {
                        'id': int(definition[0]),
                        'name_or_code': definition[1],
                        'info': definition[2],
                    }
                except (TypeError, ValueError) as exc:
                    print(exc)
                    print(definition)
                    # Skip this definition. Falling through would execute a
                    # statement left over from a previous iteration (or fail
                    # on an undefined name for the very first definition).
                    continue
            else:
                print(definition)
                raise ValueError("Was only expecting two or three columns")
            connection.execute(insert(lookup_table).values(**values).on_conflict_do_nothing())
        connection.commit()
    # Not read again in the visible code; kept in case a later cell relies on it.
    name_prefix = table.new_name[0:round(len(table.new_name)*.75)] # Hopefully this is a safe threshold to identify when columns are repeats of the same type

## Import the MapPLUTO data:
* List the layers in the file
* In this case there is only one layer, so it isn't necessary to know and specify which one to import, but including anyway for future reference.

In [None]:
# Path to the MapPLUTO geodatabase (.gdb); nothing is read from it in this cell.
gdb_path = f"{PROJECT_DATA}/files_to_use/MapPLUTO24v4.gdb"


* Import the geodatabase (.gdb) file.

In [None]:

geodata = {}  # layer name -> GeoDataFrame read from that layer
# List layers in the GDB file
# NOTE(review): `fiona` and `gpd` are not imported in the visible code above
# (those imports are commented out); presumably one of the `from src...
# import *` lines provides them -- TODO confirm.
layers = fiona.listlayers(gdb_path)
print("Layers in the GDB file:")
for layer in layers:
    print(layer)
    gdf = gpd.read_file(gdb_path, layer=layer)
    # gdf['borough'] = gdf['Borough'].replace(replacement_dict)
    try:
        # Keep a WKB copy of each geometry; falsy geometries become None.
        gdf['wkb'] = gdf['geometry'].apply(lambda geom: geom.wkb if geom else None)
    except KeyError:
        # No 'geometry' column in this layer: store it without a 'wkb' column.
        pass
    geodata[layer] = gdf


In [None]:
geodata.keys()


In [None]:
col_customization_dict

* Create the table in the SQLite database and insert the (modified) data from the gdb file.

In [None]:
gdf = geodata['MapPLUTO_24v4_clipped']
# Diagnostic: for every float column, record whether all of its values are
# present and whole numbers. Keyed by column name so the result can be read
# per column (a set would collapse to at most {True, False}).
is_whole_number = {
    col: (gdf[col].notna() & (gdf[col] % 1 == 0)).all()
    for col in gdf.columns
    if gdf[col].dtype == 'float'
}
gdf.columns

In [None]:
# # Function to check if all non-NaN values are whole numbers
# def is_whole_number_series(s):
#     return (s.dropna() % 1 == 0).all() 

# Float columns whose values are all whole numbers are stored as nullable
# integers instead; every other column is left untouched.
# NOTE(review): is_whole_number_series is only visible as commented-out code
# in this file; presumably a src.* star-import provides it -- TODO confirm.
for col in gdf.columns:
    column_values = gdf[col]
    if not (column_values.dtype == float and is_whole_number_series(column_values)):
        print(f"Skipping {col}")
        continue
    print(f'Column {col} is {is_whole_number_series(gdf[col])}')
    print(f'Converting {col} to integer')
    # 'Int64' (capital I) is pandas' nullable integer dtype.
    gdf[col] = gdf[col].astype('Int64')


In [None]:
from sqlalchemy import inspect
# Sanity check: list the tables that currently exist in the database.
inspector = inspect(engine)
print(inspector.get_table_names())  # Ensure "basement_type_or_grade_lookup" is listed


In [None]:
# Map each column customization's short_name to its new_name.
rename_mappings = {v.short_name: v.new_name for v in col_customization_dict.values()}

In [None]:
# Rename the GeoDataFrame columns; labels absent from the mapping are kept as-is.
gdf = gdf.rename(columns=rename_mappings)

In [None]:
print(gdf.columns)

In [None]:
# A few of the column names did not exactly match up due to slightly different field names than specified in the data dictionary, so these need to be renamed manually:
# NOTE(review): two of the new names start with a digit, so they are not valid
# Python identifiers; they can only be passed as keywords via ** unpacking.

more_mappings = {
    "HealthCenterDistrict": "health_center_district",
    "SanitDistrict": "sanitation_district_number",
    "Sanitboro": "sanitation_district_boro",
    "FIRM07_FLAG": "2007_flood_insurance_rate_map_indicator",
    "PFIRM15_FLAG": "2015_preliminary_flood_insurance_rate_map",
}
gdf = gdf.rename(columns=more_mappings)

In [None]:
print(gdf.columns)

In [None]:
from sqlalchemy import Table, MetaData, Column, Integer, String, ForeignKey, LargeBinary, Float, Date
from sqlalchemy.orm import declarative_base

# Reflect the existing database tables once
# (into the standalone `metadata` object, which is not Base.metadata).
metadata.reflect(bind=engine)

# Function to map custom dtype to SQLAlchemy types
def map_custom_dtype(dtype):
    """Translate a dtype name into the SQLAlchemy type of the same name.

    Args:
        dtype: One of 'Integer', 'String', 'Float', 'Date' or 'LargeBinary'.

    Returns:
        The matching SQLAlchemy type class.

    Raises:
        ValueError: If ``dtype`` is not one of the supported names.
    """
    supported = ('Integer', 'String', 'Float', 'Date', 'LargeBinary')
    if dtype not in supported:
        raise ValueError(f"Unsupported dtype: {dtype}")
    sqlalchemy_types = {
        'Integer': Integer,
        'String': String,
        'Float': Float,
        'Date': Date,
        'LargeBinary': LargeBinary,
    }
    return sqlalchemy_types[dtype]

# Function to dynamically create the table class
def create_dynamic_table_class(table_name, col_customization_dict):
    """Build a declarative model class for ``table_name``.

    The class always carries ``id``, ``geometry``, ``wkb``, ``Shape_Leng`` and
    ``Shape_Area`` columns; the remaining columns come from the customizations.

    Args:
        table_name: Used both as ``__tablename__`` and as the class name.
        col_customization_dict: Mapping of key -> column customization. Each
            value is read for ``dtype``, ``new_name``, ``is_fk`` and
            ``is_category``.

    Returns:
        A new subclass of ``Base``.

    Raises:
        ValueError: Propagated from ``map_custom_dtype`` for an unsupported
            ``dtype``; this is checked for every column, foreign keys included.
    """
    # Columns not listed in the data dictionary.
    class_attrs = {
        '__tablename__': table_name,
        'id': Column(Integer, primary_key=True, autoincrement=True),
        'geometry': Column(String),
        'wkb': Column(LargeBinary),  # WKB bytes
        'Shape_Leng': Column(Float),
        'Shape_Area': Column(Float),
    }

    for key, custom in col_customization_dict.items():
        # Keys that contain a multi-column family name lose a trailing
        # single-digit suffix (reads the module-level `multicolumns`).
        in_family = any([family for family in multicolumns.keys() if family in key])
        lookup_key = re.sub('_[0-9]$', '', key) if in_family else key
        sa_type = map_custom_dtype(custom.dtype)

        if custom.is_fk:
            # The attribute is named after the (possibly stripped) key, while
            # the foreign key target is derived from new_name.
            class_attrs[lookup_key] = Column(Integer, ForeignKey(f'{custom.new_name}_lookup.id'))
            continue

        if custom.is_category:
            # Keep the raw value and add a companion <new_name>_id column.
            print(f'Creating id column for {custom.new_name}')
            class_attrs[custom.new_name] = Column(sa_type)
            class_attrs[f"{custom.new_name}_id"] = Column(Integer, ForeignKey(f'{lookup_key}_lookup.id'))
            continue

        class_attrs[custom.new_name] = Column(sa_type)

    return type(table_name, (Base,), class_attrs)

# Create the MapPLUTO_24v4_clipped table class
MapPLUTO24v4Clipped = create_dynamic_table_class('MapPLUTO_24v4_clipped', col_customization_dict)

# Reflect the database into the standalone `metadata` object again.
# NOTE(review): `metadata` is a different object from Base.metadata, and the
# new class's table is only created by create_all() below, so this call
# likely does not pick up the new table class -- TODO confirm intent.
metadata.reflect(bind=engine)

# Create all tables in the database
Base.metadata.create_all(engine)


In [None]:
from sqlalchemy.orm import sessionmaker
from shapely import wkb

# gdf = geodata['MapPLUTO_24v4_clipped']
def format_float(value):
    """Return ``str(value)`` without trailing zeros or a bare trailing '.'.

    Not called in this cell; kept as a helper.
    """
    as_text = str(value)
    return as_text.rstrip('0').rstrip('.') if '.' in as_text else as_text

# Rows are added one by one and committed once per batch. The session is
# opened only through the context manager so it is always closed, even when a
# row fails.
batch_size = 100000
with SessionLocal() as session:
    for start in range(0, len(gdf), batch_size):
        batch = gdf.iloc[start:start + batch_size]
        for idx, row in batch.iterrows():
            try:
                values = {}
                for col in gdf.columns:
                    val = row[col]
                    if isinstance(val, pd.Series):
                        # Duplicate column labels make row[col] a Series; keep
                        # the first value, and test THAT value for NA below.
                        val = val.iloc[0]
                    # Replace NA values with None so that SQLAlchemy inserts them as NULL:
                    values[col] = None if pd.isna(val) else val
                # Parse the date only after NA normalisation, so NaN/NaT/pd.NA
                # are skipped instead of being handed to the parser.
                if values['apportionment_date']:
                    values['apportionment_date'] = parseDateString(values['apportionment_date'])
                # Prepare the geometry and entry object
                geometry = values.pop('geometry')
                geometry_wkb = geometry.wkb if geometry else None
                pluto_entry = MapPLUTO24v4Clipped(geometry=geometry_wkb, **values)
                session.add(pluto_entry)
            except Exception:
                print(f"Error at row index {idx}")
                for col in gdf.columns:
                    try:
                        print(f"Column: {col}, Value: {row[col]}, Type: {type(row[col])}")
                    except Exception as sub_e:
                        print(f"Error printing column {col}: {sub_e}")
                raise  # re-raise after logging for further debugging
        session.commit()


In [None]:
del gdf

* Make a test plot to verify that the geodata was stored correctly

In [None]:
import geopandas as gpd
import pandas as pd
from shapely import wkb
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import networkx as nx
from sqlalchemy import create_engine, event, text

import itertools

# Read the data from the database
query = "SELECT zip_code, geometry FROM MapPLUTO_24v4_clipped"
df = pd.read_sql(query, engine)

# Debug: Print the DataFrame columns
print("DataFrame columns:", df.columns)

# Convert the geometry column from WKB to Shapely geometries
df['geometry'] = df['geometry'].apply(lambda x: wkb.loads(x) if x else None)

# Convert the DataFrame to a GeoDataFrame
gdf = gpd.GeoDataFrame(df, geometry='geometry')

# Print the GeoDataFrame
print(gdf.head())

# Ensure that zip_code is preserved during the dissolve process
merged_gdf = gdf.dissolve(by='zip_code', aggfunc={'zip_code': 'first'})  # Explicit aggregation of zip_code

# Check if zip_code is now present after dissolving
print(merged_gdf.columns)  # Should include 'zip_code'

# Create a new adjacency graph based on the merged geometries
G = nx.Graph()

# Add an edge for every pair of merged shapes that touch. touches() is
# symmetric, so each unordered pair is tested once.
for (i, geom_i), (j, geom_j) in itertools.combinations(merged_gdf.geometry.items(), 2):
    if geom_i.touches(geom_j):
        G.add_edge(i, j)

# Shapes with no touching neighbour never appear in an edge; add them as
# isolated nodes so they still receive a colour instead of NaN.
G.add_nodes_from(merged_gdf.index)

# Perform graph coloring to ensure adjacent shapes don't share the same color
color_map = nx.coloring.greedy_color(G, strategy="largest_first")

# Plot the map with the colors assigned
fig, ax = plt.subplots(1, 1, figsize=(10, 10))

# Normalize the color map to cover the full range of the node indices
norm = mcolors.Normalize(vmin=min(color_map.values()), vmax=max(color_map.values()))
sm = plt.cm.ScalarMappable(cmap=plt.cm.tab20, norm=norm)

# Color the merged geometries based on the graph coloring using the full palette
merged_gdf['color'] = merged_gdf.index.map(color_map)
merged_gdf.plot(ax=ax, color=[sm.to_rgba(i) for i in merged_gdf['color']], edgecolor='black', linewidth=0, legend=False)

# Add labels at the center of each merged shape
for _, row in merged_gdf.iterrows():
    centroid = row.geometry.centroid
    ax.text(centroid.x, centroid.y, str(row['zip_code']), fontsize=2, ha='center', va='center')

# Add a colorbar to visualize the full range of colors
cbar = fig.colorbar(sm, ax=ax)
cbar.set_label('Color Range (Graph Coloring)', rotation=270, labelpad=20)

# NOTE(review): machine-specific absolute path; consider building it from
# PROJECT_DATA if that points at the same directory -- TODO confirm.
plt.savefig("/home/james/Massive/PROJECTDATA/map_output_zip_shuffled2.pdf", format="pdf")

plt.show()

In [None]:
# to_recode = [col for col in col_customization_dict.values() if not col.is_fk and col.is_category]
# Categorical columns that are not foreign keys, keyed the same way as `lookups`.
to_recode = {k:v for k,v in lookups.items() if not v.is_fk and v.is_category}
to_recode

In [None]:
# Populate each non-FK categorical column's lookup table from the values
# stored in the main table, and report how long each one takes.
source_table_name = 'MapPLUTO_24v4_clipped'
for colname, column_info in to_recode.items():
    print(f"Processing {colname}...")
    lookup_table = column_info.orm
    lookup_table_name = lookup_table.name
    text_column_name = colname
    # perf_counter measures elapsed wall-clock time, including time spent
    # waiting on the database; process_time would only count CPU time.
    started = time.perf_counter()
    populate_lookup_table(engine, lookup_table, source_table_name, lookup_table_name, text_column_name)
    elapsed = time.perf_counter() - started
    print(f"Populating {colname} took {elapsed} seconds")


# Add more datasets to db

In [None]:
# Maps dataset column type names to SQLAlchemy column types.
# NOTE(review): not referenced anywhere else in the visible code; presumably
# consumed by a src.* helper or left over -- TODO confirm.
datatype_mappings = {"meta_data" : String, "calendar_date" : Date, "number" : Float, "text" : String, "point" : String}

In [None]:
import jsonlines
import orjson
import time
from shapely import from_wkt  # Vectorized conversion function in Shapely 2.0
from geoalchemy2.shape import from_shape
from sqlalchemy.engine import Engine

def convert_wkt(rows_to_insert):
    """Convert each row's raw WKT string into a geometry value, in place.

    Reads ``'_raw_geocoded_column'`` from every row dict, converts the whole
    batch in one ``from_wkt`` call, stores the result (or None) under
    ``'geocoded_column'`` and removes the raw key.

    If the batch conversion raises, the error is printed and every row in the
    batch gets ``geocoded_column = None``.

    Args:
        rows_to_insert: List of row dicts; mutated in place.
    """
    # Batch convert geometries.
    raw_wkts = [r.get('_raw_geocoded_column') for r in rows_to_insert]
    try:
        shapely_geoms = from_wkt(raw_wkts)
    except Exception as e:
        # No row index is available at batch level, so report the batch size.
        print(f"Error converting geometry for batch of {len(rows_to_insert)} rows: {e}")
        shapely_geoms = [None] * len(rows_to_insert)
    geoms = [
        from_shape(geom, srid=4326) if geom is not None else None
        for geom in shapely_geoms
    ]
    for r, geom in zip(rows_to_insert, geoms):
        r['geocoded_column'] = geom
        r.pop('_raw_geocoded_column', None)

def insert_dataset(dataset, jsonfile, columns, batch_size=10000, commit_interval=100):
    """Stream a JSON-lines file into a table created for ``dataset``.

    The table is created when the first row is read. Rows are cleaned, mapped
    onto ``columns`` by position, have their date columns parsed, and are
    inserted in batches with ``INSERT OR IGNORE``.

    Args:
        dataset: Dataset descriptor; ``col_types`` and ``short_name`` are
            passed to ``create_table_for_dataset``.
        jsonfile: Path of the JSON-lines file; each line is assumed to be a
            list of values in the order of ``columns``.
        columns: Mapping of column name -> SQLAlchemy type. Columns whose
            type is ``Date`` are parsed with ``parseDateString``.
        batch_size: Number of rows per executemany batch.
        commit_interval: How many batches to accumulate before a commit.
    """
    col_names = list(columns.keys())
    datetime_cols = {key for key, col_type in columns.items() if col_type is Date}
    rows_to_insert = []
    batch_counter = 0
    insert_stmt = None

    # Define a custom loads function for orjson.
    def custom_loads(s):
        return orjson.loads(s.encode("utf-8"))

    # The context manager closes the session (discarding uncommitted work)
    # even if reading, converting or inserting raises.
    with SessionLocal() as session:
        with jsonlines.open(jsonfile, mode='r', loads=custom_loads) as reader:
            for idx, row in enumerate(reader):
                if insert_stmt is None:
                    # Create the table lazily, on the first row only.
                    DynamicTable = create_table_for_dataset(
                        columns=dataset.col_types,
                        prefix=dataset.short_name,
                        engine=engine
                    )
                    # Prepare the insert statement (using SQLite's OR IGNORE to skip duplicates)
                    insert_stmt = DynamicTable.__table__.insert().prefix_with("OR IGNORE")

                # Row is assumed to be a list; clean text values.
                row = [textClean(val) if isinstance(val, str) else val for val in row]
                # Map list to dict using the expected order; short rows are
                # padded with None.
                row_data = {col_name: (row[i] if i < len(row) else None)
                            for i, col_name in enumerate(col_names)}

                # Keep the raw WKT for later geometry conversion.
                row_data['_raw_geocoded_column'] = row_data.get('geocoded_column')
                row_data['geocoded_column'] = None  # initialize to None for now.

                # Process datetime values.
                for key in datetime_cols:
                    if row_data.get(key) is not None:
                        row_data[key] = parseDateString(row_data[key])

                rows_to_insert.append(row_data)

                if (idx + 1) % batch_size == 0:
                    # Batch convert geometries, then insert the batch.
                    convert_wkt(rows_to_insert)
                    session.execute(insert_stmt, rows_to_insert)
                    rows_to_insert = []
                    batch_counter += 1

                    # Rather than commit for each batch, commit once every commit_interval batches.
                    if batch_counter % commit_interval == 0:
                        commit_start = time.perf_counter()
                        session.commit()
                        commit_end = time.perf_counter()
                        print(f"Committed {commit_interval} batches in {commit_end - commit_start:.3f} seconds.")
                        # Optionally, you can also print the batch counter.
                        print(f"Batch {batch_counter} processed.")

            # Process any leftover rows.
            if rows_to_insert:
                convert_wkt(rows_to_insert)
                session.execute(insert_stmt, rows_to_insert)

            # Final commit for any outstanding operations.
            session.commit()
    

In [None]:
# Print every entry of dataset_info_dict for inspection.
for k,v in dataset_info_dict.items():
    print(f'{k} : {v}')

In [None]:

# Load every dataset whose format is 'json' into the database; datasets in
# any other format are passed over silently.
for name, dataset in dataset_info_dict.items():
    if dataset.format != 'json':
        continue
    print(f'Starting dataset {dataset.short_name}')
    print(f'The dataset to be processed is {dataset}')
    jsonfile, columns = dataset.dataset_path, dataset.col_types
    print(f'Columns are {columns}')
    insert_dataset(dataset, jsonfile, columns, batch_size=25000, commit_interval=100)
   