# Create and populate the database

In [None]:
# # Import standard libraries
# import os
# import re
import json
import codecs
# import requests
import time
import dill
time.sleep(3)

from urllib.request import urlopen

# # Import third-party libraries
# import geopandas as gpd
# from geoalchemy2 import Geometry
# import pandas as pd
from sqlalchemy import create_engine, Column, Integer, String, Date, MetaData, event, Table, text, LargeBinary, ForeignKey
from sqlalchemy.dialects.sqlite import insert
from sqlalchemy.orm import sessionmaker
# from sqlalchemy.event import listen
# from sqlalchemy.engine import Engine
# from sqlalchemy.ext.declarative import declarative_base

# import sqlite3
# from sqlite3 import dbapi2 as sqlite

# import fiona
# from fiona.crs import from_epsg

from src.helpers import *
from src.dbutils import *
from src.ORMutils import *
from src.models import *
from src.geo import *
from src.pdfutils import *

* #### Load the objects created in previous notebooks

In [None]:
# Load the objects saved to table_dicts.pkl.
# NOTE(review): dill.load can run arbitrary code while unpickling, so this file
# must only ever come from a trusted source.
with open("environment_data/table_dicts.pkl", "rb") as f:
    env = dill.load(f)

# Copy every entry of the loaded object into this notebook's global namespace;
# existing globals with the same names are overwritten.
globals().update(env)

In [None]:
# Load the objects saved to select.pkl.
# NOTE(review): dill.load can run arbitrary code while unpickling, so this file
# must only ever come from a trusted source.
with open("environment_data/select.pkl", "rb") as f:
    env = dill.load(f)

# Copy every entry of the loaded object into this notebook's global namespace;
# existing globals with the same names are overwritten.
globals().update(env)

In [None]:
# Print the short name of every entry in `datasets`.
# NOTE(review): `datasets` is not defined in this notebook; presumably it is
# restored from the pickled environment or a src.* star import - confirm.
for name, dataset in datasets.items():
    print(dataset.short_name)

* ### Create the database engine that will be used throughout the rest of the notebook.

In [None]:
# Build the engine from SQLITE_PATH, appending check_same_thread=False as a
# URL query argument.
# NOTE(review): SQLITE_PATH is not defined in this notebook; presumably it is a
# complete database URL supplied by one of the src.* star imports - confirm.
engine = create_engine(f'{SQLITE_PATH}?check_same_thread=False', echo=False)

# Session factory bound to the engine, with autoflush switched off.
SessionLocal = sessionmaker(bind=engine, autoflush=False, autocommit=False)

* ### Configure the database

In [None]:
@event.listens_for(engine, "connect")
def load_spatialite(dbapi_conn, connection_record):
    """Load the SpatiaLite extension into a newly opened DBAPI connection.

    Registered as a "connect" listener on ``engine``.

    Args:
        dbapi_conn: The raw DBAPI connection handed over by the event.
        connection_record: Second argument supplied by the event; unused.
    """
    print("Loading SpatiaLite extension")
    dbapi_conn.enable_load_extension(True)
    try:
        dbapi_conn.load_extension("mod_spatialite")
    finally:
        # Always switch extension loading back off, even when the load fails,
        # so the connection is never left able to load further extensions.
        dbapi_conn.enable_load_extension(False)


# Open a connection and print the value returned by spatialite_version() as a
# check that the SpatiaLite functions are available.
with engine.connect() as conn:
    print("Connection established")
    result = conn.execute(text("SELECT spatialite_version()"))
    spatialite_version = result.fetchone()
    print(f"SpatiaLite version: {spatialite_version[0]}")

# Enable WAL mode
with engine.connect() as conn:
    conn.execute(text("PRAGMA journal_mode=WAL"))

# Initialize spatial metadata if not already present
# NOTE(review): neither this block nor the PRAGMA above issues an explicit
# commit - confirm that both changes persist with the SQLAlchemy version in use.
with engine.connect() as conn:
    conn.execute(text("SELECT InitSpatialMetaData(1)"))

In [None]:
# Declarative base used for the table classes created later in this notebook.
# NOTE(review): the declarative_base import near the top of the notebook is
# commented out; presumably the name comes from a src.* star import - confirm.
Base = declarative_base()
# Standalone MetaData object; this is a different object from Base.metadata.
metadata = MetaData()
# Reflect the tables that already exist in the database into Base.metadata.
Base.metadata.reflect(bind=engine) 

* ### Create lookup tables for the variables that were identified as categorical and whose definitions were extracted from the metadata in the previous notebook.

* There are borough codes in the PLUTO dataset, but annoyingly, in contrast to most other datasets, the borough code is a two-letter initial like "BK" or "BX". Also in the PLUTO dataset, "Sanitation Borough" does use the standard numeric codes that most other NYC OpenData datasets use. All of this is to say that the borough codes require special handling, separate from my system for extracting categories and creating lookup tables for them programmatically.

In [None]:
# Columns flagged as categorical get a lookup table. Overlay columns are left
# out because their codes are to go in the same table as other zoning codes.
# The explicit comparison with True is kept so that only a value equal to True
# selects a column, exactly as before.
lookups = {
    k: v
    for k, v in col_customization_dict.items()
    if v.is_category == True and 'overlay' not in v.new_name  # noqa: E712
}

# Names of the lookup tables created so far. Same-kind repeat columns (such as
# "district_1" and "district_2") share one lookup table, so a name already in
# this list is not created again.
completed_tables = []

lookup_tables = {}

for name, table in lookups.items():
    print(f"processing {table}")
    # Strip a trailing "_<number>" so repeat columns map to one table name.
    lookup_table_name = re.sub('_[0-9]+$', '', table.new_name)
    # Compare the stripped name exactly. A prefix test would also skip an
    # unrelated column whose name merely starts with an earlier table's name.
    if lookup_table_name in completed_tables:
        print(f"Lookup table {lookup_table_name} already created, continuing...")
        continue
    print(f"Creating lookup table {lookup_table_name}...")
    lookup_table = create_lookup_table(Base.metadata, lookup_table_name=lookup_table_name, text_column_name='name_or_code')
    completed_tables.append(lookup_table_name)
    # Only the first column of a repeat group carries the table object.
    lookups[name].orm = lookup_table

# Emit CREATE TABLE for everything registered on Base.metadata.
Base.metadata.create_all(engine)



In [None]:
completed_tables

In [None]:
lookups

In [None]:
lookup_tables

In [None]:
# Insert the extracted definitions into each lookup table. Rows that already
# exist are left untouched (ON CONFLICT DO NOTHING).
for name, table in lookups.items():
    lookup_table = table.orm
    if lookup_table is None:
        # No lookup table object is attached to this column.
        print(f"Skipping {name}...")
        continue
    print(lookup_table)
    with engine.connect() as connection:
        for definition in table.definitions:
            if len(definition) == 2:
                try:
                    values = {'id': int(definition[0]), 'name_or_code': definition[1]}
                except ValueError:
                    # The first field is not numeric: store the pair as
                    # (name_or_code, info) and let the id be generated.
                    values = {'name_or_code': definition[0], 'info': definition[1]}
            elif len(definition) == 3:
                try:
                    values = {
                        'id': int(definition[0]),
                        'name_or_code': definition[1],
                        'info': definition[2],
                    }
                except (TypeError, ValueError) as e:
                    # Report and skip this definition. Falling through would
                    # execute a statement left over from a previous iteration
                    # (or fail with NameError on the very first one).
                    print(e)
                    print(definition)
                    continue
            else:
                print(definition)
                raise ValueError("Was only expecting two or three columns")
            stmt = insert(lookup_table).values(**values).on_conflict_do_nothing()
            connection.execute(stmt)
        connection.commit()
    # NOTE(review): this only extends completed_tables with a 75% name prefix;
    # nothing visible in this notebook reads the list afterwards.
    name_prefix = table.new_name[0:round(len(table.new_name)*.75)] # Hopefully this is a safe threshold to identify when columns are repeats of the same type
    completed_tables.append(name_prefix)

In [None]:
col_customization_dict

In [None]:
# with engine.connect() as conn:
#     for row in class_codes:
#         print(row)
#         stmt = insert(lookup_tables['building_class']).values(name_or_code=row['code'], info=row['name']).prefix_with("OR IGNORE")
#         conn.execute(stmt)
#         conn.commit()

## Import the MapPLUTO data:
* List the layers in the file
* In this case there is only one layer, so it isn't necessary to know and specify which one to import, but including anyway for future reference.

In [None]:
# Import the MapPLUTO data from geo database file (.gdb)
gdb_path = f"{PROJECT_DATA}/files_to_use/MapPLUTO24v4.gdb"


* Import the geodatabase (.gdb) file.

In [None]:

# Read every layer of the geodatabase into a (Geo)DataFrame, keyed by layer name.
geodata = {}
# List layers in the GDB file
layers = fiona.listlayers(gdb_path)
print("Layers in the GDB file:")
for layer in layers:
    print(layer)
    gdf = gpd.read_file(gdb_path, layer=layer)
    # Check for the geometry column explicitly instead of swallowing KeyError,
    # which would also hide unrelated lookup errors raised during the conversion.
    if 'geometry' in gdf.columns:
        # Keep a WKB copy of each geometry; missing geometries become None.
        gdf['wkb'] = gdf['geometry'].apply(lambda geom: geom.wkb if geom else None)
    else:
        print(f"Layer {layer} has no geometry column; skipping WKB conversion")
    geodata[layer] = gdf


In [None]:
geodata.keys()


In [None]:
col_customization_dict

* Create the table in the SQLite database and insert the (modified) data from the gdb file.

In [None]:
gdf = geodata['MapPLUTO_24v4_clipped']
gdf.columns

In [None]:
[col for col in gdf.columns if col not in col_customization_dict.keys()]

In [None]:
from sqlalchemy import inspect
inspector = inspect(engine)
print(inspector.get_table_names())  # Ensure "basement_type_or_grade_lookup" is listed


In [None]:
lookup_tables

In [None]:
# Map each column's short_name to its new_name, for renaming the GeoDataFrame
# columns; the bare expression below displays the mapping in the notebook.
rename_mappings = {v.short_name: v.new_name for v in col_customization_dict.values()}
rename_mappings

In [None]:
gdf = gdf.rename(columns=rename_mappings)

In [None]:
print(gdf.columns)

In [None]:
# A few field names in the file differ slightly from the names given in the
# data dictionary, so they are not covered by the generated mapping and have to
# be renamed manually:

more_mappings = {
    "HealthCenterDistrict": "health_center_district",
    "SanitDistrict": "sanitation_district_number",
    "Sanitboro": "sanitation_district_boro",
    "FIRM07_FLAG": "2007_flood_insurance_rate_map_indicator",
    "PFIRM15_FLAG": "2015_preliminary_flood_insurance_rate_map",
}
gdf = gdf.rename(columns=more_mappings)

In [None]:
print(gdf.columns)

In [None]:
[col for col in gdf.columns if col not in [i.new_name for i in col_customization_dict.values()]]

In [None]:
# Column families that occur more than once, with a count for each.
# NOTE(review): presumably the number of same-kind columns per family - confirm.
multicolumns = {'zoning_district': 4, 'commercial_overlay': 2, 'special_purpose_district': 3}

In [None]:
from sqlalchemy import Table, MetaData, Column, Integer, String, ForeignKey, LargeBinary, Float, Date
from sqlalchemy.orm import declarative_base

# Base = declarative_base()
# metadata = MetaData()

# Reflect the existing database tables once
metadata.reflect(bind=engine)

# Function to map custom dtype to SQLAlchemy types
def map_custom_dtype(dtype):
    """Translate a dtype name into the matching SQLAlchemy column type.

    Args:
        dtype: One of 'Integer', 'String', 'Float', 'Date' or 'LargeBinary'.

    Returns:
        The SQLAlchemy type class that corresponds to ``dtype``.

    Raises:
        ValueError: If ``dtype`` is not one of the supported names.
    """
    supported_names = ('Integer', 'String', 'Float', 'Date', 'LargeBinary')
    # Reject unknown names first so the lookup below can never miss.
    if dtype not in supported_names:
        raise ValueError(f"Unsupported dtype: {dtype}")

    type_by_name = {
        'Integer': Integer,
        'String': String,
        'Float': Float,
        'Date': Date,
        'LargeBinary': LargeBinary,
    }
    return type_by_name[dtype]

# Function to dynamically create the table class
def create_dynamic_table_class(table_name, col_customization_dict):
    """Build a declarative table class from the column customization data.

    Args:
        table_name: Used both as ``__tablename__`` and as the class name.
        col_customization_dict: Mapping whose values expose ``dtype``,
            ``is_fk`` and ``new_name``; one column is created per value.

    Returns:
        A new class derived from ``Base`` with the fixed columns below plus
        one column for every entry of ``col_customization_dict``.

    Raises:
        ValueError: If an entry has a dtype that ``map_custom_dtype`` rejects.
    """
    import re  # local import so this cell does not depend on an earlier one

    attrs = {
        '__tablename__': table_name,
        'id': Column(Integer, primary_key=True, autoincrement=True),
        # NOTE(review): the insert cell stores WKB bytes in this column even
        # though it is declared as String - confirm the intended type.
        'geometry': Column(String),
        'wkb': Column(LargeBinary),  # Use LargeBinary for WKB
        'Shape_Leng': Column(Float),  # Add columns not listed in the data dictionary
        'Shape_Area': Column(Float),
    }

    for v in col_customization_dict.values():
        col_type = map_custom_dtype(v.dtype)
        if v.is_fk:
            # Lookup tables are created from the column name with any trailing
            # "_<number>" removed, so repeat columns share one table. Strip the
            # suffix here too, otherwise the key would point at a table that
            # was never created.
            lookup_name = re.sub(r'_[0-9]+$', '', v.new_name)
            attrs[v.new_name] = Column(col_type, ForeignKey(f'{lookup_name}_lookup.id'))
        else:
            attrs[v.new_name] = Column(col_type)

    return type(table_name, (Base,), attrs)

# Create the MapPLUTO_24v4_clipped table class
MapPLUTO24v4Clipped = create_dynamic_table_class('MapPLUTO_24v4_clipped', col_customization_dict)

# Reflect the database tables into the standalone `metadata` object.
# NOTE(review): this reflects `metadata`, not `Base.metadata`, and it runs
# before create_all, so on its own it does not pick up the new table class -
# confirm whether this call is still needed.
metadata.reflect(bind=engine)

# Create all tables in the database
Base.metadata.create_all(engine)


In [None]:
# Collect the new_name of every column whose dtype is 'Date'; the bare
# expression at the end displays the list in the notebook.
datetime_cols = [col.new_name for col in col_customization_dict.values() if col.dtype == 'Date']
# datetime_cols = [col for col in datetime_cols if col is not None]
datetime_cols

In [None]:
# batch_size = 1000
# for start in range(0, len(gdf), batch_size):
#     batch = gdf.iloc[start:start + batch_size]
#     for _, row in batch.iterrows():
#         print(row['zoning_district'][0])

In [None]:
from sqlalchemy.orm import sessionmaker
from shapely import wkb

# Create a session
session = SessionLocal()

# Rows are added and committed one batch at a time.
batch_size = 1000
try:
    for start in range(0, len(gdf), batch_size):
        batch = gdf.iloc[start:start + batch_size]
        for _, row in batch.iterrows():
            # Build a plain column -> scalar dict. Columns can share a name
            # after renaming, and indexing the row by such a name returns a
            # Series even after assigning a scalar to it. Walking the
            # (label, value) pairs and keeping the first value per label
            # always yields scalars.
            record = {}
            for col, val in row.items():
                if col not in record:
                    record[col] = val

            if record['apportionment_date']:
                record['apportionment_date'] = parseDateString(record['apportionment_date'])

            # The geometry is stored as WKB; missing geometries become None.
            geometry = record.pop('geometry')
            geometry_wkb = geometry.wkb if geometry else None
            pluto_entry = MapPLUTO24v4Clipped(geometry=geometry_wkb, **record)
            session.add(pluto_entry)
        session.commit()
except Exception:
    # Discard the partially built batch before the error is re-raised.
    session.rollback()
    raise
finally:
    # Close the session
    session.close()

In [None]:
del gdf

In [None]:
import time
from sqlalchemy.exc import OperationalError

def populate_lookup_table(engine, lookup_table, source_table_name, lookup_table_name, text_column_name, chunk_size=100, max_retries=5):
    """
    Populate a lookup table in chunks with retries for database lock issues.

    Steps, in order:
      1. Add an INTEGER column "<text_column_name>_id" to the source table
         (a failure, e.g. because it already exists, is only reported).
      2. Insert every distinct non-null value of the text column into the
         lookup table, ignoring rows that conflict with existing ones.
      3. Set the new id column from the matching lookup row.
      4. Drop the original text column.

    Step 4 is skipped, with a message, when any insert or the update failed,
    because dropping the column then would destroy values that were never
    converted into ids.

    Args:
        engine: SQLAlchemy engine used for all statements.
        lookup_table: Table object passed to ``insert`` for the lookup rows.
        source_table_name: Name of the table that holds the text column.
        lookup_table_name: Name of the lookup table used in the UPDATE.
        text_column_name: Name of the text column, in both tables.
        chunk_size: Number of rows per chunk when reading distinct values.
        max_retries: Attempts per statement while the database is locked.

    Raises:
        Exception: If dropping the text column keeps hitting a locked
            database for ``max_retries`` attempts; other errors from that
            step propagate unchanged.

    Note:
        Table and column names are interpolated directly into the SQL text,
        so they must be trusted identifiers.
    """
    def retry(func, *args, **kwargs):
        """Retry function with backoff for SQLite locks."""
        last_error = None
        for attempt in range(max_retries):
            try:
                return func(*args, **kwargs)
            except OperationalError as e:
                if "database is locked" not in str(e):
                    raise
                last_error = e
                print(f"Database is locked. Retrying ({attempt + 1}/{max_retries})...")
                time.sleep(0.2 * (attempt + 1))  # Gradual backoff
        raise Exception("Exceeded maximum retries due to database locks.") from last_error

    with engine.connect() as connection:
        # Ensure the new column exists
        try:
            retry(connection.execute, text(f"ALTER TABLE {source_table_name} ADD COLUMN {text_column_name}_id INTEGER"))
        except Exception as e:
            print(f"Column creation skipped or failed: {e}")

        # Read all distinct values before writing anything, so the read is
        # finished by the time the inserts start.
        unique_query = f"SELECT DISTINCT {text_column_name} FROM {source_table_name}"
        unique_values = []
        for chunk in pd.read_sql(unique_query, engine, chunksize=chunk_size):
            unique_values.extend(chunk[text_column_name].dropna().tolist())

        # Insert into the lookup table, remembering every value that failed
        failed_values = []
        for value in unique_values:
            stmt = insert(lookup_table).values({text_column_name: value}).on_conflict_do_nothing()
            try:
                retry(connection.execute, stmt)
            except Exception as e:
                print(f"Error inserting value '{value}': {e}")
                failed_values.append(value)

        # Update the source table with foreign key references
        update_stmt = text(f"""
        UPDATE {source_table_name}
        SET {text_column_name}_id = (
            SELECT id
            FROM {lookup_table_name}
            WHERE {text_column_name} = {source_table_name}.{text_column_name}
        )
        """)
        update_succeeded = True
        try:
            retry(connection.execute, update_stmt)
        except Exception as e:
            print(f"Error updating foreign keys: {e}")
            update_succeeded = False
        connection.commit()

        if failed_values or not update_succeeded:
            # Keep the text column: it is the only remaining copy of the
            # values that could not be converted.
            print(
                f"Keeping column {text_column_name} in {source_table_name}: "
                f"{len(failed_values)} value(s) failed to insert, "
                f"update succeeded: {update_succeeded}"
            )
            return

        # Remove the original text column now that every row has its id
        retry(connection.execute, text(f"ALTER TABLE {source_table_name} DROP COLUMN {text_column_name}"))
        connection.commit()


In [None]:
print(SQLITE_PATH)

* Make a test plot to verify that the geodata was stored correctly

In [None]:
import itertools

import geopandas as gpd
import pandas as pd
from shapely import wkb
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import networkx as nx
from sqlalchemy import create_engine, event, text

# Read the data from the database
query = "SELECT zip_code, geometry FROM MapPLUTO_24v4_clipped"
df = pd.read_sql(query, engine)

# Debug: Print the DataFrame columns
print("DataFrame columns:", df.columns)

# Convert the geometry column from WKB to Shapely geometries
df['geometry'] = df['geometry'].apply(lambda x: wkb.loads(x) if x else None)

# Convert the DataFrame to a GeoDataFrame
gdf = gpd.GeoDataFrame(df, geometry='geometry')

# Print the GeoDataFrame
print(gdf.head())

# Ensure that zip_code is preserved during the dissolve process
merged_gdf = gdf.dissolve(by='zip_code', aggfunc={'zip_code': 'first'})  # Explicit aggregation of zip_code

# Check if zip_code is now present after dissolving
print(merged_gdf.columns)  # Should include 'zip_code'

# Create a new adjacency graph based on the merged geometries
G = nx.Graph()

# Register every merged shape up front, so that shapes without any neighbour
# still receive a colour instead of a missing value.
G.add_nodes_from(merged_gdf.index)

# touches() is symmetric, so each unordered pair only needs to be tested once.
shapes = list(merged_gdf.geometry.items())
for (i, geom_i), (j, geom_j) in itertools.combinations(shapes, 2):
    if geom_i.touches(geom_j):
        G.add_edge(i, j)

# Perform graph coloring to ensure adjacent shapes don't share the same color
color_map = nx.coloring.greedy_color(G, strategy="largest_first")

# Plot the map with the colors assigned
fig, ax = plt.subplots(1, 1, figsize=(10, 10))

# Normalize the color map to cover the full range of the assigned color indices
norm = mcolors.Normalize(vmin=min(color_map.values()), vmax=max(color_map.values()))
sm = plt.cm.ScalarMappable(cmap=plt.cm.tab20, norm=norm)

# Color the merged geometries based on the graph coloring using the full palette
merged_gdf['color'] = merged_gdf.index.map(color_map)
merged_gdf.plot(ax=ax, color=[sm.to_rgba(i) for i in merged_gdf['color']], edgecolor='black', linewidth=0, legend=False)

# Add labels at the center of each merged shape
for _, row in merged_gdf.iterrows():
    centroid = row.geometry.centroid
    ax.text(centroid.x, centroid.y, str(row['zip_code']), fontsize=2, ha='center', va='center')

# Add a colorbar to visualize the full range of colors
cbar = fig.colorbar(sm, ax=ax)
cbar.set_label('Color Range (Graph Coloring)', rotation=270, labelpad=20)

plt.savefig("/home/james/Massive/PROJECTDATA/map_output_zip_shuffled2.pdf", format="pdf")

plt.show()

In [None]:
import time
from sqlalchemy.exc import OperationalError

def populate_lookup_table(engine, lookup_table, source_table_name, lookup_table_name, text_column_name, chunk_size=100, max_retries=5):
    """
    Populate a lookup table in chunks with retries for database lock issues.

    NOTE(review): this notebook defines populate_lookup_table in two cells;
    whichever definition is executed last is the one in effect.

    Steps, in order:
      1. Add an INTEGER column "<text_column_name>_id" to the source table
         (a failure, e.g. because it already exists, is only reported).
      2. Insert every distinct non-null value of the text column into the
         lookup table, ignoring rows that conflict with existing ones.
      3. Set the new id column from the matching lookup row.
      4. Drop the original text column.

    Step 4 is skipped, with a message, when any insert or the update failed,
    because dropping the column then would destroy values that were never
    converted into ids.

    Args:
        engine: SQLAlchemy engine used for all statements.
        lookup_table: Table object passed to ``insert`` for the lookup rows.
        source_table_name: Name of the table that holds the text column.
        lookup_table_name: Name of the lookup table used in the UPDATE.
        text_column_name: Name of the text column, in both tables.
        chunk_size: Number of rows per chunk when reading distinct values.
        max_retries: Attempts per statement while the database is locked.

    Raises:
        Exception: If dropping the text column keeps hitting a locked
            database for ``max_retries`` attempts; other errors from that
            step propagate unchanged.

    Note:
        Table and column names are interpolated directly into the SQL text,
        so they must be trusted identifiers.
    """
    def retry(func, *args, **kwargs):
        """Retry function with backoff for SQLite locks."""
        last_error = None
        for attempt in range(max_retries):
            try:
                return func(*args, **kwargs)
            except OperationalError as e:
                if "database is locked" not in str(e):
                    raise
                last_error = e
                print(f"Database is locked. Retrying ({attempt + 1}/{max_retries})...")
                time.sleep(0.2 * (attempt + 1))  # Gradual backoff
        raise Exception("Exceeded maximum retries due to database locks.") from last_error

    with engine.connect() as connection:
        # Ensure the new column exists
        try:
            retry(connection.execute, text(f"ALTER TABLE {source_table_name} ADD COLUMN {text_column_name}_id INTEGER"))
        except Exception as e:
            print(f"Column creation skipped or failed: {e}")

        # Read all distinct values before writing anything, so the read is
        # finished by the time the inserts start.
        unique_query = f"SELECT DISTINCT {text_column_name} FROM {source_table_name}"
        unique_values = []
        for chunk in pd.read_sql(unique_query, engine, chunksize=chunk_size):
            unique_values.extend(chunk[text_column_name].dropna().tolist())

        # Insert into the lookup table, remembering every value that failed
        failed_values = []
        for value in unique_values:
            stmt = insert(lookup_table).values({text_column_name: value}).on_conflict_do_nothing()
            try:
                retry(connection.execute, stmt)
            except Exception as e:
                print(f"Error inserting value '{value}': {e}")
                failed_values.append(value)

        # Update the source table with foreign key references
        update_stmt = text(f"""
        UPDATE {source_table_name}
        SET {text_column_name}_id = (
            SELECT id
            FROM {lookup_table_name}
            WHERE {text_column_name} = {source_table_name}.{text_column_name}
        )
        """)
        update_succeeded = True
        try:
            retry(connection.execute, update_stmt)
        except Exception as e:
            print(f"Error updating foreign keys: {e}")
            update_succeeded = False
        connection.commit()

        if failed_values or not update_succeeded:
            # Keep the text column: it is the only remaining copy of the
            # values that could not be converted.
            print(
                f"Keeping column {text_column_name} in {source_table_name}: "
                f"{len(failed_values)} value(s) failed to insert, "
                f"update succeeded: {update_succeeded}"
            )
            return

        # Remove the original text column now that every row has its id
        retry(connection.execute, text(f"ALTER TABLE {source_table_name} DROP COLUMN {text_column_name}"))
        connection.commit()
