# Create and populate the database

In [None]:
# # Import standard libraries
# import os
# import re
import json
import codecs
# import requests
import dill
from urllib.request import urlopen

# # Import third-party libraries
# import geopandas as gpd
# from geoalchemy2 import Geometry
# import pandas as pd
from sqlalchemy import create_engine, Column, Integer, String, Date, MetaData, event, Table, text, LargeBinary
from sqlalchemy.dialects.sqlite import insert
from sqlalchemy.orm import sessionmaker
# from sqlalchemy.event import listen
# from sqlalchemy.engine import Engine
# from sqlalchemy.ext.declarative import declarative_base

# import sqlite3
# from sqlite3 import dbapi2 as sqlite

# import fiona
# from fiona.crs import from_epsg

from src.helpers import *
from src.dbutils import *
from src.ORMutils import *
from src.models import *
from src.geo import *
from src.pdfutils import *

* #### Load the objects created in previous notebooks

In [None]:
# Deserialize the objects saved by a previous notebook and re-create them as
# top-level names in this notebook's namespace.
# NOTE(review): dill, like pickle, can execute code while loading; only load
# files that this project produced itself.
with open("environment_data/table_dicts.pkl", mode="rb") as pickle_file:
    env = dill.load(pickle_file)

globals().update(env)

In [None]:
# Deserialize the second saved environment and merge its entries into the
# notebook's top-level namespace (existing names with the same key are
# overwritten by dict.update).
# NOTE(review): dill, like pickle, can execute code while loading; only load
# files that this project produced itself.
with open("environment_data/select.pkl", mode="rb") as pickle_file:
    env = dill.load(pickle_file)

globals().update(env)

In [None]:
# Print the short name of every restored dataset as a quick sanity check.
for dataset in datasets.values():
    print(dataset.short_name)

* ### Create the database engine that will be used throughout the rest of the notebook.

In [None]:
# One engine and one session factory are shared by all cells below.
# check_same_thread=False is appended to the database URL as a query argument.
engine = create_engine(
    f"{SQLITE_PATH}?check_same_thread=False",
    echo=False,
)

SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)

* ### Configure the database

In [None]:
@event.listens_for(engine, "connect")
def load_spatialite(dbapi_conn, connection_record):
    """Load the SpatiaLite extension into a newly opened DBAPI connection.

    Registered as a listener for the engine's "connect" event.

    Args:
        dbapi_conn: The raw DBAPI connection; it must provide
            ``enable_load_extension`` and ``load_extension``.
        connection_record: Pool record for the connection (unused).
    """
    print("Loading SpatiaLite extension")
    dbapi_conn.enable_load_extension(True)
    try:
        dbapi_conn.load_extension("mod_spatialite")
    finally:
        # Switch extension loading back off even when the load fails, so a
        # connection is never left able to load arbitrary extensions.
        dbapi_conn.enable_load_extension(False)


# Open a connection and query the SpatiaLite version; this only succeeds if
# the SpatiaLite SQL functions are available on the connection.
with engine.connect() as conn:
    print("Connection established")
    result = conn.execute(text("SELECT spatialite_version()"))
    # Single-row result; column 0 holds the version value.
    spatialite_version = result.fetchone()
    print(f"SpatiaLite version: {spatialite_version[0]}")

# Enable WAL mode
with engine.connect() as conn:
    conn.execute(text("PRAGMA journal_mode=WAL"))

# Initialize spatial metadata if not already present
# NOTE(review): no explicit conn.commit() follows this call; confirm that the
# metadata tables persist after the connection is closed.
with engine.connect() as conn:
    conn.execute(text("SELECT InitSpatialMetaData(1)"))

### Manually create the borough codes lookup table
* These are standardized and available in many places; however, I could not find a single official source of record to get them from programmatically. Since there are only five of them, I enter them manually.

In [None]:
# Borough = {'Manhattan' : 1,
# 'Bronx' : 2,
# 'Brooklyn' : 3,
# 'Queens' : 4,
# 'Staten Island' : 5}

* Create the lookup table.

In [None]:
# metadata = MetaData()
# metadata.reflect(bind=engine)

# def create_lookup_table_simple(engine=engine, metadata=metadata, lookup_table_name='new_lookup_table', lookup_column_name='name'):
#     lookup_table = Table(
#         lookup_table_name,
#         metadata,
#         Column('id', Integer, primary_key=True, autoincrement=False),
#         Column(lookup_column_name, String, unique=True, nullable=False, default="NO DATA"),
#         extend_existing = True
#     )
#     if table_exists(engine, lookup_table_name):
#         print("Table exists")
#     else:
#         lookup_table.create(engine)
#     return lookup_table

* Populate the table with the borough codes.

In [None]:
# borough_lookup_table = create_lookup_table_simple(engine=engine, metadata=metadata, lookup_table_name='boroughs', lookup_column_name='borough')

# with engine.connect() as connection:
#     for key,value in borough_codes.items():
#         stmt = insert(borough_lookup_table).values(id = value, borough = key).on_conflict_do_nothing()
#         connection.execute(stmt)
#     connection.commit()

* ### Create lookup tables for variables identified as categorical and for which definitions were extracted from the metadata in the previous notebook.

* There are borough codes in the PLUTO dataset but, annoyingly, in contrast to most other datasets, the borough code is a two-letter abbreviation like "BK" or "BX". Also in the PLUTO dataset, "Sanitation Borough" does use the standard numeric codes that most other NYC OpenData datasets use. All this is to say that borough codes require special handling, separate from my system that extracts categories and creates lookup tables for them programmatically.

In [None]:
# # Get columns related to borough to identify ones that need to be standardized.

# borough_cols = [col_customization_dict[k] for k in col_customization_dict.keys() if "boro" in k]
# borough_cols

In [None]:
# Keep only the column customizations that carry category definitions; each
# of these needs a lookup table.
lookups = {k: v for k, v in col_customization_dict.items() if v.definitions}

# Name prefixes of columns whose lookup table has already been created.
# Columns that are same-kind repeats (such as "district_1" and "district_2")
# share one lookup table, so a column whose name starts with a recorded
# prefix is skipped.
completed_tables = []

for table in lookups.values():
    print('table is', table)
    # Drop a trailing numeric suffix so repeated columns map to one table name.
    lookup_table_name = re.sub(r'_[0-9]+$', '', table.new_name.title())
    print("prefixes are", completed_tables)
    print("table.new_name is", table.new_name)
    if any(table.new_name.startswith(prefix) for prefix in completed_tables):
        print("Lookup table already created, continuing...")
        continue
    with engine.connect() as connection:
        lookup_table = create_lookup_table(engine=engine, lookup_table_name=lookup_table_name, text_column_name='name_or_code')
        if lookup_table is None:
            print(f"Table {lookup_table_name} was not properly retrieved or created. Perhaps it exists? Skipping for now..")
            continue
        for definition in table.definitions:
            print(definition)
            if len(definition) == 2:
                # (numeric id, name) when the first field parses as an int,
                # otherwise (name, info).
                try:
                    values = {'id': int(definition[0]), 'name_or_code': definition[1]}
                except ValueError:
                    values = {'name_or_code': definition[0], 'info': definition[1]}
            elif len(definition) == 3:
                try:
                    values = {'id': int(definition[0]), 'name_or_code': definition[1], 'info': definition[2]}
                except (TypeError, ValueError) as e:
                    # No statement can be built for this row. Skip it instead
                    # of falling through and executing a stale or undefined
                    # statement.
                    print(e)
                    print(definition)
                    continue
            else:
                print(definition)
                raise ValueError("Was only expecting two or three columns")
            stmt = insert(lookup_table).values(**values).on_conflict_do_nothing()
            connection.execute(stmt)
        connection.commit()
    # Hopefully 75% of the name is a safe threshold to identify when columns
    # are repeats of the same type.
    name_prefix = table.new_name[0:round(len(table.new_name)*.75)]
    completed_tables.append(name_prefix)


In [None]:
# Display the filtered column customizations for inspection.
lookups

In [None]:
# Re-evaluate the "has definitions" filter over col_customization_dict and
# display the result.
{k:v for k,v in col_customization_dict.items() if col_customization_dict[k].definitions}

### Get explanations of zoning codes.
* I could only find this information in pdf form.
* I discovered how hard PDFs can be to parse.
* I had to do a lot of customization for just this specific pdf. I could have just manually cut and pasted the data from the pdf in the amount of time it took me to do that.
* I still think it was good to do for reproducibility reasons, but in the future I will try to avoid working with datasets that have important information only in PDF format.
* The following functions extract the tables from the PDF, detect footnotes, and then replace each footnote number with the corresponding footnote text within the dataframe (so that it will end up as part of the relevant record in the database).

In [None]:
# Location of the zoning-table PDF and the file name it is stored under.
url = "https://www.nyc.gov/assets/bronxcb8/pdf/zoning_table_all.pdf"
filename = "zoning_table_all.pdf"

# Hand the URL and destination to the project's downloader helper.
downloader(
    url=url,
    download_path=f"{PROJECT_DATA}/dictionaries/",
    outfile_name=filename,
    bigfile=False,
)

* Run the above functions to extract the data from the pdf.

In [None]:
# Parse the downloaded PDF. The result is used below as a mapping from table
# name to a dict whose 'df' entry is a DataFrame.
tables_and_footnotes = parse_zoning_details(f"{PROJECT_DATA}/dictionaries/{filename}")

In [None]:
# Reflect the current schema so that an already-existing table can be looked up.
metadata = MetaData()
metadata.reflect(bind=engine)

zoning_districts_lookup = create_lookup_table(engine, "zoning_districts", "code")
if zoning_districts_lookup is None:
    # The lookup-table loop earlier in this notebook treats a None return as
    # "possibly exists already"; in that case use the table as reflected from
    # the database instead of failing on insert(None).
    zoning_districts_lookup = Table("zoning_districts", metadata, autoload_with=engine)

for tablename, parsed in tables_and_footnotes.items():
    print(tablename)
    df = parsed['df']
    df.name = df.index.name
    with engine.connect() as conn:
        for series_name, series in df.items():
            # One record per DataFrame column: the column label is the code
            # and the column (with its index restored as a regular column)
            # is stored as a JSON string.
            jstring = series.to_frame().reset_index().to_json()
            stmt = insert(zoning_districts_lookup).values(code=series_name, info=jstring).prefix_with("OR IGNORE")
            conn.execute(stmt)
        # Commit once per parsed table rather than after every row.
        conn.commit()

### The PDF parsed above still has some definitions that are in text outside the tables. From `zoning_table_all.pdf`:

>C1-1 through C1-5 and C2-1 through C2-5 are commercial districts which are mapped as overlays within residential districts. When a commercial overlay is mapped within an R1 through R5 district, except an R5D district, the commercial FAR is 1.0; within an R5D district or an R6 through R10 district, the commercial FAR is 2.0. The residential FAR for a commercial overlay district is determined by the residential district regulations.

* I need to manually create the object to hold this information and put it in the database

In [None]:
# The overlay description quoted from the PDF applies unchanged to all ten
# districts C1-1 through C1-5 and C2-1 through C2-5.
info = "Commercial districts which are mapped as overlays within residential districts. When a commercial overlay is mapped within an R1 through R5 district, except an R5D district, the commercial FAR is 1.0; within an R5D district or an R6 through R10 district, the commercial FAR is 2.0. The residential FAR for a commercial overlay district is determined by the residential district regulations."
more_zones = {
    f'C{family}-{level}': info
    for level in range(1, 6)
    for family in (1, 2)
}

In [None]:
# Store each overlay district code with its description; rows whose insert
# would conflict are ignored (INSERT OR IGNORE).
with engine.connect() as conn:
    for code, description in more_zones.items():
        print(description)
        overlay_insert = (
            insert(zoning_districts_lookup)
            .values(code=code, info=description)
            .prefix_with("OR IGNORE")
        )
        conn.execute(overlay_insert)
        conn.commit()

### Get a few more code meanings 
* From [NYC Department of Tax and Finance Data Dictionary](https://www.nyc.gov/assets/finance/downloads/tar/tarfieldcodes.pdf):
    * LandUse
    * OwnerType
    * Easement code
* Additional information about commercial zoning that I have not included can be [found here](https://www.nyc.gov/assets/planning/download/pdf/zoning/districts-tools/commercial_zoning_data_tables.pdf).
* Additional information about residential zoning that I have not included can be [found here](https://www.nyc.gov/assets/planning/download/pdf/zoning/districts-tools/residence_zoning_data_tables.pdf)

## Import the MapPLUTO data:
* List the layers in the file
* In this case there is only one layer, so it isn't necessary to know and specify which one to import, but including anyway for future reference.

In [None]:
# Import the MapPLUTO data from geo database file (.gdb)
# Path to the geodatabase, built from the project data directory.
gdb_path = f"{PROJECT_DATA}/files_to_use/MapPLUTO24v4.gdb"


* Import the geodatabase (.gdb) file.

In [None]:
# Read every layer of the geodatabase into its own (Geo)DataFrame, keyed by
# layer name. Layers that have a geometry column also get a 'wkb' column
# holding each geometry's WKB bytes (None for falsy geometries).
geodata = {}
layers = fiona.listlayers(gdb_path)
print("Layers in the GDB file:")
for layer in layers:
    print(layer)
    gdf = gpd.read_file(gdb_path, layer=layer)
    try:
        geometries = gdf['geometry']
    except KeyError:
        # No geometry column in this layer: keep it as read.
        pass
    else:
        gdf['wkb'] = geometries.apply(lambda geom: None if not geom else geom.wkb)
    geodata[layer] = gdf


In [None]:
# Show which layer names were loaded.
geodata.keys()


In [None]:
# Display the column customization mapping for reference.
col_customization_dict

* Create the table in the Sqlite database and insert the (modified) data from the gdb file.

In [None]:
def map_dtype(dtype):
    """Map a pandas/GeoPandas column dtype to a SQLAlchemy column type.

    Args:
        dtype: The dtype of a DataFrame column.

    Returns:
        The SQLAlchemy type class ``Integer``, ``Float`` or ``String``.

    Raises:
        ValueError: If the dtype is not an integer, float or string dtype.
    """
    dtype_checks = pd.api.types
    if dtype_checks.is_integer_dtype(dtype):
        return Integer
    if dtype_checks.is_float_dtype(dtype):
        # Imported here because Float is not among the names this file
        # imports from sqlalchemy at the top.
        from sqlalchemy import Float
        return Float
    if dtype_checks.is_string_dtype(dtype):
        return String
    # pandas.api.types has no ``is_binary_dtype``; calling it raised
    # AttributeError for every dtype that reached this point, hiding the
    # intended ValueError below. Binary data cannot be recognised from the
    # dtype alone, so no LargeBinary mapping is attempted here.
    raise ValueError(f"Unsupported dtype: {dtype}")

def create_pluto_class(gdf, name):
    """Build a declarative class named ``Pluto`` whose columns mirror *gdf*.

    Args:
        gdf: DataFrame/GeoDataFrame whose column labels and dtypes define the
            table columns.
        name: Table name assigned to ``__tablename__``.

    Returns:
        A new subclass of ``Base`` with an autoincrementing integer ``id``
        primary key, a ``geometry`` LargeBinary column (for WKB), and one
        column per non-geometry column of *gdf*, typed by ``map_dtype``.
    """
    # The fixed columns are created first so that the Column objects are
    # instantiated in the same order as before.
    class_namespace = {
        '__tablename__': name,
        'id': Column(Integer, primary_key=True, autoincrement=True),
        'geometry': Column(LargeBinary),
    }
    # 'geometry' is defined above, separately from the automated pipeline.
    class_namespace.update(
        (column_name, Column(map_dtype(column_dtype)))
        for column_name, column_dtype in gdf.dtypes.items()
        if column_name != 'geometry'
    )
    return type('Pluto', (Base,), class_namespace)


# Build the ORM class for the MapPLUTO layer; the layer name doubles as the
# table name. Then create every table registered on Base's metadata.
pluto_layer = 'MapPLUTO_24v4_clipped'
Pluto = create_pluto_class(geodata[pluto_layer], pluto_layer)
Base.metadata.create_all(engine)

In [None]:
from sqlalchemy.orm import sessionmaker
from shapely import wkb

gdf = geodata['MapPLUTO_24v4_clipped']

# Every column except the geometry is copied through unchanged; build the
# list once instead of once per row.
attribute_columns = [col for col in gdf.columns if col != 'geometry']

# Insert in batches, committing after each batch.
batch_size = 1000

# The context manager closes the session even if an add or commit raises.
with SessionLocal() as session:
    for start in range(0, len(gdf), batch_size):
        batch = gdf.iloc[start:start + batch_size]
        entries = []
        for _, row in batch.iterrows():
            geometry = row['geometry']
            entries.append(
                Pluto(
                    # Geometries are stored as WKB bytes; falsy geometries
                    # are stored as NULL.
                    geometry=geometry.wkb if geometry else None,
                    **{col: row[col] for col in attribute_columns}
                )
            )
        session.add_all(entries)
        session.commit()

In [None]:
# Remove the name `gdf` from the notebook namespace now that it is no longer needed.
del gdf

In [None]:
# Show the database URL prefix that the engine was created from.
print(SQLITE_PATH)

* Make a test plot to verify that the geodata was stored correctly

In [None]:
from itertools import combinations

import geopandas as gpd
import pandas as pd
from shapely import wkb
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import networkx as nx
from sqlalchemy import create_engine, event, text

# Read the data from the database
query = "SELECT ZipCode, geometry FROM MapPLUTO_24v4_clipped"
df = pd.read_sql(query, engine)

# Debug: Print the DataFrame columns
print("DataFrame columns:", df.columns)

# Convert the geometry column from WKB to Shapely geometries
df['geometry'] = df['geometry'].apply(lambda x: wkb.loads(x) if x else None)

# Convert the DataFrame to a GeoDataFrame
gdf = gpd.GeoDataFrame(df, geometry='geometry')

# Print the GeoDataFrame
print(gdf.head())

# Merge all lots that share a zip code into one shape. The explicit 'first'
# aggregation keeps ZipCode available as a column as well as the index.
merged_gdf = gdf.dissolve(by='ZipCode', aggfunc={'ZipCode': 'first'})

# Check that ZipCode is still present after dissolving
print(merged_gdf.columns)  # Should include 'ZipCode'

# Create an adjacency graph based on the merged geometries
G = nx.Graph()

# Register every zip code as a node first. Shapes that touch no other shape
# would otherwise be missing from the graph, receive no colour, and be drawn
# with the colormap's "bad value" colour.
G.add_nodes_from(merged_gdf.index)

# touches() is symmetric and the graph is undirected, so each unordered pair
# only needs to be tested once.
for (i, geom_i), (j, geom_j) in combinations(merged_gdf.geometry.items(), 2):
    if geom_i.touches(geom_j):
        G.add_edge(i, j)

# Perform graph coloring to ensure adjacent shapes don't share the same color
color_map = nx.coloring.greedy_color(G, strategy="largest_first")

# Plot the map with the colors assigned
fig, ax = plt.subplots(1, 1, figsize=(10, 10))

# Normalize so the colour indices in use span the full range of the palette
norm = mcolors.Normalize(vmin=min(color_map.values()), vmax=max(color_map.values()))
sm = plt.cm.ScalarMappable(cmap=plt.cm.tab20, norm=norm)

# Color the merged geometries based on the graph coloring using the full palette
merged_gdf['color'] = merged_gdf.index.map(color_map)
merged_gdf.plot(ax=ax, color=[sm.to_rgba(i) for i in merged_gdf['color']], edgecolor='black', linewidth=0, legend=False)

# Add labels at the center of each merged shape
for _, row in merged_gdf.iterrows():
    centroid = row.geometry.centroid
    ax.text(centroid.x, centroid.y, str(row['ZipCode']), fontsize=2, ha='center', va='center')

# Add a colorbar to visualize the full range of colors
cbar = fig.colorbar(sm, ax=ax)
cbar.set_label('Color Range (Graph Coloring)', rotation=270, labelpad=20)

# NOTE(review): absolute, machine-specific output path; consider deriving it
# from the project data directory.
plt.savefig("/home/james/Massive/PROJECTDATA/map_output_zip_shuffled2.pdf", format="pdf")

plt.show()