# Design the database based on the data
* #### This notebook parses metadata associated with some of the datasets, most especially the PLUTO dataset, which contains columns that are also in many other datasets I looked at on NYCOpenData.
* #### In some cases I had to search around to find more complete definitions than were included in the data dictionary associated with the dataset.

In [None]:
import pdfplumber
import pandas as pd
import copy
import re
import time
import dill
time.sleep(5) # This seems to be necessary to avoid an error about dill not being loaded in the next cell. Sometimes even that is not enough and this cell needs to be run again.
from bisect import bisect_left
from itertools import tee
from src.models import ColCustomization
import src.helpers
from src.helpers import *
from src.pdfutils import *
from geoalchemy2 import Geometry


In [None]:
# Load the environment
with open("environment_data/select.pkl", "rb") as f:
    env = dill.load(f)

# Restore the environment
globals().update(env)

In [None]:
dataset_info_dict

In [None]:
column_types = dataset_info_dict['assessments'].col_types
print(column_types)
print(len(column_types))

In [None]:
import json
import ijson
import pandas as pd
from sqlalchemy import Integer, String, Date, Float, JSON
# from your_module import Geometry  # Assuming you have Geometry defined somewhere

def is_number(value):
    """Report whether ``value`` can be converted to a float.

    Args:
        value: Usually a string such as ``"12"`` or ``"3.5"``; any object is
            accepted.

    Returns:
        True if ``float(value)`` succeeds, False otherwise. Strings that
        ``float`` itself accepts (for example ``"nan"`` or ``"1e3"``) count
        as numbers.
    """
    try:
        float(value)
    except (TypeError, ValueError):
        # ValueError: unparsable string. TypeError: None or another object
        # that float() cannot handle at all.
        return False
    return True

def set_dtypes(filename, column_types):
    """Map a dataset's declared column types to SQLAlchemy types.

    Columns declared as ``"number"`` start out as ``Integer``; the JSON file is
    then streamed row by row and any such column that contains a value with a
    fractional part is promoted to ``Float``.

    Args:
        filename: Path to the dataset. Only paths ending in ``.json`` are
            processed.
        column_types: Mapping of column name to declared type string
            (``"number"``, ``"text"``, ``"calendar_date"``, ``"location"``,
            ``"point"``, ``"multipolygon"``, ``"polygon"``; anything else is
            passed through unchanged). The key order is assumed to match the
            position of each value inside a data row.

    Returns:
        ``column_types`` itself (unchanged) when ``filename`` is not a JSON
        file; otherwise a new dict of column name to SQLAlchemy type, without
        the columns whose name starts with ``':@'``.
    """
    # ijson returns non-integer JSON numbers as Decimal by default, so Decimal
    # must be recognised alongside int and float.
    from decimal import Decimal

    print(f'Starting {filename}')
    if not filename.endswith('.json'):
        print(f'{filename} is not a json file, skipping for now...')
        return column_types

    # Build the initial mapping of column names to SQLAlchemy types.
    # Here, non-numeric types are handled in a nested conditional expression.
    new_column_types = {
        column: (
            Integer if dtype == "number" or column in ['created_at', 'updated_at'] else
            String if dtype == "text" or column == "sid" else
            Date if dtype == "calendar_date" else
            JSON if dtype == "location" else
            Geometry(geometry_type='POINT', srid=4326) if dtype == "point" else
            Geometry(geometry_type='MULTIPOLYGON', srid=4326) if dtype == "multipolygon" else
            (Geometry(geometry_type='POLYGON', srid=4326) if dtype == "polygon" else dtype)
        )
        for column, dtype in column_types.items() if not column.startswith(':@')
    }
    print(f'Initial new_column_types: {new_column_types}')

    # Positions of the numeric columns (assumes order of keys corresponds to
    # data positions). Only columns still in this set need to be inspected.
    column_names = list(column_types.keys())
    print(f'column_names: {column_names}')
    pending = {
        idx for idx, col in enumerate(column_names)
        if column_types[col] == "number"
    }

    # Stream the file (binary mode is what ijson expects) one row at a time.
    with open(filename, "rb") as f:
        # Assuming the JSON file is a top-level array of list-like rows.
        for row in ijson.items(f, "item"):
            if not pending:
                break  # Every numeric column is already known to be Float.

            # sorted() snapshots the set so it can be shrunk while iterating.
            for idx in sorted(pending):
                value = row[idx]
                if value is None:
                    continue  # Skip null values

                if isinstance(value, str):
                    value = value.strip()
                    # Plain digit strings are integers; non-numeric strings
                    # say nothing about the column type.
                    if value.isdigit() or not is_number(value):
                        continue
                elif not isinstance(value, (int, float, Decimal)):
                    continue

                if float(value) % 1 != 0:
                    # A fractional value: this column must be stored as Float.
                    # Only this column is settled; the remaining numeric
                    # columns of the row are still checked.
                    new_column_types[column_names[idx]] = Float
                    pending.discard(idx)

    return new_column_types

In [None]:
# Replace every dataset's col_types with the mapping returned by set_dtypes.
# The loop variable `dataset_info` stays bound to the last dataset afterwards.
for dataset_info in dataset_info_dict.values():
    # column_types = dataset_info.col_types
    dataset_info.col_types = set_dtypes(filename=dataset_info.dataset_path, column_types = dataset_info.col_types)

In [None]:
dataset_info.col_types

* #### The MapPLUTO dictionary contains most of the information we need to interpret various codes and categories meaningfully.
* #### Unfortunately, it is in PDF format (as are many of the data dictionaries on NYCOpenData), which made extracting all the relevant data a real pain, and I don't expect most of these functions will be fully reusable for other PDFs I may encounter in the future. My hope is that it will still give me a head start when I need to make custom functions for future PDFs

In [None]:
# filename

In [None]:
# f"{PROJECT_DATA}/dictionaries/mapPLUTO_data_dictionary.pdf"
# NOTE(review): hard-coded absolute path to one machine; the commented-out
# expression above looks like the portable equivalent — confirm PROJECT_DATA
# points at the same directory before switching.
filename = '/home/james/Massive/PROJECTDATA/nyc_real_estate_data/dictionaries/mapPLUTO_data_dictionary.pdf'

* Looking at the PLUTO data dictionary, it seems that most category variables are labeled as "alphanumeric" even if they only contain numbers, such as zip codes.
* There are some exceptions: police precincts and districts are numeric and listed as such. However, as there are a limited number of repeating values, I will treat them as categorical as well.

In [None]:
pdf_by_section = map_pdf(filename, same_line_tolerance=0.3, start_page=3) 

In [None]:
pdf_by_section

In [None]:
# Substrings that mark a cleaned field name as categorical (tested with `in`
# against new_name below). 'name' and 'health_area' appear twice; the
# duplicates have no effect on the any() test.
category_markers = ['code', 'category', 'class', 'district', 'precinct', 'company', 'name', 'health_area', 'type', 'borough', 'name', 'health_area', 'health_center_district', 'overlay']

# One ColCustomization per field of the data dictionary, accumulated over all sections.
column_customizations=[]

# Walk the PDF section by section. Each `value` inside a section is one text
# line, given as a list of word dicts that carry at least a 'text' key.
for section in pdf_by_section:
    # Parser state, reset at the start of every section.
    in_table = False        # currently collecting rows of a value table
    in_description = False  # a 'Description:' line has been seen
    header_added = False    # a table header has already been taken
    table = None            # collected (line, words) pairs, later the parsed table
    col_mods = None  # Initialize col_mods here
    for value in section:
        line = ' '.join([word['text'] for word in value])
        print(line)
        # NOTE(review): a field flushed here is appended as-is; it skips the
        # end-of-section post-processing further down, and the table/description
        # flags are not reset for the field that starts on this same line.
        if (line.startswith("Field Name: CENSUS") or line.startswith("Field Name: HEALTH")) and col_mods is not None: # Handle special cases where the dividing line (rectangle object) is not present in between column descriptions
            if col_mods.short_name is not None:
                print("Appending col_mods", col_mods)
                column_customizations.append(col_mods)
        if line.startswith('Field Name:') and len(value) > 2: # Exclude the explanation of "Field Name" itself on page 3
            col_mods = ColCustomization(short_name=value[-1]['text'][1:-1]) # Get the field name minus the enclosing parentheses
            # Words between the 'Field Name:' label and the short name form the full name.
            full_name = ' '.join(word['text'] for word in value[2:-1])
            print('full_name', full_name)
            new_name = clean_name(full_name.lower())
            print('new_name', new_name)
            col_mods.is_category = any([word in new_name for word in category_markers])
            col_mods.new_name = new_name
            # Name-based dtype guesses; 'date' is tested last, so it wins over the Integer guess.
            if any([w in col_mods.new_name for w in ['year', 'number', 'precinct']]):
                col_mods.dtype = "Integer"
            if 'date' in col_mods.new_name:
                col_mods.dtype = "Date"
            print('col_mods', col_mods)
        # Only consult the 'Format:' line when the name gave no dtype.
        # NOTE(review): col_mods.dtype is read here even if no 'Field Name:' line
        # has been seen in this section yet (col_mods would still be None) — confirm
        # a 'Format:' line can never come first.
        elif line.startswith('Format:') and not col_mods.dtype:
            # "Alphanumeric" is checked first; the "Numeric" test is case-sensitive
            # and only applies while dtype is still unset.
            if "Alphanumeric" in line:
                col_mods.dtype = "String"
            if "Numeric" in line and not col_mods.dtype:
                col_mods.dtype = "Float"
        elif line.startswith('Description:'):
            in_description = True
        if in_description is True:
            # print("LINE is", line)
            # if (line.startswith('Value') or line.startswith('VALUE') or line.startswith('BOROUGH JIA NAME')) and len(value) <= 3 and header_added is True: # Check if the line is a redundant table header, for when tables are split across pages
            #     continue
            if (line.startswith('Value') or line.startswith('VALUE')) and len(value) <= 3 and header_added is False: # Maximum number of words in a column heading
                print("Detected table")
                # Remember the header's word positions to recognise table rows below.
                col_starts = get_word_starts_x(value)
                in_table = True
                table = [(line, value)]
                header_added = True # This is for dealing with tables that go across pages, and have the header again on the second page.
            # A line belongs to the table when its first word starts within 0.5 of
            # either of the first two header positions.
            # NOTE(review): col_starts[1] assumes the header has at least two words — confirm.
            elif in_table is True and (abs(col_starts[0] - get_word_starts_x(value)[0]) < .5 or abs(col_starts[1] - get_word_starts_x(value)[0]) < .5):
                table.append((line, value))
                print("Appended line", line)
            elif in_table is True:
                # First line that no longer aligns with the table: close it and parse it.
                print("Table is", table)
                table = parse_table(table)
                print("Now the table is", table)
                print('table[0][0] is', table[0][0])
                # An all-digit first cell flags the column as a foreign key.
                if table[0][0].isdigit():
                    print("Digits detected!")
                    col_mods.is_fk = True
                col_mods.definitions = table
                in_table = False
                header_added = False
            else:
                pass
        else:
            pass
    # End of section: finalise the field that was being built.
    if col_mods is not None:
        # The table ran to the end of the section and was never closed above.
        if not col_mods.definitions and table:
            col_mods.definitions = parse_table(table)
        # Any field with a value table is categorical.
        if col_mods.definitions:
            col_mods.is_category = True
        # Categorical fields declared "Numeric" hold codes, so store them as Integer.
        if col_mods.dtype == "Float" and col_mods.is_category == True:
            col_mods.dtype = "Integer"
        print("Appending col_mods", col_mods)
        column_customizations.append(col_mods)
    else:
        print("col_mods was NONE!, col_mods is: ", col_mods)


In [None]:

# def parse_zoning(pdf_path):
#     all_tables = {}
#     with pdfplumber.open(pdf_path) as pdf:
#         for page_num, page in enumerate(pdf.pages, start=1):
#             # Extract raw text as lines
#             lines = page.extract_text().splitlines()
#             # Extract tables
#             tables = page.extract_tables()
#             for table_index, table in enumerate(tables):
#                 # Find the position of the table in the raw text
#                 table_start_line = find_table_start(lines, table)
#                 # Extract the line before the table, if available
#                 label_line = (
#                     lines[table_start_line - 2] if table_start_line > 0 else None
#                 )
#                 table = [row for row in table if "Abbreviation" not in row]
#                 if label_line is not None:
#                     if "APPENDIX" in label_line:
#                         label_line = re.sub("APPENDIX.*: ", "", label_line)
#                         label_line = re.sub(" +", "_", label_line.lower())
#                         label_line = re.sub("s$", "", label_line.lower()) # remove trailing plural s so as to match column names
#                         prev_label_line = label_line
#                     elif "PLUTO DATA DICTIONARY" in label_line:
#                         label_line = None
#                     elif "APPENDIX" not in label_line:
#                         print("what's this?: ", print('label_line is', label_line))
#                         table = [row for row in table if "Abbreviation" not in row]
#                     if label_line != None:
#                         all_tables[label_line] = table
#                     else:
#                         all_tables[prev_label_line] = all_tables[prev_label_line] + table
#                 else:
#                     print('table_index is', table_index)
#                     print('missed:', lines[table_start_line])
#     return all_tables


# def find_table_start(lines, table):
#     """
#     Identify the start of the table in the text by matching table rows
#     """
#     for i, line in enumerate(lines):
#         # Convert the table's first row into a string and search for it in the text
#         table_row = " ".join(str(cell) for cell in table[1] if cell)  # Skip empty cells
#         if line in table_row:
#             return i
#     return -1

* Add tables from appendixes

In [None]:
table_dicts = parse_zoning(filename)

In [None]:

# Preprocess dictionary keys by truncating last letter (for singular/plural matching)
# NOTE(review): the last character is dropped unconditionally, so two keys that
# differ only in that character collapse into one entry and one table is lost.
truncated_keys = {key[:-1]: value for key, value in table_dicts.items()}

# Create a sorted list of `new_name` for efficient prefix search
sorted_new_names = sorted(item.new_name for item in column_customizations)
# Lookup from new_name to its ColCustomization; if a new_name occurs more than
# once, only the last item is kept.
col_customization_dict = {item.new_name: item for item in column_customizations}

# Function to find matching prefixes using bisect
def find_matching_keys(prefix):
    """Return the entries of ``sorted_new_names`` that begin with ``prefix``.

    ``sorted_new_names`` is sorted, so all matches sit in one contiguous run
    that starts at the insertion point of ``prefix``.

    Args:
        prefix: String the wanted names must start with.

    Returns:
        A new list with the matching names, in sorted order (empty when
        nothing matches).
    """
    first = bisect_left(sorted_new_names, prefix)
    last = first
    total = len(sorted_new_names)
    # Advance past the contiguous run of names sharing the prefix.
    while last < total and sorted_new_names[last].startswith(prefix):
        last += 1
    return sorted_new_names[first:last]

# Attach each appendix table to every column whose new_name starts with the
# table's (truncated) key, and mark those columns as categorical.
# Every matching column receives the same `value` object rather than a copy,
# so a later in-place change made through one column shows up in the others.
for key, value in truncated_keys.items():
    print(key)
    matches = find_matching_keys(key)
    print(matches)
    for match in matches:
        customization = col_customization_dict[match]
        customization.definitions = value  # Update definitions
        customization.is_category = True


In [None]:
# Manually set BBL to not be a category
col_customization_dict['borough_tax_block_and_lot'].is_category = False

# Parse Appendix D:
### Extract the last table, which isn't actually a table, just text arranged in a table-like way.

In [None]:
# Example usage
# Thresholds handed to merge_words_into_rows for the header and the body rows.
header_x_thresh = 10
header_y_thresh = 20
body_x_thresh = 10
body_y_thresh = 10
# NOTE(review): column_gap_thresh and ncol are defined here but not passed to
# anything in this cell — confirm whether they are still needed.
column_gap_thresh = 20  # Adjust based on observed spacing
ncol = 3

with pdfplumber.open(filename) as pdf:
    words = pdf.pages[-1].extract_words()  # Extract words from the last page of the PDF
    merged_rows = merge_words_into_rows(words, header_x_thresh, header_y_thresh, body_x_thresh, body_y_thresh)

In [None]:
# Skip the first merged row, then turn every cell found in the first element
# of each remaining row into a single merged text.
last_table = [
    [merge_text_in_cell(cell) for cell in row[0]]
    for row in merged_rows[1:]
]

In [None]:
col_customization_dict['land_use_category'].definitions = last_table

### Get explanations of zoning codes.
* I could only find this information in pdf form.
* I discovered how hard PDFs can be to parse.
* I had to do a lot of customization for just this specific pdf. I could have just manually cut and pasted the data from the pdf in the amount of time it took me to do that.
* I still think it was good to do for reproducibility reasons, but in the future I will try to avoid working with datasets that have important information only in PDF format.
* The following functions extract the tables from the pdf, detect footnotes, and then substitute the footnote text for the footnote number within the dataframe (so that it will end up as part of the relevant record in the database).

In [None]:
url = "https://www.nyc.gov/assets/bronxcb8/pdf/zoning_table_all.pdf"
# From here on `filename` is rebound to the bare file name of the zoning PDF;
# the directory it is saved in is given separately as download_path below.
filename = "zoning_table_all.pdf"  # File name under which the pdf containing the info we need is saved

# Fetch the zoning PDF into the project's dictionaries directory.
downloader(
            url=url,
            download_path=f"{PROJECT_DATA}/dictionaries/",
            outfile_name=filename,
            bigfile=False,
        )

* Run the above functions to extract the data from the pdf.

In [None]:
tables_and_footnotes = parse_zoning_details(f"{PROJECT_DATA}/dictionaries/{filename}")

In [None]:
tables_and_footnotes

In [None]:
# Serialise every column of each parsed zoning table to JSON and store it in
# the zoning_district_1 definitions as [column name, JSON string].
for tablename, table_entry in tables_and_footnotes.items():
    print(tablename)
    df = table_entry['df']
    df.name = df.index.name
    # with engine.connect() as conn:
    for series_name, series in df.items():
        # One-column frame with the index moved into a regular column.
        tdf = pd.DataFrame(series).reset_index()
        jstring = tdf.to_json()
        col_customization_dict['zoning_district_1'].definitions.append([series_name, jstring])


### The PDF parsed above still has some definitions that are in text outside the tables. From `zoning_table_all.pdf`:

>C1-1 through C1-5 and C2-1 through C2-5 are commercial districts which are mapped as overlays within residential districts. When a commercial overlay is mapped within an R1 through R5 district, except an R5D district, the commercial FAR is 1.0; within an R5D district or an R6 through R10 district, the commercial FAR is 2.0. The residential FAR for a commercial overlay district is determined by the residential district regulations.

* I need to manually create the object to hold this information and put it in the database

In [None]:
# Shared description for the commercial overlay districts C1-1..C1-5 and C2-1..C2-5.
info = "Commercial districts which are mapped as overlays within residential districts. When a commercial overlay is mapped within an R1 through R5 district, except an R5D district, the commercial FAR is 1.0; within an R5D district or an R6 through R10 district, the commercial FAR is 2.0. The residential FAR for a commercial overlay district is determined by the residential district regulations."
# Keys are generated as C1-1, C2-1, C1-2, C2-2, ... and all map to the same text.
more_zones = dict.fromkeys(
    (f'C{group}-{level}' for level in range(1, 6) for group in (1, 2)),
    info,
)

In [None]:
# Add one [zone code, description] pair per overlay district.
col_customization_dict['commercial_overlay_1'].definitions.extend(
    [key, description] for key, description in more_zones.items()
)

### Get a few more code meanings 
* From [NYC Department of Tax and Finance Data Dictionary](https://www.nyc.gov/assets/finance/downloads/tar/tarfieldcodes.pdf):
    * LandUse
    * OwnerType
    * Easement code
* Additional information about commercial zoning that I have not included can be [found here](https://www.nyc.gov/assets/planning/download/pdf/zoning/districts-tools/commercial_zoning_data_tables.pdf).
* Additional information about residential zoning that I have not included can be [found here](https://www.nyc.gov/assets/planning/download/pdf/zoning/districts-tools/residence_zoning_data_tables.pdf)

## Get the meanings of the building classification codes from the City of New York website.

In [None]:
import urllib.request #, urllib.parse, urllib.error
from bs4 import BeautifulSoup

webpage = "https://www.nyc.gov/assets/finance/jump/hlpbldgcode.html"

def get_table_rows(url):
    """Download ``url`` and return every ``<tr>`` element of the page.

    Args:
        url: Address of the HTML page to fetch.

    Returns:
        The BeautifulSoup result set of all ``tr`` tags in the document.

    Raises:
        urllib.error.URLError: If the page cannot be retrieved.
    """
    # The context manager closes the HTTP response once the body is read,
    # instead of leaving the connection open.
    with urllib.request.urlopen(url) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'html.parser')
    return soup.find_all('tr')


trs = get_table_rows(webpage)

# One dict per building-class code row:
# {"supercategory": <heading of its group>, "code": ..., "name": ...}.
class_codes = []
supercategory = None  # Heading of the group the following code rows belong to
for tr in trs:
    # A row containing a named anchor (<a name=...>) starts a new group; its
    # second <th> holds the group title.
    if tr.find('a', attrs={'name': True}):
        headers = tr.find_all('th')
        if len(headers) > 1:
            supercategory = headers[1].text.capitalize()

    # A row with at least two <td> cells is a code row: code, then name.
    cells = tr.find_all('td')
    if len(cells) >= 2:
        code, name = cells[:2]
        class_codes.append({
            "supercategory": supercategory,
            "code": code.text.strip(),
            "name": name.text.capitalize().strip(),
        })


In [None]:
# Add one [code, name] pair per scraped building class.
col_customization_dict['building_class'].definitions.extend(
    [entry['code'], entry['name']] for entry in class_codes
)

### Manually add columns that I will later add to the dataset, such as building number and street name derived from the "address" column

In [None]:
col_customization_dict['building_num'] = ColCustomization(short_name='building_num', new_name='building_num', is_category=False, dtype='Integer')
col_customization_dict['street'] = ColCustomization(short_name='street', new_name='street', is_category=True, dtype='String')

In [None]:
dataset_info_dict['mapPLUTO'].col_customizations = col_customization_dict

In [None]:
# Give every dataset that has column types a default set of customizations,
# then mark String columns with a cardinality ratio above 20 as categorical.
for name, info in dataset_info_dict.items():
    # print(info.col_customizations)
    if not info.col_types.items():
        continue  # Nothing to do for datasets without column types.

    if not info.col_customizations:
        # No customizations yet: create a plain one per column.
        info.col_customizations = {
            short_name: ColCustomization(short_name=short_name, dtype=dtype)
            for short_name, dtype in info.col_types.items()
        }

    for key, val in info.cardinality_ratios.items():
        if not val > 20:
            continue
        if info.col_customizations is None:
            continue
        if info.col_types[key] == String:
            info.col_customizations[key].is_category = True


In [None]:
# Persist the objects and path constants that the next notebook restores.
with open("environment_data/table_dicts.pkl", "wb") as f:
    dill.dump(
        {
            "col_customization_dict": col_customization_dict,
            "dataset_info_dict": dataset_info_dict,
            "PROJECT_PATH": PROJECT_PATH,
            "PROJECT_DATA": PROJECT_DATA,  # was listed twice in this literal
            "SQLITE_PATH": SQLITE_PATH,
            "DATADIR": DATADIR,
            "PROJECT_NAME": PROJECT_NAME,
        },
        f,
    )