# Design the database based on the data
* #### This notebook parses metadata associated with some of the datasets, most especially the PLUTO dataset, which contains columns that are also in many other datasets I looked at on NYCOpenData.
* #### In some cases I had to search around to find more complete definitions than were included in the data dictionary associated with the dataset.

In [None]:
import pdfplumber
import pandas as pd
import copy
import re
import time
import dill
time.sleep(5)
from bisect import bisect_left
from itertools import tee
from src.models import ColCustomization
import src.helpers
import src.pdfutils

In [None]:
# Load the environment
# NOTE(review): dill.load can execute arbitrary code while unpickling — only
# load pickles produced by this project.
with open("environment_data/select.pkl", "rb") as f:
    env = dill.load(f)

# Restore the environment
# Every key of `env` becomes a module-level name in this notebook
# (presumably including PROJECT_DATA, which is used further down — confirm).
globals().update(env)

* #### The MapPLUTO dictionary contains most of the information we need to interpret various codes and categories meaningfully.
* #### Unfortunately, it is in PDF format (as are many of the data dictionaries on NYCOpenData), which made extracting all the relevant data a real pain, and I don't expect most of these functions to be fully reusable for other PDFs I may encounter in the future. My hope is that they will still give me a head start when I need to write custom functions for future PDFs.

In [None]:
# Path of the MapPLUTO data dictionary PDF that the cells below parse.
# NOTE(review): this is a machine-specific absolute path. The commented-out
# f-string is the portable form, assuming PROJECT_DATA points at the same
# directory — confirm before switching.
# f"{PROJECT_DATA}/dictionaries/mapPLUTO_data_dictionary.pdf"
filename = '/home/james/Massive/PROJECTDATA/nyc_real_estate_data/dictionaries/mapPLUTO_data_dictionary.pdf'

* Looking at the PLUTO data dictionary, it seems that most category variables are labeled as "alphanumeric" even if they only contain numbers, such as zip codes.
* There are some exceptions: police precincts and districts are numeric and listed as such. However, as there are a limited number of repeating values, I will treat them as categorical as well.

In [None]:
# Split the PDF into sections. The loop further down iterates this as
# sections -> lines -> word dicts (each word dict has a 'text' key).
pdf_by_section = src.pdfutils.map_pdf(filename, same_line_tolerance=0.3, start_page=3) 

In [None]:
# Display the parsed sections for visual inspection.
pdf_by_section

In [None]:
# Regex substitutions passed to src.pdfutils.clean_name below to turn a field's
# full name (lower-cased before the call) into an identifier-style column name.
patterns = [
    (re.compile(r'[ ,–]+',  flags=re.IGNORECASE), '_'),  # runs of spaces, commas and en dashes
    (re.compile(r'#',  flags=re.IGNORECASE), 'num'),
    (re.compile(r'/',  flags=re.IGNORECASE), '_or_'),
    (re.compile(r'&',  flags=re.IGNORECASE), 'and'),
    # NOTE(review): '!' is a literal character in a regex, so this only matches
    # the text "!altered_<digit>" at the end of a name. A negative lookbehind,
    # "(?<!altered)_[0-9]$", may have been intended — confirm before changing,
    # since it would alter the generated names.
    (re.compile(r'!(altered)_[0-9]$',  flags=re.IGNORECASE), ''),
    (re.compile(r"\bboro(?!ugh)",  flags=re.IGNORECASE), 'borough')  # "boro" not already followed by "ugh"
]


# A column is flagged as categorical when its cleaned name contains any of these
# substrings (the list contains a few duplicates, which is harmless for `any`).
category_markers = ['code', 'category', 'class', 'district', 'precinct', 'company', 'name', 'health_area', 'type', 'borough', 'name', 'health_area', 'health_center_district', 'overlay']

# One ColCustomization per field found in the data dictionary, in document order.
column_customizations=[]

# Walk the PDF section by section. Within a section the lines are scanned with a
# small state machine: field header -> format -> description -> optional value table.
for section in pdf_by_section:
    in_table = False
    in_description = False
    header_added = False
    table = None
    col_mods = None  # Initialize col_mods here
    for value in section:
        # `value` is the list of word dicts making up one line of text
        line = ' '.join([word['text'] for word in value])
        print(line)
        # NOTE(review): a col_mods appended here does not go through the
        # post-processing at the bottom of the outer loop (definitions from a
        # pending table, is_category, Float -> Integer), and the table state
        # flags are not reset for the field that follows.
        if (line.startswith("Field Name: CENSUS") or line.startswith("Field Name: HEALTH")) and col_mods is not None: # Handle special cases where the dividing line (rectangle object) is not present in between column descriptions
            if col_mods.short_name is not None:
                print("Appending col_mods", col_mods)
                column_customizations.append(col_mods)
        if line.startswith('Field Name:') and len(value) > 2: # Exclude the explanation of "Field Name" itself on page 3
            col_mods = ColCustomization(short_name=value[-1]['text'][1:-1]) # Get the field name minus the enclosing parentheses
            # Words between "Field Name:" and the trailing "(ShortName)" form the full name
            full_name = ' '.join(word['text'] for word in value[2:-1])
            print('full_name', full_name)
            new_name = src.pdfutils.clean_name(full_name.lower(), patterns=patterns)
            print('new_name', new_name)
            col_mods.is_category = any([word in new_name for word in category_markers])
            col_mods.new_name = new_name
            # Name-based dtype guesses take precedence over the "Format:" line,
            # which is only consulted when dtype is still unset.
            if any([w in col_mods.new_name for w in ['year', 'number', 'precinct']]):
                col_mods.dtype = "Integer"
            if 'date' in col_mods.new_name:
                col_mods.dtype = "Date"
            print('col_mods', col_mods)
        # NOTE(review): col_mods is still None if a "Format:" line is reached
        # before any "Field Name:" line in the section, which would raise
        # AttributeError here — confirm this cannot occur in the PDF.
        elif line.startswith('Format:') and not col_mods.dtype:
            if "Alphanumeric" in line:
                col_mods.dtype = "String"
            if "Numeric" in line and not col_mods.dtype:
                col_mods.dtype = "Float"
        elif line.startswith('Description:'):
            in_description = True
        if in_description is True:
            # print("LINE is", line)
            # if (line.startswith('Value') or line.startswith('VALUE') or line.startswith('BOROUGH JIA NAME')) and len(value) <= 3 and header_added is True: # Check if the line is a redundant table header, for when tables are split across pages
            #     continue
            if (line.startswith('Value') or line.startswith('VALUE')) and len(value) <= 3 and header_added is False: # Maximum number of words in a column heading
                print("Detected table")
                # Remember where the header's words start so later lines can be aligned against them
                col_starts = src.pdfutils.get_word_starts_x(value)
                in_table = True
                table = [(line, value)]
                header_added = True # This is for dealing with tables that go across pages, and have the header again on the second page.
            # A line belongs to the table when its first word starts within 0.5
            # of the first or second header word start.
            # NOTE(review): col_starts[1] assumes the header has at least two words.
            elif in_table is True and (abs(col_starts[0] - src.pdfutils.get_word_starts_x(value)[0]) < .5 or abs(col_starts[1] - src.pdfutils.get_word_starts_x(value)[0]) < .5):
                table.append((line, value))
                print("Appended line", line)
            elif in_table is True:
                # First non-aligned line after the table: the table is complete, so parse and store it
                print("Table is", table)
                table = src.pdfutils.parse_table(table)
                print("Now the table is", table)
                print('table[0][0] is', table[0][0])
                # An all-digit first cell marks the column as a foreign key
                if table[0][0].isdigit():
                    print("Digits detected!")
                    col_mods.is_fk = True
                col_mods.definitions = table
                in_table = False
                header_added = False
            else:
                pass
        else:
            pass
    # End of section: finalize the last field seen in it
    if col_mods is not None:
        # A table that ran to the end of the section was never closed inside the loop
        if not col_mods.definitions and table:
            col_mods.definitions = src.pdfutils.parse_table(table)
        if col_mods.definitions:
            col_mods.is_category = True
        # Numeric categorical columns are stored as integers rather than floats
        if col_mods.dtype == "Float" and col_mods.is_category == True:
            col_mods.dtype = "Integer"
        print("Appending col_mods", col_mods)
        column_customizations.append(col_mods)
    else:
        print("col_mods was NONE!, col_mods is: ", col_mods)


In [None]:
# Display the customizations collected from the data dictionary.
column_customizations

In [None]:

def parse_zoning(pdf_path):
    """Collect the tables of a PDF, keyed by a label taken from the text above each table.

    For every table pdfplumber finds on a page, the text line two lines above
    the table's second row is used as its label (the second row is what
    find_table_start matches; presumably the header row sits one line above it
    and the heading one line above that — confirm against the PDF):

    * a label containing "APPENDIX" is stripped of its "APPENDIX ...: " prefix,
      lower-cased and underscore-joined, and the table is stored under it;
    * a label containing "PLUTO DATA DICTIONARY" marks a table continued from
      an earlier page, so its rows are appended to the most recent appendix
      table;
    * any other label is reported and the table is stored under it verbatim.

    Rows containing a cell equal to "Abbreviation" (header rows) are dropped.

    Args:
        pdf_path: Path of the PDF to parse.

    Returns:
        dict mapping label -> list of table rows (each row a list of cells).
    """
    all_tables = {}
    # Label of the most recent appendix table. Initialized so a continuation
    # table seen before any appendix heading cannot raise UnboundLocalError.
    prev_label_line = None
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            # Extract raw text as lines; extract_text() may yield nothing for
            # a page without a text layer.
            lines = (page.extract_text() or "").splitlines()
            # Extract tables
            tables = page.extract_tables()
            for table_index, table in enumerate(tables):
                # Find the position of the table in the raw text
                table_start_line = find_table_start(lines, table)
                # The label sits two lines above the matched row. Require an
                # index of at least 2 so the lookup never wraps to the end of
                # the page through a negative index.
                label_line = (
                    lines[table_start_line - 2] if table_start_line >= 2 else None
                )
                table = [row for row in table if "Abbreviation" not in row]

                if label_line is None:
                    print('table_index is', table_index)
                    if 0 <= table_start_line < len(lines):
                        print('missed:', lines[table_start_line])
                    else:
                        print('missed: no matching text line on page', page_num)
                    continue

                if "APPENDIX" in label_line:
                    label_line = re.sub("APPENDIX.*: ", "", label_line)
                    label_line = re.sub(" +", "_", label_line.lower())
                    prev_label_line = label_line
                elif "PLUTO DATA DICTIONARY" in label_line:
                    # Page header: this table continues the previous one.
                    label_line = None
                else:
                    print("what's this?: label_line is", label_line)

                if label_line is not None:
                    all_tables[label_line] = table
                elif prev_label_line in all_tables:
                    all_tables[prev_label_line] = all_tables[prev_label_line] + table
                else:
                    print(
                        'continuation table without a preceding appendix table on page',
                        page_num,
                        '- table_index is',
                        table_index,
                    )
    return all_tables


def find_table_start(lines, table):
    """Return the index of the first text line that is part of the table's second row.

    The second row of ``table`` (index 1, i.e. the row after the header) is
    joined into one space-separated string, skipping empty cells. The first
    non-blank entry of ``lines`` that occurs as a substring of that string is
    taken as the table's position in the page text.

    Args:
        lines: Text lines of the page, in reading order.
        table: Table as a list of rows, each row a list of cells.

    Returns:
        The index into ``lines`` of the match, or -1 when the table has fewer
        than two rows or no line matches.
    """
    if len(table) < 2:
        return -1
    # Build the row text once; it does not change between lines.
    table_row = " ".join(str(cell) for cell in table[1] if cell)  # Skip empty cells
    for i, line in enumerate(lines):
        # A blank line is a substring of every string and would otherwise
        # match immediately, so it is skipped.
        if line.strip() and line in table_row:
            return i
    return -1

* Add tables from appendixes

In [None]:
# Extract the appendix tables of the MapPLUTO data dictionary, keyed by label.
table_dicts = parse_zoning(filename)

In [None]:
# Display the customizations again before the appendix tables are merged in.
column_customizations

In [None]:

# Preprocess dictionary keys by truncating last letter (for singular/plural matching)
truncated_keys = {key[:-1]: value for key, value in table_dicts.items()}

# Create a sorted list of `short_name` values for efficient prefix search,
# and index the customizations by the same `short_name`.
# NOTE(review): the table_dicts keys are lower-cased, underscore-joined labels;
# confirm that `short_name` (rather than `new_name`) is the attribute they are
# meant to prefix-match.
sorted_new_names = sorted(item.short_name for item in column_customizations)
col_customization_dict = {item.short_name: item for item in column_customizations}

# Function to find matching prefixes using bisect
def find_matching_keys(prefix):
    """Return every entry of the sorted global ``sorted_new_names`` that starts with ``prefix``.

    ``bisect_left`` locates the first candidate; because the list is sorted,
    all entries sharing the prefix follow it contiguously.
    """
    first = bisect_left(sorted_new_names, prefix)
    hits = []
    for candidate in sorted_new_names[first:]:
        if not candidate.startswith(prefix):
            break
        hits.append(candidate)
    return hits

# Apply updates
# Attach each appendix table to every column whose short name starts with the
# table's truncated label, and mark those columns as categorical.
for label_prefix, table_rows in truncated_keys.items():
    for matched_name in find_matching_keys(label_prefix):
        customization = col_customization_dict[matched_name]
        customization.definitions = table_rows  # Update definitions
        customization.is_category = True


In [None]:
# Display the customizations keyed by short name.
col_customization_dict

# Parse Appendix D:
### Extract the last table, which isn't actually a table, just text arranged in a table-like way.

In [None]:

def group_by_top_with_tolerance(elements, tolerance=.1):
    """Bucket elements whose "top" values lie within ``tolerance`` of a bucket's first element.

    Elements are visited in ascending "top" order. Each one joins the first
    existing bucket whose first element is close enough, otherwise it opens a
    new bucket.

    Returns:
        list of buckets, each a list of the original element dicts.
    """
    buckets = []
    for item in sorted(elements, key=lambda entry: entry["top"]):
        home = next(
            (bucket for bucket in buckets
             if abs(bucket[0]["top"] - item["top"]) <= tolerance),
            None,
        )
        if home is None:
            buckets.append([item])
        else:
            home.append(item)
    return buckets


def restructure_data(data):
    """Split each group of words into runs of horizontally adjacent words.

    Within a group, a word continues the current run when the gap between its
    "x0" and the previous word's "x1" is at most 10; a larger gap starts a new
    run. Groups must be non-empty.

    Returns:
        list (one entry per group) of lists of runs, each run a list of words.
    """
    restructured = []
    for line in data:
        runs = [[line[0]]]
        for word in line[1:]:
            gap = word["x0"] - runs[-1][-1]["x1"]
            if gap <= 10:
                runs[-1].append(word)
            else:
                runs.append([word])
        restructured.append(runs)
    return restructured

def merge_sublists(data, x_misalignment_tolerance=.1):
    """Fold the runs of all later lines into the runs of the first line.

    A deep copy of the first line's runs acts as the set of columns. Every run
    of every later line is appended to the first column whose current
    horizontal extent (min "x0" to max "x1", widened by the tolerance on both
    sides) contains the run's leftmost "x0". A run that fits no column is
    left out of the result.

    Returns:
        A one-element list holding the merged columns.
    """
    columns = copy.deepcopy(data[0])

    for later_line in data[1:]:
        for fragment in later_line:
            left_edge = min(word["x0"] for word in fragment)

            for column in columns:
                # Extents are recomputed each time because columns grow as fragments are merged in
                column_left = min(word["x0"] for word in column)
                column_right = max(word["x1"] for word in column)

                if column_left - x_misalignment_tolerance <= left_edge <= column_right + x_misalignment_tolerance:
                    column.extend(fragment)
                    break

    return [columns]


def fix_row(row, x_misalignment_tolerance=.1, y_misalignment_tolerance=.1):
    """Rebuild the cells of one table row from its loose words.

    The words are ordered by ("top", "x0"), grouped into text lines by
    group_by_top_with_tolerance, split into horizontal runs by
    restructure_data, and finally merged column-wise by merge_sublists.

    Returns:
        The value returned by merge_sublists: a one-element list of columns.
    """
    ordered_words = sorted(row, key=lambda word: (word["top"], word["x0"]))
    text_lines = group_by_top_with_tolerance(ordered_words, tolerance=y_misalignment_tolerance)
    return merge_sublists(
        restructure_data(text_lines),
        x_misalignment_tolerance=x_misalignment_tolerance,
    )

In [None]:
import pdfplumber

def trim_lines_outside_table(lines, table_top_boundary_text=None, table_bottom_boundary_text=None):
    """Return the lines lying strictly between the top and bottom boundary lines.

    A line's text is the space-joined "text" of its words. The last line
    containing ``table_top_boundary_text`` is the top boundary and the last
    line containing ``table_bottom_boundary_text`` is the bottom boundary; a
    line containing both counts as a top boundary only. The boundary lines
    themselves are excluded.

    Args:
        lines: Sequence of lines, each a sequence of word dicts with a "text" key.
        table_top_boundary_text: Text marking the line above the table. When
            None or not found, nothing is trimmed from the top.
        table_bottom_boundary_text: Text marking the line below the table. When
            None or not found, nothing is trimmed from the bottom.

    Returns:
        list of the lines between the two boundaries, in their original order.
    """
    # Defaults that keep everything, so a missing boundary no longer leaves the
    # index unbound (previously an UnboundLocalError).
    top_trim_line = -1
    bottom_trim_line = len(lines)
    for idx, line in enumerate(lines):
        line_text = " ".join(word["text"] for word in line)
        if table_top_boundary_text is not None and table_top_boundary_text in line_text:
            top_trim_line = idx
        elif table_bottom_boundary_text is not None and table_bottom_boundary_text in line_text:
            bottom_trim_line = idx

    return [
        line
        for idx, line in enumerate(lines)
        if top_trim_line < idx < bottom_trim_line
    ]


def group_words_by_row(words, y_thresh=5):
    """Cluster words into rows by vertical proximity.

    Words are processed top-to-bottom. A word joins the first existing row
    whose largest "top" value is within ``y_thresh`` of the word's own "top";
    otherwise it starts a new row. Because the comparison is against the
    row's largest "top", a row can keep growing downwards step by step.

    Returns:
        list of rows, each a list of the original word dicts.
    """
    rows = []
    for word in sorted(words, key=lambda w: w['top']):  # top-to-bottom
        target_row = next(
            (row for row in rows
             if abs(word['top'] - max([w['top'] for w in row])) <= y_thresh),
            None,
        )
        if target_row is None:
            rows.append([word])
        else:
            target_row.append(word)
    return rows


def merge_words_in_row(row, x_thresh=10):
    """Merge the words of one row into horizontally separated text blocks.

    Words are ordered left-to-right (ties broken by "top"). A word joins the
    current block when the gap between its "x0" and the "x1" of the word added
    just before it is at most ``x_thresh``; otherwise it starts a new block.
    The words of every block, including the last one, are then ordered by
    "top" so multi-line cell text reads top-to-bottom.

    The input list is not modified.

    Args:
        row: Word dicts with "text", "x0", "x1", "top" and "bottom" keys.
        x_thresh: Largest horizontal gap that still joins two words.

    Returns:
        A list of merged text blocks, each with the merged text and bounding box.
    """
    ordered = sorted(row, key=lambda w: (w['x0'], w['top']))  # left-to-right
    blocks = []
    for word in ordered:
        if blocks and (word['x0'] - blocks[-1][-1]['x1']) <= x_thresh:
            blocks[-1].append(word)
        else:
            blocks.append([word])

    # Sort only after all gaps were evaluated so the gap test always sees the
    # most recently added word. The sort is stable, so words on the same text
    # line stay in left-to-right order.
    for block in blocks:
        block.sort(key=lambda w: w['top'])

    return [
        {
            "text": " ".join(w["text"] for w in block),
            "x0": min(w["x0"] for w in block),
            "x1": max(w["x1"] for w in block),
            "top": min(w["top"] for w in block),
            "bottom": max(w["bottom"] for w in block),
        }
        for block in blocks
    ]


from collections import defaultdict

def merge_lines_in_row(lines, y_thresh):
    """Merge vertically adjacent lines, then merge words sharing the same "x0".

    A line is folded into the previous merged line when the gap between its
    smallest "top" and the previous line's largest "bottom" is below
    ``y_thresh``. Within each merged line, words with exactly equal "x0" are
    combined into one entry whose text is the space-joined text of those
    words and whose box spans them.

    The input lists are not modified; merging works on copies.

    Args:
        lines: Lines in top-to-bottom order, each a non-empty list of word
            dicts with "text", "x0", "x1", "top" and "bottom" keys.
        y_thresh: Vertical gap below which two lines are merged.

    Returns:
        list of merged lines, each a list of word dicts ordered by "x0".
    """
    merged_lines = []

    for line in lines:
        if merged_lines:
            prev_line = merged_lines[-1]
            min_top_current = min(word["top"] for word in line)
            max_bottom_prev = max(word["bottom"] for word in prev_line)
            if min_top_current - max_bottom_prev < y_thresh:
                # Merge into the previous line
                prev_line.extend(line)
                continue
        # Start a new line. Copy it so the extend above never touches the
        # caller's list.
        merged_lines.append(list(line))

    # Now merge words by `x0` within each line
    result = []

    for line in merged_lines:
        grouped = defaultdict(list)
        for word in line:
            grouped[word["x0"]].append(word)

        merged_words = []
        for x0 in sorted(grouped):  # left-to-right
            same_x = grouped[x0]
            merged_words.append({
                "text": " ".join(w["text"] for w in same_x),
                "x0": x0,
                "x1": max(w["x1"] for w in same_x),
                "top": min(w["top"] for w in same_x),
                "bottom": max(w["bottom"] for w in same_x),
            })

        result.append(merged_words)

    return result



def detect_header_by_uppercase(rows):
    """Separate header rows from body rows using ``str.isupper``.

    A row whose every word text satisfies ``isupper()`` is treated as part of
    the header and its words are concatenated onto the header word list; all
    other rows are kept, in order, as body rows.

    Returns:
        tuple ``(header_words, body_rows)``.
    """
    header_words = []
    body = []

    for row in rows:
        if not all(word["text"].isupper() for word in row):
            body.append(row)
            continue
        header_words = header_words + row

    return header_words, body


def merge_words_into_rows(words, header_x_thresh, header_y_thresh, body_x_thresh, body_y_thresh):
    """Turn the loose words of the Appendix D page into a header row plus body rows.

    The words are grouped into rows with ``header_y_thresh``, rows outside the
    "APPENDIX D: LAND USE CATEGORIES" / "NOTES:" boundaries are dropped, and
    the remainder is split into header and body. The header words are merged
    with merge_words_in_row; each body row is rebuilt with fix_row.

    ``body_x_thresh`` and ``body_y_thresh`` are accepted but not used.

    Returns:
        list whose first element is the merged header and whose remaining
        elements are the fix_row results for the body rows.
    """
    rows = group_words_by_row(words, header_y_thresh)
    print("ROWS ARE", rows)
    inside_table = trim_lines_outside_table(
        rows,
        table_top_boundary_text="APPENDIX D: LAND USE CATEGORIES",
        table_bottom_boundary_text="NOTES:",
    )
    header_row, body_rows = detect_header_by_uppercase(inside_table)
    all_rows = [merge_words_in_row(header_row, header_x_thresh)]
    for body_row in body_rows:
        all_rows.append(fix_row(body_row))
    return all_rows

def assign_columns_to_blocks(merged_rows, column_gap_thresh=20, ncol=3):
    """Return a row-by-row shallow copy of ``merged_rows``.

    Column assignment is not implemented: no column index is attached to the
    blocks, and the cells are returned unchanged. ``column_gap_thresh`` and
    ``ncol`` are kept only so existing calls keep working.

    Parameters:
    - merged_rows: List of lists of merged word blocks.
    - column_gap_thresh: Unused.
    - ncol: Unused.

    Returns:
    - A new list containing a new list per row, holding the same cell objects.
      Empty input yields an empty list.
    """
    return [list(row) for row in merged_rows]


In [None]:
# Example usage
# Thresholds for merge_words_into_rows. As written, that function only uses
# header_x_thresh and header_y_thresh; body_x_thresh and body_y_thresh are
# accepted but unused, and column_gap_thresh / ncol are not used in this cell.
header_x_thresh = 10
header_y_thresh = 20
body_x_thresh = 10
body_y_thresh = 10
column_gap_thresh = 20  # Adjust based on observed spacing
ncol = 3

with pdfplumber.open(filename) as pdf:
    words = pdf.pages[-1].extract_words()  # Extract words from the last page of the PDF
    merged_rows = merge_words_into_rows(words, header_x_thresh, header_y_thresh, body_x_thresh, body_y_thresh)

In [None]:
def merge_objects_in_cell(list_of_objects):
    """Collapse the word dicts of one cell into a single dict.

    The result carries the space-joined text and the bounding box spanning
    all words (smallest "x0"/"top", largest "x1"/"bottom").
    """
    merged = {}
    merged["text"] = " ".join(obj["text"] for obj in list_of_objects)
    merged["x0"] = min(obj["x0"] for obj in list_of_objects)
    merged["x1"] = max(obj["x1"] for obj in list_of_objects)
    merged["top"] = min(obj["top"] for obj in list_of_objects)
    merged["bottom"] = max(obj["bottom"] for obj in list_of_objects)
    return merged

def merge_text_in_cell(list_of_objects):
    """Return the "text" of every word dict in the cell, joined by single spaces."""
    pieces = [obj["text"] for obj in list_of_objects]
    return " ".join(pieces)

# Convert every body row (the header at index 0 is skipped) into a list of
# plain cell strings. row[0] holds the row's columns, each a list of word dicts.
last_table = [
    [merge_text_in_cell(cell) for cell in row[0]]
    for row in merged_rows[1:]
]

In [None]:
# Display the extracted Appendix D rows.
last_table

In [None]:
# List the short names containing 'Land' to find the land-use column's key.
[k for k in col_customization_dict.keys() if 'Land' in k]

In [None]:
# Use the Appendix D rows as the definitions of the LandUse column
# (this replaces whatever definitions were set before).
col_customization_dict['LandUse'].definitions = last_table

In [None]:
# Display the customizations after adding the land-use definitions.
col_customization_dict

### Get explanations of zoning codes.
* I could only find this information in pdf form.
* I discovered how hard PDFs can be to parse.
* I had to do a lot of customization for just this specific pdf. I could have just manually cut and pasted the data from the pdf in the amount of time it took me to do that.
* I still think it was good to do for reproducibility reasons, but in the future I will try to avoid working with datasets that have important information only in PDF format.
* The following functions extract the tables from the PDF, detect footnotes, and then substitute the footnote text for the footnote number within the dataframe (so that it will end up as part of the relevant record in the database).

In [None]:
# Source of the PDF holding the zoning district tables.
url = "https://www.nyc.gov/assets/bronxcb8/pdf/zoning_table_all.pdf"
# NOTE: this rebinds `filename`, which until here held the path of the
# MapPLUTO data dictionary.
filename = "zoning_table_all.pdf"  # Path to save the pdf containing the info we need

# Download the PDF into the project's dictionaries directory
# (the meaning of the keyword arguments is defined by src.helpers.downloader).
src.helpers.downloader(
            url=url,
            download_path=f"{PROJECT_DATA}/dictionaries/",
            outfile_name=filename,
            bigfile=False,
        )

* Run the above functions to extract the data from the pdf.

In [None]:
# Parse the downloaded zoning PDF. The loop further down reads the result as a
# dict of table name -> dict with a DataFrame under the 'df' key.
tables_and_footnotes = src.pdfutils.parse_zoning_details(f"{PROJECT_DATA}/dictionaries/{filename}")

In [None]:
# Display the parsed zoning tables and footnotes.
tables_and_footnotes

In [None]:
# # Create a MetaData instance
# metadata = MetaData()
# Base.metadata.reflect(bind=engine)

# zoning_district_lookup = create_lookup_table(engine, "zoning_district", "code")
# # Reflect the table
# zoning_district_lookup = Table("zoning_district", metadata, autoload_with=engine)

# Store every column of every zoning table under ZoneDist1 as
# [column name, JSON string of that column together with the table index].
for tablename, table_info in tables_and_footnotes.items():
    print(tablename)
    df = table_info['df']
    df.name = df.index.name
    for series_name, series in df.items():
        # One-column frame with the index moved into a regular column
        tdf = pd.DataFrame(series).reset_index()
        jstring = tdf.to_json()
        col_customization_dict['ZoneDist1'].definitions.append([series_name, jstring])


In [None]:
# Print the customizations after adding the zoning tables.
print(col_customization_dict)

### The PDF parsed above still has some definitions that are in text outside the tables. From `zoning_table_all.pdf`:

>C1-1 through C1-5 and C2-1 through C2-5 are commercial districts which are mapped as overlays within residential districts. When a commercial overlay is mapped within an R1 through R5 district, except an R5D district, the commercial FAR is 1.0; within an R5D district or an R6 through R10 district, the commercial FAR is 2.0. The residential FAR for a commercial overlay district is determined by the residential district regulations.

* I need to manually create the object to hold this information and put it in the database

In [None]:
# Explanation shared by the commercial overlay districts C1-1..C1-5 and C2-1..C2-5.
info = "Commercial districts which are mapped as overlays within residential districts. When a commercial overlay is mapped within an R1 through R5 district, except an R5D district, the commercial FAR is 1.0; within an R5D district or an R6 through R10 district, the commercial FAR is 2.0. The residential FAR for a commercial overlay district is determined by the residential district regulations."
# Keys are generated in the order C1-1, C2-1, C1-2, C2-2, ...
more_zones = dict.fromkeys(
    (code for i in range(1, 6) for code in (f'C1-{i}', f'C2-{i}')),
    info,
)

In [None]:
# Add the overlay-district explanations to the ZoneDist1 definitions.
for zone_code, explanation in more_zones.items():
    print(explanation)
    col_customization_dict['ZoneDist1'].definitions.append([zone_code, explanation])

### Get a few more code meanings 
* From [NYC Department of Tax and Finance Data Dictionary](https://www.nyc.gov/assets/finance/downloads/tar/tarfieldcodes.pdf):
    * LandUse
    * OwnerType
    * Easement code
* Additional information about commercial zoning that I have not included can be [found here](https://www.nyc.gov/assets/planning/download/pdf/zoning/districts-tools/commercial_zoning_data_tables.pdf).
* Additional information about residential zoning that I have not included can be [found here](https://www.nyc.gov/assets/planning/download/pdf/zoning/districts-tools/residence_zoning_data_tables.pdf)

## Get the meanings of the building classification codes from the City of New York website.

In [None]:
import urllib.request #, urllib.parse, urllib.error
from bs4 import BeautifulSoup

# Page listing the building classification codes and their meanings.
webpage = "https://www.nyc.gov/assets/finance/jump/hlpbldgcode.html"

def get_table_rows(url):
    """Download ``url`` and return every ``<tr>`` element of the page.

    Args:
        url: Address of the HTML page to fetch.

    Returns:
        The result of calling the parsed BeautifulSoup document with 'tr',
        i.e. all table-row tags in document order.

    Raises:
        urllib.error.URLError: if the page cannot be fetched.
    """
    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(url) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'html.parser')
    return soup('tr')


# Fetch the table rows of the building classification page (network access).
trs = get_table_rows(webpage)

# Build one dict per building class code: {"supercategory", "code", "name"}.
# Rows containing a named anchor are section headers; their second <th> names
# the supercategory that applies to the code rows that follow.
class_codes = []
supercategory = None
for tr in trs:
    # Section header row: remember its supercategory for the following codes.
    if tr.find('a', attrs={'name': True}):
        headers = tr.find_all('th')
        if len(headers) > 1:
            supercategory = headers[1].text.capitalize()

    # Code row: the first two cells are the code and its description.
    # Rows with fewer than two cells are skipped.
    cells = tr.find_all('td')
    if len(cells) >= 2:
        code, name = cells[:2]
        class_codes.append({
            "supercategory": supercategory,
            "code": code.text.strip(),
            "name": name.text.capitalize().strip(),
        })


In [None]:
# Add every building class code and its description to the BldgClass definitions.
for code_entry in class_codes:
    code_and_name = [code_entry['code'], code_entry['name']]
    col_customization_dict['BldgClass'].definitions.append(code_and_name)

In [None]:
# Persist the column customizations and the appendix tables with dill.
with open("environment_data/table_dicts.pkl", "wb") as f:
    dill.dump({'col_customization_dict': col_customization_dict, 'table_dicts': table_dicts}, f)

In [None]:
# Display the final customizations that were written to disk.
col_customization_dict