# Design the database based on the data
* #### This notebook parses the metadata that accompanies some of the datasets, especially the PLUTO dataset, whose columns also appear in many of the other datasets I looked at on NYC OpenData.

In [None]:
import pdfplumber
import re
import dill
from bisect import bisect_left
from itertools import tee
from src.models import ColCustomization
import src.helpers
import src.pdfutils

In [None]:
# Path to the MapPLUTO data dictionary PDF; every parsing cell below reads this file.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
filename = '/home/james/Massive/PROJECTDATA/nyc_real_estate_data/dictionaries/mapPLUTO_data_dictionary.pdf'

* Looking at the PLUTO data dictionary, it seems that most categorical variables are labeled as "alphanumeric" even if they only contain digits, such as zip codes.
* There are some exceptions: police precincts and districts are numeric and are listed as such. However, as they take a limited number of repeating values, I will treat them as categorical as well.

In [None]:
# Split the data dictionary into sections; the next cell iterates each section
# as a sequence of lines, each line being a list of word dicts with a 'text' key.
# NOTE(review): presumably `start_page=3` skips the front matter and
# `same_line_tolerance` is the vertical tolerance for grouping words into one
# line — confirm against src.pdfutils.map_pdf.
pdf_by_section = src.pdfutils.map_pdf(filename, same_line_tolerance=0.3, start_page=3) 

In [None]:
# Regex substitutions passed to src.pdfutils.clean_name when a field's full
# name is turned into a column name.
patterns = [
    (re.compile(r'[ ,–]+'), '_'),
    (re.compile(r'#'), 'num'),
    (re.compile(r'/'), 'or'),
    (re.compile(r'&'), 'and')
]

# Substrings of a cleaned column name that suggest a foreign-key column.
fk_markers = ['code', 'category', 'class', 'district', 'precinct', 'company', 'name', 'health_area', 'type', 'borough', 'health_center_district']

# One ColCustomization per section that contains a usable "Field Name:" line.
column_customizations = []

for section in pdf_by_section:
    in_table = False
    in_description = False
    table = None
    col_starts = None
    col_mods = None  # Stays None until a "Field Name:" line is accepted
    for value in section:
        line = ' '.join(word['text'] for word in value)
        if line.startswith('Field Name:') and len(value) > 2:  # Exclude the explanation of "Field Name" itself on page 3
            col_mods = ColCustomization(short_name=value[-1]['text'][1:-1])  # Get the field name minus the enclosing parentheses
            full_name = ' '.join(word['text'] for word in value[2:-1])
            new_name = src.pdfutils.clean_name(full_name.lower(), patterns=patterns)
            # NOTE(review): is_fk is computed but never stored on col_mods;
            # col_mods.is_fk is only set below when definitions were found.
            is_fk = any(marker in new_name for marker in fk_markers)
            col_mods.new_name = new_name
            # Name-based type hints take precedence over the "Format:" line.
            if any(w in new_name for w in ('year', 'number', 'precinct')):
                col_mods.dtype = "Integer"
            if 'date' in new_name:
                col_mods.dtype = "Date"
        elif line.startswith('Format:') and col_mods is not None and not col_mods.dtype:
            # Only reached when no dtype was inferred from the name. The None
            # guard keeps a "Format:" line that precedes any accepted field
            # name from raising AttributeError.
            if "Alphanumeric" in line:
                col_mods.dtype = "String"
            if "Numeric" in line and not col_mods.dtype:
                col_mods.dtype = "Float"
        elif line.startswith('Description:'):
            in_description = True
        if in_description:
            if line.startswith(('Value', 'VALUE')) and len(value) <= 3:  # Maximum number of words in a column heading
                col_starts = src.pdfutils.get_word_starts_x(value)
                in_table = True
                table = [(line, value)]
            elif in_table:
                first_x = src.pdfutils.get_word_starts_x(value)[0]
                # A line belongs to the table when its first word starts at
                # (within 0.5 of) either of the first two heading columns.
                # Slicing tolerates a heading with a single column start.
                if any(abs(start - first_x) < .5 for start in col_starts[:2]):
                    table.append((line, value))
                else:
                    # First line that does not align with the table ends it.
                    if col_mods is not None:
                        col_mods.definitions = src.pdfutils.parse_table(table)
                    in_table = False
    if col_mods is not None:
        # The section may end while still inside the table.
        if not col_mods.definitions and table:
            col_mods.definitions = src.pdfutils.parse_table(table)
        if col_mods.definitions:
            col_mods.is_fk = True
        column_customizations.append(col_mods)


In [None]:

def parse_zoning(pdf_path):
    """Collect the tables of a PDF, keyed by the appendix label printed above each.

    For every table pdfplumber extracts, the text line two positions above
    the line matched by ``find_table_start`` is taken as the table's label:

    * a label containing ``"APPENDIX"`` is normalised (everything through the
      ``": "`` is removed, then lower-cased with spaces turned into
      underscores) and used as the key;
    * a label containing ``"PLUTO DATA DICTIONARY"`` causes the table's rows
      to be appended to the most recently labelled table
      (NOTE(review): presumably a page header above a table continued from
      the previous page — confirm against the PDF);
    * any other label is reported on stdout and used verbatim as the key.

    Rows that have a cell exactly equal to ``"Abbreviation"`` are dropped.

    Args:
        pdf_path: Path of the PDF to open with pdfplumber.

    Returns:
        dict mapping label -> list of table rows (each row a list of cells).
    """
    all_tables = {}
    prev_label_line = None
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Extract raw text as lines (tolerate a page without text)
            lines = (page.extract_text() or "").splitlines()
            # Extract tables
            tables = page.extract_tables()
            for table_index, table in enumerate(tables):
                # Find the position of the table in the raw text
                table_start_line = find_table_start(lines, table)
                # The label sits two lines above the matched row. Require at
                # least two preceding lines so the index cannot go negative
                # and wrap around to the end of the page.
                label_line = (
                    lines[table_start_line - 2] if table_start_line >= 2 else None
                )
                table = [row for row in table if "Abbreviation" not in row]

                if label_line is None:
                    print('table_index is', table_index)
                    if 0 <= table_start_line < len(lines):
                        print('missed:', lines[table_start_line])
                    else:
                        print('missed: table start not found in page text')
                    continue

                if "APPENDIX" in label_line:
                    label_line = re.sub("APPENDIX.*: ", "", label_line)
                    label_line = re.sub(" +", "_", label_line.lower())
                    prev_label_line = label_line
                elif "PLUTO DATA DICTIONARY" in label_line:
                    label_line = None
                else:
                    print("what's this?: label_line is", label_line)

                if label_line is not None:
                    all_tables[label_line] = table
                elif prev_label_line is not None:
                    all_tables[prev_label_line] = all_tables[prev_label_line] + table
                else:
                    # Continuation table seen before any labelled table.
                    print('table_index is', table_index)
                    print('missed: continuation table with no preceding appendix label')
    return all_tables


def find_table_start(lines, table):
    """
    Locate a table within a page's text lines.

    The table's second row (index 1) is joined into one space-separated
    string, skipping empty cells, and the index of the first non-blank text
    line that is a substring of that string is returned.

    Args:
        lines: Text lines of the page.
        table: Table as a list of rows, each row a list of cells.

    Returns:
        Index into ``lines`` of the first match, or -1 when nothing matches
        or the table has fewer than two rows.
    """
    if len(table) < 2:
        return -1
    # Built once: the row does not change while scanning the lines.
    table_row = " ".join(str(cell) for cell in table[1] if cell)  # Skip empty cells
    for i, line in enumerate(lines):
        # A blank line is a substring of every string, so it must never
        # count as a match.
        if line.strip() and line in table_row:
            return i
    return -1

* Add tables from appendixes

In [None]:
# Appendix tables of the data dictionary, keyed by normalised appendix label.
table_dicts = parse_zoning(filename)

In [None]:

# Preprocess dictionary keys by truncating last letter (for singular/plural matching)
# NOTE(review): labels that differ only in their final character collapse to
# the same key here, and the later one wins.
truncated_keys = {key[:-1]: value for key, value in table_dicts.items()}

# Create a sorted list of `new_name` for efficient prefix search
sorted_new_names = sorted(item.new_name for item in column_customizations)
# Lookup from column name to its customization object; a repeated new_name
# keeps only the last item.
item_dict = {item.new_name: item for item in column_customizations}

# Prefix lookup over the sorted column names
def find_matching_keys(prefix):
    """Return every entry of ``sorted_new_names`` that starts with ``prefix``.

    ``bisect_left`` finds where the prefix would be inserted; because the
    list is sorted, all names sharing the prefix sit in one contiguous run
    starting there.
    """
    first = bisect_left(sorted_new_names, prefix)
    found = []
    for name in sorted_new_names[first:]:
        if not name.startswith(prefix):
            break
        found.append(name)
    return found

# Apply updates
# Every column whose name starts with a truncated appendix label receives that
# appendix table as its definitions and is flagged as a foreign key.
# Note: all columns matched by one label share the same list object.
for key, value in truncated_keys.items():
    matches = find_matching_keys(key)
    for match in matches:
        item_dict[match].definitions = value  # Update definitions
        item_dict[match].is_fk = True


* Parse Appendix D

### Extract the last table, which isn't actually a table, just text arranged in a table-like way.

In [None]:
def restructure_data(data, x_threshold=10):
    """Split each group of words into runs of horizontally adjacent words.

    Args:
        data: List of non-empty word groups; each word is a dict with
            ``"x0"`` and ``"x1"`` keys, taken in the order given.
        x_threshold: Largest gap between one word's ``x1`` and the next
            word's ``x0`` for the two to stay in the same run.

    Returns:
        One list per input group, each holding that group's runs (lists of
        the original word dicts).
    """
    restructured = []
    for words in data:
        current_run = [words[0]]
        runs = [current_run]
        for word in words[1:]:
            gap = word["x0"] - current_run[-1]["x1"]
            if gap <= x_threshold:
                current_run.append(word)
            else:
                # Gap too wide: this word opens a new run.
                current_run = [word]
                runs.append(current_run)
        restructured.append(runs)
    return restructured

def merge_sublists(data):
    """Fold every later sublist into the column groups of the first sublist.

    ``data[0]`` supplies the target groups. Each group of every later
    sublist is appended to the first target whose current x-range
    (min ``x0`` .. max ``x1`` of the words it holds so far) contains the
    group's leftmost ``x0``. Target ranges are re-evaluated on each lookup,
    so they widen as groups are merged in.

    A group whose leftmost ``x0`` falls inside no target range is dropped.

    Args:
        data: List of sublists; each sublist is a list of non-empty word
            groups (lists of dicts with ``"x0"`` and ``"x1"``).

    Returns:
        A one-element list holding the first sublist (mutated in place), or
        an empty list when ``data`` is empty.
    """
    if not data:
        return []

    # Extract the first sublist
    first_sublist = data[0]

    # Iterate over the remaining sublists
    for sublist in data[1:]:
        for subsublist in sublist:
            # Only the left edge decides which target receives the group.
            start = min(item["x0"] for item in subsublist)

            # Find the appropriate sub-sub-list in the first sublist to append to
            for target_subsublist in first_sublist:
                target_start = min(item["x0"] for item in target_subsublist)
                target_stop = max(item["x1"] for item in target_subsublist)

                if target_start <= start <= target_stop:
                    target_subsublist.extend(subsublist)
                    break

    return [first_sublist]



def fix_row(row):
    """Rebuild one table row from its loose words.

    The words are ordered by ``(top, x0)``, partitioned into visual lines
    that share exactly the same ``top``, split into horizontally adjacent
    runs by ``restructure_data`` and finally folded into the runs of the
    first line by ``merge_sublists``. The merged result is also printed.

    Args:
        row: Iterable of word dicts with ``"top"`` and ``"x0"`` keys (plus
            whatever the two helpers read).

    Returns:
        The value returned by ``merge_sublists``.
    """
    from itertools import groupby

    ordered_words = sorted(row, key=lambda word: (word["top"], word["x0"]))
    # Partition based on top; dict insertion order keeps the lines top-down.
    words_by_top = {
        top: list(members)
        for top, members in groupby(ordered_words, lambda word: word["top"])
    }
    visual_lines = list(words_by_top.values())
    merged_data = merge_sublists(restructure_data(visual_lines))
    print("merged_data is", merged_data)
    return merged_data


In [None]:
import pdfplumber

def trim_lines_outside_table(lines, table_top_boundary_text=None, table_bottom_boundary_text=None):
    """Return the lines lying strictly between the two boundary lines.

    A line's text is its words' ``"text"`` values joined by single spaces.
    The top boundary is the last line whose text contains
    ``table_top_boundary_text``; the bottom boundary is the last line whose
    text contains ``table_bottom_boundary_text``. A line that matches the
    top text is never considered for the bottom boundary.

    Args:
        lines: Sequence of lines, each a list of word dicts with a
            ``"text"`` key.
        table_top_boundary_text: Text marking the line above the table.
            When ``None`` or not found, nothing is trimmed from the top.
        table_bottom_boundary_text: Text marking the line below the table.
            When ``None`` or not found, nothing is trimmed from the bottom.

    Returns:
        A new list with the lines after the top boundary and before the
        bottom boundary (both boundary lines excluded).
    """
    # Defaults mean "no boundary": keep everything on that side instead of
    # failing on an unbound variable.
    top_trim_line = -1
    bottom_trim_line = len(lines)

    for idx, line in enumerate(lines):
        line_text = " ".join(word["text"] for word in line)
        if table_top_boundary_text is not None and table_top_boundary_text in line_text:
            top_trim_line = idx
        elif (
            table_bottom_boundary_text is not None
            and table_bottom_boundary_text in line_text
        ):
            bottom_trim_line = idx

    return list(lines[top_trim_line + 1:bottom_trim_line])


def group_words_by_row(words, y_thresh=5):
    """Cluster words into rows by vertical position.

    Words are visited from top to bottom. Each word joins the first existing
    row whose largest ``top`` value is within ``y_thresh`` of the word's own
    ``top``; otherwise it starts a new row. Because the comparison uses the
    row's largest ``top`` (not its first word), a row can keep growing
    downward as long as consecutive words stay within the threshold.

    Args:
        words: Iterable of word dicts with a ``'top'`` key.
        y_thresh: Maximum vertical distance for joining a row.

    Returns:
        List of rows, each a list of the original word dicts.
    """
    rows = []
    for word in sorted(words, key=lambda w: w['top']):  # top-to-bottom
        for row in rows:
            lowest_top = max(member['top'] for member in row)
            if abs(word['top'] - lowest_top) <= y_thresh:
                row.append(word)
                break
        else:
            # No existing row was close enough.
            rows.append([word])
    return rows

# def group_words_by_row(words, y_thresh=5):
#     """Groups words into rows based on vertical proximity, allowing small deviations in top values."""
#     words = sorted(words, key=lambda w: w['top'])  # Sort words top-to-bottom
#     rows = []

#     for word in words:
#         added = False
#         for row in rows:
#             # Compare with first word in the row for stability
#             if abs(word['top'] - row[0]['top']) <= y_thresh:
#                 row.append(word)
#                 added = True
#                 break
#         if not added:
#             rows.append([word])

#     return rows

def merge_words_in_row(row, x_thresh=10):
    """
    Merges the words of a single row into horizontally contiguous text blocks.

    Words are visited left-to-right (ties broken by ``top``). A word joins
    the current block when the gap between its ``x0`` and the ``x1`` of the
    previously added word is at most ``x_thresh``; otherwise it starts a new
    block. Within every block, including the last one, words are then
    ordered by ``top`` so that multi-line cell text reads top-down.

    The input list is not modified.

    Parameters:
    - row: List of word dicts with 'text', 'x0', 'x1', 'top' and 'bottom'.
    - x_thresh: Maximum horizontal gap for words to share a block.

    Returns:
    - A list of merged text blocks, each with the merged text and bounding box.
    """
    ordered = sorted(row, key=lambda w: (w['x0'], w['top']))  # left-to-right
    blocks = []
    current_block = []
    for word in ordered:
        if current_block and (word['x0'] - current_block[-1]['x1']) <= x_thresh:
            current_block.append(word)
        else:
            if current_block:
                blocks.append(current_block)
            current_block = [word]
    if current_block:
        blocks.append(current_block)

    merged = []
    for block in blocks:
        # Stable sort: words on the same line keep their left-to-right order.
        block.sort(key=lambda w: w['top'])
        merged.append(
            {
                "text": " ".join(w["text"] for w in block),
                "x0": min(w["x0"] for w in block),
                "x1": max(w["x1"] for w in block),
                "top": min(w["top"] for w in block),
                "bottom": max(w["bottom"] for w in block),
            }
        )
    return merged


from collections import defaultdict

def merge_lines_in_row(lines, y_thresh):
    """Merge vertically close lines, then merge words that share an ``x0``.

    A line is folded into the previous merged line when the gap between its
    smallest ``top`` and the previous line's largest ``bottom`` is below
    ``y_thresh``. Within each merged line, words with exactly the same
    ``x0`` are combined into one entry whose text is the words' texts joined
    by spaces and whose box spans all of them; entries are ordered by
    ``x0``.

    The input lists are not modified.

    Args:
        lines: List of non-empty lines, each a list of word dicts with
            ``"text"``, ``"x0"``, ``"x1"``, ``"top"`` and ``"bottom"``.
        y_thresh: Vertical gap below which two lines are merged.

    Returns:
        List of merged lines, each a list of merged word dicts.
    """
    merged_lines = []

    for line in lines:
        if merged_lines:
            prev_line = merged_lines[-1]
            min_top_current = min(word["top"] for word in line)
            max_bottom_prev = max(word["bottom"] for word in prev_line)
            if min_top_current - max_bottom_prev < y_thresh:
                # Merge into the previous line
                prev_line.extend(line)
                continue
        # Start a new line; copy so later extends never touch the caller's list.
        merged_lines.append(list(line))

    # Now merge words by `x0` within each line
    result = []
    for line in merged_lines:
        grouped = defaultdict(list)
        for word in line:
            grouped[word["x0"]].append(word)

        merged_words = []
        for x0 in sorted(grouped):  # left-to-right
            words = grouped[x0]
            merged_words.append(
                {
                    "text": " ".join(w["text"] for w in words),
                    "x0": x0,
                    "x1": max(w["x1"] for w in words),
                    "top": min(w["top"] for w in words),
                    "bottom": max(w["bottom"] for w in words),
                }
            )
        result.append(merged_words)

    return result



def detect_header_by_uppercase(rows):
    """Separate header words from body rows using letter case.

    A row counts as header when ``str.isupper()`` is true for the text of
    every word in it (an empty row trivially qualifies). The words of all
    header rows are concatenated into one flat list; every other row is
    kept intact.

    Returns:
        Tuple ``(header_words, body_rows)``.
    """
    header_words = []
    body = []
    for candidate in rows:
        is_header = True
        for word in candidate:
            if not word["text"].isupper():
                is_header = False
                break
        if is_header:
            header_words.extend(candidate)
        else:
            body.append(candidate)
    return header_words, body


def merge_words_into_rows(words, header_x_thresh, header_y_thresh, body_x_thresh, body_y_thresh):
    """
    Turn the loose words of the Appendix D page into a header plus body rows.

    Steps: group the words into visual rows (using ``header_y_thresh``),
    print those rows, keep only the rows between the
    "APPENDIX D: LAND USE CATEGORIES" line and the "NOTES:" line, split off
    the all-uppercase header rows, merge the header words into blocks (using
    ``header_x_thresh``) and rebuild every body row with ``fix_row``.

    ``body_x_thresh`` and ``body_y_thresh`` are accepted but not used by the
    current implementation.

    Returns:
    - A list whose first element is the merged header and whose remaining
      elements are the ``fix_row`` results, one per body row.
    """
    visual_rows = group_words_by_row(words, header_y_thresh)
    print("ROWS ARE", visual_rows)
    table_rows = trim_lines_outside_table(
        visual_rows,
        table_top_boundary_text="APPENDIX D: LAND USE CATEGORIES",
        table_bottom_boundary_text="NOTES:",
    )
    header_words, body = detect_header_by_uppercase(table_rows)
    header = merge_words_in_row(header_words, header_x_thresh)
    return [header, *(fix_row(body_row) for body_row in body)]

def assign_columns_to_blocks(merged_rows, column_gap_thresh=20, ncol=3):
    """
    Returns a shallow copy of every row in ``merged_rows``.

    No column index is assigned: the column-boundary detection this function
    once performed never influenced the returned value, and it raised on
    empty input and on rows whose elements are lists rather than block
    dicts. It has been removed so the function works on any list of rows.

    Parameters:
    - merged_rows: List of rows (each row an iterable of blocks).
    - column_gap_thresh: Unused; kept so existing calls stay valid.
    - ncol: Unused; kept so existing calls stay valid.

    Returns:
    - A new list with one new list per input row, holding the same block
      objects in the same order.
    """
    return [list(row) for row in merged_rows]


In [None]:
# Example usage: rebuild the Appendix D table from the words of the PDF's last page.
header_x_thresh = 10
header_y_thresh = 20
body_x_thresh = 10
body_y_thresh = 10
column_gap_thresh = 20  # Adjust based on observed spacing (not used in this cell)
ncol = 3  # Not used in this cell

with pdfplumber.open(filename) as pdf:
    words = pdf.pages[-1].extract_words()  # Extract words from the last page
    merged_rows = merge_words_into_rows(words, header_x_thresh, header_y_thresh, body_x_thresh, body_y_thresh)

In [None]:
merged_rows

In [None]:
merged_rows[0]

In [None]:
# def split_into_sublists(row, x_thresh=10):
#     """
#     Splits the list of words into sublists based on a threshold difference between x1 of a word and x0 of the next word.
    
#     Returns:
#     - A list of sublists, each containing words that are close to each other horizontally.
#     """
#     sublists = []
#     current_sublist = []

#     for i, word in enumerate(row):
#         if current_sublist:
#             last_word = current_sublist[-1]
#             if (word['x0'] - last_word['x1']) > x_thresh:
#                 sublists.append(current_sublist)
#                 current_sublist = [word]
#             else:
#                 current_sublist.append(word)
#         else:
#             current_sublist = [word]

#     if current_sublist:
#         sublists.append(current_sublist)

#     return sublists

# def merge_words_in_sublists(sublists):
#     """
#     Merges words within each sublist, sorting by x0 and then by top when x0 values are the same.
    
#     Returns:
#     - A list of merged text blocks, each with the merged text and bounding box.
#     """
#     merged_blocks = []

#     for sublist in sublists:
#         sublist.sort(key=lambda w: (w['x0'], w['top']))
#         merged_text = " ".join(w["text"] for w in sublist)
#         merged_block = {
#             "text": merged_text,
#             "x0": min(w["x0"] for w in sublist),
#             "x1": max(w["x1"] for w in sublist),
#             "top": min(w["top"] for w in sublist),
#             "bottom": max(w["bottom"] for w in sublist),
#         }
#         merged_blocks.append(merged_block)

#     return merged_blocks

# def process_row(row, x_thresh=10):
#     """
#     Processes a single row by splitting it into sublists and merging words within each sublist.
    
#     Returns:
#     - A list of merged text blocks for the row.
#     """
#     sublists = split_into_sublists(row, x_thresh)
#     merged_blocks = merge_words_in_sublists(sublists)
#     return merged_blocks

# # Example usage
# # last_table = [process_row(row, x_thresh=10) for row in merged_rows]

In [None]:
# NOTE(review): in the visible notebook `last_table` is first assigned in a
# later cell (the one that calls assign_columns_to_blocks), so running the
# cells top to bottom fails here with NameError.
last_table

In [None]:
# Example usage: rebuild the Appendix D rows from the PDF's last page and
# store them as `last_table`.
header_x_thresh = 10
header_y_thresh = 20
body_x_thresh = 10
body_y_thresh = 10
column_gap_thresh = 20  # Adjust based on observed spacing
ncol = 3  # Not passed to assign_columns_to_blocks below (its default is used)

with pdfplumber.open(filename) as pdf:
    words = pdf.pages[-1].extract_words()  # Extract words from the last page
    merged_rows = merge_words_into_rows(words, header_x_thresh, header_y_thresh, body_x_thresh, body_y_thresh)
    last_table = assign_columns_to_blocks(merged_rows, column_gap_thresh)

In [None]:
merged_rows

In [None]:
last_table

In [None]:
# Attach the rows parsed from the last page to the 'building_class' column.
# Raises KeyError if no parsed column was named 'building_class'.
item_dict['building_class'].definitions = last_table[1:] # Exclude the first row, which is the column headings

In [None]:
item_dict

In [None]:
# Persist the parsed results with dill for later use.
# The path is relative to the working directory; open() does not create the
# "environment_data" directory, so it must already exist.
with open("environment_data/table_dicts.pkl", "wb") as f:
    dill.dump({'column_customizations': column_customizations, 'table_dicts': table_dicts, 'last_table': last_table}, f)