In [1]:
import pdfplumber
import re
import dill
from itertools import tee
from src.models import ColCustomization
import src.helpers
import src.pdfutils

In [2]:
# Location of the PLUTO data-dictionary PDF parsed throughout this notebook.
# NOTE(review): this is an absolute, machine-specific path; change it (or make it
# configurable) before running the notebook on another machine.
filename = '/home/james/Massive/PROJECTDATA/nyc_real_estate_data/dictionaries/mapPLUTO_data_dictionary.pdf'

* Looking at the PLUTO data dictionary, it seems that most categorical variables are labeled as "alphanumeric" even if they only contain numbers, such as zip codes.
* There are some exceptions: police precincts and districts are numeric and listed as such. However, as there are only a limited number of repeating values, I will treat them as categorical as well.

In [3]:
# Read the PDF through the project helper. The parsing loop in the next cell iterates
# the result as a sequence of dicts and reads key[1] (line text) and value[1][1]
# (that line's word dicts) from each item.
# NOTE(review): the exact meaning of same_line_tolerance and start_page is defined in
# src.pdfutils.map_pdf, which is not visible here — confirm there.
fulltext = src.pdfutils.map_pdf(filename, same_line_tolerance=0.3, start_page=3) 

In [4]:
# Ordered (compiled regex, replacement) rules used to turn a field's full name into
# an identifier-style name. They are applied in sequence to the lower-cased name:
#   runs of spaces / commas / en dashes -> '_'
#   '#' -> 'num',  '/' -> 'or',  '&' -> 'and'
patterns = [
    (re.compile(r'[ ,–]+'), '_'),
    (re.compile(r'#'), 'num'),
    (re.compile(r'/'), 'or'),
    (re.compile(r'&'), 'and')
]

# Substrings that are looked for in a cleaned column name when checking for
# category-like ("foreign key") columns. Each marker is listed once; the check is a
# substring test, so repeated entries add nothing.
fk_markers = [
    'code', 'category', 'class', 'district', 'precinct', 'company',
    'name', 'health_area', 'type', 'borough', 'health_center_district',
]

def clean_name(full_name, patterns):
    """Lower-case ``full_name`` and run every substitution rule over it, in order.

    Args:
        full_name: The name to normalise.
        patterns: Iterable of ``(compiled_regex, replacement)`` pairs; each regex's
            ``sub`` is applied to the result of the previous rule.

    Returns:
        The lower-cased name after all substitutions.
    """
    result = full_name.lower()
    for rule in patterns:
        regex, substitute = rule
        result = regex.sub(substitute, result)
    return result

def get_word_starts_x(line):
    """Return the ``'x0'`` value of every word dict in ``line``, in the given order."""
    x_positions = []
    for token in line:
        x_positions.append(token['x0'])
    return x_positions

def parse_table(table):
    """Turn collected two-column table lines into a list of dicts.

    ``table`` is a sequence of entries whose first item is the line's text. The first
    entry is the header: its text is split at the first space into two column names,
    which are lower-cased and used as the dict keys. Every following entry is split
    the same way into the two cell values.

    Args:
        table: Header entry followed by data entries; only ``entry[0]`` is read.

    Returns:
        One ``{first_key: value, second_key: rest_of_line}`` dict per data entry.
        An empty or ``None`` table yields ``[]``. A data line without a space gets
        an empty string as its second value instead of raising.

    Raises:
        ValueError: If the header text does not contain two space-separated parts.
    """
    if not table:
        return []

    header_text = table[0][0]
    first_key, separator, second_key = header_text.partition(' ')
    if not separator:
        raise ValueError(f"table header must have two columns, got {header_text!r}")
    first_key, second_key = first_key.lower(), second_key.lower()

    rows = []
    for entry in table[1:]:
        # partition() never fails: a one-token line simply has an empty remainder.
        code, _, meaning = entry[0].partition(' ')
        rows.append({first_key: code, second_key: meaning})
    return rows

# Walk every section of the parsed PDF and build one ColCustomization per section
# that declares a field name. Per section the loop tracks:
#   * the 'Field Name:' line  -> short name, cleaned new name, name-based dtype guess
#   * the 'Format:' line      -> dtype fallback (only when the name gave none)
#   * lines after 'Description:' that form a 'Value ...' table -> definitions
column_customizations = []

for section in fulltext:
    in_table = False
    in_description = False
    table = None  # raw (line, value) rows of the value table being collected, header first
    col_mods = None  # stays None until this section's 'Field Name:' line has been seen
    for key, value in section.items():
        line = key[1]
        if line.startswith('Field Name:') and len(value[1][1]) > 2: # Exclude the explanation of "Field Name" itself on page 3
            col_mods = ColCustomization(short_name=value[1][1][-1]['text'][1:-1]) # Get the field name minus the enclosing parentheses
            full_name = ' '.join(word['text'] for word in value[1][1][2:-1])
            new_name = clean_name(full_name, patterns=patterns)  # clean_name lower-cases itself
            # NOTE(review): is_fk is computed from fk_markers but never stored; the
            # is_fk attribute is set further down purely from the presence of
            # definitions. Confirm whether the marker match was meant to be used.
            is_fk = any(word in new_name for word in fk_markers)
            col_mods.new_name = new_name
            if any(w in new_name for w in ['year', 'number', 'precinct']):
                col_mods.dtype = "Integer"
            if 'date' in new_name:
                col_mods.dtype = "Date"  # takes precedence over the Integer guess above
        # A 'Format:' line can only refine a column that exists; without the None
        # check a 'Format:' line preceding any 'Field Name:' raised AttributeError.
        elif line.startswith('Format:') and col_mods is not None and not col_mods.dtype:
            if "Alphanumeric" in line:
                col_mods.dtype = "String"
            elif "Numeric" in line:
                col_mods.dtype = "Float"
        elif line.startswith('Description:'):
            in_description = True
        if in_description:
            if line.startswith(('Value', 'VALUE')) and len(value[1][1]) <= 3: # Maximum number of words in a column heading
                col_starts = get_word_starts_x(value[1][1])
                in_table = True
                table = [(line, value)]
            elif in_table:
                row_starts = get_word_starts_x(value[1][1])
                # A line belongs to the table when its first word starts at the same
                # x position as the header's first word.
                if row_starts and abs(col_starts[0] - row_starts[0]) < .5:
                    table.append((line, value))
                else:
                    # First non-aligned (or word-less) line ends the table.
                    definitions = parse_table(table)
                    if col_mods is not None:
                        col_mods.definitions = definitions
                    table = None  # nothing raw left to parse at the end of the section
                    in_table = False
            else:
                print("Nothing was done with line:", line)
    if col_mods is not None:
        # A table that ran up to the end of the section has not been parsed yet.
        if not col_mods.definitions and table:
            col_mods.definitions = parse_table(table)
        if col_mods.definitions:
            col_mods.is_fk = True
        column_customizations.append(col_mods)


Nothing was done with line: Description: The borough in which the tax lot is located.
Nothing was done with line: This field contains a two-character borough code.
Nothing was done with line: borough but are serviced by a different borough. The BOROUGH codes associated
Nothing was done with line: with these areas are the boroughs in which they are legally located.
Nothing was done with line: Marble Hill is serviced by the Bronx, but is legally located in Manhattan and has a
Nothing was done with line: BOROUGH of MN. Rikers Island is serviced by Queens, but is legally located in the
Nothing was done with line: Bronx and has a BOROUGH of BX.
Nothing was done with line: Description: The tax block in which the tax lot is located.
Nothing was done with line: This field contains a one to five-digit tax block number.
Nothing was done with line: Each tax block is unique within a borough (see BOROUGH).
Nothing was done with line: Description: The number of the tax lot.
Nothing was done with lin

In [5]:
# Show the ColCustomization objects collected by the parsing loop above.
column_customizations

[ColCustomization(short_name='Borough', new_name='borough', dtype='String', synonyms=[], definitions=[{'value': 'BX', 'description': 'Bronx'}, {'value': 'BK', 'description': 'Brooklyn'}, {'value': 'MN', 'description': 'Manhattan'}, {'value': 'QN', 'description': 'Queens'}, {'value': 'SI', 'description': 'Staten Island'}], drop=False, is_fk=True),
 ColCustomization(short_name='Block', new_name='tax_block', dtype='Float', synonyms=[], definitions={}, drop=False, is_fk=False),
 ColCustomization(short_name='Lot', new_name='tax_lot', dtype='Float', synonyms=[], definitions={}, drop=False, is_fk=False),
 ColCustomization(short_name='CD', new_name='community_district', dtype='Float', synonyms=[], definitions={}, drop=False, is_fk=False),
 ColCustomization(short_name='CT2010', new_name='census_tract_2010', dtype='String', synonyms=[], definitions={}, drop=False, is_fk=False),
 ColCustomization(short_name='CB2010', new_name='census_block_2010', dtype='String', synonyms=[], definitions={}, drop=

In [6]:

def parse_zoning(pdf_path):
    """Collect the tables pdfplumber finds in the PDF, keyed by their label line.

    For every table on every page, the text line two lines above the table's second
    row (as located by ``find_table_start``) is taken as the table's label:

    * a label containing ``"APPENDIX"`` starts a new entry; the key is the label
      with the ``"APPENDIX...: "`` prefix removed, lower-cased, spaces -> ``_``;
    * a label containing ``"PLUTO DATA DICTIONARY"`` is treated as a continuation
      and the rows are appended to the most recent appendix entry;
    * any other label is reported and the table is stored under the raw label.

    Rows that contain a cell equal to ``"Abbreviation"`` are dropped.

    Args:
        pdf_path: Path of the PDF to open with pdfplumber.

    Returns:
        dict mapping label-derived keys to lists of table rows.
    """
    all_tables = {}
    prev_label_line = None  # key of the latest appendix table, target for continuations
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Guard against a page with no extractable text.
            lines = (page.extract_text() or "").splitlines()
            for table_index, table in enumerate(page.extract_tables()):
                # Find the position of the table in the raw text
                table_start_line = find_table_start(lines, table)
                # NOTE(review): when table_start_line is 1 the index below is -1 and
                # silently reads the page's LAST line. Left unchanged because the
                # existing results may rely on it — confirm against the PDF layout.
                label_line = (
                    lines[table_start_line - 2] if table_start_line > 0 else None
                )
                if label_line is None:
                    print('table_index is', table_index)
                    print('missed:', lines[table_start_line] if lines else None)
                    continue

                table = [row for row in table if "Abbreviation" not in row]
                if "APPENDIX" in label_line:
                    label_line = re.sub("APPENDIX.*: ", "", label_line)
                    label_line = re.sub(" +", "_", label_line.lower())
                    prev_label_line = label_line
                    all_tables[label_line] = table
                elif "PLUTO DATA DICTIONARY" in label_line:
                    # Continuation of the previous appendix table.
                    if prev_label_line is None:
                        print('missed: continuation table with no preceding appendix label; table_index is', table_index)
                    else:
                        all_tables[prev_label_line] = all_tables[prev_label_line] + table
                else:
                    print("what's this?: label_line is", label_line)
                    all_tables[label_line] = table
    return all_tables


def find_table_start(lines, table):
    """
    Identify the start of the table in the text by matching table rows
    """
    for i, line in enumerate(lines):
        # Convert the table's first row into a string and search for it in the text
        table_row = " ".join(str(cell) for cell in table[1] if cell)  # Skip empty cells
        if line in table_row:
            return i
    return -1

* Add tables from appendixes

In [7]:
# Collect the appendix tables from the same PDF, keyed by label-derived names.
table_dicts = parse_zoning(filename)

In [8]:
# NOTE(review): this prints the entire dict on one line; consider inspecting only
# the keys or a single table to keep the saved output readable.
print(table_dicts)

{'special_purpose_districts': [['125th', 'Special 125th Street District'], ['BNY', 'Special Brooklyn Navy Yard District'], ['BPC', 'Special Battery Park City District'], ['BR', 'Special Bay Ridge District'], ['BSC', 'Special Bay Street Corridor District'], ['C', 'Special Grand Concourse Preservation District'], ['CD', 'Special City Island District'], ['CI', 'Special Coney Island District'], ['CL', 'Special Clinton District'], ['CO', 'Special Coney Island Mixed Use District'], ['CP', 'Special College Point District'], ['CR - n', 'Special Coastal Risk District, where n is the number of the\ndistrict'], ['DB', 'Special Downtown Brooklyn District'], ['DFR', 'Special Downtown Far Rockaway District'], ['DJ', 'Special Downtown Jamaica District'], ['EC-n', 'Special Enhanced Commercial District, where n is the\nnumber of the district'], ['EHC', 'East Harlem Corridors'], ['FH', 'Special Forest Hills District'], ['FW', 'Special Flushing Waterfront District'], ['G', 'Special Gowanus Mixed Use Dist

* Parse Appendix D

### Extract the last table, which isn't actually a table, just text arranged in a table-like way.

In [9]:
import pdfplumber

def trim_lines_outside_table(lines, table_top_boundary_text=None, table_bottom_boundary_text=None):
    """Return the lines lying strictly between the top and bottom boundary lines.

    Each line's text is rebuilt by joining its words' ``"text"`` with spaces. The
    top boundary is the last line containing ``table_top_boundary_text``; the
    bottom boundary is the last line containing ``table_bottom_boundary_text``
    (a line matching the top text is never also counted as a bottom match).
    Both boundary lines themselves are excluded.

    Args:
        lines (list[list[dict]]): Lines of word dicts, each with a ``"text"`` key.
        table_top_boundary_text (str, optional): Text marking the line above the
            table. When ``None`` or not found, nothing is trimmed from the top.
        table_bottom_boundary_text (str, optional): Text marking the line below the
            table. When ``None`` or not found, nothing is trimmed from the bottom.

    Returns:
        list[list[dict]]: A new list with the lines between the two boundaries.
    """
    # Defaults mean "no trimming on this side"; previously a missing boundary left
    # these names unbound and the function raised UnboundLocalError.
    top_trim_line = -1
    bottom_trim_line = len(lines)

    for idx, line in enumerate(lines):
        line_text = " ".join(word["text"] for word in line)
        if table_top_boundary_text is not None and table_top_boundary_text in line_text:
            top_trim_line = idx
        elif table_bottom_boundary_text is not None and table_bottom_boundary_text in line_text:
            bottom_trim_line = idx

    return [
        line
        for idx, line in enumerate(lines)
        if top_trim_line < idx < bottom_trim_line
    ]

def group_words_by_row(words, y_thresh=5):
    """Bucket words into rows by their ``'top'`` coordinate.

    Words are visited from smallest to largest ``'top'``. Each word joins the first
    existing row whose *first* word is within ``y_thresh`` of it vertically;
    otherwise it opens a new row.
    """
    rows = []
    for candidate in sorted(words, key=lambda item: item['top']):
        # Anchor the comparison on each row's first word so rows do not drift.
        home_row = next(
            (existing for existing in rows
             if abs(candidate['top'] - existing[0]['top']) <= y_thresh),
            None,
        )
        if home_row is None:
            rows.append([candidate])
        else:
            home_row.append(candidate)
    return rows

def merge_words_in_row(row, x_thresh=10):
    """Merge horizontally adjacent words of one row into text blocks.

    Words are scanned left to right (by ``'x0'``). A word joins the current block
    when the gap between its ``'x0'`` and the previous word's ``'x1'`` is at most
    ``x_thresh``; otherwise it starts a new block. Within every block the words are
    then ordered by ``'top'`` so text spanning several lines reads top to bottom.

    Parameters:
    - row: List of word dicts with 'text', 'x0', 'x1', 'top' and 'bottom'. The list
      is not modified.
    - x_thresh: Largest horizontal gap that still keeps two words in one block.

    Returns:
    - A list of merged text blocks, each with the merged text and bounding box.
    """
    merged_blocks = []
    current_block = []

    # sorted() works on a copy, so the caller's list keeps its original order.
    for word in sorted(row, key=lambda w: w['x0']):
        if current_block and (word['x0'] - current_block[-1]['x1']) <= x_thresh:
            current_block.append(word)
        else:
            if current_block:
                merged_blocks.append(current_block)
            current_block = [word]

    if current_block:
        merged_blocks.append(current_block)

    # Sort every block by top coordinate — including the final one, which was
    # previously skipped — to get the text in each table cell correctly ordered.
    for block in merged_blocks:
        block.sort(key=lambda w: w['top'])

    return [
        {
            "text": " ".join(w["text"] for w in block),
            "x0": min(w["x0"] for w in block),
            "x1": max(w["x1"] for w in block),
            "top": min(w["top"] for w in block),
            "bottom": max(w["bottom"] for w in block),
        }
        for block in merged_blocks
    ]


from collections import defaultdict

def merge_lines_in_row(lines, y_thresh):
    """Fuse vertically close lines, then combine blocks that share an ``x0``.

    Step 1: walking ``lines`` in order, a line is folded into the previous merged
    line when the gap between its smallest ``'top'`` and the previous line's largest
    ``'bottom'`` is below ``y_thresh``; otherwise it starts a new merged line.

    Step 2: inside each merged line, blocks with exactly the same ``'x0'`` are
    combined: texts are joined with spaces in the order encountered, and the
    bounding box becomes the union of the parts. Blocks are emitted by ascending
    ``'x0'``.

    Args:
        lines: List of lines; each line is a list of block dicts with 'text', 'x0',
            'x1', 'top' and 'bottom'. The input lists are not modified.
        y_thresh: Vertical gap below which two consecutive lines are merged.

    Returns:
        List of merged lines, each a list of combined block dicts.
    """
    merged_lines = []

    for line in lines:
        if merged_lines:
            prev_line = merged_lines[-1]
            min_top_current = min(word["top"] for word in line)
            max_bottom_prev = max(word["bottom"] for word in prev_line)
            if min_top_current - max_bottom_prev < y_thresh:
                # Merge into the previous line
                prev_line.extend(line)
                continue
        # Start a new line. Copy it so the later extend() calls above never grow
        # one of the caller's own lists.
        merged_lines.append(list(line))

    # Now merge words by `x0` within each line
    result = []
    for line in merged_lines:
        grouped = defaultdict(list)
        for word in line:
            grouped[word["x0"]].append(word)

        result.append([
            {
                "text": " ".join(w["text"] for w in grouped[x0]),
                "x0": x0,
                "x1": max(w["x1"] for w in grouped[x0]),
                "top": min(w["top"] for w in grouped[x0]),
                "bottom": max(w["bottom"] for w in grouped[x0]),
            }
            for x0 in sorted(grouped)  # left-to-right
        ])

    return result



def detect_header_by_uppercase(rows):
    """Split rows into header words and body rows using an all-uppercase test.

    A row whose every word text satisfies ``str.isupper()`` counts as a header row;
    the words of all such rows are concatenated into one flat list (and echoed via
    ``print``). All other rows are returned unchanged, in order, as body rows.

    Returns:
        ``(header_words, body_rows)``.
    """
    header_words = []
    body = []

    for candidate in rows:
        looks_like_header = all(token["text"].isupper() for token in candidate)
        if not looks_like_header:
            body.append(candidate)
            continue
        header_words.extend(candidate)
        print("header_row words:", [token["text"] for token in candidate])

    return header_words, body


def merge_words_into_rows(
    words,
    header_x_thresh,
    header_y_thresh,
    body_x_thresh,
    body_y_thresh,
    table_top_boundary_text="APPENDIX D: LAND USE CATEGORIES",
    table_bottom_boundary_text="NOTES:",
):
    """
    Groups words into rows and merges horizontally close words.

    Pipeline:
      1. group all words into rows by vertical position (``header_y_thresh`` is
         used for this grouping of every row, header and body alike);
      2. keep only the rows between the two boundary texts;
      3. split the rows into header words (all-uppercase rows) and body rows;
      4. merge words into blocks horizontally (``header_x_thresh`` for the header,
         ``body_x_thresh`` for body rows);
      5. fuse vertically close body rows (``body_y_thresh``).

    Parameters:
    - words: Word dicts with 'text', 'x0', 'x1', 'top' and 'bottom'.
    - header_x_thresh / body_x_thresh: Horizontal merge gaps.
    - header_y_thresh / body_y_thresh: Vertical grouping / merging gaps.
    - table_top_boundary_text / table_bottom_boundary_text: Texts marking the lines
      just above and below the table; the defaults are the values that used to be
      hard-coded here.

    Returns:
    - A list whose first element is the merged header row, followed by the merged
      body rows.
    """
    rows = group_words_by_row(words, header_y_thresh)
    trimmed_rows = trim_lines_outside_table(
        rows,
        table_top_boundary_text=table_top_boundary_text,
        table_bottom_boundary_text=table_bottom_boundary_text,
    )
    header_row, body_rows = detect_header_by_uppercase(trimmed_rows)
    merged_header = merge_words_in_row(header_row, header_x_thresh)
    print("merged header is", merged_header)
    merged_rows = [merge_words_in_row(row, body_x_thresh) for row in body_rows]
    merged_rows = merge_lines_in_row(merged_rows, body_y_thresh)
    all_rows = [merged_header] + merged_rows
    print("all_rows rows is", all_rows)
    return all_rows

def assign_columns_to_blocks(merged_rows, column_gap_thresh=20, ncol=3):
    """
    Tags every merged text block with its (row, column) position.

    Column boundaries are derived from the sorted distinct ``x0`` values of all
    blocks: the smallest value opens the first column, and every value that lies
    more than ``column_gap_thresh`` after its predecessor opens another one. A
    block's column is the last boundary that is not greater than its ``x0``.

    Parameters:
    - merged_rows: List of lists of merged word blocks (dicts with an 'x0' key).
    - column_gap_thresh: Minimum gap to consider as a column boundary.
    - ncol: Currently unused; kept so existing calls stay valid.

    Returns:
    - A list with one entry per input row; each entry is a list of
      ``((row_index, column_index), block)`` tuples. Empty input yields ``[]``.
    """
    all_x_values = sorted(set(block["x0"] for row in merged_rows for block in row))

    # Detect gaps to determine column boundaries. Slicing (instead of indexing [0])
    # keeps this safe when there are no blocks at all.
    column_boundaries = all_x_values[:1]
    for previous_x, current_x in zip(all_x_values, all_x_values[1:]):
        if current_x - previous_x > column_gap_thresh:
            column_boundaries.append(current_x)

    def get_column_index(x0):
        """Finds the appropriate column index for a given x0 value."""
        for i, boundary in enumerate(column_boundaries):
            if x0 < boundary:
                return max(i - 1, 0)
        return len(column_boundaries) - 1

    # Store row and column indices for each block with that block
    return [
        [((idx, get_column_index(block["x0"])), block) for block in row]
        for idx, row in enumerate(merged_rows)
    ]




In [10]:
# Example usage
header_x_thresh = 10
header_y_thresh = 20
body_x_thresh = 5
body_y_thresh = 10
column_gap_thresh = 20  # Adjust based on observed spacing
ncol = 3

with pdfplumber.open(filename) as pdf:
    words = pdf.pages[-1].extract_words()  # Extract words from page 0
    merged_rows = merge_words_into_rows(words, header_x_thresh, header_y_thresh, body_x_thresh, body_y_thresh)
    last_table = assign_columns_to_blocks(merged_rows, column_gap_thresh)




header_row words: ['DCP', 'LAND', 'DCP', 'LAND', 'DOF/DCP', 'BUILDING', 'CLASSES']
header_row words: ['USE', 'USE', 'CODE', 'CATEGORIES']
merged header is [{'text': 'DCP LAND USE CODE', 'x0': 77.4, 'x1': 109.23494400000001, 'top': 150.38112, 'bottom': 199.34112000000005}, {'text': 'DCP LAND USE CATEGORIES', 'x0': 122.28, 'x1': 195.725808, 'top': 162.98112000000003, 'bottom': 199.34112000000005}, {'text': 'DOF/DCP BUILDING CLASSES', 'x0': 212.76, 'x1': 371.62891199999984, 'top': 162.98112000000003, 'bottom': 174.02112}]
all_rows rows is [[{'text': 'DCP LAND USE CODE', 'x0': 77.4, 'x1': 109.23494400000001, 'top': 150.38112, 'bottom': 199.34112000000005}, {'text': 'DCP LAND USE CATEGORIES', 'x0': 122.28, 'x1': 195.725808, 'top': 162.98112000000003, 'bottom': 199.34112000000005}, {'text': 'DOF/DCP BUILDING CLASSES', 'x0': 212.76, 'x1': 371.62891199999984, 'top': 162.98112000000003, 'bottom': 174.02112}], [{'text': '01', 'x0': 77.4, 'x1': 88.44, 'top': 214.10928, 'bottom': 225.1492799999999

In [11]:
# Each row is a list of ((row_index, column_index), block) tuples.
for row in last_table:
    print(row)  # Prints the structured output


[((0, 0), {'text': 'DCP LAND USE CODE', 'x0': 77.4, 'x1': 109.23494400000001, 'top': 150.38112, 'bottom': 199.34112000000005}), ((0, 1), {'text': 'DCP LAND USE CATEGORIES', 'x0': 122.28, 'x1': 195.725808, 'top': 162.98112000000003, 'bottom': 199.34112000000005}), ((0, 3), {'text': 'DOF/DCP BUILDING CLASSES', 'x0': 212.76, 'x1': 371.62891199999984, 'top': 162.98112000000003, 'bottom': 174.02112})]
[((1, 0), {'text': '01', 'x0': 77.4, 'x1': 88.44, 'top': 214.10928, 'bottom': 225.14927999999998}), ((1, 1), {'text': 'One & Two Family Buildings', 'x0': 122.28, 'x1': 199.07092799999995, 'top': 214.10928, 'bottom': 237.86928}), ((1, 3), {'text': 'A*,B*,Z0', 'x0': 212.76, 'x1': 256.799664, 'top': 214.10928, 'bottom': 225.14927999999998})]
[((2, 0), {'text': '02', 'x0': 77.4, 'x1': 88.44, 'top': 252.02927999999997, 'bottom': 263.06927999999994}), ((2, 1), {'text': 'Multi-Family Walk-Up Buildings', 'x0': 122.28, 'x1': 181.685136, 'top': 252.02927999999997, 'bottom': 288.38928}), ((2, 3), {'text'

In [12]:
# Persist the three parse results together in a single dill pickle.
# NOTE(review): the path is relative to the kernel's working directory, and open()
# does not create the environment_data/ directory — it must already exist.
with open("environment_data/table_dicts.pkl", "wb") as f:
    dill.dump({'column_customizations': column_customizations, 'table_dicts': table_dicts, 'last_table': last_table}, f)