In [56]:
import pdfplumber
import re
import dill
from itertools import tee
import src.models
import src.helpers
import src.pdfutils

In [57]:
filename = '/home/james/Massive/PROJECTDATA/nyc_real_estate_data/dictionaries/mapPLUTO_data_dictionary.pdf'

* Looking at the PLUTO data dictionary, it seems that most category variables are labeled as "alpahnumeric" even if they only contain numbers, such as zip codes.
* There are some exceptions, police precincts and districts are numeric and listed as such. However as there a limited number of repeating variables, I wil treat them as categorical as well.

In [58]:
fulltext = src.pdfutils.map_pdf(filename)

In [59]:
for key in fulltext.keys():
    print(key)

(0, 'Table of Contents')
(1, 'DISCLAIMER .............................................................................................................................................. 3')
(2, 'INTRODUCTION ........................................................................................................................................ 3')
(3, 'BOROUGH (Borough) ................................................................................................................................ 4')
(4, 'TAX BLOCK (Block) .................................................................................................................................. 4')
(5, 'TAX LOT (Lot) ........................................................................................................................................... 4')
(6, 'COMMUNITY DISTRICT (CD) ................................................................................................................ 5')
(7, 'CENSUS TRACT 2020 (BCT202

In [60]:
def get_word_starts_x(line):
   starts = [word['x0'] for word in line]
   return starts


tables = {}
in_description = False
in_table = False


for key,value in fulltext.items():
    line = key[1]
    line_start = value[0]['range'][0]
    line_start = value[0]['range'][1]
    # Make sure description is set to False at the beginning of each definition, as 'Field Name' is the first part of each definition
    if line.startswith('Field Name:'):
        in_description = False
        in_table = False
        if len(value[1][1]) > 2: # Exclude the explanation of "Field Name" itself on page 3
            field_name = value[1][1][-1]['text'][1:-1] # Get the field name minus the enclosing parentheses
        table = []
        continue
    # Detect the beginning of a description section, which might contain a table (anything outside it does not contain a table I am interested in)
    if line.startswith('Description:'):
        in_description = True
        prev_line_start = line_start
    if in_description is True:
        # print('in description:', line)
        # If inside a description section, check for lines that start with value or `VALUE`, as this is what the first line of a table alway starts with.
        # As there are lines that start with `VALUE` but are not tables, also check if the line starts at a larger x0 than the previous line.
        if (line.startswith('Value') or line.startswith('VALUE')) and len(value[1][1]) <= 3: # Maximum number of words in a column heading
            print("Start of table")
            print('Table heading', line)
            col_starts = get_word_starts_x(value[1][1])
            in_table = True
            print(col_starts)
            prev_line_start = line_start
            table = [(line, value)]
        elif in_table is True and (abs(col_starts[0] - get_word_starts_x(value[1][1])[0]) < .5) :
            print('Row: ', line)
            table.append((line, value))
        elif in_table is True:
            print("End of table")
            tables[field_name] = table
            in_table = False

Start of table
Table heading Value Description
[172.8, 325.8]
Row:  BX Bronx
Row:  BK Brooklyn
Row:  MN Manhattan
Row:  QN Queens
Row:  SI Staten Island
End of table
Start of table
Table heading Value Borough name
[172.8, 302.4, 343.79999999999995]
Row:  1 Manhattan
Row:  2 Bronx
Row:  3 Brooklyn
Row:  4 Queens
Row:  5 Staten Island
End of table
Start of table
Table heading VALUE DESCRIPTION
[172.8, 258.24]
Row:  01 One & Two Family Buildings
Row:  02 Multi-Family Walk-Up Buildings
Row:  03 Multi-Family Elevator Buildings
Row:  04 Mixed Residential & Commercial Buildings
Row:  05 Commercial & Office Buildings
Row:  06 Industrial & Manufacturing
Row:  07 Transportation & Utility
Row:  08 Public Facilities & Institutions
Row:  09 Open Space & Outdoor Recreation
Row:  10 Parking Facilities
Row:  11 Vacant Land
Start of table
Table heading Value Description
[168.24, 230.28]
Row:  C City ownership
Row:  M Mixed city & private ownership
Row:  O Other – owned by either a public authority or t

In [61]:
table_dicts = {}

for field,table in tables.items():
    rows = []
    k1,k2 = table[0][0].split(' ', 1)
    for i in table[1:]:
        row = i[0].split(' ', 1)
        rows.append({k1: row[0], k2: row[1] })
    table_dicts[field] = rows



In [62]:
for k,d in table_dicts.items():
    print(k, d)

Borough [{'Value': 'BX', 'Description': 'Bronx'}, {'Value': 'BK', 'Description': 'Brooklyn'}, {'Value': 'MN', 'Description': 'Manhattan'}, {'Value': 'QN', 'Description': 'Queens'}, {'Value': 'SI', 'Description': 'Staten Island'}]
SanitBoro [{'Value': '1', 'Borough name': 'Manhattan'}, {'Value': '2', 'Borough name': 'Bronx'}, {'Value': '3', 'Borough name': 'Brooklyn'}, {'Value': '4', 'Borough name': 'Queens'}, {'Value': '5', 'Borough name': 'Staten Island'}]
OwnerType [{'Value': 'C', 'Description': 'City ownership'}, {'Value': 'M', 'Description': 'Mixed city & private ownership'}, {'Value': 'O', 'Description': 'Other – owned by either a public authority or the state or federal'}]
AreaSource [{'Value': '0', 'Methodology': 'Not Available'}, {'Value': '2', 'Methodology': "Department of Finance's Property Tax System (PTS)"}, {'Value': '4', 'Methodology': "BUILDING CLASS starts with 'V' and NUMBER OF BUILDINGS is"}]
LotType [{'Value': '0', 'Description': 'Unknown'}, {'Value': '1', 'Descripti

In [63]:

def parse_zoning(pdf_path):
    all_tables = {}
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            # Extract raw text as lines
            lines = page.extract_text().splitlines()
            # Extract tables
            tables = page.extract_tables()
            for table_index, table in enumerate(tables):
                # Find the position of the table in the raw text
                table_start_line = find_table_start(lines, table)
                # Extract the line before the table, if available
                label_line = (
                    lines[table_start_line - 2] if table_start_line > 0 else None
                )
                table = [row for row in table if "Abbreviation" not in row]
                if label_line is not None:
                    if "APPENDIX" in label_line:
                        label_line = re.sub("APPENDIX.*: ", "", label_line)
                        label_line = re.sub(" +", "_", label_line.lower())
                        prev_label_line = label_line
                    elif "PLUTO DATA DICTIONARY" in label_line:
                        label_line = None
                    elif "APPENDIX" not in label_line:
                        print("what's this?: ", print('label_line is', label_line))
                        table = [row for row in table if "Abbreviation" not in row]
                    if label_line != None:
                        all_tables[label_line] = table
                    else:
                        all_tables[prev_label_line] = all_tables[prev_label_line] + table
                else:
                    print('table_index is', table_index)
                    print('missed:', lines[table_start_line])
    return all_tables


def find_table_start(lines, table):
    """
    Identify the start of the table in the text by matching table rows
    """
    for i, line in enumerate(lines):
        # Convert the table's first row into a string and search for it in the text
        table_row = " ".join(str(cell) for cell in table[1] if cell)  # Skip empty cells
        if line in table_row:
            return i
    return -1

* Add tables from appendixes

In [64]:
table_dicts = parse_zoning(filename)

In [65]:
print(table_dicts)

{'special_purpose_districts': [['125th', 'Special 125th Street District'], ['BNY', 'Special Brooklyn Navy Yard District'], ['BPC', 'Special Battery Park City District'], ['BR', 'Special Bay Ridge District'], ['BSC', 'Special Bay Street Corridor District'], ['C', 'Special Grand Concourse Preservation District'], ['CD', 'Special City Island District'], ['CI', 'Special Coney Island District'], ['CL', 'Special Clinton District'], ['CO', 'Special Coney Island Mixed Use District'], ['CP', 'Special College Point District'], ['CR - n', 'Special Coastal Risk District, where n is the number of the\ndistrict'], ['DB', 'Special Downtown Brooklyn District'], ['DFR', 'Special Downtown Far Rockaway District'], ['DJ', 'Special Downtown Jamaica District'], ['EC-n', 'Special Enhanced Commercial District, where n is the\nnumber of the district'], ['EHC', 'East Harlem Corridors'], ['FH', 'Special Forest Hills District'], ['FW', 'Special Flushing Waterfront District'], ['G', 'Special Gowanus Mixed Use Dist

* Parse Appendix D

In [66]:
# with pdfplumber.open(filename) as pdf:
#     page = pdf.pages[-1]
#     # Extract raw text as lines
#     lines = page.extract_text(layout=True).splitlines()
#     words = page.extract_words(use_text_flow=False)

In [67]:
# print(words)

In [68]:
# import pdfplumber

# def group_words_by_row(words, y_thresh=5):
#     """Groups words into rows based on proximity of their 'top' values."""
#     words = sorted(words, key=lambda w: w['top'])  # Sort by vertical position
#     rows = []
    
#     for word in words:
#         added = False
#         for row in rows:
#             if abs(word['top'] - row[0]['top']) <= y_thresh:  # Same row if close in top values
#                 row.append(word)
#                 added = True
#                 break
#         if not added:
#             rows.append([word])  # Start a new row
    
#     return rows

# def adjust_bounding_boxes(rows):
#     """Expands row bounding boxes so all words in a row share the same top/bottom."""
#     adjusted_rows = []
    
#     for row in rows:
#         min_top = min(word['top'] for word in row)
#         max_bottom = max(word['bottom'] for word in row)
        
#         adjusted_row = []
#         for word in row:
#             adjusted_word = word.copy()
#             adjusted_word['top'] = min_top
#             adjusted_word['bottom'] = max_bottom
#             adjusted_row.append(adjusted_word)
        
#         adjusted_rows.append(adjusted_row)
    
#     return adjusted_rows

# def merge_words_into_rows(words, x_thresh=10, y_thresh=5):
#     """
#     Groups words into rows and merges horizontally close words.
    
#     Returns:
#     - List of lists, where each inner list represents a row with merged text blocks.
#     """
#     # Step 1: Group words into rows
#     rows = group_words_by_row(words, y_thresh)
    
#     # Step 2: Adjust bounding boxes for row uniformity
#     adjusted_rows = adjust_bounding_boxes(rows)
    
#     # Step 3: Merge words within each row into text segments
#     row_blocks = []
    
#     for row in adjusted_rows:
#         print('ROW IS:', row)
#         row.sort(key=lambda w: w['x0'])  # Sort words left-to-right
#         line_blocks = []
#         line = []
        
#         for word in row:
#             if line and (word['x0'] - line[-1]['x1']) <= x_thresh:  # Merge if close horizontally
#                 line.append(word)
#             else:
#                 if line:
#                     line_blocks.append(line)
#                 line = [word]
#         if line:
#             line_blocks.append(line)
        
#         # Convert words to formatted output
#         row_blocks.append([
#             (" ".join(word['text'] for word in line),  # Merged text
#              (min(line, key=lambda w: w['x0'])['x0'],  # Bounding box (x0, top, x1, bottom)
#               min(line, key=lambda w: w['top'])['top'],
#               max(line, key=lambda w: w['x1'])['x1'],
#               max(line, key=lambda w: w['bottom'])['bottom']),
#              len(line))  # Word count
#             for line in line_blocks
#         ])
    
#     return row_blocks


In [69]:
# with pdfplumber.open(filename) as pdf:
#     page = pdf.pages[-1]
#     words = page.extract_words(use_text_flow=True)
#     text_blocks = merge_words_into_rows(words, x_thresh=10, y_thresh=20)

In [70]:
# import pdfplumber

# def group_words_by_row(words, y_thresh=5):
#     """Groups words into rows based on proximity of their 'top' values."""
#     words = sorted(words, key=lambda w: w['top'])  # Sort by vertical position
#     rows = []
    
#     for word in words:
#         added = False
#         for row in rows:
#             if abs(word['top'] - row[0]['top']) <= y_thresh:  # Same row if close in top values
#                 row.append(word)
#                 added = True
#                 break
#         if not added:
#             rows.append([word])  # Start a new row
    
#     return rows

# def detect_header_by_uppercase(rows):
#     """Identifies the header row by checking if all words are uppercase."""
#     header_row = []
#     body_rows = []
    
#     for row in rows:
#         if all(word['text'].isupper() for word in row):  # All words must be uppercase
#             header_row = row
#             print('header_row:', [w['text'] for w in header_row])
#         else:
#             body_rows.append(row)
    
#     return header_row, body_rows

# def merge_words_into_rows(words, x_thresh=10, y_thresh=5):
#     """
#     Groups words into rows and merges horizontally close words.
    
#     - Detects headers based on uppercase text in all words.
#     - Merges words and handles row alignment.
    
#     Returns:
#     - List of lists, where each inner list represents a row with merged text blocks.
#     """
#     # Step 1: Group words into rows
#     rows = group_words_by_row(words, y_thresh)
    
#     # Step 2: Detect the header row based on all uppercase words
#     header_row, body_rows = detect_header_by_uppercase(rows)

#     # Step 3: Merge words within each row into text segments
#     all_rows = [header_row] + body_rows  # Ensure headers come first
#     row_blocks = []
    
#     for row in all_rows:
#         row.sort(key=lambda w: w['x0'])  # Sort words left-to-right
#         line_blocks = []
#         line = []
        
#         for word in row:
#             if line and (word['x0'] - line[-1]['x1']) <= x_thresh:  # Merge if close horizontally
#                 line.append(word)
#             else:
#                 if line:
#                     line_blocks.append(line)
#                 line = [word]
#         if line:
#             line_blocks.append(line)
        
#         # Convert words to formatted output
#         row_blocks.append([
#             (" ".join(word['text'] for word in line),  # Merged text
#              (min(line, key=lambda w: w['x0'])['x0'],  # Bounding box (x0, top, x1, bottom)
#               min(line, key=lambda w: w['top'])['top'],
#               max(line, key=lambda w: w['x1'])['x1'],
#               max(line, key=lambda w: w['bottom'])['bottom']),
#              len(line))  # Word count
#             for line in line_blocks
#         ])
    
#     return row_blocks


In [71]:
# # Example Usage
# with pdfplumber.open(filename) as pdf:
#     page = pdf.pages[-1]
#     words = page.extract_words(use_text_flow=True)
#     table = merge_words_into_rows(words, x_thresh=10, y_thresh=20)

# for row in table:
#     print(row)

In [72]:
# print(text_blocks[5])

In [73]:
# for idx, block in enumerate(text_blocks):
#     print(idx, block)


In [74]:
# import pdfplumber

# def group_words_by_row(words, y_thresh=5):
#     """Groups words into rows based on proximity of their 'top' values."""
#     words = sorted(words, key=lambda w: w['top'])  # Sort by vertical position
#     rows = []
    
#     for word in words:
#         print(word)
#         added = False
#         for row in rows:
#             if abs(word['top'] - row[-1]['top']) <= y_thresh:  # Compare with last word in row
#                 row.append(word)
#                 added = True
#                 break
#         if not added:
#             rows.append([word])  # Start a new row
    
#     return rows

# def detect_header_by_uppercase(rows, y_thresh=10):
#     """Identifies the header row by checking if all words are uppercase, with y-threshold filtering."""
#     header_row = []
#     body_rows = []
    
#     for row in rows:
#         if all(word['text'].isupper() for word in row):  # All words must be uppercase for header
#             if row[0]['top'] < y_thresh:  # Ensure header is within the correct vertical space
#                 header_row.extend(row)  # Combine all words of the header row
#             else:
#                 body_rows.append(row)
#         else:
#             body_rows.append(row)
    
#     return header_row, body_rows

# def merge_words_in_row(row, x_thresh=10, y_thresh=5):
#     """
#     Merges words in a single row, considering the provided x_thresh and y_thresh.
    
#     Args:
#     - row: List of word dicts in the row.
#     - x_thresh: Horizontal threshold for merging words.
#     - y_thresh: Vertical threshold for grouping words into rows.
    
#     Returns:
#     - A list of merged text blocks, each with the merged text, bounding box, and word count.
#     """
#     row.sort(key=lambda w: w['x0'])  # Sort words left-to-right
#     line_blocks = []
#     line = []
    
#     for word in row:
#         if line and (word['x0'] - line[-1]['x1']) <= x_thresh:  # Merge if close horizontally
#             line.append(word)
#         else:
#             if line:
#                 line_blocks.append(line)
#             line = [word]
#     if line:
#         line_blocks.append(line)
    
#     # Convert words to formatted output
#     return [
#         (" ".join(word['text'] for word in line),  # Merged text
#          (min(line, key=lambda w: w['x0'])['x0'],  # Bounding box (x0, top, x1, bottom)
#           min(line, key=lambda w: w['top'])['top'],
#           max(line, key=lambda w: w['x1'])['x1'],
#           max(line, key=lambda w: w['bottom'])['bottom']),
#          len(line))  # Word count
#         for line in line_blocks
#     ]

# def merge_words_into_rows(words, header_x_thresh=10, header_y_thresh=10, body_x_thresh=15, body_y_thresh=5):
#     """
#     Groups words into rows and merges horizontally close words for header and body rows with different thresholds.
    
#     - Detects headers based on uppercase text in all words.
#     - Merges words and handles row alignment with separate thresholds for header and body rows.
    
#     Returns:
#     - List of lists, where each inner list represents a row with merged text blocks.
#     """
#     # Step 1: Group words into rows
#     rows = group_words_by_row(words, body_y_thresh)  # Use body row y_thresh for initial grouping
    
#     # Step 2: Detect the header row based on all uppercase words and vertical threshold
#     header_row, body_rows = detect_header_by_uppercase(rows, y_thresh=header_y_thresh)

#     # Step 3: Merge words in header and body rows with separate thresholds
#     all_rows = [header_row] + body_rows  # Ensure headers come first
#     row_blocks = []
    
#     # Merge header row with header-specific thresholds
#     if header_row:
#         row_blocks.append(merge_words_in_row(header_row, x_thresh=header_x_thresh, y_thresh=header_y_thresh))
    
#     # Merge body rows with body-specific thresholds
#     for row in body_rows:
#         row_blocks.append(merge_words_in_row(row, x_thresh=body_x_thresh, y_thresh=body_y_thresh))
    
#     # Sort rows by top value to maintain correct vertical order
#     # Note: Sorting by both 'top' and 'x0' (for tie-breaking in case 'top' is the same) ensures correct ordering
#     final_row_blocks = []
#     for row_block in row_blocks:
#         final_row_blocks.append(sorted(row_block, key=lambda w: (w[1][1], w[1][0])))  # Sort by 'top' and 'x0'
    
#     return final_row_blocks



In [75]:
import pdfplumber

def group_words_by_row(words, y_thresh=5):
    """Groups words into rows based on vertical proximity, allowing small deviations in top values."""
    words = sorted(words, key=lambda w: w['top'])  # Sort words top-to-bottom
    rows = []

    for word in words:
        added = False
        for row in rows:
            # Compare with first word in the row for stability
            if abs(word['top'] - row[0]['top']) <= y_thresh:
                row.append(word)
                added = True
                break
        if not added:
            rows.append([word])

    return rows

def merge_words_in_row(row, x_thresh=10):
    """
    Merges words in a single row, considering the provided x_thresh for horizontal grouping.
    
    Returns:
    - A list of merged text blocks, each with the merged text and bounding box.
    """
    row.sort(key=lambda w: w['x0'])  # Sort words left-to-right
    merged_blocks = []
    current_block = []

    for word in row:
        if current_block and (word['x0'] - current_block[-1]['x1']) <= x_thresh:
            current_block.append(word)
        else:
            if current_block:
                merged_blocks.append(current_block)
            current_block = [word]

    if current_block:
        merged_blocks.append(current_block)

    return [
        {
            "text": " ".join(w["text"] for w in block),
            "x0": min(w["x0"] for w in block),
            "x1": max(w["x1"] for w in block),
            "top": min(w["top"] for w in block),
            "bottom": max(w["bottom"] for w in block),
        }
        for block in merged_blocks
    ]

def merge_words_into_rows(words, x_thresh=10, y_thresh=5):
    """
    Groups words into rows and merges horizontally close words.
    """
    rows = group_words_by_row(words, y_thresh)
    merged_rows = [merge_words_in_row(row, x_thresh) for row in rows]
    print("Merged rows are", merged_rows)
    return merged_rows

def assign_columns_to_blocks(merged_rows, column_gap_thresh=20, ncol=3):
    """
    Assigns a column index to each merged text block by detecting significant gaps in x0 values.
    
    Parameters:
    - merged_rows: List of lists of merged word blocks.
    - column_gap_thresh: Minimum gap to consider as a column boundary.
    
    Returns:
    - A list where each element is a tuple (column_index, word_block_dict).
    """
    all_x_values = sorted(set(block["x0"] for row in merged_rows for block in row))

    # Detect gaps to determine column boundaries
    column_boundaries = [all_x_values[0]]
    for i in range(1, len(all_x_values)):
        if all_x_values[i] - all_x_values[i - 1] > column_gap_thresh:
            column_boundaries.append(all_x_values[i])

    def get_column_index(x0):
        """Finds the appropriate column index for a given x0 value."""
        for i, boundary in enumerate(column_boundaries):
            if x0 < boundary:
                return max(i - 1, 0)
        return len(column_boundaries) - 1

    structured_output = []
    for idx,row in enumerate(merged_rows):
        row_output = [((idx, (get_column_index(block["x0"]))), block) for block in row] # Store row and column indices for each block with that block
        # row_output = [row for row in row_output if row[0] <= ncol]
        structured_output.append(row_output)

    return structured_output




In [76]:
# Example usage
header_x_thresh = 10
header_y_thresh = 1
column_gap_thresh = 20  # Adjust based on observed spacing
ncol = 3

with pdfplumber.open(filename) as pdf:
    words = pdf.pages[-1].extract_words()  # Extract words from page 0
    merged_rows = merge_words_into_rows(words, header_x_thresh, header_y_thresh)
    structured_output = assign_columns_to_blocks(merged_rows, column_gap_thresh)

for row in structured_output:
    print(row)  # Prints the structured output



Merged rows are [[{'text': 'PLUTO DATA DICTIONARY', 'x0': 77.4, 'x1': 224.05204799999993, 'top': 67.58112000000006, 'bottom': 78.62112000000002}, {'text': 'November 2022 (22v3)', 'x0': 470.04, 'x1': 575.83632, 'top': 67.58112000000006, 'bottom': 78.62112000000002}], [{'text': 'APPENDIX D: LAND USE CATEGORIES', 'x0': 201.96528, 'x1': 410.138832, 'top': 112.46304000000009, 'bottom': 123.50304000000006}], [{'text': 'DCP', 'x0': 77.4, 'x1': 99.97680000000001, 'top': 150.38112, 'bottom': 161.42111999999997}], [{'text': 'LAND', 'x0': 77.4, 'x1': 108.52176, 'top': 162.98112000000003, 'bottom': 174.02112}, {'text': 'DCP LAND', 'x0': 122.28, 'x1': 178.713168, 'top': 162.98112000000003, 'bottom': 174.02112}, {'text': 'DOF/DCP BUILDING CLASSES', 'x0': 212.76, 'x1': 371.62891199999984, 'top': 162.98112000000003, 'bottom': 174.02112}], [{'text': 'USE', 'x0': 77.4, 'x1': 98.79993600000002, 'top': 175.70112000000006, 'bottom': 186.74112000000002}, {'text': 'USE', 'x0': 122.28, 'x1': 143.679936, 'top'

# Next iteration

In [135]:
import pdfplumber

def trim_lines_outside_table(lines, table_top_boundary_text=None, table_bottom_boundary_text=None):
    """Returns the index of the first line to contain the specified table_top_boundary_text

    Args:
        lines (_type_): _description_
        table_top_boundary_text (_type_, optional): _description_. Defaults to None.

    Returns:
        _type_: _description_
    """
    for idx, line in enumerate(lines):
        if table_top_boundary_text is not None and table_top_boundary_text in " ".join(
            [word["text"] for word in line]
        ):
            top_trim_line = idx
            continue
        elif (
            table_bottom_boundary_text is not None
            and table_bottom_boundary_text in " ".join([word["text"] for word in line])
        ):
            bottom_trim_line = idx
            continue
        else:
            continue

    trimmed_lines = [
        line
        for idx, line in enumerate(lines)
        if idx > top_trim_line and idx < bottom_trim_line
    ]
    return trimmed_lines

def group_words_by_row(words, y_thresh=5):
    """Groups words into rows based on vertical proximity, allowing small deviations in top values."""
    words = sorted(words, key=lambda w: w['top'])  # Sort words top-to-bottom
    rows = []

    for word in words:
        added = False
        for row in rows:
            # Compare with first word in the row for stability
            if abs(word['top'] - row[0]['top']) <= y_thresh:
                row.append(word)
                added = True
                break
        if not added:
            rows.append([word])

    return rows

def merge_words_in_row(row, x_thresh=10):
    """
    Merges words in a single row, considering the provided x_thresh for horizontal grouping.
    
    Returns:
    - A list of merged text blocks, each with the merged text and bounding box.
    """
    row.sort(key=lambda w: w['x0'])  # Sort words left-to-right
    merged_blocks = []
    current_block = []

    for word in row:
        if current_block and (word['x0'] - current_block[-1]['x1']) <= x_thresh:
            current_block.append(word)
        else:
            if current_block:
                current_block.sort(key=lambda w: w['top']) # Sort block by top coordinate to get text in each table cell correctly ordered.
                merged_blocks.append(current_block)
            current_block = [word]

    if current_block:
        merged_blocks.append(current_block)

    return [
        {
            "text": " ".join(w["text"] for w in block),
            "x0": min(w["x0"] for w in block),
            "x1": max(w["x1"] for w in block),
            "top": min(w["top"] for w in block),
            "bottom": max(w["bottom"] for w in block),
        }
        for block in merged_blocks
    ]


# def merge_lines_in_row(rows, y_thresh):
#     new_rows = []
#     prev_top = None
#     prev_bottom = None
#     for idx, row in enumerate(rows):
#         top = min(w['top'] for w in row)
#         bottom = min(w['bottom'] for w in row)
#         if prev_top - min(w['top'] for w in row) < y_thresh:

from collections import defaultdict

def merge_lines_in_row(lines, y_thresh):
    merged_lines = []
    
    for line in lines:
        if not merged_lines:
            merged_lines.append(line)
            continue
        
        prev_line = merged_lines[-1]
        
        # Compute merging condition
        min_top_current = min(word["top"] for word in line)
        max_bottom_prev = max(word["bottom"] for word in prev_line)
        
        if min_top_current - max_bottom_prev < y_thresh:
            # Merge into the previous line
            merged_lines[-1].extend(line)
        else:
            # Start a new line
            merged_lines.append(line)

    # Now merge words by `x0` within each line
    result = []
    
    for line in merged_lines:
        grouped = defaultdict(list)
        
        for (_, word) in enumerate(line):
            grouped[word["x0"]].append(word)
        
        merged_words = []
        
        for x0 in sorted(grouped.keys()):  # Preserve order
            words = grouped[x0]
            merged_text = " ".join(w["text"] for w in words)
            x1 = max(w["x1"] for w in words)
            top = min(w["top"] for w in words)
            bottom = max(w["bottom"] for w in words)
            
            merged_words.append({"text": merged_text, "x0": x0, "x1": x1, "top": top, "bottom": bottom})
        
        result.append(merged_words)

    return result



def detect_header_by_uppercase(rows):
    """Identifies the header row by checking if all words are uppercase."""
    header_row = []
    body_rows = []

    for row in rows:
        if all(word["text"].isupper() for word in row):  # All words must be uppercase
            header_row = header_row + row
            print("header_row words:", [w["text"] for w in row])
        else:
            body_rows.append(row)

    return header_row, body_rows


def merge_words_into_rows(words, header_x_thresh, header_y_thresh, body_x_thresh, body_y_thresh):
    """
    Groups words into rows and merges horizontally close words.
    """
    rows = group_words_by_row(words, header_y_thresh)
    trimmed_rows = trim_lines_outside_table(rows, table_top_boundary_text="APPENDIX D: LAND USE CATEGORIES", table_bottom_boundary_text="NOTES:")
    header_row, body_rows = detect_header_by_uppercase(trimmed_rows)
    merged_header = merge_words_in_row(header_row, header_x_thresh)
    print("merged header is", merged_header)
    merged_rows = [merge_words_in_row(row, body_x_thresh) for row in body_rows]
    merged_rows = merge_lines_in_row(merged_rows, body_y_thresh)
    all_rows = [merged_header] + merged_rows
    print("all_rows rows is", all_rows)
    return all_rows

def assign_columns_to_blocks(merged_rows, column_gap_thresh=20, ncol=3):
    """
    Assigns a column index to each merged text block by detecting significant gaps in x0 values.
    
    Parameters:
    - merged_rows: List of lists of merged word blocks.
    - column_gap_thresh: Minimum gap to consider as a column boundary.
    
    Returns:
    - A list where each element is a tuple (column_index, word_block_dict).
    """
    all_x_values = sorted(set(block["x0"] for row in merged_rows for block in row))

    # Detect gaps to determine column boundaries
    column_boundaries = [all_x_values[0]]
    for i in range(1, len(all_x_values)):
        if all_x_values[i] - all_x_values[i - 1] > column_gap_thresh:
            column_boundaries.append(all_x_values[i])

    def get_column_index(x0):
        """Finds the appropriate column index for a given x0 value."""
        for i, boundary in enumerate(column_boundaries):
            if x0 < boundary:
                return max(i - 1, 0)
        return len(column_boundaries) - 1

    structured_output = []
    for idx,row in enumerate(merged_rows):
        row_output = [((idx, (get_column_index(block["x0"]))), block) for block in row] # Store row and column indices for each block with that block
        # row_output = [row for row in row_output if row[0] <= ncol]
        structured_output.append(row_output)

    return structured_output




In [140]:
# Example usage
header_x_thresh = 10
header_y_thresh = 20
body_x_thresh = 5
body_y_thresh = 10
column_gap_thresh = 20  # Adjust based on observed spacing
ncol = 3

with pdfplumber.open(filename) as pdf:
    words = pdf.pages[-1].extract_words()  # Extract words from page 0
    merged_rows = merge_words_into_rows(words, header_x_thresh, header_y_thresh, body_x_thresh, body_y_thresh)
    structured_output = assign_columns_to_blocks(merged_rows, column_gap_thresh)




header_row words: ['DCP', 'LAND', 'DCP', 'LAND', 'DOF/DCP', 'BUILDING', 'CLASSES']
header_row words: ['USE', 'USE', 'CODE', 'CATEGORIES']
merged header is [{'text': 'DCP LAND USE CODE', 'x0': 77.4, 'x1': 109.23494400000001, 'top': 150.38112, 'bottom': 199.34112000000005}, {'text': 'DCP LAND USE CATEGORIES', 'x0': 122.28, 'x1': 195.725808, 'top': 162.98112000000003, 'bottom': 199.34112000000005}, {'text': 'DOF/DCP BUILDING CLASSES', 'x0': 212.76, 'x1': 371.62891199999984, 'top': 162.98112000000003, 'bottom': 174.02112}]
all_rows rows is [[{'text': 'DCP LAND USE CODE', 'x0': 77.4, 'x1': 109.23494400000001, 'top': 150.38112, 'bottom': 199.34112000000005}, {'text': 'DCP LAND USE CATEGORIES', 'x0': 122.28, 'x1': 195.725808, 'top': 162.98112000000003, 'bottom': 199.34112000000005}, {'text': 'DOF/DCP BUILDING CLASSES', 'x0': 212.76, 'x1': 371.62891199999984, 'top': 162.98112000000003, 'bottom': 174.02112}], [{'text': '01', 'x0': 77.4, 'x1': 88.44, 'top': 214.10928, 'bottom': 225.1492799999999

In [141]:
for row in structured_output:
    print(row)  # Prints the structured output


[((0, 0), {'text': 'DCP LAND USE CODE', 'x0': 77.4, 'x1': 109.23494400000001, 'top': 150.38112, 'bottom': 199.34112000000005}), ((0, 1), {'text': 'DCP LAND USE CATEGORIES', 'x0': 122.28, 'x1': 195.725808, 'top': 162.98112000000003, 'bottom': 199.34112000000005}), ((0, 3), {'text': 'DOF/DCP BUILDING CLASSES', 'x0': 212.76, 'x1': 371.62891199999984, 'top': 162.98112000000003, 'bottom': 174.02112})]
[((1, 0), {'text': '01', 'x0': 77.4, 'x1': 88.44, 'top': 214.10928, 'bottom': 225.14927999999998}), ((1, 1), {'text': 'One & Two Family Buildings', 'x0': 122.28, 'x1': 199.07092799999995, 'top': 214.10928, 'bottom': 237.86928}), ((1, 3), {'text': 'A*,B*,Z0', 'x0': 212.76, 'x1': 256.799664, 'top': 214.10928, 'bottom': 225.14927999999998})]
[((2, 0), {'text': '02', 'x0': 77.4, 'x1': 88.44, 'top': 252.02927999999997, 'bottom': 263.06927999999994}), ((2, 1), {'text': 'Multi-Family Walk-Up Buildings', 'x0': 122.28, 'x1': 181.685136, 'top': 252.02927999999997, 'bottom': 288.38928}), ((2, 3), {'text'

In [None]:
import pdfplumber

def group_words_by_row(words, y_thresh=5):
    """Groups words into rows based on vertical proximity, allowing small deviations in top values."""
    words = sorted(words, key=lambda w: w['top'])  # Sort words top-to-bottom
    rows = []

    for word in words:
        added = False
        for row in rows:
            # Compare with first word in the row for stability
            if abs(word['top'] - row[0]['top']) <= y_thresh:
                row.append(word)
                added = True
                break
        if not added:
            rows.append([word])

    return rows

def merge_words_in_row(row, x_thresh=10):
    """
    Merges words in a single row, considering the provided x_thresh for horizontal grouping.
    
    Returns:
    - A list of merged text blocks, each with the merged text and bounding box.
    """
    row.sort(key=lambda w: w['x0'])  # Sort words left-to-right
    merged_blocks = []
    current_block = []

    for word in row:
        if current_block and (word['x0'] - current_block[-1]['x1']) <= x_thresh:
            current_block.append(word)
        else:
            if current_block:
                merged_blocks.append(current_block)
            current_block = [word]

    if current_block:
        merged_blocks.append(current_block)

    return [
        {
            "text": " ".join(w["text"] for w in block),
            "x0": min(w["x0"] for w in block),
            "x1": max(w["x1"] for w in block),
            "top": min(w["top"] for w in block),
            "bottom": max(w["bottom"] for w in block),
        }
        for block in merged_blocks
    ]

def merge_words_into_rows(words, x_thresh=10, y_thresh=5):
    """
    Groups words into rows and merges horizontally close words.
    """
    rows = group_words_by_row(words, y_thresh)
    merged_rows = [merge_words_in_row(row, x_thresh) for row in rows]
    print("Merged rows are", merged_rows)
    return merged_rows

def assign_columns_to_blocks(merged_rows, column_gap_thresh=20, ncol=3):
    """
    Assigns a column index to each merged text block by detecting significant gaps in x0 values.
    
    Parameters:
    - merged_rows: List of lists of merged word blocks.
    - column_gap_thresh: Minimum gap to consider as a column boundary.
    
    Returns:
    - A list where each element is a tuple (column_index, word_block_dict).
    """
    all_x_values = sorted(set(block["x0"] for row in merged_rows for block in row))

    # Detect gaps to determine column boundaries
    column_boundaries = [all_x_values[0]]
    for i in range(1, len(all_x_values)):
        if all_x_values[i] - all_x_values[i - 1] > column_gap_thresh:
            column_boundaries.append(all_x_values[i])

    def get_column_index(x0):
        """Finds the appropriate column index for a given x0 value."""
        for i, boundary in enumerate(column_boundaries):
            if x0 < boundary:
                return max(i - 1, 0)
        return len(column_boundaries) - 1

    structured_output = []
    for idx,row in enumerate(merged_rows):
        row_output = [((idx, (get_column_index(block["x0"]))), block) for block in row] # Store row and column indices for each block with that block
        # row_output = [row for row in row_output if row[0] <= ncol]
        structured_output.append(row_output)

    return structured_output




In [None]:
# Example usage
header_x_thresh = 10
header_y_thresh = 1
column_gap_thresh = 20  # Adjust based on observed spacing
ncol = 3

with pdfplumber.open(filename) as pdf:
    words = pdf.pages[-1].extract_words()  # Extract words from page 0
    merged_rows = merge_words_into_rows(words, header_x_thresh, header_y_thresh)
    structured_output = assign_columns_to_blocks(merged_rows, column_gap_thresh)

for row in structured_output:
    print(row)  # Prints the structured output



Merged rows are [[{'text': 'PLUTO DATA DICTIONARY', 'x0': 77.4, 'x1': 224.05204799999993, 'top': 67.58112000000006, 'bottom': 78.62112000000002}, {'text': 'November 2022 (22v3)', 'x0': 470.04, 'x1': 575.83632, 'top': 67.58112000000006, 'bottom': 78.62112000000002}], [{'text': 'APPENDIX D: LAND USE CATEGORIES', 'x0': 201.96528, 'x1': 410.138832, 'top': 112.46304000000009, 'bottom': 123.50304000000006}], [{'text': 'DCP', 'x0': 77.4, 'x1': 99.97680000000001, 'top': 150.38112, 'bottom': 161.42111999999997}], [{'text': 'LAND', 'x0': 77.4, 'x1': 108.52176, 'top': 162.98112000000003, 'bottom': 174.02112}, {'text': 'DCP LAND', 'x0': 122.28, 'x1': 178.713168, 'top': 162.98112000000003, 'bottom': 174.02112}, {'text': 'DOF/DCP BUILDING CLASSES', 'x0': 212.76, 'x1': 371.62891199999984, 'top': 162.98112000000003, 'bottom': 174.02112}], [{'text': 'USE', 'x0': 77.4, 'x1': 98.79993600000002, 'top': 175.70112000000006, 'bottom': 186.74112000000002}, {'text': 'USE', 'x0': 122.28, 'x1': 143.679936, 'top'

In [117]:
# import pdfplumber

# def group_words_by_row(words, y_thresh=5):
#     """Groups words into rows based on vertical proximity, allowing small deviations in top values."""
#     words = sorted(words, key=lambda w: w['top'])  # Sort words top-to-bottom
#     rows = []

#     for word in words:
#         added = False
#         for row in rows:
#             # Compare with first word in the row for stability
#             if abs(word['top'] - row[0]['top']) <= y_thresh:
#                 row.append(word)
#                 added = True
#                 break
#         if not added:
#             rows.append([word])

#     return rows

# def merge_words_in_row(row, x_thresh=10):
#     """
#     Merges words in a single row, considering the provided x_thresh for horizontal grouping.
    
#     Returns:
#     - A list of merged text blocks, each with the merged text and bounding box.
#     """
#     row.sort(key=lambda w: w['x0'])  # Sort words left-to-right
#     merged_blocks = []
#     current_block = []

#     for word in row:
#         if current_block and (word['x0'] - current_block[-1]['x1']) <= x_thresh:
#             current_block.append(word)
#         else:
#             if current_block:
#                 merged_blocks.append(current_block)
#             current_block = [word]

#     if current_block:
#         merged_blocks.append(current_block)

#     return [
#         {
#             "text": " ".join(w["text"] for w in block),
#             "x0": min(w["x0"] for w in block),
#             "x1": max(w["x1"] for w in block),
#             "top": min(w["top"] for w in block),
#             "bottom": max(w["bottom"] for w in block),
#         }
#         for block in merged_blocks
#     ]

# def merge_words_into_rows(words, x_thresh=10, y_thresh=5):
#     """
#     Groups words into rows and merges horizontally close words.
#     """
#     rows = group_words_by_row(words, y_thresh)
#     merged_rows = [merge_words_in_row(row, x_thresh) for row in rows]
#     print("Merged rows are", merged_rows)
#     return merged_rows

# def assign_columns_to_blocks(merged_rows, column_gap_thresh=20, ncol=3):
#     """
#     Assigns a column index to each merged text block by detecting significant gaps in x0 values.
    
#     Parameters:
#     - merged_rows: List of lists of merged word blocks.
#     - column_gap_thresh: Minimum gap to consider as a column boundary.
    
#     Returns:
#     - A list where each element is a tuple (column_index, word_block_dict).
#     """
#     all_x_values = sorted(set(block["x0"] for row in merged_rows for block in row))

#     # Detect gaps to determine column boundaries
#     column_boundaries = [all_x_values[0]]
#     for i in range(1, len(all_x_values)):
#         if all_x_values[i] - all_x_values[i - 1] > column_gap_thresh:
#             column_boundaries.append(all_x_values[i])

#     def get_column_index(x0):
#         """Finds the appropriate column index for a given x0 value."""
#         for i, boundary in enumerate(column_boundaries):
#             if x0 < boundary:
#                 return max(i - 1, 0)
#         return len(column_boundaries) - 1

#     structured_output = []
#     for idx,row in enumerate(merged_rows):
#         row_output = [((idx, (get_column_index(block["x0"]))), block) for block in row] # Store row and column indices for each block with that block
#         # row_output = [row for row in row_output if row[0] <= ncol]
#         structured_output.append(row_output)

#     return structured_output




In [None]:
# # Example usage
# header_x_thresh = 10
# header_y_thresh = 1
# column_gap_thresh = 20  # Adjust based on observed spacing
# ncol = 3

# with pdfplumber.open(filename) as pdf:
#     words = pdf.pages[-1].extract_words()  # Extract words from page 0
#     merged_rows = merge_words_into_rows(words, header_x_thresh, header_y_thresh)
#     structured_output = assign_columns_to_blocks(merged_rows, column_gap_thresh)

# for row in structured_output:
#     print(row)  # Prints the structured output



Merged rows are [[{'text': 'PLUTO DATA DICTIONARY', 'x0': 77.4, 'x1': 224.05204799999993, 'top': 67.58112000000006, 'bottom': 78.62112000000002}, {'text': 'November 2022 (22v3)', 'x0': 470.04, 'x1': 575.83632, 'top': 67.58112000000006, 'bottom': 78.62112000000002}], [{'text': 'APPENDIX D: LAND USE CATEGORIES', 'x0': 201.96528, 'x1': 410.138832, 'top': 112.46304000000009, 'bottom': 123.50304000000006}], [{'text': 'DCP', 'x0': 77.4, 'x1': 99.97680000000001, 'top': 150.38112, 'bottom': 161.42111999999997}], [{'text': 'LAND', 'x0': 77.4, 'x1': 108.52176, 'top': 162.98112000000003, 'bottom': 174.02112}, {'text': 'DCP LAND', 'x0': 122.28, 'x1': 178.713168, 'top': 162.98112000000003, 'bottom': 174.02112}, {'text': 'DOF/DCP BUILDING CLASSES', 'x0': 212.76, 'x1': 371.62891199999984, 'top': 162.98112000000003, 'bottom': 174.02112}], [{'text': 'USE', 'x0': 77.4, 'x1': 98.79993600000002, 'top': 175.70112000000006, 'bottom': 186.74112000000002}, {'text': 'USE', 'x0': 122.28, 'x1': 143.679936, 'top'

In [81]:
structured_output.sort(key=lambda row: row[0][0])
structured_output

[[((0, 0),
   {'text': 'PLUTO DATA DICTIONARY',
    'x0': 77.4,
    'x1': 224.05204799999993,
    'top': 67.58112000000006,
    'bottom': 78.62112000000002}),
  ((0, 5),
   {'text': 'November 2022 (22v3)',
    'x0': 470.04,
    'x1': 575.83632,
    'top': 67.58112000000006,
    'bottom': 78.62112000000002})],
 [((1, 3),
   {'text': 'APPENDIX D: LAND USE CATEGORIES',
    'x0': 201.96528,
    'x1': 410.138832,
    'top': 112.46304000000009,
    'bottom': 123.50304000000006})],
 [((2, 0),
   {'text': 'DCP',
    'x0': 77.4,
    'x1': 99.97680000000001,
    'top': 150.38112,
    'bottom': 161.42111999999997})],
 [((3, 0),
   {'text': 'LAND',
    'x0': 77.4,
    'x1': 108.52176,
    'top': 162.98112000000003,
    'bottom': 174.02112}),
  ((3, 1),
   {'text': 'DCP LAND',
    'x0': 122.28,
    'x1': 178.713168,
    'top': 162.98112000000003,
    'bottom': 174.02112}),
  ((3, 3),
   {'text': 'DOF/DCP BUILDING CLASSES',
    'x0': 212.76,
    'x1': 371.62891199999984,
    'top': 162.9811200000000

In [82]:
structured_output[2]

[((2, 0),
  {'text': 'DCP',
   'x0': 77.4,
   'x1': 99.97680000000001,
   'top': 150.38112,
   'bottom': 161.42111999999997})]

In [83]:
header_x_thresh = 10
header_y_thresh = 10
body_x_thresh = 10
body_y_thresh = 20

# merged_rows = merge_words_into_rows(words, header_x_thresh, header_y_thresh, body_x_thresh, body_y_thresh)
merged_rows = merge_words_into_rows(words, header_x_thresh, header_y_thresh)


Merged rows are [[{'text': 'PLUTO DATA DICTIONARY', 'x0': 77.4, 'x1': 224.05204799999993, 'top': 67.58112000000006, 'bottom': 78.62112000000002}, {'text': 'November 2022 (22v3)', 'x0': 470.04, 'x1': 575.83632, 'top': 67.58112000000006, 'bottom': 78.62112000000002}], [{'text': 'APPENDIX D: LAND USE CATEGORIES', 'x0': 201.96528, 'x1': 410.138832, 'top': 112.46304000000009, 'bottom': 123.50304000000006}], [{'text': 'DCP', 'x0': 77.4, 'x1': 99.97680000000001, 'top': 150.38112, 'bottom': 161.42111999999997}], [{'text': 'LAND', 'x0': 77.4, 'x1': 108.52176, 'top': 162.98112000000003, 'bottom': 174.02112}, {'text': 'DCP LAND', 'x0': 122.28, 'x1': 178.713168, 'top': 162.98112000000003, 'bottom': 174.02112}, {'text': 'DOF/DCP BUILDING CLASSES', 'x0': 212.76, 'x1': 371.62891199999984, 'top': 162.98112000000003, 'bottom': 174.02112}], [{'text': 'USE', 'x0': 77.4, 'x1': 98.79993600000002, 'top': 175.70112000000006, 'bottom': 186.74112000000002}, {'text': 'USE', 'x0': 122.28, 'x1': 143.679936, 'top'

In [84]:
for row in merged_rows:
    print(row)

[{'text': 'PLUTO DATA DICTIONARY', 'x0': 77.4, 'x1': 224.05204799999993, 'top': 67.58112000000006, 'bottom': 78.62112000000002}, {'text': 'November 2022 (22v3)', 'x0': 470.04, 'x1': 575.83632, 'top': 67.58112000000006, 'bottom': 78.62112000000002}]
[{'text': 'APPENDIX D: LAND USE CATEGORIES', 'x0': 201.96528, 'x1': 410.138832, 'top': 112.46304000000009, 'bottom': 123.50304000000006}]
[{'text': 'DCP', 'x0': 77.4, 'x1': 99.97680000000001, 'top': 150.38112, 'bottom': 161.42111999999997}]
[{'text': 'LAND', 'x0': 77.4, 'x1': 108.52176, 'top': 162.98112000000003, 'bottom': 174.02112}, {'text': 'DCP LAND', 'x0': 122.28, 'x1': 178.713168, 'top': 162.98112000000003, 'bottom': 174.02112}, {'text': 'DOF/DCP BUILDING CLASSES', 'x0': 212.76, 'x1': 371.62891199999984, 'top': 162.98112000000003, 'bottom': 174.02112}]
[{'text': 'USE', 'x0': 77.4, 'x1': 98.79993600000002, 'top': 175.70112000000006, 'bottom': 186.74112000000002}, {'text': 'USE', 'x0': 122.28, 'x1': 143.679936, 'top': 175.69920000000002,

In [85]:
def trim_lines_outside_table(lines, table_top_boundary_text=None, table_bottom_boundary_text=None):
    """Returns the index of the first line to contain the specified table_top_boundary_text

    Args:
        lines (_type_): _description_
        table_top_boundary_text (_type_, optional): _description_. Defaults to None.

    Returns:
        _type_: _description_
    """
    for idx, line in enumerate(lines):
        if table_top_boundary_text is not None and table_top_boundary_text in " ".join(
            [word["text"] for word in line]
        ):
            top_trim_line = idx
            continue
        elif (
            table_bottom_boundary_text is not None
            and table_bottom_boundary_text in " ".join([word["text"] for word in line])
        ):
            bottom_trim_line = idx
            continue
        else:
            continue

    trimmed_lines = [
        line
        for idx, line in enumerate(lines)
        if idx > top_trim_line and idx < bottom_trim_line
    ]
    return trimmed_lines


def detect_header_by_uppercase(rows):
    """Identifies the header row by checking if all words are uppercase."""
    header_row = []
    body_rows = []

    for row in rows:
        if all(word["text"].isupper() for word in row):  # All words must be uppercase
            header_row = header_row + row
            print("header_row words:", [w["text"] for w in row])
        else:
            body_rows.append(row)

    return header_row, body_rows


def group_words_by_line(words, y_thresh=5):
    """Groups words into rows based on vertical proximity, allowing small deviations in top values."""
    words = sorted(words, key=lambda w: w["top"])  # Sort words top-to-bottom
    rows = []

    for word in words:
        added = False
        for row in rows:
            # Compare with first word in the row for stability
            if abs(word["top"] - row[0]["top"]) <= y_thresh:
                word['top'] = row[0]["top"]
                row.append(word)
                added = True
                break
        if not added:
            rows.append([word])

    return rows


def merge_words_in_line(line, x_thresh=10, ncol=2):
    """
    Merges words in a single line, considering the provided x_thresh for horizontal grouping.

    Returns:
    - A list of merged text blocks, each with the merged text and bounding box.
    """
    print("Row before sort is:", line)
    line.sort(key=lambda w: w["x0"])  # Sort words left-to-right
    merged_blocks = []
    current_block = []

    for word in line:
        if current_block and (word["x0"] - current_block[-1]["x1"]) <= x_thresh:
            current_block.append(word)
        else:
            if current_block:
                merged_blocks.append(current_block)
            current_block = [word]

    if current_block:
        merged_blocks.append(current_block)
    
    if len(merged_blocks) != ncol:
        print(f"Splitting line into columns yielded {len(merged_blocks)} groups instead of the expected {ncol}. Please try to set ncol appropriately. The line in question: {line}")

    return [
        {
            "text": " ".join(w["text"] for w in block),
            "x0": min(w["x0"] for w in block),
            "x1": max(w["x1"] for w in block),
            "top": min(w["top"] for w in block),
            "bottom": max(w["bottom"] for w in block),
        }
        for block in merged_blocks
    ]


def merge_lines_by_y_distance(lines, y_thresh):
    merged_lines = []
    
    for line in lines:
        if not merged_lines:
            merged_lines.append(line)
            continue
        
        prev_line = merged_lines[-1]
        
        # Compute the merging condition
        min_top_current = min(word["top"] for word in line)
        max_bottom_prev = max(word["bottom"] for word in prev_line)
        
        if min_top_current - max_bottom_prev < y_thresh:
            # Merge into the previous line
            merged_lines[-1].extend(line)
            merged_lines[-1].sort(key=lambda w: w["x0"])
            print("merged_lines[-1] is", merged_lines[-1])
        else:
            # Start a new line
            merged_lines.append(line)
    
    return merged_lines



# def group_lines_by_row(lines, y_thresh=5):
#     """Groups words into rows based on vertical proximity, allowing small deviations in top values."""
#     lines = sorted(lines, key=lambda l: min([block["bottom"] for block in l]))  # Sort words top-to-bottom
#     print('Sorted lines before merging into rows are:', lines)
#     lower_bound = None
#     rows = []

#     for idx,line in enumerate(lines):
#         if lower_bound:
#             upper_bound  = min([block['top'] for block in line])
#             if lower_bound - upper_bound <= y_thresh:
#                 print("Part of previous line", line)
#             lower_bound = max([block['bottom'] for block in line])
#         else:
#             lower_bound = max([block['bottom'] for block in line])

#     return rows

# def merge_lines_into_rows(lines, y_thresh=10, ncol=2):
#     """
#     Merges lines are part of a single table row, considering the provided y_thresh for vertical grouping.

#     Returns:
#     - A list of merged text blocks, each with the merged text and bounding box.
#     """
#     for line in lines:
#         line

# def merge_lines_into_rows(lines, x_thresh=10, y_thresh=5, ncol=2):
#     """Some cells have multiple lines, so when multiple lines are in one cell, they need to be merged into one object
#     """
#     tops = [min(i['top'] for i in line) for line in lines]
#     bottoms = [max(i['top'] for i in line) for line in lines]
#     print(f'tops: {tops}, bottoms: {bottoms}')

def merge_lines_into_columns(lines, x_thresh=10, ncol=2):
    """
    Groups words into rows and merges horizontally close words.
    """
    print("Input lines are", lines)
    merged_lines = [merge_words_in_line(line, x_thresh, ncol=ncol) for line in lines]
    print("Merged rows are: ", merged_lines)
    return merged_lines

def merge_row_into_columns(line, x_thresh=10, ncol=2):
    """
    Groups horizontally close words in a single row, hopefully into the appropriate columns.
    """
    merged_row = merge_words_in_line(line, x_thresh, ncol=ncol)
    print("Merged header row is: ", merged_row)
    return merged_row

def assign_columns_to_blocks(merged_rows, column_gap_thresh=20):
    """
    Assigns a column index to each merged text block by detecting significant gaps in x0 values.

    Parameters:
    - merged_rows: List of lists of merged word blocks.
    - column_gap_thresh: Minimum gap to consider as a column boundary.

    Returns:
    - A list where each element is a tuple (column_index, word_block_dict).
    """
    print("Merged rows to be broken into blocks:", merged_rows)
    all_x_values = sorted(set(block["x0"] for row in merged_rows for block in row))

    # Detect gaps to determine column boundaries
    column_boundaries = [all_x_values[0]]
    for i in range(1, len(all_x_values)):
        if all_x_values[i] - all_x_values[i - 1] > column_gap_thresh:
            column_boundaries.append(all_x_values[i])

    def get_column_index(x0):
        """Finds the appropriate column index for a given x0 value."""
        for i, boundary in enumerate(column_boundaries):
            if x0 < boundary:
                return max(i - 1, 0)
        return len(column_boundaries) - 1


def get_faux_table(
    filename,
    page=0,
    ncol=2,
    header_all_uppercase=False,
    table_top_boundary_text=None,
    table_bottom_boundary_text=None,
    header_x_thresh=10,
    header_y_thresh=20,
    body_x_thresh=10,
    body_y_thresh=20,
    column_gap_thresh=20,
):
    """This for extracting table-like arrangements of text in a PDF (which I call "faux-tables"), for cases where the "table" is not explicitly defined as such and thus missed by `pdfplumber.extract_tables()`
    To help correctly define the table columns, this function allows the user to provide some hints to assist in identifying boundaries. For the moment, this is only for this particular faux table, but I hope it turns out to be reusable.

    Args:
        filename (str): Path to the pdf file
        page (int): Page on which the table is located
        ncol (int): Number of columns in the table
        table_top_boundary_text (str): Unique string of text in the last line before the start of the table header
        table_bottom_boundary_text (str): Unique string of text in the first line after the end of the table
        header_x_thresh (float): Horizontal distance within which words will be grouped into the same block, in the header
        header_y_thresh (float): Vertical distance within which words will be grouped into the same block, to account for table rows that have multiple lines of text inside them, in the header.
        body_x_thresh (float): Horizontal distance within which words will be grouped into the same block, in the table body
        body_y_thresh (float): Vertical distance within which words will be grouped into the same block, to account for table rows that have multiple lines of text inside them, in the table body.
        column_gap_thresh (float): Adjust based on observed spacing
        header_all_uppercase (bool): If the header is in all uppercase, that can help with identifying it.
    """

    with pdfplumber.open(filename) as pdf:
        words = pdf.pages[-1].extract_words()  # Extract words from page 0
        lines = group_words_by_line(
            words, y_thresh=1
        )  # Use body row y_thresh for initial grouping
        print('Lines are', lines)
        # if table_top_boundary_text is not None or table_bottom_boundary_text is not None:
        trimmed_lines = trim_lines_outside_table(
            lines,
            table_top_boundary_text=table_top_boundary_text,
            table_bottom_boundary_text=table_bottom_boundary_text,
        )
        if (
            header_all_uppercase is True
        ):  # If header is all uppercase, separate it on that basis
            print("Assuming all caps distinguishes header")
            header_row, body_rows = detect_header_by_uppercase(trimmed_lines)
            print("Freshly extracted header row is", header_row)
            merged_header_row = merge_row_into_columns(
                header_row, x_thresh=header_x_thresh, ncol=ncol
            )
            print("Header row is", header_row)
            print("body_rows are", body_rows)
            merged_body_lines = merge_lines_into_columns(
                body_rows, x_thresh=body_x_thresh, ncol=ncol
            )
            print('merged_body_lines are', merged_body_lines)
            merged_body_rows = merge_lines_by_y_distance(merged_body_lines, y_thresh=5)
            split_body_rows = merge_lines_into_columns(
                merged_body_rows, x_thresh=body_x_thresh, ncol=ncol
            )
            print("merged and re-split body rows are:", split_body_rows)
            merged_all_rows = [merged_header_row] + split_body_rows
        else:
            print("No header specified")
            merged_all_rows = merge_lines_into_columns(
                trimmed_lines, x_thresh=body_x_thresh, ncol=ncol
            )
        structured_output = assign_columns_to_blocks(merged_all_rows, column_gap_thresh)

    return structured_output

    # Step 1: Group words into rows
    rows = group_words_by_row(
        words, body_y_thresh
    )  # Use body row y_thresh for initial grouping

    # Step 2: Detect the header row based on all uppercase words and vertical threshold
    header_row, body_rows = detect_header_by_uppercase(rows, y_thresh=header_y_thresh)

    # Step 3: Merge words in header and body rows with separate thresholds
    all_rows = [header_row] + body_rows  # Ensure headers come first
    row_blocks = []

    # Merge header row with header-specific thresholds
    if header_row:
        row_blocks.append(
            merge_words_in_row(
                header_row, x_thresh=header_x_thresh, y_thresh=header_y_thresh
            )
        )

    # Merge body rows with body-specific thresholds
    for row in body_rows:
        row_blocks.append(
            merge_words_in_row(row, x_thresh=body_x_thresh, y_thresh=body_y_thresh)
        )

In [86]:
get_faux_table(
    filename,
    page=-1,
    ncol=3,
    header_all_uppercase=True,
    table_top_boundary_text="APPENDIX D: LAND USE CATEGORIES",
    table_bottom_boundary_text="NOTES:",
    header_x_thresh=10,
    header_y_thresh=1,
    body_x_thresh=20,
    body_y_thresh=20,
    column_gap_thresh=20,
)

Lines are [[{'text': 'PLUTO', 'x0': 77.4, 'x1': 115.264992, 'top': 67.58112000000006, 'doctop': 41251.58112, 'bottom': 78.62112000000002, 'upright': True, 'height': 11.039999999999964, 'width': 37.864992, 'direction': 'ltr'}, {'text': 'DATA', 'x0': 118.07577600000002, 'x1': 149.19863999999998, 'top': 67.58112000000006, 'doctop': 41251.58112, 'bottom': 78.62112000000002, 'upright': True, 'height': 11.039999999999964, 'width': 31.122863999999964, 'direction': 'ltr'}, {'text': 'DICTIONARY', 'x0': 151.90344, 'x1': 224.05204799999993, 'top': 67.58112000000006, 'doctop': 41251.58112, 'bottom': 78.62112000000002, 'upright': True, 'height': 11.039999999999964, 'width': 72.14860799999994, 'direction': 'ltr'}, {'text': 'November', 'x0': 470.04, 'x1': 518.981424, 'top': 67.58112000000006, 'doctop': 41251.58112, 'bottom': 78.62112000000002, 'upright': True, 'height': 11.039999999999964, 'width': 48.94142399999993, 'direction': 'ltr'}, {'text': '2022', 'x0': 521.760192, 'x1': 543.840192, 'top': 67.

In [87]:
structured_output

[[((0, 0),
   {'text': 'PLUTO DATA DICTIONARY',
    'x0': 77.4,
    'x1': 224.05204799999993,
    'top': 67.58112000000006,
    'bottom': 78.62112000000002}),
  ((0, 5),
   {'text': 'November 2022 (22v3)',
    'x0': 470.04,
    'x1': 575.83632,
    'top': 67.58112000000006,
    'bottom': 78.62112000000002})],
 [((1, 3),
   {'text': 'APPENDIX D: LAND USE CATEGORIES',
    'x0': 201.96528,
    'x1': 410.138832,
    'top': 112.46304000000009,
    'bottom': 123.50304000000006})],
 [((2, 0),
   {'text': 'DCP',
    'x0': 77.4,
    'x1': 99.97680000000001,
    'top': 150.38112,
    'bottom': 161.42111999999997})],
 [((3, 0),
   {'text': 'LAND',
    'x0': 77.4,
    'x1': 108.52176,
    'top': 162.98112000000003,
    'bottom': 174.02112}),
  ((3, 1),
   {'text': 'DCP LAND',
    'x0': 122.28,
    'x1': 178.713168,
    'top': 162.98112000000003,
    'bottom': 174.02112}),
  ((3, 3),
   {'text': 'DOF/DCP BUILDING CLASSES',
    'x0': 212.76,
    'x1': 371.62891199999984,
    'top': 162.9811200000000

In [88]:
ncol = 3
cols = {i:[] for i in range(ncol)}
for row in merged_rows:
    if len(row) <= ncol:
        print(row)
    for n in range(len(row)):
        coldata = [row[n] for row in merged_rows]
        cols[n] = coldata

[{'text': 'PLUTO DATA DICTIONARY', 'x0': 77.4, 'x1': 224.05204799999993, 'top': 67.58112000000006, 'bottom': 78.62112000000002}, {'text': 'November 2022 (22v3)', 'x0': 470.04, 'x1': 575.83632, 'top': 67.58112000000006, 'bottom': 78.62112000000002}]


IndexError: list index out of range

In [None]:
blocks[0]

In [None]:
with open("environment_data/table_dicts.pkl", "wb") as f:
    dill.dump({'table_dicts' : table_dicts}, f)