In [None]:
import pdfplumber
import re
import dill
from itertools import tee
from src.models import ColCustomization
import src.helpers
import src.pdfutils

In [None]:
filename = '/home/james/Massive/PROJECTDATA/nyc_real_estate_data/dictionaries/mapPLUTO_data_dictionary.pdf'

* Looking at the PLUTO data dictionary, it seems that most category variables are labeled as "alpahnumeric" even if they only contain numbers, such as zip codes.
* There are some exceptions, police precincts and districts are numeric and listed as such. However as there a limited number of repeating variables, I wil treat them as categorical as well.

In [None]:
fulltext = src.pdfutils.map_pdf(filename) 

In [None]:
fulltext.keys()

In [None]:
def normalize_top(objects, tolerance=0.3):
    """Adjust 'top' values so that small variations within tolerance are treated as equal. This is necessary when parsing PDFs where words on a line may have slightly different top positions. Made with the help of ChatGPT"""
    sorted_by_top = sorted(objects, key=lambda w: w["top"])
    clusters = []

    for object in sorted_by_top:
        if not clusters or abs(object["top"] - clusters[-1][0]) > tolerance:
            clusters.append((object["top"], []))  # Create new cluster
        clusters[-1][1].append(object)

    # Assign the lowest top value in each cluster
    top_mapping = {}
    for cluster_top, cluster_objects in clusters:
        for object in cluster_objects:
            top_mapping[object["top"]] = cluster_top

    sorted_objects = sorted(objects, key=lambda w: (top_mapping[w["top"]], w["x0"]))
    return sorted_objects


def map_pdf2(pdf_path, same_line_tolerance=0.1):
    with pdfplumber.open(pdf_path) as pdf:
        char_index = 0
        all_lines = []  # Store all detected lines first
        for page in pdf.pages:
            page_lines = []
            last_top = None
            words = page.extract_words()  
            normalized_words = normalize_top(words, same_line_tolerance)
            rects = [
                    {"text": "---section---", "top": r["top"], 'x0' : r['x0'], 'x1' : r['x1']}  # Dummy marker for sorting
                    for r in page.objects["rect"]
                    if r["width"] > page.width*0.5 and r["height"] < 2  and r['non_stroking_color'] is not None and r['non_stroking_color'][0] < 0.902 # Adjust thresholds as needed
                ]
            normalized_rects = normalize_top(rects, same_line_tolerance)
            elements = normalized_words + normalized_rects
            elements_sorted = sorted(elements, key=lambda e: e["top"])
            line = []
            for word in elements_sorted:
                if word['text'] == "---section---":
                    if line:  # Ensure the last line is stored before adding the section break
                        page_lines.append((line_range, line))
                    page_lines.append(word['text'])  # Append section break
                    line = []  # Reset line
                    last_top = word["top"]
                    continue

                word_length = len(word["text"])
                word['range'] = (char_index, char_index + word_length)
                char_index += word_length + 1

                if last_top is None or abs(word["top"] - last_top) > same_line_tolerance:  # New line
                    line_start = word['range'][0]
                    line_end = word['range'][1]
                    line_range = (line_start, line_end)
                    if last_top is not None and line:  # Save previous line before starting a new one
                        page_lines.append((line_range, line))
                    last_top = word["top"]
                    line = [word]
                else:
                    line.append(word)

            # Ensure the last line is stored after the loop ends
            if line:
                page_lines.append((line_range, line))

            # Store lines, skipping first and last. The first line is the title, and thus identical for each page, the last line is just the page number.
            line_no = 0
            # print(page_lines)
            for line in page_lines[1:-1]:  # Slice to remove first and last lines
                if line == "---section---":
                    print("New section!")
                    all_lines.append(line)
                    continue
                char_range = (next(iter(line[1]))['range'][0],  next(reversed(line[1]))['range'][1])
                x_range = (next(iter(line[1]))['x0'],  next(reversed(line[1]))['x1'])
                line_info = {'range' : char_range, 'x_dims' : x_range }
                line_content = ' '.join([w['text'] for w in line[1]])
                all_lines.append((line_content, line_info, line))
                line_no += 1
    return all_lines


In [178]:
fulltext2 = map_pdf2(filename) 

New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!
New section!

In [179]:
fulltext2

[('Table of Contents',
  {'range': (43, 60), 'x_dims': (252.72, 359.32572)},
  ((61, 71),
   [{'text': 'Table',
     'x0': 252.72,
     'x1': 287.030952,
     'top': 95.44511999999997,
     'doctop': 95.44511999999997,
     'bottom': 109.48512000000005,
     'upright': True,
     'height': 14.040000000000077,
     'width': 34.310952000000015,
     'direction': 'ltr',
     'range': (43, 48)},
    {'text': 'of',
     'x0': 290.517084,
     'x1': 302.268564,
     'top': 95.44511999999997,
     'doctop': 95.44511999999997,
     'bottom': 109.48512000000005,
     'upright': True,
     'height': 14.040000000000077,
     'width': 11.751480000000015,
     'direction': 'ltr',
     'range': (49, 51)},
    {'text': 'Contents',
     'x0': 305.628336,
     'x1': 359.32572,
     'top': 95.44511999999997,
     'doctop': 95.44511999999997,
     'bottom': 109.48512000000005,
     'upright': True,
     'height': 14.040000000000077,
     'width': 53.697384,
     'direction': 'ltr',
     'range': (52, 60)

In [None]:
fields = {}

integer_markers = ['year']
category_markers = ['code', 'category', 'class', 'district', 'precinct', 'company', 'name', 'health_area']


patterns = [
    (re.compile(r'[ ,–]+'), '_'),
    (re.compile(r'#'), 'num'),
    (re.compile(r'/'), 'or'),
    (re.compile(r'&'), 'and')
]

def clean_name(full_name):
    new_name = full_name.lower()
    for pattern, replacement in patterns:
        new_name = pattern.sub(replacement, new_name)
    return new_name

def get_word_starts_x(line):
   starts = [word['x0'] for word in line]
   return starts


def parse_table(table):
    rows = []
    k1,k2 = table[0][0].split(' ', 1)
    for i in table[1:]:
        row = i[0].split(' ', 1)
        rows.append({k1: row[0], k2: row[1] })
    return rows

tables = {}
in_description = False
in_table = False

all_mods = []

for key,value in fulltext.items():
    line = key[1]
    line_start = value[0]['range'][0]
    line_start = value[0]['range'][1]
    # Make sure description is set to False at the beginning of each definition, as 'Field Name' is the first part of each definition
    if line.startswith('Field Name:'):
        if len(value[1][1]) > 2: # Exclude the explanation of "Field Name" itself on page 3
            col_mods = ColCustomization(short_name=value[1][1][-1]['text'][1:-1]) # Get the field name minus the enclosing parentheses
            full_name = ' '.join(word['text'] for word in value[1][1][2:-1])
            new_name = clean_name(full_name.lower())
            print(new_name)
            is_fk = any([word in new_name for word in category_markers])
            col_mods.new_name = new_name
    if line.startswith('Format:'):
        if "Alphanumeric" in ' '.join([word for word in line]):
            col_mods.dtype = "String"
    if line.startswith('Description:'):
        in_description = True
        prev_line_start = line_start
    if in_description is True:
        if (line.startswith('Value') or line.startswith('VALUE')) and len(value[1][1]) <= 3: # Maximum number of words in a column heading
            col_starts = get_word_starts_x(value[1][1])
            in_table = True
            prev_line_start = line_start
            table = [(line, value)]
        elif in_table is True and (abs(col_starts[0] - get_word_starts_x(value[1][1])[0]) < .5) :
            table.append((line, value))
        elif in_table is True:
            table = parse_table(table)
            col_mods.definitions = table
            in_table = False
    
all_mods.append(col_mods)


In [None]:
print(all_mods)

In [None]:
# table_dicts = {}

# for field,table in tables.items():
#     rows = []
#     k1,k2 = table[0][0].split(' ', 1)
#     for i in table[1:]:
#         row = i[0].split(' ', 1)
#         rows.append({k1: row[0], k2: row[1] })
#     table_dicts[field] = rows

In [None]:
# table_dicts

In [None]:
# def get_word_starts_x(line):
#    starts = [word['x0'] for word in line]
#    return starts


# tables = {}
# in_description = False
# in_table = False

# wcols_to_rename = {}
# lookup_columns = []

# for key,value in fulltext.items():
#     line = key[1]
#     line_start = value[0]['range'][0]
#     line_start = value[0]['range'][1]
#     # Make sure description is set to False at the beginning of each definition, as 'Field Name' is the first part of each definition
#     if line.startswith('Field Name:'):
#         in_description = False
#         in_table = False
#         field_name = value[1][1][-1]['text'][1:-1] # Get the field name minus the enclosing parentheses
#         full_name = ' '.join(word['text'] for word in value[1][1][2:-1])
#         new_name = re.sub('[ ,]', '_', full_name.strip().lower())
#         new_name = re.sub('[#]', 'num', new_name)
#         print(field_name, full_name, new_name)
#         cols_to_rename[field_name] = new_name
#         if len(value[1][1]) > 2: # Exclude the explanation of "Field Name" itself on page 3
#             field_name = value[1][1][-1]['text'][1:-1] # Get the field name minus the enclosing parentheses
#         table = []
#         continue
#     # Detect the beginning of a description section, which might contain a table (anything outside it does not contain a table I am interested in)
#     if line.startswith('Description:'):
#         in_description = True
#         prev_line_start = line_start
#     if in_description is True:
#         if (line.startswith('Value') or line.startswith('VALUE')) and len(value[1][1]) <= 3: # Maximum number of words in a column heading
#             col_starts = get_word_starts_x(value[1][1])
#             in_table = True
#             prev_line_start = line_start
#             table = [(line, value)]
#         elif in_table is True and (abs(col_starts[0] - get_word_starts_x(value[1][1])[0]) < .5) :
#             table.append((line, value))
#         elif in_table is True:
#             tables[field_name] = table
#             in_table = False

# table_dicts = {}

# for field,table in tables.items():
#     rows = []
#     k1,k2 = table[0][0].split(' ', 1)
#     for i in table[1:]:
#         row = i[0].split(' ', 1)
#         rows.append({k1: row[0], k2: row[1] })
#     table_dicts[field] = rows


In [None]:
# for k,d in table_dicts.items():
#     print(k, d)

In [None]:

def parse_zoning(pdf_path):
    all_tables = {}
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            # Extract raw text as lines
            lines = page.extract_text().splitlines()
            # Extract tables
            tables = page.extract_tables()
            for table_index, table in enumerate(tables):
                # Find the position of the table in the raw text
                table_start_line = find_table_start(lines, table)
                # Extract the line before the table, if available
                label_line = (
                    lines[table_start_line - 2] if table_start_line > 0 else None
                )
                table = [row for row in table if "Abbreviation" not in row]
                if label_line is not None:
                    if "APPENDIX" in label_line:
                        label_line = re.sub("APPENDIX.*: ", "", label_line)
                        label_line = re.sub(" +", "_", label_line.lower())
                        prev_label_line = label_line
                    elif "PLUTO DATA DICTIONARY" in label_line:
                        label_line = None
                    elif "APPENDIX" not in label_line:
                        print("what's this?: ", print('label_line is', label_line))
                        table = [row for row in table if "Abbreviation" not in row]
                    if label_line != None:
                        all_tables[label_line] = table
                    else:
                        all_tables[prev_label_line] = all_tables[prev_label_line] + table
                else:
                    print('table_index is', table_index)
                    print('missed:', lines[table_start_line])
    return all_tables


def find_table_start(lines, table):
    """
    Identify the start of the table in the text by matching table rows
    """
    for i, line in enumerate(lines):
        # Convert the table's first row into a string and search for it in the text
        table_row = " ".join(str(cell) for cell in table[1] if cell)  # Skip empty cells
        if line in table_row:
            return i
    return -1

* Add tables from appendixes

In [None]:
table_dicts = parse_zoning(filename)

In [None]:
print(table_dicts)

* Parse Appendix D

### Extract the last table, which isn't actually a table, just text arranged in a table-like way.

In [None]:
import pdfplumber

def trim_lines_outside_table(lines, table_top_boundary_text=None, table_bottom_boundary_text=None):
    """Returns the index of the first line to contain the specified table_top_boundary_text

    Args:
        lines (_type_): _description_
        table_top_boundary_text (_type_, optional): _description_. Defaults to None.

    Returns:
        _type_: _description_
    """
    for idx, line in enumerate(lines):
        if table_top_boundary_text is not None and table_top_boundary_text in " ".join(
            [word["text"] for word in line]
        ):
            top_trim_line = idx
            continue
        elif (
            table_bottom_boundary_text is not None
            and table_bottom_boundary_text in " ".join([word["text"] for word in line])
        ):
            bottom_trim_line = idx
            continue
        else:
            continue

    trimmed_lines = [
        line
        for idx, line in enumerate(lines)
        if idx > top_trim_line and idx < bottom_trim_line
    ]
    return trimmed_lines

def group_words_by_row(words, y_thresh=5):
    """Groups words into rows based on vertical proximity, allowing small deviations in top values."""
    words = sorted(words, key=lambda w: w['top'])  # Sort words top-to-bottom
    rows = []

    for word in words:
        added = False
        for row in rows:
            # Compare with first word in the row for stability
            if abs(word['top'] - row[0]['top']) <= y_thresh:
                row.append(word)
                added = True
                break
        if not added:
            rows.append([word])

    return rows

def merge_words_in_row(row, x_thresh=10):
    """
    Merges words in a single row, considering the provided x_thresh for horizontal grouping.
    
    Returns:
    - A list of merged text blocks, each with the merged text and bounding box.
    """
    row.sort(key=lambda w: w['x0'])  # Sort words left-to-right
    merged_blocks = []
    current_block = []

    for word in row:
        if current_block and (word['x0'] - current_block[-1]['x1']) <= x_thresh:
            current_block.append(word)
        else:
            if current_block:
                current_block.sort(key=lambda w: w['top']) # Sort block by top coordinate to get text in each table cell correctly ordered.
                merged_blocks.append(current_block)
            current_block = [word]

    if current_block:
        merged_blocks.append(current_block)

    return [
        {
            "text": " ".join(w["text"] for w in block),
            "x0": min(w["x0"] for w in block),
            "x1": max(w["x1"] for w in block),
            "top": min(w["top"] for w in block),
            "bottom": max(w["bottom"] for w in block),
        }
        for block in merged_blocks
    ]


from collections import defaultdict

def merge_lines_in_row(lines, y_thresh):
    merged_lines = []
    
    for line in lines:
        if not merged_lines:
            merged_lines.append(line)
            continue
        
        prev_line = merged_lines[-1]
        
        # Compute merging condition
        min_top_current = min(word["top"] for word in line)
        max_bottom_prev = max(word["bottom"] for word in prev_line)
        
        if min_top_current - max_bottom_prev < y_thresh:
            # Merge into the previous line
            merged_lines[-1].extend(line)
        else:
            # Start a new line
            merged_lines.append(line)

    # Now merge words by `x0` within each line
    result = []
    
    for line in merged_lines:
        grouped = defaultdict(list)
        
        for (_, word) in enumerate(line):
            grouped[word["x0"]].append(word)
        
        merged_words = []
        
        for x0 in sorted(grouped.keys()):  # Preserve order
            words = grouped[x0]
            merged_text = " ".join(w["text"] for w in words)
            x1 = max(w["x1"] for w in words)
            top = min(w["top"] for w in words)
            bottom = max(w["bottom"] for w in words)
            
            merged_words.append({"text": merged_text, "x0": x0, "x1": x1, "top": top, "bottom": bottom})
        
        result.append(merged_words)

    return result



def detect_header_by_uppercase(rows):
    """Identifies the header row by checking if all words are uppercase."""
    header_row = []
    body_rows = []

    for row in rows:
        if all(word["text"].isupper() for word in row):  # All words must be uppercase
            header_row = header_row + row
            print("header_row words:", [w["text"] for w in row])
        else:
            body_rows.append(row)

    return header_row, body_rows


def merge_words_into_rows(words, header_x_thresh, header_y_thresh, body_x_thresh, body_y_thresh):
    """
    Groups words into rows and merges horizontally close words.
    """
    rows = group_words_by_row(words, header_y_thresh)
    trimmed_rows = trim_lines_outside_table(rows, table_top_boundary_text="APPENDIX D: LAND USE CATEGORIES", table_bottom_boundary_text="NOTES:")
    header_row, body_rows = detect_header_by_uppercase(trimmed_rows)
    merged_header = merge_words_in_row(header_row, header_x_thresh)
    print("merged header is", merged_header)
    merged_rows = [merge_words_in_row(row, body_x_thresh) for row in body_rows]
    merged_rows = merge_lines_in_row(merged_rows, body_y_thresh)
    all_rows = [merged_header] + merged_rows
    print("all_rows rows is", all_rows)
    return all_rows

def assign_columns_to_blocks(merged_rows, column_gap_thresh=20, ncol=3):
    """
    Assigns a column index to each merged text block by detecting significant gaps in x0 values.
    
    Parameters:
    - merged_rows: List of lists of merged word blocks.
    - column_gap_thresh: Minimum gap to consider as a column boundary.
    
    Returns:
    - A list where each element is a tuple (column_index, word_block_dict).
    """
    all_x_values = sorted(set(block["x0"] for row in merged_rows for block in row))

    # Detect gaps to determine column boundaries
    column_boundaries = [all_x_values[0]]
    for i in range(1, len(all_x_values)):
        if all_x_values[i] - all_x_values[i - 1] > column_gap_thresh:
            column_boundaries.append(all_x_values[i])

    def get_column_index(x0):
        """Finds the appropriate column index for a given x0 value."""
        for i, boundary in enumerate(column_boundaries):
            if x0 < boundary:
                return max(i - 1, 0)
        return len(column_boundaries) - 1

    structured_output = []
    for idx,row in enumerate(merged_rows):
        row_output = [((idx, (get_column_index(block["x0"]))), block) for block in row] # Store row and column indices for each block with that block
        # row_output = [row for row in row_output if row[0] <= ncol]
        structured_output.append(row_output)

    return structured_output




In [None]:
# Example usage
header_x_thresh = 10
header_y_thresh = 20
body_x_thresh = 5
body_y_thresh = 10
column_gap_thresh = 20  # Adjust based on observed spacing
ncol = 3

with pdfplumber.open(filename) as pdf:
    words = pdf.pages[-1].extract_words()  # Extract words from page 0
    merged_rows = merge_words_into_rows(words, header_x_thresh, header_y_thresh, body_x_thresh, body_y_thresh)
    structured_output = assign_columns_to_blocks(merged_rows, column_gap_thresh)




In [None]:
for row in structured_output:
    print(row)  # Prints the structured output


In [None]:
# with open("environment_data/table_dicts.pkl", "wb") as f:
#     dill.dump({'table_dicts' : table_dicts}, f)