In [None]:
import pdfplumber
import re
from itertools import tee
import src.models
import src.helpers

In [None]:
# Path to the MapPLUTO data dictionary PDF that the cells below open with pdfplumber.
# NOTE(review): this is an absolute, machine-specific path — adjust it when running elsewhere.
filename = '/home/james/Massive/PROJECTDATA/nyc_real_estate_data/dictionaries/mapPLUTO_data_dictionary.pdf'

In [None]:
# Label that opens every field entry in the data dictionary.
delimiter = "Field Name"

# Extract the text of each page exactly once; extract_text() re-runs the page
# layout analysis on every call, so calling it twice per page doubles the work.
with pdfplumber.open(filename) as pdf:
    page_texts = [page.extract_text() for page in pdf.pages]

# Pages with no extractable text (None or "") are skipped.
full_text = "\n".join(page_text for page_text in page_texts if page_text)

# Split into one chunk per field entry. Everything before the first
# "Field Name:" line (title page, introduction, ...) is discarded by [1:].
split_texts = full_text.split(f'\n{delimiter}:')[1:]
# str.split() consumed the label itself, so put it back at the front of each chunk.
restored_texts = [f'\n{delimiter}: {section}' for section in split_texts]


In [None]:

def split_on_strings(text, delimiters):
    """Split ``text`` into (delimiter, following_text) pairs.

    Args:
        text: The string to split.
        delimiters: Iterable of literal strings (not regexes) that mark the
            start of a segment. Empty strings are ignored.

    Yields:
        Tuples ``(delimiter, segment)`` in order of appearance, where
        ``segment`` is the text between the end of that delimiter and the start
        of the next one (or the end of ``text``). Text that precedes the first
        delimiter is not yielded.
    """
    # Longest first: regex alternation takes the first alternative that matches
    # at a position, so a short delimiter would otherwise shadow a longer one
    # that starts with the same characters.
    literals = sorted((d for d in delimiters if d), key=len, reverse=True)
    if not literals:
        # An empty pattern would match at every position of the text.
        return

    pattern = "|".join(map(re.escape, literals))
    matches = list(re.finditer(pattern, text))
    for index, match in enumerate(matches):
        is_last = index + 1 == len(matches)
        end = len(text) if is_last else matches[index + 1].start()
        yield (match[0], text[match.end():end])

# Section labels that introduce each part of a data-dictionary entry.
delimiters = ["Field Name:", "Format:", "Data Source:", "Description:"]

# One dict per field entry, keyed by the section label without its trailing
# colon ("Field Name", "Format", ...), with the stripped section text as value.
# Built as a comprehension so the loop variables stay local: the cell-level
# `delimiter` string is not overwritten and `definitions` is not bound to a
# throwaway dict here.
entries = [
    {
        label[:-1]: body.strip()
        for label, body in split_on_strings(section_text, delimiters)
    }
    for section_text in restored_texts
]


* Looking at the PLUTO data dictionary, it seems that most category variables are labeled as "alphanumeric" even if they only contain numbers, such as zip codes.
* There are some exceptions: police precincts and districts are numeric and listed as such. However, as there are a limited number of repeating variables, I will treat them as categorical as well.

In [None]:
def parse_field_name(name_string):
    """Split a "Long Name (ShortName)" field label into its two parts.

    Args:
        name_string: Field label, e.g. ``"Community District (CD)"``.

    Returns:
        Tuple ``(long_name, short_name)``, both lower-cased. ``long_name`` is
        the text before the first ``(`` with spaces replaced by underscores.
        ``short_name`` is the content of the first parenthesised group; when
        the label has no parentheses the whole label is used instead.
    """
    long_name = name_string.split('(')[0].strip()
    # Take only what is inside the first pair of parentheses. Substituting with
    # re.sub left any text after ")" attached to the short name.
    match = re.search(r'\((.*?)\)', name_string)
    short_name = match.group(1).strip() if match else name_string
    return long_name.lower().replace(' ', '_'), short_name.lower()

def parse_definitions_table(description_string, table_start_regex):
    """Parse the value/description table embedded in a field description.

    Args:
        description_string: Full description text of a field. The table is
            taken to be everything after the first match of
            ``table_start_regex``.
        table_start_regex: Regular expression matching the table's header line.

    Returns:
        dict mapping the first whitespace-delimited token of each table row to
        the remainder of that row. Empty when the header is not found.
    """
    # Locate the header and keep only what follows it. Substituting the header
    # away with re.sub left the prose before the table in place, so it was
    # fused into the first row.
    header = re.search(table_start_regex, description_string, flags=re.DOTALL)
    if header is None:
        return {}
    table_string = description_string[header.end():]

    d = {}
    for line in table_string.splitlines():
        # split(None, 1) tolerates indented rows and tab/multi-space separators.
        parts = line.split(None, 1)
        if len(parts) == 2:
            key, value = parts
            d[key] = value.strip()
        elif parts:
            # A row with a single token has no description; report it rather
            # than dropping it silently. Blank lines are skipped without noise.
            print(line)
    return d

definitions = []

category_markers = ['TRACT', 'DISTRICT', 'PRECINCT', 'HEALTH AREA']
# Alphanumeric columns that are not good candidates for lookup tables
alphanumeric_exceptions = ['EDesigNum', 'APPBBL']
# Numeric columns that are category codes
numeric_exceptions = ['CD', 'PLUTOMapID']
# Identifies the header line of a value table inside a description
# ("Value <word>" on a line of its own); it seems to work ok here.
# Defined once, outside the loop, because it does not depend on the entry.
table_start_regex = r'\n\s?Value\s[a-zA-Z]+\s?\n'
for entry in entries:
    d = {}
    d['category'] = src.helpers.isCategory(entry, category_markers, alphanumeric_exceptions, numeric_exceptions)
    d['long_name'], d['short_name'] = parse_field_name(entry['Field Name'])
    # An entry only has a 'Description' key when that section label was found
    # in its text, so fall back to an empty description instead of raising.
    description = entry.get('Description', '')
    if re.search(table_start_regex, description):
        d['table'] = parse_definitions_table(description, table_start_regex)
    definitions.append(d)



In [None]:
pdf_path = filename

# Left-edge x coordinate (x0, truncated to an int) of every word in the PDF,
# accumulated page by page.
xvals = []
with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        xvals.extend(int(item['x0']) for item in page.extract_words())


In [None]:
# Display the integer x0 positions collected for every word in the PDF.
xvals

In [None]:

with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        # Reading order: top-to-bottom, ties broken left-to-right.
        ordered = sorted(page.extract_words(), key=lambda w: (w["top"], w["x0"]))

        line_starts = []
        current_top = None
        for candidate in ordered:
            # Words whose top is within 2 units of the current line's top
            # belong to that line and are not line starts.
            on_current_line = (
                current_top is not None
                and abs(candidate["top"] - current_top) <= 2
            )
            if on_current_line:
                continue
            line_starts.append(candidate)
            current_top = candidate["top"]

        # Report the leading word of every detected line on this page.
        for leader in line_starts:
            print(f"First word: {leader['text']} at X: {leader['x0']}, Y: {leader['top']}")


In [None]:
# Exploratory pass: for each page, report lines whose first word starts further
# right than the previous line's first word (a candidate start of an indented
# table), together with an estimated character index of that word.
with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        words = page.extract_words()

        # Reading order: top-to-bottom, then left-to-right.
        words.sort(key=lambda w: (w["top"], w["x0"]))
        first_words = []
        last_top = None
        last_x = None
        table_start = None
        table_end = None
        char_index = 0  # Track estimated character index in the text
        prev_line_startX = None
        for word in words:
            text = word["text"]
            # Computed for every word: it was previously assigned only inside
            # the indented-line branch, so the running index below raised
            # NameError on a fresh kernel.
            word_length = len(text)
            if last_top is None or abs(word["top"] - last_top) > 2:  # New line detected
                # Advance the line marker on every new line; otherwise every
                # following word keeps being treated as the start of a line.
                last_top = word["top"]
                if prev_line_startX is not None:
                    if word["x0"] > prev_line_startX + 1:
                        # Line is indented relative to the previous line.
                        first_words.append(word)
                        table_start = char_index + 1
                        if word_length > 0:
                            first_char_index = char_index
                            last_char_index = char_index + word_length - 1
                            print(f"Word: {text}, Position: {word['x0']}, First Char Index: {first_char_index}, Last Char Index: {last_char_index}")
                    elif word["x0"] < last_x + 1:
                        # NOTE(review): table_end is derived from the word
                        # length, not from char_index like table_start —
                        # confirm this is the intended value.
                        table_end = word_length + 1
                last_x = word["x0"]
                prev_line_startX = word["x0"]
            # Advance the character index (assuming spaces count as 1 character)
            char_index += word_length + 1  # +1 for space

In [None]:
# Maps the space-joined text of each detected line to the list of pdfplumber
# word dicts on that line. Lines with identical text share one key, so a later
# line replaces an earlier one.
fulltext = {}

with pdfplumber.open(pdf_path) as pdf:
    # Running character offset across the whole document (not reset per page).
    char_index = 0
    for page in pdf.pages:
        last_top = None
        words = page.extract_words()
        # Sort words by vertical position (Y-axis) first, then by left position (X-axis)
        words.sort(key=lambda w: (w["top"], w["x0"]))

        page_lines = []
        for word in words:
            word_length = len(word["text"])
            # Estimated (start, end) offsets, assuming one space between words.
            word['range'] = (char_index, char_index + word_length)
            char_index += word_length + 1
            if last_top is None or abs(word["top"] - last_top) > 2:  # New line detected
                last_top = word["top"]
                page_lines.append([word])
            else:
                page_lines[-1].append(word)

        # Store every line only after the page has been fully grouped, so the
        # final line of the page is recorded too (flushing only when the next
        # line started dropped it).
        for line in page_lines:
            print(line)
            fulltext[' '.join([w['text'] for w in line])] = line
        

In [None]:
# Display the mapping from joined line text to the word dicts on that line.
fulltext

In [None]:
# Display the per-field records (category, long/short name, optional value table).
definitions