In [None]:
from src.settings import DATA_FOLDER_PATH
from src.pipelines.term_extraction.pipeline_config import PipelineConfig, SiteLeasePipelineConfig
from src.pipelines.term_extraction.pipeline import Pipeline
import re

In [None]:
# Sample PDF document (under the project's data folder) that the cells below run the pipeline on.
test_file_path = DATA_FOLDER_PATH / "documents/Site Green - Emerald Garden - Cape Fear.pdf"

In [None]:
# Build the term-extraction pipeline from the site-lease configuration and
# run it on the sample document to produce the project preview.
pipeline_config: PipelineConfig = SiteLeasePipelineConfig()
pipeline = Pipeline.from_config(config=pipeline_config)
# NOTE(review): `result` is used further down like a pandas DataFrame
# (`iterrows`, column assignment) — confirm against Pipeline.build_project_preview.
result = pipeline.build_project_preview([test_file_path])

In [None]:
# Display the text returned by the pipeline's processor.
# NOTE(review): presumably the full extracted document text — the output may be very long.
pipeline.processor.get_full_text()

In [None]:
# Display the preview returned by build_project_preview.
result

In [None]:
def extract_paragraph_number(term: str, full_text: str) -> str:
    """Return the last dotted number that appears before ``term`` in ``full_text``.

    The text preceding the first occurrence of ``term`` is scanned for
    numbers such as ``3`` or ``3.4.1``; the one closest to the term is
    returned in full.

    Args:
        term: Citation text to locate (matched literally).
        full_text: Document text to search in.

    Returns:
        The complete number (e.g. ``"3.4"``), or ``""`` when the term is not
        found or no number precedes it.

    Note:
        Any digits count, not only paragraph markers (a time such as
        ``11:59`` yields ``59``), so this is a heuristic.
    """
    term_position = full_text.find(term)
    if term_position == -1:
        return ""

    preceding_text = full_text[:term_position]
    # The group must be non-capturing: with a capturing group, re.findall
    # returns only the group's text (e.g. ".4" or "") instead of the number.
    numbers = re.findall(r'\d+(?:\.\d+)*', preceding_text)
    return numbers[-1] if numbers else ""

In [None]:
def test_extract_paragraph_number(term: str, full_text: str, expected: str) -> None:
    """Assert that ``extract_paragraph_number(term, full_text)`` equals ``expected``."""
    actual = extract_paragraph_number(term, full_text)
    assert actual == expected

In [None]:
# Sample citation term used to exercise the paragraph-number helpers.
# NOTE(review): the text contains what look like OCR artifacts — presumably copied
# verbatim from the extracted document so that it matches `full_text`; do not "correct" it.
term = """(a) The initial term of this Lease (including any extensions or renewals, the “Initi: Ferm”)\nshall commence on the Effective Date and shall end at 11:59 P.M. local time shall terminate on the\nfifteenth (15th) anniversary of the Rent Commencement Date (as hereinafter defined) (the “Expiration\nDate")."""

In [None]:
# Keep the processor's text in a variable for the lookups in the cells below.
full_text = pipeline.processor.get_full_text()

In [None]:
# Hierarchy label the sample `term` is expected to resolve to.
expected = "1.(a)"

In [None]:
# First 8000 characters of `full_text`, for quicker inspection.
short_text = full_text[:8000]

In [None]:
# Display the shortened text.
short_text

In [None]:
def find_hierarchy(full_text, term):
    """Return the paragraph label (e.g. ``"3.(b)"``) of the section containing ``term``.

    ``full_text`` is split at every newline that is followed by a numeric
    marker (``"12."``) or a single-letter marker (``"(a)"``). The chunks are
    walked in order while tracking the most recent number and letter; the
    first chunk containing the first line of ``term`` decides the result.

    Args:
        full_text: Document text to search.
        term: Text to locate; only its first line (after stripping
            surrounding whitespace) is used for matching.

    Returns:
        ``"<num>.(<letter>)"`` when the match lies in a lettered subparagraph,
        ``"<num>"`` when it lies directly under a numbered paragraph, and
        ``""`` when the term is empty or not found. A lettered chunk that
        precedes any numbered paragraph yields a label with an empty number
        part (``".(a)"``).
    """
    # Only the first line of the term is matched against the text.
    term_first_line = term.strip().split('\n')[0]
    if not term_first_line:
        # An empty needle is "contained" in every string, which would wrongly
        # attribute the term to the first chunk of the document.
        return ''

    # Split the full text into chunks that start at an identifiable marker.
    paragraphs = re.split(r'\n(?=\d+\.)|\n(?=\([a-z]\))', full_text)

    # Current numerical and letter parts of the hierarchy.
    current_num = ''
    current_letter = ''

    for para in paragraphs:
        num_match = re.match(r'^(\d+)\.', para)
        letter_match = re.match(r'^\(([a-z])\)', para)

        if num_match:
            # A new numbered paragraph starts: its letters start over as well.
            current_num = num_match.group(1)
            current_letter = ''
        elif letter_match:
            current_letter = letter_match.group(1)

        if term_first_line in para:
            if current_letter:
                return f"{current_num}.({current_letter})"
            return current_num

    # The term does not occur in any chunk.
    return ''

In [None]:
# Show the entry labelled 39 of the 'Predicted Legal Terms' column.
result['Predicted Legal Terms'][39]

In [None]:
# Look up the hierarchy label for that same predicted term.
find_hierarchy(full_text, result['Predicted Legal Terms'][39])

In [None]:
import difflib

def get_overlap(s1, s2):
    """Return the longest block of ``s1`` that ``difflib`` finds in ``s2`` as well.

    The result is a slice of ``s1``; it is the empty string when the two
    sequences share nothing.
    """
    matcher = difflib.SequenceMatcher(None, s1, s2)
    longest = matcher.find_longest_match(0, len(s1), 0, len(s2))
    start = longest.a
    return s1[start:start + longest.size]
# Attach a paragraph label to every predicted term and store both the
# annotated term and the bare label as new columns of `result`.
terms_with_paragraph_numbers = []
para_nums = []
for _, row in result.iterrows():
    term_text = row['Predicted Legal Terms']
    para_num = find_hierarchy(full_text, term_text)
    para_nums.append(para_num)

    # Does the start of the term already carry (part of) the label, e.g. "(a)"
    # for "1.(a)"? Only the first 10 characters are considered.
    prefix_overlap = get_overlap(term_text[:10], para_num)
    if prefix_overlap:
        # Complete the partial marker in place. Replace only the first
        # occurrence of the *prefix* overlap: replacing every occurrence (or an
        # overlap found deeper in the term) would rewrite unrelated text.
        # NOTE(review): a one-character incidental overlap (e.g. the letter
        # "a") still triggers this branch — the heuristic itself is unchanged.
        terms_with_paragraph_numbers.append(term_text.replace(prefix_overlap, para_num, 1))
    else:
        # No marker at the start of the term: prepend the label.
        terms_with_paragraph_numbers.append(para_num + term_text)
result['terms_with_paragraph_num'] = terms_with_paragraph_numbers
result['paragraph_number'] = para_nums

In [None]:
# Display `result` again, now including the two paragraph-number columns added above.
result

In [None]:
import re

# Example text with full ("1.(a)") and partial (".(a)") paragraph markers.
# It has its own name so that the document text held in `full_text` is not
# overwritten for the cells that follow.
partial_numbers_example = """This is paragraph 1. Following is 1.(a), which is detailed. However, in .(a), we see an overlap. Then there's paragraph 2. and 2.(a). Again, in .(a) there's an overlap."""

# Function to replace partial paragraph numbers with full numbers
def replace_partial_numbers(full_text):
    """Complete partial markers such as ``.(a)`` with the current paragraph number.

    The text is scanned left to right. Every full marker (``"2."`` or
    ``"2.(a)"``) updates the current paragraph number; every partial marker
    (``".(a)"`` with no number in front) is rewritten to
    ``"<current number>.(a)"``.

    Args:
        full_text: Text that may contain partial hierarchical markers.

    Returns:
        The text with partial markers completed. Partial markers that appear
        before any numbered marker are left unchanged.

    Note:
        Any number directly followed by a period counts as a paragraph marker
        (decimals such as ``3.5`` are skipped), so a sentence ending in a
        number also updates the current paragraph. This is a heuristic.
    """
    # Lookbehinds are used instead of \b: a word boundary never matches
    # between two punctuation characters, so r'\b\.' cannot find " .(a),".
    marker_pattern = re.compile(
        r'(?<![\w.])(?P<number>\d+)\.(?!\d)(?:\([a-z]\))?'  # "1." or "1.(a)"
        r'|(?<![\w.])\.(?P<letter>\([a-z]\))'               # ".(a)"
    )
    current_number = None

    def complete(match):
        nonlocal current_number
        if match.group('number') is not None:
            # Full marker: remember its number, keep the text as it is.
            current_number = match.group('number')
            return match.group(0)
        if current_number is None:
            # Nothing to complete the partial marker with yet.
            return match.group(0)
        return f"{current_number}.{match.group('letter')}"

    return marker_pattern.sub(complete, full_text)

# Replace the partial paragraph numbers in the example text
updated_text = replace_partial_numbers(partial_numbers_example)
print(updated_text)


In [None]:
# Peek at characters 1500-6000 of `full_text`.
full_text[1500:6000]

In [None]:
# Small hand-written document mixing numbered, lettered and roman-numeral
# markers (some lines indented by one space), used to try out hierarchy fixes.
example_text = """Preamble to the file \n1.\nFirst paragraph \n(a) First subparagraph to the paragraph 1. \n (b) Another subparagraph \n (i) New Subparagraph to subparagraph (b) \n 2. New paragraph \n 1. subparagraph to paragraph 2."""

In [None]:
import re

def fix_hierarchy(text):
    """Prefix sub-paragraph markers with the markers of their enclosing paragraphs.

    Each line is checked for a leading marker (after optional indentation):

    * ``"N."`` with N greater than the current paragraph number starts a new
      top-level paragraph; the line is kept as it is.
    * ``"N."`` with N not greater than the current paragraph number is treated
      as a numbered subparagraph and becomes ``"<current>.N."``.
    * ``"(x)"`` becomes ``"<current>.(x)"``.
    * A marker made only of roman-numeral letters (``i v x l c``) that is not
      the next letter after the current lettered subparagraph is treated as a
      roman sub-item and becomes ``"<current>.(<letter>).(<roman>)"``.

    Indentation and the text after the marker are preserved. Lines without a
    marker, and lettered markers that appear before any numbered paragraph,
    are left unchanged.

    Args:
        text: Document text with one paragraph marker per line.

    Returns:
        The text with fully qualified markers.

    Note:
        Restart detection for nested numbers and the letter/roman distinction
        are heuristics. Lettered items below a numbered subparagraph are
        qualified with the top-level number only.
    """
    number_marker = re.compile(r'^(?P<indent>\s*)(?P<num>\d+)\.(?!\d)')
    letter_marker = re.compile(r'^(?P<indent>\s*)\((?P<label>[a-z]+)\)')
    roman_letters = set('ivxlc')

    current_num = None    # number of the enclosing top-level paragraph
    current_alpha = None  # letter of the enclosing lettered subparagraph
    fixed_lines = []

    for line in text.split('\n'):
        number = number_marker.match(line)
        letter = letter_marker.match(line)

        if number:
            num = number.group('num')
            current_alpha = None  # letters start over below every numbered line
            if current_num is None or int(num) > int(current_num):
                current_num = num
                fixed_lines.append(line)
            else:
                # Numbering restarted: this is a subparagraph of current_num.
                rest = line[number.end():]
                fixed_lines.append(f"{number.group('indent')}{current_num}.{num}.{rest}")
        elif letter and current_num is not None:
            label = letter.group('label')
            rest = line[letter.end():]
            # "(i)" directly after "(h)" continues the lettered sequence.
            follows_alpha = (
                current_alpha is not None
                and len(label) == 1
                and len(current_alpha) == 1
                and ord(label) == ord(current_alpha) + 1
            )
            is_roman = (
                current_alpha is not None
                and set(label) <= roman_letters
                and not follows_alpha
            )
            if is_roman:
                prefix = f"{current_num}.({current_alpha}).({label})"
            else:
                current_alpha = label
                prefix = f"{current_num}.({label})"
            fixed_lines.append(f"{letter.group('indent')}{prefix}{rest}")
        else:
            fixed_lines.append(line)

    return '\n'.join(fixed_lines)

# Run fix_hierarchy on the sample text and print the result.
# `expected_output` holds the text the function is meant to produce; it is not
# asserted here, so compare it with the printed output by eye.
example_text = """Preamble to the file \n1.\nFirst paragraph \n(a) First subparagraph to the paragraph 1. \n (b) Another subparagraph \n (i) New Subparagraph to subparagraph (b) \n 2. New paragraph \n 1. subparagraph to paragraph 2."""
expected_output = """Preamble to the file \n1.\nFirst paragraph \n1.(a) First subparagraph to the paragraph 1. \n 1.(b) Another subparagraph \n 1.(b).(i) New Subparagraph to subparagraph (b) \n 2. New paragraph \n 2.1. subparagraph to paragraph 2."""

fixed_text = fix_hierarchy(example_text)
print(fixed_text)


In [None]:
# Show the raw string (with escaped newlines) returned by fix_hierarchy.
fixed_text

In [None]:
def find_paragraph_number(term: str, full_text: str) -> str:
    """Placeholder for a paragraph-number lookup that was never written.

    The original cell held only an incomplete ``def`` line, which is a syntax
    error. This stub keeps the name defined and fails loudly when called.

    NOTE(review): the signature mirrors ``extract_paragraph_number`` and is an
    assumption — adjust it once the intended behaviour is known.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError("find_paragraph_number has not been implemented yet")

In [None]:
# Hand-written sample document (no indentation) with numbered, lettered and
# roman-numeral markers; most lines end with a trailing space.
example_input = """Preamble to the file 
1.
First paragraph 
(a) First subparagraph to the paragraph 1. 
(b) Another subparagraph 
(i) New Subparagraph to subparagraph (b) 
2. New paragraph 
1. subparagraph to paragraph 2."""

# Split the input into lines
lines = example_input.split('\n')

# Function to determine if a line is a paragraph, subparagraph, or neither
def get_line_type(line):
    """Classify a line by the marker it starts with.

    Returns:
        ``'paragraph'`` for a leading ``"N."``, ``'subsubparagraph'`` for a
        marker made only of the letter i (``"(i)"``, ``"(ii)"``, ...),
        ``'subparagraph'`` for any other lowercase marker such as ``"(a)"``,
        and ``'text'`` otherwise.

    Note:
        The numeric check looks at the line as given, while the parenthesised
        checks ignore surrounding whitespace, so an indented ``" 2."`` is
        classified as ``'text'``. ``"(i)"`` is always read as a roman numeral,
        even where it could be the ninth lettered item.
    """
    stripped = line.strip()
    if re.match(r'^\d+\.', line):
        return 'paragraph'
    # Test the roman pattern first: "(i)" also matches the lettered pattern
    # below, which would make this branch unreachable.
    if re.match(r'^\([i]+\)', stripped):
        return 'subsubparagraph'
    if re.match(r'^\([a-z]+\)', stripped):
        return 'subparagraph'
    return 'text'

# Process lines to append correct numbering
# Walk the lines, remember the enclosing paragraph / subparagraph, and
# prefix every sub-level marker with the markers of its parents.
current_paragraph = None
current_subparagraph = None
output_lines = []
for line in lines:
    line_type = get_line_type(line)
    stripped = line.strip()

    if line_type == 'paragraph':
        current_paragraph = re.match(r'^(\d+)\.', line).group(1)
        current_subparagraph = None
        output_lines.append(line)
    elif line_type in ('subparagraph', 'subsubparagraph') and current_paragraph is not None:
        if line_type == 'subsubparagraph' and current_subparagraph is not None:
            # Nested item: qualify it with both the paragraph and the subparagraph.
            output_lines.append(f'{current_paragraph}.({current_subparagraph}).{stripped}')
        else:
            marker = re.match(r'^\(([a-z]+)\)', stripped)
            current_subparagraph = marker.group(1)
            # Continue after the whole "(x)" marker; slicing at a fixed offset
            # of 2 kept the closing parenthesis and produced "1.(a)) ...".
            output_lines.append(f'{current_paragraph}.({current_subparagraph}){stripped[marker.end():]}')
    else:
        # Plain text, or a marker with no enclosing paragraph to qualify it with.
        output_lines.append(line)

expected_output = '\n'.join(output_lines)
print(expected_output)


In [None]:
# Hand-written sample document with numbered, lettered and roman-numeral markers.
example_input = """Preamble to the file 
1.
First paragraph 
(a) First subparagraph to the paragraph 1. 
(b) Another subparagraph 
(i) New Subparagraph to subparagraph (b) 
2. New paragraph 
1. subparagraph to paragraph 2."""

# Split the input into lines
lines = example_input.split('\n')

# Initialize variables to keep track of the hierarchy
current_paragraph = ''
current_subparagraph = ''
current_subsubparagraph = ''
output_lines = []

for line in lines:
    stripped = line.strip()
    marker = re.match(r'\(([^()]+)\)', stripped)
    numbered = re.match(r'(\d+)\.(?!\d)', stripped)

    if marker and current_paragraph:
        label = marker.group(1)
        rest = stripped[marker.end():].strip()
        # "(i)" directly after "(h)" continues the lettered sequence.
        continues_letters = (
            len(label) == 1
            and len(current_subparagraph) == 1
            and ord(label) == ord(current_subparagraph) + 1
        )
        is_nested = bool(current_subparagraph) and (
            not label.isalpha()
            or (re.fullmatch(r'[ivxlc]+', label) is not None and not continues_letters)
        )
        if is_nested:
            # Roman-numeral (or non-letter) marker below a lettered subparagraph.
            current_subsubparagraph = label
            output_lines.append(
                f"{current_paragraph}.({current_subparagraph}).({current_subsubparagraph}) {rest}"
            )
        else:
            # Lettered subparagraph directly below the current paragraph.
            current_subparagraph = label
            current_subsubparagraph = ''
            output_lines.append(f"{current_paragraph}.({current_subparagraph}) {rest}")
    elif numbered:
        number = numbered.group(1)
        current_subparagraph = ''
        current_subsubparagraph = ''
        if not current_paragraph or int(number) > int(current_paragraph):
            # A higher number starts a new main paragraph.
            current_paragraph = number
            output_lines.append(line)
        else:
            # Numbering restarted: numbered subparagraph of the current paragraph.
            output_lines.append(f"{current_paragraph}.{stripped}")
    else:
        # Regular text, preamble, or a marker with no enclosing paragraph.
        output_lines.append(line)

# Nested numbering is resolved in the loop above; the name is kept for any
# cell that still reads it.
corrected_output_lines = list(output_lines)

expected_output = '\n'.join(corrected_output_lines)
print(expected_output)
