# PDF Miner

In [14]:
from pdfminer.high_level import extract_pages
from pdfminer.converter import XMLConverter
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LAParams
from io import BytesIO  # Use BytesIO instead of StringIO

In [15]:
pdf_file_path = 'doc_inputs/Form_2287.pdf'
output = BytesIO()

In [16]:
# Set layout analysis parameters
laparams = LAParams()

In [17]:
# Set up resource manager and device
rsrcmgr = PDFResourceManager()
device = XMLConverter(rsrcmgr, output, laparams=laparams)

In [18]:
# Process each page in the PDF
with open(pdf_file_path, 'rb') as file:
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.get_pages(file):
        interpreter.process_page(page)

In [19]:
# Get the XML content as a string
xml_content = output.getvalue().decode('utf-8')

In [20]:
# Optionally, save the XML to a file
with open('doc_outputs/output.xml', 'w') as xml_file:
    xml_file.write(xml_content)

In [21]:
# Close the device and output
device.close()
output.close()

# Layout Parsing for Sections

In [12]:
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBox, LTTextLine
import layoutparser as lp

In [15]:
#pdf_file_path = 'doc_inputs/Form_2287.pdf'
pdf_file_path = 'doc_inputs/Citi_LA.pdf'

In [16]:
elements = []
for page_layout in extract_pages(pdf_file_path):
    for element in page_layout:
        if isinstance(element, LTTextBox):
            for text_line in element:
                if isinstance(text_line, LTTextLine):
                    text = text_line.get_text().strip()
                    bbox = text_line.bbox  # (x0, y0, x1, y1)
                    font_size = text_line.height
                    elements.append({
                        'text': text,
                        'bbox': bbox,
                        'font_size': font_size
                    })

In [17]:
# Output the extracted elements
for element in elements:
    print(f"Text: {element['text']}, BBox: {element['bbox']}, Font Size: {element['font_size']}")

Text: KUTAK ROCK LLP, BBox: (445.63, 706.488, 551.38, 718.488), Font Size: 12.0
Text: DRAFT 11/28/17, BBox: (463.39, 692.308, 551.38, 704.308), Font Size: 12.0
Text: FUNDING LOAN AGREEMENT, BBox: (218.33, 534.628, 396.79, 546.628), Font Size: 12.0
Text: among, BBox: (289.73, 503.038, 325.37, 515.038), Font Size: 12.0
Text: CITIBANK, N.A.,, BBox: (259.49, 471.238, 355.51, 483.238), Font Size: 12.0
Text: as Funding Lender, BBox: (261.05, 457.438, 354.07, 469.438), Font Size: 12.0
Text: CITY OF LOS ANGELES,, BBox: (235.97, 425.63800000000003, 379.03, 437.63800000000003), Font Size: 12.0
Text: as Governmental Lender, BBox: (246.65, 411.838, 368.35, 423.838), Font Size: 12.0
Text: and, BBox: (297.41, 380.038, 317.69, 392.038), Font Size: 12.0
Text: U.S. BANK NATIONAL ASSOCIATION,, BBox: (197.66, 348.238, 417.43, 360.238), Font Size: 12.0
Text: as Fiscal Agent, BBox: (268.97, 334.418, 346.03, 346.418), Font Size: 12.0
Text: Dated as of December 1, 2017, BBox: (233.09, 272.618, 382.03, 284.61

In [18]:
# Convert the extracted elements into Layout Parser's format
layout = lp.Layout([lp.TextBlock(lp.Rectangle(x0, y0, x1, y1), text=text) 
                    for e in elements for text, (x0, y0, x1, y1) in [(e['text'], e['bbox'])]])

In [19]:
# Analyzing sections and subsections based on bounding box and font size
sections = []
current_section = None

In [20]:
for block in layout:
    if block.height > 12:  # Assuming headers have a larger height
        if current_section:
            sections.append(current_section)
        current_section = {'title': block.text, 'content': []}
    elif current_section:
        current_section['content'].append(block.text)

In [21]:
if current_section:
    sections.append(current_section)

In [23]:
# Output the identified sections
for i, section in enumerate(sections):
    print(f"*******Section {i+1}: {section['title']}")
    for content in section['content']:
        print(f"  {content}")

*******Section 1: Section 4.2.
  Security for the Funding Loan ....................................................................... 20
*******Section 2: Delivery of Security ....................................................................................... 21
  ARTICLE V
  LIMITED LIABILITY
  Section 5.1.
  Section 5.2.
  Section 5.3.
  Source of Payment of Funding Loan and Other Obligations ......................... 22
  Exempt from Individual Liability .................................................................. 22
  Limited Obligation ......................................................................................... 23
  ARTICLE VI
  CLOSING CONDITIONS; APPLICATION OF FUNDS
  Section 6.1.
  Conditions Precedent to Closing .................................................................... 24
  4844-3158-4342.1
  ARTICLE VII
  FUNDS AND ACCOUNTS
  Section 7.1.
  Section 7.2.
  Section 7.3.
  Section 7.4.
  Section 7.5.
  Section 7.6.
  Section 7.7.
  Authorizati

* Conclusion: layoutparser output is very poor