# Natural PDF basics with text and tables

Learn the fundamentals of Natural PDF: opening PDFs, extracting text with layout preservation, selecting elements by criteria, navigating spatially, and managing exclusion zones. This is a perfect starting point for PDF data extraction.


In [None]:
# Install natural-pdf
!pip install natural-pdf

In [None]:
# Download the PDF file once; later runs of this cell reuse the local copy.
import urllib.request
import os

pdf_url = "https://pub-4e99d31d19cb404d8d4f5f7efa51ef6e.r2.dev/pdfs/basics/basics.pdf"
pdf_name = "basics.pdf"

if os.path.exists(pdf_name):
    print(f"{pdf_name} already exists")
else:
    print(f"Downloading {pdf_name}...")
    # Fetch into a temporary name and rename only on success. Writing straight
    # to pdf_name would leave a truncated file behind after an interrupted
    # download, and the existence check above would then accept it forever.
    partial_name = pdf_name + ".part"
    try:
        urllib.request.urlretrieve(pdf_url, partial_name)
        os.replace(partial_name, pdf_name)
    finally:
        # After a successful rename the partial file is gone and this is a
        # no-op; after a failure it removes the incomplete download.
        if os.path.exists(partial_name):
            os.remove(partial_name)
    print(f"Downloaded {pdf_name}")

# Opening a PDF

Let's start by opening a PDF. Natural PDF can work with local files or URLs.

In [None]:
# Open the local sample file; `pdf` and `page` are reused by the cells below.
from natural_pdf import PDF

pdf = PDF("basics.pdf")
# Work with the first page of the document
page = pdf.pages[0]
# Trailing expression: the notebook renders whatever show() returns
page.show()

# Grabbing Page Text

You can extract text while preserving the layout, which maintains the spatial arrangement of text on the page.

In [None]:
# layout=True requests the layout-preserving extraction described above
text = page.extract_text(layout=True)
print(text)

# Selecting Elements and Text

Natural PDF provides powerful selectors to find specific elements on the page.

## Select text in a rectangle

In [None]:
# Display the element matched by the 'rect' selector
page.find('rect').show()

In [None]:
# Extract and print the text of the element matched by the 'rect' selector
text = page.find('rect').extract_text()
print(text)

## Find all text elements

In [None]:
# find_all() is called with the 'text' selector; display the resulting matches
page.find_all('text').show()

In [None]:
# Gather the results of extract_each_text() for the 'text' matches,
# then print only a short preview of them.
texts = page.find_all('text').extract_each_text()
preview_count = 5  # Show first 5
for snippet in texts[:preview_count]:
    print(snippet)

## Find colored text

In [None]:
# Find red text by filtering 'text' elements on their color attribute
# NOTE(review): `~=` presumably means an approximate color match — confirm
# against the selector documentation.
red_text = page.find('text[color~=red]')
print(red_text.extract_text())

## Find text by content

In [None]:
# Find a text element whose content contains a specific string
# (the selector uses :contains, so it is not limited to text that starts with it)
# Note: `text` is rebound here to the found element; earlier cells stored
# extracted text in this name.
text = page.find('text:contains("INS-")')
print(text.extract_text())

# Spatial Navigation

Natural PDF excels at working with the spatial relationships between elements.

## Extract text to the right of a label

In [None]:
# Extract text to the right of "Date:"
# NOTE(review): height='element' presumably keeps the region as tall as the
# label itself — confirm against the library docs.
date = page.find(text="Date:").right(height='element')
# Display the resulting region before reading its text
date.show()

In [None]:
# Text of the region to the right of the "Date:" label (shown as the cell output)
date.extract_text()

## Extract tables

In [None]:
# Extract a table from the page; the preview is skipped when the result is falsy
table = page.extract_table()
if table:
    # Convert via to_df() and print only the head of the result
    df = table.to_df()
    print(df.head())

# Exclusion Zones

Sometimes you need to exclude headers, footers, or other unwanted areas from extraction.

## Exclude specific regions

In [None]:
# Header band: a region starting at top=0, left=0 with height=80
top = page.region(top=0, left=0, height=80)
# Footer: the area below the last 'line' element found on the page
last_line = page.find_all("line")[-1]
bottom = last_line.below()
# Preview both candidate zones together
(top + bottom).show()

In [None]:
# Register both zones on the page, in order: the top header area first,
# then the area below the last line.
for zone in (top, bottom):
    page.add_exclusion(zone)

# With the exclusions registered, extract and print the remaining text
text = page.extract_text()
print(text)

## PDF-level exclusions

Apply exclusions to all pages in a PDF:

In [None]:
def header_zone(page):
    """Return the band of *page* given by top=0, left=0, height=80."""
    return page.region(top=0, left=0, height=80)


# NOTE(review): page 0 already received page-level exclusions in an earlier
# cell, so the BEFORE output may already omit the header — confirm.
first_page_preview = pdf.pages[0].extract_text()[:200]
print("BEFORE EXCLUSION:", first_page_preview)
# Add header exclusion to all pages
pdf.add_exclusion(header_zone)
print("AFTER EXCLUSION:", pdf.pages[0].extract_text()[:200])