# CIA Document Analysis

Extracting information from declassified CIA documents using AI


In [None]:
# Install natural-pdf
!pip install natural-pdf

In [None]:
# Download the PDF file (skipped when a local copy is already present)
import urllib.request
import os

pdf_url = "https://pub-4e99d31d19cb404d8d4f5f7efa51ef6e.r2.dev/pdfs/cia-document/cia-doc.pdf"
pdf_name = "cia-doc.pdf"

if not os.path.exists(pdf_name):
    print(f"Downloading {pdf_name}...")
    # Fetch into a temporary sibling file and only move it into place once the
    # transfer has finished. Otherwise an interrupted download would leave a
    # truncated cia-doc.pdf behind, and the existence check above would treat
    # that broken file as valid on every later run.
    partial_name = f"{pdf_name}.part"
    try:
        urllib.request.urlretrieve(pdf_url, partial_name)
        os.replace(partial_name, pdf_name)
    finally:
        # After a successful os.replace the partial file is gone; this only
        # cleans up the leftovers of a failed transfer.
        if os.path.exists(partial_name):
            os.remove(partial_name)
    print(f"Downloaded {pdf_name}")
else:
    print(f"{pdf_name} already exists")

# CIA Document Classification

Let's work with a declassified CIA document and use AI to classify and extract information.

In [None]:
# Open the local PDF and preview its pages.
from natural_pdf import PDF

pdf = PDF("cia-doc.pdf")
# Left as the cell's final expression so the notebook renders whatever show() returns.
# cols=6 presumably lays the page previews out six per row — confirm against the natural-pdf docs.
pdf.pages.show(cols=6)

Just like we did above, we can ask the model which category it thinks the PDF belongs to.

In [None]:
# Candidate labels to score the whole document against, classified with using='text'.
document_labels = [
    'slaughterhouse report',
    'dolphin training manual',
    'basketball',
    'birding',
]
pdf.classify(document_labels, using='text')

# Trailing expression: the notebook displays the assigned category and its confidence.
pdf.category, pdf.category_confidence

I promise birding is real! The PDF is about *using pigeons to take surveillance photos.*

But beyond the text content, notice how different the pages look from one another. **We can also categorize each page using vision!**

In [None]:
# Candidate labels for every individual page, classified with using='vision'.
page_labels = ['diagram', 'text', 'invoice', 'blank']
pdf.classify_pages(page_labels, using='vision')

# Report one line per page: its number, its category, and the confidence
# formatted to three significant digits.
for page in pdf.pages:
    print(f"Page {page.number} is {page.category} - {page.category_confidence:0.3}")

And if we just want to see the pages that are diagrams, we can `.filter` for them.

In [None]:
# Keep only the pages whose category is 'diagram'.
diagram_pages = pdf.pages.filter(lambda page: page.category == 'diagram')

# Trailing expression so the notebook renders the result of show().
diagram_pages.show(show_category=True)


We can also put them into groups.

In [None]:
# Bucket the pages by each page's `category` attribute, then call info() on the
# result (presumably prints a summary of the groups — confirm against natural-pdf docs).
groups = pdf.pages.groupby(lambda page: page.category)
groups.info()

In [None]:
# Look up the group keyed by 'diagram' and display it.
# NOTE(review): if no page ended up in the 'diagram' group, get() may return None and
# show() would then fail — verify against the natural-pdf groupby API.
diagrams = groups.get('diagram')
diagrams.show()

And if that's all we're interested in? We can save a new PDF of just those pages!

In [None]:
# Select the pages whose category is 'diagram'...
diagram_only = pdf.pages.filter(lambda page: page.category == 'diagram')

# ...and write just those pages out to diagrams.pdf.
# original=True presumably copies the source pages unmodified — confirm against natural-pdf docs.
diagram_only.save_pdf("diagrams.pdf", original=True)