In [None]:
import fitz  
import json
import os
import re

def is_useful_paragraph(paragraph):
    """Return True when a paragraph looks like real body text.

    The filter is deliberately lenient. A paragraph is rejected when it:
      * mentions "saylor.org" or "saylor url" (case-insensitive),
      * contains an http:// or https:// link,
      * mentions "copyright" or "all rights reserved" (case-insensitive),
      * begins with "references" (case-insensitive),
      * is shorter than 30 characters, or
      * consists solely of digits, whitespace and non-word characters.

    Args:
        paragraph: The paragraph text to evaluate.

    Returns:
        bool: False if any rejection rule matches, True otherwise.
    """
    lowered = paragraph.lower()

    # Site branding and legal boilerplate, matched anywhere in the text.
    boilerplate_markers = ("saylor.org", "saylor url", "copyright", "all rights reserved")
    if any(marker in lowered for marker in boilerplate_markers):
        return False

    has_link = re.search(r"http[s]?://", lowered) is not None
    is_reference_heading = lowered.startswith("references")
    if has_link or is_reference_heading:
        return False

    # Length and symbol checks run on the original text, not the lowercased copy.
    too_short = len(paragraph) < 30
    only_symbols = re.fullmatch(r"[\d\s\W]+", paragraph) is not None
    return not (too_short or only_symbols)

def extract_paragraphs_from_pdf(pdf_path, num_paragraphs=25):
    """Extract the first useful paragraphs from a PDF file.

    The plain text of every page is concatenated, split into candidate
    paragraphs, filtered with ``is_useful_paragraph`` and truncated.

    Args:
        pdf_path: Path of the PDF to open with ``fitz``.
        num_paragraphs: Maximum number of paragraphs to return. It is used
            as a slice bound, so ``None`` returns every useful paragraph.

    Returns:
        list[str]: Stripped paragraphs that passed the filter, in document
        order, at most ``num_paragraphs`` long.
    """
    # The context manager closes the document even if text extraction raises;
    # previously the handle was left open for every processed PDF.
    with fitz.open(pdf_path) as doc:
        # A newline is appended after each page so text from adjacent pages
        # is never glued together; one join avoids repeated string copies.
        text = "".join(page.get_text("text") + "\n" for page in doc)

    # Split on blank lines, or on a single newline followed by an uppercase
    # ASCII letter (heuristic for the start of a new paragraph).
    candidates = (chunk.strip() for chunk in re.split(r"\n{2,}|\n(?=[A-Z])", text))
    useful_paragraphs = [chunk for chunk in candidates if is_useful_paragraph(chunk)]

    return useful_paragraphs[:num_paragraphs]

# Source PDFs to process.
pdf_files = [
    "Documents/Finance - Wikipedia.pdf",
    "Documents/Legal Aspects of Corporate Management and Finance.pdf",
    "Documents/PrinciplesofFinance-WEB.pdf",
    "Documents/Stock market - Wikipedia.pdf",
]

# Maps each processed PDF path to the list of paragraphs kept for it.
output_data = {}

for pdf_file in pdf_files:
    # A missing input is reported and skipped instead of aborting the run.
    if not os.path.exists(pdf_file):
        print(f"File not found: {pdf_file}")
        continue

    print(f"Extracting from {pdf_file}...")
    paragraphs = extract_paragraphs_from_pdf(pdf_file, 25)
    output_data[pdf_file] = paragraphs

# Persist the results as indented JSON; non-ASCII characters are written as-is.
os.makedirs("Results", exist_ok=True)
with open("Results/extracted_paragraphs.json", "w", encoding="utf-8") as json_file:
    json.dump(output_data, json_file, indent=4, ensure_ascii=False)

print("Extraction complete. Data saved to extracted_paragraphs.json")


Extracting from Documents/Finance - Wikipedia.pdf...
Extracting from Documents/Legal Aspects of Corporate Management and Finance.pdf...
Extracting from Documents/PrinciplesofFinance-WEB.pdf...
Extracting from Documents/Stock market - Wikipedia.pdf...
Extraction complete. Data saved to extracted_paragraphs.json


# Loop through the extracted paragraphs

In [4]:
import json

def loop_through_paragraphs(json_file):
    """Print every stored paragraph, grouped under the PDF it came from.

    Args:
        json_file: Path to a UTF-8 encoded JSON file whose top-level object
            maps a PDF name to a list of paragraphs.

    Returns:
        None. Everything is written to standard output.
    """
    with open(json_file, "r", encoding="utf-8") as handle:
        paragraphs_by_pdf = json.loads(handle.read())

    for pdf_file, paragraphs in paragraphs_by_pdf.items():
        # Header for this document, surrounded by blank lines.
        print(f"\nExtracted paragraphs from: {pdf_file}\n")

        # Paragraph numbering restarts at 1 for each document.
        i = 0
        for paragraph in paragraphs:
            i += 1
            print(f"Paragraph {i}: {paragraph}\n")

# Print the paragraphs stored in the JSON file that the extraction cell writes;
# this path must match the one used there.
json_file = "Results/extracted_paragraphs.json"
loop_through_paragraphs(json_file)


Extracted paragraphs from: Documents/Finance - Wikipedia.pdf

Paragraph 1: Finance refers to monetary resources and to the study and discipline of money, currency, assets
and liabilities.[a] As a subject of study, it is related to but distinct from economics, which is the
study of the production, distribution, and consumption of goods and services.[b] Based on the
scope of financial activities in financial systems, the discipline can be divided into personal,
corporate, and public finance.

Paragraph 2: In these financial systems, assets are bought, sold, or traded as financial instruments, such as
currencies, loans, bonds, shares, stocks, options, futures, etc. Assets can also be banked, invested,
and insured to maximize value and minimize loss. In practice, risks are always present in any
financial action and entities.

Paragraph 3: Due to its wide scope, a broad range of subfields exists within finance. Asset-, money-, risk- and
investment  management  aim  to  maximize  value  and