PDF Q&A section extractor

In [13]:
# Path setup: use the current working directory as the project root
import os
import sys
from pathlib import Path

# The directory the notebook is run from is treated as the project root:
# it becomes the working directory and the first entry on the import path.
project_path = Path.cwd()
os.chdir(project_path)
sys.path.insert(0, os.fspath(project_path))

print("Current directory: " + os.getcwd())

Current directory: c:\Users\joshu\OneDrive\Documents\1 Work\Bank of England NLP\Bank-of-England-NLP-on-Earnings-Calls


In [None]:
import pdfplumber
import os
import csv
from PyPDF2 import PdfWriter, PdfReader
from pathlib import Path

# Input folder, relative to the current working directory
data_folder = Path("1_data")

# All PDF files directly inside the input folder. Path.glob gives no ordering
# guarantee, so the list is sorted to keep the processing order (and therefore
# the CSV row order) reproducible across runs and machines.
pdf_files = sorted(data_folder.glob("*.pdf"))

# Directory that receives the extracted Q&A PDFs (relative to the current
# working directory); created on first use.
output_dir = Path("2_extracted_data")
output_dir.mkdir(parents=True, exist_ok=True)

# CSV file that stores one metadata row per extracted document
csv_file_path = output_dir / "qna_sections_metadata.csv"
csv_metadata = []

# Headings that mark the start of a Q&A section (matched case-sensitively)
qna_variations = [
    "Questions and Answers",
    "QUESTION AND ANSWER SECTION",
    "Q&A Section",
    "Questions & Answers",
]

# Locate the first page whose text contains one of the Q&A headings
def find_qna_section(pdf_path, keywords):
    """Return the 0-indexed page where the Q&A section starts.

    Pages are scanned in order and the first page whose extracted text
    contains any of ``keywords`` (case-sensitive substring match) wins.

    Args:
        pdf_path: Path of the PDF to scan.
        keywords: Heading strings that mark the start of the Q&A section.

    Returns:
        The 0-indexed page number, or None when no page matches.
    """
    with pdfplumber.open(pdf_path) as document:
        for page_index, page in enumerate(document.pages):
            page_text = page.extract_text()
            # Pages with no extractable text can never match.
            if not page_text:
                continue
            for keyword in keywords:
                if keyword in page_text:
                    return page_index
    return None

# Function to extract and save the cover page together with the Q&A start page
def extract_qna_with_cover(pdf_path, qna_page, output_filename):
    """Write a new PDF holding the cover page and the Q&A start page.

    Args:
        pdf_path: Path of the source PDF.
        qna_page: 0-indexed page where the Q&A section starts, or None.
        output_filename: File name of the new PDF; it is created inside the
            module-level ``output_dir``.

    Returns:
        The path of the written PDF as a string.

    Note:
        Only the single page at ``qna_page`` is copied; pages that follow it
        in the source document are not included.
    """
    with open(pdf_path, "rb") as pdf_file:
        reader = PdfReader(pdf_file)
        writer = PdfWriter()

        # The cover page is always included. The Q&A page is added only when
        # it is a distinct, valid page: qna_page == 0 would otherwise write
        # the cover twice, and a negative value would silently select a page
        # counted from the end of the document.
        pages_to_extract = [0]
        if qna_page is not None and 0 < qna_page < len(reader.pages):
            pages_to_extract.append(qna_page)

        for page_num in pages_to_extract:
            writer.add_page(reader.pages[page_num])

        new_pdf_path = os.path.join(output_dir, output_filename)
        with open(new_pdf_path, "wb") as output_pdf:
            writer.write(output_pdf)

    return new_pdf_path

# Walk through every input PDF; documents without a detectable Q&A section
# are reported and skipped, the rest are extracted and recorded.
for pdf_file in pdf_files:
    original_filename = pdf_file.name  # e.g. "Q1_Transcript-Analyst-Call-25-April-2024.pdf"
    qna_page = find_qna_section(pdf_file, qna_variations)
    if qna_page is None:
        print(f"❌ Q&A section not found in {original_filename}")
        continue

    qna_filename = "QnA_" + original_filename
    qna_file_path = extract_qna_with_cover(pdf_file, qna_page, qna_filename)
    # Metadata row: source file name, Q&A start page (1-indexed for readers),
    # and the name of the extracted file.
    csv_metadata.append([original_filename, qna_page + 1, qna_filename])

# Save the collected metadata as CSV: header row first, then one row per
# extracted document.
with csv_file_path.open("w", newline="", encoding="utf-8") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerows(
        [
            ["Original File Name", "Q&A Section Page", "Extracted Q&A File Name"],
            *csv_metadata,
        ]
    )