In [7]:
import PyPDF2
import os
from typing import Optional

In [3]:
def validate_pdf(file_path: str) -> bool:
    """Check that ``file_path`` points at an existing file with a .pdf extension.

    The check is purely path-based: the file contents are not inspected.
    A human-readable error is printed for each failure case.

    Args:
        file_path: Path to the candidate PDF file.

    Returns:
        True when the path exists, is a regular file, and its name ends with
        ``.pdf`` (case-insensitive); False otherwise.
    """
    if not os.path.exists(file_path):
        print(f"Error: File not found at path: {file_path}")
        return False
    # os.path.exists() is also true for directories; a directory named
    # "something.pdf" must not be handed to open() further down the line.
    if not os.path.isfile(file_path):
        print(f"Error: Path is not a regular file: {file_path}")
        return False
    if not file_path.lower().endswith('.pdf'):
        print("Error: File is not a PDF")
        return False
    return True

In [11]:
def extract_text_from_pdf(file_path: str, max_chars: int = 100000) -> Optional[str]:
    """Extract the text of a PDF, page by page, up to ``max_chars`` characters.

    Pages are read in order and joined with newlines. Extraction stops early
    once the accumulated page text reaches ``max_chars``, and the returned
    string is truncated so it never exceeds that limit. Progress is printed
    after every page.

    Args:
        file_path: Path to the PDF file; checked with ``validate_pdf`` first.
        max_chars: Upper bound on the length of the returned text. Negative
            values are treated as 0.

    Returns:
        The extracted text, or None when validation fails, the PDF cannot be
        parsed, or any other error occurs while reading it.
    """
    # PdfReadError is defined in PyPDF2.errors; it is not an attribute of the
    # top-level package, so ``PyPDF2.PdfReadError`` in an except clause would
    # itself raise AttributeError as soon as any exception reached it.
    from PyPDF2.errors import PdfReadError

    if not validate_pdf(file_path):
        return None

    # Clamp so that a negative limit cannot turn the final slice into
    # "drop the last N characters".
    char_limit = max(max_chars, 0)

    try:
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)

            num_pages = len(pdf_reader.pages)
            print(f"Processing PDF with {num_pages} pages...")

            extracted_text = []
            total_chars = 0

            for page_num in range(num_pages):
                page = pdf_reader.pages[page_num]
                # Guard against a page yielding no text object at all.
                text = page.extract_text() or ''
                extracted_text.append(text)
                total_chars += len(text)
                print(f"Processed page {page_num + 1}/{num_pages}")

                # Stop reading further pages once the cap is reached.
                if total_chars >= char_limit and page_num + 1 < num_pages:
                    print(f"Reached max_chars limit ({char_limit}); skipping remaining pages")
                    break

            final_text = '\n'.join(extracted_text)
            # The join adds separators and the last page may overshoot, so
            # enforce the cap on the final string as well.
            if len(final_text) > char_limit:
                final_text = final_text[:char_limit]
            print(f"\nExtraction complete! Total characters: {len(final_text)}")
            return final_text

    except PdfReadError:
        print("Error: Invalid or corrupted PDF file")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {str(e)}")
        return None


In [12]:
# Get PDF metadata
def get_pdf_metadata(file_path: str) -> Optional[dict]:
    """Read the page count and document-information entry of a PDF.

    Args:
        file_path: Path to the PDF file; checked with ``validate_pdf`` first.

    Returns:
        A dict with keys ``'num_pages'`` (length of the reader's page list)
        and ``'metadata'`` (the reader's ``metadata`` attribute, passed
        through unchanged), or None when validation fails or reading the
        file raises any exception.
    """
    if validate_pdf(file_path):
        try:
            with open(file_path, 'rb') as pdf_stream:
                reader = PyPDF2.PdfReader(pdf_stream)
                page_count = len(reader.pages)
                doc_info = reader.metadata
                return {'num_pages': page_count, 'metadata': doc_info}
        except Exception as err:
            # Best-effort: report the problem and fall through to None.
            print(f"Error extracting metadata: {str(err)}")
    return None

In [2]:
# Input PDF, resolved relative to the current user's home directory so the
# notebook does not depend on one specific account name.
pdf_path = os.path.expanduser(os.path.join('~', 'Downloads', 'w32942.pdf'))


In [13]:
# Extract metadata first
print("Extracting metadata...")
metadata = get_pdf_metadata(pdf_path)
if metadata:
    print("\nPDF Metadata:")
    print(f"Number of pages: {metadata['num_pages']}")
    print("Document info:")
    # The reader's metadata is None when the PDF carries no
    # document-information dictionary; fall back to an empty mapping so the
    # loop below does not raise AttributeError.
    document_info = metadata['metadata'] or {}
    for key, value in document_info.items():
        print(f"{key}: {value}")

# Extract text
print("\nExtracting text...")
extracted_text = extract_text_from_pdf(pdf_path, max_chars=1000000)

# Nothing to preview or save when extraction failed or produced no text.
if extracted_text:
    # Display first 500 characters of extracted text as preview
    print("\nPreview of extracted text (first 500 characters):")
    print("-" * 50)
    print(extracted_text[:500])
    print("-" * 50)
    print(f"\nTotal characters extracted: {len(extracted_text)}")

    # Save the extracted text to a file in the current working directory
    output_file = 'extracted_text.txt'
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(extracted_text)
    print(f"\nExtracted text has been saved to {output_file}")

Extracting metadata...

PDF Metadata:
Number of pages: 74
Document info:
/Author: 
/CreationDate: D:20240901151737-04'00'
/Creator: LaTeX with hyperref
/Keywords: 
/ModDate: D:20240910233646-04'00'
/PTEX.Fullbanner: This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5
/Producer: pdfTeX-1.40.25
/Subject: 
/Title: 
/Trapped: /False

Extracting text...
Processing PDF with 74 pages...
Processed page 1/74
Processed page 2/74
Processed page 3/74
Processed page 4/74
Processed page 5/74
Processed page 6/74
Processed page 7/74
Processed page 8/74
Processed page 9/74
Processed page 10/74
Processed page 11/74
Processed page 12/74
Processed page 13/74
Processed page 14/74
Processed page 15/74
Processed page 16/74
Processed page 17/74
Processed page 18/74
Processed page 19/74
Processed page 20/74
Processed page 21/74
Processed page 22/74
Processed page 23/74
Processed page 24/74
Processed page 25/74
Processed page 26/74
Processed page 27/74
Processed page 28/74
Pro

In [9]:
# Show a bounded preview only: echoing the full document floods the notebook
# output. The full text is still available in `extracted_text`.
(extracted_text or '')[:2000]

'NBER WORKING PAPER SERIES\nEXPECTED EPS × TRAILING P/E\nItzhak Ben-David\nAlex Chinco\nWorking Paper 32942\nhttp://www.nber.org/papers/w32942\nNATIONAL BUREAU OF ECONOMIC RESEARCH\n1050 Massachusetts Avenue\nCambridge, MA 02138\nSeptember 2024\nWe would like to thank Xavier Gabaix, Sinan Gokkaya, Valentin Haddad, Jeff Meli, Stefan Nagel, \nMarco Sammon, Amir Sufi, Laura Veldkamp, and Jeff Wurgler for helpful comments. This paper \nhas also benefited from feedback at the NBER SI Asset-Pricing meeting. The views expressed \nherein are those of the authors and do not necessarily reflect the views of the National Bureau of \nEconomic  Research.\nNBER working papers are circulated for discussion and comment purposes. They have not been \npeer-reviewed or been subject to the review by the NBER Board of Directors that accompanies \nofficial NBER publications.\n© 2024 by Itzhak Ben-David and Alex Chinco. All rights reserved. Short sections of text, not to \nexceed two paragraphs, may be quote