In [3]:
import os
import logging
from pypdf import PdfReader
import re
import random
from typing import List, Dict, Tuple, Any, Optional
from datetime import datetime
import traceback

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s: %(message)s')

# --- Function Definitions (get_pdf_file, extract_text_from_pdf, chunk_text, preprocess_text, extract_sections) ---
# These functions remain the same as in the previous version.
# ... (Keep the exact code for these 5 functions here) ...
def get_pdf_file(pdf_folder: str, mode: str = "latest") -> str:
    """
    Get PDF file path based on specified mode.

    Args:
        pdf_folder: Directory containing PDF files
        mode: Selection mode - "latest", "random", or "interactive"

    Returns:
        Path to selected PDF file

    Raises:
        FileNotFoundError: If the folder doesn't exist (after attempting creation)
                         or if no PDF files are found in the folder.
        ValueError: If an invalid mode is provided.
    """
    # Ensure folder exists
    if not os.path.exists(pdf_folder):
        try:
            os.makedirs(pdf_folder, exist_ok=True)
            logging.warning(f"PDF folder '{pdf_folder}' did not exist and was created.")
            raise FileNotFoundError(f"PDF folder '{pdf_folder}' was created, but no PDFs found.")
        except OSError as e:
            logging.error(f"Failed to create PDF folder '{pdf_folder}': {e}")
            raise FileNotFoundError(f"PDF folder '{pdf_folder}' does not exist and could not be created.")

    # Get list of PDF files
    try:
        pdf_files = [f for f in os.listdir(pdf_folder) if f.lower().endswith('.pdf') and os.path.isfile(os.path.join(pdf_folder, f))]
    except OSError as e:
        logging.error(f"Error listing files in folder '{pdf_folder}': {e}")
        raise FileNotFoundError(f"Could not access files in the PDF folder '{pdf_folder}'.")


    if not pdf_files:
        logging.warning(f"No PDF files found in folder: {pdf_folder}")
        raise FileNotFoundError(f"No PDF files found in {pdf_folder}")

    logging.info(f"Found {len(pdf_files)} PDF file(s) in '{pdf_folder}'.")

    selected_file_path = None

    if mode == "latest":
        try:
            latest_file = max(pdf_files, key=lambda f: os.path.getmtime(os.path.join(pdf_folder, f)))
            selected_file_path = os.path.join(pdf_folder, latest_file)
            logging.info(f"Selected latest PDF: {latest_file}")
        except Exception as e:
            logging.error(f"Error determining latest file: {e}")
            raise

    elif mode == "random":
        random_file = random.choice(pdf_files)
        selected_file_path = os.path.join(pdf_folder, random_file)
        logging.info(f"Selected random PDF: {random_file}")

    elif mode == "interactive":
        print("\nAvailable PDF files:")
        for i, file in enumerate(pdf_files):
            print(f"{i+1}. {file}")

        while True:
            try:
                choice_str = input(f"\nSelect PDF file number (1-{len(pdf_files)}): ")
                choice = int(choice_str)
                if 1 <= choice <= len(pdf_files):
                    selected_file = pdf_files[choice-1]
                    selected_file_path = os.path.join(pdf_folder, selected_file)
                    logging.info(f"User selected PDF: {selected_file}")
                    break
                else:
                    print(f"Invalid choice. Please enter a number between 1 and {len(pdf_files)}.")
            except ValueError:
                print("Invalid input. Please enter a number.")
            except EOFError:
                 logging.warning("EOF received, exiting interactive selection.")
                 raise ValueError("Interactive selection cancelled.")

    else:
        raise ValueError(f"Invalid mode: '{mode}'. Must be 'latest', 'random', or 'interactive'")

    if selected_file_path is None:
         raise RuntimeError("Failed to select a PDF file.")

    return selected_file_path


def extract_text_from_pdf(pdf_path: str) -> Tuple[str, Dict[str, Any]]:
    """
    Extract text and metadata from a PDF file using pypdf.

    Args:
        pdf_path: Path to the PDF file

    Returns:
        Tuple containing extracted text (str) and metadata (dict)

    Raises:
        FileNotFoundError: If the PDF file does not exist or is not a file.
        Exception: If any error occurs during PDF processing with pypdf.
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF file not found: {pdf_path}")
    if not os.path.isfile(pdf_path):
        raise FileNotFoundError(f"Path exists but is not a file: {pdf_path}")

    try:
        logging.info(f"Opening PDF: {os.path.basename(pdf_path)} using pypdf")
        reader = PdfReader(pdf_path)
        metadata = reader.metadata
        num_pages = len(reader.pages)

        # Extract metadata using pypdf's metadata object
        extracted_metadata = {
            "title": getattr(metadata, 'title', os.path.basename(pdf_path)),
            "author": getattr(metadata, 'author', "Unknown"),
            "subject": getattr(metadata, 'subject', ""),
            "creator": getattr(metadata, 'creator', ""),
            "producer": getattr(metadata, 'producer', ""),
            "page_count": num_pages,
            "is_encrypted": reader.is_encrypted,
            "file_name": os.path.basename(pdf_path),
            "file_path": pdf_path
        }

        logging.info(f"Extracting text from {num_pages} page(s)...")
        full_text = ""
        for page_num, page in enumerate(reader.pages):
            try:
                 text = page.extract_text()
                 if text:
                     full_text += text + "\n\n"
                 else:
                      logging.warning(f"No text extracted from page {page_num + 1}.")
                      full_text += f"[Page {page_num + 1} text could not be extracted or is empty]\n\n"
            except Exception as page_error:
                 logging.warning(f"Could not extract text from page {page_num + 1}: {page_error}")
                 full_text += f"[Page {page_num + 1} text extraction failed: {page_error}]\n\n"

        logging.info(f"Successfully extracted {len(full_text)} characters from PDF.")
        full_text = full_text.replace('\x00', '')

        return full_text, extracted_metadata

    except Exception as e:
        logging.error(f"Error extracting text from PDF '{os.path.basename(pdf_path)}' using pypdf: {e}")
        logging.error(traceback.format_exc())
        raise


def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 150) -> List[str]:
    """
    Split text into potentially overlapping chunks based on character count,
    attempting to break at paragraphs or sentences near the target size.

    Args:
        text: The input text to chunk
        chunk_size: Target size of each chunk in characters (approximate)
        overlap: Number of characters to overlap between consecutive chunks

    Returns:
        List of text chunks
    """
    if not isinstance(text, str):
        logging.warning("chunk_text received non-string input, returning empty list.")
        return []
    if not text.strip():
         logging.info("chunk_text received empty or whitespace-only text.")
         return []
    if not isinstance(chunk_size, int) or chunk_size <= 0:
         logging.warning(f"Invalid chunk_size ({chunk_size}), using default 1000.")
         chunk_size = 1000
    if not isinstance(overlap, int) or overlap < 0:
         logging.warning(f"Invalid overlap ({overlap}), using default 150.")
         overlap = 150
    if overlap >= chunk_size:
         logging.warning(f"Overlap ({overlap}) >= chunk_size ({chunk_size}), reducing overlap.")
         overlap = max(0, chunk_size // 5)

    chunks = []
    start = 0
    text_length = len(text)

    if text_length <= chunk_size:
        logging.info("Text length is less than or equal to chunk_size, returning as single chunk.")
        return [text.strip()]

    while start < text_length:
        end = min(start + chunk_size, text_length)
        actual_end = end

        if end < text_length:
            search_start = max(start, end - overlap - (chunk_size // 10))
            search_end = end

            paragraph_break = text.rfind('\n\n', search_start, search_end)
            if paragraph_break != -1:
                 actual_end = paragraph_break + 2
            else:
                sentence_break = -1
                for punct in ['.', '?', '!']:
                     break_pos = text.rfind(punct + ' ', search_start, search_end)
                     if break_pos != -1: sentence_break = max(sentence_break, break_pos + 2)
                     break_pos_nl = text.rfind(punct + '\n', search_start, search_end)
                     if break_pos_nl != -1: sentence_break = max(sentence_break, break_pos_nl + 2)

                if sentence_break != -1:
                     actual_end = sentence_break
                else:
                     newline_break = text.rfind('\n', search_start, search_end)
                     if newline_break != -1:
                          actual_end = newline_break + 1

        if actual_end <= start:
            actual_end = end

        chunk = text[start:actual_end].strip()
        if chunk:
            chunks.append(chunk)

        next_start = actual_end - overlap
        if next_start <= start:
             next_start = start + (chunk_size - overlap)
             if next_start <= start :
                  next_start = actual_end

        start = next_start

        if len(chunks) > text_length:
             logging.error("Chunking loop exceeded text length, breaking.")
             break

    logging.info(f"Text split into {len(chunks)} chunks (size ~{chunk_size}, overlap ~{overlap})")
    return chunks


def preprocess_text(text: str) -> str:
    """
    Perform basic preprocessing on extracted text.

    Args:
        text: Raw text extracted from PDF

    Returns:
        Preprocessed text (str)
    """
    if not isinstance(text, str):
         logging.warning("preprocess_text received non-string input.")
         return ""

    logging.debug(f"Preprocessing text ({len(text)} characters)...")

    text = re.sub(r'(\w)-\n(\w)', r'\1\2', text)
    logging.debug("Applied hyphenation fix.")

    text = text.replace('\n\n', '<<PARAGRAPH_BREAK>>')
    text = re.sub(r'\s+', ' ', text)
    text = text.replace('<<PARAGRAPH_BREAK>>', '\n\n')
    logging.debug("Normalized whitespace.")

    text = text.strip()
    logging.info(f"Preprocessing finished, final text length: {len(text)} characters.")
    return text


def extract_sections(text: str) -> Dict[str, str]:
    """
    Attempt to extract logical sections based on common heading patterns.

    Args:
        text: Preprocessed document text

    Returns:
        Dictionary of section name to section content.
        Returns {"Full Text": text} if no sections are identified.
    """
    if not isinstance(text, str) or not text.strip():
         logging.warning("extract_sections received empty or invalid text.")
         return {"Full Text": text or ""}

    sections = {}
    heading_pattern = re.compile(
        r'^(?:(?:\d+\.|[IVXLCDM]+\.|[a-zA-Z]\))\s*)?'
        r'([A-Z][A-Z0-9\s\-]{3,}|[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)'
        r'\s*(?:\n|$)',
        re.MULTILINE
    )

    matches = list(heading_pattern.finditer(text))

    if not matches:
        logging.info("No distinct section headings found using the pattern.")
        return {"Full Text": text}

    logging.info(f"Found {len(matches)} potential section headings.")

    last_pos = 0
    first_match_start = matches[0].start()
    if first_match_start > 0:
        preface_content = text[0:first_match_start].strip()
        if preface_content:
            sections["Preface"] = preface_content
            logging.debug("Extracted 'Preface' section.")
        last_pos = first_match_start

    for i, match in enumerate(matches):
        section_name = match.group(1).strip()
        section_start = match.start()

        if i + 1 < len(matches):
            section_end = matches[i+1].start()
        else:
            section_end = len(text)

        content_start = match.end()
        section_content = text[content_start:section_end].strip()

        if section_content:
             if section_name in sections:
                  logging.warning(f"Duplicate section name '{section_name}' found. Appending count.")
                  count = 2
                  new_name = f"{section_name}_{count}"
                  while new_name in sections:
                       count += 1
                       new_name = f"{section_name}_{count}"
                  section_name = new_name
             sections[section_name] = section_content
             logging.debug(f"Extracted section: '{section_name}'")

    if not sections:
         logging.warning("Section extraction resulted in an empty dictionary, returning full text.")
         return {"Full Text": text}

    return sections


# --- New Function to Encapsulate Processing ---
def process_pdf(pdf_folder: str, selection_mode: str, chunk_size: int, chunk_overlap: int) -> Tuple[Optional[List[str]], Optional[Dict[str, Any]]]:
    """
    Processes a selected PDF file to extract text, metadata, and generate text chunks.

    Args:
        pdf_folder: Directory containing PDF files.
        selection_mode: Mode for selecting PDF ('latest', 'random', 'interactive').
        chunk_size: Target size for text chunks.
        chunk_overlap: Overlap between text chunks.

    Returns:
        A tuple containing:
          - List of text chunks (List[str]) or None if an error occurs.
          - Metadata dictionary (Dict[str, Any]) or None if an error occurs.
    """
    selected_pdf_path = ""
    try:
        # Step 1: Get PDF Path
        selected_pdf_path = get_pdf_file(pdf_folder, mode=selection_mode)
        logging.info(f"Selected PDF for processing: {selected_pdf_path}")

        # Step 2: Extract Text and Metadata
        extracted_text, pdf_metadata = extract_text_from_pdf(selected_pdf_path)
        logging.info("Text and metadata extracted successfully.")
        # Optionally print metadata here if still desired for console feedback
        # print("\n--- PDF Metadata (from pypdf) ---")
        # for key, value in pdf_metadata.items(): ...

        # Step 3: Preprocess Text
        preprocessed_text = preprocess_text(extracted_text)
        logging.info("Text preprocessing completed.")

        # Step 4: Chunk Text
        text_chunks = chunk_text(preprocessed_text, chunk_size=chunk_size, overlap=chunk_overlap)
        logging.info(f"Text chunking completed. {len(text_chunks)} chunks created.")

        # Step 5 (Optional): Extract Sections - not returned, but logged if needed
        # document_sections = extract_sections(preprocessed_text)
        # logging.info(f"Section extraction attempted: {len(document_sections)} potential sections found.")

        # Return the chunks and metadata
        return text_chunks, pdf_metadata

    except FileNotFoundError as e:
        logging.error(f"File/Folder Error during processing: {e}")
        print(f"\nERROR: {e}")
        return None, None # Return None on error
    except ValueError as e:
        logging.error(f"Configuration or Input Error during processing: {e}")
        print(f"\nERROR: {e}")
        return None, None # Return None on error
    except Exception as e:
        logging.error(f"An unexpected error occurred during processing pdf '{selected_pdf_path}': {e}", exc_info=True)
        print(f"\nUNEXPECTED ERROR processing {os.path.basename(selected_pdf_path)}: {e}")
        return None, None # Return None on error


# --- Main execution block ---
if __name__ == "__main__":
    logging.info("Script execution started.")

    # <<< --- CONFIGURATION --- >>>
    PDF_FOLDER = "my_pdfs"  # IMPORTANT: Change this to the path of your folder
    SELECTION_MODE = "random" # "latest", "random", or "interactive"
    CHUNK_SIZE = 1500        # Adjusted chunk size
    CHUNK_OVERLAP = 200
    # <<< --- END CONFIGURATION --- >>>

    # Call the processing function
    all_chunks, metadata = process_pdf(
        pdf_folder=PDF_FOLDER,
        selection_mode=SELECTION_MODE,
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP
    )

    # Check the results and print confirmation/output
    if all_chunks is not None:
        print("\n" + "=" * 60)
        print(f"PDF Processing Completed Successfully for: {metadata.get('file_name', 'N/A')}")
        print(f"Total chunks returned: {len(all_chunks)}")
        print("=" * 60)

        # Optionally, print the first few chunks to verify the returned list
        print("\n--- First 3 Chunks (Returned Value Sample) ---")
        for i, chunk in enumerate(all_chunks[:3]):
             print(f"\nChunk {i + 1} (length: {len(chunk)} chars):")
             # Print first 200 chars of each sample chunk
             print(chunk[:200] + "...")
        print("-" * 60)

        # You can now use the 'all_chunks' list for further processing
        # For example:
        # process_chunks_further(all_chunks)

    else:
        print("\n" + "=" * 60)
        print("PDF Processing Failed. Check logs for details.")
        print("=" * 60)


    logging.info("Script execution finished.")


PDF Processing Completed Successfully for: w27392 (1).pdf
Total chunks returned: 53

--- First 3 Chunks (Returned Value Sample) ---

Chunk 1 (length: 1272 chars):
NBER WORKING PAPER SERIES THE IMPACT OF COVID-19 ON STUDENT EXPERIENCES AND EXPECTATIONS: EVIDENCE FROM A SURVEY Esteban M. Aucejo Jacob F. French Maria Paola Ugalde Araya Basit Zafar Working Paper 27...

Chunk 2 (length: 1435 chars):
f COVID-19 on Student Experiences and Expectations: Evidence from a Survey Esteban M. Aucejo, Jacob F. French, Maria Paola Ugalde Araya, and Basit Zafar NBER Working Paper No. 27392 June 2020 JEL No. ...

Chunk 3 (length: 1434 chars):
ally by socioeconomic factors and constitute key mediators in explaining the large (and heterogeneous) effects of the pandemic. Esteban M. Aucejo Department of Economics Arizona State University P.O. ...
------------------------------------------------------------
