In [3]:
!poetry add PyPDF2

Using version [39;1m^3.0.1[39;22m for [36mpypdf2[39m

[34mUpdating dependencies[39m
[2K[34mResolving dependencies...[39m [39;2m(1.3s)[39;22m

[39;1mPackage operations[39;22m: [34m1[39m install, [34m0[39m updates, [34m0[39m removals

  [34;1m-[39;22m [39mInstalling [39m[36mpypdf2[39m[39m ([39m[39;1m3.0.1[39;22m[39m)[39m: [34mPending...[39m
[1A[0J  [34;1m-[39;22m [39mInstalling [39m[36mpypdf2[39m[39m ([39m[39;1m3.0.1[39;22m[39m)[39m: [34mDownloading...[39m [39;1m0%[39;22m
[1A[0J  [34;1m-[39;22m [39mInstalling [39m[36mpypdf2[39m[39m ([39m[39;1m3.0.1[39;22m[39m)[39m: [34mDownloading...[39m [39;1m61%[39;22m
[1A[0J  [34;1m-[39;22m [39mInstalling [39m[36mpypdf2[39m[39m ([39m[39;1m3.0.1[39;22m[39m)[39m: [34mDownloading...[39m [39;1m91%[39;22m
[1A[0J  [34;1m-[39;22m [39mInstalling [39m[36mpypdf2[39m[39m ([39m[39;1m3.0.1[39;22m[39m)[39m: [34mDownloading...[39m [39;1m100%[39;22m
[1A[0J  [34

In [5]:
import PyPDF2
from pathlib import Path


def sliding_window(seq, size, step):
    """
    Cut a sequence into fixed-size windows taken at regular offsets.

    Consecutive windows overlap by ``size - step`` items when ``step`` is
    smaller than ``size``. The final window may be shorter than ``size``.

    Args:
        seq: The sequence to slice (typically a string)
        size: Maximum length of each window
        step: Distance between the starts of consecutive windows

    Returns:
        List of dictionaries, each with the window's 'start' index and its
        'chunk' content. Empty when ``seq`` is empty.

    Raises:
        ValueError: if ``size`` or ``step`` is not positive
    """
    if size <= 0 or step <= 0:
        raise ValueError("size and step must be positive")

    total = len(seq)
    windows = []
    for offset in range(0, total, step):
        end = offset + size
        windows.append(dict(start=offset, chunk=seq[offset:end]))
        if end >= total:
            # This window already reaches the end of seq; nothing left to cover.
            return windows

    return windows


def read_pdf(pdf_path):
    """
    Extract the text of every page of a PDF and return it as one string.

    Args:
        pdf_path: Location of the PDF file (string or Path object)

    Returns:
        The text of each page, in page order, joined with newline characters

    Raises:
        FileNotFoundError: if nothing exists at ``pdf_path``
        ValueError: if the file extension is not '.pdf' (case-insensitive)
    """
    pdf_path = Path(pdf_path)

    # Validate existence first, then the extension.
    if not pdf_path.exists():
        raise FileNotFoundError(f"PDF file not found: {pdf_path}")
    if pdf_path.suffix.lower() != '.pdf':
        raise ValueError(f"File is not a PDF: {pdf_path}")

    with open(pdf_path, 'rb') as handle:
        reader = PyPDF2.PdfReader(handle)
        # Text is pulled from every page while the file is still open.
        pages_text = [page.extract_text() for page in reader.pages]

    return '\n'.join(pages_text)


def process_pdf_with_chunks(pdf_path, chunk_size=2000, step_size=1000, metadata=None):
    """
    Read a PDF and create sliding window chunks with metadata.

    Args:
        pdf_path: Path to the PDF file (string or Path object)
        chunk_size: Size of each chunk (default: 2000 characters)
        step_size: Step between chunks (default: 1000 characters)
        metadata: Optional dict of metadata to add to each chunk. It is
            copied before use, so the caller's dict is never modified.

    Returns:
        List of chunk dictionaries as produced by sliding_window, each
        extended with the metadata entries plus 'source' (file name) and
        'full_path' (absolute path as a string). Because the entries are
        merged with dict.update, a metadata key that matches a key already
        on the chunk (for example 'start' or 'chunk') replaces that value.

    Raises:
        Whatever read_pdf or sliding_window raise for the given arguments.
    """
    source_path = Path(pdf_path)

    # Read PDF content and cut it into overlapping windows
    content = read_pdf(pdf_path)
    chunks = sliding_window(content, chunk_size, step_size)

    # Work on a private copy: writing 'source'/'full_path' straight into the
    # argument would leak those keys into the caller's dict.
    chunk_metadata = dict(metadata) if metadata is not None else {}
    chunk_metadata['source'] = source_path.name
    chunk_metadata['full_path'] = str(source_path.absolute())

    for chunk in chunks:
        chunk.update(chunk_metadata)

    return chunks


# Batch helper: chunk several PDF files in one call
def process_multiple_pdfs(pdf_paths, chunk_size=2000, step_size=1000):
    """
    Chunk each PDF in turn and gather every chunk into a single list.

    A file that raises any Exception while being processed is reported with
    a printed message and skipped; the remaining files are still processed.

    Args:
        pdf_paths: Iterable of paths to PDF files
        chunk_size: Size of each chunk
        step_size: Step between chunks

    Returns:
        List holding the chunks of every successfully processed PDF, in the
        order the paths were given
    """
    collected = []

    for current_path in pdf_paths:
        try:
            # Per-document metadata: the file name without its extension.
            doc_metadata = dict(document_id=Path(current_path).stem)

            doc_chunks = process_pdf_with_chunks(
                current_path, chunk_size, step_size, doc_metadata
            )
            collected.extend(doc_chunks)

            print(f"Processed {current_path}: {len(doc_chunks)} chunks created")

        except Exception as err:
            print(f"Error processing {current_path}: {err}")

    return collected


if __name__ == "__main__":
    # Example 1: Process a single PDF and preview its first chunk
    pdf_file = "Kenya-ARV-Guidelines-2022-Final-1.pdf"

    try:
        chunks = process_pdf_with_chunks(
            pdf_file,
            chunk_size=2000,
            step_size=1000,
            metadata={'category': 'documentation', 'version': '1.0'}
        )

        print(f"\nTotal chunks: {len(chunks)}")

        # An empty chunk list has no first element to preview, so guard the
        # indexing instead of letting it raise IndexError.
        if chunks:
            first_chunk = chunks[0]
            print("\nFirst chunk preview:")
            print(f"Start position: {first_chunk['start']}")
            print(f"Content preview: {first_chunk['chunk'][:200]}...")
            print(f"Metadata: {first_chunk.get('source')}")
        else:
            print("\nNo chunks were produced; the PDF yielded no text")

    except FileNotFoundError:
        print("Please provide a valid PDF file path")

    # Example 2: Process multiple PDFs (matching your original pattern)
    # The block below is a bare string literal, so none of it executes; it is
    # kept only as a reference snippet.
    """
    evidently_docs = [
        {'path': 'doc1.pdf', 'author': 'John', 'type': 'report'},
        {'path': 'doc2.pdf', 'author': 'Jane', 'type': 'analysis'},
    ]
    
    evidently_chunks = []
    
    for doc in evidently_docs:
        doc_copy = doc.copy()
        pdf_path = doc_copy.pop('path')  # Get the path and remove from metadata
        
        # Read PDF and create chunks
        content = read_pdf(pdf_path)
        chunks = sliding_window(content, 2000, 1000)
        
        # Add metadata to each chunk
        for chunk in chunks:
            chunk.update(doc_copy)
        
        evidently_chunks.extend(chunks)
    
    print(f"\nProcessed {len(evidently_docs)} documents into {len(evidently_chunks)} chunks")
    """


Total chunks: 608

First chunk preview:
Start position: 0
Content preview:  
 


  
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Kenya HIV Prevention and Treatment Guidelines, 2022  
 
2022 Edition  
 
 
  
 
 
 
 
 
 
 
 
 
© National AIDS & STI Control Program 2022  
 
Th...
Metadata: Kenya-ARV-Guidelines-2022-Final-1.pdf


In [6]:
import re
# NOTE(review): `evidently_docs` is not assigned in any cell visible here (the
# name appears only inside a string literal in the previous cell), and the
# recorded output of this cell is a NameError. It must be defined before this
# cell can run on a fresh kernel -- TODO confirm the intended data source.
text = evidently_docs[45]['content']
# Split the stripped text wherever two newlines are separated only by
# optional whitespace, i.e. on blank lines.
paragraphs = re.split(r"\n\s*\n", text.strip())

NameError: name 'evidently_docs' is not defined

In [7]:
import re

def split_markdown_by_level(text, level=2):
    """
    Break markdown text into sections at headers of one specific level.

    Each returned section starts with its header line. When the header has
    body text, a blank line and the stripped body follow. Anything that
    precedes the first matching header is not included in the result.

    :param text: Markdown text as a string
    :param level: Header level to split on (2 matches lines starting "## ")
    :return: List of sections as strings
    """
    # Two capturing groups: the "#"-run plus one space, then the title text.
    heading_re = re.compile(r'^(#{' + str(level) + r'} )(.+)$', re.MULTILINE)

    # With capturing groups, split() returns
    # [before, marker, title, body, marker, title, body, ...],
    # so strided slices line up each header with the text that follows it.
    pieces = heading_re.split(text)
    markers, titles, bodies = pieces[1::3], pieces[2::3], pieces[3::3]

    sections = []
    for marker, title, body in zip(markers, titles, bodies):
        heading = (marker + title).strip()
        body = body.strip()
        sections.append(f'{heading}\n\n{body}' if body else heading)

    return sections

In [8]:
# NOTE(review): among the visible cells, `text` is assigned only in the cell
# whose recorded output is a NameError, so this call fails the same way until
# that cell runs successfully.
sections = split_markdown_by_level(text, level=2)

NameError: name 'text' is not defined