In [None]:
from PyPDF2 import PdfReader
import re
import os
import pysbd

def clean_text(text: str) -> str:
    """Normalize whitespace and strip unwanted symbols from *text*.

    Two passes are applied in order:

    1. Every run of whitespace is replaced by a single space.
    2. Every character that is not a word character, whitespace, or one
       of ``, . ! ?`` is deleted.

    Because symbols are deleted after whitespace has been collapsed, a
    symbol that stood between two spaces leaves both spaces behind.

    Returns the result with leading and trailing whitespace removed.
    """
    whitespace_run = re.compile(r'\s+')
    disallowed_char = re.compile(r'[^\w\s,.!?]')

    single_spaced = whitespace_run.sub(' ', text)
    filtered = disallowed_char.sub('', single_spaced)
    return filtered.strip()

def chunk_text_pysbd(text: str, chunk_size: int = 6200, overlap: int = 30) -> list:
    """Split *text* into overlapping chunks made of whole sentences.

    The text is segmented into sentences with pysbd, and sentences are
    packed greedily into chunks whose size is measured in
    whitespace-separated words.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk. A single sentence
            longer than this is still emitted, as a chunk of its own.
        overlap: Maximum number of trailing *sentences* of a finished
            chunk that are repeated at the start of the next one. Values
            of zero or less disable the overlap. Fewer sentences are
            carried over when the full overlap would not leave room for
            the next sentence within ``chunk_size``.

    Returns:
        A list of chunk strings, each the sentences of that chunk joined
        by a single space. Empty when no sentences are found.
    """
    segmenter = pysbd.Segmenter(language="en", clean=False)
    sentences = segmenter.segment(text)

    chunks = []
    current_chunk = []    # sentences of the chunk being built
    current_counts = []   # word count of each sentence in current_chunk
    current_length = 0    # total words in current_chunk

    for sentence in sentences:
        sentence_length = len(sentence.split())

        # Only flush a chunk that has content, so an oversized first
        # sentence does not produce an empty chunk.
        if current_chunk and current_length + sentence_length > chunk_size:
            chunks.append(' '.join(current_chunk))

            # Use an explicit start index: a `[-overlap:]` slice would
            # keep the whole list when overlap is 0.
            keep = min(overlap, len(current_chunk)) if overlap > 0 else 0
            start = len(current_chunk) - keep
            current_chunk = current_chunk[start:]
            current_counts = current_counts[start:]
            # Budget is tracked in words, so recount the carried
            # sentences by their words, not by how many there are.
            current_length = sum(current_counts)

            # Drop the oldest carried sentences until the new sentence
            # fits; otherwise the overlap alone could exceed chunk_size
            # and chunks would grow without bound.
            while current_chunk and current_length + sentence_length > chunk_size:
                current_length -= current_counts.pop(0)
                current_chunk.pop(0)

        current_chunk.append(sentence)
        current_counts.append(sentence_length)
        current_length += sentence_length

    # Emit whatever remains after the last sentence.
    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

def read_pdf(file_path: str):
    """Extract the text of a PDF, clean it, and split it into chunks.

    Args:
        file_path: Path to the PDF file to read.

    Returns:
        A dict with a single key ``"chunks"`` holding the list of text
        chunks produced by ``chunk_text_pysbd`` from the cleaned text.

    Raises:
        FileNotFoundError: If *file_path* is not an existing file.
        Exception: If reading, cleaning, or chunking fails; the original
            error is attached as ``__cause__``.
    """
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"No such file: '{file_path}'")

    try:
        pdf_reader = PdfReader(file_path)
        # `or ""` guards against a page whose extraction yields nothing.
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
        # Join pages with a separator so the last word of one page does
        # not fuse with the first word of the next; clean_text collapses
        # the newline to a single space.
        extracted_text = "\n".join(page_texts)

        cleaned_text = clean_text(extracted_text)
        chunks = chunk_text_pysbd(cleaned_text)

        return {"chunks": chunks}

    except Exception as e:
        # Chain explicitly so the underlying traceback stays attached.
        raise Exception(f"An error occurred: {str(e)}") from e

# Demo driver: chunk one PDF and report how large each chunk is.
file_path = "mml-book.pdf"
result = read_pdf(file_path)
print(result)

# Summarize the chunking: total count, then per-chunk word counts
# (chunks are numbered from 1 in the output).
chunk_list = result['chunks']
print(f"Number of chunks: {len(chunk_list)}")
for number, body in enumerate(chunk_list, start=1):
    word_total = len(body.split())
    print(f"Chunk {number} length: {word_total} words")