# In [None]:
from PyPDF2 import PdfReader
import matplotlib.pyplot as plt
import re
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter

def clean_text(text: str) -> str:
    """Normalize whitespace and strip unwanted symbols from *text*.

    Every run of whitespace is first collapsed to a single space. Then any
    character that is not a word character, whitespace, or one of
    ``, . ! ?`` is removed. Finally, leading and trailing whitespace is
    trimmed. Removing a symbol that sat between two spaces can leave
    consecutive spaces in the result.
    """
    collapsed = re.sub(r'\s+', ' ', text)
    filtered = re.sub(r'[^\w\s,.!?]', '', collapsed)
    return filtered.strip()

def chunk_text(text: str, chunk_size: int = 800, overlap: int = 100) -> list:
    """Split *text* into chunks using ``RecursiveCharacterTextSplitter``.

    Args:
        text: The text to split.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The value returned by the splitter's ``split_text`` for *text*.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap,
    ).split_text(text)

def plot_chunk_lengths_histogram(chunks, title="Distribution of Chunk Lengths"):
    """Display a histogram of the word count of each chunk.

    Args:
        chunks: Iterable of chunks; each must provide ``split()``, and the
            number of resulting pieces is used as its length.
        title: Title drawn above the histogram.
    """
    # Word count per chunk = number of whitespace-separated tokens.
    word_counts = []
    for piece in chunks:
        word_counts.append(len(piece.split()))

    plt.hist(
        word_counts,
        bins=30,
        color='blue',
        edgecolor='black',
        alpha=0.7,
    )
    plt.xlabel('Chunk Length (Word Count)')
    plt.ylabel('Frequency')
    plt.title(title)
    plt.show()

def read_pdf(file_path: str) -> dict:
    """Extract the text of a PDF file and split it into chunks.

    Args:
        file_path: Path to the PDF file to read.

    Returns:
        A dict with the single key ``"chunks"`` holding the result of
        ``chunk_text`` applied to the concatenated text of all pages.

    Raises:
        FileNotFoundError: If *file_path* is not an existing regular file.
        Exception: If opening the PDF, extracting text, or chunking fails.
            The message is ``"An error occurred: <original message>"`` and
            the original error is attached as ``__cause__``.
    """
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"No such file: '{file_path}'")

    try:
        pdf_reader = PdfReader(file_path)
        # Build the text with a single join instead of repeated ``+=``.
        # ``or ""`` makes a page whose extraction yields nothing (None or
        # an empty string) contribute no text instead of raising TypeError.
        extracted_text = "".join(
            page.extract_text() or "" for page in pdf_reader.pages
        )
        # NOTE: the raw extracted text is chunked as-is; clean_text() is
        # intentionally not applied here.
        return {"chunks": chunk_text(extracted_text)}
    except Exception as e:
        # Keep the generic exception type and message callers may rely on,
        # but chain the original error so the root cause stays visible.
        raise Exception(f"An error occurred: {str(e)}") from e

# Script driver: read the PDF, dump the chunks, and report their sizes.
file_path = "gdpr.pdf"
result = read_pdf(file_path)
# Prints the whole result dict, i.e. the text of every chunk.
print(result)
print(f"Number of chunks: {len(result['chunks'])}")
# Report each chunk's size in whitespace-separated words, numbered from 1.
for i, chunk in enumerate(result['chunks']):
    print(f"Chunk {i+1} length: {len(chunk.split())} words")


# Show the distribution of chunk word counts as a histogram.
plot_chunk_lengths_histogram(result['chunks'])