### Experiment 1: Per-page text extraction from a remote PDF with PyMuPDF

In [None]:
import fitz # From PyMuPDF for PDF processing
import aiohttp
from langchain.schema import Document

text_chunks = []
file_urls = ["https://emcdevstoragev2.blob.core.windows.net/document-analysis/42aadc49-c7bf-4055-97d6-b3933554d1a1"]

for file_url in file_urls:
    async with aiohttp.ClientSession() as session:
        async with session.get(file_url) as response:
            response.raise_for_status()
            pdf_bytes = await response.read()

    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        for page_num, page in enumerate(doc, start=1):
            page_text = page.get_text()
            doucment = Document(page_content=page_text, metadata={"page": page_num})
            text_chunks.append(doucment)
        
print(text_chunks)

### Experiment 2: Text and LLM-cleaned table extraction with pdfplumber, then chunking

In [None]:
from langchain_core.prompts import ChatPromptTemplate

# Prompt that asks the model to clean a raw extracted table and answer with JSON only.
# The single template variable is `table_content`, filled with the raw table text.
doc_analyse_prompt = ChatPromptTemplate.from_template(
    """
    **You are a data-cleaning assistant.**

    I will give you a raw table (possibly from a PDF) that may include:

    - Merged or split cells  
    - Empty columns or rows  
    - Repeated header rows  
    - Multi-line entries  
    - Mixed logical sections  

    ---

    ### Your job:

    1. **Detect separate sections**  
    If the table contains two or more distinct blocks—each with its own header—treat them as separate tables.

    2. **Clean each table:**  
    - **Drop** any fully empty rows or columns.  
    - **Remove** duplicate header rows (keep only the first header in each section).  
    - **Flatten** multi-line cells into single lines and trim extra whitespace.  
    - **Use** the first line of each section as the header.

    3. **Output**  
    - For each section produce a JSON array of objects.  
    - Use the cleaned header labels (in lowerCamelCase or snake_case) as keys.  
    - If there are multiple sections, label each output (e.g. `"policies": […]`, `"errors": […]`).
    - Just return the JSON without any additional text or explanations.
    
    Here's the raw table data:
    {table_content}
    """
)

In [None]:
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
from dotenv import load_dotenv

# Load settings from a .env file before the client is constructed.
# NOTE(review): presumably the Azure OpenAI endpoint and API key are supplied
# through environment variables, since none are passed explicitly — confirm.
load_dotenv()

# Chat model used by the table-cleaning chain in `jsonify_tables`.
azOpenAIllm = AzureChatOpenAI(
    azure_deployment="gpt-4o-mini",
    api_version="2025-01-01-preview",
)

# Disabled sample call: uncomment to check that the deployment responds.
# messages = [
#     (
#         "system",
#         "You are a helpful assistant that translates English to French. Translate the user sentence.",
#     ),
#     ("human", "I love programming."),
# ]

# azOpenAIllm.invoke(messages)

In [None]:
import json
from typing import List

def jsonify_tables(tables: List[List[List[str | None]]]) -> list[str | dict]:
    tables_json = []
        
    for table in tables:
        rows = ""
        
        for row in table:
            # Filter out empty rows. Example: ["", "name", "age", None, "city"] -> ["name", "age", "city"]
            filtered_row = [col for col in row if col not in ('', None)]
            # Structure the row. Example: "name | age | city"
            final_row = " | ".join([x.strip().replace('\n', '') for x in filtered_row])
            # Add the structured row to the rows string
            rows += final_row + "\n"
            
        chain = doc_analyse_prompt | azOpenAIllm
        
        response = chain.invoke({
            "table_content": rows
        })
        
        tables_json.append(response.content)
        
    return tables_json

In [None]:
def pretty_print_json(json_data: str) -> str:
    """Return `json_data` re-indented as JSON, tolerating a Markdown code fence.

    A leading ``` fence (with or without a `json` language tag, any case) and a
    trailing ``` fence are removed before parsing.

    Args:
        json_data: Text expected to contain a JSON document, optionally fenced.

    Returns:
        The JSON re-serialized with a 2-space indent and non-ASCII characters
        kept as-is. If the text is not valid JSON, the fence-stripped text is
        returned with surrounding whitespace removed.
    """
    content = json_data.strip()

    if content.startswith("```"):
        content = content[3:]
        # Optional language tag directly after the opening fence.
        if content[:4].lower() == "json":
            content = content[4:]
        # `content` was stripped above, so a closing fence sits at the very end.
        if content.endswith("```"):
            content = content[:-3]

    try:
        parsed = json.loads(content)
    except (ValueError, RecursionError):
        # Best effort: hand back the raw text when it is not parseable JSON.
        return content.strip()

    return json.dumps(parsed, indent=2, ensure_ascii=False)

In [None]:
import pdfplumber
import requests

from io import BytesIO
from langchain.schema import Document
     
def process_pdf_tables(url: str, timeout: float = 30.0) -> List[Document]:
    """Download a PDF and return its page text and cleaned tables as Documents.

    For every page, one Document holds the extracted text, followed by one
    Document per non-empty cleaned table from that page.

    Args:
        url: Location of the PDF to download.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        Documents in page order. Each has metadata {"page": <page number>};
        table Documents additionally carry "table": True.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within `timeout`.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    extracted = []

    with pdfplumber.open(BytesIO(response.content)) as pdf:
        for page in pdf.pages:
            # Guard against a falsy result (e.g. None) for pages without text,
            # which is not valid Document content.
            text = page.extract_text() or ""
            tables = page.extract_tables()

            # Have the LLM convert each raw table to JSON text, then normalize the formatting.
            j_tables = jsonify_tables(tables)
            pretty_tables = [pretty_print_json(tbl) for tbl in j_tables]

            # Page text first, then that page's tables.
            extracted.append(Document(page_content=text, metadata={"page": page.page_number}))
            extracted.extend(
                Document(page_content=pt, metadata={"page": page.page_number, "table": True})
                for pt in pretty_tables
                if pt
            )

    return extracted


In [None]:
# PDF to run through `process_pdf_tables`; the resulting list is reused by later cells.
url = "https://emcdevstoragev2.blob.core.windows.net/public/88e6c6c9-afc8-4c6d-bc77-c35bc0af71de.pdf"
        
documents = process_pdf_tables(url)

In [None]:
# Show every extracted Document (page text and table entries) in order.
for document in documents:
    print(document)

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Size of every table document; the largest one sizes the table splitter so
# that no table exceeds a single chunk.
table_lengths = [
    len(document.page_content)
    for document in documents
    if document.metadata.get("table")
]
max_table_length = max(table_lengths, default=None)

# Splitter for regular page text.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100,
    length_function=len,
)

# Splitter for table documents; falls back to the text settings when there are no tables.
table_splitter = RecursiveCharacterTextSplitter(
    chunk_size=max_table_length or 500,
    chunk_overlap=(max_table_length // 5) if max_table_length else 100,
    length_function=len,
)

splitted_doc: List[Document] = []

for doc in documents:
    # Pick the splitter that matches the document type.
    splitter = table_splitter if doc.metadata.get("table") else text_splitter
    # split_documents returns a list of Documents, so extend (append would nest the list).
    splitted_doc.extend(splitter.split_documents([doc]))

for chunk in splitted_doc:
    print(chunk.page_content)