In [1]:
import ollama
import fitz 
import pdfplumber
import json
from IPython.display import display
from PIL import Image


In [2]:

def convert_pdf_to_images(pdf_path):
    images = convert_from_path(pdf_path)
    return images


In [3]:

def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text("text") + "\n"
    return text.strip()


In [4]:

def extract_tables_from_pdf(pdf_path):
    tables = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            extracted_tables = page.extract_tables()
            for table in extracted_tables:
                headers = table[0] 
                for row in table[1:]:
                    row_dict = {headers[i]: row[i] for i in range(len(headers))}
                    tables.append(row_dict)
    return tables


In [5]:

def structure_data_with_ollama(text, tables):
    prompt = f"""
    Extract structured data from this purchase order.
    - Headers: Extract all relevant key-value pairs.
    - list_items: Extract table data as structured lists.
    - Do not summarize or omit any data.
    - Add all list_items properly.

    Text:
    {text}

    Tables:
    {tables}

    Format response as:
    {{
        "headers": {{"key1": "value1", "key2": "value2"}},
        "list_items": [{{"column1": "value", "column2": "value"}}]
    }}
    """

    response = ollama.chat(model="mistral", messages=[{"role": "user", "content": prompt}])
    # response = ollama.chat(model="llama3", messages=[{"role": "user", "content": prompt}])

    return response['message']['content']


In [6]:

pdf_path = "Document Extraction 3.pdf" 
print(f"Processing {pdf_path}...")


Processing Document Extraction 3.pdf...


In [7]:

extracted_text = extract_text_from_pdf(pdf_path)
extracted_tables = extract_tables_from_pdf(pdf_path)


In [8]:

structured_json = structure_data_with_ollama(extracted_text, extracted_tables)


In [9]:
print("Raw Ollama Response:")
print(structured_json)  # Debugging: Check if response is valid JSON

Raw Ollama Response:
 Here is the structured data extracted from the given text:

    ```json
    {
        "headers": {
            "PURCHASE ORDER TO": "Construction Masters",
            "SERVICE DESCRIPTION": "",
            "ITEM DESCRIPTION": "",
            "COMMENTS": "",
            "DATE": "01/01/2022",
            "APPROVED BY": "",
            "TOTAL": "",
            "TOTAL MATERIALS": "",
            "ACCEPTED BY": "",
            "TOTAL SERVICES": "",
            "SHIP TO": "270 Construction Road Drive, Dayton, 25143 OH, PH: 555-1524-58554",
            "PURCHASE ORDER": "Total Job Cost",
            "TAX": "10%",
            "PO #": "0001253"
        },
        "list_items": [
            {
                "column1": "[Days]",
                "column2": "",
                "column3": "",
                "TOTAL PRICE": ""
            },
            {
                "column1": "#",
                "column2": "Service Description",
                "column3": "TOTAL PRICE"

In [10]:

with open("purchase_order_data.json", "w") as f:
    json.dump(structured_json, f, indent=4)


In [11]:

print("Purchase Order Data Extracted:")
print(structured_json)


Purchase Order Data Extracted:
 Here is the structured data extracted from the given text:

    ```json
    {
        "headers": {
            "PURCHASE ORDER TO": "Construction Masters",
            "SERVICE DESCRIPTION": "",
            "ITEM DESCRIPTION": "",
            "COMMENTS": "",
            "DATE": "01/01/2022",
            "APPROVED BY": "",
            "TOTAL": "",
            "TOTAL MATERIALS": "",
            "ACCEPTED BY": "",
            "TOTAL SERVICES": "",
            "SHIP TO": "270 Construction Road Drive, Dayton, 25143 OH, PH: 555-1524-58554",
            "PURCHASE ORDER": "Total Job Cost",
            "TAX": "10%",
            "PO #": "0001253"
        },
        "list_items": [
            {
                "column1": "[Days]",
                "column2": "",
                "column3": "",
                "TOTAL PRICE": ""
            },
            {
                "column1": "#",
                "column2": "Service Description",
                "column3": "TO