In [26]:
import ollama
import fitz 
import pdfplumber
import json
from IPython.display import display
from PIL import Image


In [27]:

def convert_pdf_to_images(pdf_path, dpi=200):
    """Render every page of a PDF to a PIL image.

    The previous implementation called ``convert_from_path``, which this
    notebook never imports, so the function raised ``NameError`` as soon as
    it was called. Rendering is now done with ``fitz`` (PyMuPDF) and
    ``PIL.Image``, both of which are imported in the first cell.

    Args:
        pdf_path: Path of the PDF file to render.
        dpi: Rendering resolution in dots per inch. The default of 200 is
            chosen to match what pdf2image's ``convert_from_path`` uses by
            default.

    Returns:
        A list with one RGB ``PIL.Image.Image`` per page, in page order.
    """
    # PDF page coordinates are expressed in points (72 per inch), so the
    # zoom factor that yields the requested resolution is dpi / 72.
    zoom = dpi / 72
    matrix = fitz.Matrix(zoom, zoom)

    images = []
    # The context manager closes the document even if rendering fails.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            # alpha=False keeps the pixmap at 3 bytes per pixel so the raw
            # samples can be handed to PIL as plain "RGB" data.
            pixmap = page.get_pixmap(matrix=matrix, alpha=False)
            images.append(
                Image.frombytes("RGB", (pixmap.width, pixmap.height), pixmap.samples)
            )
    return images


In [28]:

def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF as a single string.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages joined with newlines, with leading and
        trailing whitespace stripped. An empty string if no page has text.
    """
    # Open the document in a context manager so the file handle is released
    # even if text extraction raises; it was previously never closed.
    with fitz.open(pdf_path) as doc:
        page_texts = [page.get_text("text") for page in doc]

    # Joining once avoids rebuilding the string on every page; after strip()
    # the result is the same as appending "\n" after each page.
    return "\n".join(page_texts).strip()


In [29]:

def _normalize_headers(raw_headers):
    """Turn a table's first row into unique, non-empty dictionary keys."""
    headers = []
    seen = set()
    for position, header in enumerate(raw_headers, start=1):
        # Blank header cells get a positional name so that several of them
        # do not collapse into one key and silently drop columns.
        name = header if header not in (None, "") else f"column_{position}"
        if name in seen:
            name = f"{name}_{position}"
        seen.add(name)
        headers.append(name)
    return headers


def extract_tables_from_pdf(pdf_path):
    """Extract every table row of a PDF as a dictionary keyed by column header.

    The first row of each table is treated as its header row. Rows from all
    tables on all pages are returned in one flat list, in document order.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list of ``{header: cell}`` dictionaries, one per data row. Cells
        missing from a short row are filled with ``None``; cells beyond the
        last header are ignored. Blank header cells are named
        ``column_<position>`` and repeated headers get a ``_<position>``
        suffix so that no column overwrites another.
    """
    rows = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            for table in page.extract_tables():
                if not table:
                    # An empty table has no header row to index.
                    continue
                headers = _normalize_headers(table[0])
                for row in table[1:]:
                    rows.append(
                        {
                            header: (row[index] if index < len(row) else None)
                            for index, header in enumerate(headers)
                        }
                    )
    return rows


In [30]:

def structure_data_with_ollama(text, tables, model="mistral"):
    """Ask a local Ollama model to turn extracted PDF content into JSON text.

    Args:
        text: Plain text extracted from the purchase order.
        tables: Table rows extracted from the purchase order, as a list of
            dictionaries.
        model: Name of the Ollama model to query. Defaults to ``"mistral"``;
            pass e.g. ``"llama3"`` to try another model without editing
            this function.

    Returns:
        The model's reply as a string, stripped of surrounding whitespace.
        The reply is requested in JSON mode, but it is still text: parse it
        with ``json.loads`` before using it as structured data.
    """
    # Send the tables as JSON rather than Python repr (None, single quotes)
    # so the input is in the same notation the model is asked to produce.
    # default=str keeps unusual cell values from aborting serialization.
    tables_json = json.dumps(tables, indent=2, ensure_ascii=False, default=str)

    prompt = f"""
    Extract structured data from this purchase order.
    - Headers: Extract all relevant key-value pairs.
    - list_items: Extract table data as structured lists.
    - Do not summarize or omit any data.

    Text:
    {text}

    Tables:
    {tables_json}

    Format response as:
    {{
        "headers": {{"key1": "value1", "key2": "value2"}},
        "list_items": [{{"column1": "value", "column2": "value"}}]
    }}
    """

    response = ollama.chat(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        # JSON mode constrains the reply to a JSON document instead of
        # free-form prose that merely contains one.
        format="json",
        # Temperature 0 makes the extraction as repeatable as the model allows.
        options={"temperature": 0},
    )

    return response["message"]["content"].strip()


In [31]:

# Input document for this run; the path is relative to the notebook's
# working directory.
pdf_path = "Document Extraction 1.pdf" 
print(f"Processing {pdf_path}...")


Processing Document Extraction 1.pdf...


In [32]:

# Pull the raw material out of the PDF: the page text as one string and the
# table rows as a list of dictionaries.
extracted_text = extract_text_from_pdf(pdf_path)
extracted_tables = extract_tables_from_pdf(pdf_path)


In [33]:

# Ask the model to structure the extracted content. The result is the model's
# reply as a string, not a parsed Python object.
structured_json = structure_data_with_ollama(extracted_text, extracted_tables)


In [34]:
# Debugging aid: show the untouched model reply so it can be checked by eye
# for valid JSON.
print("Raw Ollama Response:", structured_json, sep="\n")

Raw Ollama Response:
 {
       "headers": {
          "PURCHASE ORDER #": "123456/22",
          "DATE": "22ND SEPTEMBER, 2022",
          "CLIENT INFORMATION - Name": "FASHION QUEEN",
          "CLIENT INFORMATION - Address": "108 Waldorf Street\nCoventry, 25448 IL",
          "CLIENT INFORMATION - Email": "info@fashionqueen.com",
          "CLIENT INFORMATION - Phone": "(000) 1234 56789",
          "ORDER INFORMATION - Company": "FASHION ITEMS INC Attn. Sam Martin (Chief of Finance)",
          "ORDER INFORMATION - Address": "211 Arrow Bay, Westminster, 21656 Los Angeles",
          "ORDER INFORMATION - Email": "info@fashionitems.com",
          "ORDER INFORMATION - Phone": "(555) 1234 56789",
          "SHIPPING INFO - Method": "Courier",
          "SHIPPING INFO - Company": "FedEx",
          "SHIPPING INFO - Track #": "1222052520000680",
          "SHIPPING INFO - Arrival Date": "29/09/2022",
          "PAYMENT INFO - Payment Type": "Credit Card",
          "PAYMENT INFO - TOTAL":

In [35]:

# structured_json holds the model's reply as JSON *text*. Passing that string
# straight to json.dump would write a single escaped string literal to disk,
# not an object, so it is parsed first. Slicing from the first "{" to the
# last "}" tolerates a reply that wraps the object in extra prose.
json_start = structured_json.find("{")
json_end = structured_json.rfind("}")
if json_start == -1 or json_end < json_start:
    raise ValueError("Ollama response does not contain a JSON object")
# json.loads raises json.JSONDecodeError (a ValueError) on malformed JSON;
# parsing before opening the file means a bad reply never leaves a
# half-written output behind.
purchase_order_data = json.loads(structured_json[json_start:json_end + 1])

with open("purchase_order_data.json", "w", encoding="utf-8") as f:
    json.dump(purchase_order_data, f, indent=4, ensure_ascii=False)


In [36]:

# Final report: a heading line followed by the model's reply.
print("Purchase Order Data Extracted:", structured_json, sep="\n")


Purchase Order Data Extracted:
 {
       "headers": {
          "PURCHASE ORDER #": "123456/22",
          "DATE": "22ND SEPTEMBER, 2022",
          "CLIENT INFORMATION - Name": "FASHION QUEEN",
          "CLIENT INFORMATION - Address": "108 Waldorf Street\nCoventry, 25448 IL",
          "CLIENT INFORMATION - Email": "info@fashionqueen.com",
          "CLIENT INFORMATION - Phone": "(000) 1234 56789",
          "ORDER INFORMATION - Company": "FASHION ITEMS INC Attn. Sam Martin (Chief of Finance)",
          "ORDER INFORMATION - Address": "211 Arrow Bay, Westminster, 21656 Los Angeles",
          "ORDER INFORMATION - Email": "info@fashionitems.com",
          "ORDER INFORMATION - Phone": "(555) 1234 56789",
          "SHIPPING INFO - Method": "Courier",
          "SHIPPING INFO - Company": "FedEx",
          "SHIPPING INFO - Track #": "1222052520000680",
          "SHIPPING INFO - Arrival Date": "29/09/2022",
          "PAYMENT INFO - Payment Type": "Credit Card",
          "PAYMENT INFO