## Data Extraction Eval Toolset Development

In [None]:
%pip install pdf2image

In [None]:
%pip install pillow

In [None]:
%pip install python-dotenv

In [None]:
%pip install -U langchain-ollama

In [None]:
%pip install mistralai

In [None]:
%pip install openai

In [None]:
poppler_path = r"C:\Users\HP Victus\Downloads\Release-24.08.0-0\poppler-24.08.0\Library\bin"
print("Poppler path set to:", poppler_path)

#### Loading the environment variables

In [11]:
from dotenv import load_dotenv
import os

load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")
mistral_ocr_key = os.getenv("MISTRAL_OCR_KEY")
mistral_local_LLM = os.getenv("MISTRAL_LOCAL_LLM")
ollama_model_seven = os.getenv("OLLAMA_MODEL_SEVEN")
ollama_model_eight = os.getenv("OLLAMA_MODEL_EIGHT")
ollama_model_nine = os.getenv("OLLAMA_MODEL_NINE")
ollama_model_ten = os.getenv("OLLAMA_MODEL_TEN")


In [2]:
max_tokens=2048
temperature = 0.2
top_p=0.9
presence_penalty=0.0
frequency_penalty=0.0


#### Creating a function to convert pdf pages to images using pdf2image and poppler

In [12]:
from PyPDF2 import PdfReader, PdfWriter
import os
from pdf2image import convert_from_path
from pdf2image.exceptions import PDFPageCountError

def convert_pdf_to_image(child_pdf_path):
    # Ensure the main images directory exists
    images_output_folder = './images'
    os.makedirs(images_output_folder, exist_ok=True)
    child_pdf_name = os.path.basename(child_pdf_path).split('.')[0]
    print(f"Child PDF Name: {child_pdf_name}")

    print(f"Processing: {child_pdf_path}")

    try:
        # Convert PDF pages to images
        images = convert_from_path(child_pdf_path, poppler_path=poppler_path)
        
        # Save the images to the subfolder
        for i, img in enumerate(images):
            image_path = os.path.join(images_output_folder, f'{child_pdf_name}.png')
            img.save(image_path, 'PNG')

        print(f"Saved images for {os.path.basename(child_pdf_path)} in {images_output_folder}")
    
    except PDFPageCountError:
        print(f"Error: Unable to get page count for {child_pdf_path}. The file might be corrupted or not a valid PDF.")
    except FileNotFoundError:
        print(f"Error: File {child_pdf_path} not found.")
    except Exception as e:
        print(f"An unexpected error occurred while processing {child_pdf_path}: {e}")

#### Finally converting PDF to images with function invocation

In [13]:
import os

# Sort files, treating non-numeric prefixes as having the highest sort order
for file in sorted(os.listdir('./uploaded_invoices'), key=lambda x: int(x.split('.')[0]) if x.split('.')[0].isdigit() else float('inf')):
    if file.endswith('.pdf'):
        child_pdf_path = os.path.join('./uploaded_invoices', file)
        convert_pdf_to_image(child_pdf_path)

Child PDF Name: output_1
Processing: ./uploaded_invoices\output_1.pdf
Saved images for output_1.pdf in ./images


#### Writing the System Prompt for Data Extraction

In [14]:
data_extraction_system_prompt = f""" You are a data extraction system. Your task is to extract key information from
the invoice provided to you as an image."""

#### Creating a function to generate local base64 URL for an image

In [15]:
import os
import base64
from IPython.display import Image, display

def generate_base64_url(child_pdf_image_path):
            print(f"Processing image: {child_pdf_image_path}")
            
            # Read the image file in binary mode
            with open(child_pdf_image_path, "rb") as img_file:
                raw_data = img_file.read()
                image_data = base64.b64encode(raw_data).decode("utf-8")
            
            # Determine the image format
            image_format = child_pdf_image_path.split('.')[-1]
            
            # Generate the data URL (optional, for other use cases)
            data_url = f"data:image/{image_format};base64,{image_data}"
            
            # Print the data URL (or save it as needed)
            print(f"Data URL for {child_pdf_image_path}:\n{data_url[:100]}...\n")  # printing full base64 is too long
            
           
            return data_url

#### Creating a Function to Encode PDF with Base64

In [16]:
def encode_pdf(pdf_path):
    """Encode the pdf to base64."""
    try:
        with open(pdf_path, "rb") as pdf_file:
            return base64.b64encode(pdf_file.read()).decode('utf-8')
    except FileNotFoundError:
        print(f"Error: The file {pdf_path} was not found.")
        return None
    except Exception as e:  # Added general exception handling
        print(f"Error: {e}")
        return None

#### Function to Generate Markdown from MISTRAL OCR

In [17]:
import base64
import requests
import os
from mistralai import Mistral
import pprint

def generate_markdown_from_mistral_OCR(image_path):

    # Getting the base64 string
    image_base_64_path = generate_base64_url(image_path)

    
    client = Mistral(api_key=mistral_ocr_key)

    ocr_response = client.ocr.process(
        model="mistral-ocr-latest",
        document={
            "type": "image_url",
            "image_url": f"{image_base_64_path}" 
        }
    )
    
    
    print(ocr_response.pages[0].markdown)
    return str((ocr_response))

In [18]:
generate_markdown_from_mistral_OCR('./images/output_1.png')

Processing image: ./images/output_1.png
Data URL for ./images/output_1.png:
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAABnMAAAV2CAIAAADOYd7mAAEAAElEQVR4nOzdUbaqOpCAYeh15yWODB...

# Lutz Isdebski 

## Schornsteinfegermeister

## Schornsteinfegertachbetrieb

Arnoldstraße 7
63075 Offenbach am Main
St.-Nr.: 03583063531
![img-0.jpeg](img-0.jpeg)

Tel. 069 / 21002021
Mobil. 017620465243

Bankverbindung: Commerzbank BLZ:50540028 Kto.Nr.:281809400
IBAN: DE19 505400280281809400
Swift-Adresse(BIC): COBADEFF

## Rechnung

Rechnungs-Nummer: 646.000 - 1269
Rechnungs-Datum: $\quad 08.05 .2023$
Bei Zahlungs- und Schriftverkehr immer angeben!
Effizienz Check

Jahresabrechnung 2023

## Betrifft:

Kurhessenstraße 55
63075 Offenbach

Leistungsdatum: K-05.05.2023, (1-05.05.2023,
Gebåude-
Nr. Leistung
Anzahl Größe AW Betrag

| (1) | 1,0 | 27,100 | 36,32 |
| :--: | :--: | :--: | :--: |
| 1 | 10,0 | 6,100 | 8,17 |
| 1 | 10,0 | 4,100 | 5,49 |
| 1 | 1,0 | 19,280 | 25,84 |
| 1 | 1,0 | 2,240 | 3,00 |
| 1 | 1,0

"pages=[OCRPageObject(index=0, markdown='# Lutz Isdebski \\n\\n## Schornsteinfegermeister\\n\\n## Schornsteinfegertachbetrieb\\n\\nArnoldstraße 7\\n63075 Offenbach am Main\\nSt.-Nr.: 03583063531\\n![img-0.jpeg](img-0.jpeg)\\n\\nTel. 069 / 21002021\\nMobil. 017620465243\\n\\nBankverbindung: Commerzbank BLZ:50540028 Kto.Nr.:281809400\\nIBAN: DE19 505400280281809400\\nSwift-Adresse(BIC): COBADEFF\\n\\n## Rechnung\\n\\nRechnungs-Nummer: 646.000 - 1269\\nRechnungs-Datum: $\\\\quad 08.05 .2023$\\nBei Zahlungs- und Schriftverkehr immer angeben!\\nEffizienz Check\\n\\nJahresabrechnung 2023\\n\\n## Betrifft:\\n\\nKurhessenstraße 55\\n63075 Offenbach\\n\\nLeistungsdatum: K-05.05.2023, (1-05.05.2023,\\nGebåude-\\nNr. Leistung\\nAnzahl Größe AW Betrag\\n\\n| (1) | 1,0 | 27,100 | 36,32 |\\n| :--: | :--: | :--: | :--: |\\n| 1 | 10,0 | 6,100 | 8,17 |\\n| 1 | 10,0 | 4,100 | 5,49 |\\n| 1 | 1,0 | 19,280 | 25,84 |\\n| 1 | 1,0 | 2,240 | 3,00 |\\n| 1 | 1,0 | 37,810 | 37,81 |\\n| Zwischensumme EUR | 116,63 

#### Creating Function to Call GPT-4 Vision Model

In [None]:
import base64
from openai import OpenAI

def call_openai_vision_model(image_path):
    base64_path = generate_base64_url(image_path)
    # Initialize the OpenAI client
    client = OpenAI(api_key=openai_api_key)
    response = client.responses.create(
    model="gpt-4.1-mini",
    input=[
        {
            "role": "user",
            "content": [
                { "type": "input_text", "text": "what's in this image?" },
                {
                    "type": "input_image",
                    "image_url": f"{base64_path}",
                },
            ],
        }
    ],
    max_output_tokens=max_tokens,
    temperature=temperature,
    top_p=top_p,
    
    
    )

    print(response.output_text)