In [1]:
# Ensure project root is on sys.path so `src` package is importable
import sys
import pathlib


def ensure_src_importable(start: pathlib.Path) -> "pathlib.Path | None":
    """Put the directory that contains ``src`` on ``sys.path``.

    Looks in ``start`` first and then in ``start.parent``. The first
    directory that has a ``src`` entry is inserted at the front of
    ``sys.path`` unless it is already listed, so re-running the cell does
    not pile up duplicate entries.

    Args:
        start: Directory to begin the lookup from (normally the notebook's
            working directory).

    Returns:
        The directory that was found, or ``None`` when neither ``start`` nor
        its parent contains ``src`` (``sys.path`` is left untouched then).
    """
    for candidate in (start, start.parent):
        if (candidate / "src").exists():
            entry = str(candidate)
            # Guard against duplicates on repeated execution of this cell.
            if entry not in sys.path:
                sys.path.insert(0, entry)
            return candidate
    return None


# Kept as a notebook-level name: the working directory the lookup starts from.
project_root = pathlib.Path.cwd()
ensure_src_importable(project_root)


In [2]:
import base64
import os
import pathlib
import json

from dotenv import load_dotenv
from google import genai
from google.genai import types

from src.domain.prompts.balance_sheet import balance_sheet_ocr_instructions
from src.domain.prompts.profit_and_loss import profit_and_loss_ocr_instructions
from src.domain.prompts.statement_disambiguation import statement_disambiguation_instructions

from src.domain.models.balance_sheet import BalanceSheet
from src.domain.models.profit_and_loss import ProfitAndLoss
from src.shared import utils

In [3]:
# Load (or reload) IPython's autoreload extension and switch it to mode 2, so
# that edited modules are re-imported automatically before each cell runs and
# changes to project code are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [4]:
# Load variables from a .env file into the process environment before the
# client and model name are configured. The trailing `;` hides the call's
# return value from the cell output.
load_dotenv();

In [5]:
# Document under test: path to the sample PDF, relative to the working directory.
# Alternative sample that can be swapped in:
#   'sample_data/Výroční zpráva ZS Dublovice včetně zprávy auditora 2024.pdf'
filepath = pathlib.Path("sample_data/bohm/výkazy 2024 Bohm Plast CZ.pdf")

# Model identifier: the GENAI_MODEL environment variable wins, otherwise the
# default below is used.
model_name = os.environ.get("GENAI_MODEL", "gemini-2.5-pro")

# GenAI client shared by all requests in this notebook.
# NOTE(review): presumably picks up its credentials from the environment — confirm.
client = genai.Client()

In [6]:
# Step 1: send the PDF together with the disambiguation prompt and request a
# JSON reply describing which statements the document contains.
pdf_part = types.Part.from_bytes(
    data=filepath.read_bytes(),
    mime_type="application/pdf",
)
json_reply_config = {"response_mime_type": "application/json"}

response = client.models.generate_content(
    model=model_name,
    contents=[pdf_part, statement_disambiguation_instructions],
    config=json_reply_config,
)

In [7]:
# Parse the disambiguation reply. Strict JSON parsing is tried first; if the
# reply is not valid JSON, fall back to the project helper, mirroring how the
# OCR replies are parsed further down in this notebook.
# NOTE(review): presumably `load_json_from_text` recovers JSON embedded in
# surrounding text — confirm in src.shared.utils.
try:
    disambiguation_result = json.loads(response.text)
except json.JSONDecodeError:
    disambiguation_result = utils.load_json_from_text(response.text)

In [8]:
# Inspect the parsed disambiguation reply (displayed as the cell output).
disambiguation_result

{'rozvaha': True, 'výkaz_zisku_a_ztráty': True, 'datum': '2024-12-31'}

In [9]:
# Step 2: send the same PDF with the balance-sheet OCR prompt and request a
# JSON reply. Note that this overwrites `response` from the previous request.
pdf_part = types.Part.from_bytes(
    data=filepath.read_bytes(),
    mime_type="application/pdf",
)
json_reply_config = {"response_mime_type": "application/json"}

response = client.models.generate_content(
    model=model_name,
    contents=[pdf_part, balance_sheet_ocr_instructions],
    config=json_reply_config,
)

In [10]:
# Decode the balance-sheet reply: strict JSON first, project helper as the
# fallback when the reply text is not valid JSON.
raw_text = response.text
try:
    ocr_result = json.loads(raw_text)
except json.JSONDecodeError:
    ocr_result = utils.load_json_from_text(raw_text)

In [11]:
# Validate the parsed balance sheet against the model's consistency rules,
# accepting a difference of up to 1 per rule (tolerance=1).
# NOTE(review): with the current sample this raises a ValidationError (row 57
# vs. the sum of rows 58+59+60+61, difference 1823), so `validated` is never
# assigned and later cells that read it cannot run on a fresh kernel.
validated = BalanceSheet.model_validate_with_tolerance(ocr_result, tolerance=1)

ValidationError: 1 validation error for BalanceSheet
  Value error, Balance sheet validation failed:
- Rule validation failed for netto: Row 57 (25306) != Sum of rows 58+59+60+61 (23483) (difference: 1823, tolerance: 1) [type=value_error, input_value={'rok': 2024, 'data': {'1... 0, 'netto_minule': 0}}}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/value_error

In [13]:
# Debugging the validation failure: the extracted total in row 57 (netto).
ocr_result["data"]["57"]["netto"]

25306

In [14]:
# Debugging the validation failure: row 58 (netto), first of the rows summed against row 57.
ocr_result["data"]["58"]["netto"]

20667

In [15]:
# Debugging the validation failure: row 59 (netto), second of the rows summed against row 57.
ocr_result["data"]["59"]["netto"]

0

In [16]:
# Debugging the validation failure: row 60 (netto), third of the rows summed against row 57.
ocr_result["data"]["60"]["netto"]

0

In [17]:
# Debugging the validation failure: row 61 (netto), last of the rows summed against row 57.
ocr_result["data"]["61"]["netto"]

2816

In [22]:
# Recompute the discrepancy reported by the validator (row 57 minus the sum of
# rows 58-61) straight from the parsed data, instead of hand-copying the
# numbers from the outputs above, so the figure stays correct if the
# extraction is re-run or another document is used.
ocr_result["data"]["57"]["netto"] - sum(
    ocr_result["data"][row]["netto"] for row in ("58", "59", "60", "61")
)

1823

In [None]:
# Show the validated balance-sheet data. Requires the balance-sheet validation
# cell to have completed without raising, otherwise `validated` is undefined.
validated.data

In [None]:
# Step 3: send the same PDF with the profit-and-loss OCR prompt and request a
# JSON reply. Note that this overwrites `response` from the previous request.
pdf_part = types.Part.from_bytes(
    data=filepath.read_bytes(),
    mime_type="application/pdf",
)
json_reply_config = {"response_mime_type": "application/json"}

response = client.models.generate_content(
    model=model_name,
    contents=[pdf_part, profit_and_loss_ocr_instructions],
    config=json_reply_config,
)

In [None]:
# Decode the profit-and-loss reply: strict JSON first, project helper as the
# fallback when the reply text is not valid JSON. This replaces the
# balance-sheet content previously held in `ocr_result`.
raw_text = response.text
try:
    ocr_result = json.loads(raw_text)
except json.JSONDecodeError:
    ocr_result = utils.load_json_from_text(raw_text)

In [None]:
# Validate the parsed profit-and-loss statement with the same tolerance (1)
# that is used for the balance sheet.
validated_profit_and_loss = ProfitAndLoss.model_validate_with_tolerance(ocr_result, tolerance=1)

In [None]:
# Show the validated profit-and-loss data.
validated_profit_and_loss.data

In [None]:
# Cross-statement check, current period: balance-sheet row 99 (netto) must
# equal profit-and-loss row 53 (současné).
# NOTE(review): presumably both rows hold the period's profit/loss figure —
# confirm the row numbers against the statement templates.
validated.data[99].netto  == validated_profit_and_loss.data[53].současné

In [None]:
# Cross-statement check, previous period: balance-sheet row 99 (netto_minule)
# must equal profit-and-loss row 53 (minulé).
# NOTE(review): presumably both rows hold the prior period's profit/loss
# figure — confirm the row numbers against the statement templates.
validated.data[99].netto_minule  == validated_profit_and_loss.data[53].minulé