In [1]:
# Make the `src` package importable: whichever of the working directory or its
# parent contains a `src` entry is put at the front of sys.path (the working
# directory wins if both qualify; nothing is added if neither does).
import pathlib
import sys

project_root = pathlib.Path.cwd()
for candidate in (project_root, project_root.parent):
    if (candidate / "src").exists():
        sys.path.insert(0, str(candidate))
        break


In [2]:
import base64
import os
import pathlib
import json

from dotenv import load_dotenv
from google import genai
from google.genai import types

from src.domain.prompts.balance_sheet import balance_sheet_ocr_instructions
from src.domain.prompts.profit_and_loss import profit_and_loss_ocr_instructions
from src.domain.prompts.statement_disambiguation import statement_disambiguation_instructions

from src.domain.models.balance_sheet import BalanceSheet
from src.domain.models.profit_and_loss import ProfitAndLoss
from src.shared import utils

In [3]:
# Enable IPython's autoreload extension in mode 2, so imported modules
# (e.g. the `src` package) are reloaded before each cell executes and local
# edits are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [4]:
# Load variables from a .env file into the process environment (python-dotenv).
# The trailing `;` keeps the call's return value out of the cell output.
load_dotenv();

In [None]:
# Model used for every request below; override via the GENAI_MODEL env variable.
model_name = os.getenv("GENAI_MODEL", "gemini-2.5-pro")

# Sample PDF that the following cells read and send to the model
# (path is relative to the working directory).
filepath = pathlib.Path('sample_data/bohm/výkazy 2024 Bohm Plast CZ.pdf')
# Alternative sample input:
# filepath = pathlib.Path('sample_data/Výroční zpráva ZS Dublovice včetně zprávy auditora 2024.pdf')

# NOTE(review): no credentials are passed explicitly, so the client presumably
# takes its API key from the environment — confirm against the SDK docs.
client = genai.Client()

In [6]:
# Send the PDF together with the statement-disambiguation prompt and request
# a JSON reply.
pdf_part = types.Part.from_bytes(
    mime_type="application/pdf",
    data=filepath.read_bytes(),
)
response = client.models.generate_content(
    model=model_name,
    contents=[pdf_part, statement_disambiguation_instructions],
    config=dict(response_mime_type="application/json"),
)

In [7]:
# Parse the model's reply. Uses the same fallback as the OCR parsing cells
# further down: when the text is not strict JSON, hand the raw text to
# utils.load_json_from_text instead of failing with JSONDecodeError.
try:
    disambiguation_result = json.loads(response.text)
except json.JSONDecodeError:
    disambiguation_result = utils.load_json_from_text(response.text)

In [8]:
# Display the parsed disambiguation result.
disambiguation_result

{'rozvaha': True, 'výkaz_zisku_a_ztráty': True, 'datum': '2024-12-31'}

In [9]:
# Send the PDF together with the balance-sheet OCR prompt and request a JSON
# reply.
pdf_part = types.Part.from_bytes(
    mime_type="application/pdf",
    data=filepath.read_bytes(),
)
response = client.models.generate_content(
    model=model_name,
    contents=[pdf_part, balance_sheet_ocr_instructions],
    config=dict(response_mime_type="application/json"),
)

In [10]:
# Parse the balance-sheet reply: strict JSON first, and if that fails hand the
# same text to the project helper utils.load_json_from_text.
reply_text = response.text
try:
    ocr_result = json.loads(reply_text)
except json.JSONDecodeError:
    ocr_result = utils.load_json_from_text(reply_text)

In [11]:
# Build the validated BalanceSheet from the parsed OCR result.
# NOTE(review): tolerance=1 presumably allows a difference of up to 1 in the
# model's consistency checks (rounding) — confirm in
# BalanceSheet.model_validate_with_tolerance.
validated = BalanceSheet.model_validate_with_tolerance(ocr_result, tolerance=1)

In [12]:
# Display the validated balance-sheet rows (a dict with integer keys, as the
# output below shows).
validated.data

{1: BalanceSheetRow(brutto=277437, korekce=-79710, netto=197727, netto_minule=154186),
 2: BalanceSheetRow(brutto=0, korekce=0, netto=0, netto_minule=0),
 3: BalanceSheetRow(brutto=121228, korekce=-79710, netto=41518, netto_minule=54231),
 4: BalanceSheetRow(brutto=11808, korekce=-1502, netto=10306, netto_minule=6811),
 5: BalanceSheetRow(brutto=0, korekce=0, netto=0, netto_minule=0),
 6: BalanceSheetRow(brutto=8877, korekce=-1502, netto=7375, netto_minule=5824),
 7: BalanceSheetRow(brutto=8877, korekce=-1502, netto=7375, netto_minule=5824),
 8: BalanceSheetRow(brutto=0, korekce=0, netto=0, netto_minule=0),
 9: BalanceSheetRow(brutto=0, korekce=0, netto=0, netto_minule=0),
 10: BalanceSheetRow(brutto=0, korekce=0, netto=0, netto_minule=0),
 11: BalanceSheetRow(brutto=2931, korekce=0, netto=2931, netto_minule=987),
 12: BalanceSheetRow(brutto=0, korekce=0, netto=0, netto_minule=0),
 13: BalanceSheetRow(brutto=2931, korekce=0, netto=2931, netto_minule=987),
 14: BalanceSheetRow(brutto=10

In [13]:
# Send the PDF together with the profit-and-loss OCR prompt and request a JSON
# reply.
pdf_part = types.Part.from_bytes(
    mime_type="application/pdf",
    data=filepath.read_bytes(),
)
response = client.models.generate_content(
    model=model_name,
    contents=[pdf_part, profit_and_loss_ocr_instructions],
    config=dict(response_mime_type="application/json"),
)

In [14]:
# Parse the profit-and-loss reply: strict JSON first, and if that fails hand
# the same text to the project helper utils.load_json_from_text.
# NOTE: this rebinds `ocr_result`, replacing the balance-sheet result parsed
# in the earlier cell.
reply_text = response.text
try:
    ocr_result = json.loads(reply_text)
except json.JSONDecodeError:
    ocr_result = utils.load_json_from_text(reply_text)

In [15]:
# Build the validated ProfitAndLoss from the parsed OCR result.
# NOTE(review): tolerance=1 presumably allows a difference of up to 1 in the
# model's consistency checks (rounding) — confirm in
# ProfitAndLoss.model_validate_with_tolerance.
validated_profit_and_loss = ProfitAndLoss.model_validate_with_tolerance(ocr_result, tolerance=1)

In [16]:
# Display the validated profit-and-loss rows (a dict with integer keys, as the
# output below shows).
validated_profit_and_loss.data

{1: ProfitAndLossRow(současné=495982, minulé=371072),
 2: ProfitAndLossRow(současné=0, minulé=0),
 3: ProfitAndLossRow(současné=171628, minulé=138856),
 4: ProfitAndLossRow(současné=0, minulé=0),
 5: ProfitAndLossRow(současné=67266, minulé=61876),
 6: ProfitAndLossRow(současné=104362, minulé=76980),
 7: ProfitAndLossRow(současné=-6481, minulé=-4383),
 8: ProfitAndLossRow(současné=0, minulé=0),
 9: ProfitAndLossRow(současné=133405, minulé=120619),
 10: ProfitAndLossRow(současné=98997, minulé=89546),
 11: ProfitAndLossRow(současné=34408, minulé=31073),
 12: ProfitAndLossRow(současné=32436, minulé=29541),
 13: ProfitAndLossRow(současné=1972, minulé=1532),
 14: ProfitAndLossRow(současné=18687, minulé=19122),
 15: ProfitAndLossRow(současné=18687, minulé=19122),
 16: ProfitAndLossRow(současné=18687, minulé=19122),
 17: ProfitAndLossRow(současné=0, minulé=0),
 18: ProfitAndLossRow(současné=0, minulé=0),
 19: ProfitAndLossRow(současné=0, minulé=0),
 20: ProfitAndLossRow(současné=235, minulé=54

In [17]:
# Cross-statement check for the current period: balance-sheet row 99 (netto)
# is compared with profit-and-loss row 53 (současné).
# NOTE(review): both rows presumably hold the period's profit/loss — confirm
# the row numbers against the statement templates.
balance_sheet_row = validated.data[99]
profit_and_loss_row = validated_profit_and_loss.data[53]
balance_sheet_row.netto == profit_and_loss_row.současné

True

In [18]:
# Cross-statement check for the previous period: balance-sheet row 99
# (netto_minule) is compared with profit-and-loss row 53 (minulé).
# NOTE(review): both rows presumably hold the period's profit/loss — confirm
# the row numbers against the statement templates.
balance_sheet_row = validated.data[99]
profit_and_loss_row = validated_profit_and_loss.data[53]
balance_sheet_row.netto_minule == profit_and_loss_row.minulé

True