# OCR Financial Statements Example

This notebook demonstrates how to use the Valuagent OCR capabilities to extract financial data from PDF documents.

## Prerequisites

Before running this notebook, make sure you have:

1. **Environment Variables**: Create a `.env` file in the project root with:
   ```
   GOOGLE_API_KEY=your_google_api_key_here
   GENAI_MODEL=gemini-2.5-pro  # optional, defaults to gemini-2.5-pro
   ```

2. **Sample Data**: The notebook uses sample PDF files from the `sample_data/` directory.

3. **Dependencies**: Install all dependencies using Poetry:
   ```bash
   poetry install
   ```


In [None]:
import sys
import pathlib
# Make the project's `src` package importable. The notebook may be started
# either from the project root or from a directory one level below it, so the
# current working directory is checked first and its parent second; the first
# one that contains `src/` is put at the front of `sys.path`.
project_root = pathlib.Path.cwd()
for candidate_root in (project_root, project_root.parent):
    if (candidate_root / "src").exists():
        sys.path.insert(0, str(candidate_root))
        break


In [9]:
import base64
import os
import pathlib
import json

from dotenv import load_dotenv

from src.domain.prompts.balance_sheet import balance_sheet_ocr_instructions
from src.domain.prompts.profit_and_loss import profit_and_loss_ocr_instructions
from src.domain.prompts.statement_disambiguation import statement_disambiguation_instructions

from src.domain.models.balance_sheet import BalanceSheet
from src.domain.models.profit_and_loss import ProfitAndLoss
from src.shared import utils
from src.infrastructure.clients.genai_client import generate_json_from_pdf_async

In [10]:
# Load the IPython `autoreload` extension (`%reload_ext` is safe to re-run if
# the extension is already loaded).
%reload_ext autoreload
# Mode 2: re-import modified modules before each cell is executed, so edits to
# the project's `src` code are picked up without restarting the kernel.
%autoreload 2

In [11]:
# Load the variables from the `.env` file described in the Prerequisites
# (e.g. GOOGLE_API_KEY, GENAI_MODEL) into the process environment.
# The trailing `;` suppresses the call's return value in the cell output.
load_dotenv();

In [12]:
# Select the sample PDF to process and the model to use.
#
# The sample path is written relative to the project root. The sys.path setup
# cell accepts a working directory that is either the project root or one
# level below it, so the same two locations are tried here. If the file is
# found in neither, the original relative path is kept and the error surfaces
# when the file is read.
#
# Alternative sample:
# filepath = pathlib.Path('sample_data/bohm/výkazy 2024 Bohm Plast CZ.pdf')
sample_pdf = pathlib.Path('sample_data/Výroční zpráva ZS Dublovice včetně zprávy auditora 2024.pdf')
filepath = next(
    (
        candidate
        for candidate in (sample_pdf, pathlib.Path("..") / sample_pdf)
        if candidate.exists()
    ),
    sample_pdf,
)

# `or` (rather than a getenv default) also covers an empty `GENAI_MODEL=` line
# in `.env`, which would otherwise yield an empty model name.
model_name = os.getenv("GENAI_MODEL") or "gemini-2.5-pro"

In [13]:
response_text = await generate_json_from_pdf_async(
    pdf_bytes=filepath.read_bytes(),
    prompt=statement_disambiguation_instructions,
    model=model_name
)

In [14]:
# Parse the model's reply. Strict JSON parsing is tried first; if the reply is
# not valid JSON as-is, fall back to the project's `load_json_from_text` helper.
try:
    disambiguation_result = json.loads(response_text)
except json.JSONDecodeError:
    disambiguation_result = utils.load_json_from_text(response_text)

# Display the parsed result so the outcome of the disambiguation step is
# visible in the notebook (it is not used by any later cell).
disambiguation_result

In [15]:
response_text = await generate_json_from_pdf_async(
    pdf_bytes=filepath.read_bytes(),
    prompt=balance_sheet_ocr_instructions,
    model=model_name
)

In [16]:
# Parse the reply currently held in `response_text` (the balance-sheet request
# when cells are run in order). Strict JSON parsing is tried first; on
# JSONDecodeError the project's `load_json_from_text` helper is used instead
# (presumably tolerant of extra text around the JSON — confirm in src.shared.utils).
try:
    ocr_result = json.loads(response_text)
except json.JSONDecodeError:
    ocr_result = utils.load_json_from_text(response_text)

In [17]:
# Build a BalanceSheet from the parsed OCR payload.
# NOTE(review): `tolerance=1` presumably allows a difference of 1 unit in the
# model's consistency checks — confirm in BalanceSheet.model_validate_with_tolerance.
validated = BalanceSheet.model_validate_with_tolerance(ocr_result, tolerance=1)

In [18]:
# Display the validated balance-sheet rows: a dict keyed by row number whose
# values are BalanceSheetRow objects (brutto, korekce, netto, netto_minule).
validated.data

{1: BalanceSheetRow(brutto=783093, korekce=356515, netto=426578, netto_minule=436197),
 2: BalanceSheetRow(brutto=0, korekce=0, netto=0, netto_minule=0),
 3: BalanceSheetRow(brutto=664524, korekce=356328, netto=308196, netto_minule=318187),
 4: BalanceSheetRow(brutto=0, korekce=0, netto=0, netto_minule=0),
 5: BalanceSheetRow(brutto=0, korekce=0, netto=0, netto_minule=0),
 6: BalanceSheetRow(brutto=0, korekce=0, netto=0, netto_minule=0),
 7: BalanceSheetRow(brutto=0, korekce=0, netto=0, netto_minule=0),
 8: BalanceSheetRow(brutto=0, korekce=0, netto=0, netto_minule=0),
 9: BalanceSheetRow(brutto=0, korekce=0, netto=0, netto_minule=0),
 10: BalanceSheetRow(brutto=0, korekce=0, netto=0, netto_minule=0),
 11: BalanceSheetRow(brutto=0, korekce=0, netto=0, netto_minule=0),
 12: BalanceSheetRow(brutto=0, korekce=0, netto=0, netto_minule=0),
 13: BalanceSheetRow(brutto=0, korekce=0, netto=0, netto_minule=0),
 14: BalanceSheetRow(brutto=664311, korekce=356317, netto=307994, netto_minule=317985

In [19]:
response_text = await generate_json_from_pdf_async(
    pdf_bytes=filepath.read_bytes(),
    prompt=profit_and_loss_ocr_instructions,
    model=model_name
)

In [20]:
# Parse the reply currently held in `response_text` (the profit-and-loss
# request when cells are run in order). Note that this overwrites the
# `ocr_result` produced earlier for the balance sheet. Strict JSON parsing is
# tried first; on JSONDecodeError the project's `load_json_from_text` helper is
# used instead.
try:
    ocr_result = json.loads(response_text)
except json.JSONDecodeError:
    ocr_result = utils.load_json_from_text(response_text)

In [21]:
# Build a ProfitAndLoss from the parsed OCR payload.
# NOTE(review): `tolerance=1` presumably allows a difference of 1 unit in the
# model's consistency checks — confirm in ProfitAndLoss.model_validate_with_tolerance.
validated_profit_and_loss = ProfitAndLoss.model_validate_with_tolerance(ocr_result, tolerance=1)

In [22]:
# Display the validated profit-and-loss rows: a dict keyed by row number whose
# values are ProfitAndLossRow objects (současné = current, minulé = previous).
validated_profit_and_loss.data

{1: ProfitAndLossRow(současné=141287, minulé=175888),
 2: ProfitAndLossRow(současné=0, minulé=0),
 3: ProfitAndLossRow(současné=117794, minulé=107696),
 4: ProfitAndLossRow(současné=0, minulé=0),
 5: ProfitAndLossRow(současné=70745, minulé=66315),
 6: ProfitAndLossRow(současné=47049, minulé=41381),
 7: ProfitAndLossRow(současné=-14539, minulé=19045),
 8: ProfitAndLossRow(současné=-3923, minulé=-4633),
 9: ProfitAndLossRow(současné=27486, minulé=26891),
 10: ProfitAndLossRow(současné=19283, minulé=18924),
 11: ProfitAndLossRow(současné=8203, minulé=7967),
 12: ProfitAndLossRow(současné=6591, minulé=6308),
 13: ProfitAndLossRow(současné=1612, minulé=1659),
 14: ProfitAndLossRow(současné=18102, minulé=22534),
 15: ProfitAndLossRow(současné=22080, minulé=22049),
 16: ProfitAndLossRow(současné=22080, minulé=22049),
 17: ProfitAndLossRow(současné=0, minulé=0),
 18: ProfitAndLossRow(současné=-3978, minulé=485),
 19: ProfitAndLossRow(současné=0, minulé=0),
 20: ProfitAndLossRow(současné=25670,

In [23]:
# Cross-statement consistency check for the current period: balance-sheet
# row 99 (netto) must equal profit-and-loss row 53 (současné).
# NOTE(review): both rows presumably hold the profit/loss for the period —
# confirm the row numbers against the statement templates.
balance_sheet_current = validated.data[99].netto
profit_and_loss_current = validated_profit_and_loss.data[53].současné
balance_sheet_current == profit_and_loss_current

True

In [24]:
# Cross-statement consistency check for the previous period: balance-sheet
# row 99 (netto_minule) must equal profit-and-loss row 53 (minulé).
# NOTE(review): both rows presumably hold the profit/loss for the period —
# confirm the row numbers against the statement templates.
balance_sheet_previous = validated.data[99].netto_minule
profit_and_loss_previous = validated_profit_and_loss.data[53].minulé
balance_sheet_previous == profit_and_loss_previous

True