In [67]:
import pandas as pd
import json
import pymupdf
import io
import base64
import importlib
import os
import copy


In [18]:
with open("../data_temp/10_bs_page_by_sha1.json", "r") as f:
    res_by_sha1 = json.load(f)

with open("../data_temp/10_bs_page_llm_text.json", "r") as f:
    balance_sheet_page_llm_text = json.load(f)

df_bs_page_text_llm = pd.read_csv("../data_temp/10_bs_page_text_llm.csv", index_col=0)
df_bs_page_text_llm["page"] = df_bs_page_text_llm["page"].astype("Int64")
df_bs_page_text_llm[:5]


Unnamed: 0,sha1,page
0,0279901b645e568591ad95dac2c2bf939ef0c00d,91
1,0981826b4b43a88920f3e01c71ae73539bab84cc,79
2,0a61a353b1ea9fd9b8f63b60239634ca3007d58f,61
3,105688726e097505beef4934896193ac51295037,32
4,12bff07b957b1c8f8cad9d917ca18005720cce9b,53


In [19]:
# Function to get text of a specific page
def get_page_text(text, page_number):
    """
    Extract text for a specified page number from the full document text.
    
    Args:
        text (str): The full document text
        page_number (int): The page number to extract
        
    Returns:
        str: The extracted text for the specified page
    """
    page_text = ""
    # Find the start and end of the page in the text
    page_marker = f"{{{page_number}}}------------------------------------------------"
    page_start = text.find(page_marker)
    if page_start != -1:
        page_start = page_start + len(page_marker)
        # Look for the next page marker or end of text
        next_page_marker = f"{{{page_number + 1}}}------------------------------------------------"
        next_page_start = text.find(next_page_marker, page_start)
        if next_page_start != -1:
            page_text = text[page_start:next_page_start]
        else:
            # If there's no next page, take all text until the end
            page_text = text[page_start:]
        return page_text
    else:
        return f"Page {page_number} is not found in the document."


def get_sha1_page_text(sha1, page_number):
    """
    Load the markdown file for a specific sha1 and extract text for a specified page.
    
    Args:
        sha1 (str): The SHA1 hash of the document
        page_number (int): The page number to extract
        
    Returns:
        str: The extracted text for the specified page
    """
    # Construct the path to the markdown file
    output_path = f"../data_in/temp/EnterpriseRAG_2025_02_markdown/{sha1}/{sha1}.md"
    
    # Read the file
    try:
        with open(output_path, "r", encoding="utf-8") as f:
            text = f.read()
        
        # Extract and return the page text
        return get_page_text(text, page_number)
    except FileNotFoundError:
        return f"File for SHA1 {sha1} not found."

In [23]:
for row_id, row in df_bs_page_text_llm.iloc[0:1].iterrows():
    sha1 = row["sha1"]
    page_number = row["page"]
    text_page = get_sha1_page_text(sha1, page_number)
print(text_page)



# **ACRES COMMERCIAL REALTY CORP. AND SUBSIDIARIES CONSOLIDATED BALANCE SHEETS (in thousands, except share and per share data)**

|                                                                                         |                 | December 31 |           |
|-----------------------------------------------------------------------------------------|-----------------|-------------|-----------|
|                                                                                         | 2022            |             | 2021      |
| (1)<br>ASSETS                                                                           |                 |             |           |
| Cash and cash equivalents                                                               | \$<br>66,232    | \$          | 35,500    |
| Restricted cash                                                                         | 38,579          |             | 248,431   |
| Accrued interest receivable                       

In [11]:
row_id

'0279901b645e568591ad95dac2c2bf939ef0c00d'

In [32]:
filename = f"../data_temp/05_cut_pdfs/{sha1}/{sha1}_{page_number:04d}.pdf"
doc = pymupdf.open(filename)
page = doc.load_page(0)
zoom = 300 / 72
matrix = pymupdf.Matrix(zoom, zoom)
pix = page.get_pixmap(matrix=matrix)
import os

# Create directory if it doesn't exist
os.makedirs(f"../data_temp/11_png_pages", exist_ok=True)
png_bytes = io.BytesIO(pix.tobytes())
png_image = base64.b64encode(png_bytes.getvalue()).decode("utf-8")
# pix.save(f"../data_temp/11_png_pages/{sha1}_{page_number:04d}.png")
png_image[:10]

'iVBORw0KGg'

In [84]:
importlib.reload(importlib.import_module("utils.llm"))
import utils.llm as llm

In [56]:

in_params = [{'png_image': png_image}]
res = await llm.run_tasks_with_retries(llm.balance_sheet_pgn_extraction, in_params, retries=0)
res

Sending for one question started
1


[(0,
  {'png_image': 'iVBORw0KGgoAAAANSUhEUgAACfYAAAzkCAIAAACYgxr+AAAACXBIWXMAAA7EAAAOxAGVKw4bAAjXgElEQVR4nOzdd5zVVP7/8SnUoRdpUgVBepEuiCKyCwgiCljpKEhRkSbIPhARBd1lKYKIjaLOoBRBEUTYVaQLyFIEho6gdJihzQAzv8938t387jcnyc2dmwwTfD3/2IfMPTk5OUlO7p73TRKRCgAAAAAAAAAAAADwiYib3QAAAAAAAAAAAAAAgFNEvAAAAAAAAAAAAADgG0S8AAAAAAAAAAAAAOAbRLwAAAAAAAAAAAAA4BtEvAAAAAAAAAAAAADgG0S8AAAAAAAAAAAAAOAbRLwAAAAAAAAAAAAA4BtEvAAAAAAAAAAAAADgG0S8AAAAAAAAAAAAAOAbRLwAAAAAAAAAAAAA4BtEvAAAAAAAAAAAAADgG0S8AAAAAAAAAAAAAOAbRLwAAAAAAAAAAAAA4BtEvAAAAAAAAAAAAADgG0S8AAAAAAAAAAAAAOAbRLwAAAAAAAAAAAAA4BtEvAAAAAAAAAAAAADgG0S8AAAAAAAAAAAAAOAbRLwAAAAAAAAAAAAA4BtEvAAAAAAAAAAAAADgG0S8AAAAAAAAAAAAAOAbRLwAAAAAAAAAAAAA4BtEvAAAAAAAAAAAAADgG0S8AAAAAAAAAAAAAOAbRLwAAAAAAAAAAAAA4BtEvAAAAAAAAAAAAADgG0S8AAAAAAAAAAAAAOAbRLwAAAAAAAAAAAAA4BtEvAAAAAAAAAAAAADgG0S8AAAAAAAAAAAAAOAbRLwAAAAAAAAAAAAA4BtEvAAAAAAAAAAAAADgG0S8AAAAAAAAAAAAAOAbRLwAAAAAAAAAAAAA4BtEvAAAAAAAAAAAAADgG0S8AAAAAAAAAAAAAOAbRLwAAAAAAAAAAAAA4BtEvAAAAAAAAAAAAADgG0S8AAAAAAAAAAAAAOAbRLwAAAAAAA

In [57]:
print(res[0][2]["balance_sheet_markdown_table"])

|                | 2022      | 2021      |
|----------------|-----------|-----------|
| cash and cash equivalents | 66,232    | 35,500    |
| restricted cash | 38,579    | 248,431   |
| accrued interest receivable | 11,969    | 6,112     |
| cre loans | 2,057,590 | 1,882,551 |
| less: allowance for credit losses | (18,803)  | (8,805)   |
| cre loans, net | 2,038,787 | 1,873,746 |
| principal paydowns receivable | -         | 14,899    |
| loan receivable - related party | 11,275    | 11,575    |
| investments in unconsolidated entities | 1,548     | 1,548     |
| properties held for sale | 53,769    | 17,846    |
| investments in real estate | 120,968   | 59,308    |
| right of use assets | 20,281    | 5,951     |
| intangible assets | 8,880     | 3,877     |
| other assets | 4,364     | 5,482     |
| total assets | 2,376,652 | 2,284,275 |
| accounts payable and other liabilities | 10,391    | 7,025     |
| management fee payable - related party | 898       | 561       |
| accrued inte

In [94]:
in_params = []
in_sha1 = []
for row_id, row in df_bs_page_text_llm.iterrows():
    sha1 = row["sha1"]
    page_number = row["page"]
    if pd.isna(page_number):
        continue
    text_page = get_sha1_page_text(sha1, page_number)
    filename = f"../data_temp/05_cut_pdfs/{sha1}/{sha1}_{page_number:04d}.pdf"
    doc = pymupdf.open(filename)
    page = doc.load_page(0)
    zoom = 300 / 72
    matrix = pymupdf.Matrix(zoom, zoom)
    pix = page.get_pixmap(matrix=matrix)


    # Create directory if it doesn't exist
    os.makedirs(f"../data_temp/11_png_pages", exist_ok=True)
    png_bytes = io.BytesIO(pix.tobytes())
    png_image = base64.b64encode(png_bytes.getvalue()).decode("utf-8")

    in_params.append({"png_image": png_image, "page_number": page_number, "sha1": sha1})
    in_sha1.append(sha1)

res = await llm.run_tasks_with_retries(llm.balance_sheet_pgn_extraction, in_params, retries=5)


Sending for one question started
0000000000000000000000000000000000000000000000100000000000
0001000000000000000000000000000000000000000000100000000000
0001000000000000000000000000000000000000000000110000000000
0001000000000000000000000000000000000000000100110000000000
0001000000000000000000000000000000000001000100110000000000
0001010000000000000000000000000000000001000100110000000000
0001010000000000000000000001000000000001000100110000000000
0001010000000000010000000001000000000001000100110000000000
0001010000000000010000000001000000000001000100110000001000
0001010000000000010000000001000000000001000100110000001001
0001010000000000010000000001000000000001000100110100001001
0001011000000000010000000001000000000001000100110100001001
0001011000000000010000000001000100000001000100110100001001
0001011000000000010000000001000100001001000100110100001001
0001011000000000010000000001000101001001000100110100001001
0001011000000000010000000001000101001001000100110100101001
00010110000000010100000

In [95]:
res[34]

(34,
 {'png_image': 'iVBORw0KGgoAAAANSUhEUgAACbEAAA20CAIAAABOHAVkAAAACXBIWXMAAA7EAAAOxAGVKw4bAAXNiklEQVR4nOzdiZsU5YE/8N8/gW5Oc2+SzbG7cTebTdZbjMZ4S+IVolHjlWi8o9GoMdFEDhEExQMVERBU5BZFLpVDRBAEUQ65Qe57YID5VWisKXr6qJl5e6an+Hyeenxwuvrtt96ueqv7/Xa99f/qAAAAAAAAALLr/7V2BQAAAAAAAAAqSCYKAAAAAAAAZJlMFAAAAAAAAMgymSgAAAAAAACQZTJRAAAAAAAAIMtkogAAAAAAAECWyUQBAAAAAACALJOJAgAAAAAAAFkmEwUAAAAAAACyTCYKAAAAAAAAZJlMFAAAAAAAAMgymSgAAAAAAACQZTJRAAAAAAAAIMtkogAAAAAAAECWyUQBAAAAAACALJOJAgAAAAAAAFkmEwUAAAAAAACyTCYKAAAAAAAAZJlMFAAAAAAAAMgymSgAAAAAAACQZTJRAAAAAAAAIMtkogAAAAAAAECWyUQBAAAAAACALJOJAgAAAAAAAFkmEwUAAAAAAACyTCYKAAAAAAAAZJlMFAAAAAAAAMgymSgAAAAAAACQZTJRAAAAAAAAIMtkogAAAAAAAECWyUQBAAAAAACALJOJAgAAAAAAAFkmEwUAAAAAAACyTCYKAAAAAAAAZJlMFAAAAAAAAMgymSgAAAAAAACQZTJRAAAAAAAAIMtkogAAAAAAAECWyUQBAAAAAACALJOJAgAAAAAAAFkmEwUAAAAAAACyTCYKAAAAAAAAZJlMFAAAAAAAAMgymSgAAAAAAACQZTJRAAAAAAAAIMtkogAAAAAAAECWyUQBAAAAAACALJOJAgAAAAAAAFkmEwUAAAAAAACyTCYKAAAAAAAAZJlMFAAAAAAAAMgymSgAAAAAAACQZTJRAAAAAAAAIMtkogAAAAAAAECWyUQBAAAAAAC

In [77]:

# res_one = copy.deepcopy(res[21][2]["result"])
# llm.LineItem.model_validate(res_one)
print(res[56][2]["result"])

{
  "list_of_years_in_columns": [
    2022,
    2021
  ],
  "last_year": 2022,
  "list_of_periods_in_columns": [],
  "last_period": "N/A",
  "scale_of_values_in_table": "thousands",
  "currency_of_values_in_table": "usd",
  "total_assets": 63232.0,
  "cash_and_cash_equivalents": 20399.0,
  "total_liabilities": "N/A",
  "total_equity": 52763.0,
  "total_deposits": "N/A",
  "loans_outstanding": "N/A",
  "result": true,
  "line_items": [
    {
      "name": "cash and cash equivalents",
      "value": 20399.0
    },
    {
      "name": "marketable securities",
      "value": 4973.0
    },
    {
      "name": "accounts receivable, net",
      "value": 13220.0
    },
    {
      "name": "other current assets",
      "value": 3729.0
    },
    {
      "name": "total current assets",
      "value": 42321.0
    },
    {
      "name": "investments",
      "value": 5180.0
    },
    {
      "name": "property and equipment, at cost less accumulated depreciation",
      "value": 12006.0
    },
    

In [96]:
res_by_sha1 = {}
for i, res_i in enumerate(res):
    sha1_i = res_i[1]["sha1"]
    res_by_sha1[sha1_i] = res_i

with open("../data_temp/11_bs_page_data_by_sha1_text.json", "w") as f:
    json.dump(res_by_sha1, f)

In [98]:
res_by_sha2 = {}
for k, v in res_by_sha1.items():
    el = copy.deepcopy(v[2])
    del el["line_items"]
    res_by_sha2[k] = el




res_by_sha2["980742aa08ea64d552c153bcefbd7e8243fb9efd"]



{'list_of_years_in_columns': [2022, 2021],
 'last_year': 2022,
 'list_of_periods_in_columns': ['31.12', '31.12'],
 'last_period': '31.12',
 'scale_of_values_in_table': 'thousands',
 'currency_of_values_in_table': 'EUR',
 'total_assets': 845511.0,
 'cash_and_cash_equivalents': 107482.0,
 'total_liabilities': 'N/A',
 'total_equity': 'N/A',
 'total_deposits': 'N/A',
 'loans_outstanding': 'N/A',
 'result': True}

In [100]:
df_bs_page_data = pd.DataFrame.from_dict(res_by_sha2, orient='index')
df_bs_page_data

Unnamed: 0,list_of_years_in_columns,last_year,list_of_periods_in_columns,last_period,scale_of_values_in_table,currency_of_values_in_table,total_assets,cash_and_cash_equivalents,total_liabilities,total_equity,total_deposits,loans_outstanding,result
0279901b645e568591ad95dac2c2bf939ef0c00d,"[2022, 2021]",2022,[december 31],december 31,thousands,USD,2376652.0,66232.0,1935338.0,441314.0,,,True
0981826b4b43a88920f3e01c71ae73539bab84cc,"[2022, 2021]",2022,"[december 31, december 31]",december 31,thousands,USD,34215.0,22635.0,16241.0,17974.0,,,True
0a61a353b1ea9fd9b8f63b60239634ca3007d58f,"[2022, 2021]",2022,[],,millions,AUD,7461.0,738.5,4627.0,2834.0,,,True
105688726e097505beef4934896193ac51295037,"[2022, 2021]",2022,[],,units,USD,5293478.0,1510559.0,356837.0,4936641.0,,,True
12bff07b957b1c8f8cad9d917ca18005720cce9b,"[2022, 2021]",2022,[],,thousands,,342466.0,42763.0,427253.0,-84787.0,,,True
13999998018cc53440310d94a26d1e8957e2277f,"[2022, 2021]",2022,"[december 31,, december 31,]","december 31,",millions,USD,2001.0,262.0,217.0,1784.0,,,True
14fa568899745270c4ff2c10073f97f2c2e7764b,"[2022, 2021]",2022,"[december 31, december 31]",december 31,thousands,CAD,4078.398,6.117,2227.858,1850.54,,,True
1a12ef3f11a64e92eeca39e493a17d2860c014a6,"[2022, 2021]",2022,"[december 31,]","december 31,",thousands,USD,2259879.0,33172.0,1889355.0,370524.0,1607110.0,,True
1af8f906e34af6e0acfe4f73e37093bbe34700f3,"[2022, 2021]",2022,[],,thousands,,223060.0,87550.0,33774.0,189286.0,,,True
23b2c590c4887dfb86761730dd7156fe3b216ab7,"[2022, 2021]",2022,"[december 31,, december 31,]","december 31,",thousands,USD,1745530.0,41916.0,1626581.0,118949.0,1420647.0,1110124.0,True


In [102]:
sha1 = "446545ae548543d8744f8d885ff75face3424ba4"
res_by_sha1[sha1][2]["line_items"]


[{'name': 'cash and cash equivalents', 'value': 375.0},
 {'name': 'trade and other receivables, net', 'value': 201.0},
 {'name': 'prepaid and other current assets', 'value': 84.0},
 {'name': 'total current assets', 'value': 660.0},
 {'name': 'investment in charter, accounted for using the equity method (note 6)',
  'value': 11433.0},
 {'name': 'property and equipment, net (note 2)', 'value': 1011.0},
 {'name': 'intangible assets not subject to amortization', 'value': 755.0},
 {'name': 'goodwill (note 7)', 'value': 550.0},
 {'name': 'cable certificates', 'value': 550.0},
 {'name': 'other', 'value': 37.0},
 {'name': 'intangible assets subject to amortization, net (note 7)',
  'value': 516.0},
 {'name': 'other assets, net', 'value': 180.0}]

In [92]:
sha1 = "980742aa08ea64d552c153bcefbd7e8243fb9efd"
page_number = df_bs_page_text_llm["page"][df_bs_page_text_llm["sha1"] == sha1]
page_number = page_number.values[0]
text_page = get_sha1_page_text(sha1, page_number+1)
print(text_page)



#### **Consolidated Statement of Financial Position**

| LIABILITIES (Euro/000)                                               | Notes | 31.12.2022 | 31.12.2021 |
|----------------------------------------------------------------------|-------|------------|------------|
| A)<br>Total Equity (1+2+3+4+5+6)                                     | 10    | 451,567    | 421,724    |
| 1)<br>Share capital                                                  | 10    | 30,392     | 30,392     |
| 2)<br>Reserves                                                       | 10    | 132,266    | 119,668    |
| 3)<br>Retained earnings (losses)                                     | 10    | 255,840    | 229,691    |
| 4)<br>Profit (loss) for the year                                     | 10    | 29,550     | 38,913     |
| 5)<br>Group Equity                                                   | 10    | 448,048    | 418,665    |
| Profit (loss) for the year attributable to non-controlling interests | 10    | 576   