In [3]:
import cv2 as cv
from IPython.core.interactiveshell import InteractiveShell
import pandas as pd
import tqdm.notebook
import matplotlib.pyplot as plt

import re
import importlib
import json

In [4]:
# Configure Jupyter to display the value of every top-level expression in a cell,
# not only the last one. Cells below rely on this to show intermediate values
# (e.g. a bare `output_path` in the middle of a cell).
InteractiveShell.ast_node_interactivity = "all"

In [5]:
# Load the document subset and keep only the rows whose "is_in_questions" flag is True.
df_subset = pd.read_parquet("../data_temp/04 subset.parquet")
df_subset_filtered = df_subset.loc[df_subset["is_in_questions"] == True]

# Build the path of the markdown export that belongs to the first document of the subset.
pdf_name_i = df_subset_filtered["sha1"].iloc[0] + ".pdf"
output_path = "../data_in/temp/md/" + pdf_name_i + ".md"
output_path

# Read the markdown text from the saved file
with open(output_path, mode="r", encoding="utf-8") as f:
    text = f.read()

'../data_in/temp/md/0279901b645e568591ad95dac2c2bf939ef0c00d.pdf.md'

In [6]:
# Balance-sheet vocabulary: one lower-case phrase per line. The phrases are passed to
# count_phrases_by_page(), which matches them case-insensitively as plain substrings.
# NOTE(review): "totalassets" and the last entry ("trade receivables and contract
# assetsother financial assets") look like text-extraction artefacts (missing space /
# two phrases fused together) - confirm they are intentional before cleaning them up.
words_one_string = """net assets
equity
liabilities held for sale
plant and equipment
preferred stock
accrued interest payable
current tax liabilities
current portion of long-term debt
principal paydowns receivable
assets
other current liabilities
total liabilities and equity
operating lease liability
accounts payable and other liabilities
non-controlling interest
exploration and evaluation assets
other non-current assets
liability related to the sale of royalties
total current liabilities
restricted cash
issued capital
less: allowance for credit losses
inventories
deferred tax liabilities
loan receivable
contract liabilities
right of use assets
prepayments and other assets
retained earnings
liabilities and stockholders' equity
operating lease right-of-use asset
income tax receivable
other non-current liabilities
other assets
employee benefits provision
distributions in excess of earnings
deferred tax assets
short-term
total equity
lease liabilities
properties held for sale
derivative financial instruments
accumulated other comprehensive loss
prepayments
other current assets
investments in unconsolidated entities
non-controlling interests
lease receivables
borrowings
common stock
total liabilities
current tax assets
non-current assets
total stockholders' equity
cash and cash equivalents
investments in real estate
other financial assets
right-of-use assets
equity accounted investments
motor vehicles
total current assets
accrued interest receivable
reserves
deferred consideration
intangible assets
liabilities
stockholders' equity
long-term
distributions payable
current assets
other receivables
total non-current liabilities
financial liabilities
accounts payable and other accrued liabilities
trade payables and contract liabilities
non-current liabilities
accumulated losses
additional paid-in capital
other provisions
provisions
accrued compensation
current liabilities
trade receivables and contract assets
trade and other receivables
prepaid expenses
other financial liabilities
long-term debt
royalty and milestone receivable
assets held for sale
cre loans
parent interests
accrued tax liability
total non-current assets
total assets
totalassets
property and equipment
property, plant and equipment
trade and other payables
management fee payable
trade receivables and contract assetsother financial assets"""
# Turn the multi-line literal into a list with one phrase per element.
balance_sheet_words = words_one_string.split("\n")
# Sanity check: report how many phrases were loaded.
print(f"len(balance_sheet_words): {len(balance_sheet_words)}")

len(balance_sheet_words): 100


In [7]:
# Function to get text of a specific page
def get_page_text(text, page_number):
    """
    Extract text for a specified page number from the full document text.

    Pages are delimited by markers of the form ``{N}`` followed by 48 hyphens.
    The returned text runs from the end of the marker of ``page_number`` up to the
    next page marker of *any* page number, or to the end of the document when no
    further marker exists.

    Args:
        text (str): The full document text
        page_number (int): The page number to extract

    Returns:
        str: The extracted text for the specified page. If the marker of
            ``page_number`` is absent, the message
            "Page <page_number> is not found in the document." is returned instead.
    """
    page_marker = f"{{{page_number}}}------------------------------------------------"
    marker_pos = text.find(page_marker)
    if marker_pos == -1:
        return f"Page {page_number} is not found in the document."

    page_start = marker_pos + len(page_marker)

    # Stop at the next marker whatever its number is. Looking only for
    # `page_number + 1` would return the whole rest of the document whenever
    # that particular page number is missing from the export.
    next_marker = re.compile(r"\{\d+\}-{48,}").search(text, page_start)
    page_end = next_marker.start() if next_marker else len(text)
    return text[page_start:page_end]

def count_phrases_by_page(text, phrases):
    """
    Count case-insensitive occurrences of each phrase on every page of the document.

    Pages are delimited by markers of the form ``{N}`` followed by at least 48 hyphens.
    Text before the first marker is ignored. Before matching, ``<br>`` tags and
    newlines inside a page are replaced by single spaces so that phrases broken
    across lines are still found. Phrases are matched as plain substrings.

    Args:
        text (str): The markdown text of the document
        phrases (iterable of str): Phrases to look for

    Returns:
        dict: ``{page_number: {phrase: count}}``. If a page number occurs more than
            once, the last occurrence wins.
    """
    # The marker matches: "{" followed by one or more digits, then "}" and at least 48 hyphens.
    page_marker = re.compile(r"\{(\d+)\}-{48,}")

    # Compile each phrase once instead of once per page. Materialising the list
    # also means `phrases` may be a one-shot iterable such as a generator.
    phrase_patterns = [
        (phrase, re.compile(re.escape(phrase), flags=re.IGNORECASE))
        for phrase in phrases
    ]

    # With the capturing group, split() returns
    # [text_before_first_marker, page_no, page_text, page_no, page_text, ...].
    parts = page_marker.split(text)

    page_counts = {}
    for page_num_str, raw_page_text in zip(parts[1::2], parts[2::2]):
        page_text = raw_page_text.replace("<br>", " ").replace("\n", " ")
        page_counts[int(page_num_str)] = {
            phrase: len(pattern.findall(page_text))
            for phrase, pattern in phrase_patterns
        }

    return page_counts

def count_numbers_on_page(text, filter_pages=None):
    """
    Count the number of numeric values on each page of the document.

    Pages are delimited by markers of the form ``{N}`` followed by at least 48 hyphens.
    Only the text *after* a marker and before the next marker (or the end of the
    document) is scanned, so the page number inside the marker itself is not counted.

    Args:
        text (str): The markdown text of the document
        filter_pages (list, optional): List of specific page numbers to analyze. If None, all pages are analyzed.

    Returns:
        dict: Dictionary mapping page numbers to the count of numbers found on that page.
            If a page number occurs more than once, the last occurrence wins.
    """
    markers = list(re.finditer(r"\{(\d+)\}-{48,}", text))

    # Pattern to match numbers in various formats:
    # - Regular integers: 123, 456
    # - Numbers with commas: 1,234,567
    # - Numbers with dots as decimal separators: 123.45
    # - Numbers with spaces as thousand separators: 1 234 567
    number_pattern = re.compile(
        r'(?<!\w)(?:(?:\d{1,3}(?:,\d{3})+|\d{1,3}(?:\s\d{3})+|\d+(?:\.\d+)?))(?!\w)'
    )

    page_number_counts = {}
    for idx, marker in enumerate(markers):
        page_num = int(marker.group(1))

        # Skip pages not in filter_pages if filter is provided
        if filter_pages is not None and page_num not in filter_pages:
            continue

        # Start after the marker: starting at marker.start() would count the
        # "{N}" page number as a number found on the page.
        content_start = marker.end()
        content_end = markers[idx + 1].start() if idx + 1 < len(markers) else len(text)

        page_number_counts[page_num] = len(
            number_pattern.findall(text[content_start:content_end])
        )

    return page_number_counts

In [8]:
# Spot check: sha1 of the second document in the filtered subset.
df_subset_filtered["sha1"].iloc[1]

'0981826b4b43a88920f3e01c71ae73539bab84cc'

In [9]:
# Look up the full hash (and row label) of the document whose sha1 starts with "8f5e29".
df_subset_filtered.loc[df_subset_filtered["sha1"].str.startswith("8f5e29"), "sha1"]

52    8f5e29eea4f4a3e944707c71148439ca1fd4b2d8
Name: sha1, dtype: object

In [10]:
# Tunable parameters of the candidate-page heuristic.
TOTAL_ASSETS_BONUS = 10  # extra score for pages that mention "total assets"
MAX_PAGES_FOR_LLM = 10   # how many candidate pages are sent to the LLM per document


def interleave_unique(first, second):
    """Alternate the items of two lists (first[0], second[0], first[1], ...) and
    drop repeats, keeping the first occurrence of every item."""
    merged = []
    for idx in range(max(len(first), len(second))):
        if idx < len(first):
            merged.append(first[idx])
        if idx < len(second):
            merged.append(second[idx])
    # dict.fromkeys keeps insertion order, so this is an order-preserving de-duplication.
    return list(dict.fromkeys(merged))


def build_text_for_llm(text, phrases, max_pages=MAX_PAGES_FOR_LLM):
    """
    Select the pages most likely to contain the balance sheet and join their text.

    Every page is scored twice: by the number of distinct phrases found on it, and by
    that number plus TOTAL_ASSETS_BONUS when the page mentions "total assets". The two
    rankings are interleaved, de-duplicated, and the first `max_pages` pages are kept.

    Args:
        text (str): The markdown text of one document, including page markers
        phrases (list of str): Vocabulary to score pages with; must contain "total assets"
        max_pages (int): Maximum number of pages to include

    Returns:
        str: For every selected page, its page marker line followed by the page text.

    Raises:
        ValueError: If the document contains no page markers.
    """
    phrase_counts = count_phrases_by_page(text, phrases)
    if not phrase_counts:
        # Without this guard the scoring below fails with an opaque KeyError: 'total assets'.
        raise ValueError("The document contains no page markers, so no page can be selected.")

    # 1 where a phrase occurs on a page at least once, 0 otherwise (rows: pages, columns: phrases).
    phrase_present = (pd.DataFrame.from_dict(phrase_counts, orient="index") > 0) * 1

    distinct_hits = phrase_present.sum(axis=1)
    boosted_hits = distinct_hits + phrase_present["total assets"] * TOTAL_ASSETS_BONUS

    pages_by_hits = distinct_hits.sort_values(ascending=False).index.tolist()
    pages_by_boosted_hits = boosted_hits.sort_values(ascending=False).index.tolist()
    candidate_pages = interleave_unique(pages_by_hits, pages_by_boosted_hits)[:max_pages]

    chunks = []
    for page in candidate_pages:
        # Re-emit the page marker so the LLM can tell which page each chunk came from.
        chunks.append(f"{{{page}}}------------------------------------------------" + "\n")
        chunks.append(get_page_text(text, page) + "\n")
    return "".join(chunks)


# One LLM input per document, in the row order of df_subset_filtered.
texts_for_llm = []
for sha1_i in tqdm.notebook.tqdm(df_subset_filtered["sha1"]):
    output_path = f"../data_in/temp/EnterpriseRAG_2025_02_markdown/{sha1_i}/{sha1_i}.md"
    with open(output_path, "r", encoding="utf-8") as f:
        text = f.read()
    texts_for_llm.append(build_text_for_llm(text, balance_sheet_words))


  0%|          | 0/60 [00:00<?, ?it/s]

In [11]:
# Bind the project's LLM helpers, then reload the module so that edits made to
# utils/llm.py since the kernel started are picked up.
import utils.llm as llm
importlib.reload(llm)

<module 'utils.llm' from 'c:\\Users\\keivf\\Documents\\20250224 RAG Challenge TTA 2025-02\\notebooks_all\\utils\\llm.py'>

In [12]:
in_params = [{'text': x} for x in texts_for_llm]
res = await llm.run_tasks_with_retries(llm.find_balance_sheet_page, in_params, retries=20)
res


Sending for one question started
000000000000000000000000000000000000000010000000000000000000
000100000000000000000000000000000000000010000000000000000000
000100000000000000000000000000000000000010000000000000000100
000101000000000000000000000000000000000010000000000000000100
000101000000000000000001000000000000000010000000000000000100
000101000000000000000001000001000000000010000000000000000100
000101000000000001000001000001000000000010000000000000000100
000101000000100001000001000001000000000010000000000000000100
000101000000100101000001000001000000000010000000000000000100
000101000000100101001001000001000000000010000000000000000100
000101000010100101001001000001000000000010000000000000000100
000101000010100101001001000001000000100010000000000000000100
000101000010100101001001000001000000100010000000000010000100
000101000010110101001001000001000000100010000000000010000100
000101000010110101001001000001000000100010000100000010000100
0001011000101101010010010000010000001000100001000000

[(0,
  {'text': '{91}------------------------------------------------\n\n\n# **ACRES COMMERCIAL REALTY CORP. AND SUBSIDIARIES CONSOLIDATED BALANCE SHEETS (in thousands, except share and per share data)**\n\n|                                                                                         |                 | December 31 |           |\n|-----------------------------------------------------------------------------------------|-----------------|-------------|-----------|\n|                                                                                         | 2022            |             | 2021      |\n| (1)<br>ASSETS                                                                           |                 |             |           |\n| Cash and cash equivalents                                                               | \\$<br>66,232    | \\$          | 35,500    |\n| Restricted cash                                                                         | 38,579        

In [13]:
# Map every document hash to its raw LLM task result.
# NOTE(review): results are matched to documents by position, i.e. res[i] is assumed to
# belong to the i-th sha1 - confirm llm.run_tasks_with_retries preserves input order.
res_by_sha1 = {
    sha1_i: res[i]
    for i, sha1_i in enumerate(df_subset_filtered["sha1"])
}

with open("../data_temp/10_bs_page_by_sha1.json", "w", encoding="utf-8") as f:
    json.dump(res_by_sha1, f)

# For every document keep the page chosen by the LLM together with that page's data.
balance_sheet_page_llm_text = {}
for sha1_i, task_result in res_by_sha1.items():
    llm_answer = task_result[2]  # third element holds the "result" and "pages" entries
    best_page = llm_answer["result"]

    if best_page == "N/A":
        balance_sheet_page_llm_text[sha1_i] = {"page_number": "N/A", "page_data": "N/A"}
        continue

    page_data = next(
        (page for page in llm_answer["pages"] if page["page_number"] == best_page),
        None,
    )
    if page_data is None:
        # Same exception type as the former `[...][0]` lookup, but with enough
        # context to identify the offending document.
        raise IndexError(
            f"LLM selected page {best_page!r} for document {sha1_i}, "
            "but that page is not among the pages of the result."
        )
    balance_sheet_page_llm_text[sha1_i] = {"page_number": best_page, "page_data": page_data}

with open("../data_temp/10_bs_page_llm_text.json", "w", encoding="utf-8") as f:
    json.dump(balance_sheet_page_llm_text, f)

# Compact sha1 -> page table.
balance_sheet_page_llm_text_only_page = {
    sha1_i: entry["page_number"]
    for sha1_i, entry in balance_sheet_page_llm_text.items()
}
df_bs_page_text_llm = (
    pd.Series(balance_sheet_page_llm_text_only_page, name="page")
    .reset_index()
    .rename(columns={"index": "sha1"})
)
# NOTE(review): the row index is written as an unnamed first column, and "N/A" is read
# back as NaN by pd.read_csv with default settings - confirm downstream readers expect both.
df_bs_page_text_llm.to_csv("../data_temp/10_bs_page_text_llm.csv")
df_bs_page_text_llm






Unnamed: 0,sha1,page
0,0279901b645e568591ad95dac2c2bf939ef0c00d,91.0
1,0981826b4b43a88920f3e01c71ae73539bab84cc,79.0
2,0a61a353b1ea9fd9b8f63b60239634ca3007d58f,61.0
3,105688726e097505beef4934896193ac51295037,32.0
4,12bff07b957b1c8f8cad9d917ca18005720cce9b,53.0
5,13999998018cc53440310d94a26d1e8957e2277f,61.0
6,14fa568899745270c4ff2c10073f97f2c2e7764b,93.0
7,1a12ef3f11a64e92eeca39e493a17d2860c014a6,69.0
8,1af8f906e34af6e0acfe4f73e37093bbe34700f3,29.0
9,23b2c590c4887dfb86761730dd7156fe3b216ab7,66.0
