In [48]:
import cv2 as cv
from IPython.core.interactiveshell import InteractiveShell
from markdown import Markdown
from marker.config.parser import ConfigParser
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered
import numpy as np
import openai
import pandas as pd
from pydantic import BaseModel, Field
from rapidfuzz import fuzz
import tqdm

import asyncio
import csv
import io
import json
import os
import re
import textwrap
from typing import List, Optional, Tuple, Union, Literal

In [3]:
# Load the OpenAI API key from a local file kept outside the notebook.
# Strip surrounding whitespace: a trailing newline left by an editor would
# otherwise become part of the key that is sent with every request.
with open("../tokens/openai_token.txt", encoding="utf-8") as f:
    openai_token = f.read().strip()

In [4]:
# Synchronous OpenAI client, used by the cells that issue a single blocking request.
client = openai.OpenAI(
    # Pass the key explicitly: it comes from the token file, not from the client's default lookup
    api_key=openai_token,
)

In [5]:
# Asynchronous OpenAI client, used by the coroutines that query several chunks/items concurrently.
async_client = openai.AsyncOpenAI(
    # Pass the key explicitly: it comes from the token file, not from the client's default lookup
    api_key=openai_token,
)

In [49]:
# Configure Jupyter to display the value of every top-level expression in a cell,
# not only the last one
InteractiveShell.ast_node_interactivity = "all"


In [6]:
# Load the subset table and keep only the rows flagged `is_in_questions`.
df_subset = pd.read_parquet("../data_temp/04 subset.parquet")
df_subset_filtered = df_subset.loc[df_subset["is_in_questions"].eq(True)]
df_subset_filtered

Unnamed: 0,sha1,cur,company_name,major_industry,mentions_recent_mergers_and_acquisitions,has_leadership_changes,has_layoffs,has_executive_compensation,has_rnd_investment_numbers,has_new_product_launches,...,has_dividend_policy_changes,has_share_buyback_plans,has_capital_structure_changes,mentions_new_risk_factors,has_guidance_updates,has_regulatory_or_litigation_issues,has_strategic_restructuring,has_supply_chain_disruptions,has_esg_initiatives,is_in_questions
0,0279901b645e568591ad95dac2c2bf939ef0c00d,USD,ACRES Commercial Realty Corp.,Financial Services,False,False,False,True,False,False,...,False,True,False,True,True,False,False,False,True,True
1,0981826b4b43a88920f3e01c71ae73539bab84cc,USD,Aptevo Therapeutics Inc.,Healthcare,True,False,False,True,True,True,...,False,False,False,True,False,True,False,True,True,True
2,0a61a353b1ea9fd9b8f63b60239634ca3007d58f,USD,Downer EDI Limited,Transport & Logistics,True,True,True,True,False,False,...,True,True,True,True,True,True,True,False,True,True
5,105688726e097505beef4934896193ac51295037,USD,Peako Limited,Technology,False,True,False,True,True,False,...,False,False,True,True,False,False,False,False,False,True
6,12bff07b957b1c8f8cad9d917ca18005720cce9b,USD,Mosaic Brands Limited,Retail,True,False,False,True,True,False,...,True,True,True,True,True,False,True,True,True,True
7,13999998018cc53440310d94a26d1e8957e2277f,USD,"Aurora Innovation, Inc.",Technology,True,False,False,True,True,True,...,False,False,False,True,False,True,False,True,True,True
8,14fa568899745270c4ff2c10073f97f2c2e7764b,USD,Crombie REIT,Retail,True,True,False,True,False,True,...,False,False,False,True,False,False,False,False,True,True
9,1a12ef3f11a64e92eeca39e493a17d2860c014a6,USD,Medallion Financial Corp.,Financial Services,True,False,False,True,False,True,...,False,False,False,True,False,True,False,False,False,True
11,1af8f906e34af6e0acfe4f73e37093bbe34700f3,USD,BetMakers Technology Group Ltd,Technology,True,True,False,True,False,True,...,False,True,True,True,True,False,True,False,True,True
13,23b2c590c4887dfb86761730dd7156fe3b216ab7,USD,"FNCB Bancorp, Inc.",Financial Services,True,True,False,True,False,True,...,True,True,False,True,False,True,False,False,True,True


In [7]:
# File name of the first selected report: its sha1 followed by the ".pdf" extension
pdf_name_i = df_subset_filtered["sha1"].iloc[0] + ".pdf"
pdf_name_i

'0279901b645e568591ad95dac2c2bf939ef0c00d.pdf'

In [8]:
# Markdown file for this PDF; the name keeps the ".pdf" part, i.e. "<sha1>.pdf.md"
output_path = f"../data_in/temp/md/{pdf_name_i}.md"
output_path

'../data_in/temp/md/0279901b645e568591ad95dac2c2bf939ef0c00d.pdf.md'

In [9]:
# config = {
#     #"output_format": "json",
#     #"page_range": [0, 1],
#     "paginate_output": True,
# }
# config_parser = ConfigParser(config)

# converter = PdfConverter(
#     config=config_parser.generate_config_dict(),
#     artifact_dict=create_model_dict(),
# )
# rendered = converter(f"../data_in/temp/pdfs/pdfs/{pdf_name_i}")
# text, _, images = text_from_rendered(rendered)
# # Save markdown text to file
# os.makedirs(os.path.dirname(output_path), exist_ok=True)
# with open(output_path, "w", encoding="utf-8") as f:
#     f.write(text)


In [10]:
# Read the markdown text from the saved file (the conversion cell that writes it
# is currently commented out, so the file must already exist on disk)
with open(output_path, "r", encoding="utf-8") as f:
    text = f.read()


In [11]:
# Take the first 70k characters and cut back to the last complete line so the prompt
# does not end in the middle of a line. When the window contains no newline at all,
# rpartition would return an empty head, so in that case keep the raw window.
t70k = text[:70000]
if "\n" in t70k:
    t70k = t70k.rpartition('\n')[0]
t70k

'{0}------------------------------------------------\n\n![](_page_0_Picture_0.jpeg)\n\n# 2022 Annual Report\n\n{1}------------------------------------------------\n\n{2}------------------------------------------------\n\nDear Fellow Shareholder,\n\nWe are pleased to provide you an update on the progress of your investments in ACRES Commercial Realty Corp. ("ACR" or the "Company"). During the past year, our manager has continued to make progress on the core tenets of its responsibilities to manage the existing assets in the portfolio, originate new loan assets, facilitate the financing of the assets, and control costs to drive earnings and increase book value per share. We are pleased to report that substantial progress has been made on all fronts and we remain focused on these mandates going forward.\n\nThe ACRES team continued to diligently manage our assets and provide exceptional service for our borrowers. We worked proactively with these borrowers to address their needs so that the

In [12]:
# The Pydantic models are kept as source text so that the exact same definitions can be
# exec'd into the notebook namespace and pasted verbatim into the prompt below.
classes = """
class TableOfContentsEntry(BaseModel):
    id: Optional[str] = None  # Could be number or text representing numbering, or empty if none found
    name: str  # Full name of the TOC element (required)
    parent: Optional[str] = None  # Full name of parent category, or empty if none
    page: Optional[Union[int, str]] = None  # Page number for the TOC element. Usually a number, but sometimes a short string like "PG01"
class TableOfContents(BaseModel):
    table_of_contents_page: int # The page number where the table of contents starts. The page numbers are 0-based and all pages start with a line "{<page number>}------------------------------------------------"
    entries: List[TableOfContentsEntry]
"""

# exec only ever sees the literal defined just above, never external input.
exec(classes)

msg = f"""Extract the table of contents from this markdown text with the page number where it starts and return it as JSON matching the Pydantic model. The JSON output should begin with and end with @@@

---
Output format should be in JSON format matching this Pydantic model:
{classes}

---
Text:

{t70k}
---"""

chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": msg,
        }
    ],
    model="o1-mini",
)

ans = chat_completion.choices[0].message.content
# The prompt asks for the JSON to be wrapped in @@@; fail with a readable message
# (instead of an IndexError/AttributeError) when the model ignores that or returns nothing.
ans_parts = (ans or "").split("@@@")
if len(ans_parts) < 2:
    raise ValueError(f"Model response does not contain the @@@ delimiter: {ans!r}")
ans2 = ans_parts[1]
# Validation only: raises if the JSON does not match the schema; the raw dict is used below.
_ = TableOfContents.model_validate_json(ans2)
json_ans = json.loads(ans2)
json_ans

{'table_of_contents_page': 7,
 'entries': [{'id': None,
   'name': 'Forward-Looking Statements',
   'parent': None,
   'page': 3},
  {'id': 'PART I', 'name': 'PART I', 'parent': None, 'page': None},
  {'id': 'Item 1', 'name': 'Business', 'parent': 'PART I', 'page': 5},
  {'id': 'Item 1A', 'name': 'Risk Factors', 'parent': 'Business', 'page': 16},
  {'id': 'Item 1B',
   'name': 'Unresolved Staff Comments',
   'parent': 'Business',
   'page': 40},
  {'id': 'Item 2', 'name': 'Properties', 'parent': 'PART I', 'page': 40},
  {'id': 'Item 3',
   'name': 'Legal Proceedings',
   'parent': 'PART I',
   'page': 40},
  {'id': 'Item 4',
   'name': 'Mine Safety Disclosures',
   'parent': 'PART I',
   'page': 40},
  {'id': 'PART II', 'name': 'PART II', 'parent': None, 'page': None},
  {'id': 'Item 5',
   'name': "Market for Registrant's Common Equity, Related Stockholder Matters and Issuer Purchases of Equity Securities",
   'parent': 'PART II',
   'page': 41},
  {'id': 'Item 6', 'name': '[Reserved]

In [13]:
# 0-based page number (as used in the "{<page number>}---" markers) on which the
# model reported the table of contents to start
toc_page = json_ans["table_of_contents_page"]
toc_page


7

In [14]:
df_toc = pd.DataFrame(json_ans["entries"])

# The schema allows `page` to be a short string such as "PG01". Such values cannot be
# cast to Int64, so coerce to numeric first and keep only the entries that have a
# numeric page (entries without a page are dropped, as before).
page_numeric = pd.to_numeric(df_toc["page"], errors="coerce")
has_numeric_page = page_numeric.notna()
df_toc2 = df_toc[has_numeric_page].copy()

# Replace None values with pd.NA in the id, name, and parent columns
for col in ["id", "name", "parent"]:
    df_toc2[col] = df_toc2[col].replace({None: pd.NA})

# Nullable integer dtype so page numbers stay integers
df_toc2["page"] = page_numeric[has_numeric_page].astype("Int64")

df_toc2.reset_index()

Unnamed: 0,index,id,name,parent,page
0,0,,Forward-Looking Statements,,3
1,2,Item 1,Business,PART I,5
2,3,Item 1A,Risk Factors,Business,16
3,4,Item 1B,Unresolved Staff Comments,Business,40
4,5,Item 2,Properties,PART I,40
5,6,Item 3,Legal Proceedings,PART I,40
6,7,Item 4,Mine Safety Disclosures,PART I,40
7,9,Item 5,"Market for Registrant's Common Equity, Related...",PART II,41
8,10,Item 6,[Reserved],PART II,43
9,11,Item 7,Management's Discussion and Analysis of Financ...,PART II,44


In [15]:
# Find the marker of the page that FOLLOWS the table-of-contents page, i.e.
# "{<toc_page + 1>}------------------------------------------------", so the text handed
# to the model starts after the table of contents
# (assumes the table of contents fits on a single page — TODO confirm)
toc_page_pattern = f"{{{toc_page+1}}}------------------------------------------------"

# Search for the pattern in the text
toc_page_index = text.find(toc_page_pattern)
if toc_page_index == -1:
    # str.find signals "not found" with -1; slicing with -1 would silently keep
    # only the last character of the document.
    raise ValueError(f"Page marker {toc_page_pattern!r} not found in the text")

t70k_after_toc = text[toc_page_index:][:70000]
for line in t70k_after_toc.split("\n")[:10]:
    print(line)


{8}------------------------------------------------

# **FORWARD-LOOKING STATEMENTS**

*In this annual report on Form 10-K, references to "Company," "we," "us," or "our" refer to ACRES Commercial Realty Corp. and its subsidiaries; references to the Company's "Manager" refer to ACRES Capital, LLC, a subsidiary of ACRES Capital Corp., unless specifically stated otherwise or the context otherwise indicates. This report contains certain forward-looking statements. Forwardlooking statements relate to expectations, beliefs, projections, future plans and strategies, anticipated events or trends and similar expressions concerning matters that are not historical facts. In some cases, you can identify forward-looking statements by terms such as "anticipate," "believe," "could," "estimate," "expects," "intend," "may," "plan," "potential," "project," "should," "will" and "would" or the negative of these terms or other comparable terminology.* 

Forward-looking statements contained in this report a

In [16]:
# Inspect the columns available in the cleaned table of contents
df_toc2.columns

Index(['id', 'name', 'parent', 'page'], dtype='object')

In [17]:

# Create the string representation of the table of contents (CSV with all text fields quoted).
# `index` is the entry's original row label, exposed by reset_index(); the `page` column is left out.
str_toc_2 = df_toc2.reset_index()[["index", "id", "name", "parent"]].to_csv(index=False, quoting=csv.QUOTE_NONNUMERIC, lineterminator="\n")


In [18]:
# Response schema kept as source text: exec'd below and embedded verbatim in the prompt.
# Note that this rebinds the names TableOfContentsEntry and Result in the notebook namespace.
classes = """
class TableOfContentsEntry(BaseModel):
    index: int # index column from the input table of contents
    id: Optional[str]  # id column from the input table of contents
    name: str # name column from the input table of contents
    parent: Optional[str] # parent column from the input table of contents
    text_page_number: Union[int, Literal["N/A"]] = "N/A"  # Physical page number where the item is found. Every page starts with "{<page number>}------------------------------------------------"
class Result(BaseModel):
    entries: List[TableOfContentsEntry]
"""

exec(classes)

# The "not found" value named in the prompt is N/A: the schema only accepts an int or the
# literal "N/A", so asking the model for None would request an answer that fails validation.
msg = f"""This text has page numbers (let's call them text page numbers), starting from 0. They are in the format "{{<page number>}}------------------------------------------------". 
Using the table of contents below find a text page number for each line in table of contents. If not found use N/A. Output in JSON. The JSON output should begin with and end with @@@

---
Output format should be in JSON format matching this Pydantic model:
{classes}

---
Table of contents:
{str_toc_2}
---
Text:

{t70k_after_toc}
---"""

chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": msg,
        }
    ],
    model="o1-mini",
)

ans = chat_completion.choices[0].message.content
# Fail with a readable message when the answer is empty or lacks the @@@ wrapper.
ans_parts = (ans or "").split("@@@")
if len(ans_parts) < 2:
    raise ValueError(f"Model response does not contain the @@@ delimiter: {ans!r}")
ans2 = ans_parts[1]
# Validation only: raises if the JSON does not match the schema; the raw dict is used below.
_ = Result.model_validate_json(ans2)
json_ans = json.loads(ans2)
json_ans

{'entries': [{'index': 0,
   'id': '',
   'name': 'Forward-Looking Statements',
   'parent': '',
   'text_page_number': 8},
  {'index': 2,
   'id': 'Item 1',
   'name': 'Business',
   'parent': 'PART I',
   'text_page_number': 10},
  {'index': 3,
   'id': 'Item 1A',
   'name': 'Risk Factors',
   'parent': 'Business',
   'text_page_number': 21},
  {'index': 4,
   'id': 'Item 1B',
   'name': 'Unresolved Staff Comments',
   'parent': 'Business',
   'text_page_number': 'N/A'},
  {'index': 5,
   'id': 'Item 2',
   'name': 'Properties',
   'parent': 'PART I',
   'text_page_number': 'N/A'},
  {'index': 6,
   'id': 'Item 3',
   'name': 'Legal Proceedings',
   'parent': 'PART I',
   'text_page_number': 'N/A'},
  {'index': 7,
   'id': 'Item 4',
   'name': 'Mine Safety Disclosures',
   'parent': 'PART I',
   'text_page_number': 'N/A'},
  {'index': 9,
   'id': 'Item 5',
   'name': "Market for Registrant's Common Equity, Related Stockholder Matters and Issuer Purchases of Equity Securities",
   'pa

In [19]:
# Model answer as a table, one row per returned entry; `index` echoes the index column
# of the table of contents that was sent in the prompt
df_toc3 = pd.DataFrame(json_ans["entries"])
df_toc3

Unnamed: 0,index,id,name,parent,text_page_number
0,0,,Forward-Looking Statements,,8.0
1,2,Item 1,Business,PART I,10.0
2,3,Item 1A,Risk Factors,Business,21.0
3,4,Item 1B,Unresolved Staff Comments,Business,
4,5,Item 2,Properties,PART I,
5,6,Item 3,Legal Proceedings,PART I,
6,7,Item 4,Mine Safety Disclosures,PART I,
7,9,Item 5,"Market for Registrant's Common Equity, Related...",PART II,
8,10,Item 6,[Reserved],PART II,
9,11,Item 7,Management's Discussion and Analysis of Financ...,PART II,


In [20]:
# Bring the model's page numbers back onto the table of contents, matching on the
# original entry index.
map_text_page_number = df_toc3.set_index("index")["text_page_number"].to_dict()
df_toc4 = df_toc2.reset_index().assign(
    text_page_number=lambda toc: toc["index"].map(map_text_page_number)
)
df_toc4


Unnamed: 0,index,id,name,parent,page,text_page_number
0,0,,Forward-Looking Statements,,3,8.0
1,2,Item 1,Business,PART I,5,10.0
2,3,Item 1A,Risk Factors,Business,16,21.0
3,4,Item 1B,Unresolved Staff Comments,Business,40,
4,5,Item 2,Properties,PART I,40,
5,6,Item 3,Legal Proceedings,PART I,40,
6,7,Item 4,Mine Safety Disclosures,PART I,40,
7,9,Item 5,"Market for Registrant's Common Equity, Related...",PART II,41,
8,10,Item 6,[Reserved],PART II,43,
9,11,Item 7,Management's Discussion and Analysis of Financ...,PART II,44,


In [25]:
# Everything from the located page marker onwards.
text_after_toc = text[toc_page_index:]

# Sliding windows of `chunk_size` characters that advance by `chunk_size - overlap`,
# so consecutive windows share `overlap` (20k) characters.
chunk_size = 70000
overlap = 20000
page_marker_re = re.compile(r'\{(\d+)\}------------------------------------------------')

text_chunks = []
for window_start in range(0, len(text_after_toc), chunk_size - overlap):
    window = text_after_toc[window_start:window_start + chunk_size]
    first_marker = page_marker_re.search(window)
    # Drop any partial page in front of the first marker so the chunk begins on a page
    # boundary; a window without any marker is kept unchanged.
    text_chunks.append(window[first_marker.start():] if first_marker else window)

# Only the first five chunks are processed further below.
text_chunks_first5 = text_chunks[:5]


# Response schema for the per-chunk requests, kept as source text so it can be exec'd
# below and embedded verbatim in the prompt.
# NOTE(review): Result declares two near-identical error-index fields, one before and one
# after `entries` — presumably so the model states it both before and after listing the
# entries; confirm the intent.
classes = """
class TableOfContentsEntry(BaseModel):
    index: int # index column from the input table of contents
    id: Optional[str]  # id column from the input table of contents
    name: str # name column from the input table of contents
    parent: Optional[str] # parent column from the input table of contents
    text_page_number_present: bool = False  # Whether the found text_page_number is present in the text. The page numbers are in the format "{<page number>}------------------------------------------------"
    text_page_number: Union[int, Literal["N/A"]] = "N/A"  # Physical page number where the item is found. Every page starts with "{<page number>}------------------------------------------------"
class Result(BaseModel):
    page_number_error_beginning_from_this_index: int # The index of the line in the table of contents where the page number do not seem reasonable. For example if the page number is lower than the previous page number or there is a big gap of items between the page number and the previous page number and the difference in page numbers is too small.
    entries: List[TableOfContentsEntry]
    page_number_error_beginning_from_this_index2: int # The index of the line in the table of contents where the page number do not seem reasonable. For example if the page number is lower than the previous page number or there is a big gap of items between the page number and the previous page number and the difference in page numbers is too small.

"""

# Rebinds TableOfContentsEntry and Result in the notebook namespace
exec(classes)

# Prompt prefix shared by every chunk request (instructions, schema, table of contents);
# the chunk text itself is appended by process_chunk
msg0 = f"""This text has page numbers (let's call them text page numbers), starting from 0. They are in the format "{{<page number>}}------------------------------------------------". 
Take each line from the table of contents below and find the text page number where the item starts. If not found use N/A. Output in JSON. The JSON output should begin with and end with @@@

---
Output format should be in JSON format matching this Pydantic model:
{classes}

---
Table of contents:
{str_toc_2}
---"""

async def process_chunk(chunk_i: int, chunk: str, msg0: str) -> Tuple[int, Optional[str]]:
    """Ask the model to locate the table-of-contents items inside one text chunk.

    Args:
        chunk_i: Position of the chunk; returned unchanged so the caller can match the
            answer to its chunk.
        chunk: Text of the chunk, including its page markers.
        msg0: Prompt prefix (instructions, schema and table of contents).

    Returns:
        ``(chunk_i, json_text)`` where ``json_text`` is the text found between the ``@@@``
        delimiters, validated against ``Result``; ``(chunk_i, None)`` when the request
        fails or the answer cannot be parsed or validated, so the caller can retry.
    """
    # Built by plain formatting instead of textwrap.dedent(f"..."): dedent runs after the
    # chunk has been interpolated, and the chunk's unindented lines leave no common
    # indentation to strip, so the "Text:" and "---" lines kept their four leading spaces.
    msg = f"{msg0}\nText:\n---\n{chunk}\n---"
    try:
        # The request is inside the try as well: an API/network error must count as a
        # failed attempt for this chunk rather than abort the caller's whole retry loop.
        chat_completion = await async_client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": msg,
                }
            ],
            model="o1-mini",
        )
        ans = chat_completion.choices[0].message.content
        ans2 = ans.split("@@@")[1]
        _ = Result.model_validate_json(ans2)
        return chunk_i, ans2
    except Exception as exc:
        # Covers request errors, an empty answer, a missing @@@ wrapper and schema
        # violations. `Exception` (not a bare except) lets asyncio.CancelledError and
        # KeyboardInterrupt propagate.
        print(f"Chunk {chunk_i} failed with {type(exc).__name__}")
        return chunk_i, None

# Setup for async processing with retries
# tasks_done: one status character per chunk — "0" pending, "1" succeeded, "2" gave up
tasks_done = ["0"] * len(text_chunks_first5)
# tasks_tries: number of attempts started so far for each chunk
tasks_tries = {}
tasks = set()
# json_results: validated JSON text per chunk, None while pending or after giving up
json_results = [None] * len(text_chunks_first5)

# Create initial tasks
# (asyncio.create_task needs an already-running event loop — the notebook kernel's, presumably)
for i, chunk in enumerate(text_chunks_first5):
    tasks_tries[i] = 1
    tasks.add(asyncio.create_task(process_chunk(i, chunk, msg0)))

# Process tasks with retries
async def process_all_chunks():
    """Await every chunk task, re-submitting failed chunks until they succeed or hit the attempt limit.

    Reads the notebook-level ``tasks`` and updates ``tasks_tries``, ``tasks_done`` and
    ``json_results`` in place; prints the ``tasks_done`` status string once at the start
    and after every completed task. A task counts as failed when ``process_chunk``
    returns ``None`` as its result.
    """
    remaining_tasks = tasks.copy()
    print("".join(tasks_done))
    while remaining_tasks:
        # Wake up as soon as at least one task has finished
        done, remaining_tasks = await asyncio.wait(
            remaining_tasks, return_when=asyncio.FIRST_COMPLETED
        )
        for completed in done:
            i, result = await completed
            if result is None:
                # tasks_tries starts at 1, so a chunk is attempted at most 5 times in total;
                # the printed "Total retries" value is that attempt count
                if tasks_tries[i] < 5:
                    print(f"Task {i} failed. Total retries: {tasks_tries[i]}. Retrying")
                    tasks_tries[i] += 1
                    remaining_tasks.add(asyncio.create_task(process_chunk(i, text_chunks_first5[i], msg0)))
                else:
                    print(f"Task {i} failed. Total retries: {tasks_tries[i]}. Stop retrying.")
                    tasks_done[i] = "2"
                    json_results[i] = None  # None for failed chunks
            else:
                tasks_done[i] = "1"
                json_results[i] = result
            print("".join(tasks_done))

# Run the async processing
# (top-level await: relies on the notebook running cells inside an event loop)
await process_all_chunks()

# Collect all entries from all chunks
all_entries = []
# Defined by the original cell and kept so the names still exist; never updated here.
page_number_error_beginning_from_this_index = None
page_number_error_beginning_from_this_index2 = None

for i, result in enumerate(json_results):
    if result is None:
        continue  # this chunk produced no usable answer
    json_data = json.loads(result)
    # Add the chunk number and the two chunk-level error indices to each entry
    for entry in json_data["entries"]:
        entry["chunk_number"] = i
        entry["page_number_error_beginning_from_this_index"] = json_data["page_number_error_beginning_from_this_index"]
        entry["page_number_error_beginning_from_this_index2"] = json_data["page_number_error_beginning_from_this_index2"]
    all_entries.extend(json_data["entries"])

# Group entries by index to collect all values
entries_by_index = {}
for entry in all_entries:
    idx = entry["index"]
    if idx not in entries_by_index:
        entries_by_index[idx] = {
            "index": idx,
            "id": entry["id"],
            "name": entry["name"],
            "parent": entry["parent"],
            "text_page_numbers": [],
            "text_page_number_present": [],
            "page_number_error_beginning_from_this_index": [],
            "page_number_error_beginning_from_this_index2": [],
        }

    grouped = entries_by_index[idx]
    # These two fields have defaults in the schema, so validated JSON may omit them;
    # fall back to the same defaults instead of raising KeyError.
    grouped["text_page_numbers"].append(entry.get("text_page_number", "N/A"))
    grouped["text_page_number_present"].append(entry.get("text_page_number_present", False))
    # Collect the error indices too; the lists were created but previously never filled.
    grouped["page_number_error_beginning_from_this_index"].append(entry["page_number_error_beginning_from_this_index"])
    grouped["page_number_error_beginning_from_this_index2"].append(entry["page_number_error_beginning_from_this_index2"])

json_ans = {
    "entries": list(entries_by_index.values())
}

# Create dataframe and map to original table of contents
df_toc3 = pd.DataFrame(json_ans["entries"])
df_toc4 = df_toc2.reset_index()

# Map the lists of values to the dataframe
df_toc4["text_page_numbers"] = df_toc4["index"].map(
    {idx: entry["text_page_numbers"] for idx, entry in entries_by_index.items()}
)

df_toc4["text_page_number_present"] = df_toc4["index"].map(
    {idx: entry["text_page_number_present"] for idx, entry in entries_by_index.items()}
)


# For backward compatibility, keep a single text_page_number column with the first non-N/A value
# (rows whose index no chunk returned map to NaN instead of a list and become "N/A")
df_toc4["text_page_number"] = df_toc4["text_page_numbers"].apply(
    lambda x: next((val for val in x if val != "N/A"), "N/A") if isinstance(x, list) else "N/A"
)

df_toc4

00000
00100
01100
01110
11110
11111


Unnamed: 0,index,id,name,parent,page,text_page_numbers,text_page_number_present,text_page_number
0,0,,Forward-Looking Statements,,3,"[8, N/A, N/A, 38, N/A]","[True, False, False, True, False]",8
1,2,Item 1,Business,PART I,5,"[10, N/A, N/A, 38, N/A]","[True, False, False, True, False]",10
2,3,Item 1A,Risk Factors,Business,16,"[N/A, 21, N/A, N/A, N/A]","[False, True, False, False, False]",21
3,4,Item 1B,Unresolved Staff Comments,Business,40,"[N/A, N/A, N/A, 45, N/A]","[False, False, False, True, False]",45
4,5,Item 2,Properties,PART I,40,"[N/A, N/A, N/A, 46, N/A]","[False, False, False, True, False]",46
5,6,Item 3,Legal Proceedings,PART I,40,"[N/A, N/A, N/A, 46, N/A]","[False, False, False, True, False]",46
6,7,Item 4,Mine Safety Disclosures,PART I,40,"[N/A, N/A, N/A, 46, N/A]","[False, False, False, True, False]",46
7,9,Item 5,"Market for Registrant's Common Equity, Related...",PART II,41,"[N/A, N/A, N/A, 47, N/A]","[False, False, False, True, False]",47
8,10,Item 6,[Reserved],PART II,43,"[N/A, N/A, N/A, 49, 48]","[False, False, False, True, True]",49
9,11,Item 7,Management's Discussion and Analysis of Financ...,PART II,44,"[11, N/A, N/A, 49, 49]","[True, False, False, True, True]",11


In [51]:
# Helpers for reading the "{<page number>}------…" page markers out of a piece of text
def find_page_numbers(text):
    """
    Find all page markers in the format "{<page number>}------------------------------------------------"
    and return a list of page numbers.

    Args:
        text (str): The text to search for page markers

    Returns:
        list[int]: Page numbers in order of appearance (empty list if there are none)
    """
    # A marker is "{<digits>}" followed by at least 48 hyphens
    pattern = r"\{(\d+)\}-{48,}"
    return [int(match) for match in re.findall(pattern, text)]

def get_page_range(text):
    """
    Find all page markers in the text and return a tuple of (min, max) page numbers.

    Args:
        text (str): The text to search for page markers

    Returns:
        tuple: A tuple containing (min_page, max_page), or (None, None) if no pages found
    """
    # Uses find_page_numbers; the former call to `find_page_ranges` named a function that
    # is not defined anywhere in this notebook.
    page_numbers = find_page_numbers(text)

    if not page_numbers:
        return (None, None)

    return (min(page_numbers), max(page_numbers))


In [29]:
# Lowest and highest page marker number contained in every chunk
min_max_page_ranges = []
for chunk in text_chunks:
    page_ranges = find_page_numbers(chunk)
    # default=None: a chunk without any page marker gives an empty list, on which a bare
    # min()/max() raises ValueError
    min_max_page_ranges.append({
        "min": min(page_ranges, default=None),
        "max": max(page_ranges, default=None),
    })

df_chunks = pd.DataFrame(min_max_page_ranges)
df_chunks



Unnamed: 0,min,max
0,8,23
1,19,32
2,29,41
3,38,51
4,48,61
5,59,69
6,67,78
7,76,86
8,83,98
9,94,108


In [96]:
def extract_specific_pages(pages_to_extract: Optional[List[int]], text: str) -> str:
    """
    Extract only specific pages from a text document.

    Args:
        pages_to_extract (list[int]): Page numbers to extract. A missing value
            (pd.NA, None or NaN) means "nothing selected".
        text (str): The full text document with page markers like
            "{0}------------------------------------------------" on their own line

    Returns:
        str: The selected pages (each including its marker line), in document order and
        separated by a blank line. Returns pd.NA when pages_to_extract is missing, and
        the unchanged text when the text contains no page markers.
    """
    # pd.NA is a value, not a type, so it cannot be used inside the annotation; missing
    # inputs are handled here instead. None/NaN are treated like pd.NA.
    if pages_to_extract is pd.NA or pages_to_extract is None or (
        isinstance(pages_to_extract, float) and np.isnan(pages_to_extract)
    ):
        return pd.NA

    # Find all page markers and their positions with exactly 48 hyphens
    pattern = r'\{(\d+)\}-{48}\n'
    matches = list(re.finditer(pattern, text))

    if not matches:
        return text  # No page markers found, return original text

    wanted_pages = set(pages_to_extract)  # constant-time membership test per page
    extracted_text = []

    for i, match in enumerate(matches):
        # Skip pages that were not requested
        if int(match.group(1)) not in wanted_pages:
            continue

        # A page runs from its own marker up to the next marker (or the end of the text)
        end_pos = matches[i + 1].start() if i + 1 < len(matches) else len(text)
        extracted_text.append(text[match.start():end_pos].strip())

    # Join all extracted pages with newlines
    return "\n\n".join(extracted_text)

# Example usage:
# extracted_text = extract_specific_pages([1, 3, 5], text_chunks[0])


In [88]:
def expand_page_ranges(page_lists: pd.Series, min_page: int = 1, max_page: float = float('inf')) -> pd.Series:
    """
    Widen every page list by the direct neighbours (page - 1 and page + 1) of each page,
    keeping only pages inside [min_page, max_page].

    Args:
        page_lists (pd.Series): Series whose elements are lists of page numbers or pd.NA
        min_page (int): Smallest page number that may appear in the result
        max_page (int): Largest page number that may appear in the result

    Returns:
        pd.Series: Series of sorted, duplicate-free page lists; elements that are missing
        or not list-like are passed through unchanged
    """

    def _with_neighbours(pages):
        # Scalar missing values (pd.NA, None, NaN) and any other non-list-like value
        # are returned untouched.
        is_missing = pages is pd.NA or pages is None or (isinstance(pages, float) and np.isnan(pages))
        if is_missing or not isinstance(pages, (list, tuple, np.ndarray)):
            return pages

        # The set removes duplicates produced by overlapping neighbourhoods
        in_bounds = {
            candidate
            for page in pages
            for candidate in (page, page - 1, page + 1)
            if min_page <= candidate <= max_page
        }
        return sorted(in_bounds)

    return page_lists.apply(_with_neighbours)

In [76]:
def find_closest_pages(page_lists: pd.Series, page_model: pd.Series, n_closest: int = 3) -> pd.Series:
    """
    For each list of pages in page_lists, find the n_closest pages to the corresponding
    value in page_model.

    Args:
        page_lists (pd.Series): Series containing lists of page numbers or pd.NA
        page_model (pd.Series): Series containing target page numbers to compare against;
            paired with page_lists by position
        n_closest (int): Number of closest pages to keep (default: 3)

    Returns:
        pd.Series: Series (indexed like page_lists) containing, in ascending order, only
        the n_closest pages to the model. Lists with at most n_closest pages, missing or
        non-list values, and lists whose model page is missing are returned unchanged.
    """

    def _is_missing(value):
        # Scalar missing values: pd.NA, None or NaN
        return value is pd.NA or value is None or (isinstance(value, float) and np.isnan(value))

    def find_closest_in_list(pages, model_page):
        # Missing or not list-like: nothing to select from
        if _is_missing(pages) or not isinstance(pages, (list, tuple, np.ndarray)):
            return pages

        # Short (or empty) lists are kept whole
        if len(pages) <= n_closest:
            return pages

        # Without a model page there is no distance to rank by (a pd.NA distance would
        # make the sort raise), so keep every candidate
        if _is_missing(model_page):
            return pages

        # Stable sort by distance: ties keep their original relative order
        closest_pages = sorted(pages, key=lambda page: abs(page - model_page))[:n_closest]

        # Return the selected pages in ascending page order
        return sorted(closest_pages)

    # Apply the function to each element in the series
    return pd.Series([
        find_closest_in_list(pages, model)
        for pages, model in zip(page_lists, page_model)
    ], index=page_lists.index)

In [98]:
# Clean up text_page_numbers by removing 'N/A' values
# Work on a copy so the columns added below do not modify df_toc4
df_toc5 = df_toc4.copy()  # Create a copy to avoid SettingWithCopyWarning

# Function to filter out 'N/A' values from a list
def remove_na_values(page_list):
    """Return page_list without 'N/A' entries and without duplicates, keeping first-seen order.

    Anything that is not a list is returned unchanged.
    """
    if not isinstance(page_list, list):
        return page_list
    # dict keys keep insertion order, which gives an order-preserving de-duplication
    return list(dict.fromkeys(page for page in page_list if page != 'N/A'))

# Apply the function to the text_page_numbers column
df_toc5['text_page_numbers2'] = df_toc5['text_page_numbers'].apply(remove_na_values)

# Offset between the page numbers printed in the table of contents and the marker
# numbers in the text. Taken from the first entry that has a located page: the first
# row may be "N/A", and subtracting an int from "N/A" raises TypeError.
located_rows = df_toc5[df_toc5["text_page_number"] != "N/A"]
if located_rows.empty:
    raise ValueError("No table-of-contents entry has a located text page number")
first_page_difference = located_rows['text_page_number'].iloc[0] - located_rows["page"].iloc[0]

# Page on which each entry is expected if that offset holds for the whole document
df_toc5["text_page_number_model"] = df_toc5["page"] + first_page_difference
min_page , _ = get_page_range(text_chunks_first5[0])
_ , max_page = get_page_range(text_chunks_first5[-1])
# Blank out the candidates of entries expected within the last 10 processed pages or beyond
df_toc5.loc[df_toc5["text_page_number_model"] > (max_page - 10), "text_page_numbers2"] = pd.NA
df_toc5["text_page_numbers2"] = find_closest_pages(df_toc5["text_page_numbers2"], df_toc5["text_page_number_model"])
df_toc5["text_page_numbers2"] = expand_page_ranges(df_toc5["text_page_numbers2"], min_page, max_page)
df_toc5['extracted_text'] = df_toc5['text_page_numbers2'].apply(lambda x: extract_specific_pages(x, text_after_toc))


# Display the cleaned dataframe
df_toc5



Unnamed: 0,index,id,name,parent,page,text_page_numbers,text_page_number_present,text_page_number,text_page_numbers2,text_page_number_model,extracted_text
0,0,,Forward-Looking Statements,,3,"[8, N/A, N/A, 38, N/A]","[True, False, False, True, False]",8,"[8, 9, 37, 38, 39]",8,{8}-------------------------------------------...
1,2,Item 1,Business,PART I,5,"[10, N/A, N/A, 38, N/A]","[True, False, False, True, False]",10,"[9, 10, 11, 37, 38, 39]",10,{9}-------------------------------------------...
2,3,Item 1A,Risk Factors,Business,16,"[N/A, 21, N/A, N/A, N/A]","[False, True, False, False, False]",21,"[20, 21, 22]",21,{20}------------------------------------------...
3,4,Item 1B,Unresolved Staff Comments,Business,40,"[N/A, N/A, N/A, 45, N/A]","[False, False, False, True, False]",45,"[44, 45, 46]",45,{44}------------------------------------------...
4,5,Item 2,Properties,PART I,40,"[N/A, N/A, N/A, 46, N/A]","[False, False, False, True, False]",46,"[45, 46, 47]",45,{45}------------------------------------------...
5,6,Item 3,Legal Proceedings,PART I,40,"[N/A, N/A, N/A, 46, N/A]","[False, False, False, True, False]",46,"[45, 46, 47]",45,{45}------------------------------------------...
6,7,Item 4,Mine Safety Disclosures,PART I,40,"[N/A, N/A, N/A, 46, N/A]","[False, False, False, True, False]",46,"[45, 46, 47]",45,{45}------------------------------------------...
7,9,Item 5,"Market for Registrant's Common Equity, Related...",PART II,41,"[N/A, N/A, N/A, 47, N/A]","[False, False, False, True, False]",47,"[46, 47, 48]",46,{46}------------------------------------------...
8,10,Item 6,[Reserved],PART II,43,"[N/A, N/A, N/A, 49, 48]","[False, False, False, True, True]",49,"[47, 48, 49, 50]",48,{47}------------------------------------------...
9,11,Item 7,Management's Discussion and Analysis of Financ...,PART II,44,"[11, N/A, N/A, 49, 49]","[True, False, False, True, True]",11,"[10, 11, 12, 48, 49, 50]",49,{10}------------------------------------------...


In [103]:
# Response schema for the per-item requests, kept as source text so it can be exec'd
# below and embedded verbatim in the prompt.
# NOTE(review): this rebinds the name `Result`, which process_chunk also looks up when it
# runs — re-running the chunk cell after this one would validate against this schema.
classes = """
class Result(BaseModel):
    result: int # The page number that best corresponds to the start of the item.
"""

exec(classes)

async def process_item(index: int, item_name: str, item_id: str, text_pages: list, extracted_text: str) -> Tuple[int, Optional[int]]:
    """Ask the model which candidate page a table-of-contents item starts on.

    Args:
        index: Row label of the item; returned unchanged so the caller can
            match the answer to its row.
        item_name: Title of the item, interpolated into the prompt.
        item_id: Identifier of the item, interpolated into the prompt.
        text_pages: Candidate page numbers; an answer that is not one of
            them is rejected.
        extracted_text: Text handed to the model for analysis.

    Returns:
        ``(index, page_number)`` when the reply parses against ``Result`` and
        names one of ``text_pages``; ``(index, None)`` otherwise.

    Errors raised by the API call itself are not handled here and propagate
    to whoever awaits the coroutine.
    """
    msg = f"""I need to find the most relevant page number for a table of contents item.

Item: {item_name} (ID: {item_id})
Available pages: {text_pages}

Please analyze the following text and determine which page number best corresponds to the start of this item.
Return only the single most relevant page number as an integer in a JSON format. The JSON output should begin with and end with @@@

---
Output format should be in JSON format matching this Pydantic model:
{classes}

---
Text:
---
{extracted_text}
---"""

    chat_completion = await async_client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": msg,
            }
        ],
        model="o1-mini",
    )
    ans = chat_completion.choices[0].message.content
    try:
        # The prompt asks for the JSON to be wrapped in @@@ markers, so the
        # payload is whatever follows the first marker.
        ans2 = ans.split("@@@")[1]
        res = Result.model_validate_json(ans2)
        page_number = res.result
        if page_number in text_pages:
            return index, page_number
        # A page outside the candidate list is treated like an unusable reply.
        return index, None
    except Exception:
        # Empty content, missing markers or JSON that does not fit the schema
        # all count as "no answer". Catching Exception rather than using a
        # bare except lets KeyboardInterrupt / SystemExit pass through.
        return index, None

# Start from an empty result column
df_toc5['best_page'] = pd.NA

# Schedule one process_item call for every row that has both candidate pages
# and extracted text; all other rows are left unscheduled.
tasks = []
for idx, row in df_toc5.iterrows():
    if row["text_page_numbers2"] is pd.NA or pd.isna(row["extracted_text"]):
        continue
    tasks.append(asyncio.create_task(process_item(
        index=idx,
        item_name=row['name'],
        item_id=str(row['id']),
        text_pages=row['text_page_numbers2'],
        extracted_text=row['extracted_text'],
    )))

# Process tasks with retries
async def process_all_items(max_retries: int = 5):
    """Await every task in ``tasks`` and re-submit items that got no usable answer.

    After each completed task a progress line is printed with one character
    per original task: "0" still pending, "1" answered, "2" given up.

    Args:
        max_retries: How many times a single item may be re-submitted after
            ``process_item`` returned ``None`` as its page.

    Returns:
        dict mapping row label -> page number, with ``None`` for items that
        still had no answer after ``max_retries`` re-submissions. Items whose
        task raised are reported on stdout and are absent from the dict.
    """
    results = {}
    remaining_tasks = set(tasks)

    # Progress slot of every in-flight task. Retry tasks are not members of
    # ``tasks``, so they are registered here when they are created; looking
    # them up with tasks.index() would raise ValueError and lose their result.
    slot_of = {task: slot for slot, task in enumerate(tasks)}

    tasks_done = ["0"] * len(tasks)
    tasks_tries = [0] * len(tasks)
    print("".join(tasks_done))

    while remaining_tasks:
        done, remaining_tasks = await asyncio.wait(
            remaining_tasks, return_when=asyncio.FIRST_COMPLETED
        )

        for completed in done:
            i = slot_of.pop(completed)
            try:
                idx, page = await completed

                if page is not None:
                    results[idx] = page
                    tasks_done[i] = "1"
                elif tasks_tries[i] < max_retries:
                    print(f"Task {i} failed. Total retries: {tasks_tries[i]}. Retrying")
                    tasks_tries[i] += 1
                    retry_task = asyncio.create_task(
                        process_item(
                            idx,
                            df_toc5.loc[idx, 'name'],
                            str(df_toc5.loc[idx, 'id']),
                            df_toc5.loc[idx, 'text_page_numbers2'],
                            df_toc5.loc[idx, 'extracted_text']
                        )
                    )
                    # The retry inherits the slot (and retry counter) of the item.
                    slot_of[retry_task] = i
                    remaining_tasks.add(retry_task)
                else:
                    print(f"Task {i} failed. Total retries: {tasks_tries[i]}. Stop retrying.")
                    tasks_done[i] = "2"
                    results[idx] = None  # None for failed items
            except Exception as e:
                print(f"Error processing task: {e}")
                # Nothing more will be submitted for this slot, so show it as
                # given up instead of leaving it at "0" forever.
                tasks_done[i] = "2"

            print("".join(tasks_done))

    return results

# Run the async processing (top-level await, as supported in IPython cells)
results = await process_all_items()

# Update the dataframe with the results
# Values are written one cell at a time; rows that have no entry in `results`
# keep whatever 'best_page' already held.
for idx, page in results.items():
    df_toc5.at[idx, 'best_page'] = page

# Show the updated frame as the cell output
df_toc5

0000000000
1000000000
1000001000
1000001100
1000101100
1100101100
1100101101
1100101111
1100111111
1101111111
1111111111


Unnamed: 0,index,id,name,parent,page,text_page_numbers,text_page_number_present,text_page_number,text_page_numbers2,text_page_number_model,extracted_text,best_page
0,0,,Forward-Looking Statements,,3,"[8, N/A, N/A, 38, N/A]","[True, False, False, True, False]",8,"[8, 9, 37, 38, 39]",8,{8}-------------------------------------------...,8.0
1,2,Item 1,Business,PART I,5,"[10, N/A, N/A, 38, N/A]","[True, False, False, True, False]",10,"[9, 10, 11, 37, 38, 39]",10,{9}-------------------------------------------...,10.0
2,3,Item 1A,Risk Factors,Business,16,"[N/A, 21, N/A, N/A, N/A]","[False, True, False, False, False]",21,"[20, 21, 22]",21,{20}------------------------------------------...,21.0
3,4,Item 1B,Unresolved Staff Comments,Business,40,"[N/A, N/A, N/A, 45, N/A]","[False, False, False, True, False]",45,"[44, 45, 46]",45,{44}------------------------------------------...,45.0
4,5,Item 2,Properties,PART I,40,"[N/A, N/A, N/A, 46, N/A]","[False, False, False, True, False]",46,"[45, 46, 47]",45,{45}------------------------------------------...,45.0
5,6,Item 3,Legal Proceedings,PART I,40,"[N/A, N/A, N/A, 46, N/A]","[False, False, False, True, False]",46,"[45, 46, 47]",45,{45}------------------------------------------...,45.0
6,7,Item 4,Mine Safety Disclosures,PART I,40,"[N/A, N/A, N/A, 46, N/A]","[False, False, False, True, False]",46,"[45, 46, 47]",45,{45}------------------------------------------...,45.0
7,9,Item 5,"Market for Registrant's Common Equity, Related...",PART II,41,"[N/A, N/A, N/A, 47, N/A]","[False, False, False, True, False]",47,"[46, 47, 48]",46,{46}------------------------------------------...,46.0
8,10,Item 6,[Reserved],PART II,43,"[N/A, N/A, N/A, 49, 48]","[False, False, False, True, True]",49,"[47, 48, 49, 50]",48,{47}------------------------------------------...,48.0
9,11,Item 7,Management's Discussion and Analysis of Financ...,PART II,44,"[11, N/A, N/A, 49, 49]","[True, False, False, True, True]",11,"[10, 11, 12, 48, 49, 50]",49,{10}------------------------------------------...,49.0


In [108]:
# Work on a copy so df_toc5 keeps its current set of columns
df_toc6 = df_toc5.copy()

# Per-row offset between the model-selected page and the page listed in the table of contents
df_toc6["best_page_diff"] = df_toc6["best_page"].sub(df_toc6["page"])

# A single offset for the whole document: the mean per-row offset, rounded to a whole number
best_page_diff_mean = int(np.round(df_toc6["best_page_diff"].mean()))

# Listed page shifted by that common offset
df_toc6["best_page_last_model"] = df_toc6["page"].add(best_page_diff_mean)
df_toc6


Unnamed: 0,index,id,name,parent,page,text_page_numbers,text_page_number_present,text_page_number,text_page_numbers2,text_page_number_model,extracted_text,best_page,best_page_diff,best_page_last_model
0,0,,Forward-Looking Statements,,3,"[8, N/A, N/A, 38, N/A]","[True, False, False, True, False]",8,"[8, 9, 37, 38, 39]",8,{8}-------------------------------------------...,8.0,5.0,8
1,2,Item 1,Business,PART I,5,"[10, N/A, N/A, 38, N/A]","[True, False, False, True, False]",10,"[9, 10, 11, 37, 38, 39]",10,{9}-------------------------------------------...,10.0,5.0,10
2,3,Item 1A,Risk Factors,Business,16,"[N/A, 21, N/A, N/A, N/A]","[False, True, False, False, False]",21,"[20, 21, 22]",21,{20}------------------------------------------...,21.0,5.0,21
3,4,Item 1B,Unresolved Staff Comments,Business,40,"[N/A, N/A, N/A, 45, N/A]","[False, False, False, True, False]",45,"[44, 45, 46]",45,{44}------------------------------------------...,45.0,5.0,45
4,5,Item 2,Properties,PART I,40,"[N/A, N/A, N/A, 46, N/A]","[False, False, False, True, False]",46,"[45, 46, 47]",45,{45}------------------------------------------...,45.0,5.0,45
5,6,Item 3,Legal Proceedings,PART I,40,"[N/A, N/A, N/A, 46, N/A]","[False, False, False, True, False]",46,"[45, 46, 47]",45,{45}------------------------------------------...,45.0,5.0,45
6,7,Item 4,Mine Safety Disclosures,PART I,40,"[N/A, N/A, N/A, 46, N/A]","[False, False, False, True, False]",46,"[45, 46, 47]",45,{45}------------------------------------------...,45.0,5.0,45
7,9,Item 5,"Market for Registrant's Common Equity, Related...",PART II,41,"[N/A, N/A, N/A, 47, N/A]","[False, False, False, True, False]",47,"[46, 47, 48]",46,{46}------------------------------------------...,46.0,5.0,46
8,10,Item 6,[Reserved],PART II,43,"[N/A, N/A, N/A, 49, 48]","[False, False, False, True, True]",49,"[47, 48, 49, 50]",48,{47}------------------------------------------...,48.0,5.0,48
9,11,Item 7,Management's Discussion and Analysis of Financ...,PART II,44,"[11, N/A, N/A, 49, 49]","[True, False, False, True, True]",11,"[10, 11, 12, 48, 49, 50]",49,{10}------------------------------------------...,49.0,5.0,49


In [109]:
# Only the columns that describe the table-of-contents hierarchy
df_toc6.loc[:, ["index", "id", "name", "parent"]]

Unnamed: 0,index,id,name,parent
0,0,,Forward-Looking Statements,
1,2,Item 1,Business,PART I
2,3,Item 1A,Risk Factors,Business
3,4,Item 1B,Unresolved Staff Comments,Business
4,5,Item 2,Properties,PART I
5,6,Item 3,Legal Proceedings,PART I
6,7,Item 4,Mine Safety Disclosures,PART I
7,9,Item 5,"Market for Registrant's Common Equity, Related...",PART II
8,10,Item 6,[Reserved],PART II
9,11,Item 7,Management's Discussion and Analysis of Financ...,PART II


In [110]:
# Source of the response schema, kept as a string so the same text can be
# embedded in the prompt below.
classes = """
class Result(BaseModel):
    item_index: int # The index of the item
    item_id: str # The ID of the item
    item_name: str # The name of the item
    confidence: float # A number between 0 and 1 indicating your confidence
    reason: str # A brief explanation of why you chose this item
"""

# Defines Result in the notebook namespace, replacing any earlier Result.
exec(classes)

msg = f"""Given the following table of contents from an annual report, identify the item that most likely contains the Financial Statements (Balance Sheet, Income Statement/Profit and Loss, and Cash Flow Statement). The JSON output should begin with and end with @@@

---
Output format should be in JSON format matching this Pydantic model:
{classes}

---
Table of Contents:
{df_toc6[['index', 'id', 'name', 'parent']].to_string()}
---

Financial statements are typically found in sections like "Financial Statements", "Consolidated Financial Statements", or in the Management's Discussion and Analysis section.
"""

chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": msg,
        }
    ],
    model="gpt-4o-mini",
)

ans = chat_completion.choices[0].message.content

# The JSON is expected between @@@ markers. Fail with the raw reply in the
# message rather than with a bare IndexError / AttributeError.
ans_parts = ans.split("@@@") if ans else []
if len(ans_parts) < 2:
    raise ValueError(f"Model reply has no @@@-delimited JSON: {ans!r}")
ans2 = ans_parts[1]

# Validate against the schema; the plain dict is what gets displayed and kept.
_ = Result.model_validate_json(ans2)
json_ans = json.loads(ans2)

# The model is free to name a section that is not among the rows it was
# shown, so check the answer against df_toc6 before anything relies on it.
if json_ans["item_index"] not in set(df_toc6["index"]):
    print(f"Warning: item_index {json_ans['item_index']} ({json_ans['item_id']}) is not in the table of contents that was sent to the model.")
json_ans


{'item_index': 13,
 'item_id': 'Item 8',
 'item_name': 'Financial Statements and Supplementary Data',
 'confidence': 0.95,
 'reason': "This item is explicitly titled 'Financial Statements and Supplementary Data', which strongly indicates that it contains the Balance Sheet, Income Statement/Profit and Loss, and Cash Flow Statement."}

In [None]:
# Source of the response schema, kept as a string so the same text can be
# embedded in the prompt below.
classes = """
class Result(BaseModel):
    item_index: int # The index of the item
    item_id: str # The ID of the item
    item_name: str # The name of the item
    confidence: float # A number between 0 and 1 indicating your confidence
    reason: str # A brief explanation of why you chose this item
"""

# Defines Result in the notebook namespace, replacing any earlier Result.
exec(classes)

msg = f"""Given the following table of contents from an annual report, identify the item that most likely contains the Financial Statements (Balance Sheet, Income Statement/Profit and Loss, and Cash Flow Statement). The JSON output should begin with and end with @@@

---
Output format should be in JSON format matching this Pydantic model:
{classes}

---
Table of Contents:
{df_toc6[['index', 'id', 'name', 'parent']].to_string()}
---

Financial statements are typically found in sections like "Financial Statements", "Consolidated Financial Statements", or in the Management's Discussion and Analysis section.
"""

# Same request as the gpt-4o-mini cell, sent to the larger model.
chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": msg,
        }
    ],
    model="gpt-4o",
)

ans = chat_completion.choices[0].message.content

# The JSON is expected between @@@ markers. Fail with the raw reply in the
# message rather than with a bare IndexError / AttributeError.
ans_parts = ans.split("@@@") if ans else []
if len(ans_parts) < 2:
    raise ValueError(f"Model reply has no @@@-delimited JSON: {ans!r}")
ans2 = ans_parts[1]

# Validate against the schema; the plain dict is what gets displayed and kept.
_ = Result.model_validate_json(ans2)
json_ans = json.loads(ans2)

# The model is free to name a section that is not among the rows it was
# shown, so check the answer against df_toc6 before anything relies on it.
if json_ans["item_index"] not in set(df_toc6["index"]):
    print(f"Warning: item_index {json_ans['item_index']} ({json_ans['item_id']}) is not in the table of contents that was sent to the model.")
json_ans


{'item_index': 13,
 'item_id': 'Item 8',
 'item_name': 'Financial Statements and Supplementary Data',
 'confidence': 0.95,
 'reason': "This item is explicitly titled 'Financial Statements and Supplementary Data', which strongly indicates that it contains the Balance Sheet, Income Statement/Profit and Loss, and Cash Flow Statement."}