In [20]:
#Checks if GPU is available
import torch
print(torch.cuda.is_available())           # True if GPU is available
print(torch.cuda.device_count())           # Number of GPUs
print(torch.cuda.get_device_name(0)) 

True
1
NVIDIA GeForce RTX 4070 Laptop GPU


In [21]:
#DoclingLoader is similar to DocumentConverter, but is better integrated with the langchain ecosystem.
from langchain_docling import DoclingLoader
from langchain_docling.loader import ExportType
from langchain.output_parsers import ResponseSchema, StructuredOutputParser


from langchain_openai import OpenAIEmbeddings, OpenAI
from langchain.vectorstores import FAISS
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain.chains import create_retrieval_chain

from pydantic import BaseModel, Field


from docling.document_converter import DocumentConverter
from docling.chunking import HybridChunker

import os
from dotenv import load_dotenv
import pandas as pd



In [22]:
def create_response_schema(extraction_key, extracted_field_name, extracted_FY, units = "millions", data_type="float"):
    return ResponseSchema(name = f"{extraction_key}", description = f"{extracted_FY} {extracted_field_name} in {units}", type = data_type)

def create_extraction_key(extracted_field_name):
    field_name_split = extracted_field_name.split(" ")
    extraction_key = "_".join(field_name_split).lower()
    return extraction_key

def run_chain_with_retries(chain, input_dict,  max_retries = 5):
    for attempt in range(1,max_retries + 1):
        try:
            return chain.invoke(input_dict)
        except Exception as e:
            print(f"Attempt {attempt} failed. Retrying...")
            if attempt == max_retries:
                raise

In [23]:
load_dotenv()

MODEL = "gpt-4o-mini"
openai_api_key = os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

llm = OpenAI(api_key=openai_api_key, model=MODEL, temperature=0.0, max_tokens=512)

In [24]:
#Set up dataframe schema. 

bs_cols = ['ticker', 'FP','total_assets', 'investment_properties', 'total_debt',
    'total_liabilities', 'net_assets', 'nta_per_unit',
    ]

is_cols = ['ticker',
    'FP',
    'total_revenue',
    'direct_property_expense',
    'responsible_entity_fees',
    'funds_from_operations',
    'statutory_net_profit',
    ]



In [28]:
#DoclingConverter is standalone/not integrated with the langchain ecosystem. Use this for direct document parsing and export. 

file_names = os.listdir(os.path.join("datasets","co_presentations"))

# file_names = ['INA_HY25_IP.pdf']

df_bs = pd.DataFrame(columns=bs_cols)
df_is = pd.DataFrame(columns=is_cols)

#Iterate over list of all file names. 
for file_name in file_names:

    ticker = file_name.split("_")[0]
    FP = file_name.split("_")[1]

    source = os.path.join("datasets","co_presentations", file_name)  # document per local path or URL
    converter = DocumentConverter()
    result = converter.convert(source)

    md_text = result.document.export_to_markdown()

    with open(f"{ticker}_{FP}_presentations.md", "w") as f:
        f.write(md_text)

    ls_bs = []
    ls_is = []

    for idx, table in enumerate(result.document.tables):
        try:
            print(f"Table {idx}:")
            # if idx == 12:

            df: pd.DataFrame = table.export_to_dataframe()
            print(df)
            text = " ".join(df.columns.to_list() + df.iloc[:,0].astype(str).tolist())
            print(text)

            is_bs = False
            is_is = False

            if all(word in text for word in ["assets","liabilities"]):
                is_bs = True
            
            if is_bs:
                ls_bs.append(df)

            if all(word in text for word in ["income","expense"]):
                is_is = True

            if is_is:
                ls_is.append(df)

        
        except:
            print("Error")

    df_str_bs = ls_bs[0].to_string(index=False) if len(ls_bs) > 0 else ""
    df_str_is = ls_is[0].to_string(index=False) if len(ls_is) > 0 else ""

    #Prepare data from chain invoke with retries

    #Set up fields to iterate through
    ls_bs_fields = ["Total assets", "Investment Properties","total debt", "total liabilities","net assets", "nta per unit"]
    ls_is_fields = ["Total Revenue", "Direct Property Expense", "Responsible Entity Fees", "Funds From Operations", "Statutory Net Profit" ]

    #Set up response schemas to use. 
    response_schemas_bs = [create_response_schema(create_extraction_key(field_name),field_name,"HY25") for field_name in ls_bs_fields]
    response_schemas_is = [create_response_schema(create_extraction_key(field_name),field_name,"HY25") for field_name in ls_is_fields]

    #Set up output parser to be used. Output parser will be set up using response schemas. Format instructions will be injected into prompt. 
    bs_output_parser = StructuredOutputParser.from_response_schemas(response_schemas_bs)
    is_output_parser = StructuredOutputParser.from_response_schemas(response_schemas_is)

    #Retrieve format instructions from parser. 
    bs_format_instruction = bs_output_parser.get_format_instructions()
    is_format_instruction = is_output_parser.get_format_instructions()


    # Set up template to be used. 
    system_prompt = """
    You are a meticulous assistant that can answer questions about the content of the COF HY25 Results Presentation document. Return the answer in JSON format. Do not hallucinate.
    """

    #Template inclusive of format instructions extracted from output parser. 

    template = PromptTemplate.from_template(
        "{system_prompt}\n"
        "{format_instructions}\n"
        "Context information is below.\n---------------------\n{context}\n---------------------\n"
        "Given the context information and not prior knowledge, answer the query.\n"
        "Query: {input}",   
    )

    #Set up QA chain using template, llm and output_parser
    bs_chain = template | llm | bs_output_parser
    is_chain = template | llm | is_output_parser

    #set up dictionary to be be passed into the chain.
    input_dict_bs = {'input': "Extract designated balance sheet items","system_prompt": system_prompt,"context": df_str_bs,"format_instructions":bs_format_instruction}
    input_dict_is = {'input': "Extract designated income statement items","system_prompt": system_prompt,"context": df_str_is,"format_instructions":is_format_instruction}

    bs_items_extracted = {}
    is_items_extracted = {}

    #Extract information from table
    bs_items_extracted = run_chain_with_retries(bs_chain,input_dict_bs)
    is_items_extracted = run_chain_with_retries(is_chain,input_dict_is)

    

    #If execution succeeds: 
    try:
        if bs_items_extracted:

            bs_items_extracted["ticker"] = ticker
            bs_items_extracted["FP"] = FP 

            is_items_extracted["ticker"] = ticker
            is_items_extracted["FP"] = FP 

            ls_all_is_dict = []
            ls_all_bs_dict = []

            ls_all_bs_dict.append(bs_items_extracted)
            ls_all_is_dict.append(is_items_extracted)

            print("ls_all_is_dict", ls_all_is_dict)

            try:
                df_is = pd.concat([df_is, pd.DataFrame(ls_all_is_dict)], ignore_index=True)
                df_bs = pd.concat([df_bs, pd.DataFrame(ls_all_bs_dict)], ignore_index=True) 
            except:
                print("error appending to dataframe")
    except Exception as e:
        print(f"Error during chain execution: {e}")

    

    

Table 0:
                                         Revenue        HY25   HY24 Variance
0                          Gross property income   $m   84.5   93.4    (8.9)
1                                Interest income   $m    0.4    0.3      0.1
2                                  Total revenue   $m   84.9   93.7    (8.8)
3                                       Expenses                            
4                       Direct property expenses   $m   21.1   21.8      0.7
5                        Responsible entity fees   $m    5.4    6.3      0.9
6                                  Finance costs   $m   22.9   23.0      0.1
7   Management and other administrative expenses   $m    0.9    0.7    (0.2)
8                                 Total expenses   $m   50.2   51.9      1.7
9                Funds from operations (FFO) 1,2   $m   34.7   41.8    (7.1)
10               Weighted average units on issue    m  597.3  597.3      0.0
11               Funds from operation per unit 1  cpu    5.8    7.0

  df_is = pd.concat([df_is, pd.DataFrame(ls_all_is_dict)], ignore_index=True)
  df_bs = pd.concat([df_bs, pd.DataFrame(ls_all_bs_dict)], ignore_index=True)


Table 0:
                              HY25     HY24   Change
0    Statutory loss ($m) 2  (28.6)  (271.4)    89.5%
1     Statutory loss (cps)  (1.09)   (10.4)    89.5%
2  Operating profit ($m) 2    55.1     83.7  (34.4%)
3   Operating profit (cps)    2.10     3.20  (34.4%)
4       Distributions ($m)    39.2     41.3   (5.1%)
5      Distributions (cps)    1.50     1.58   (5.1%)
6       Payout ratio 3 (%)   106.4     62.6    43.8%
 HY25 HY24 Change Statutory loss ($m) 2 Statutory loss (cps) Operating profit ($m) 2 Operating profit (cps) Distributions ($m) Distributions (cps) Payout ratio 3 (%)
Table 1:
                                         HY25 ($'M) HY24 ($'M ) Change (%)
0   Australian Investment Portfolio EBIT       78.0        78.0       0.0%
1             Funds and Asset Management                                  
2                         Australia EBIT        4.0         3.9       2.6%
3                            Europe EBIT        3.9         7.9    (50.6%)
4            Tota

  df_is = pd.concat([df_is, pd.DataFrame(ls_all_is_dict)], ignore_index=True)


Table 0:
                   Key metrics     1H FY24     1H FY25  Movement
0           Operating earnings      $78.6m      $73.1m    (7.0%)
1  Operating earnings per unit  13.5 cents  12.6 cents    (7.0%)
2       Distributions per unit  12.3 cents  12.3 cents         -
3                 NTA per unit       $4.54       $4.57     +0.7%
4    Weighted average cap rate       5.77%       5.82%  +0.05bps
Key metrics 1H FY24 1H FY25 Movement Operating earnings Operating earnings per unit Distributions per unit NTA per unit Weighted average cap rate
Table 1:
                    0         1
0  Shopping Centres 1  2.0-3.0%
1    Net Lease Retail       CPI
2                 HPI      3.6%
Error
Table 2:
                                                   $m 1H FY24 1H FY25 %change
0   Net property income from shopping centre retai...    82.0    84.1    2.5%
1     Net property income from net lease retail - LFL    24.0    25.1    4.5%
2             Net property income - assets transacted    15.5     7.8

  df_is = pd.concat([df_is, pd.DataFrame(ls_all_is_dict)], ignore_index=True)


Table 0:
         Key metrics ($ million)      HY25      FY24  \
0          Established portfolio  $2,409.4  $2,350.9   
1          Acquisition portfolio    $174.9    $164.5   
2          Stabilising portfolio    $219.0    $194.7   
3              Development sites    $275.1    $218.3   
4             Total store assets  $3,078.4  $2,928.4   
5       Goodwill and intangibles     $73.2     $72.6   
6      Cash and cash equivalents     $82.1     $89.0   
7                   Other assets     $33.9    $141.6   
8                   Total assets  $3,267.6  $3,231.6   
9   Interest bearing liabilities  $1,004.7    $990.2   
10          Distribution payable     $40.7     $39.4   
11             Other liabilities     $79.8     $84.9   
12             Total liabilities  $1,125.2  $1,114.5   
13                    Net assets  $2,142.4  $2,117.1   
14              Total securities  1,314.1m  1,314.1m   

                                             Comments  
0   103 stores valued at $4,363/m² (FY

In [29]:
df_bs

Unnamed: 0,ticker,FP,total_assets,investment_properties,total_debt,total_liabilities,net_assets,nta_per_unit
0,COF,HY25,1946.986,1917.951,845.333,922.437,1024.549,1.72
1,CMW,HY25,2865.9,2113.0,1290.4,1378.6,1487.3,0.57
2,CQR,HY25,4181.0,4048.0,1375.0,1562.0,2619.0,4.51
3,VCX,HY25,16419.5,15108.3,4764.6,5558.3,10861.2,2.35
4,HCW,HY25,1353.6,964.9,438.3,484.7,868.9,1.58
5,CLW,HY25,5252.5,2773.4,1803.2,1882.7,3369.8,4.66
6,SGP,HY25,1500.0,1200.0,800.0,900.0,600.0,3.0
7,CIP,HY25,1000.0,800.0,500.0,600.0,400.0,2.0
8,ASK,HY25,3267.6,3078.4,1004.7,1125.2,2142.4,1.63
9,ABG,HY25,2587.1,1817.7,943.1,1023.4,1563.7,1.75


In [30]:
df_is

Unnamed: 0,ticker,FP,total_revenue,direct_property_expense,responsible_entity_fees,funds_from_operations,statutory_net_profit
0,COF,HY25,84.9,21.1,5.4,34.7,
1,CMW,HY25,104.1,18.6,28.9,55.1,55.1
2,CQR,HY25,117.0,11.1,0.0,73.1,108.6
3,VCX,HY25,463.5,27.5,42.8,344.1,492.6
4,HCW,HY25,100.0,30.0,5.0,65.0,40.0
5,CLW,HY25,151.3,15.1,0.0,89.8,0.0
6,SGP,HY25,100.0,50.0,10.0,30.0,5.0
7,CIP,HY25,128.9,30.3,11.6,56.6,
8,ASK,HY25,113.5,42.1,10.3,43.3,3.9
9,ABG,HY25,82.0,17.2,14.1,40.2,40.2
