In [22]:
import requests
import os
import time
from random import random
import pandas as pd
from datetime import datetime



class DocumentDownloader():
    def __init__(self):
        print("DocumentDownloader initialized...")

    def get_announcements(self, ticker: str, limit: int = 20):
    
        url = f"https://data-api.marketindex.com.au/api/v1/announcements?codes={ticker}%3AAUD%3AXASX&limit={limit}"

        headers = {
            "User-Agent": "Mozilla/5.0",
            # "Referer": f"https://www.marketindex.com.au/asx/{ticker.lower()}",
            "Accept": "application/json",
        }
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        return response.json()["data"]["announcements"]

    def download_pdf(self, identifier: str, filename:str, save_dir: str = "pdfs"):
        base_url = "https://data-api.marketindex.com.au/api/v1/announcements/"
        pdf_url = base_url + identifier + "/pdf"
        os.makedirs(save_dir, exist_ok=True)
        # filename = pdf_url.split("/")[-2]
        filepath = os.path.join(save_dir, filename)

        headers = {
            "Accept": "application/json",
            "Accept-Encoding": "gzip, deflate, br, zstd",
            "Accept-Language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7,zh-TW;q=0.6",
            "Origin": "https://www.marketindex.com.au",
            "Sec-Ch-Ua": '"Not(A:Brand";v="99", "Google Chrome";v="133", "Chromium";v="133"',
            "Sec-Ch-Ua-Mobile": "?0",
            "Sec-Ch-Ua-Platform": "Linux",
            "Sec-Fetch-Dest": "empty",
            "Sec-Fetch-Mode": "cors",
            "Sec-Fetch-Site": "same-site",
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
        }

        #Check if file already exists. Only executes if file does not exist. 
        if not os.path.exists(filepath):
        
            response = requests.get(pdf_url,headers=headers)
            print(response)
            if response.status_code == 200:
                with open(filepath, "wb") as f:
                    f.write(response.content)
                    print(f"Downloaded: {filename}")
            else:
                print(f"Failed to download {filename}: {response.status_code}")


    def create_announcement_df(self, ticker_list:list,limit:int = 100):
    
        #Used for creating announcements dataframe.  
        ls_announcements = []

        #To be generalized in future to extract other documents such as sustainability reports and annual reports. 
        #Extracts only investor presentations. 
        for ticker in ticker_list:
            try:
                announcements = self.get_announcements(ticker,limit)
                for announcement in announcements:

                    #Check for whether announcement type is other; investor presentations are usuallly under 'other'. 
                    announcement_type = announcement['types'][0]['title']
                    announcement['ticker'] = ticker

                    # print(ticker, announcement['heading'],announcement_type) 
                
                ls_announcements.extend(announcements)

                time.sleep(2 + random()*4)
                
            except Exception as e:
                print(f"Error processing {ticker}: {e}")

        return pd.DataFrame(ls_announcements)


    def filter_df_heading_with_regex(self, df:pd.DataFrame ,regex_pattern:str):
        return df[df['heading'].str.contains(regex_pattern,case = False,na=False)]

    def filter_df_with_date(self,df:pd.DataFrame, from_date_YYYY_MM_DD:str, to_date_YYYY_MM_DD:str):
        df["ISO_date"] = pd.to_datetime(df['dateTime'])
        return df[df["ISO_date"].between(from_date_YYYY_MM_DD, to_date_YYYY_MM_DD)]

    def filter_df_heading_with_regex_and_date(self,df:pd.DataFrame, regex_pattern:str, from_date_YYYY_MM_DD:str, to_date_YYYY_MM_DD:str):
        df_filtered = self.filter_df_heading_with_regex(df, regex_pattern)
        df_filtered = self.filter_df_with_date(df_filtered, from_date_YYYY_MM_DD, to_date_YYYY_MM_DD)
        return df_filtered

    def download_pdfs_in_df(self, df: pd.DataFrame, doc_suffix, save_dir: str):
        
        #If no doc suffix is provided, dd will use heading as the suffix. 

        try: 
            for index, row in df.iterrows():
                identifier = row['identifier']
                print("Downloading ",row['heading'], ".....")
                
                if doc_suffix is None:
                    filename = f"{row['ticker']}_{row['heading']}"
                else:
                    filename = f"{row['ticker']}_{doc_suffix}"

            
                self.download_pdf(identifier, filename, save_dir)
                time.sleep(3+random()*10)   # Sleep to avoid overwhelming the server

        except Exception as e:
            print(f"Error processing row {row}: {e}")

    # General method for downloading files. Regex needs to be specified. 
    def download_documents(self, ticker_list: list, from_date_YYYY_MM_DD:str, to_date_YYYY_MM_DD:str, save_dir: str, regex_pattern: str, doc_suffix = None ,limit: int = 100):
        """
        Downloads the investor presentations for the given ticker list.
        
        Parameters:
        - ticker_list: List of stock tickers to process.
        - limit: Number of announcements to fetch per ticker.
        - save_dir: Directory to save the downloaded PDFs.
        - regex_pattern: Regex pattern to filter the announcements.
        - from_date_YYYY_MM_DD: Start date in YYYY-MM-DD format.
        """
        df_announcements = self.create_announcement_df(ticker_list, limit)
        df_filtered = self.filter_df_heading_with_regex_and_date(df_announcements, regex_pattern, from_date_YYYY_MM_DD=from_date_YYYY_MM_DD, to_date_YYYY_MM_DD=to_date_YYYY_MM_DD)
        
        print(f"Filtered {len(df_filtered)} announcements matching the criteria.")
        
        
        self.download_pdfs_in_df(df_filtered, doc_suffix = doc_suffix, save_dir = save_dir)


    #Converts date to CY. 
    def assign_period_CY(self,date_input):
        
        if isinstance(date_input, str):
            date = datetime.strptime(date_input, "%Y-%m-%d")
        else:
            date = date_input

        if date.month in [1, 2, 3]:
            return f"CY{str(date.year - 1)[-2:]}"
        elif date.month in [7, 8, 9]:
            return f"HY{str(date.year)[-2:]}"
        else:
            return None
        
    #Converts date to FY. 
    def assign_period_FY(self,date_input):
        
        if isinstance(date_input, str):
            date = datetime.strptime(date_input, "%Y-%m-%d")
        else:
            date = date_input

        if date.month in [1, 2, 3]:
            return f"HY{str(date.year)[-2:]}"
        elif date.month in [7, 8, 9]:
            return f"FY{str(date.year)[-2:]}"
        else:
            return None

    def download_results_presentations(self, ticker_list: list, from_date_YYYY_MM_DD:str, to_date_YYYY_MM_DD:str, limit: int = 100, save_dir: str = "datasets/co_presentations", regex_pattern: str = r'results presentation|result presentation'):
        """
        Downloads the results presentations for the given ticker list.
        
        Parameters:
        - ticker_list: List of stock tickers to process.
        - limit: Number of announcements to fetch per ticker.
        - save_dir: Directory to save the downloaded PDFs.
        - regex_pattern: Regex pattern to filter the announcements.
        - from_date_YYYY_MM_DD: Start date in YYYY-MM-DD format.
        """
        df_announcements = self.create_announcement_df(ticker_list, limit)
        df_filtered = self.filter_df_heading_with_regex_and_date(df_announcements, regex_pattern, from_date_YYYY_MM_DD=from_date_YYYY_MM_DD, to_date_YYYY_MM_DD=to_date_YYYY_MM_DD)
        
        print(f"Filtered {len(df_filtered)} announcements matching the criteria.")
        
        # self.download_pdfs_in_df(df_filtered, save_dir)

        try: 
            for index, row in df_filtered.iterrows():
                identifier = row['identifier']
                print("Downloading ",row['heading'], ".....")
                
                CY = self.assign_period_FY(row["ISO_date"])

                filename = f"{row['ticker']}_{CY}_IP"
                print(filename)
                self.download_pdf(identifier, filename, save_dir)
                time.sleep(3+random()*10)   # Sleep to avoid overwhelming the server

        except Exception as e:
            print(f"Error processing row {row}: {e}")

    

# Instantiate Document Downloader

In [23]:
#Instantiate DocumentDownloader class
dd = DocumentDownloader()

DocumentDownloader initialized...


# To download all investor presentations for a set of tickers published between two dates. 

In [None]:
#Inputs
ticker_list = ["COF"]
from_date_YYYY_MM_DD = "2024-01-31"
to_date_YYYY_MM_DD = "2025-03-31"
save_dir = "datasets/test"

#download statements for tickers in ticker list.  
dd.download_results_presentations(
    ticker_list = ticker_list,
    from_date_YYYY_MM_DD=from_date_YYYY_MM_DD,
    to_date_YYYY_MM_DD=to_date_YYYY_MM_DD,
    save_dir=save_dir
    )

DocumentDownloader initialized. Use the methods to download documents and create dataframes.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["ISO_date"] = pd.to_datetime(df['dateTime'])


Filtered 3 announcements matching the criteria.
Downloading  COF HY25 Results Presentation .....
COF_HY25_IP
Downloading  COF FY24 Results Presentation .....
COF_FY24_IP
Downloading  COF HY24 Results Presentation .....
COF_HY24_IP


# Analyze what announcements are available 

In [24]:
dd.create_announcement_df(ticker_list= ticker_list, limit=100).head()

Unnamed: 0,id,identifier,fileId,pageCount,symbolId,securityName,fileSize,heading,dateTime,fileType,isPriceSensitive,fileKey,types,ticker
0,1909132,XASX:COF:2A1600573,2A1600573,1,COF:AUD:XASX,Centuria Office REIT,169606,June 2025 Distribution Declaration,2025-06-05T23:07:10.000Z,pdf,True,announcements/xasx/2025/06/06/2A1600573.pdf,"[{'id': 4, 'identifier': '4', 'title': 'Divide...",COF
1,1909129,XASX:COF:2A1600570,2A1600570,4,COF:AUD:XASX,Centuria Office REIT,13770,Dividend/Distribution - COF,2025-06-05T23:04:25.000Z,pdf,True,announcements/xasx/2025/06/06/2A1600570.pdf,"[{'id': 4, 'identifier': '4', 'title': 'Divide...",COF
2,1901199,XASX:COF:2A1596683,2A1596683,2,COF:AUD:XASX,Centuria Office REIT,231397,Q3 FY25 Operating Update,2025-05-14T23:07:04.000Z,pdf,True,announcements/xasx/2025/05/15/2A1596683.pdf,"[{'id': 2, 'identifier': '2', 'title': 'Period...",COF
3,1874219,XASX:COF:2A1583347,2A1583347,1,COF:AUD:XASX,Centuria Office REIT,169175,March 2025 Distribution Declaration,2025-03-06T22:44:26.000Z,pdf,True,announcements/xasx/2025/03/07/2A1583347.pdf,"[{'id': 4, 'identifier': '4', 'title': 'Divide...",COF
4,1874212,XASX:COF:2A1583345,2A1583345,4,COF:AUD:XASX,Centuria Office REIT,13770,Dividend/Distribution - COF,2025-03-06T22:39:21.000Z,pdf,True,announcements/xasx/2025/03/07/2A1583345.pdf,"[{'id': 4, 'identifier': '4', 'title': 'Divide...",COF


# To download specific reports available in Announcement Dataframe

In [None]:
#Inputs
regex_pattern = r'operating update'
from_date_YYYY_MM_DD="2024-01-31"
to_date_YYYY_MM_DD="2025-03-31",
save_dir="datasets/test"
limit = 100


dd.download_documents(
    ticker_list=ticker_list,
    from_date_YYYY_MM_DD=from_date_YYYY_MM_DD,
    to_date_YYYY_MM_DD=to_date_YYYY_MM_DD,
    save_dir=save_dir,
    regex_pattern=regex_pattern,
    limit=limit
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["ISO_date"] = pd.to_datetime(df['dateTime'])


Filtered 2 announcements matching the criteria.
Downloading  Q1 FY25 Operating Update .....
<Response [200]>
Downloaded: COF_Q1 FY25 Operating Update
Downloading  Q3 FY24 Operating Update .....
<Response [200]>
Downloaded: COF_Q3 FY24 Operating Update


In [None]:
tickers_prospectus = [
    "GYG", "DGT", "TEA", "MAC", "CCL", "SYL", "MRE", "WHI", "PCX", "BWN",
    # "SS1", "VFY", "GHM", "BB1", "AAL", "WAG", "D3E", "ORD", "RNV", "RAU",
    # "AXL", "MNC", "PR2", "KM1", "FNR", "I88", "MHM", "LMS", "FUL"
]