# Belfius Alytics (Part 2)
Inspiration:

- https://github.com/pinecone-io/examples/blob/master/learn/generation/langchain/handbook/08-langchain-retrieval-agent.ipynb

- https://www.youtube.com/watch?v=RIWbalZ7sTo

- https://colab.research.google.com/drive/13FpBqmhYa5Ex4smVhivfEhk2k4S5skwG?usp=sharing#scrollTo=RSdomqrHNCUY

- https://github.com/openai/openai-cookbook/blob/main/examples/How_to_format_inputs_to_ChatGPT_models.ipynb

### Handle imports:

In [2]:
# Move to root directory
import os

notebooks_dir = 'notebooks'


def move_to_project_root(marker: str = notebooks_dir) -> str:
    """
    Change the working directory to the parent of the nearest ``marker`` directory.

    The current directory and its ancestors are inspected from the innermost
    outwards. The first one whose name equals ``marker`` is treated as the
    notebooks folder and its parent becomes the working directory. When no
    such directory exists the working directory is left untouched.

    Args:
        marker (str): Name of the notebooks directory to look for.

    Returns:
        str: The directory that was switched to, or the unchanged starting
            directory when ``marker`` is not part of the path.
    """
    start = os.path.abspath(os.curdir)
    probe = start
    while True:
        parent, leaf = os.path.split(probe)
        # Compare whole path components: a plain substring test also matches
        # names such as "notebooks_old" and then never finds a stopping point.
        if os.path.normcase(leaf) == os.path.normcase(marker):
            os.chdir(parent)
            return parent
        if parent == probe:
            # Filesystem root reached without a match; stop instead of looping.
            return start
        probe = parent


move_to_project_root(notebooks_dir)

print(os.path.abspath(os.curdir))

C:\Users\MD726YR\PycharmProjects\eyalytics


In [3]:
# Suppress SSL verification (EY problem):
# SECURITY NOTE: this disables TLS certificate verification for every request
# sent through `requests` in this process, so connections are no longer
# protected against man-in-the-middle attacks. Keep it confined to environments
# where the corporate proxy makes it unavoidable.
import requests

from requests.packages.urllib3.exceptions import InsecureRequestWarning

# Suppress the warning from urllib3.
requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning)

# Patch only once: re-running this cell would otherwise wrap the already
# patched method again and add one more wrapper layer per run.
if not getattr(requests.Session.send, "_ssl_verify_disabled", False):
    old_send = requests.Session.send

    def new_send(*args, **kwargs):
        """Call the original ``Session.send`` with ``verify`` forced to False."""
        kwargs['verify'] = False
        return old_send(*args, **kwargs)

    # Marker read by the guard above on later runs of this cell.
    new_send._ssl_verify_disabled = True
    requests.Session.send = new_send

In [4]:
# Import relevant libraries for langchain retrieval:
import openai
import tiktoken

from langchain import OpenAI,  LLMChain, PromptTemplate
from langchain.prompts import StringPromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS  # facebook ai similarity search 
from langchain.chains import LLMMathChain
from langchain.tools import BaseTool
from langchain.agents import (
    AgentExecutor, LLMSingleActionAgent, AgentOutputParser, 
    AgentType, initialize_agent, Tool
)
from langchain.callbacks import get_openai_callback
from langchain.schema import AgentAction, AgentFinish

**In case you want to use Chroma instead of FAISS:**

`from langchain.vectorstores import Chroma`
    
Note: to use Chroma you will have to install chromadb, which requires Microsoft Visual C++ 14.0 or greater. To install it:

a. Install Microsoft C++ Build Tools: Visit the link provided in the error message (https://visualstudio.microsoft.com/visual-cpp-build-tools/) and install the Microsoft C++ Build Tools.

b. Ensure the Correct Version: Ensure that you have the required version (14.0 or greater) of the C++ build tools installed.

c. Add to PATH: Ensure the tools are added to your system PATH. Usually, the installer should take care of this. But if the problem persists, you might need to verify and add them manually.

d. Restart Your System: Sometimes, after installing such tools, a system restart might be required for the environment variables (like PATH) to update correctly.

**Checks:**
Check if Visual C++ Build Tools is Installed:
- Press Windows + I to open the Settings app.
- Go to "Apps".
- Now in the "Apps & features" tab, search for "Visual Studio".
- Check if there's an installation called "Microsoft Visual Studio" (it might also be "Visual Studio Build Tools").

Check for the Required Components:
- If you find "Microsoft Visual Studio" or "Visual Studio Build Tools" in the list, click on it and then select "Modify".
- This will bring up the Visual Studio Installer.
- Here, ensure that the "Desktop development with C++" workload is checked. Specifically, make sure "MSVC v142 - VS 2019 C++ x64/x86 build tools" (or a similar option) is selected. This provides the C++ compiler that's needed.

In [5]:
# libraries for URL pdf loading
import time
import docx
import pyautogui
from docx.oxml.table import CT_Tbl
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [29]:
# Other libraries:
import re
import pickle 
import difflib
import math

# for progress bars in loops
from uuid import uuid4
from tqdm.auto import tqdm
from typing import List, Any, Union, Optional

In [15]:
# Get API and ENV keys:
from dotenv import load_dotenv

# Pull variables from a local .env file into the process environment, then
# hand the OpenAI key to the client library. A missing or empty key is fatal.
load_dotenv()
_openai_api_key = os.environ.get("OPENAI_API_KEY")
if not _openai_api_key:
    raise KeyError(
        "You will need an OPENAI_API_KEY to use the LLM models in this notebook."
    )
openai.api_key = _openai_api_key

## Commence Langchain Retrieval Augmentation Tool Development:

In [60]:
# URL -> file name convertor:
def url2fname(url):
    """Return the last path segment of ``url`` without its first dot and anything after it."""
    # Only the text after the final '/' is relevant.
    tail = url.rsplit('/', 1)[-1]

    # Cut at the first dot so multi-part suffixes (".tar.gz") disappear as a whole.
    return re.sub(r'\..*$', '', tail)
    
    
# Create URL loader:
def wait_for_file(file_path: str, timeout: int = 60, poll_interval: float = 1.0) -> bool:
    """
    Wait for a file to be present at a specified path within a given timeout.

    The path is always checked at least once, so a file that already exists is
    reported even when ``timeout`` is zero or negative.

    Args:
        file_path (str): Path to the file.
        timeout (int): Maximum waiting time in seconds. Default is 60 seconds.
        poll_interval (float): Seconds to sleep between checks. Default is 1 second.

    Returns:
        bool: True if file is found within the timeout, False otherwise.
    """
    # Monotonic clock: a duration must not be affected by system clock changes.
    deadline = time.monotonic() + timeout

    while True:
        if os.path.exists(file_path):
            return True

        remaining = deadline - time.monotonic()
        if remaining <= 0:
            return False

        # Never sleep past the deadline (and never pass a negative value).
        time.sleep(max(0.0, min(poll_interval, remaining)))


def download_pdf_from_url(url: str, save_path: str, timeout: float = 60.0) -> Optional[str]:
    """
    Download a PDF from the specified URL and save it to a local path.

    Args:
        url (str): URL of the PDF.
        save_path (str): Local path to save the downloaded PDF.
        timeout (float): Seconds to wait for the server before giving up.
            Without it a stalled connection would block indefinitely.

    Returns:
        Optional[str]: Path to the saved PDF if successful, None otherwise
            (any response status other than 200).

    Raises:
        requests.RequestException: On connection problems or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return None

    with open(save_path, 'wb') as file:
        file.write(response.content)

    return save_path if os.path.exists(save_path) else None


def convert_pdf_to_docx(
    pdf_filename: str, driver_path: str, pdf_folder_path: str, docx_folder_path: str
) -> Optional[str]:
    """
    Convert a PDF to a DOCX using Adobe's online tool.

    A Firefox session is driven through Selenium; the native file-selection
    dialog is filled in by typing the path with pyautogui, so the browser
    window must keep keyboard focus while this runs.

    Args:
        pdf_filename (str): Filename of the PDF.
        driver_path (str): Path to the geckodriver executable.
        pdf_folder_path (str): Directory where the PDF is located.
        docx_folder_path (str): Directory where the converted DOCX should be saved.

    Returns:
        Optional[str]: Path to the converted DOCX if successful, None otherwise.
    """
    # Validate the input before a browser is started, so a wrong path neither
    # costs a browser launch nor leaves a Firefox window behind.
    full_pdf_path = os.path.join(pdf_folder_path, pdf_filename)
    if not os.path.exists(full_pdf_path):
        print(f"File path\n{full_pdf_path}\nis not valid.")
        return None

    # WebDriver setup and configurations
    firefox_options = webdriver.FirefoxOptions()
    firefox_options.set_preference("browser.download.folderList", 2)
    firefox_options.set_preference("browser.download.dir", docx_folder_path)
    firefox_options.set_preference("browser.download.useDownloadDir", True)
    firefox_options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/vnd.openxmlformats-officedocument.wordprocessingml.document")

    service = Service(driver_path)
    driver = webdriver.Firefox(service=service, options=firefox_options)
    try:
        wait = WebDriverWait(driver, 180)
        driver.get("https://www.adobe.com/be_en/acrobat/online/pdf-to-word.html")

        # Upload the PDF
        upload_btn = wait.until(EC.element_to_be_clickable((By.ID, "lifecycle-nativebutton")))
        upload_btn.click()

        # Wait for the file selection dialog and input the file path using pyautogui
        time.sleep(5)
        pyautogui.typewrite(full_pdf_path)

        # Add a slight delay and then press 'enter' multiple times
        time.sleep(2)
        for _ in range(3):
            pyautogui.press('enter')
            time.sleep(0.1)
        time.sleep(10)

        # Handle cookies and start conversion
        try:
            cookie_reject_btn = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#onetrust-reject-all-handler")))
            if cookie_reject_btn:
                cookie_reject_btn.click()
        except Exception:
            # Best effort: the banner is not always shown.
            print("Cookie settings notification not found or failed to click.")

        download_btn = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'button.Download__downloadButton___2qFEa')))
        download_btn.click()
        time.sleep(10)
    finally:
        # Close the browser on every path, including Selenium timeouts.
        driver.quit()

    # Swap only the real extension; str.replace would also touch ".pdf" inside
    # the name and would miss an upper-case ".PDF".
    expected_docx_filename = os.path.splitext(pdf_filename)[0] + '.docx'
    expected_docx_filepath = os.path.join(docx_folder_path, expected_docx_filename)

    return expected_docx_filepath if wait_for_file(expected_docx_filepath, 50) else None


def convert_url_pdf_to_docx(
    pdf_url: str,
    driver_path: str = "./drivers/geckodriver.exe",
    pdf_folder_path: Optional[str] = None,
    docx_folder_path: Optional[str] = None
) -> Optional[str]:
    """
    Download a PDF from a URL, convert it to DOCX, and save it locally.

    Args:
        pdf_url (str): URL of the PDF.
        driver_path (str): Path to the geckodriver executable. Default is './drivers/geckodriver.exe'.
        pdf_folder_path (str): Directory to save the downloaded PDF. Default is '../data/pdf_db'
            relative to the current working directory.
        docx_folder_path (str): Directory to save the converted DOCX. Default is '../data/docx_db'
            relative to the current working directory.

    Returns:
        Optional[str]: Path to the converted DOCX if successful, None otherwise.
    """
    from urllib.parse import urlsplit  # local import keeps this block self-contained

    # Take the name from the URL *path* only, so a query string or fragment
    # ("report.pdf?download=1") cannot end up in the local file name.
    pdf_filename = urlsplit(pdf_url).path.split('/')[-1]
    if not pdf_filename:
        print(f"Could not derive a file name from URL\n{pdf_url}")
        return None

    cwd = os.getcwd()
    pdf_folder_path = pdf_folder_path or os.path.join(cwd, "..", "data", "pdf_db")
    docx_folder_path = docx_folder_path or os.path.join(cwd, "..", "data", "docx_db")

    os.makedirs(pdf_folder_path, exist_ok=True)
    os.makedirs(docx_folder_path, exist_ok=True)

    pdf_save_path = os.path.join(pdf_folder_path, pdf_filename)

    if download_pdf_from_url(pdf_url, pdf_save_path):
        return convert_pdf_to_docx(pdf_filename, driver_path, pdf_folder_path, docx_folder_path)
    return None


def read_docx(file_path: str) -> dict:
    """
    Extract content from a DOCX file.

    Body paragraphs are split heuristically into running text and "potential
    figure data" (short, number-heavy lines). Consecutive figure lines form one
    group whose title is the paragraph that preceded the group. Every table is
    read with its first row as the column headers.

    Args:
        file_path (str): Path to the DOCX file.

    Returns:
        dict: Extracted content with the keys
            'metadata' (title, author, created),
            'text' (list of paragraph strings),
            'tables' (list of {'headers': [...], 'rows': [{header: cell_text}]}),
            'potential_figures' (list of {'title': str or None, 'data': [...]}).
    """

    # NOTE: an earlier classifier based on a list of regex patterns
    # (percentages, decimals, ranges, "HEADER<tab>number") was abandoned because
    # the rules were too hard to get right; the ratio heuristic below replaced it.
    def is_potential_figure_data(text):
        """Return True when ``text`` is short and dominated by digits, '.', '%' or '-'."""
        figure_related_chars = set("0123456789.%-")
        text_count = len(text) - text.count(' ')
        figure_char_count = sum(1 for char in text if char in figure_related_chars)
        char_count = text_count - figure_char_count

        # if text is only spaces OR
        # if text end with . (i.e end of sentence) or :, ;, etc OR
        # if the text is long, it's probably not a figure data
        if (text_count == 0) or text.endswith(('.', ':', ';', ',')) or (char_count > 20):
            return False

        # if text is only numbers OR
        # if figure characters exceed 10% of the other characters,
        # consider it as potential figure data
        if (char_count == 0) or (figure_char_count / (char_count + 1e-10) > 0.1):
            return True

        return False

    COMMON_FOOTNOTES = ["read more", "see above", "note:", "see below", "*", "..."]

    doc = docx.Document(file_path)
    result = {
        'metadata': {
            'title': doc.core_properties.title,
            'author': doc.core_properties.author,
            'created': doc.core_properties.created
        },
        'text': [],
        'tables': [],
        'potential_figures': []
    }

    # Extract text and figures:
    figure_data_group = {'title': None, 'data': []}
    previous_para = None
    previous_was_figure_data = False
    skip_next = False

    for para in doc.paragraphs:
        clean_text = para.text.strip()

        if not clean_text:  # Ignore empty lines
            continue

        # Check if the line is a common footnote id
        if clean_text.lower() in COMMON_FOOTNOTES:
            skip_next = True  # The following line is the footnote text itself
            continue

        if skip_next:  # If the line is footnote text
            result['text'].append(clean_text)  # Store it under 'text'
            skip_next = False  # Reset the flag
            continue

        # Identify if the current line is potential figure data
        current_is_figure_data = is_potential_figure_data(clean_text)

        if current_is_figure_data:
            # If previous line was also figure data, they belong to the same figure
            if previous_was_figure_data:
                figure_data_group['data'].append(clean_text)
            else:
                # If a new figure starts, save the previous figure (if there was any)
                if figure_data_group['data']:
                    result['potential_figures'].append(figure_data_group)
                    figure_data_group = {'title': None, 'data': []}

                # Assign the previous line as the title for the current figure
                figure_data_group['title'] = previous_para
                figure_data_group['data'].append(clean_text)
        else:
            # Not a figure line, add to text
            result['text'].append(clean_text)

        # Keep track of the current paragraph's text for the next iteration
        previous_para = clean_text

        # Set the current line's status for the next iteration
        previous_was_figure_data = current_is_figure_data

    # Append any remaining figure data group. Its title was fixed when the
    # group was opened and must not be replaced by the document's last paragraph.
    if figure_data_group['data']:
        result['potential_figures'].append(figure_data_group)

    # Extract tables
    for table in doc.tables:
        table_rows = list(table.rows)
        if not table_rows:
            # A table without rows has neither headers nor data.
            result['tables'].append({'headers': [], 'rows': []})
            continue

        # The first row supplies the headers, so data starts at the second row.
        headers = [cell.text.strip() for cell in table_rows[0].cells]
        table_content = {'headers': headers, 'rows': []}

        for row in table_rows[1:]:
            row_content = {}
            for index, cell in enumerate(row.cells):
                cell_text = ' '.join(paragraph.text.strip() for paragraph in cell.paragraphs if paragraph.text.strip())
                # Rows wider than the header row get positional column names.
                header = headers[index] if index < len(headers) else f"Column_{index+1}"
                row_content[header] = cell_text

            table_content['rows'].append(row_content)
        result['tables'].append(table_content)

    return result

    
# Testing the function
pdf_url = "https://www.coca-colacompany.com/content/dam/company/us/en/reports/coca-cola-business-and-sustainability-report-2022.pdf"
fname = url2fname(pdf_url)  # used to save vectordb
try:
    # Reuse an already converted DOCX when one exists at the default output
    # location of convert_url_pdf_to_docx; otherwise run the conversion.
    # This keeps the cell runnable in a fresh kernel, where `docx_path` would
    # otherwise be undefined.
    docx_path = os.path.join(os.getcwd(), "..", "data", "docx_db", f"{fname}.docx")
    if not os.path.exists(docx_path):
        docx_path = convert_url_pdf_to_docx(pdf_url)

    if not docx_path:
        print("Failed to convert PDF to DOCX. Exiting...")
    else:
        doc_contents = read_docx(docx_path)

        # NOTE: post-processing that merged year-titled single-value figures into
        # the preceding figure was tried and dropped; it was too difficult to
        # implement properly on top of the pattern-matching figure detection.

        # Example usage:
        print("Text Extracted:")
        for para in doc_contents['text']:
            print(para)
        print('---'*50)

        print("\n\nFigures Extracted:")
        for figure in doc_contents['potential_figures']:
            print(figure)
        print('---'*50)

        print("\n\nTables Extracted:")
        for table in doc_contents['tables']:
            print(table)
except Exception as e:
    print(f"An error occurred: {e}")

Text Extracted:
Refresh the World. Make a Difference.
CONTENTS
We build loved brands that bring joy to our consumers’ lives with beverage choices for all occasions, tastes and lifestyles. Our growth strategy is grounded in our core values and commitment to social and environmental responsibility.
SCOPE OF THIS REPORT
This 2022 Business & Sustainability Report is The Coca-Cola Company’s fifth report to integrate overall business and sustainability performance, data and context, reflecting our continued journey toward driving sustainable business practices into our core strategy.
Except as otherwise noted, this report covers the 2022 performance of The Coca-Cola Company and the Coca-Cola system (our company and our bottling partners), as applicable.
As used in this report, the terms “material,” “materiality,” “immaterial,” “substantive,” “significant” and other similar terminology are not used, or intended to be construed, as they have been defined by or construed in accordance with the 

In [None]:
# URL -> file name convertor:
def url2fname(url):
    """Return the last path segment of ``url`` without its first dot and anything after it."""
    # Only the text after the final '/' is relevant.
    tail = url.rsplit('/', 1)[-1]

    # Cut at the first dot so multi-part suffixes (".tar.gz") disappear as a whole.
    return re.sub(r'\..*$', '', tail)
    
    
# Create URL loader:
def wait_for_file(file_path: str, timeout: int = 60, poll_interval: float = 1.0) -> bool:
    """
    Wait for a file to be present at a specified path within a given timeout.

    The path is always checked at least once, so a file that already exists is
    reported even when ``timeout`` is zero or negative.

    Args:
        file_path (str): Path to the file.
        timeout (int): Maximum waiting time in seconds. Default is 60 seconds.
        poll_interval (float): Seconds to sleep between checks. Default is 1 second.

    Returns:
        bool: True if file is found within the timeout, False otherwise.
    """
    # Monotonic clock: a duration must not be affected by system clock changes.
    deadline = time.monotonic() + timeout

    while True:
        if os.path.exists(file_path):
            return True

        remaining = deadline - time.monotonic()
        if remaining <= 0:
            return False

        # Never sleep past the deadline (and never pass a negative value).
        time.sleep(max(0.0, min(poll_interval, remaining)))


def download_pdf_from_url(url: str, save_path: str, timeout: float = 60.0) -> Optional[str]:
    """
    Download a PDF from the specified URL and save it to a local path.

    Args:
        url (str): URL of the PDF.
        save_path (str): Local path to save the downloaded PDF.
        timeout (float): Seconds to wait for the server before giving up.
            Without it a stalled connection would block indefinitely.

    Returns:
        Optional[str]: Path to the saved PDF if successful, None otherwise
            (any response status other than 200).

    Raises:
        requests.RequestException: On connection problems or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return None

    with open(save_path, 'wb') as file:
        file.write(response.content)

    return save_path if os.path.exists(save_path) else None


def convert_pdf_to_docx(
    pdf_filename: str, driver_path: str, pdf_folder_path: str, docx_folder_path: str
) -> Optional[str]:
    """
    Convert a PDF to a DOCX using Adobe's online tool.

    A Firefox session is driven through Selenium; the native file-selection
    dialog is filled in by typing the path with pyautogui, so the browser
    window must keep keyboard focus while this runs.

    Args:
        pdf_filename (str): Filename of the PDF.
        driver_path (str): Path to the geckodriver executable.
        pdf_folder_path (str): Directory where the PDF is located.
        docx_folder_path (str): Directory where the converted DOCX should be saved.

    Returns:
        Optional[str]: Path to the converted DOCX if successful, None otherwise.
    """
    # Validate the input before a browser is started, so a wrong path neither
    # costs a browser launch nor leaves a Firefox window behind.
    full_pdf_path = os.path.join(pdf_folder_path, pdf_filename)
    if not os.path.exists(full_pdf_path):
        print(f"File path\n{full_pdf_path}\nis not valid.")
        return None

    # WebDriver setup and configurations
    firefox_options = webdriver.FirefoxOptions()
    firefox_options.set_preference("browser.download.folderList", 2)
    firefox_options.set_preference("browser.download.dir", docx_folder_path)
    firefox_options.set_preference("browser.download.useDownloadDir", True)
    firefox_options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/vnd.openxmlformats-officedocument.wordprocessingml.document")

    service = Service(driver_path)
    driver = webdriver.Firefox(service=service, options=firefox_options)
    try:
        wait = WebDriverWait(driver, 180)
        driver.get("https://www.adobe.com/be_en/acrobat/online/pdf-to-word.html")

        # Upload the PDF
        upload_btn = wait.until(EC.element_to_be_clickable((By.ID, "lifecycle-nativebutton")))
        upload_btn.click()

        # Wait for the file selection dialog and input the file path using pyautogui
        time.sleep(5)
        pyautogui.typewrite(full_pdf_path)

        # Add a slight delay and then press 'enter' multiple times
        time.sleep(2)
        for _ in range(3):
            pyautogui.press('enter')
            time.sleep(0.1)
        time.sleep(10)

        # Handle cookies and start conversion
        try:
            cookie_reject_btn = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#onetrust-reject-all-handler")))
            if cookie_reject_btn:
                cookie_reject_btn.click()
        except Exception:
            # Best effort: the banner is not always shown.
            print("Cookie settings notification not found or failed to click.")

        download_btn = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'button.Download__downloadButton___2qFEa')))
        download_btn.click()
        time.sleep(10)
    finally:
        # Close the browser on every path, including Selenium timeouts.
        driver.quit()

    # Swap only the real extension; str.replace would also touch ".pdf" inside
    # the name and would miss an upper-case ".PDF".
    expected_docx_filename = os.path.splitext(pdf_filename)[0] + '.docx'
    expected_docx_filepath = os.path.join(docx_folder_path, expected_docx_filename)

    return expected_docx_filepath if wait_for_file(expected_docx_filepath, 50) else None


def convert_url_pdf_to_docx(
    pdf_url: str,
    driver_path: str = "./drivers/geckodriver.exe",
    pdf_folder_path: Optional[str] = None,
    docx_folder_path: Optional[str] = None
) -> Optional[str]:
    """
    Download a PDF from a URL, convert it to DOCX, and save it locally.

    Args:
        pdf_url (str): URL of the PDF.
        driver_path (str): Path to the geckodriver executable. Default is './drivers/geckodriver.exe'.
        pdf_folder_path (str): Directory to save the downloaded PDF. Default is '../data/pdf_db'
            relative to the current working directory.
        docx_folder_path (str): Directory to save the converted DOCX. Default is '../data/docx_db'
            relative to the current working directory.

    Returns:
        Optional[str]: Path to the converted DOCX if successful, None otherwise.
    """
    from urllib.parse import urlsplit  # local import keeps this block self-contained

    # Take the name from the URL *path* only, so a query string or fragment
    # ("report.pdf?download=1") cannot end up in the local file name.
    pdf_filename = urlsplit(pdf_url).path.split('/')[-1]
    if not pdf_filename:
        print(f"Could not derive a file name from URL\n{pdf_url}")
        return None

    cwd = os.getcwd()
    pdf_folder_path = pdf_folder_path or os.path.join(cwd, "..", "data", "pdf_db")
    docx_folder_path = docx_folder_path or os.path.join(cwd, "..", "data", "docx_db")

    os.makedirs(pdf_folder_path, exist_ok=True)
    os.makedirs(docx_folder_path, exist_ok=True)

    pdf_save_path = os.path.join(pdf_folder_path, pdf_filename)

    if download_pdf_from_url(pdf_url, pdf_save_path):
        return convert_pdf_to_docx(pdf_filename, driver_path, pdf_folder_path, docx_folder_path)
    return None


def read_docx(file_path: str) -> dict:
    """
    Extract content from a DOCX file.

    Body paragraphs are split heuristically into running text and "potential
    figure data" (short, number-heavy lines). Consecutive figure lines form one
    group whose title is the paragraph that preceded the group. Every table is
    read with its first row as the column headers.

    Args:
        file_path (str): Path to the DOCX file.

    Returns:
        dict: Extracted content with the keys
            'metadata' (title, author, created),
            'text' (list of paragraph strings),
            'tables' (list of {'headers': [...], 'rows': [{header: cell_text}]}),
            'potential_figures' (list of {'title': str or None, 'data': [...]}).
    """

    def is_potential_figure_data(text):
        """Return True when ``text`` is short and dominated by digits, '.', '%' or '-'."""
        figure_related_chars = set("0123456789.%-")
        text_count = len(text) - text.count(' ')
        figure_char_count = sum(1 for char in text if char in figure_related_chars)
        char_count = text_count - figure_char_count

        # if text is only spaces OR
        # if text end with . (i.e end of sentence) or :, ;, etc OR
        # if the text is long, it's probably not a figure data
        if (text_count == 0) or text.endswith(('.', ':', ';', ',')) or (char_count > 20):
            return False

        # if text is only numbers OR
        # if figure characters exceed 10% of the other characters,
        # consider it as potential figure data
        if (char_count == 0) or (figure_char_count / (char_count + 1e-10) > 0.1):
            return True

        return False

    COMMON_FOOTNOTES = ["read more", "see above", "note:", "see below", "*", "..."]

    doc = docx.Document(file_path)
    result = {
        'metadata': {
            'title': doc.core_properties.title,
            'author': doc.core_properties.author,
            'created': doc.core_properties.created
        },
        'text': [],
        'tables': [],
        'potential_figures': []
    }

    # Extract text and figures:
    figure_data_group = {'title': None, 'data': []}
    previous_para = None
    previous_was_figure_data = False
    skip_next = False

    for para in doc.paragraphs:
        clean_text = para.text.strip()

        if not clean_text:  # Ignore empty lines
            continue

        # Check if the line is a common footnote id
        if clean_text.lower() in COMMON_FOOTNOTES:
            skip_next = True  # The following line is the footnote text itself
            continue

        if skip_next:  # If the line is footnote text
            result['text'].append(clean_text)  # Store it under 'text'
            skip_next = False  # Reset the flag
            continue

        # Identify if the current line is potential figure data
        current_is_figure_data = is_potential_figure_data(clean_text)

        if current_is_figure_data:
            # If previous line was also figure data, they belong to the same figure
            if previous_was_figure_data:
                figure_data_group['data'].append(clean_text)
            else:
                # If a new figure starts, save the previous figure (if there was any)
                if figure_data_group['data']:
                    result['potential_figures'].append(figure_data_group)
                    figure_data_group = {'title': None, 'data': []}

                # Assign the previous line as the title for the current figure
                figure_data_group['title'] = previous_para
                figure_data_group['data'].append(clean_text)
        else:
            # Not a figure line, add to text
            result['text'].append(clean_text)

        # Keep track of the current paragraph's text for the next iteration
        previous_para = clean_text

        # Set the current line's status for the next iteration
        previous_was_figure_data = current_is_figure_data

    # Append any remaining figure data group. Its title was fixed when the
    # group was opened and must not be replaced by the document's last paragraph.
    if figure_data_group['data']:
        result['potential_figures'].append(figure_data_group)

    # Extract tables
    for table in doc.tables:
        table_rows = list(table.rows)
        if not table_rows:
            # A table without rows has neither headers nor data.
            result['tables'].append({'headers': [], 'rows': []})
            continue

        # The first row supplies the headers, so data starts at the second row.
        headers = [cell.text.strip() for cell in table_rows[0].cells]
        table_content = {'headers': headers, 'rows': []}

        for row in table_rows[1:]:
            row_content = {}
            for index, cell in enumerate(row.cells):
                cell_text = ' '.join(paragraph.text.strip() for paragraph in cell.paragraphs if paragraph.text.strip())
                # Rows wider than the header row get positional column names.
                header = headers[index] if index < len(headers) else f"Column_{index+1}"
                row_content[header] = cell_text

            table_content['rows'].append(row_content)
        result['tables'].append(table_content)

    return result


def read_docx(file_path: str) -> dict:
    """
    Extract text and tables from a DOCX file by walking the body in document order.

    Because paragraphs and tables are visited in the order they appear, the
    paragraph immediately before a table is recorded as that table's title.
    No figure detection is done here; 'potential_figures' stays empty.

    NOTE: this definition shares its name with the heuristic ``read_docx``
    defined earlier in the same cell and therefore replaces it once the cell runs.

    Args:
        file_path (str): Path to the DOCX file.

    Returns:
        dict: Extracted content with the keys
            'metadata' (title, author, created),
            'text' (list of paragraph strings),
            'tables' (list of {'title': str or None, 'col_headers': [...],
                               'table': [{header: cell_text}]}),
            'potential_figures' (always an empty list).
    """
    WORD_NS = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'

    def local_name(elem):
        """Return the tag of ``elem`` without its XML namespace ('' for non-element nodes)."""
        tag = elem.tag
        return tag.rsplit('}', 1)[-1] if isinstance(tag, str) else ''

    def extract_footnotes_from_para(para):
        """Extract footnote references and actual footnotes from a paragraph."""
        footnotes = []

        # Look for footnote references in the XML
        footnote_refs = para._element.findall('.//w:footnoteReference', namespaces={'w': WORD_NS})

        for ref in footnote_refs:
            footnote_id = ref.get(f"{{{WORD_NS}}}id")
            # NOTE(review): `footnotes_part` / `footnote_dict` are assumed to be
            # provided by the installed docx package -- confirm; if they are
            # missing, any paragraph with a footnote reference raises here.
            footnote = para.part.footnotes_part.footnote_dict[footnote_id]
            footnotes.append(footnote.text)

        return footnotes

    doc = docx.Document(file_path)
    result = {
        'metadata': {
            'title': doc.core_properties.title,
            'author': doc.core_properties.author,
            'created': doc.core_properties.created
        },
        'text': [],
        'tables': [],
        'potential_figures': []
    }

    elements = list(doc.element.body)
    table_title = None

    for i, elem in enumerate(elements):
        kind = local_name(elem)

        # Paragraph
        if kind == 'p':
            # The document is passed as parent so that `para.part` resolves;
            # with a None parent the footnote lookup could never work.
            para = docx.text.paragraph.Paragraph(elem, doc)

            processed_text = para.text.strip()

            # Process footnotes
            footnotes = extract_footnotes_from_para(para)

            # Attach footnotes to the text.
            for fn_idx, footnote in enumerate(footnotes, 1):
                processed_text = processed_text.replace(f"[{fn_idx}]", f"({footnote})")

            if processed_text:
                result['text'].append(processed_text)

            # Check if this paragraph might be a table title
            if i + 1 < len(elements) and local_name(elements[i + 1]) == 'tbl':
                table_title = processed_text

        # Table
        elif kind == 'tbl':
            table = docx.table.Table(elem, doc)
            table_rows = list(table.rows)

            # The first row supplies the headers; a table without rows has none.
            headers = [cell.text.strip() for cell in table_rows[0].cells] if table_rows else []

            rows = []
            for row in table_rows[1:]:
                # Rows wider than the header row get positional column names.
                rows.append({
                    (headers[j] if j < len(headers) else f"Column_{j+1}"): cell.text.strip()
                    for j, cell in enumerate(row.cells)
                })

            result['tables'].append({
                'title': table_title,
                'col_headers': headers,
                'table': rows
            })

            # Reset table title
            table_title = None

    return result

  
# Testing the function
pdf_url = "https://www.coca-colacompany.com/content/dam/company/us/en/reports/coca-cola-business-and-sustainability-report-2022.pdf"
fname = url2fname(pdf_url)  # used to save vectordb
try:
    docx_path = convert_url_pdf_to_docx(pdf_url)

    if not docx_path:
        # Report and skip the rest. Calling exit(1) here would raise SystemExit,
        # which `except Exception` below does not catch, and would end the
        # running session instead of just this cell.
        print("Failed to convert PDF to DOCX. Exiting...")
    else:
        doc_contents = read_docx(docx_path)

        # Example usage:
        print("Text Extracted:")
        for para in doc_contents['text']:
            print(para)
        print('---'*50)

        print("\n\nTables Extracted:")
        for table in doc_contents['tables']:
            print(table)

        print("\n\nFigures Extracted:")
        for figure in doc_contents['potential_figures']:
            print(figure)
        print('---'*50)

except Exception as e:
    print(f"An error occurred: {e}")

In [None]:
# URL -> file name convertor:
def url2fname(url):
    """
    Derive a bare file name (without extension) from a URL.

    The text after the final '/' is taken and everything from its first '.'
    onwards is dropped, e.g. '.../report.v2.pdf' becomes 'report'.

    Args:
        url (str): URL (or path) whose last segment names the file.

    Returns:
        str: The last segment truncated at its first dot.
    """
    tail = url.rsplit('/', 1)[-1]
    return re.sub(r'\..*$', '', tail)
    
    
# Create URL loader:
def wait_for_file(file_path: str, timeout: int = 60) -> bool:
    """
    Wait for a file to be present at a specified path within a given timeout.

    The path is always checked at least once, so an already-existing file is
    reported even when ``timeout`` is 0 or negative.

    Args:
        file_path (str): Path to the file.
        timeout (int): Maximum waiting time in seconds. Default is 60 seconds.

    Returns:
        bool: True if file is found within the timeout, False otherwise.
    """
    # A monotonic clock is unaffected by system clock adjustments while waiting.
    deadline = time.monotonic() + timeout

    while True:
        if os.path.exists(file_path):
            return True

        remaining = deadline - time.monotonic()
        if remaining <= 0:
            return False

        # Poll roughly once per second, but never sleep past the deadline.
        time.sleep(min(1, remaining))


def download_pdf_from_url(url: str, save_path: str, timeout: float = 60) -> str:
    """
    Download a PDF from the specified URL and save it to a local path.

    Args:
        url (str): URL of the PDF.
        save_path (str): Local path to save the downloaded PDF.
        timeout (float): Seconds to wait for the server before giving up.
            Default is 60 seconds.

    Returns:
        str: Path to the saved PDF if successful, None otherwise (request
            error, non-200 status code, or the file is missing after writing).
    """
    # Without a timeout requests can block indefinitely on an unresponsive
    # server; request errors are reported and mapped to the documented None.
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to download {url}: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to download {url}: HTTP status {response.status_code}")
        return None

    with open(save_path, 'wb') as file:
        file.write(response.content)

    return save_path if os.path.exists(save_path) else None


def convert_pdf_to_docx(
    pdf_filename: str, driver_path: str, pdf_folder_path: str, docx_folder_path: str
) -> str:
    """
    Convert a PDF to a DOCX using Adobe's online tool.

    Drives Firefox through Selenium: opens the converter page, clicks the
    upload button, types the PDF path into the file selection dialog with
    pyautogui, rejects the cookie banner if present, clicks the download button
    and finally waits for the DOCX to appear in ``docx_folder_path``.

    Args:
        pdf_filename (str): Filename of the PDF.
        driver_path (str): Path to the geckodriver executable.
        pdf_folder_path (str): Directory where the PDF is located.
        docx_folder_path (str): Directory where the converted DOCX should be saved.

    Returns:
        str: Path to the converted DOCX if successful, None otherwise.
    """
    # Validate the input BEFORE launching a browser, so that a bad path neither
    # starts Firefox nor leaves a browser window behind.
    full_pdf_path = os.path.join(pdf_folder_path, pdf_filename)
    if not os.path.exists(full_pdf_path):
        print(f"File path\n{full_pdf_path}\nis not valid.")
        return None

    # WebDriver setup and configurations
    firefox_options = webdriver.FirefoxOptions()
    firefox_options.set_preference("browser.download.folderList", 2)
    firefox_options.set_preference("browser.download.dir", docx_folder_path)
    firefox_options.set_preference("browser.download.useDownloadDir", True)
    firefox_options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/vnd.openxmlformats-officedocument.wordprocessingml.document")

    service = Service(driver_path)
    driver = webdriver.Firefox(service=service, options=firefox_options)
    try:
        wait = WebDriverWait(driver, 180)
        driver.get("https://www.adobe.com/be_en/acrobat/online/pdf-to-word.html")

        # Upload the PDF
        upload_btn = wait.until(EC.element_to_be_clickable((By.ID, "lifecycle-nativebutton")))
        upload_btn.click()

        # Wait for the file selection dialog and input the file path using pyautogui
        time.sleep(5)
        pyautogui.typewrite(full_pdf_path)

        # Add a slight delay and then press 'enter' multiple times
        time.sleep(2)
        for _ in range(3):
            pyautogui.press('enter')
            time.sleep(0.1)
        time.sleep(10)

        # Handle cookies and start conversion
        # NOTE(review): this reuses the 180 s wait, so a missing cookie banner
        # delays the run by up to 3 minutes -- consider a shorter dedicated wait.
        try:
            cookie_reject_btn = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#onetrust-reject-all-handler")))
            if cookie_reject_btn:
                cookie_reject_btn.click()
        except Exception:
            print("Cookie settings notification not found or failed to click.")

        download_btn = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'button.Download__downloadButton___2qFEa')))
        download_btn.click()
        time.sleep(10)
    finally:
        # Always close the browser, including when a wait times out or a click fails.
        driver.quit()

    # NOTE(review): a DOCX with this name left over from an earlier run would
    # satisfy the check below immediately -- confirm whether stale files matter.
    expected_docx_filename = pdf_filename.replace('.pdf', '.docx')
    expected_docx_filepath = os.path.join(docx_folder_path, expected_docx_filename)

    return expected_docx_filepath if wait_for_file(expected_docx_filepath, 55) else None


def convert_url_pdf_to_docx(
    pdf_url: str, 
    driver_path: str = "./drivers/geckodriver.exe", 
    pdf_folder_path: str = None, 
    docx_folder_path: str = None
) -> str:
    """
    Download a PDF from a URL, convert it to DOCX, and save it locally.

    Args:
        pdf_url (str): URL of the PDF.
        driver_path (str): Path to the geckodriver executable. Default is './drivers/geckodriver.exe'.
        pdf_folder_path (str): Directory to save the downloaded PDF. Default is
            '<current working directory>/data/pdf_db'.
        docx_folder_path (str): Directory to save the converted DOCX. Default is
            '<current working directory>/data/docx_db'.

    Returns:
        str: Path to the converted DOCX if successful, None otherwise.
    """
    from urllib.parse import urlparse

    # Take the file name from the URL *path* only, so that a query string or
    # fragment ('...report.pdf?download=1') does not end up in the file name.
    pdf_filename = urlparse(pdf_url).path.split('/')[-1]
    if not pdf_filename:
        # E.g. a URL ending in '/': the save path would be the folder itself.
        print(f"Could not derive a file name from URL: {pdf_url}")
        return None

    cwd = os.getcwd()
    pdf_folder_path = pdf_folder_path or os.path.join(cwd, "data", "pdf_db")
    docx_folder_path = docx_folder_path or os.path.join(cwd, "data", "docx_db")

    os.makedirs(pdf_folder_path, exist_ok=True)
    os.makedirs(docx_folder_path, exist_ok=True)

    pdf_save_path = os.path.join(pdf_folder_path, pdf_filename)

    # Only attempt the conversion once the PDF has been saved locally.
    if download_pdf_from_url(pdf_url, pdf_save_path):
        return convert_pdf_to_docx(pdf_filename, driver_path, pdf_folder_path, docx_folder_path)
    return None


def read_docx(file_path: str) -> dict:
    """
    Parse a DOCX file into running text, tables and potential figure data.

    Body elements are visited in document order. Each non-empty paragraph is
    classified with a character-based heuristic as running text or as "figure
    data" (short, digit-heavy lines). Consecutive figure lines are grouped and
    the paragraph preceding a group becomes its title. A paragraph directly
    followed by a table is recorded as that table's title.

    Note: verbose debugging output is printed for every paragraph.

    Args:
        file_path (str): Path to the DOCX file.

    Returns:
        dict: With keys
            'metadata' (dict): 'title', 'author' and 'created' core properties.
            'text' (list): Paragraph strings kept as running text.
            'tables' (list): One dict per table with 'title', 'col_headers'
                and 'table' (a list of {header: cell text} dicts, one per row
                after the first).
            'potential_figures' (list): Dicts with 'title' and 'data' (list of
                paragraph strings).
    """

    def extract_footnotes_from_para(para):
        """Extract footnote references and actual footnotes from a paragraph."""
        footnotes = []

        # Look for footnote references in the XML
        footnote_refs = para._element.findall(
            './/w:footnoteReference', 
            namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
        )

        # NOTE(review): paragraphs are built below with parent=None, and
        # `footnotes_part` does not look like a stock python-docx attribute, so
        # this lookup presumably fails as soon as a reference is found -- confirm
        # which python-docx build this notebook targets.
        for ref in footnote_refs:
            footnote_id = ref.get(
                "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}id"
            )
            footnote = para.part.footnotes_part.footnote_dict[footnote_id]
            footnotes.append(footnote.text)

        return footnotes

    def is_potential_figure_data(text):
        """Heuristically decide whether a line looks like data from a chart/figure."""
        if text is None:
            return False

        figure_related_chars = set("0123456789.%-")
        text_count = len(text) - text.count(' ')  # all characters except spaces
        figure_char_count = sum(1 for char in text if char in figure_related_chars)
        char_count = text_count - figure_char_count  # remaining "ordinary" characters

        # if text is only spaces OR
        # if text end with . (i.e end of sentence) or :, ;, etc OR
        # if the text is long, it's probably not a figure data
        if (text_count == 0) or text.endswith(('.', ':', ';', ',')) or (char_count > 20):
            return False

        # if text is only figure characters OR
        # if figure characters exceed 10% of the ordinary characters,
        # consider it as potential figure data
        if (char_count == 0) or (figure_char_count / (char_count + 1e-10) > 0.1):
            return True

        return False

    def safe_text(para):
        """Return para.text, or None when para is None or exposes no usable text."""
        try:
            return para.text
        except AttributeError:
            return None

    doc = docx.Document(file_path)
    result = {
        'metadata': {
            'title': doc.core_properties.title,
            'author': doc.core_properties.author,
            'created': doc.core_properties.created
        },
        'text': [],
        'tables': [],
        'potential_figures': []
    }

    elements = doc.element.body

    # Resolve the table elements once instead of rebuilding the list per table.
    doc_tables = doc.tables
    table_elements = [tbl._element for tbl in doc_tables]

    figure_data_group = {'title': None, 'data': []}

    # Sliding window over paragraphs: previous (last non-empty), current, next.
    previous_para = None
    current_para = None
    next_para = docx.text.paragraph.Paragraph(elements[0], None) if len(elements) > 0 else None

    previous_was_figure_data = False
    current_is_figure_data = False
    # safe_text covers an empty document and a first element that is not a paragraph.
    next_is_figure_data = is_potential_figure_data(safe_text(next_para))

    table_title = None

    for i in tqdm.tqdm(range(len(elements))):

        elem = elements[i]
        # Paragraph
        if elem.tag.endswith('p'):

            print('\n')
            print(i)

            # Move the window forward. current_para is still None for the first
            # paragraph; empty paragraphs never become the "previous" one.
            if current_para is not None and len(current_para.text.strip()) > 0:
                previous_para = current_para
            current_para = docx.text.paragraph.Paragraph(elem, None)
            next_para = docx.text.paragraph.Paragraph(elements[i + 1], None) if i + 1 < len(elements) else None

            processed_text = current_para.text.strip()
            if len(processed_text) == 0:  # Ignore empty lines
                print('skip')
                continue

            next_text = safe_text(next_para)

            print('previous para, current para, next para text')
            if previous_para is not None:
                print(previous_para.text)
                print('-0-0-')
            print(processed_text)
            print('-0-0-')
            print(next_text)

            # Process footnotes
            footnotes = extract_footnotes_from_para(current_para)
            for idx, footnote in enumerate(footnotes, 1):
                footnote_patterns = [
                    re.compile(r"\[{}\]".format(idx)),  # [1], [2], ...
                    re.compile(r"^{}\b".format(idx)),  # Starts with number: 1Blah ...
                    re.compile(r"\b{}\b".format(idx)),  # Between words: blah 1 blah ...
                    re.compile(r"\b{}\.".format(idx))  # End of sentence: blah.1 or word: blah1
                ]
                replacement = "[{}]".format(footnote)
                for pattern in footnote_patterns:
                    if pattern.search(processed_text):
                        print(f'text prior to footnote fill: {processed_text}')
                        # A callable replacement inserts the footnote verbatim; a plain
                        # string would have its backslashes interpreted as regex escapes.
                        processed_text = pattern.sub(lambda _match: replacement, processed_text).strip()
                        print(f'text after fill: {processed_text}')

            # Identify if the current line is potential figure data
            previous_was_figure_data = current_is_figure_data
            current_is_figure_data = next_is_figure_data
            next_is_figure_data = is_potential_figure_data(next_text)

            print('Is figure: previous, current, next...')
            print(previous_was_figure_data, current_is_figure_data, next_is_figure_data)

            if current_is_figure_data:

                print('Figure data group in if clause (start)')
                print(figure_data_group)

                # If previous line was also figure data, they belong to the same figure
                if previous_was_figure_data:
                    figure_data_group['data'].append(processed_text)
                else:
                    # If a new figure starts, save the previous figure (if there was any)
                    if figure_data_group['data']:
                        result['potential_figures'].append(figure_data_group)
                        figure_data_group = {'title': None, 'data': []}

                    # Assign the previous line as the title for the current figure
                    # (there is none when the document starts with figure data).
                    figure_data_group['title'] = (
                        previous_para.text.strip() if previous_para is not None else None
                    )
                    figure_data_group['data'].append(processed_text)
                print('group after if...')
                print(figure_data_group)
            elif not next_is_figure_data:  # neither next or current text is figure
                # Not a figure line, add to text
                result['text'].append(processed_text)
                print('in else clause where text is stored... showing last element of text')
                print(result['text'][-1])
            else:  # next text is figure, meaning that current text will be stored as title.
                pass

            # Check if this paragraph might be a table title
            # (bounds check: the last body element has no successor).
            if i + 1 < len(elements) and elements[i+1].tag.endswith('tbl'):
                table_title = processed_text
                print('In table title where...')
                print(table_title)
                continue

            # Update trackers for figures:
            # Handles case when text interrupts figure.
            if previous_was_figure_data and \
                not current_is_figure_data and \
                next_is_figure_data:
                print('Reached case when text interupt figure...')
                current_is_figure_data = True

            print('----------------\n')

        # Table
        elif elem.tag.endswith('tbl'):
            table = doc_tables[table_elements.index(elem)]
            table_rows = list(table.rows)

            # The first row is treated as the header row; a table without rows
            # yields empty headers and no data rows.
            headers = [cell.text.strip() for cell in table_rows[0].cells] if table_rows else []

            rows = []
            for row in table_rows[1:]:
                row_data = {headers[j]: cell.text.strip() for j, cell in enumerate(row.cells)}
                rows.append(row_data)

            result['tables'].append({
                'title': table_title,
                'col_headers': headers,
                'table': rows
            })

            # Reset table title
            table_title = None

    # Groups are only stored when the next figure starts, so the last one
    # still has to be flushed here.
    if figure_data_group['data']:
        result['potential_figures'].append(figure_data_group)

    return result

  
# Testing the function
# Parses an already-converted DOCX and prints the extracted text, tables and
# potential figures. The download/conversion step is commented out below.
pdf_url = "https://www.coca-colacompany.com/content/dam/company/us/en/reports/coca-cola-business-and-sustainability-report-2022.pdf"
fname = url2fname(pdf_url)  # used to save vectordb
# try:
#     docx_path = convert_url_pdf_to_docx(pdf_url)

#     if not docx_path:
#         print("Failed to convert PDF to DOCX. Exiting...")
#         exit(1)
        
# NOTE: docx_path is not assigned in this cell (the assignment above is
# commented out), so it must already exist in the running session.
doc_contents = read_docx(docx_path)

# Example usage:
print("Text Extracted:")
for para in doc_contents['text']:
    print(para)
print('---'*50)

print("\n\nTables Extracted:")
for table in doc_contents['tables']:
    print(table)

print("\n\nFigures Extracted:")
for figure in doc_contents['potential_figures']:
    print(figure)
print('---'*50)
    
# except Exception as e:
#     print(f"An error occurred: {e}")

### Obtain short summaries for tables.
To facilitate the encoding of tables, we will ask an LLM to generate a textual summary of each table's contents. The idea is that this summary will yield better vector encodings than encoding the raw table directly. It is an added cost, but one that will hopefully yield better context for our LLMs. Note that the table with its summary will be fed to the engine if the tabular chunk is selected as context. The key when summarising is to obtain short summaries!

In [25]:
# Ask the chat model for a short description of every extracted table and keep
# the answers, in table order, under doc_contents['table_summary'].
parameters = {'model': 'gpt-3.5-turbo', "temperature": 0}
doc_contents['table_summary'] = []

for i, table in enumerate(doc_contents['tables'], start=1):
    # The table dict is embedded in the prompt through its string representation.
    user_prompt = f"In 100 words or less, describe what the following table aims to display: {table}"
    parameters['messages'] = [
        {"role": "system", "content": "You are a table summarizer."},
        {"role": "user", "content": user_prompt},
    ]

    response = openai.ChatCompletion.create(**parameters)
    summary = response['choices'][0]['message']['content']

    doc_contents['table_summary'].append(summary)
    print(f"Table {i} Summary;\n {summary}\n\n")

Table 0 Summary;
 The table aims to display the goals and current status of reducing greenhouse gas (GHG) emissions. The headers indicate the categories "GOAL" and "2022 STATUS". The rows provide specific information about the goal to reduce absolute GHG emissions by 25% by 2030, against a 2015 baseline. The 2022 status shows that there has been a 7% reduction in GHG emissions so far.


Table 1 Summary;
 The table aims to display the ambition of achieving net zero emissions by 2050. The header "AMBITION" indicates that the table is focused on this specific goal. The single row in the table states the same ambition, emphasizing the commitment to reduce greenhouse gas emissions to a level where they are balanced by the removal of an equivalent amount from the atmosphere by 2050.


Table 2 Summary;
 The table aims to display the gender distribution within different levels of the organization, including senior leadership, middle management, professionals, and the total workforce. It provid

Table 19 Summary;
 The table aims to display data related to human rights cases reported by category, including the number of workplace rights questions asked, cases of child labor, discrimination, forced labor, freedom of association, retaliation, safe and healthy workplace, work hours and wages, workplace security, and total cases. It also includes information on the investment back into local communities and the amount of charitable contributions made by The Coca-Cola Company and The Coca-Cola Foundation. Additionally, it provides the percentage of the company's operating income for each year.


Table 20 Summary;
 The table aims to display various data points for different years related to human rights audits by region, the number of women economically enabled, and the percentage of key agricultural ingredients sustainably sourced. It provides information on the total number of human rights audits conducted each year, broken down by region. It also shows the cumulative number of wom

### Text Splitting, Embedding Models, and Vector DB
We'll be using OpenAI's text-embedding-ada-002 model. 

In [19]:
# Prepare text for chunking:
text = "\n\n".join(doc_contents["text"])

In [27]:
sum([10, 20])

30

In [26]:
model = 'gpt-3.5-turbo'  # open ai LLM model we will be using later.
# model = 'text-davinci-003'  # open ai LLM model we will be using later.
# Look up the name of the encoding used by `model`, then load that encoding.
enc_code = tiktoken.encoding_for_model(model).name
tokenizer = tiktoken.get_encoding(enc_code)

# Determine length of input after tokenization
def tiktoken_len(text: str) -> int:
    """Return the number of tokens `text` encodes to with the module-level tokenizer."""
    # NOTE(review): disallowed_special=() presumably lets special-token text be
    # encoded as ordinary text instead of raising -- confirm against tiktoken docs.
    tokens = tokenizer.encode(
        text,
        disallowed_special=()
    )
    return len(tokens)

# Lengths are measured with tiktoken_len, so chunk_size and chunk_overlap are
# expressed in tokens rather than characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=600,  # in order to be able to fit 3 chunks in context window
    chunk_overlap=100,  
    length_function=tiktoken_len,
    separators=["\n\n", "\n", " ", ""]  # order in which splits are prioritized
)
chunks = text_splitter.split_text(text)
# len(text) below is a character count, unlike the token-based chunk sizes.
print(
    f'The input text of lenght {len(text)} was split into {len(chunks)} chunks.'
)

The input text of lenght 187603 was split into 76 chunks.


In [30]:
def assert_roughly_equal(value1, value2, tolerance, message=None):
    """
    Raise AssertionError unless two numbers are close to each other.

    Args:
        value1: First number.
        value2: Second number.
        tolerance: Relative tolerance, forwarded to math.isclose as rel_tol.
        message: Optional error text; a default one is generated when omitted.

    Raises:
        AssertionError: If the values differ by more than the relative tolerance.
    """
    if math.isclose(value1, value2, rel_tol=tolerance):
        return

    raise AssertionError(
        f"{value1} and {value2} are not roughly equal within {tolerance} tolerance"
        if message is None
        else message
    )

# NOTE(review): the third argument is forwarded to math.isclose as rel_tol, so
# 100 is a 10,000 % relative tolerance; for non-negative lengths this check can
# never fail. Confirm the intended tolerance (e.g. a fraction such as 0.1).
assert_roughly_equal(sum([len(chunk) for chunk in chunks]), len(text), 100)

In [31]:
# extend chunks with table descriptions:
chunks.extend(doc_contents['table_summary'])

98


In [37]:
divider = '-'*100
for chunk in chunks:
    print(f'{chunk}\n{divider}\n\n')

Refresh the World. Make a Difference.

CONTENTS

We build loved brands that bring joy to our consumers’ lives with beverage choices for all occasions, tastes and lifestyles. Our growth strategy is grounded in our core values and commitment to social and environmental responsibility.

SCOPE OF THIS REPORT

This 2022 Business & Sustainability Report is The Coca-Cola Company’s fifth report to integrate overall business and sustainability performance, data and context, reflecting our continued journey toward driving sustainable business practices into our core strategy.

Except as otherwise noted, this report covers the 2022 performance of The Coca-Cola Company and the Coca-Cola system (our company and our bottling partners), as applicable.

As used in this report, the terms “material,” “materiality,” “immaterial,” “substantive,” “significant” and other similar terminology are not used, or intended to be construed, as they have been defined by or construed in accordance with the securities

### Indexing:
Store the indexes to avoid pointlessly rerunning the embedding code.

In [10]:
# Load the vector store from the on-disk cache when available; otherwise embed
# the chunks once and cache the result under the document's file name.
# NOTE(review): the cache is keyed by `fname` only, so it is reused even when
# `chunks` has changed -- delete the .pkl file to force a rebuild.
fpath = f"./data/faiss_db/{fname}.pkl"
if os.path.exists(fpath):
    # SECURITY: unpickling can execute arbitrary code; only load cache files
    # that this notebook wrote itself.
    with open(fpath, 'rb') as f:
        vector_store = pickle.load(f)
else:
    # Create the cache folder before paying for embeddings, so that writing
    # the pickle cannot fail on a missing directory afterwards.
    os.makedirs(os.path.dirname(fpath), exist_ok=True)

    # Init embedding model:
    embed = OpenAIEmbeddings(
        model='text-embedding-ada-002'
    )
    vector_store = FAISS.from_texts(chunks, embedding=embed)
    with open(fpath, 'wb') as f:
        pickle.dump(vector_store, f)

### Set up QA Agent:
We will create an agent capable of:
- Fetching contextual information from the vector store
- Performing basic math operations

This way, the agent should be able to derive KPIs that require basic math operations to compute.

In [11]:
llm = OpenAI(
    temperature=0,
    model_name=model
)

In [89]:
class ContextRetrieval(BaseTool):
    """Agent tool that fetches context chunks from the module-level vector store.

    Chunks that closely match one already returned by an earlier call are
    replaced by a short notice instead of being repeated.
    """
    
    name = "information_retrival"
    description = """
        Fetch the most recent information about a company's financials and ESG initiatives.
    """
    # Chunks already handed back by this tool; used to detect repeats.
    # NOTE(review): declared as a class-level mutable list; whether instances
    # share it depends on how BaseTool copies field defaults -- confirm.
    output_chunks = []

    @staticmethod
    def string_similarity(s1: str, s2: str) -> float:
        """Return the difflib.SequenceMatcher similarity ratio of two strings."""
        seq_matcher = difflib.SequenceMatcher(None, s1, s2)
        return seq_matcher.ratio()

    def _similarity_search(self, chunk: str) -> str:
        """Return `chunk`, or a notice if a near-duplicate was returned before."""
        for _chunk in self.output_chunks:
            # A ratio above 0.9 is treated as "already shared".
            if self.string_similarity(chunk, _chunk) > 0.9:
                return f"The information was shared in the previous {self.name} calls."
        
        # No highly similar chunk was returned before:
        # remember this chunk and return it unchanged.
        self.output_chunks.append(chunk)
        return chunk
    
    def _run(self, query: str) -> str:
        """Search the vector store for `query` and format the top 2 hits as ranked text."""
        # `vector_store` is a module-level name defined outside this class.
        contents = [
            self._similarity_search(doc.page_content) 
            for doc in vector_store.similarity_search(query, k=2)
        ]
        
        # Ranks follow the order in which similarity_search returned the hits.
        full_content = '\n\n'.join(
            [
                f"Rank: {rank} | Content: {_content}" 
                for rank, _content in enumerate(contents, start=1)
            ]
        )
        return full_content + "\n\nContextual information sorted from most relevant to least relevant."
    
    def _arun(self, query: str):
        """Async execution is not implemented; always raises NotImplementedError."""
        raise NotImplementedError(
            f"{self.__class__.__name__} does not currently support async run."
        )
        
# Math chain built on the module-level `llm`; its `run` method backs the
# Calculator tool below.
llm_math = LLMMathChain(llm=llm)

# initialize the math tool
math_tool = Tool(
    name='Calculator',
    func=llm_math.run,
    description='Useful when you need to perform math operations.'
)
# when giving tools to an LLM, we must pass them as a list of tools.
# The list holds the calculator plus a fresh ContextRetrieval instance.
tools = [math_tool, ContextRetrieval()]

**Setup an agent just like in url_retrieval.ipynb.**

Note that a `utils.py` file will be created to store these in the future.

In [90]:
# Set up the base template
template = """You are an analyst tasked with aggregating financial and ESG KPIs about companies. You have access to the following tools:

{tools}

Use the following format:

Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question

Begin! Remember to answer as succinctly as possible when giving your final answer. The final answer, where possible, should just be a number or a boolean.

Question: {input}
{agent_scratchpad}"""

In [91]:
# Set up a prompt template which breaks up the intermediate_steps
# into thoughts that are used to fill the agent_scratchpad,
# tools, and tool_names in the base template:
class CustomPromptTemplate(StringPromptTemplate):
    """Prompt template that renders agent history and tool metadata into `template`."""

    # The template to use
    template: str
    # The list of tools available.
    # The previous annotation `List[BaseTool or Tool]` evaluated to exactly this
    # type, because `BaseTool or Tool` returns its first (truthy) operand.
    # NOTE(review): presumably Tool is a BaseTool subclass and therefore still
    # accepted here -- confirm against the installed LangChain version.
    tools: List[BaseTool]

    def format(self, **kwargs) -> str:
        """
        Fill the template from the agent's inputs.

        Args:
            **kwargs: Template variables. Must contain "intermediate_steps", an
                iterable of (action, observation) pairs; it is removed and
                replaced by the derived "agent_scratchpad", "tools" and
                "tool_names" variables.

        Returns:
            str: The formatted prompt.

        Raises:
            KeyError: If "intermediate_steps" is missing.
        """
        # Get the intermediate steps (AgentAction, Observation tuples)
        intermediate_steps = kwargs.pop("intermediate_steps")
        # Replay every step as "<action log> / Observation / Thought:" so the
        # model continues from where it stopped.
        thoughts = "".join(
            f"{action.log}\nObservation: {observation}\nThought: "
            for action, observation in intermediate_steps
        )
        # Set the agent_scratchpad variable to that value
        kwargs["agent_scratchpad"] = thoughts
        # Create a tools variable from the list of tools provided
        kwargs["tools"] = "\n".join(f"{tool.name}: {tool.description}" for tool in self.tools)
        # Create a list of tool names for the tools provided
        kwargs["tool_names"] = ", ".join(tool.name for tool in self.tools)
        return self.template.format(**kwargs)

In [92]:
prompt = CustomPromptTemplate(
    template=template,
    tools=tools,
    # This omits the `agent_scratchpad`, `tools`, and `tool_names` variables because those are generated dynamically
    # This includes the `intermediate_steps` variable because that is needed
    input_variables=["input", "intermediate_steps"]
)

In [93]:
class CustomOutputParser(AgentOutputParser):
    """Turn raw LLM text into either a tool call or the final answer."""

    def parse(self, llm_output: str) -> Union[AgentAction, AgentFinish]:
        """
        Parse one LLM completion.

        Args:
            llm_output (str): Text produced by the LLM for the current step.

        Returns:
            AgentFinish when the text contains "Final Answer:", otherwise an
            AgentAction built from the "Action:" / "Action Input:" lines.

        Raises:
            ValueError: If neither a final answer nor an action can be found.
        """
        # A final answer ends the agent loop. Return values is generally always
        # a dictionary with a single `output` key.
        if "Final Answer:" in llm_output:
            final_answer = llm_output.split("Final Answer:")[-1].strip()
            return AgentFinish(return_values={"output": final_answer}, log=llm_output)

        # Otherwise pull out the tool name and its input.
        match = re.search(
            r"Action\s*\d*\s*:(.*?)\nAction\s*\d*\s*Input\s*\d*\s*:[\s]*(.*)",
            llm_output,
            re.DOTALL,
        )
        if match is None:
            raise ValueError(f"Could not parse LLM output: `{llm_output}`")

        tool_name, raw_input = match.group(1), match.group(2)
        return AgentAction(
            tool=tool_name.strip(),
            tool_input=raw_input.strip(" ").strip('"'),
            log=llm_output,
        )

In [94]:
# Wire the LLM, the custom prompt, the output parser and the tools into an
# agent executor.
llm = OpenAI(
    temperature=0,  # measure of randomness/creativity
    model_name=model
)

# LLM chain consisting of the LLM and a prompt
llm_chain = LLMChain(
    llm=llm, 
    prompt=prompt  # Custom Prompt
)

# Names the agent is allowed to call, taken from the configured tools.
tool_names = [tool.name for tool in tools]

agent = LLMSingleActionAgent(
    llm_chain=llm_chain, 
    output_parser=CustomOutputParser(),
    stop=["\nObservation:"],  # you want this to be whatever token you use in the prompt to denote the start of an Observation
    allowed_tools=tool_names
) 

agent_executor = AgentExecutor.from_agent_and_tools(
    agent=agent, 
    tools=tools, 
    verbose=True,
    max_iterations=3  # upper bound on agent iterations per question
)

In [95]:
result = agent_executor.run(
    input="what were the net operating revenues of coca-cola in 2022"
)



[1m> Entering new  chain...[0m
[32;1m[1;3mThought: I need to find the most recent financial information about coca-cola
Action: information_retrival
Action Input: coca-cola financials[0m

Observation:[33;1m[1;3mRank: 1 | Content: Net income attributable to shareowners  

of The Coca-Cola Company

8,920

7,747

9,771

9,542

20

15

10

5

0

-5

-10

16%

16%

6%

2020

2019

2021

2022

(9%)

20

15

10

5

0

19%

13%

12%

0%
2020

2019

2021

2022

Organic Revenue Growth 
(Non-GAAP)1

Comparable Currency Neutral Operating 
Income Growth (Non-GAAP)2

Per Share Data

Basic earnings per share

$2.09

$1.80

$2.26

$2.20

Comparable Currency Neutral Diluted 
Earnings Per Share Growth (Non-GAAP)3

Adjusted Free Cash Flow Conversion Ratio 
(Non-GAAP)4

Diluted earnings per share

Cash dividends

Balance Sheet Data

Total assets

Long-term debt

2.07

1.60

1.79

1.64

2.25

1.68

2.19

1.76

$86,381

$87,296

$94,354

$ 92,763

27,516

40,125

38,116

36,377

20

15

10

5

0

-