## Test Scraping

In [31]:
import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
}

base_link="https://unfccc.int/first-biennial-transparency-reports"

# Send a GET request to the URL
response = requests.get(base_link, headers=headers)

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

# Find the div containing the table
div_table = soup.find('div', attrs = {'class': 'table-container'})

# Extract all rows from the div table
rows = div_table.find_all('tr')

# Initialize an empty list to store hrefs
href_list = []

# Iterate over each row and extract data
for row in rows:
    # Extract all columns in the current row with the specific style
    columns = row.find_all('td', style='vertical-align:top; width:177px')
    for column in columns:
        # Find all <p> elements within the column
        paragraphs = column.find_all('p')
        for paragraph in paragraphs:
            # Check if the paragraph contains the text "BTR"
            if 'BTR' in paragraph.get_text():
                # Find all <a> tags within the paragraph
                links = paragraph.find_all('a')
                for link in links:
                    # Get the href attribute
                    href = link.get('href')
                    # Append the href to the list
                    href_list.append(href)

# Print the list of hrefs
print(href_list)

AttributeError: 'NoneType' object has no attribute 'find_all'

In [32]:
print(div_table)

None


In [33]:
response

<Response [200]>

In [34]:
print(soup.prettify())

<html style="height:100%">
 <head>
  <meta content="NOINDEX, NOFOLLOW" name="ROBOTS"/>
  <meta content="telephone=no" name="format-detection"/>
  <meta content="initial-scale=1.0" name="viewport"/>
  <meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
 </head>
 <body style="margin:0px;height:100%">
  <iframe frameborder="0" height="100%" id="main-iframe" marginheight="0px" marginwidth="0px" src="/_Incapsula_Resource?SWUDNSAI=31&amp;xinfo=13-349259877-0%202NNN%20RT%281735709275449%20193%29%20q%280%20-1%20-1%20-1%29%20r%280%20-1%29%20B12%284%2c316%2c0%29&amp;incident_id=260000130640992918-1520974852994106189&amp;edet=12&amp;cinfo=04000000&amp;rpinfo=0&amp;cts=PNhRgPQY1X4en2DEn0QRahTCQilpuGXbqXRyPTL2dVI9lxzRU53mu3yDTRYz6WHz&amp;mth=GET" width="100%">
   Request unsuccessful. Incapsula incident ID: 260000130640992918-1520974852994106189
  </iframe>
 </body>
</html>



In [35]:
requests.get(base_link).text

'<html style="height:100%"><head><META NAME="ROBOTS" CONTENT="NOINDEX, NOFOLLOW"><meta name="format-detection" content="telephone=no"><meta name="viewport" content="initial-scale=1.0"><meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1"></head><body style="margin:0px;height:100%"><iframe id="main-iframe" src="/_Incapsula_Resource?SWUDNSAI=31&xinfo=13-349306138-0%202NNN%20RT%281735709549740%20292%29%20q%280%20-1%20-1%20-1%29%20r%280%20-1%29%20B12%284%2c316%2c0%29&incident_id=260000130640992918-1521187827537412941&edet=12&cinfo=04000000&rpinfo=0&cts=Ht0QstEPPJWuZasV91d7E1Jp2uu8fuObUGs%2f2MalcR2BRk6dL2cbLngDT5uzCiS0&mth=GET" frameborder=0 width="100%" height="100%" marginheight="0px" marginwidth="0px">Request unsuccessful. Incapsula incident ID: 260000130640992918-1521187827537412941</iframe></body></html>'

## Test RAG

1. [LlamaIndex DocSummary](https://docs.llamaindex.ai/en/stable/examples/index_structs/doc_summary/DocSummary/)
2. [Semi_structured_and_multi_modal_RAG](https://github.com/langchain-ai/langchain/blob/master/cookbook/Semi_structured_and_multi_modal_RAG.ipynb)
3. [multi_modal_RAG_chroma](https://github.com/langchain-ai/langchain/blob/master/cookbook/multi_modal_RAG_chroma.ipynb)
3. [Semi_Structured_RAG](https://github.com/langchain-ai/langchain/blob/master/cookbook/Semi_Structured_RAG.ipynb)
3. [CrewAI PDFSearch Tool](https://docs.crewai.com/tools/pdfsearchtool#pdfsearchtool)
4. [Advanced RAG with LlamaParse](https://docs.llamaindex.ai/en/stable/examples/cookbooks/oreilly_course_cookbooks/Module-8/Advanced_RAG_with_LlamaParse/)
7. [LangGraph Retrieval Agent](https://github.com/langchain-ai/langchain/blob/master/cookbook/langgraph_agentic_rag.ipynb)


Improvements: 
1. Query Transformation, (RAG Fusion?)
2. Text, Table Multi Vector Retriever, 
3. LangGraph Agent RAG
4. ReRanker

In [None]:
!pip install markitdown langchain-chroma langchain_community tiktoken langchain-openai langchainhub langchain langgraph

In [1]:
import os
os.chdir("/Users/josingh/hobby/climate-dashboard/")

btr_file_path = "data/btr/Singapore%20BTR1%202024.pdf"

#### Markitdown Extraction

[markitdown](https://github.com/microsoft/markitdown)

In [None]:
from markitdown import MarkItDown

md = MarkItDown()
result = md.convert(btr_file_path)
# print(result.text_content)

#### LlamaParse

In [None]:
!pip install llama-index
!pip install llama-index-postprocessor-flag-embedding-reranker
!pip install git+https://github.com/FlagOpen/FlagEmbedding.git
!pip install llama-parse

In [26]:
# llama-parse is async-first, running the async code in a notebook requires the use of nest_asyncio
import nest_asyncio

nest_asyncio.apply()


In [27]:
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import VectorStoreIndex
from llama_index.core import Settings

embed_model = OpenAIEmbedding(model="text-embedding-3-small")
llm = OpenAI(model="gpt-4o-mini")

Settings.llm = llm
Settings.embed_model = embed_model

In [30]:
# LlamaParse PDF reader for PDF Parsing
from llama_parse import LlamaParse

documents = LlamaParse(result_type="markdown").load_data(
    "./data/btr/Singapore%20BTR1%202024.pdf"
)
# Started parsing the file under job_id 3bd10bf8-1c99-4ba2-94f8-36576a34b06d

Started parsing the file under job_id 3bd10bf8-1c99-4ba2-94f8-36576a34b06d
.........

In [31]:
from llama_index.core.node_parser import MarkdownElementNodeParser

node_parser = MarkdownElementNodeParser(
    llm=OpenAI(model="gpt-4o-mini"), num_workers=8
)

nodes = node_parser.get_nodes_from_documents(documents)

0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
2it [00:00, 48489.06it/s]
  docstore.set_document_hash(doc.get_doc_id(), doc.hash)
1it [00:00, 11491.24it/s]
  docstore.set_document_hash(doc.get_doc_id(), doc.hash)
2it [00:00, 27776.85it/s]
  docstore.set_document_hash(doc.get_doc_id(), doc.hash)
1it [00:00, 16070.13it/s]
  docstore.set_document_hash(doc.get_doc_id(), doc.hash)
1it [00:00, 28532.68it/s]
  docstore.set_document_hash(doc.get_doc_id(), doc.hash)
1it [00:00, 16131.94it/s]
  docstore.set_document_hash(doc.get_doc_id(), doc.hash)
1it [00:00, 11125.47it/s]
  docstore.set_document_hash(doc.get_doc_id(), doc.hash)
1it [00:00, 14979.66it/s]
  docstore.set_document_hash(doc.get_doc_id(), doc.hash)
0it [00:00, ?it/s]
1it [00:00, 19691.57it/s]
  docstore.set_document_hash(doc.get_doc_id(), doc.hash)
2it [00:00, 41734.37it/s]
  docstore.set_document_hash(doc.get_doc_id(), doc.hash)
0it [00:00, ?it/s]
1it [00:00, 23831.27it/s]
  docstore.set_document_hash(d

In [32]:
text_nodes, index_nodes = node_parser.get_nodes_and_objects(nodes)

In [33]:
recursive_index = VectorStoreIndex(nodes=text_nodes + index_nodes)

In [3]:
# Save and load
from llama_index.core import load_index_from_storage, StorageContext

country = "Singapore"
# if not os.path.exists(f"./data/vector_store/btr/{country}"):
#     # build vector index
#     recursive_index.storage_context.persist(
#         persist_dir=f"./data/vector_store/btr/{country}"
#     )
# else:
recursive_index = load_index_from_storage(
    StorageContext.from_defaults(persist_dir=f"./data/vector_store/btr/{country}"),
)

In [6]:
# from llama_index.postprocessor.flag_embedding_reranker import (
#     FlagEmbeddingReranker,
# )

# reranker = FlagEmbeddingReranker(
#     top_n=5,
#     model="BAAI/bge-reranker-large",
# )

from llama_index.core.postprocessor import SentenceTransformerRerank

reranker = SentenceTransformerRerank(
    model="models/rerank", top_n=5
)



In [7]:
recursive_query_engine = recursive_index.as_query_engine(
    similarity_top_k=15, node_postprocessors=[reranker], verbose=True
)

In [None]:
query = "What are the list of mitigation measures pertaining to shifting to cleaner energy sources?"
# "ccc"
# Explain how Singapore is Alternative Energy Disadvantaged?

response_2 = recursive_query_engine.query(query)
print(response_2)

[1;3;38;2;11;159;203mRetrieval entering e38c4afd-d0ed-4ef3-95eb-fcbfe4240e1c: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query What are the list of mitigation measures pertaining to shifting to cleaner energy sources?
[0m[1;3;38;2;11;159;203mRetrieval entering e2e0c5de-6b6f-4667-8df3-a08e2efc8de4: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query What are the list of mitigation measures pertaining to shifting to cleaner energy sources?
[0m[1;3;38;2;11;159;203mRetrieval entering edb9663d-1bcf-4e55-9bc7-13ce2285244a: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query What are the list of mitigation measures pertaining to shifting to cleaner energy sources?
[0m[1;3;38;2;11;159;203mRetrieval entering bbbbb3c6-c90b-4a30-97b9-64c831328b96: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query What are the list of mitigation measures pertaining to shifting to cleaner energy sourc

### LangGraph Retrieval Agent

[langgraph_agentic_rag](https://github.com/langchain-ai/langchain/blob/master/cookbook/langgraph_agentic_rag.ipynb)

In [18]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import OpenAIEmbeddings


loader = PyPDFLoader(btr_file_path)
docs_list = []
for page in loader.lazy_load():
    docs_list.append(page)

text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=100, chunk_overlap=50
)
doc_splits = text_splitter.split_documents(docs_list)

# Add to vectorDB
vectorstore = Chroma.from_documents(
    documents=doc_splits,
    collection_name="rag-chroma",
    embedding=OpenAIEmbeddings(),
)
retriever = vectorstore.as_retriever()

In [19]:
from langchain.tools.retriever import create_retriever_tool

tool = create_retriever_tool(
    retriever,
    "retrieve_biennial_transparency_report",
    "Search and return information about biennial transparency reports (BTR).",
)

tools = [tool]

from langgraph.prebuilt import ToolExecutor

tool_executor = ToolExecutor(tools)

  tool_executor = ToolExecutor(tools)


In [20]:
import operator
from typing import Annotated, Sequence, TypedDict

from langchain_core.messages import BaseMessage


class AgentState(TypedDict):
    messages: Annotated[Sequence[BaseMessage], operator.add]

In [21]:
import json
import operator
from typing import Annotated, Sequence, TypedDict

from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate
from langchain.tools.render import format_tool_to_openai_function
from langchain_core.messages import BaseMessage, FunctionMessage
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI
from langgraph.prebuilt import ToolInvocation

### Edges


def should_retrieve(state):
    """
    Decides whether the agent should retrieve more information or end the process.

    This function checks the last message in the state for a function call. If a function call is
    present, the process continues to retrieve information. Otherwise, it ends the process.

    Args:
        state (messages): The current state of the agent, including all messages.

    Returns:
        str: A decision to either "continue" the retrieval process or "end" it.
    """
    print("---DECIDE TO RETRIEVE---")
    messages = state["messages"]
    last_message = messages[-1]
    # If there is no function call, then we finish
    if "function_call" not in last_message.additional_kwargs:
        print("---DECISION: DO NOT RETRIEVE / DONE---")
        return "end"
    # Otherwise there is a function call, so we continue
    else:
        print("---DECISION: RETRIEVE---")
        return "continue"


def check_relevance(state):
    """
    Determines whether the Agent should continue based on the relevance of retrieved documents.

    This function checks if the last message in the conversation is of type FunctionMessage, indicating
    that document retrieval has been performed. It then evaluates the relevance of these documents to the user's
    initial question using a predefined model and output parser. If the documents are relevant, the conversation
    is considered complete. Otherwise, the retrieval process is continued.

    Args:
        state messages: The current state of the conversation, including all messages.

    Returns:
        str: A directive to either "end" the conversation if relevant documents are found, or "continue" the retrieval process.
    """

    print("---CHECK RELEVANCE---")

    # Output
    class FunctionOutput(BaseModel):
        binary_score: str = Field(description="Relevance score 'yes' or 'no'")

    # Create an instance of the PydanticOutputParser
    parser = PydanticOutputParser(pydantic_object=FunctionOutput)

    # Get the format instructions from the output parser
    format_instructions = parser.get_format_instructions()

    # Create a prompt template with format instructions and the query
    prompt = PromptTemplate(
        template="""You are a grader assessing relevance of retrieved docs to a user question. \n 
        Here are the retrieved docs:
        \n ------- \n
        {context} 
        \n ------- \n
        Here is the user question: {question}
        If the docs contain keyword(s) in the user question, then score them as relevant. \n
        Give a binary score 'yes' or 'no' score to indicate whether the docs are relevant to the question. \n 
        Output format instructions: \n {format_instructions}""",
        input_variables=["question"],
        partial_variables={"format_instructions": format_instructions},
    )

    model = ChatOpenAI(temperature=0, model="gpt-4o-mini")

    chain = prompt | model | parser

    messages = state["messages"]
    last_message = messages[-1]
    score = chain.invoke(
        {"question": messages[0].content, "context": last_message.content}
    )

    # If relevant
    if score.binary_score == "yes":
        print("---DECISION: DOCS RELEVANT---")
        return "yes"

    else:
        print("---DECISION: DOCS NOT RELEVANT---")
        print(score.binary_score)
        return "no"


### Nodes


# Define the function that calls the model
def call_model(state):
    """
    Invokes the agent model to generate a response based on the current state.

    This function calls the agent model to generate a response to the current conversation state.
    The response is added to the state's messages.

    Args:
        state (messages): The current state of the agent, including all messages.

    Returns:
        dict: The updated state with the new message added to the list of messages.
    """
    print("---CALL AGENT---")
    messages = state["messages"]
    model = ChatOpenAI(temperature=0, streaming=True, model="gpt-4o-mini")
    functions = [format_tool_to_openai_function(t) for t in tools]
    model = model.bind_functions(functions)
    response = model.invoke(messages)
    # We return a list, because this will get added to the existing list
    return {"messages": [response]}


# Define the function to execute tools
def call_tool(state):
    """
    Executes a tool based on the last message's function call.

    This function is responsible for executing a tool invocation based on the function call
    specified in the last message. The result from the tool execution is added to the conversation
    state as a new message.

    Args:
        state (messages): The current state of the agent, including all messages.

    Returns:
        dict: The updated state with the new function message added to the list of messages.
    """
    print("---EXECUTE RETRIEVAL---")
    messages = state["messages"]
    # Based on the continue condition
    # we know the last message involves a function call
    last_message = messages[-1]
    # We construct an ToolInvocation from the function_call
    action = ToolInvocation(
        tool=last_message.additional_kwargs["function_call"]["name"],
        tool_input=json.loads(
            last_message.additional_kwargs["function_call"]["arguments"]
        ),
    )
    # We call the tool_executor and get back a response
    response = tool_executor.invoke(action)
    # print(type(response))
    # We use the response to create a FunctionMessage
    function_message = FunctionMessage(content=str(response), name=action.tool)

    # We return a list, because this will get added to the existing list
    return {"messages": [function_message]}

In [22]:
from langgraph.graph import END, StateGraph

# Define a new graph
workflow = StateGraph(AgentState)

# Define the nodes we will cycle between
workflow.add_node("agent", call_model)  # agent
workflow.add_node("action", call_tool)  # retrieval

<langgraph.graph.state.StateGraph at 0x309573ef0>

In [23]:
# Call agent node to decide to retrieve or not
workflow.set_entry_point("agent")

# Decide whether to retrieve
workflow.add_conditional_edges(
    "agent",
    # Assess agent decision
    should_retrieve,
    {
        # Call tool node
        "continue": "action",
        "end": END,
    },
)

# Edges taken after the `action` node is called.
workflow.add_conditional_edges(
    "action",
    # Assess agent decision
    check_relevance,
    {
        # Call agent node
        "yes": "agent",
        "no": END,  # placeholder
    },
)

# Compile
app = workflow.compile()

In [24]:
import pprint

from langchain_core.messages import HumanMessage

inputs = {
    "messages": [
        HumanMessage(
            # content="What are the ways Singapore is reducing its greenhouse gas emissions?"
            content="Explain how Singapore is Alternative Energy Disadvantaged from the biennial transparency report"
        )
    ]
}
for output in app.stream(inputs):
    for key, value in output.items():
        pprint.pprint(f"Output from node '{key}':")
        pprint.pprint("---")
        pprint.pprint(value, indent=2, width=80, depth=None)
    pprint.pprint("\n---\n")

---CALL AGENT---
---DECIDE TO RETRIEVE---
---DECISION: RETRIEVE---
"Output from node 'agent':"
'---'
{ 'messages': [ AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{"query":"Singapore Alternative Energy Disadvantaged"}', 'name': 'retrieve_biennial_transparency_report'}}, response_metadata={'finish_reason': 'function_call', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_72ed7ab54c'}, id='run-f8cc63fd-9048-4281-9cd3-484974cffb5b-0')]}
'\n---\n'
---EXECUTE RETRIEVAL---


  action = ToolInvocation(


---CHECK RELEVANCE---


AttributeError: type object 'FunctionOutput' has no attribute 'model_json_schema'