In [3]:
# Ollama
from openai import OpenAI

# Connection settings for the OpenAI-compatible client created below. The base
# URL targets port 11434 on localhost (the "Ollama" server named at the top of
# this cell).
# NOTE(review): the API key is a placeholder string; presumably the local
# server does not validate it — confirm before pointing this at a hosted API.
openai_api_key = "YOUR_API_KEY"
openai_api_base = "http://localhost:11434/v1"

# Shared client; `generate` calls it through `llm.chat.completions.create`.
llm = OpenAI(api_key=openai_api_key, base_url=openai_api_base)

In [4]:
import json
import chromadb
from chromadb.utils import embedding_functions
from langchain_chroma import Chroma
from sentence_transformers import SentenceTransformer

# Collection name shared by the raw Chroma handle (`db`) and the LangChain
# `vector_store`, so both operate on the same collection.
COLLECTION_NAME = "financial_docs"
# SentenceTransformer model name passed to both the collection's embedding
# function and the LangChain-side `CustomEmbeddings`.
EMBEDDING_MODEL = "multi-qa-mpnet-base-dot-v1"

# Langchain compatible embeddings
class CustomEmbeddings:
    """Adapter exposing a SentenceTransformer via ``embed_documents`` / ``embed_query``.

    These are the two methods the LangChain vector store calls on its
    ``embedding_function``; both return plain Python lists of floats.

    Args:
        model: SentenceTransformer model name or path. Defaults to
            ``"all-MiniLM-L6-v2"``.
    """

    def __init__(self, model="all-MiniLM-L6-v2"):
        # NOTE(review): trust_remote_code=True allows the model repository to
        # execute its own Python code while loading. Only use model names from
        # publishers you trust; consider dropping the flag if the chosen model
        # does not need it.
        self.model = SentenceTransformer(model, trust_remote_code=True)

    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        """Embed several texts and return one vector per text, in input order.

        All texts are handed to ``encode`` in a single call so the model can
        batch them instead of running one forward pass per text.

        Args:
            texts: Texts to embed. May be empty.

        Returns:
            A list with one embedding (list of floats) per input text; ``[]``
            for empty input.
        """
        texts = list(texts)
        if not texts:
            # Skip the model call entirely when there is nothing to embed.
            return []
        return self.model.encode(texts).tolist()

    def embed_query(self, query: str) -> list[float]:
        """Embed a single query string and return its vector as a list of floats."""
        # Encode as a one-element batch and unwrap the single row.
        return self.model.encode([query])[0].tolist()

# ChromaDB setup
# Embedding function attached to the raw collection; it is built from
# EMBEDDING_MODEL, the same model name given to the LangChain store below.
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=EMBEDDING_MODEL)
# Persistent client rooted at ./database (relative to the working directory).
chromadb_client = chromadb.PersistentClient(path="./database")
# Raw handle on the collection: fetched if it already exists, created otherwise.
db = chromadb_client.get_or_create_collection(name=COLLECTION_NAME, embedding_function=sentence_transformer_ef)

# Langchain Setup
# The store reuses `chromadb_client`, so its data lives wherever that client
# persists it (./database). No `persist_directory` is passed: a different path
# here would contradict the supplied client and mislead readers about where
# the data is stored.
vector_store = Chroma(
    client=chromadb_client,
    collection_name=COLLECTION_NAME,
    embedding_function=CustomEmbeddings(EMBEDDING_MODEL),
)




In [5]:
# Count the documents currently stored in the collection.
# `limit=1000` caps how many records are fetched, so the printed number can
# never exceed 1000 even if the collection holds more.

docs = db.get(limit=1000)
print(len(docs['ids']))

10


In [11]:
from langchain_core.documents import Document
import json
from tqdm.auto import tqdm

def format_document(entry):
    """Turn one dataset entry into a LangChain ``Document``.

    The page content is made of four sections separated by blank lines:
    the ``pre_text`` lines, the table, the ``post_text`` lines, and the table
    once more under a ``Table Data:`` heading. Table cells are joined with
    tabs and table rows with newlines.

    Args:
        entry: Mapping with ``id``, ``pre_text``, ``table`` and ``post_text``
            keys; an optional ``qa`` key is stringified into the metadata.

    Returns:
        A ``Document`` whose id is ``entry['id']`` and whose metadata holds
        ``str(entry.get('qa'))`` under ``"qa"``.
    """
    leading_text = "\n".join(entry['pre_text'])

    # Render the table once; it is used in two places below.
    rendered_table = "\n".join("\t".join(cells) for cells in entry['table'])

    trailing_text = "\n".join(entry['post_text'])

    sections = [
        leading_text,
        rendered_table,
        trailing_text,
        "Table Data:\n" + rendered_table,
    ]

    return Document(
        id=entry['id'],
        page_content="\n\n".join(sections),
        metadata={"qa": str(entry.get('qa'))},
    )

def create_question_to_document_map(filepath, limit: int = None):
    """Map every question in the dataset file to the document it belongs to.

    Each entry may carry its question under ``qa`` and/or ``qa_0`` ... ``qa_9``.
    Every question found is mapped to the formatted document of its entry.

    Args:
        filepath: Path to the dataset JSON file (a list of entries).
        limit: If given, only the first ``limit`` entries are read. ``None``
            (the default) reads everything; ``0`` reads nothing.

    Returns:
        Dict from question text to ``Document``. Questions of the same entry
        share one ``Document`` object. Keys are inserted in a stable order
        (entry order, then ``qa``, ``qa_0``, ... ``qa_9``).
    """
    # The dataset is JSON; read it as UTF-8 regardless of the platform default.
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)

    if limit is not None:
        data = data[:limit]

    # Fixed tuple (not a set) so iteration order is the same on every run.
    qa_fields = ("qa", *(f"qa_{i}" for i in range(10)))

    q2d = {}
    for entry in tqdm(data):
        present_fields = [field for field in qa_fields if field in entry]
        if not present_fields:
            continue

        # Format once per entry instead of once per question.
        document = format_document(entry)
        for field in present_fields:
            q2d[entry[field]["question"]] = document

    return q2d


def parse_convfinqa_dataset(filepath, limit: int = None):
    """Load the dataset file and format each entry as a ``Document``.

    Args:
        filepath: Path to the dataset JSON file (a list of entries).
        limit: If given, only the first ``limit`` entries are converted.
            ``None`` (the default) converts everything; ``0`` converts nothing.

    Returns:
        List of ``Document`` objects, one per entry, in file order.
    """
    # The dataset is JSON; read it as UTF-8 regardless of the platform default.
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)

    if limit is not None:
        data = data[:limit]

    return [format_document(entry) for entry in data]

In [12]:
# Dataset location, relative to the notebook's working directory.
DATA_PATH = 'ConvFinQA/data/train.json'
# Only the first 10 entries are converted and indexed in this cell.
docs = parse_convfinqa_dataset(DATA_PATH, limit=10)
# NOTE(review): this prints every Document in full (content and metadata),
# which produces a very large cell output.
print(docs)
# Add the documents to the vector store. As the cell's last expression, the
# call's return value (the list of document ids) is shown as the cell result.
vector_store.add_documents(docs)

[Document(id='Single_JKHY/2009/page_28.pdf-3', metadata={'qa': "{'question': 'what was the percentage change in the net cash from operating activities from 2008 to 2009', 'answer': '14.1%', 'explanation': '', 'ann_table_rows': [6], 'ann_text_rows': [], 'steps': [{'op': 'minus2-1', 'arg1': '206588', 'arg2': '181001', 'res': '25587'}, {'op': 'divide2-2', 'arg1': '#0', 'arg2': '181001', 'res': '14.1%'}], 'program': 'subtract(206588, 181001), divide(#0, 181001)', 'gold_inds': {'table_6': '2008 the net cash from operating activities of year ended june 30 2009 2008 is $ 206588 ; the net cash from operating activities of year ended june 30 2009 2008 is $ 181001 ; the net cash from operating activities of year ended june 30 2009 is $ 174247 ;'}, 'exe_ans': 0.14136, 'program_re': 'divide(subtract(206588, 181001), 181001)'}"}, page_content='26 | 2009 annual report in fiscal 2008 , revenues in the credit union systems and services business segment increased 14% ( 14 % ) from fiscal 2007 .\nall re

['Single_JKHY/2009/page_28.pdf-3',
 'Single_RSG/2008/page_114.pdf-2',
 'Single_AAPL/2002/page_23.pdf-1',
 'Single_UPS/2009/page_33.pdf-2',
 'Double_UPS/2009/page_33.pdf',
 'Single_CE/2010/page_134.pdf-2',
 'Single_JPM/2013/page_104.pdf-2',
 'Double_MAS/2012/page_92.pdf',
 'Single_HIG/2004/page_122.pdf-2',
 'Single_SLG/2013/page_133.pdf-4']

In [13]:
# Build the question -> Document lookup from the first 1000 dataset entries.
# `retrieve_relevant_only` reads this module-level `q2d` by exact question text.
q2d = create_question_to_document_map(DATA_PATH, limit=1000)

  0%|          | 0/1000 [00:00<?, ?it/s]

In [24]:
# Peek at the first ten questions stored in the lookup.
print(list(q2d.keys())[:10])

['what was the percentage change in the net cash from operating activities from 2008 to 2009', 'what was the percent of the growth in the revenues from 2007 to 2008', 'what was the percentage change in net sales from 2000 to 2001?', 'what was the difference in percentage cumulative return on investment for united parcel service inc . compared to the s&p 500 index for the five year period ended 12/31/09?', 'what is the roi of an investment in ups in 2004 and sold in 2006?', 'what portion of the total shares subject to outstanding awards is under the 2009 global incentive plan?', 'what was the percentage increase in litigation reserves in 2012?', "what was the percentage change in the company's warranty liability from 2011 to 2012?", 'what was the percent of the change in the company 2019s warranty liability from 2011 to 2012', 'what portion of total obligations are due within the next 3 years?']


In [42]:
def query_chromadb(question, collection, n_results=5):
    """Run a single text query against a Chroma collection.

    Args:
        question: Query text.
        collection: Object exposing ``query(query_texts=..., n_results=...)``.
        n_results: Maximum number of hits to request. Defaults to 5, the value
            that used to be hard-coded.

    Returns:
        Whatever ``collection.query`` returns, unchanged.
    """
    return collection.query(query_texts=[question], n_results=n_results)

# Example usage
# Runs one of the dataset's own questions against the raw `db` collection and
# prints the raw result dict (ids, distances, metadatas, ...).
question = "what was the percentage change in the net cash from operating activities from 2008 to 2009"
response = query_chromadb(question, db)

print(response)


{'ids': [['Single_JKHY/2009/page_28.pdf-3', 'Single_JPM/2013/page_104.pdf-2', 'Double_UPS/2009/page_33.pdf', 'Single_UPS/2009/page_33.pdf-2', 'Single_AAPL/2002/page_23.pdf-1']], 'distances': [[32.32696090249248, 37.30684166449662, 42.85016820108445, 42.85016820108445, 43.314500159134724]], 'metadatas': [[{'qa': "{'question': 'what was the percentage change in the net cash from operating activities from 2008 to 2009', 'answer': '14.1%', 'explanation': '', 'ann_table_rows': [6], 'ann_text_rows': [], 'steps': [{'op': 'minus2-1', 'arg1': '206588', 'arg2': '181001', 'res': '25587'}, {'op': 'divide2-2', 'arg1': '#0', 'arg2': '181001', 'res': '14.1%'}], 'program': 'subtract(206588, 181001), divide(#0, 181001)', 'gold_inds': {'table_6': '2008 the net cash from operating activities of year ended june 30 2009 2008 is $ 206588 ; the net cash from operating activities of year ended june 30 2009 2008 is $ 181001 ; the net cash from operating activities of year ended june 30 2009 is $ 174247 ;'}, 'e

In [45]:
# Smoke test of the embedding adapter. No model name is passed, so this loads
# the class default ("all-MiniLM-L6-v2"), not the EMBEDDING_MODEL that the
# vector store uses.
print(CustomEmbeddings().embed_query("asd"))



[-0.12403251975774765, -0.006126576103270054, -0.007643955294042826, 0.05117763578891754, -0.008589210920035839, -0.021051358431577682, 0.06903959065675735, -0.01971042901277542, -0.021294105798006058, -0.029979726299643517, 0.03589785471558571, -0.015056969597935677, -0.03433169797062874, -0.006830163765698671, -0.028852852061390877, -0.02722444012761116, 0.02340937778353691, -0.048426222056150436, -0.11704225093126297, -0.01990649662911892, -0.041199203580617905, 0.09209650009870529, 0.026396270841360092, 0.010833492502570152, 0.057644251734018326, -0.05959760770201683, 0.04232749715447426, -0.03867774456739426, 0.004769627004861832, -0.10280600935220718, 0.06895194947719574, -0.005827539134770632, 0.047956064343452454, 0.015917977318167686, -0.008895812556147575, -0.02368972636759281, 0.04214287921786308, -0.06624723225831985, -0.017846476286649704, 0.04278495907783508, -0.01611267402768135, 0.034301791340112686, -0.024904398247599602, 0.04754626005887985, 0.009209901094436646, -0.0

In [31]:
import re
from typing import Literal, TypedDict
from langchain_chroma import Chroma
from langgraph.graph import StateGraph, END, START
from langgraph.graph import add_messages
from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage, AIMessage
from typing import TypedDict, Annotated, Sequence
from langchain.prompts import PromptTemplate
from operator import add
from langfuse import Langfuse
from langfuse.callback import CallbackHandler
from langchain_openai import ChatOpenAI
import pprint

import os

# Langfuse credentials are read from the environment so that no secret is
# stored in the notebook. Set LANGFUSE_SECRET_KEY and LANGFUSE_PUBLIC_KEY
# before running this cell; a missing variable raises KeyError immediately.
# LANGFUSE_HOST is optional and defaults to the cloud endpoint used so far
# (use https://us.cloud.langfuse.com for the US region).
# NOTE(review): the key pair that used to be hard-coded here has been exposed
# in the notebook's history and should be revoked and regenerated.
langfuse_handler = CallbackHandler(
    secret_key=os.environ["LANGFUSE_SECRET_KEY"],
    public_key=os.environ["LANGFUSE_PUBLIC_KEY"],
    host=os.environ.get("LANGFUSE_HOST", "https://cloud.langfuse.com"),
)

class AgentState(TypedDict):
    """State dictionary shared by the graph nodes defined in this cell.

    Every node receives the state and returns a partial dict containing only
    the keys it wants to write. ``messages`` and ``steps`` are annotated with
    a combining function (``add_messages`` / ``operator.add``); the remaining
    keys carry no such annotation.
    """
    messages: Annotated[Sequence[BaseMessage], add_messages] # Chat messages
    steps: Annotated[list[str], add] # Agent steps
    question: str # Query executed against the database to retrieve documents
    # NOTE(review): the retriever nodes store Document objects under this key
    # (results of `similarity_search` / values of `q2d`), not plain strings.
    documents: list[str] # Retrieved documents (context)
    # NOTE(review): `generate` writes its list of role/content dicts under this
    # key, not a single string.
    prompt: str # Prompt used for generation
    answer: str # Generated final answer

# Prompt
# Template filled in by `generate` with the user question and the retrieved
# documents joined into one string. The question placeholder appears twice:
# once before and once after the documents section.
# The model is asked to close with an <ANSWER>...</ANSWER> section, which
# `extract_answer` later pulls out with a regex.
# NOTE(review): the template text contains typos ("a investment analyst",
# "nothign") and a dangling "You will be given:" before <INSTRUCTIONS>. They
# are part of the runtime prompt and are left untouched here.
prompt_template = PromptTemplate(
    template="""You are a investment analyst. You will be given: 
    <INSTRUCTIONS>
    You will be provided:
    1. a QUESTION asked by the user
    2. DOCUMENTS provided by an automated context retrieval system
    
    Your task is to use the context to provide a relevant ANSWER to the QUESTION

    Only answer what the user is asking and nothign else
    
    Explain your reasoning in a step-by-step manner. Ensure your reasoning and conclusion are correct. 

    Avoid simply stating the correct answer at the outset.

    If there is no relevant context provided, state that at the outset.

    At the end of your calculations create a section for the final answer submission. Example:
    <ANSWER>
    29.31%
    </ANSWER>
    </INSTRUCTIONS>
    <QUESTION>{question}</QUESTION>\n
    <DOCUMENTS>
    \n\n {documents}
    </DOCUMENTS>\n\n
    <QUESTION>{question}</QUESTION>\n
    """,
    input_variables=["question", "documents"],
)

# NOTE(review): not referenced by any code visible in this notebook.
system_prompt = """Be a helpful assistant"""

def extract_question(state: AgentState) -> AgentState:
    """Use the content of the most recent message as the question.

    Returns a partial state with ``question`` set to that content and a
    one-element ``steps`` list naming this node.
    """
    latest_message = state["messages"][-1]
    return {
        "question": latest_message.content,
        "steps": ["extract_question"],
    }

def retrieve_from_vector_db(state: AgentState) -> AgentState:
    """Fetch the five most similar documents for the current question.

    Reads ``question`` from the state, queries the module-level
    ``vector_store`` and returns a partial state holding the hits under
    ``documents`` plus a ``steps`` entry recording the query text.
    """
    query_text = state["question"]
    hits = vector_store.similarity_search(query_text, k=5)
    step_label = f"retrieve('{query_text}')"
    return {"steps": [step_label], "documents": hits}

# Post-processing
def format_docs(docs):
    """Concatenate the ``page_content`` of each document, separated by blank lines."""
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)
    
def rerank(state: AgentState) -> AgentState:
    """Placeholder reranking node.

    Reads ``question`` and ``documents`` from the state but does not reorder
    anything yet; it only records that the step ran.
    """
    # Inputs for the future reranker; currently read but unused.
    question, documents = state["question"], state["documents"]

    # TODO: rerank

    return {"steps": ["rerank"]}
    
def retrieve_relevant_only(state: AgentState) -> AgentState:
    """Return the document recorded for this exact question in ``q2d``.

    Looks the question up in the module-level ``q2d`` map instead of searching
    the vector store. Raises ``KeyError`` when the question is not a key of
    the map.
    """
    matching_document = q2d[state['question']]
    return {"documents": [matching_document]}

    
# Define the function that calls the model
def generate(state: AgentState) -> AgentState:
    """Build the prompt from the question and documents and call the chat model.

    The last message of the conversation is replaced, for this request only,
    by the fully rendered prompt; earlier messages are forwarded as they are.

    Args:
        state: Must provide ``messages`` (non-empty), ``question`` and
            ``documents``.

    Returns:
        Partial state with the model's reply as a one-element ``messages``
        list and the OpenAI-style message dicts that were sent under ``prompt``.

    Raises:
        ValueError: If the conversation contains a message that is not a
            human, AI or system message.
    """
    question = state["question"]
    documents = state["documents"]

    prompt = prompt_template.format(question=question, documents=format_docs(documents))

    # Work on a copy so the list held in the graph state is not modified
    # behind the state's own update mechanism.
    messages = list(state["messages"])
    messages[-1] = HumanMessage(prompt)

    messages_openai = []
    for message in messages:
        if isinstance(message, HumanMessage):
            role = "user"
        elif isinstance(message, AIMessage):
            # Must be a plain string; a trailing comma here would build a tuple.
            role = "assistant"
        elif isinstance(message, SystemMessage):
            role = "system"
        else:
            raise ValueError("No such message type allowed")
        messages_openai.append({"role": role, "content": message.content})

    response = llm.chat.completions.create(model="llama3.1", messages=messages_openai)
    response_message = AIMessage(response.choices[0].message.content)
    # We return a list, because this will get added to the existing list
    return {"messages": [response_message], "prompt": messages_openai}

def extract_answer(state: AgentState) -> AgentState:
    """Pull the final answer out of the last message.

    Searches the last message's content for the first
    ``<ANSWER>...</ANSWER>`` section (which may span several lines) and
    returns its stripped content under ``answer``; the answer is an empty
    string when no such section exists.
    """
    reply_text = state["messages"][-1].content

    # DOTALL lets the answer section span multiple lines.
    found = re.search(r'<ANSWER>(.*?)</ANSWER>', reply_text, re.DOTALL)
    if found is None:
        return {"answer": ""}

    return {"answer": found.group(1).strip()}


# Define the config
class GraphConfig(TypedDict):
    """Schema passed to ``StateGraph`` as ``config_schema``."""
    # NOTE(review): no node visible in this notebook reads `retrieval_k`
    # (`retrieve_from_vector_db` hard-codes k=5). A TypedDict does not fill in
    # default values on instances, so the `= 5` presumably serves only as
    # documentation — confirm how the graph library treats it.
    retrieval_k: int = 5


# Define a new graph
workflow = StateGraph(AgentState, config_schema=GraphConfig)

# Register the nodes. The active pipeline is linear (no cycle):
# extract_question -> cheating_retriever -> generator -> extract_answer.
# "cheating_retriever" looks the document up in `q2d` by exact question text
# instead of searching the vector store; the real retriever and the reranker
# are kept below as commented-out alternatives.
workflow.add_node("extract_question", extract_question)
# workflow.add_node("retriever", retrieve_from_vector_db)
workflow.add_node("cheating_retriever", retrieve_relevant_only)
# workflow.add_node("reranker", rerank)
workflow.add_node("generator", generate)
workflow.add_node("extract_answer", extract_answer)



# Define the flow
workflow.set_entry_point("extract_question")
workflow.add_edge("extract_question", "cheating_retriever")
workflow.add_edge("cheating_retriever", "generator")
# Alternative wiring through the vector-store retriever and the reranker:
# workflow.add_edge("extract_question", "retriever")
# workflow.add_edge("retriever", "reranker")
# workflow.add_edge("reranker", "generator")
workflow.add_edge("generator", "extract_answer")
workflow.set_finish_point("extract_answer")


# Finally, we compile it!
# This compiles it into a LangChain Runnable,
# meaning you can use it as you would any other runnable
graph = workflow.compile()

# Initial state: a single human message holding the question.
# With the "cheating_retriever" node active, the text must match a key of
# `q2d` exactly, otherwise the lookup raises KeyError.
inputs = {
    "messages": [
        HumanMessage('what was the percentage change in the net cash from operating activities from 2008 to 2009'),
    ]
}


# Stream the run and print each node's update as it arrives; the Langfuse
# handler is attached as a callback for this run.
# Note: pprint on a plain string prints its repr, which is why the headers
# and separators appear quoted in the output.
for output in graph.stream(inputs, config={"callbacks": [langfuse_handler]}):
    for key, value in output.items():
        pprint.pprint(f"Output from node '{key}':")
        pprint.pprint("---")
        pprint.pprint(value, indent=2, width=80, depth=None)
    pprint.pprint("\n---\n")

"Output from node 'extract_question':"
'---'
{ 'question': 'what was the percentage change in the net cash from operating '
              'activities from 2008 to 2009',
  'steps': ['extract_question']}
'\n---\n'
"Output from node 'cheating_retriever':"
'---'
{ 'documents': [ Document(id='Single_JKHY/2009/page_28.pdf-3', metadata={'qa': "{'question': 'what was the percentage change in the net cash from operating activities from 2008 to 2009', 'answer': '14.1%', 'explanation': '', 'ann_table_rows': [6], 'ann_text_rows': [], 'steps': [{'op': 'minus2-1', 'arg1': '206588', 'arg2': '181001', 'res': '25587'}, {'op': 'divide2-2', 'arg1': '#0', 'arg2': '181001', 'res': '14.1%'}], 'program': 'subtract(206588, 181001), divide(#0, 181001)', 'gold_inds': {'table_6': '2008 the net cash from operating activities of year ended june 30 2009 2008 is $ 206588 ; the net cash from operating activities of year ended june 30 2009 2008 is $ 181001 ; the net cash from operating activities of year ended june 3

In [19]:
from IPython.display import Image, display

# Render the compiled graph as a diagram. Rendering needs some extra
# dependencies and is optional, so a failure must not stop the notebook —
# but it is reported instead of being silently discarded.
try:
    display(Image(graph.get_graph(xray=True).draw_mermaid_png()))
except Exception as exc:
    print(f"Could not render the graph diagram (optional step): {exc!r}")