### Context of this Notebook
Define a RAG workflow for generating our CSV from a query using LangGraph.

In [19]:
# Document we are working with
file_name = "sample-new-fidelity-acnt-stmt"
local_llm = "llama3"

# Run OCR on PDF and gives us text file
%run -i pdf_to_text.py $file_name

In [32]:
# RETRIEVER DEFINITION
import os
from langchain_community.vectorstores import faiss
from langchain_nomic.embeddings import NomicEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split the OCR text into overlapping chunks so that content straddling a
# chunk boundary still appears whole in at least one passage.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=200,
)
# Open source embeddings, computed locally (inference_mode="local")
embedding = NomicEmbeddings(model="nomic-embed-text-v1.5", inference_mode="local")

os.makedirs("vectorstore", exist_ok=True)
if not os.path.exists(f"vectorstore/{file_name}.faiss"):
    # No saved index for this document yet: build it from the OCR text.
    # Only the read happens inside the `with`, so the file handle is released
    # before the embedding / index build rather than held open throughout.
    with open(f"text_data/{file_name}.txt", "r") as f:
        pdf_text = f.read()
    passages = text_splitter.split_text(pdf_text)
    vectorstore = faiss.FAISS.from_texts(
        texts=passages,
        embedding=embedding,
    )
    # Persist under vectorstore/ with file_name as the index name so the
    # existence check above finds it on the next run.
    vectorstore.save_local("vectorstore", file_name)
else:
    # SECURITY: allow_dangerous_deserialization=True lets load_local unpickle
    # data from disk. Only acceptable for index files written by this
    # notebook; never point this at an index from an untrusted source.
    vectorstore = faiss.FAISS.load_local(
        "vectorstore",
        embedding,
        file_name,
        allow_dangerous_deserialization=True,
    )

# Return the 20 nearest passages per query
retriever = vectorstore.as_retriever(search_kwargs={"k": 20})
# Smoke test: the cell output shows what the retriever returns for a sample query
retriever.invoke("Quantity of stocks and securities for the purpose of underwriting.")

[Document(page_content='Total Stocks (32% of account holdings) $43,724.65 $37,376.42 $6,348.23 $304.68\nBonds\nPrice Ending Market Value Total Unrealized Est.Annual Coupon\nDescription Maturity Quantity Per Unit Accrued Interest (Al) Cost Basis Gain/Loss___Income (EAl) Rate\nAsset Backed Securities\nFEDERAL NATL MTG ASSN 12/17/13 10,000.00 $100.00 $10,000.00 $10,250.00! -$250.00 $125.00 1.25%\nCUSIP: 3136FPEX1\nInt. Semi-Annually\nTotal Asset Backed Securities (7% of account holdings) $10,000.00 $10,250.00 -$250.00 $125.00'),
 Document(page_content='Stocks\nPrice Ending Total Unrealized Est. Annual .\nDescription Quantity Per Unit Market Value Cost Basis Gain/Loss___Income (EAl) (EY)\nCommon Stocks\nAPPLE INC (AAPL) 25.00 525.31 $13,132.75 $9,350.12¢ $3,782.63 $304.68 2.32%\nAMERCO COM (UHAL) 30.00 203.15A 6,094.50 4,149.75¢ 1,944.75 _\nENSTAR GROUP LIMITED COM STK USD 1.00 ~100.00 137.10 -13,710.00 -14,510.99¢ 800.99 _\n(ESGR)\nTotal Common Stock (24% of account holdings) $5,517.25 $-

In [20]:
### RETRIEVAL GRADER (MINIMUM TEST)

from langchain_community.chat_models import ChatOllama
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
# LLM: JSON output mode with temperature 0 for the grading step
llm = ChatOllama(model=local_llm, format="json", temperature=0)

# Llama 3 chat-format prompt asking for a yes/no relevance verdict as JSON.
# (Fixed the "premable" -> "preamble" typo in the instruction text.)
prompt = PromptTemplate(
    template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|> You are a grader assessing relevance 
    of a retrieved document to a user question. If the document contains keywords related to the user question, 
    grade it as relevant. It does not need to be a stringent test. The goal is to filter out erroneous retrievals. \n
    Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question. \n
    Provide the binary score as a JSON with a single key 'score' and no preamble or explanation.
     <|eot_id|><|start_header_id|>user<|end_header_id|>
    Here is the retrieved document: \n\n {document} \n\n
    Here is the user question: {question} \n <|eot_id|><|start_header_id|>assistant<|end_header_id|>
    """,
    input_variables=["question", "document"],
)

# Chain: fill the prompt -> call the model -> parse the JSON reply into a dict
retrieval_grader = prompt | llm | JsonOutputParser()

# Minimum test. NOTE(review): "agent memory" looks unrelated to the account
# statement indexed above, so a 'no' score is presumably the expected result -
# confirm this is meant as a negative check.
question = "agent memory"
docs = retriever.invoke(question)
# Grade a single retrieved passage (the second hit)
doc_txt = docs[1].page_content
print(retrieval_grader.invoke({"question": question, "document": doc_txt}))

{'score': 'no'}


In [18]:
from typing import Annotated, Sequence, TypedDict

import langgraph
from langchain_core.documents import Document
from langchain_core.messages import BaseMessage
from langgraph.graph.message import add_messages

class AgentState(TypedDict):
    """State passed between the nodes of the LangGraph workflow."""

    # The add_messages function defines how an update should be processed
    # Default is to replace. add_messages says "append"
    messages: Annotated[Sequence[BaseMessage], add_messages]
    # Passages fetched by the `retrieval` node. No reducer is attached, so
    # each update replaces the previous value.
    documents: Sequence[Document]


def retrieval(state: AgentState) -> dict:
    """Retrieve passages relevant to the most recent message.

    Uses the content of the last entry in ``state["messages"]`` as the query
    for the notebook-level ``retriever``.

    Args:
        state: Current graph state; must hold at least one message.

    Returns:
        A partial state update ``{"documents": [...]}``. The result is
        returned rather than written into ``state`` in place, since the
        original in-place write returned nothing to the graph.

    Raises:
        ValueError: If ``state["messages"]`` is empty.
    """
    messages = state["messages"]
    if not messages:
        raise ValueError("retrieval requires at least one message in state['messages']")
    last_message = messages[-1]
    # Message text lives in `.content`; assumes it is a plain string here -
    # TODO confirm if multi-part message content is ever used.
    documents = retriever.invoke(last_message.content)
    return {"documents": documents}