In [None]:
# Switch the kernel's working directory to the parent directory; relative
# paths used later in this notebook (e.g. "./chromadb") resolve from there.
%cd ..

In [None]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell executes, so edits to local packages are picked up.
%reload_ext autoreload
%autoreload 2


In [None]:
import sys
def override_sys_breakpoint(frame=None):
    """Open the IPython debugger when ``breakpoint()`` is called.

    This function is installed as ``sys.breakpointhook`` right below, so the
    built-in ``breakpoint()`` drops into IPython's debugger.

    Args:
        frame: Frame in which debugging should start. When omitted, the frame
            of the code that invoked this hook is used.
    """
    from IPython.core.debugger import set_trace

    if frame is None:
        # IPython's set_trace defaults to *its* caller's frame, which would be
        # this hook. Resolve our own caller so the debugger opens at the
        # breakpoint() call site instead of inside this function.
        frame = sys._getframe().f_back

    set_trace(frame=frame)


sys.breakpointhook = override_sys_breakpoint

In [None]:
from pathlib import Path
import os
from dotenv import load_dotenv, find_dotenv

In [None]:
# CUAD v1 dataset folder, addressed relative to the working directory.
CUAD_PATH = Path("..") / "data" / "CUAD_v1"

In [None]:
# Load variables from a discovered .env file into the process environment.
load_dotenv(find_dotenv())

# Explicit overrides applied after the .env load: the LangSmith endpoint is
# set, tracing is switched off, and tokenizer parallelism is disabled.
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_TRACING_V2"] = "false"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Credentials/endpoints read from the environment; each is None when unset.
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
LANGCHAIN_API_KEY = os.getenv("LANGCHAIN_API_KEY")
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")

# Dataset locations are required. Indexing os.environ raises a KeyError that
# names the missing variable, instead of the opaque TypeError that
# Path(None) would produce.
CUAD_QNA_PATH = Path(os.environ["CUAD_QNA_PATH"])
CUAD_QNA_SUBSET_PATH = Path(os.environ["CUAD_QNA_SUBSET_PATH"])

# os.environ["LANGCHAIN_TRACING"] = "false"

In [None]:
# Every entry of the subset folder is handed to the parser. iterdir() yields
# entries in arbitrary order, so sort them to keep the processing order (and
# everything derived from it) reproducible between runs.
pdf_files = sorted(CUAD_QNA_SUBSET_PATH.iterdir())

In [None]:
from supermat.core.parser import FileProcessor
from tqdm.auto import tqdm
from typing import TYPE_CHECKING, cast
from itertools import chain
from joblib import Parallel, delayed
# Parse all input files in parallel; n_jobs=-1 asks joblib for every CPU.
parsed_files = Parallel(n_jobs=-1)(
    delayed(FileProcessor.parse_file)(pdf_path) for pdf_path in pdf_files
)
if TYPE_CHECKING:
    # Type-checker-only narrowing; never executed at runtime.
    from supermat.core.models.parsed_document import ParsedDocumentType
    parsed_files = cast(list[ParsedDocumentType], parsed_files)

# Flatten the per-file parse results into one flat list.
documents = [item for parsed_docs in parsed_files for item in parsed_docs]

if TYPE_CHECKING:
    from supermat.core.models.parsed_document import ParsedDocumentType
    documents = cast(ParsedDocumentType, documents)

In [None]:
from supermat.langchain.bindings import SupermatRetriever
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings


# Embedding model used to index and query the parsed documents.
gte_embeddings = HuggingFaceEmbeddings(model_name="thenlper/gte-base")

# Chroma collection "CUAD_TEST", persisted under ./chromadb.
cuad_vector_store = Chroma(
    collection_name="CUAD_TEST",
    persist_directory="./chromadb",
    embedding_function=gte_embeddings,
)

# Retriever over the parsed documents, backed by the vector store above.
retriever = SupermatRetriever(parsed_docs=documents, vector_store=cuad_vector_store)

In [None]:
from langchain.smith import RunEvalConfig
from langchain.smith.evaluation.runner_utils import TestResult
from langchain_core.documents.base import Document
from langchain_core.embeddings import Embeddings
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import (
    RunnableLambda,
    RunnableParallel,
    RunnablePassthrough,
)
from langchain_core.runnables.base import Runnable

from supermat.langchain.metrics import (
    Accuracy,
    CosineSimilarity,
    FaithfullnessMetrics,
    Rouge1,
    Rouge1Precision,
    Rouge1Recall,
    Rouge2,
    Rouge2Precision,
    Rouge2Recall,
    RougeLsum,
    RougeLsumPrecision,
    RougeLsumRecall,
)

In [None]:
# Prompt text for the plain question-answering chain. {Question} and
# {context} are placeholders filled in when the template is rendered.
DEFAULT_TEMPLATE = (
    "You are an assistant for question-answering tasks.\n"
    "Use the following pieces of retrieved context to answer the question.\n"
    "If you don't know the answer, just say that you don't know.\n"
    "Question: {Question}\n"
    "context: {context}\n"
    "Answer:\n"
)

In [None]:
# DEFAULT_TEMPLATE = """
# Use the below information to answer the subsequent question.
# Please cite the structure number of the most relevant document(s) found inside the metadata. Other metadata info need not be returned.
# Provide factual, verifiable information and include references to credible sources where possible. 
# Avoid speculative or unverified content.
# If the answer cannot be found, write "I don't know."
# Information:
# \"\"\"
# {context}
# \"\"\"
# Question: {Question}? Please cite the each structure numbers you take data from. Please provide factual and verifiable information, include references to credible sources where possible. 
# Avoid speculative or unverified content.
# Return a JSON object as output.
# """

In [None]:
# DEFAULT_TEMPLATE="""
# {
#   "task": "Use the context info to answer the given question. Please cite the structure number of the most relevant document(s) found inside the metadata. Other metadata info need not be returned. Provide factual, verifiable information and include references to credible sources where possible. Avoid speculative or unverified content. If the answer cannot be found, write 'I don't know.'",
#   "question": {Question},
#   "context": {context}
# }
# """

In [None]:
from langchain_openai import AzureChatOpenAI

# llm_model = AzureChatOpenAI(azure_deployment='gpt-35-turbo',api_version="2024-05-01-preview", temperature=0, model_kwargs={ "response_format": { "type": "json_object" } })
# Azure-hosted chat model shared by the chains and the LLM-based evaluators.
llm_model = AzureChatOpenAI(
    azure_deployment="gpt-35-turbo",
    api_version="2024-05-01-preview",
    temperature=0,
)

In [None]:
from langchain.schema.cache import BaseCache
from langchain_core.callbacks.base import Callbacks

# Rebuild the pydantic model behind RunEvalConfig.LabeledScoreString.
# NOTE(review): presumably the BaseCache / Callbacks imports above exist so
# that forward references can be resolved during this rebuild — confirm.
RunEvalConfig.LabeledScoreString.model_rebuild()

In [None]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain


# System message: the answering rules followed by the retrieved context.
system_prompt = " ".join(
    [
        "Use the given context to answer the question.",
        "If you don't know the answer, say you don't know.",
        "Use three sentence maximum and keep the answer concise.",
        "Context: {context}",
    ]
)

# Two-message chat prompt: the system rules plus the user's question.
prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", "{Question}")]
)
import json

def format_docs(docs: list[Document]) -> str:
    """Serialize retrieved documents into one context string for the prompt.

    Each document is rendered as ``{'text':<page_content>, 'metadata': <json>}``
    with the metadata JSON-encoded; the ``str()`` of the resulting list of
    entries is returned.

    Args:
        docs: Documents returned by the retriever.

    Returns:
        The string form of the list of rendered entries.
    """
    entries = []
    for doc in docs:
        metadata_json = json.dumps(doc.metadata)
        entries.append(f"{{'text':{doc.page_content}, 'metadata': {metadata_json}}}")
    return str(entries)


# RAG chain used by the benchmark run.
# Steps: pull "Question" out of the input mapping -> in parallel, retrieve and
# format the context while passing the question through -> fill the chat
# prompt -> call the LLM -> return the reply as plain text.
# (This name was previously assigned several times in a row; only this final
# definition ever took effect, so the dead intermediate ones are gone.)
qa_chain2 = (
    RunnableLambda(lambda x: x["Question"])
    | RunnableParallel({"context": retriever | format_docs, "Question": RunnablePassthrough()})
    | prompt
    | llm_model
    | StrOutputParser()
)

In [None]:
from __future__ import annotations

from langchain_benchmarks.extraction.evaluators import get_eval_config

# Base evaluation config from langchain_benchmarks' extraction evaluators.
rag_evaluation = get_eval_config(llm_model)

# Project-specific metrics; evaluators read the input from the "Question" key.
custom_eval_config = RunEvalConfig(
    input_key="Question",
    custom_evaluators=[
        FaithfullnessMetrics(llm_model),
        Accuracy(llm_model),
        CosineSimilarity(),
        Rouge1(),
        Rouge1Precision(),
        Rouge1Recall(),
        Rouge2(),
        Rouge2Precision(),
        Rouge2Recall(),
        RougeLsum(),
        RougeLsumPrecision(),
        RougeLsumRecall(),
    ],
)

# Merge both dumps; on duplicate keys the custom config's values win.
eval_config = RunEvalConfig.model_validate(
    {**rag_evaluation.model_dump(), **custom_eval_config.model_dump()}
)


# Variant chain: hands the retrieved documents to DEFAULT_TEMPLATE as-is,
# without going through format_docs.
# Note: the benchmark call later in this cell passes qa_chain2, not this chain.
qa_chain = (
    RunnableLambda(lambda inputs: inputs["Question"])
    | RunnableParallel(context=retriever, Question=RunnablePassthrough())
    | ChatPromptTemplate.from_template(DEFAULT_TEMPLATE)
    | llm_model
    | StrOutputParser()
)

from langchain_benchmarks.utils import run_without_langsmith

from importlib import reload
from langchain_benchmarks import utils
# Re-import langchain_benchmarks.utils so the call below goes through the
# freshly loaded module object.
reload(utils)

# Run qa_chain2 against the dataset at CUAD_QNA_PATH and score it with the
# evaluators collected in eval_config.
# NOTE(review): concurrency_level=10 presumably caps the number of parallel
# runs — confirm against run_without_langsmith.
test_run = utils.run_without_langsmith(
    path_or_token_id=CUAD_QNA_PATH.as_posix(),
    llm_or_chain_factory=qa_chain2,
    evaluation=eval_config,
    verbose=True,
    concurrency_level=10,
)

In [None]:
# DEFAULT_TEMPLATE="""
# {
#   "task": "Use the context info to answer the given question. Please cite the structure number of the most relevant document(s) found inside the metadata. Other metadata info need not be returned. Provide factual, verifiable information and include references to credible sources where possible. Avoid speculative or unverified content. If the answer cannot be found, write 'I don't know.'",
#   "question": {Question},
#   "context": {context}
# }
# """

In [None]:
from pydantic import Field, BaseModel


In [None]:
class RequestFormat(BaseModel):
    # Structured request payload: instruction, question and context.
    # (Kept as comments rather than a docstring so the generated schema is
    # not given an extra description.)

    # Fixed instruction text. Its description used to be a copy of the
    # "answer" description, which did not match this field.
    task: str = Field(
        "Use the context info to answer the given question. Please cite the structure number of the most relevant document(s) found inside the metadata. Other metadata info need not be returned. Provide factual, verifiable information and include references to credible sources where possible. Avoid speculative or unverified content. If the answer cannot be found, write 'I don't know.'",
        description="The instruction telling the model how to answer the question.",
    )
    # NOTE(review): the field is named "Document" but is described as the user
    # question; the name is kept so existing constructors keep working.
    Document: str = Field(description="The user question.")
    context: str = Field(description="The list of documents used as context.")

In [None]:

# class ResponseFormat(BaseModel):
#     answer: str = Field(description="The answer to the user's question.")
#     Document: str = Field(description="The document name the info was taken from.")
#     structure: str = Field(description="The structure number(s) the information was taken from.")

# qa_chain2 = (
#     RunnableLambda(lambda x: x["Question"])
#     | RunnableParallel({"context": retriever, "Question": RunnablePassthrough()})
#     # | RunnableLambda(lambda x: {
#     #     "context": " ".join(f"{doc.page_content}\nSection: {doc.metadata["structure"]}\nKeys: {doc.metadata["key"]}" for doc in x["context"]),
#     #     "Question": x["Question"]
#     # })
#     | ChatPromptTemplate.from_template(DEFAULT_TEMPLATE)
#     # | llm_model
#     # | StrOutputParser()
# )


In [None]:
import orjson
# Load the complete Q&A file stored next to CUAD_QNA_PATH as raw bytes and
# decode it with orjson.
full_cuad_path = CUAD_QNA_PATH.with_name("full_cuad.json")
data = orjson.loads(full_cuad_path.read_bytes())

In [None]:
# Retrieval-only chain for debugging: stops before the prompt/LLM steps and
# returns the retrieved context together with the question.
dbg_chain = (
    RunnableLambda(lambda inputs: inputs["Question"])
    | RunnableParallel(context=retriever, Question=RunnablePassthrough())
)

In [None]:
# Run the retrieval-only debug chain on the first record's "inputs" entry;
# the result is shown as the cell output.
dbg_chain.invoke(data[0]["inputs"])

In [None]:
# Guard: the benchmark cell must have produced a result before aggregating.
assert test_run is not None

# Aggregate the feedback of the run; the bare name on the last line displays
# the result as the cell output.
run_agg = test_run.get_aggregate_feedback()
run_agg

In [None]:
import pandas as pd


In [None]:
def compare_baseline(baseline_pkl_path: "Path | str", run_agg: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Compare aggregated run metrics against a pickled baseline.

    Args:
        baseline_pkl_path: Location of the pickled baseline aggregate frame.
        run_agg: Aggregated metrics of the current run.

    Returns:
        A ``(baseline_agg_results, diff_to_baseline)`` tuple. The first item is
        the baseline with the "llm_model", "vectorstore_name" and "dataset"
        column levels dropped; the second is the relative difference
        ``(run - baseline) / baseline`` computed over the float64 columns.
    """
    # NOTE: unpickling can execute arbitrary code; only load baseline files
    # that come from a trusted source.
    baseline_agg_results = pd.read_pickle(baseline_pkl_path).droplevel(
        ["llm_model", "vectorstore_name", "dataset"], axis=1
    )
    # Select the numeric baseline once; it is both subtrahend and divisor.
    baseline_numeric = baseline_agg_results.select_dtypes("float64")
    diff_to_baseline = (run_agg.select_dtypes("float64") - baseline_numeric) / baseline_numeric
    return baseline_agg_results, diff_to_baseline

In [None]:
# Baseline pickle (file name without extension) to compare this run against.
# Alternative baseline: "baseline_benchmarks"
benchmarks_file_name = "baseline_SemanticChunkerPercentile_benchmarks"

In [None]:
# Folder holding the baseline pickles. The default is the location this
# notebook was originally run from; set BASELINE_BENCHMARKS_DIR to point at
# the baselines on another machine instead of editing the path here.
baseline_dir = Path(os.getenv("BASELINE_BENCHMARKS_DIR", r"C:\repos\llm_rag\notebooks"))
baseline_agg_results, diff_to_baseline = compare_baseline(baseline_dir / f"{benchmarks_file_name}.pkl", run_agg)
diff_to_baseline

In [None]:
# Base name (without extension) of the exported result files.
file_name = "supermat_benchmarks"

In [None]:
import pandas as pd

# Nothing to export unless the benchmark produced a result.
assert test_run is not None

# Write one sheet per table into a single workbook, in the listed order.
with pd.ExcelWriter(f"{file_name}.xlsx") as writer:
    sheets = (
        ("LLM Results", test_run.to_dataframe()),
        ("Agg Results", run_agg),
        ("Baseline Diff Agg Results", diff_to_baseline),
    )
    for sheet_name, frame in sheets:
        frame.to_excel(writer, sheet_name=sheet_name, index=True)

In [None]:
# Persist the aggregated metrics as "<file_name>.pkl" in the working directory.
run_agg.to_pickle(f"{file_name}.pkl")