In [None]:
# Move the kernel's working directory up one level.
# NOTE: this is not idempotent - re-running the cell climbs one more directory,
# so execute it exactly once per kernel session (Restart -> Run All is safe).
%cd ..

In [None]:
# (Re)load the autoreload extension and use mode 2, so edits to imported
# modules are picked up before each cell runs, without restarting the kernel.
%reload_ext autoreload
%autoreload 2


In [None]:
import sys
def override_sys_breakpoint(frame=None):
    """Route the built-in ``breakpoint()`` to IPython's debugger.

    Args:
        frame: Frame to start debugging from. When omitted, the frame of the
            code that called this hook (i.e. the ``breakpoint()`` call site)
            is used.
    """
    # Imported lazily so that merely installing the hook does not import the debugger.
    from IPython.core.debugger import set_trace

    if frame is None:
        # Without an explicit frame, set_trace() would fall back to *its*
        # caller, which is this hook function; the session would then open
        # inside the hook instead of at the user's breakpoint() line.
        frame = sys._getframe().f_back

    set_trace(frame=frame)


# Install the hook: every subsequent breakpoint() call goes through the function above.
sys.breakpointhook = override_sys_breakpoint

In [None]:
from pathlib import Path
import os
from dotenv import load_dotenv, find_dotenv

In [None]:
# Relative path to the CUAD_v1 data directory; it is resolved against the
# kernel's current working directory (which the `%cd ..` cell above changes).
# NOTE(review): this constant is not referenced anywhere else in the visible
# notebook - confirm it is still needed.
CUAD_PATH = Path("../data/CUAD_v1/")

In [None]:
# Load variables from the nearest .env file into the process environment.
load_dotenv(find_dotenv())

# Process-wide settings: LangSmith endpoint, tracing switched off, and
# tokenizer parallelism disabled.
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_TRACING_V2"] = "false"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Optional credentials / endpoints; these stay None when the variable is unset.
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
LANGCHAIN_API_KEY = os.getenv("LANGCHAIN_API_KEY")
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")


def _require_env_path(name: str) -> Path:
    """Return the environment variable ``name`` as a ``Path``.

    Args:
        name: Name of the environment variable to read.

    Returns:
        The variable's value wrapped in ``Path``.

    Raises:
        RuntimeError: If the variable is unset or empty. Failing here gives a
            readable message instead of the ``TypeError`` that ``Path(None)``
            raises, and avoids an empty value silently becoming ``Path(".")``.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name!r} is not set; "
            "define it in your .env file or shell before running this notebook."
        )
    return Path(value)


# Dataset locations are mandatory for the cells below, so validate them up front.
CUAD_QNA_PATH = _require_env_path("CUAD_QNA_PATH")
CUAD_QNA_SUBSET_PATH = _require_env_path("CUAD_QNA_SUBSET_PATH")

In [None]:
# Collect the input files to parse. iterdir() yields entries in arbitrary
# order and also returns sub-directories and hidden files (e.g. .DS_Store),
# so keep regular, non-hidden files only and sort them for reproducible runs.
pdf_files = sorted(
    entry
    for entry in CUAD_QNA_SUBSET_PATH.iterdir()
    if entry.is_file() and not entry.name.startswith(".")
)

In [None]:
from supermat.core.parser import FileProcessor
from tqdm.auto import tqdm
from itertools import chain

# Parse every input file (with a progress bar) and flatten the per-file
# results into a single list; parse_file() is consumed as an iterable.
documents = [
    parsed_doc
    for file_path in tqdm(pdf_files)
    for parsed_doc in FileProcessor.parse_file(file_path)
]

In [None]:
from supermat.langchain.bindings import SupermatRetriever
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings


# Embedding model handed to the vector store.
embedding_model = HuggingFaceEmbeddings(model_name="thenlper/gte-base")

# Chroma collection "CUAD_TEST", persisted under ./chromadb (relative to the
# kernel's working directory).
# NOTE(review): because the store is persistent, re-running this cell may
# re-index the same documents - confirm how SupermatRetriever handles that.
chroma_store = Chroma(
    embedding_function=embedding_model,
    persist_directory="./chromadb",
    collection_name="CUAD_TEST",
)

# Retriever over the parsed documents, backed by the store above.
retriever = SupermatRetriever(parsed_docs=documents, vector_store=chroma_store)

In [None]:
from langchain.smith import RunEvalConfig
from langchain.smith.evaluation.runner_utils import TestResult
from langchain_core.documents.base import Document
from langchain_core.embeddings import Embeddings
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import (
    RunnableLambda,
    RunnableParallel,
    RunnablePassthrough,
)
from langchain_core.runnables.base import Runnable

from supermat.langchain.metrics import (
    Accuracy,
    CosineSimilarity,
    FaithfullnessMetrics,
    Rouge1,
    Rouge1Precision,
    Rouge1Recall,
    Rouge2,
    Rouge2Precision,
    Rouge2Recall,
    RougeLsum,
    RougeLsumPrecision,
    RougeLsumRecall,
)

In [None]:
# Prompt text for the QA chain. `{Question}` and `{context}` are template
# placeholders; the chain built later in this notebook supplies both keys
# before passing the template through ChatPromptTemplate.from_template().
DEFAULT_TEMPLATE = """
You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer the question.
If you don't know the answer, just say that you don't know.
Question: {Question}
context: {context}
Answer:
"""

In [None]:
from langchain_openai import AzureChatOpenAI

# Azure OpenAI chat model used both to answer questions and by the LLM-based
# evaluators further down; sampling temperature is pinned to 0.
llm_model = AzureChatOpenAI(
    azure_deployment='gpt-35-turbo',
    api_version="2024-05-01-preview",
    temperature=0,
)

In [None]:
from langchain.schema.cache import BaseCache
from langchain_core.callbacks.base import Callbacks

# Rebuild the pydantic schema of RunEvalConfig.LabeledScoreString.
# NOTE(review): the BaseCache / Callbacks imports in this cell are otherwise
# unused; presumably they must be in scope so the rebuild can resolve forward
# references - confirm before removing them.
RunEvalConfig.LabeledScoreString.model_rebuild()

In [None]:
from __future__ import annotations

from langchain_benchmarks.extraction.evaluators import get_eval_config

# Baseline evaluation config provided by the benchmark package.
rag_evaluation = get_eval_config(llm_model)

# Extra metrics computed on top of the baseline: two LLM-judged metrics,
# embedding cosine similarity, and the ROUGE family.
custom_evaluators = [
    FaithfullnessMetrics(llm_model),
    Accuracy(llm_model),
    CosineSimilarity(),
    Rouge1(),
    Rouge1Precision(),
    Rouge1Recall(),
    Rouge2(),
    Rouge2Precision(),
    Rouge2Recall(),
    RougeLsum(),
    RougeLsumPrecision(),
    RougeLsumRecall(),
]

# Layer only the two fields we set on top of the baseline config.
# Merging two full model_dump() dicts does not work here: the dump of the
# override contains every field (defaults included), so a dict union replaces
# all of the baseline's values - its `evaluators` among them - with defaults.
# model_copy(update=...) keeps the baseline fields and their original objects.
eval_config = rag_evaluation.model_copy(
    update={
        "custom_evaluators": custom_evaluators,
        "input_key": "Question",
    }
)


def _pick_question(example):
    """Return the question text stored under the "Question" key of the input."""
    return example["Question"]


def _merge_context(retrieved):
    """Join the retrieved documents' text into one space-separated context string."""
    context_text = " ".join(doc.page_content for doc in retrieved["context"])
    return {"context": context_text, "Question": retrieved["Question"]}


# Fan the question out: one branch runs the retriever, the other passes the
# question through unchanged.
_retrieve_step = RunnableParallel({"context": retriever, "Question": RunnablePassthrough()})
_qa_prompt = ChatPromptTemplate.from_template(DEFAULT_TEMPLATE)

# question -> retrieve -> flatten context -> prompt -> LLM -> plain string
qa_chain = (
    RunnableLambda(_pick_question)
    | _retrieve_step
    | RunnableLambda(_merge_context)
    | _qa_prompt
    | llm_model
    | StrOutputParser()
)

from importlib import reload

from langchain_benchmarks import utils
from langchain_benchmarks.utils import run_without_langsmith

# Re-import the module so the call below goes through the freshly loaded
# `utils` (the function is looked up on the module after the reload).
reload(utils)

# Arguments for the benchmark run: dataset location, the chain under test,
# the evaluation config, verbose output, and 10 concurrent workers.
benchmark_kwargs = {
    "path_or_token_id": CUAD_QNA_PATH.as_posix(),
    "llm_or_chain_factory": qa_chain,
    "evaluation": eval_config,
    "verbose": True,
    "concurrency_level": 10,
}

test_run = utils.run_without_langsmith(**benchmark_kwargs)

In [None]:
# Convert the run results to a DataFrame; as the last expression of the cell
# it is rendered as the cell output.
test_run.to_dataframe()

In [None]:
import pandas as pd

# Write both result tables into one workbook, one sheet each. The frame
# builders are called inside the writer context, in sheet order.
with pd.ExcelWriter("supermat_benchmarks.xlsx") as writer:
    sheet_builders = (
        ("LLM Results", test_run.to_dataframe),
        ("Agg Results", test_run.get_aggregate_feedback),
    )
    for sheet_title, build_frame in sheet_builders:
        build_frame().to_excel(writer, sheet_name=sheet_title, index=True)