In [None]:
# Change the kernel's working directory to the parent directory so that
# relative paths used later resolve from there (presumably the project root).
# NOTE: this is not idempotent - re-running the cell moves up one more level.
%cd ..

In [2]:
# (Re)load IPython's autoreload extension and select mode 2, so edits to
# imported project modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2


In [3]:
import sys


def override_sys_breakpoint(frame=None):
    """Hook for ``sys.breakpointhook`` that opens IPython's debugger.

    Args:
        frame: Frame to start debugging from. When omitted, the frame of the
            code that called this hook (i.e. the ``breakpoint()`` call site)
            is used.
    """
    from IPython.core.debugger import set_trace

    if frame is None:
        # IPython's set_trace falls back to *its* caller's frame when given
        # None - that would be this hook, not the user's code. Resolve the
        # call site explicitly so the debugger stops where breakpoint() was
        # written.
        frame = sys._getframe(1)

    set_trace(frame=frame)


# Make the builtin breakpoint() use the IPython debugger in this kernel.
sys.breakpointhook = override_sys_breakpoint

In [None]:
from pathlib import Path
import os
from dotenv import load_dotenv, find_dotenv

# Locate a .env file with find_dotenv() and load its variables into the
# process environment; the cell displays load_dotenv's return value.
dotenv_file = find_dotenv()
load_dotenv(dotenv_file)

In [5]:
# Location of the CUAD v1 dataset, relative to the current working directory.
CUAD_PATH = Path("..") / "data" / "CUAD_v1"

In [6]:
# Environment switches set before the LangChain / tokenizer libraries are used.
os.environ["LANGCHAIN_TRACING_V2"] = "false"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Directory holding the CUAD Q&A subset. It must come from the environment
# (e.g. the .env file loaded earlier). Fail with a clear message instead of the
# opaque TypeError that Path(None) raises, and reject an empty value, which
# Path would silently turn into the current directory.
_cuad_qna_subset_dir = os.getenv("CUAD_QNA_SUBSET_PATH")
if not _cuad_qna_subset_dir:
    raise RuntimeError(
        "Environment variable CUAD_QNA_SUBSET_PATH is not set; define it in the "
        "environment or in the .env file before running this notebook."
    )
CUAD_QNA_SUBSET_PATH = Path(_cuad_qna_subset_dir)

In [7]:
# Path.iterdir() yields entries in arbitrary, filesystem-dependent order.
# Sort them so positional lookups into `pdf_files` pick the same file on
# every run and every machine.
pdf_files = sorted(CUAD_QNA_SUBSET_PATH.iterdir())

In [None]:
from supermat.core.parser import FileProcessor
from tqdm.auto import tqdm
from typing import TYPE_CHECKING, cast
from itertools import chain
from joblib import Parallel, delayed

# Parse every input file concurrently (thread backend, all available workers);
# the result holds one parsed document per entry of `pdf_files`.
parsed_files = Parallel(n_jobs=-1, backend="threading")(
    delayed(FileProcessor.parse_file)(pdf_path) for pdf_path in pdf_files
)
if TYPE_CHECKING:
    # Static typing aid only; never executed at runtime.
    from supermat.core.models.parsed_document import ParsedDocumentType

    parsed_files = cast(list[ParsedDocumentType], parsed_files)

# Flatten the per-file results into one list containing every parsed item.
documents = [item for parsed_doc in parsed_files for item in parsed_doc]

if TYPE_CHECKING:
    documents = cast(ParsedDocumentType, documents)

In [9]:
from supermat.langchain.bindings import SupermatRetriever
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings


# Embedding model used by the vector store.
embeddings = HuggingFaceEmbeddings(model_name="thenlper/gte-base")

# Chroma collection "CUAD_TEST", persisted under ./chromadb (relative to the
# current working directory).
vector_store = Chroma(
    embedding_function=embeddings,
    persist_directory="./chromadb",
    collection_name="CUAD_TEST",
)

# Retriever over the flattened parsed documents, backed by the store above.
retriever = SupermatRetriever(parsed_docs=documents, vector_store=vector_store)

In [15]:
from langchain_ollama.llms import OllamaLLM
from supermat.langchain.bindings import get_default_chain

# Ollama-served model used to answer questions, with temperature fixed at 0.0.
llm_model = OllamaLLM(
    model="deepseek-r1:8b",
    temperature=0.0,
)

# Build the question-answering chain on top of the retriever.
# NOTE: this rebinds the name `chain`, which an earlier cell imported from
# itertools; re-run that import cell before using itertools.chain again.
chain = get_default_chain(
    retriever,
    llm_model,
    substitute_references=False,
    return_context=False,
)

In [None]:
# Pick one input file to ask about. The position refers to the current
# ordering of `pdf_files`, so a different listing order selects another file.
TEST_DOC_INDEX = 9
test_doc = pdf_files[TEST_DOC_INDEX]
test_doc.name

In [None]:
# Ask the chain about the selected document, referring to it by its
# lower-cased file name, and print the raw answer.
question = f"What is the name of the contract {test_doc.name.lower()}?"
answer = chain.invoke(question)
print(answer)

### Output from the DeepSeek-R1 8B model (`deepseek-r1:8b`)

> Tested on the PDF document: `'NakedBrandGroupInc_20150731_POS AM (on S-1)_EX-10.75_9196027_EX-10.75_Endorsement Agreement.pdf'`


```
<think>
Okay, so I need to figure out the name of the contract mentioned in the document. The user provided a bunch of context, including parts of an endorsement agreement between Naked Brand Group Inc. and someone named Wade. 

Looking at the documents, there's a mention of "Endorsement Agreement" in several places, like in the title and within the content. For example, in Document 8.2.64.0, it says "Representations and Warranties of Naked and Wade," which probably refers back to the same agreement.

So, putting that together, the contract is called the "Endorsement Agreement." I should make sure to cite the specific citation_id where this is mentioned. From the context, the relevant part is in Document 8.2.64.0, so I'll use that citation.
</think>

The name of the contract is the **Endorsement Agreement**. This can be cited as `<cite ref='8.2.64.0' />`.

References:
1. <cite ref='8.2.64.0' />:
	15. Representations and Warranties of Naked and Wade. 
{'id': None,
 'metadata': {'citation_id': '8.2.64.0',
              'document': 'NakedBrandGroupInc_20150731_POS AM (on '
                          'S-1)_EX-10.75_9196027_EX-10.75_Endorsement '
                          'Agreement',
              'key': 'Warranties,15,Representations,Naked',
              'structure': '8.2.64.0'},
 'page_content': '15. Representations and Warranties of Naked and Wade. ',
 'type': 'Document'}
```