In [1]:
import os
import uuid
from dotenv import load_dotenv
# Read key/value pairs from a local .env file into the process environment.
load_dotenv()
# Random identifier for this notebook session; interpolated into the dataset
# name so that each run creates a distinctly named dataset.
uid = uuid.uuid4()
# NOTE(review): presumably LANGCHAIN_API_KEY and ANTHROPIC_API_KEY are expected
# to come from the .env file loaded above -- confirm. The two lines below are
# left disabled: they would only copy each variable onto itself.
# os.environ["LANGCHAIN_API_KEY"] = os.getenv("LANGCHAIN_API_KEY")
# os.environ["ANTHROPIC_API_KEY"] = os.getenv("ANTHROPIC_API_KEY")

In [2]:
from langsmith import Client

share_token = "08ab7912-006e-4c00-a973-0f833e74907b"
dataset_name = f"Contract Extraction - {uid}"

client = Client()

# Fetch the shared examples, then create the dataset they will be copied into.
examples = list(client.list_shared_examples(share_token))
dataset = client.create_dataset(dataset_name=dataset_name)

# Split every example into its input payload and its reference output,
# keeping both lists aligned by position.
example_inputs = []
example_outputs = []
for example in examples:
    example_inputs.append(example.inputs)
    example_outputs.append(example.outputs)

client.create_examples(
    dataset_id=dataset.id,
    inputs=example_inputs,
    outputs=example_outputs,
)

In [6]:
from typing import List, Optional, Union

from pydantic import BaseModel


# Address of a contract party; nested inside Party.address.
# Documented with `#` comments rather than a docstring, because Pydantic copies
# a model's docstring into the JSON schema that is sent to the LLM.
class Address(BaseModel):
    street: str
    city: str
    state: str
    zip_code: str
    # Pydantic v2 treats a bare `Optional[str]` as required-but-nullable. The
    # explicit default removes the field from the schema's "required" list, so
    # the model may omit it when the contract does not state a country.
    country: Optional[str] = None


# A named party to the contract together with its address.
# Documented with `#` comments rather than a docstring, because Pydantic copies
# a model's docstring into the JSON schema that is sent to the LLM.
class Party(BaseModel):
    name: str
    address: Address
    # Pydantic v2 treats a bare `Optional[str]` as required-but-nullable. The
    # explicit default removes the field from the schema's "required" list, so
    # the model may omit it when no party type is given.
    type: Optional[str] = None


# One titled section of a contract; collected in Contract.sections.
# Documented with `#` comments rather than a docstring, because Pydantic copies
# a model's docstring into the generated JSON schema.
class Section(BaseModel):
    title: str
    content: str


# Top-level extraction target: its JSON schema is what the extraction chain
# asks the model to populate.
# Documented with `#` comments rather than a docstring, because Pydantic copies
# a model's docstring into the JSON schema that is sent to the LLM.
class Contract(BaseModel):
    document_title: str
    # Pydantic v2 treats a bare `Optional[str]` as required-but-nullable. The
    # explicit default removes the field from the schema's "required" list, so
    # the model may omit it for documents that carry no exhibit number.
    exhibit_number: Optional[str] = None
    effective_date: str
    parties: List[Party]
    sections: List[Section]

In [9]:
from langchain import hub
from langchain.chains import create_extraction_chain
from langchain.chat_models import ChatAnthropic
from langchain_experimental.llms.anthropic_functions import AnthropicFunctions

contract_prompt = hub.pull("wfh/anthropic_contract_extraction")

# JSON schema of the target structure, and the function-calling model wrapper
# that is asked to fill it in.
contract_schema = Contract.model_json_schema()
extraction_llm = AnthropicFunctions(model="claude-2", max_tokens=20_000)

extraction_subchain = create_extraction_chain(
    contract_schema,
    llm=extraction_llm,
    prompt=contract_prompt,
)

# Adapt key names on both ends of the extraction chain: dataset inputs carry
# the document under a "context" key while the chain expects an "input" key,
# and the chain's "text" result is re-exposed under "output".
chain = (
    (lambda example: {"input": example["context"]})
    | extraction_subchain
    | (lambda result: {"output": result["text"]})
)

In [16]:
from pprint import pprint

# Show the prompt template pulled from the hub (rendered as the cell's output).
contract_prompt


ChatPromptTemplate(input_variables=['input'], messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], template="Populate a Contract based on the raw passage below. Use the the provided 'information_extraction' function.\n\nIf a property is not present and is not required in the function parameters, do not include it in the output.\n\n<Raw Passage>\n{input}\n</Raw Passage>\n\nRemember to respond using the XML-formating like so:\n<tool>information_extraction</tool>\n<tool_input>...THE TOOL INPUT</tool_input>\n\nBegin!"))])

In [10]:
import logging

# Raise the root logger's threshold to CRITICAL so that error-level (and lower)
# log records are hidden: the documents are long and the messages would
# otherwise pollute the notebook output.
logger = logging.getLogger()  # no name given -> the root logger
logger.setLevel("CRITICAL")

In [12]:
from langchain.smith import RunEvalConfig

# Evaluate each run with the built-in "json_edit_distance" evaluator.
eval_config = RunEvalConfig(evaluators=["json_edit_distance"])

# Run the chain over every example in the dataset and evaluate the results.
res = client.run_on_dataset(
    dataset_name=dataset_name,
    llm_or_chain_factory=chain,
    evaluation=eval_config,
    concurrency_level=2,  # kept low in case you are rate-limited
)

View the evaluation results for project 'test-sparkling-pleasure-47' at:
https://smith.langchain.com/o/d43512dc-bbac-5ebf-a00e-57a322925756/projects/p/134662d1-042b-4a34-a313-0e490ca7d33f?eval=true

View all tests for Dataset Contract Extraction - cb823edd-4c9c-43e6-9388-705f0eb6c51d at:
https://smith.langchain.com/datasets/02448a51-1638-4b03-9c2b-02c42b2cd486
[------------------------------------------------->] 16/16