In [1]:
# Load environment variables from a local .env file before any client is created.
# NOTE(review): presumably this is where the OpenAI API key comes from — confirm.
from dotenv import load_dotenv
load_dotenv()

True

In [46]:
# Patch asyncio so an event loop can be re-entered from inside the notebook.
# NOTE(review): presumably needed for the `await runnable.abatch(...)` cell below — confirm.
import nest_asyncio
nest_asyncio.apply()

#### Load documents

In [32]:
import uuid
from typing import List, Optional, TypedDict

from langchain_community.document_loaders import PyPDFLoader
from langchain_core.messages import (AIMessage, BaseMessage, HumanMessage,
                                    ToolMessage)
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI


In [3]:
# Load the sample PDF from the working directory. `docs` is indexed by
# position later in the notebook (e.g. `docs[5].page_content`).
loader = PyPDFLoader("test.pdf")
docs = loader.load()

#### Simple Extraction

In [8]:
class AWSService(BaseModel):
    """Information about a service."""
    # NOTE: this docstring and the Field descriptions are used as the tool
    # schema handed to the model, so they are prompt text as well as docs.
    #
    # Both fields default to None so they are *not* listed as required in the
    # generated schema. With `Field(...)` an Optional field is still required,
    # which pushes the model to invent a value instead of following the
    # prompt's "return null" instruction. Explicit `name=None, evidence=None`
    # construction keeps working unchanged.
    name: Optional[str] = Field(default=None, description="The name of the service")
    evidence: Optional[str] = Field(
        default=None,
        description="Verbatim sentence of text where the service is mentioned",
    )

In [12]:
# Baseline extraction prompt: fixed system instructions followed by the text
# to extract from, supplied through the "text" input variable.
system_instructions = (
    "You are an expert extraction algorithm. "
    "Only extract relevant information from the text. "
    "If you do not know the value of an attribute asked to extract, "
    "return null for the attribute's value."
)
prompt = ChatPromptTemplate.from_messages(
    [("system", system_instructions), ("human", "{text}")]
)

In [13]:
# Deterministic-leaning model settings (temperature 0) shared by every chain below.
llm = ChatOpenAI(temperature=0, model="gpt-4-turbo-preview")
# Bind the single-entity schema first, then pipe the prompt into it.
structured_llm = llm.with_structured_output(schema=AWSService)
runnable = prompt | structured_llm

  warn_beta(


In [15]:
# Run the single-entity extractor on the text of the document at index 5 and
# display the parsed AWSService instance.
simple = runnable.invoke(docs[5].page_content)
simple

AWSService(name='Amazon QuickSight', evidence='Amazon QuickSight dashboards for visualization')

#### Multiple Entities (Recommended)

In [18]:
class NonAWSSoftware(BaseModel):
    """Information about a non-AWS software."""
    # NOTE: this docstring and the Field descriptions are used as the tool
    # schema handed to the model, so they are prompt text as well as docs.
    #
    # Both fields default to None so they are *not* listed as required in the
    # generated schema. With `Field(...)` an Optional field is still required,
    # which pushes the model to invent a value instead of following the
    # prompt's "return null" instruction. Explicit `name=None, evidence=None`
    # construction keeps working unchanged.
    name: Optional[str] = Field(default=None, description="The name of the software")
    evidence: Optional[str] = Field(
        default=None,
        description="Verbatim sentence of text where the software is mentioned",
    )

In [19]:
class Data(BaseModel):
    """Extracted data about AWS services and non-AWS software."""
    # Container schema: one call can return zero or more entities of each
    # kind. In the recorded run on text with no software mentions, both lists
    # come back empty.
    aws_services: List[AWSService]  # every AWS service found in the text
    non_aws_software: List[NonAWSSoftware]  # every non-AWS product found in the text

In [20]:
# Rebind `runnable` to the multi-entity schema and extract from the same
# document text used for the single-entity run (index 5).
runnable = prompt | llm.with_structured_output(Data)
results = runnable.invoke(docs[5].page_content)

In [21]:
# Display the AWS services extracted by the multi-entity run.
results.aws_services

[AWSService(name='Amazon QuickSight', evidence='These metrics are presented in Amazon QuickSight dashboards for visualization.'),
 AWSService(name='Amazon Athena', evidence='You can also use other visualization tools, such as Tableau, to build visualizations from the Amazon Athena database.'),
 AWSService(name='AWS CloudFormation', evidence='This solution’s AWS CloudFormation template launches and conﬁgures the AWS services required to deploy the solution using AWS best practices for security, availability, performance eﬃciency, and cost optimization.')]

In [22]:
# Display the non-AWS software extracted by the multi-entity run.
results.non_aws_software

[NonAWSSoftware(name='GitHub', evidence='This solution supports ingestion, analysis, and visualization of data from AWS Developer Tools as well as GitHub repository to calculate key DevOps metrics.'),
 NonAWSSoftware(name='Tableau', evidence='You can also use other visualization tools, such as Tableau, to build visualizations from the Amazon Athena database.')]

#### Testing No Entities

In [23]:
# Negative control: a sentence that mentions no AWS service and no software.
test_text = "The fox, an animal, jumped over the lazy dog. then the dog barked at the fox."

In [24]:
# Single-entity AWSService schema on the negative-control text. In the
# recorded output the model still fills the fields (name='fox').
runnable = prompt | llm.with_structured_output(schema=AWSService)
runnable.invoke(test_text)

AWSService(name='fox', evidence='The fox, an animal, jumped over the lazy dog.')

In [25]:
# Single-entity NonAWSSoftware schema on the negative-control text. The
# recorded output again shows a spurious name='fox'.
runnable = prompt | llm.with_structured_output(schema=NonAWSSoftware)
runnable.invoke(test_text)

NonAWSSoftware(name='fox', evidence='The fox, an animal, jumped over the lazy dog.')

In [26]:
# List-based Data schema on the negative-control text. The recorded output
# is two empty lists, i.e. nothing is extracted.
runnable = prompt | llm.with_structured_output(schema=Data)
runnable.invoke(test_text)

Data(aws_services=[], non_aws_software=[])

#### Reference examples

In [27]:
# Few-shot variant of the extraction prompt: the same system instructions,
# then a slot for example messages, then the text to extract from.
few_shot_system_text = (
    "You are an expert extraction algorithm. "
    "Only extract relevant information from the text. "
    "If you do not know the value of an attribute asked "
    "to extract, return null for the attribute's value."
)
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", few_shot_system_text),
        # Filled from the "examples" input key; this is where the reference
        # examples are spliced into the conversation.
        MessagesPlaceholder("examples"),
        ("human", "{text}"),
    ]
)

In [31]:
# Sanity check of the placeholder: two dummy human messages should appear
# between the system message and the final human message.
placeholder_messages = [
    HumanMessage(content="testing 1 2 3"),
    HumanMessage(content="testing 4 5 6"),
]
rendered_prompt = prompt.invoke(
    {"text": "this is some text", "examples": placeholder_messages}
)
rendered_prompt.to_messages()

[SystemMessage(content="You are an expert extraction algorithm. Only extract relevant information from the text. If you do not know the value of an attribute asked to extract, return null for the attribute's value."),
 HumanMessage(content='testing 1 2 3'),
 HumanMessage(content='testing 4 5 6'),
 HumanMessage(content='this is some text')]

In [34]:
class Example(TypedDict):
    """A representation of an example consisting of text input and expected tool calls.

    For extraction, the tool calls are represented as instances of pydantic model.

    Note: `tool_example_to_messages` additionally reads an optional
    "tool_outputs" key via `.get()`; that key is not declared here.
    """

    input: str  # The example text shown to the model as the human message
    tool_calls: List[BaseModel]  # Pydantic model instances that should be extracted from `input`

In [35]:
def tool_example_to_messages(example: Example) -> List[BaseMessage]:
    """Convert an example into a list of messages that can be fed into an LLM.

    This code is an adapter that converts our example to a list of messages
    that can be fed into a chat model.

    The list of messages per example corresponds to:

    1) HumanMessage: contains the content from which content should be extracted.
    2) AIMessage: empty content, carrying one OpenAI-style tool call per entry
       in ``example["tool_calls"]``.
    3) ToolMessage(s): one per tool call, confirming to the model that it
       requested the tool correctly.

    The ToolMessage is required because some of the chat models are hyper-optimized for agents
    rather than for an extraction use case.

    Args:
        example: Mapping with ``"input"`` (the example text) and
            ``"tool_calls"`` (pydantic model instances to present as the
            expected extraction). An optional ``"tool_outputs"`` sequence of
            strings overrides the default confirmation text; when given and
            non-empty it must contain exactly one entry per tool call.

    Returns:
        The messages for this example, in the order described above.

    Raises:
        ValueError: If ``"tool_outputs"`` is supplied but its length does not
            match the number of tool calls. Previously the extra tool calls
            were silently left without a ToolMessage.
    """
    messages: List[BaseMessage] = [HumanMessage(content=example["input"])]

    openai_tool_calls = [
        {
            # A fresh id per call; the matching ToolMessage refers back to it.
            "id": str(uuid.uuid4()),
            "type": "function",
            "function": {
                # The name of the function right now corresponds
                # to the name of the pydantic model
                # This is implicit in the API right now,
                # and will be improved over time.
                "name": tool_call.__class__.__name__,
                "arguments": tool_call.json(),
            },
        }
        for tool_call in example["tool_calls"]
    ]
    messages.append(
        AIMessage(content="", additional_kwargs={"tool_calls": openai_tool_calls})
    )

    provided_outputs = example.get("tool_outputs")
    if provided_outputs:
        tool_outputs = list(provided_outputs)
    else:
        # Missing or empty: fall back to one generic confirmation per call.
        tool_outputs = ["You have correctly called this tool."] * len(openai_tool_calls)

    # zip() would silently truncate on a mismatch and leave some tool calls
    # unanswered, so fail loudly instead.
    if len(tool_outputs) != len(openai_tool_calls):
        raise ValueError(
            f"Expected {len(openai_tool_calls)} tool output(s), one per tool call, "
            f"but got {len(tool_outputs)}."
        )

    for output, tool_call in zip(tool_outputs, openai_tool_calls):
        messages.append(ToolMessage(content=output, tool_call_id=tool_call["id"]))
    return messages

In [63]:
# Scratch walkthrough: repeats the steps of `tool_example_to_messages` by hand
# for a single "nothing to extract" example, printing each
# (tool output, tool call) pair so the generated payload can be inspected.
_test = Example(input="The fox, an animal, jumped over the lazy dog. then the dog barked at the fox.", tool_calls=[AWSService(name=None, evidence=None)])
_messages = [HumanMessage(content=_test["input"])]
# One OpenAI-style tool call per expected pydantic instance.
_openai_tool_calls = [
    {
        "id": str(uuid.uuid4()),
        "type": "function",
        "function": {
            "name": tool_call.__class__.__name__,
            "arguments": tool_call.json(),
        },
    }
    for tool_call in _test["tool_calls"]
]
_messages.append(AIMessage(content="", additional_kwargs={"tool_calls": _openai_tool_calls}))
# "tool_outputs" is absent from `_test`, so the default confirmation text is used.
_tool_outputs = _test.get("tool_outputs") or ["You have correctly called this tool."] * len(_openai_tool_calls)
for output, tool_call in zip(_tool_outputs, _openai_tool_calls):
    print(output)
    print(tool_call)
    _messages.append(ToolMessage(content=output, tool_call_id=tool_call["id"]))


You have correctly called this tool.
{'id': 'ef8e1526-f5e7-411d-b784-fa10eaf52636', 'type': 'function', 'function': {'name': 'AWSService', 'arguments': '{"name": null, "evidence": null}'}}


In [36]:
# Reference examples as (input text, expected extraction) pairs: one negative
# example per entity type and one positive AWS example.
_NO_ENTITY_SENTENCE = "The fox, an animal, jumped over the lazy dog. then the dog barked at the fox."
_S3_SENTENCE = "AWS S3 is the key service to store data in AWS cloud."

examples = [
    (_NO_ENTITY_SENTENCE, AWSService(name=None, evidence=None)),
    (_S3_SENTENCE, AWSService(name="AWS S3", evidence=_S3_SENTENCE)),
    (_NO_ENTITY_SENTENCE, NonAWSSoftware(name=None, evidence=None)),
]

In [37]:
# Flatten every reference example into one message sequence, in example order.
messages = [
    message
    for text, tool_call in examples
    for message in tool_example_to_messages({"input": text, "tool_calls": [tool_call]})
]

In [38]:
# Inspect the messages generated for the first reference example on its own.
tool_example_to_messages({"input": examples[0][0], "tool_calls": [examples[0][1]]})

[HumanMessage(content='The fox, an animal, jumped over the lazy dog. then the dog barked at the fox.'),
 AIMessage(content='', additional_kwargs={'tool_calls': [{'id': '352ffac1-5fa0-4979-ab95-863fbac775ea', 'type': 'function', 'function': {'name': 'AWSService', 'arguments': '{"name": null, "evidence": null}'}}]}),
 ToolMessage(content='You have correctly called this tool.', tool_call_id='352ffac1-5fa0-4979-ab95-863fbac775ea')]

In [40]:
# Render the few-shot prompt with the real example messages to check the
# final message order that the model will receive.
prompt.invoke({"text": "this is some text", "examples": messages}).to_messages()

[SystemMessage(content="You are an expert extraction algorithm. Only extract relevant information from the text. If you do not know the value of an attribute asked to extract, return null for the attribute's value."),
 HumanMessage(content='The fox, an animal, jumped over the lazy dog. then the dog barked at the fox.'),
 AIMessage(content='', additional_kwargs={'tool_calls': [{'id': '8dc43376-ae3d-4d8b-a572-81d115b56045', 'type': 'function', 'function': {'name': 'AWSService', 'arguments': '{"name": null, "evidence": null}'}}]}),
 ToolMessage(content='You have correctly called this tool.', tool_call_id='8dc43376-ae3d-4d8b-a572-81d115b56045'),
 HumanMessage(content='AWS S3 is the key service to store data in AWS cloud.'),
 AIMessage(content='', additional_kwargs={'tool_calls': [{'id': '7e9abed7-82fe-4b0d-98f1-a26872733860', 'type': 'function', 'function': {'name': 'AWSService', 'arguments': '{"name": "AWS S3", "evidence": "AWS S3 is the key service to store data in AWS cloud."}'}}]}),
 T

In [41]:
# Rebuild the chain on the few-shot prompt with the list-based Data schema.
# Callers must now pass both "text" and "examples" when invoking it.
runnable = prompt | llm.with_structured_output(
    schema=Data,
    method="function_calling",  # extract via tool/function calling
    include_raw=False,  # recorded outputs below are bare Data objects
)

In [45]:
# Repeat the same extraction five times to eyeball run-to-run stability.
few_shot_input = {"text": docs[5].page_content, "examples": messages}
for _ in range(5):
    extraction = runnable.invoke(few_shot_input)
    print(extraction)

aws_services=[AWSService(name='AWS Developer Tools', evidence='This solution supports ingestion, analysis, and visualization of data from AWS Developer Tools as well as GitHub repository to calculate key DevOps metrics.'), AWSService(name='Amazon QuickSight', evidence='These metrics are presented in Amazon QuickSight dashboards for visualization.'), AWSService(name='Amazon Athena', evidence='You can also use other visualization tools, such as Tableau, to build visualizations from the Amazon Athena database.')] non_aws_software=[NonAWSSoftware(name='GitHub', evidence='This solution supports ingestion, analysis, and visualization of data from AWS Developer Tools as well as GitHub repository to calculate key DevOps metrics.'), NonAWSSoftware(name='Tableau', evidence='You can also use other visualization tools, such as Tableau, to build visualizations from the Amazon Athena database.')]
aws_services=[AWSService(name='AWS Developer Tools', evidence='This solution supports ingestion, analysi

In [48]:
await runnable.abatch([
    {"text": "The fox, an animal, jumped over the lazy dog. then the dog barked at the fox.", "examples": messages} for _ in range(5)
])

[Data(aws_services=[], non_aws_software=[]),
 Data(aws_services=[], non_aws_software=[]),
 Data(aws_services=[], non_aws_software=[]),
 Data(aws_services=[], non_aws_software=[]),
 Data(aws_services=[], non_aws_software=[])]