# Build an Extraction Chain
Using `tool-calling` features of `chat models` to extract structured information from unstructured text.

In [13]:
from typing import Optional, List
from pydantic import BaseModel, Field
from langchain.chat_models import init_chat_model
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder

### Best practices in defining schema
There are two best practices when defining a schema:

1. Document the attributes and the schema itself: this information is sent to the LLM and is used to improve the quality of information extraction.
2. Do not force the LLM to make up information! Below we use `Optional` for the attributes, allowing the LLM to output `None` if it doesn't know the answer.

In [8]:
# Define what information we want to extract from the text
# Every attribute is Optional with a default of None, so a value that is not
# present in the text can be left as None instead of being invented.
# The class docstring and the Field descriptions are part of the schema that is
# sent to the LLM, so rewording them can change extraction results.
class Person(BaseModel):
    """ Information about a person. """
    # Name of the person; None when the text does not give one.
    name: Optional[str] = Field(default=None, description="Name of the person")
    # Hair color of the person; None when the text does not give one.
    hair_color: Optional[str] = Field(default=None, description="Hair color of the person")
    # Height in meters, kept as a string.
    # NOTE(review): typed as str rather than a numeric type — confirm this is intentional.
    height_in_meters: Optional[str] = Field(default=None, description="Height of the person in meters")

In [5]:
# Custom prompt that carries the extraction instructions.
# Two ways to improve extraction quality later on:
#   1) add worked examples to the prompt template;
#   2) add extra input variables for context (e.g. metadata about the document
#      the text was extracted from).

# System instructions, including the rule to return null for unknown attributes.
system_instructions = (
    "You are an expert extraction algorithm. "
    "Only extract relevant information from text."
    " If you do not know the value of an attribute asked to extract,"
    " return null for that attribute's value."
)

# `.from_messages` is used because it lets us define a multi-role, multi-turn conversation.
prompt_template = ChatPromptTemplate.from_messages(
    [
        ("system", system_instructions),
        ("human", "{text}"),
    ]
)

In [6]:
# Model configuration, kept together so it is easy to tune.
TEMPERATURE = 0.0            # sampling temperature passed to the chat model
NUM_PREDICT = 256            # passed as `num_predict` (cap on generated tokens)
MODEL = "gemma3:12b-it-qat"  # model tag served by the local Ollama instance

# Chat model used by every extraction cell below.
# NOTE: no `use_gpu` argument is passed. It is not a ChatOllama parameter and was
# silently ignored; GPU offload is decided by the Ollama server (ChatOllama
# exposes `num_gpu` if it has to be tuned from the client side).
llm = init_chat_model(
    model=MODEL,
    model_provider="ollama",
    temperature=TEMPERATURE,
    num_predict=NUM_PREDICT,
)

In [9]:
# Wrap the chat model so that its replies come back as `Person` objects.
structured_llm = llm.with_structured_output(Person)

In [10]:
# Sentence describing exactly one person.
text = "Alan Smith is 6 feet tall and has blond hair."

# Fill the template's {text} slot, then run the rendered prompt through the
# structured model.
prompt = prompt_template.invoke(input={"text": text})
structured_output = structured_llm.invoke(input=prompt)

In [11]:
# Show the rendered prompt next to the parsed result.
print("Prompt:", prompt)
print("Structured Output:", structured_output)


Prompt: messages=[SystemMessage(content="You are an expert extraction algorithm. Only extract relevant information from text. If you do not know the value of an attribute asked to extract, return null for that attribute's value.", additional_kwargs={}, response_metadata={}), HumanMessage(content='Alan Smith is 6 feet tall and has blond hair.', additional_kwargs={}, response_metadata={})]
Structured Output: name='Alan Smith' hair_color=None height_in_meters=None


## Multiple Entities

In [14]:
# Container schema that lets a single extraction return several people.
# The docstring is part of the schema sent to the LLM, so it is left unchanged.
class Data(BaseModel):
    """ Extracted data about people """
    # One Person entry per person found; the list may be empty.
    people: List[Person]

In [15]:
# Text that mentions two people, one of them described only relative to the other.
text = "My name is Jeff, my hair is black and i am 6 feet tall. Anna has the same color hair as me."

# Re-bind the extractor to the list schema so more than one person can be returned.
structured_llm = llm.with_structured_output(Data)

prompt = prompt_template.invoke(input={"text": text})
structured_output = structured_llm.invoke(input=prompt)
print(structured_output)

people=[Person(name='Jeff', hair_color='black', height_in_meters=None), Person(name='Anna', hair_color='black', height_in_meters=None)]


## Tools Reference
`Structured output` often uses tool calling under-the-hood. This typically involves the generation of `AI Messages` containing tool calls, as well as `tool messages` containing the results of tool calls.  

Here we demonstrate how providing **few-shot** examples helps steer the LLM's behavior.

In [16]:
from langchain_core.utils.function_calling import tool_example_to_messages

# Few-shot examples as (input text, expected extraction) pairs:
# one text that mentions nobody and one that mentions a single person.
examples = [
    (
        "The ocean is vast and blue. It is more than 20,000 feet deep.",
        Data(people=[]),
    ),
    (
        "Fiona traveled far from France to Spain.",
        Data(people=[Person(name="Fiona", hair_color=None, height_in_meters=None)]),
    ),
]

# Flatten every example into the chat messages that demonstrate it
# (the input text, the tool call, and a closing AI reply).
messages = []
for example_text, expected in examples:
    # The closing AI reply depends on whether the example contains any people.
    closing_reply = "Detected People." if expected.people else "Detected no people."
    example_messages = tool_example_to_messages(
        example_text,
        [expected],
        ai_response=closing_reply,
    )
    messages.extend(example_messages)

  tool_example_to_messages(


In [17]:
# Print each few-shot message in its human-readable form.
for example_message in messages:
    example_message.pretty_print()


The ocean is vast and blue. It is more than 20,000 feet deep.
Tool Calls:
  Data (e00aeb52-bd1d-4a73-ac46-ea26a0b37369)
 Call ID: e00aeb52-bd1d-4a73-ac46-ea26a0b37369
  Args:
    people: []

You have correctly called this tool.

Detected no people.

Fiona traveled far from France to Spain.
Tool Calls:
  Data (563da6ee-6209-4f71-af69-b9e857884895)
 Call ID: 563da6ee-6209-4f71-af69-b9e857884895
  Args:
    people: [{'name': 'Fiona', 'hair_color': None, 'height_in_meters': None}]

You have correctly called this tool.

Detected People.


### Comparing performance with and without these messages

In [22]:
# Baseline: send the text on its own, without the few-shot example messages.
structured_llm = llm.with_structured_output(Data)

message_no_extraction = dict(
    role="user",
    content="The solar system is large, but earth has only 1 moon which has Neil Armstrong's Foot imprint.",
)

structured_llm.invoke([message_no_extraction])


Data(people=[Person(name='Neil Armstrong', hair_color=None, height_in_meters=None), Person(name='Buzz Aldrin', hair_color=None, height_in_meters=None)])

In [23]:
# Same text again, this time preceded by the few-shot example messages.
structured_llm.invoke([*messages, message_no_extraction])

Data(people=[Person(name='Neil Armstrong', hair_color=None, height_in_meters=None)])