In [None]:
!pip install -qU langchain langchain-openai

# Build an Extraction Chain

## Setup

In [None]:
import getpass
import os


def _read_secret(env_var: str, prompt: str) -> str:
    """Return the value of ``env_var``; if it is unset or empty, prompt for it without echo."""
    return os.environ.get(env_var) or getpass.getpass(prompt)


# Secrets must never be written into the notebook source. A key that is already
# present in the environment is reused; otherwise the user is prompted once and
# the typed value is not echoed or stored in the notebook.
langchain_api_key = _read_secret('LANGCHAIN_API_KEY', 'LangChain API key: ')
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_API_KEY'] = langchain_api_key

openai_api_key = _read_secret('OPENAI_API_KEY', 'OpenAI API key: ')
os.environ['OPENAI_API_KEY'] = openai_api_key

## The Schema

First, we need to describe what information we want to extract from the text.

We will use Pydantic to define an example schema to extract personal information.

In [3]:
from typing import Optional
from pydantic import BaseModel, Field


class Person(BaseModel):
    """Information about a person"""

    # The doc-string above is not just documentation: it is sent to the LLM as
    # the description of the Person schema and can improve extraction results.
    #
    # Conventions followed by every field below:
    # * The type is Optional with a default of None, so the model is allowed to
    #   decline to extract a value.
    # * A `description` is supplied; the LLM reads it, so a good description
    #   can help improve extraction results.

    name: Optional[str] = Field(
        None, description='The name of the person'
    )
    hair_color: Optional[str] = Field(
        None, description="The color of the person's hair if known"
    )
    height_in_meters: Optional[str] = Field(
        None, description="The height measured in meters"
    )

When defining a schema:
1. Document the **attributes** and the **schema** itself: This information is sent to the LLM and is used to improve the quality of information extraction.
2. Do NOT force the LLM to make up information! Above we used `Optional` for the attributes allowing the LLM to output `None` if it does not know the answer.

## The Extractor

We can create an information extractor using the schema we defined above.

In [4]:
from typing import Optional
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from pydantic import BaseModel, Field


# The prompt carries the extraction instructions plus any additional context.
# It can be extended in two ways:
# 1) add examples to the prompt template to improve extraction quality;
# 2) introduce further parameters to take context into account (e.g. metadata
#    about the document from which the text was extracted).

prompt = ChatPromptTemplate.from_messages([
    (
        'system',
        'You are an expert extraction algorithm. '
        'Only extract relevant information from the text. '
        'If you do not know the value of an attribute asked to extract, '
        "return null for the attribute's value.",
    ),
    # Reference examples are disabled here; see the how-to about improving
    # performance with reference examples, then enable the placeholder below.
    # MessagesPlaceholder(variable_name='examples'),
    ('human', '{text}'),
])

We need a model that supports function/tool calling.

In [5]:
from langchain_openai import ChatOpenAI

# Chat model used for extraction, with temperature set to 0.
llm = ChatOpenAI(
    temperature=0,
    model='gpt-3.5-turbo',
)

# Chain: the prompt feeds the model, whose output is structured as a Person.
runnable = prompt | llm.with_structured_output(Person)

In [17]:
# Try the single-entity chain on one short sentence and display the result.
text = 'Haobo Sun is 6 feet tall. His hair color is blue.'
result = runnable.invoke(dict(text=text))
result

Person(name='Haobo Sun', hair_color='blue', height_in_meters='1.83')

In [18]:
# Read the extracted fields back as plain attributes of the Person instance.
(
    f"My name is {result.name}, "
    f"my hair color is {result.hair_color}, "
    f"and my height is {result.height_in_meters} meters."
)

'My name is Haobo Sun, my hair color is blue, and my height is 1.83 meters.'

## Multiple Entities

In most cases, we should be extracting a list of entities rather than a single entity.

This can be achieved using Pydantic by nesting models inside one another.

In [19]:
from typing import List, Optional
from pydantic import BaseModel, Field

class Person(BaseModel):
    """Information about a person"""

    # Same schema as in "The Schema" section, declared again in this cell
    # next to the Data model that nests it.
    #
    # The doc-string above is sent to the LLM as the description of the Person
    # schema and can improve extraction results.
    #
    # Conventions followed by every field below:
    # * The type is Optional with a default of None, so the model is allowed to
    #   decline to extract a value.
    # * A `description` is supplied; the LLM reads it, so a good description
    #   can help improve extraction results.

    name: Optional[str] = Field(
        None, description='The name of the person'
    )
    hair_color: Optional[str] = Field(
        None, description="The color of the person's hair if known"
    )
    height_in_meters: Optional[str] = Field(
        None, description="The height measured in meters"
    )


class Data(BaseModel):
    """Extracted data about people"""

    # Container model: nesting a list of Person entries lets a single
    # extraction call return several entities instead of just one.
    # The field has no default, so `people` is required.
    people: List[Person]

In [20]:
# Rebuild the chain so the model's output is structured as Data (a list of Person entries).
runnable = prompt | llm.with_structured_output(Data)

# Sample passage mentioning three people, some described relative to one another.
text = """
My name is Bin, my hair is black and I am 6 feet 3 inches tall.
Anna has the same color hair as me.
Haobo Sun has the same height as Bin but has blue hair.
"""

runnable.invoke(dict(text=text))

Data(people=[Person(name='Bin', hair_color='black', height_in_meters='1.91'), Person(name='Anna', hair_color='black', height_in_meters=None), Person(name='Haobo Sun', hair_color='blue', height_in_meters='1.91')])