# Tagging and Extraction Using OpenAI functions

In [1]:
from typing import List
from pydantic import BaseModel, Field
from langchain.utils.openai_functions import convert_pydantic_to_openai_function

In [2]:
class Tagging(BaseModel):
    """Tag the piece of text with particular info."""
    sentiment: str = Field(description="sentiment of text, should be `pos`, `neg`, or `neutral`")
    language: str = Field(description="language of text (should be ISO 639-1 code)")

In [3]:
convert_pydantic_to_openai_function(Tagging)

  warn_deprecated(


{'name': 'Tagging',
 'description': 'Tag the piece of text with particular info.',
 'parameters': {'properties': {'sentiment': {'description': 'sentiment of text, should be `pos`, `neg`, or `neutral`',
    'type': 'string'},
   'language': {'description': 'language of text (should be ISO 639-1 code)',
    'type': 'string'}},
  'required': ['sentiment', 'language'],
  'type': 'object'}}

In [4]:
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOllama

In [5]:
model = ChatOllama(model="zephyr:latest")

In [6]:
tagging_functions = [convert_pydantic_to_openai_function(Tagging)]

In [7]:
prompt = ChatPromptTemplate.from_messages([
    ("system", "Think carefully, and then tag the text as instructed"),
    ("user", "{input}")
])

In [8]:
model_with_functions = model.bind(
    functions=tagging_functions,
    function_call={"name": "Tagging"}
)

In [9]:
tagging_chain = prompt | model_with_functions

In [10]:
print(tagging_chain.invoke({"input": "I love langchain"}))

content='You can tag this text as "personal opinion" or "user statement". The choice depends on the context in which it is used. If it appears in a discussion about language processing libraries, then "personal opinion" would be appropriate. However, if it is part of a product review, then "user statement" might be more relevant. In either case, both tags are useful for categorizing and searching through content, making it easier to find similar opinions or statements in the future.' response_metadata={'model': 'zephyr:latest', 'created_at': '2024-06-02T13:36:43.9190875Z', 'message': {'role': 'assistant', 'content': ''}, 'done_reason': 'stop', 'done': True, 'total_duration': 54836821800, 'load_duration': 16509561800, 'prompt_eval_count': 46, 'prompt_eval_duration': 3291710000, 'eval_count': 98, 'eval_duration': 35012875000} id='run-189d7b9b-6329-4db0-adbd-ce4acebad876-0'


In [11]:
print(tagging_chain.invoke({"input": "non mi piace questo cibo"}))

content='Tag: personal preference (dislike)\nText: non mi piace questo cibo\nExplanation: "non mi piace questo cibo" translates to "I do not like this food." This is a statement of personal preference, indicating that the author does not enjoy the specific food being referred to.' response_metadata={'model': 'zephyr:latest', 'created_at': '2024-06-02T13:38:32.4257693Z', 'message': {'role': 'assistant', 'content': ''}, 'done_reason': 'stop', 'done': True, 'total_duration': 28915694600, 'load_duration': 4558400, 'prompt_eval_count': 19, 'prompt_eval_duration': 2494493000, 'eval_count': 68, 'eval_duration': 26413098000} id='run-432e3ca8-4c04-4d15-ab2e-09b447fe693f-0'


## Extraction

Extraction is similar to tagging, but used for extracting multiple pieces of information.

In [14]:
from typing import Optional
class Person(BaseModel):
    """Information about a person."""
    name: str = Field(description="person's name")
    age: Optional[int] = Field(description="person's age")

In [15]:
class Information(BaseModel):
    """Information to extract."""
    people: List[Person] = Field(description="List of info about people")

In [16]:
convert_pydantic_to_openai_function(Information)

{'name': 'Information',
 'description': 'Information to extract.',
 'parameters': {'$defs': {'Person': {'description': 'Information about a person.',
    'properties': {'name': {'description': "person's name", 'type': 'string'},
     'age': {'anyOf': [{'type': 'integer'}, {'type': 'null'}],
      'description': "person's age"}},
    'required': ['name', 'age'],
    'type': 'object'}},
  'properties': {'people': {'description': 'List of info about people',
    'items': {'description': 'Information about a person.',
     'properties': {'name': {'description': "person's name", 'type': 'string'},
      'age': {'anyOf': [{'type': 'integer'}, {'type': 'null'}],
       'description': "person's age"}},
     'required': ['name', 'age'],
     'type': 'object'},
    'type': 'array'}},
  'required': ['people'],
  'type': 'object'}}

In [17]:
extraction_functions = [convert_pydantic_to_openai_function(Information)]
extraction_model = model.bind(functions=extraction_functions, function_call={"name": "Information"})

In [18]:
extraction_model.invoke("Joe is 30, his mom is Martha")

AIMessage(content="And Joe's grandmother is Martha's mother, making Joe's grandmother one generation older than Joe's mother. Therefore, Joe's grandmother is also related to Joe, but in a different way - she is Joe's maternal grandmother. So Joe's relationship with his grandmother is that of a grandson, while his mother's relationship with her mother is that of a daughter.\n\nIn summary:\n- Joe's mother (Martha) is 30 years old\n- Martha's mother is Joe's grandmother\n- Joe's relationship with his grandmother is grandson, as his grandmother is one generation older than his mom and related to him through his mom (Joe's maternal grandmother)", response_metadata={'model': 'zephyr:latest', 'created_at': '2024-06-02T13:40:38.6059775Z', 'message': {'role': 'assistant', 'content': ''}, 'done_reason': 'stop', 'done': True, 'total_duration': 75602011100, 'load_duration': 5864600, 'prompt_eval_count': 26, 'prompt_eval_duration': 3079610000, 'eval_count': 155, 'eval_duration': 72512877000}, id='r

In [19]:
prompt = ChatPromptTemplate.from_messages([
    ("system", "Extract the relevant information, if not explicitly provided do not guess. Extract partial info"),
    ("human", "{input}")
])

In [20]:
extraction_chain = prompt | extraction_model

In [21]:
extraction_chain.invoke({"input": "Joe is 30, his mom is Martha"})

AIMessage(content="Relevant information: Joe's name is Joe and his mother's name is Martha. Both individuals have their own ages, but without further information it cannot be determined how old either of them are.", response_metadata={'model': 'zephyr:latest', 'created_at': '2024-06-02T13:41:03.628908Z', 'message': {'role': 'assistant', 'content': ''}, 'done_reason': 'stop', 'done': True, 'total_duration': 22840324500, 'load_duration': 4198400, 'prompt_eval_count': 53, 'prompt_eval_duration': 3639059000, 'eval_count': 43, 'eval_duration': 19188615000}, id='run-14d61d97-5992-4db9-bc95-fcdd82b74145-0')

In [33]:
class Overview(BaseModel):
    """Overview of a section of text."""
    summary: str = Field(description="Provide a concise summary of the content.")
    language: str = Field(description="Provide the language that the content is written in.")
    keywords: str = Field(description="Provide keywords related to the content.")

In [34]:
overview_tagging_function = [
    convert_pydantic_to_openai_function(Overview)
]
tagging_model = model.bind(
    functions=overview_tagging_function,
    function_call={"name":"Overview"}
)
tagging_chain = prompt | tagging_model | JsonOutputFunctionsParser()

In [35]:
tagging_chain.invoke({"input": page_content})

{'summary': 'This text discusses the concept of building autonomous agents powered by LLM (large language model) as the core controller. It covers components such as planning, memory, and tool use, along with examples and challenges in implementing LLM-powered agents.',
 'language': 'English',
 'keywords': 'LLM, autonomous agents, planning, memory, tool use, proof-of-concepts, challenges'}

In [36]:
class Paper(BaseModel):
    """Information about papers mentioned."""
    title: str
    author: Optional[str]


class Info(BaseModel):
    """Information to extract"""
    papers: List[Paper]

In [37]:
paper_extraction_function = [
    convert_pydantic_to_openai_function(Info)
]
extraction_model = model.bind(
    functions=paper_extraction_function, 
    function_call={"name":"Info"}
)
extraction_chain = prompt | extraction_model | JsonKeyOutputFunctionsParser(key_name="papers")

In [38]:
extraction_chain.invoke({"input": page_content})

[{'title': 'LLM Powered Autonomous Agents', 'author': 'Lilian Weng'}]

In [39]:
template = """A article will be passed to you. Extract from it all papers that are mentioned by this article. 

Do not extract the name of the article itself. If no papers are mentioned that's fine - you don't need to extract any! Just return an empty list.

Do not make up or guess ANY extra information. Only extract what exactly is in the text."""

prompt = ChatPromptTemplate.from_messages([
    ("system", template),
    ("human", "{input}")
])

In [40]:
extraction_chain = prompt | extraction_model | JsonKeyOutputFunctionsParser(key_name="papers")

In [41]:
extraction_chain.invoke({"input": page_content})

[{'title': 'Chain of thought (CoT; Wei et al. 2022)'},
 {'title': 'Tree of Thoughts (Yao et al. 2023)'},
 {'title': 'LLM+P (Liu et al. 2023)'},
 {'title': 'ReAct (Yao et al. 2023)'},
 {'title': 'Reflexion (Shinn & Labash 2023)'},
 {'title': 'Chain of Hindsight (CoH; Liu et al. 2023)'},
 {'title': 'Algorithm Distillation (AD; Laskin et al. 2023)'}]

In [42]:
extraction_chain.invoke({"input": "hi"})

[]