# Tagging and Extraction Using OpenAI functions


# Setup {.smaller}

In [None]:
import os
import openai

from dotenv import load_dotenv, find_dotenv
# Locate a .env file and load it (python-dotenv); the return value is unused.
_ = load_dotenv(find_dotenv()) 
# Indexing os.environ raises KeyError when OPENAI_API_KEY is not set.
openai.api_key = os.environ['OPENAI_API_KEY']

In [None]:
from pydantic import BaseModel, Field
from typing import List
from typing import Optional

from langchain.utils.openai_functions import convert_pydantic_to_openai_function
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser
from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema.runnable import RunnableLambda

# Tagging

## Create tagging class

In [None]:
class Tagging(BaseModel):
    """Tag the piece of text with particular info."""
    # NOTE: the docstring above and the Field descriptions below are runtime
    # data: convert_pydantic_to_openai_function() copies them verbatim into
    # the `description` entries of the generated function schema.
    # Plain string; the pos/neg/neutral labels are only requested through the
    # description, not enforced by the type.
    sentiment: str = Field(description="sentiment of text, should be `pos`, `neg`, or `neutral`")
    # Plain string; the ISO 639-1 format is likewise only requested in the description.
    language: str = Field(description="language of text (should be ISO 639-1 code)")

## Take a look at the class {.smaller}

In [None]:
convert_pydantic_to_openai_function(Tagging)

{'name': 'Tagging',
 'description': 'Tag the piece of text with particular info.',
 'parameters': {'title': 'Tagging',
  'description': 'Tag the piece of text with particular info.',
  'type': 'object',
  'properties': {'sentiment': {'title': 'Sentiment',
    'description': 'sentiment of text, should be `pos`, `neg`, or `neutral`',
    'type': 'string'},
   'language': {'title': 'Language',
    'description': 'language of text (should be ISO 639-1 code)',
    'type': 'string'}},
  'required': ['sentiment', 'language']}}


## Create model, tagging function and prompt {.smaller}

In [None]:
model = ChatOpenAI(temperature=0)

In [None]:
tagging_functions = [convert_pydantic_to_openai_function(Tagging)]

In [None]:
# Two-message chat prompt: a fixed system instruction followed by the user
# message, whose text comes from the `input` key passed to invoke().
prompt = ChatPromptTemplate.from_messages([
    ("system", "Think carefully, and then tag the text as instructed"),
    ("user", "{input}")
])

## Bind model to tagging function and create chain

We force the model to use the tagging functions

In [None]:
# Attach the tagging function schema to the model and name it in
# `function_call`, so the model is forced to call `Tagging`.
model_with_functions = model.bind(
    functions=tagging_functions,
    function_call={"name": "Tagging"}
)

In [None]:
tagging_chain = prompt | model_with_functions

## Call the function with example 1

In [None]:
tagging_chain.invoke({"input": "I like the book Sapiens"})

- AIMessage(content='', additional_kwargs={'function_call': {'name': 'Tagging', 'arguments': '{\n  "sentiment": "pos",\n  "language": "en"\n}'}})

## Call the function with example 2


In [None]:
tagging_chain.invoke({"input": "Das 'Buch Eine Anleitung zum guten Leben: Wie Sie die alte Kunst des Stoizismus' ist sehr lesenswert"})

- AIMessage(content='', additional_kwargs={'function_call': {'name': 'Tagging', 'arguments': '{\n  "sentiment": "pos",\n  "language": "de"\n}'}})


## Use output parser {.smaller}

- Obtain a cleaner result with `JsonOutputFunctionsParser()`



In [None]:
tagging_chain = prompt | model_with_functions | JsonOutputFunctionsParser()

In [None]:
tagging_chain.invoke({"input": "Das 'Buch Eine Anleitung zum guten Leben: Wie Sie die alte Kunst des Stoizismus' ist sehr lesenswert"})

- {'sentiment': 'pos', 'language': 'de'}

# Extraction

Extraction is similar to tagging, but used for extracting multiple pieces of information.

## Define class {.smaller}

In [None]:
class Person(BaseModel):
    """Information about a person."""
    # NOTE: the docstring and Field descriptions are copied into the generated
    # function schema, so they are left unchanged.
    name: str = Field(description="person's name")
    # Explicit `None` default: keeps `age` out of the schema's `required` list
    # on every pydantic version. Without it, pydantic v1 silently defaults an
    # Optional field to None, while pydantic v2 treats it as required.
    age: Optional[int] = Field(default=None, description="person's age")

In [None]:
class Information(BaseModel):
    """Information to extract."""
    # Wraps the extracted Person entries under a single `people` key; the
    # docstring and description are copied into the generated function schema.
    people: List[Person] = Field(description="List of info about people")

## Convert Pydantic to OpenAI function {.smaller}

In [None]:
convert_pydantic_to_openai_function(Information)

{'name': 'Information',
 'description': 'Information to extract.',
 'parameters': {'title': 'Information',
  'description': 'Information to extract.',
  'type': 'object',
  'properties': {'people': {'title': 'People',
    'description': 'List of info about people',
    'type': 'array',
    'items': {'title': 'Person',
     'description': 'Information about a person.',
     'type': 'object',
     'properties': {'name': {'title': 'Name',
       'description': "person's name",
       'type': 'string'},
      'age': {'title': 'Age',
       'description': "person's age",
       'type': 'integer'}},
     'required': ['name']}}},
  'required': ['people']}}


## Set up model

In [None]:
# Function schema list built from the Information model.
extraction_functions = [convert_pydantic_to_openai_function(Information)]

# Bind the schema and name it in `function_call` so the model always calls `Information`.
extraction_model = model.bind(functions=extraction_functions, function_call={"name": "Information"})

## Test model

In [None]:
extraction_model.invoke("Joe is 30, his mom is Martha")

- AIMessage(content='', additional_kwargs={'function_call': {'name': 'Information', 'arguments': '{\n  "people": [\n    {\n      "name": "Joe",\n      "age": 30\n    },\n    {\n      "name": "Martha",\n      "age": 0\n    }\n  ]\n}'}})

- The model fills in an age of 0 when no age is provided


## Update prompt {.smaller}

In [None]:
# Rebinds the global `prompt`: any chain built from `prompt` after this point
# uses this extraction instruction. The system message tells the model not to
# guess values that are not explicitly given in the text.
prompt = ChatPromptTemplate.from_messages([
    ("system", "Extract the relevant information, if not explicitly provided do not guess. Extract partial info"),
    ("human", "{input}")
])

In [None]:
extraction_chain = prompt | extraction_model

In [None]:
extraction_chain.invoke({"input": "Joe is 30, his mom is Martha"})

- AIMessage(content='', additional_kwargs={'function_call': {'name': 'Information', 'arguments': '{\n  "people": [\n    {\n      "name": "Joe",\n      "age": 30\n    },\n    {\n      "name": "Martha"\n    }\n  ]\n}'}})


## Parse output


In [None]:
extraction_chain = prompt | extraction_model | JsonOutputFunctionsParser()

In [None]:
extraction_chain.invoke({"input": "Joe is 30, his mom is Martha"})

- {'people': [{'name': 'Joe', 'age': 30}, {'name': 'Martha'}]}


## Use different output parser

- Use `JsonKeyOutputFunctionsParser()` to extract only the relevant info



In [None]:
extraction_chain = prompt | extraction_model | JsonKeyOutputFunctionsParser(key_name="people")

In [None]:
extraction_chain.invoke({"input": "Joe is 30, his mom is Martha"})

- [{'name': 'Joe', 'age': 30}, {'name': 'Martha'}]

# Blog post example

We can apply tagging and extraction to a larger body of text.



## Load document

In [None]:
loader = WebBaseLoader("https://lilianweng.github.io/posts/2023-06-23-agent/")

documents = loader.load()

In [None]:
doc = documents[0]

## Inspect content {.smaller}


In [None]:
page_content = doc.page_content[:10000]

In [None]:
print(page_content[:1000])



```markdown
LLM Powered Autonomous Agents | Lil'Log

Lil'Log

Posts

Archive

Search

Tags

FAQ

emojisearch.app

      LLM Powered Autonomous Agents
    
June 23, 2023 · 31 min · Lilian Weng

Table of Contents
Agent System Overview
Component One: Planning
Task Decomposition
Self-Reflection
Component Two: Memory
Types of Memory
Maximum Inner Product Search (MIPS)
Component Three: Tool Use
Case Studies
Scientific Discovery Agent
Generative Agents Simulation
Proof-of-Concept Examples
Challenges
Citation
References

Building agents with LLM (large language model) as its core controller is a cool concept. Several proof-of-concepts demos, such as AutoGPT, GPT-Engineer and BabyAGI, serve as inspiring examples. The potentiality of LLM extends beyond generating well-written copies, stories, essays and programs; it can be framed as a powerful general problem solver.
Agent System Overview#
In
```

# Blog post tagging

## Create class to create article overview and tags


In [None]:
class Overview(BaseModel):
    """Overview of a section of text."""
    # NOTE: the docstring and Field descriptions are copied into the generated
    # function schema, so they act as instructions to the model.
    summary: str = Field(description="Provide a concise summary of the content.")
    # Free-form string; no language-code format is requested here.
    language: str = Field(description="Provide the language that the content is written in.")
    # Declared as a single string rather than a list of strings.
    keywords: str = Field(description="Provide keywords related to the content.")

## Setup the chain

In [None]:
# Function schema list built from the Overview model.
overview_tagging_function = [
    convert_pydantic_to_openai_function(Overview)
]

# Force the model to call `Overview` on every request.
tagging_model = model.bind(
    functions=overview_tagging_function,
    function_call={"name":"Overview"}
)

# Rebinds `tagging_chain` from the earlier Tagging example.
# NOTE(review): `prompt` is whatever was bound last; at this point in the file
# that is the extraction prompt ("Extract the relevant information, ..."),
# not the original tagging prompt. Confirm this reuse is intended.
tagging_chain = prompt | tagging_model | JsonOutputFunctionsParser()

## Invoke chain

In [None]:
tagging_chain.invoke({"input": page_content})

- {'summary': 'This article discusses the concept of building autonomous agents powered by LLM (large language model) as their core controller. It explores the key components of such agents, including planning, memory, and tool use. It also covers various techniques for task decomposition and self-reflection in autonomous agents.',
 'language': 'English',
 'keywords': 'LLM, autonomous agents, planning, memory, tool use, task decomposition, self-reflection'}

# Blog post extraction

## Define class to extract papers

In [None]:
class Paper(BaseModel):
    """Information about papers mentioned."""
    # NOTE: the docstring is copied into the generated function schema, so it
    # is left unchanged.
    title: str
    # Explicit `None` default: keeps `author` optional on every pydantic
    # version. Without it, pydantic v1 silently defaults an Optional field to
    # None, while pydantic v2 treats it as required.
    author: Optional[str] = None


class Info(BaseModel):
    """Information to extract"""
    # Wraps the extracted Paper entries under a single `papers` key.
    papers: List[Paper]

## Setup extraction chain

In [None]:
# Function schema list built from the Info model.
paper_extraction_function = [
    convert_pydantic_to_openai_function(Info)
]

# Rebinds `extraction_model` (previously bound to the Information schema) so
# the model is now forced to call `Info`.
extraction_model = model.bind(
    functions=paper_extraction_function, 
    function_call={"name":"Info"}
)

# Rebinds `extraction_chain`; the key parser returns only the value stored
# under "papers" in the function-call arguments.
extraction_chain = prompt | extraction_model | JsonKeyOutputFunctionsParser(key_name="papers")

## Invoke chain

In [None]:
extraction_chain.invoke({"input": page_content})

- [{'title': 'LLM Powered Autonomous Agents', 'author': 'Lilian Weng'}]


## Update system message {.smaller}


In [None]:
# System message for paper extraction. It tells the model to skip the
# article's own title, to return an empty list when no papers are mentioned,
# and not to invent information. The text is sent to the model as-is.
template = """A article will be passed to you. Extract from it all papers that are mentioned by this article. 

Do not extract the name of the article itself. If no papers are mentioned that's fine - you don't need to extract any! Just return an empty list.

Do not make up or guess ANY extra information. Only extract what exactly is in the text."""

# Rebinds the global `prompt`; chains must be rebuilt to pick up the new system message.
prompt = ChatPromptTemplate.from_messages([
    ("system", template),
    ("human", "{input}")
])

## Set up chain

In [None]:
extraction_chain = prompt | extraction_model | JsonKeyOutputFunctionsParser(key_name="papers")

## Invoke chain


In [None]:
extraction_chain.invoke({"input": page_content})

- [{'title': 'Chain of thought (CoT; Wei et al. 2022)', 'author': 'Wei et al.'},
 {'title': 'Tree of Thoughts (Yao et al. 2023)', 'author': 'Yao et al.'},
 {'title': 'LLM+P (Liu et al. 2023)', 'author': 'Liu et al.'},
 {'title': 'ReAct (Yao et al. 2023)', 'author': 'Yao et al.'},
 {'title': 'Reflexion (Shinn & Labash 2023)', 'author': 'Shinn & Labash'},
 {'title': 'Chain of Hindsight (CoH; Liu et al. 2023)',
  'author': 'Liu et al.'},
 {'title': 'Algorithm Distillation (AD; Laskin et al. 2023)',
  'author': 'Laskin et al.'}]


## Test chain

In [None]:
extraction_chain.invoke({"input": "hi"})

- []

# Extraction for the complete blog post


## Split the text

In [None]:
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=0)

In [None]:
splits = text_splitter.split_text(doc.page_content)

In [None]:
len(splits)

- 14

## Create function to join the lists

In [None]:
def flatten(matrix):
    """Concatenate the rows of ``matrix`` into one flat list.

    Exactly one level of nesting is removed. Rows are visited in order and
    the items of each row keep their relative order.
    """
    return [item for row in matrix for item in row]

- Test the function



In [None]:
flatten([[1, 2], [3, 4]])

- [1, 2, 3, 4]

## Take a look at the splits

- The splits are just text. 
- We need to convert them to a dictionary where the text is the input key.



In [None]:
print(splits[0])

## Use RunnableLambda to create function {.smaller}

In [None]:
# Runnable that splits raw text with `text_splitter` and wraps every chunk as
# {"input": chunk}, the dictionary shape passed to the chains' invoke().
prep = RunnableLambda(
    lambda text: [{"input": chunk} for chunk in text_splitter.split_text(text)]
)

- Test function



In [None]:
prep.invoke("hi")

- [{'input': 'hi'}]


## Create chain

In [None]:
chain = prep | extraction_chain.map() | flatten

- `extraction_chain` operates over a single element

- However, we have a list of elements

- Therefore, we call `.map()`


## Invoke chain {.smaller}

In [None]:
chain.invoke(doc.page_content)