# Tagging and Extraction

Tagging and extraction are two of the most common use cases for OpenAI functions.

---

## Setup

In [52]:
import openai
import os
from dotenv import load_dotenv, find_dotenv

# Load the `.env` file located by `find_dotenv()` so its variables are
# available through `os.environ` below.
_ = load_dotenv(find_dotenv())
# Configure the `openai` module from environment variables. `os.environ.get`
# returns None for any variable that is not set, so nothing fails here if a
# value is missing.
openai.api_type = os.environ.get("OPENAI_API_TYPE")
openai.api_base = os.environ.get("OPENAI_API_BASE")
openai.api_key = os.environ.get("OPENAI_API_KEY")
openai.api_version = os.environ.get("OPENAI_API_VERSION")

# Deployment name handed to `AzureChatOpenAI(deployment_name=...)` later in
# this notebook.
deployment_id = "gpt40125"

## Tagging

<img src="../../images/tagging.png" alt="Tagging" style="width: 60%; height: auto;"/>

In [53]:
from pydantic import BaseModel, Field


# Schema for the tagging function call.
# NOTE: the class docstring and every `Field(description=...)` below end up in
# the function definition produced by `convert_pydantic_to_openai_function`
# (see the converted output that follows), so they act as instructions to the
# model. Editing that text changes what the model is asked to do.
class Tagging(BaseModel):
    """Tag the piece of text with particular info."""
    # Typed as a plain string: the allowed values are only described in the
    # text of the description, not enforced by the schema.
    sentiment: str = Field(description="sentiment of text, should be `pos`, `neg`, or `neutral`")
    # Likewise a free-form string; the ISO 639-1 format is requested via the
    # description only.
    language: str = Field(description="language of text (should be ISO 639-1 code)")

In [54]:
from langchain.utils.openai_functions import convert_pydantic_to_openai_function


convert_pydantic_to_openai_function(Tagging)

{'name': 'Tagging',
 'description': 'Tag the piece of text with particular info.',
 'parameters': {'title': 'Tagging',
  'description': 'Tag the piece of text with particular info.',
  'type': 'object',
  'properties': {'sentiment': {'title': 'Sentiment',
    'description': 'sentiment of text, should be `pos`, `neg`, or `neutral`',
    'type': 'string'},
   'language': {'title': 'Language',
    'description': 'language of text (should be ISO 639-1 code)',
    'type': 'string'}},
  'required': ['sentiment', 'language']}}

In [55]:
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import AzureChatOpenAI


# Chat model bound to the deployment configured in the setup cell, with
# temperature set to 0.
model = AzureChatOpenAI(deployment_name=deployment_id, temperature=0)
# Function definitions are passed to the model as a list, even when there is
# only one.
tagging_functions = [convert_pydantic_to_openai_function(Tagging)]
# Two-message prompt: a fixed system instruction plus the user text, which is
# filled in through the `{input}` placeholder at invoke time.
prompt = ChatPromptTemplate.from_messages([
    ("system", "Think carefully, and then tag the text as instructed"),
    ("user", "{input}")
])

In [56]:
# Attach the tagging function definition to every call made through this
# model. Naming a function in `function_call` forces the model to call it
# instead of answering with free text, which is why the responses below have
# empty `content` and a `function_call` entry.
model_with_functions = model.bind(
    functions=tagging_functions,
    function_call={"name": "Tagging"}
)

In [57]:
tagging_chain = prompt | model_with_functions

In [58]:
tagging_chain.invoke({"input": "I love langchain"})

AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{"sentiment":"pos","language":"en"}', 'name': 'Tagging'}})

In [59]:
tagging_chain.invoke({"input": "mujhe ye bilkul pasand nahi"})

AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{"sentiment":"neg","language":"hi"}', 'name': 'Tagging'}})

### Simplify the output

In [60]:
from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser


tagging_chain = prompt | model_with_functions | JsonOutputFunctionsParser()
tagging_chain.invoke({"input": "mujhe ye bilkul pasand nahi"})

{'sentiment': 'neg', 'language': 'hi'}

## Extraction

<img src="../../images/extraction.png" alt="Extraction" style="width: 60%; height: auto;"/>

In [61]:
from typing import Optional, List


# One extracted person. The docstring and the Field descriptions are emitted
# into the function schema sent to the model, so they are kept verbatim.
class Person(BaseModel):
    """Information about a person."""
    name: str = Field(description="person's name")
    # Explicit `default=None` keeps `age` optional regardless of pydantic
    # version: an `Optional[...]` annotation without a default is implicitly
    # optional only in pydantic v1 and becomes a required field in v2. The
    # generated schema still lists only `name` as required.
    age: Optional[int] = Field(default=None, description="person's age")


# Top-level schema for the extraction function call: a wrapper holding a list
# of `Person` entries, so one call can return any number of people. The
# docstring and Field description appear in the converted function definition
# shown in the next cell's output.
class Information(BaseModel):
    """Information to extract."""
    people: List[Person] = Field(description="List of info about people")

In [62]:
convert_pydantic_to_openai_function(Information)

{'name': 'Information',
 'description': 'Information to extract.',
 'parameters': {'title': 'Information',
  'description': 'Information to extract.',
  'type': 'object',
  'properties': {'people': {'title': 'People',
    'description': 'List of info about people',
    'type': 'array',
    'items': {'title': 'Person',
     'description': 'Information about a person.',
     'type': 'object',
     'properties': {'name': {'title': 'Name',
       'description': "person's name",
       'type': 'string'},
      'age': {'title': 'Age',
       'description': "person's age",
       'type': 'integer'}},
     'required': ['name']}}},
  'required': ['people']}}

In [63]:
# Convert the `Information` schema into a function definition and bind it to
# the model; naming it in `function_call` forces the model to respond with a
# call to `Information`.
extraction_functions = [convert_pydantic_to_openai_function(Information)]
extraction_model = model.bind(functions=extraction_functions, function_call={"name": "Information"})

In [64]:
extraction_model.invoke("Joe is 30, his mom is Martha")

AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{"people":[{"name":"Joe","age":30},{"name":"Martha"}]}', 'name': 'Information'}})

### Simplify the output

In [65]:
# NOTE: this rebinds the notebook-global `prompt` that the tagging cells
# defined earlier; chains already built keep the prompt object they were
# composed with.
prompt = ChatPromptTemplate.from_messages([
    ("system", "Think carefully, and then extract info as instructed."),
    ("user", "{input}")
])
# prompt -> model (forced `Information` call) -> parser that turns the
# function-call arguments into a Python dict (see the output below).
extraction_chain = prompt | extraction_model | JsonOutputFunctionsParser()

In [66]:
extraction_chain.invoke({"input": "Joe is 30, his mom is Martha"})

{'people': [{'name': 'Joe', 'age': 30}, {'name': 'Martha'}]}

In [20]:
from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser


extraction_chain = prompt | extraction_model | JsonKeyOutputFunctionsParser(key_name="people")
extraction_chain.invoke({"input": "Joe is 30, his mom is Martha"})

[{'name': 'Joe', 'age': 30}, {'name': 'Martha'}]

## Real-world use case

We can apply tagging to a larger body of text.

For example, let's load a blog post and extract tag information from a subset of its text.

### Load the document

In [21]:
from langchain.document_loaders import WebBaseLoader


loader = WebBaseLoader("https://lilianweng.github.io/posts/2023-06-23-agent/")
documents = loader.load()

In [22]:
# Work with the first document returned by the loader.
doc = documents[0]
# Keep only the first 10,000 characters of the page for the single-call
# examples below; the full text is processed later via `doc.page_content`.
page_content = doc.page_content[:10000]
# Preview the first 400 characters (displayed as the cell output).
page_content[:400]

"\n\n\n\n\n\nLLM Powered Autonomous Agents | Lil'Log\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nLil'Log\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nPosts\n\n\n\n\nArchive\n\n\n\n\nSearch\n\n\n\n\nTags\n\n\n\n\nFAQ\n\n\n\n\nemojisearch.app\n\n\n\n\n\n\n\n\n\n      LLM Powered Autonomous Agents\n    \nDate: June 23, 2023  |  Estimated Reading Time: 31 min  |  Author: Lilian Weng\n\n\n \n\n\nTable of Contents\n\n\n\nAgent System Overview\n\nComponent One: Planning\n\nTask Decomposit"

### Define tagging function for calling

In [23]:
# Schema for tagging a longer passage. As with the other schemas in this
# notebook, the docstring and Field descriptions are passed to the model as
# part of the function definition, so they double as instructions.
class Overview(BaseModel):
    """Overview of a section of text."""
    summary: str = Field(description="Provide a concise summary of the content.")
    language: str = Field(description="Provide the language that the content is written in.")
    # A single string rather than a list; the sample output further down shows
    # the keywords returned as one comma-separated string.
    keywords: str = Field(description="Provide keywords related to the content.")

In [24]:
# Function definition generated from the `Overview` schema, wrapped in a list
# because `functions=` takes a list of definitions.
overview_tagging_function = [
    convert_pydantic_to_openai_function(Overview)
]

# Bind the definition to the shared `model`; naming it in `function_call`
# forces the model to answer with an `Overview` call.
tagging_model = model.bind(
    functions=overview_tagging_function,
    function_call={"name":"Overview"}
)

# NOTE: `prompt` and `tagging_chain` rebind names used earlier in the notebook
# (the sentiment/language tagging chain). From this cell on they refer to the
# `Overview` pipeline.
prompt = ChatPromptTemplate.from_messages([
    ("system", "Think carefully, and then tag the text as instructed"),
    ("user", "{input}")
])
# The parser converts the function-call arguments into a Python dict.
tagging_chain = prompt | tagging_model | JsonOutputFunctionsParser()

In [25]:
tagging_chain.invoke({"input": page_content})

{'summary': 'This article discusses the concept of building autonomous agents powered by large language models (LLMs) and explores various components and methodologies involved in their development. It covers aspects such as planning, memory, tool use, and self-reflection, highlighting techniques like Task Decomposition, Memory Types, Maximum Inner Product Search (MIPS), and self-improvement strategies. The article also presents case studies and proof-of-concept examples to illustrate the potential applications of LLM-powered agents in fields like scientific discovery and generative simulations.',
 'language': 'English',
 'keywords': 'LLM, autonomous agents, planning, memory, tool use, self-reflection, Task Decomposition, Maximum Inner Product Search, MIPS, scientific discovery, generative simulations'}

### Define extraction function for calling

In [33]:
# One paper reference found in the text. The docstring is emitted into the
# function schema sent to the model, so it is kept verbatim.
class Paper(BaseModel):
    """Information about papers mentioned."""
    title: str
    # Explicit `= None` keeps `author` optional regardless of pydantic
    # version: a bare `Optional[str]` annotation is implicitly optional only
    # in pydantic v1 and becomes a required field in v2. Results in this
    # notebook may legitimately omit the author.
    author: Optional[str] = None


# Top-level schema for paper extraction: a wrapper holding a list of `Paper`
# entries. The `papers` field name is also the key selected later by
# `JsonKeyOutputFunctionsParser(key_name="papers")`.
class Info(BaseModel):
    """Information to extract"""
    papers: List[Paper]

In [34]:
# Function definition generated from the `Info` schema, wrapped in a list
# because `functions=` takes a list of definitions.
paper_extraction_function = [
    convert_pydantic_to_openai_function(Info)
]

# NOTE: this rebinds `extraction_model`, which previously pointed at the
# people-extraction model bound to the `Information` function.
extraction_model = model.bind(
    functions=paper_extraction_function, 
    function_call={"name":"Info"}
)

# Generic extraction prompt. It gives the model no guidance for inputs that
# mention no papers; the cells below show the model inventing a paper for the
# input "hi" and then tighten the prompt.
prompt = ChatPromptTemplate.from_messages([
    ("system", "Think carefully, and then extract info as instructed"),
    ("user", "{input}")
])

# `key_name="papers"` makes the parser return only the list stored under the
# "papers" key of the function-call arguments, not the whole dict.
extraction_chain = prompt | extraction_model | JsonKeyOutputFunctionsParser(key_name="papers")

In [35]:
extraction_chain.invoke({"input": page_content})

[{'title': 'Chain of thought (CoT)', 'author': 'Wei et al. 2022'},
 {'title': 'Tree of Thoughts', 'author': 'Yao et al. 2023'},
 {'title': 'LLM+P', 'author': 'Liu et al. 2023'},
 {'title': 'ReAct', 'author': 'Yao et al. 2023'},
 {'title': 'Reflexion', 'author': 'Shinn & Labash 2023'},
 {'title': 'Chain of Hindsight (CoH)', 'author': 'Liu et al. 2023'},
 {'title': 'Algorithm Distillation (AD)', 'author': 'Laskin et al. 2023'}]

In [36]:
extraction_chain.invoke({"input": "hi"})

[{'title': 'The Impact of Artificial Intelligence on Modern Warfare'}]

Uh oh! The model made up a paper that is not in the input. Let's update the prompt to prevent that.

In [37]:
# System prompt for paper extraction. It tells the model to return an empty
# list when no papers are mentioned and not to invent information.
# The backslashes at line ends inside the string are line continuations: they
# suppress the newline at that point, so the text starts without a leading
# newline, ends without a trailing one, and the wrapped sentence stays on one
# line.
template = """\
An article will be passed to you. Extract from it all papers that are mentioned by this article.

Do not extract the name of the article itself. If no papers are mentioned that's fine - you don't \
need to extract any! Just return an empty list.

Do not make up or guess any extra information. Only extract what exactly is in the text.\
"""

# Rebinds `prompt`; `extraction_chain` must be rebuilt (next cell) for the new
# prompt to take effect.
prompt = ChatPromptTemplate.from_messages([
    ("system", template),
    ("human", "{input}")
])

In [38]:
extraction_chain = prompt | extraction_model | JsonKeyOutputFunctionsParser(key_name="papers")

In [39]:
extraction_chain.invoke({"input": page_content})

[{'title': 'Chain of thought (CoT)', 'author': 'Wei et al. 2022'},
 {'title': 'Tree of Thoughts', 'author': 'Yao et al. 2023'},
 {'title': 'LLM+P', 'author': 'Liu et al. 2023'},
 {'title': 'ReAct', 'author': 'Yao et al. 2023'},
 {'title': 'Reflexion', 'author': 'Shinn & Labash 2023'},
 {'title': 'Chain of Hindsight (CoH)', 'author': 'Liu et al. 2023'},
 {'title': 'Algorithm Distillation (AD)', 'author': 'Laskin et al. 2023'}]

In [40]:
extraction_chain.invoke({"input": "hi"})

[]

### Let's do it for the entire page

In [41]:
from langchain.text_splitter import RecursiveCharacterTextSplitter


text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=0)

In [42]:
splits = text_splitter.split_text(doc.page_content)
len(splits)

15

In [43]:
def flatten(matrix):
    """Concatenate the rows of ``matrix`` into a single new list.

    Args:
        matrix: An iterable of iterables, e.g. a list of lists.

    Returns:
        A new list holding every element of every row, in row order.
    """
    return [item for row in matrix for item in row]

In [44]:
flatten([[1, 2], [3, 4]])

[1, 2, 3, 4]

In [45]:
from langchain.schema.runnable import RunnableLambda


prep = RunnableLambda(
    # Split the incoming text into chunks and wrap each one in the
    # {"input": ...} dict shape that the prompt template expects.
    lambda text: [
        {"input": chunk} for chunk in text_splitter.split_text(text)
    ]
)

In [46]:
prep.invoke("hi")

[{'input': 'hi'}]

Since `prep` returns a list of inputs, we have to map our existing chain over them:

In [48]:
# Split the text into per-chunk inputs, run `extraction_chain` on each input
# via `.map()`, then merge the per-chunk result lists with `flatten`.
chain = prep | extraction_chain.map() | flatten

Here is what happens:
- `prep` splits the entire page content into 15 sections.
- Each of the 15 sections is passed to `extraction_chain`. Many of these calls run in parallel; by default, 5 calls are parallelized.
- Once all 15 calls are done, their results are passed to the `flatten` function, which merges them into a single list.

In [49]:
chain.invoke(doc.page_content)

[{'title': 'Chain of thought', 'author': 'Wei et al. 2022'},
 {'title': 'Tree of Thoughts', 'author': 'Yao et al. 2023'},
 {'title': 'LLM+P', 'author': 'Liu et al. 2023'},
 {'title': 'ReAct', 'author': 'Yao et al. 2023'},
 {'title': 'Reflexion', 'author': 'Shinn & Labash 2023'},
 {'title': 'Chain of Hindsight', 'author': 'Liu et al. 2023'},
 {'title': 'Algorithm Distillation', 'author': 'Laskin et al. 2023'},
 {'title': 'RL^2', 'author': 'Duan et al. 2017'},
 {'title': 'Modular Reasoning, Knowledge and Language',
  'author': 'Karpas et al. 2022'},
 {'title': 'Tool Augmented Language Models', 'author': 'Parisi et al. 2022'},
 {'title': 'Toolformer', 'author': 'Schick et al. 2023'},
 {'title': 'HuggingGPT', 'author': 'Shen et al. 2023'},
 {'title': 'API-Bank', 'author': 'Li et al. 2023'},
 {'title': 'ChemCrow', 'author': 'Bran et al. 2023'},
 {'title': 'LLM-empowered agents for scientific discovery',
  'author': 'Boiko et al. (2023)'},
 {'title': 'Generative Agents', 'author': 'Park, et 