## <b><font color='darkblue'>Preface</font></b>
([course link](https://learn.deeplearning.ai/courses/functions-tools-agents-langchain/lesson/5/tagging-and-extraction)) <b><font size='3ptx'>We could leverage Pydantic BaseModel to tell LLM on extract tags (e.g. name, age) from text.</font></b>

In [1]:
!pip freeze | grep -P '(openai|langchain)'

langchain==0.2.6
langchain-anthropic==0.1.15
langchain-community==0.2.6
langchain-core==0.2.10
langchain-experimental==0.0.62
langchain-google-genai==1.0.6
langchain-groq==0.1.3
langchain-openai==0.1.9
langchain-text-splitters==0.2.0
langchainhub==0.1.14
openai==1.28.1


In [55]:
import json
import os
import openai
import re
import httpx
import os
from dotenv import load_dotenv, find_dotenv
from typing import List, Optional
from pydantic import BaseModel, Field, TypeAdapter

import openai
from openai import OpenAI
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain_core.utils.function_calling import convert_to_openai_function
from langchain.utils.openai_functions import convert_pydantic_to_openai_function
from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser

a = load_dotenv(find_dotenv(os.path.expanduser('~/.env'))) # read local .env file
openai.api_key = os.environ['OPENAI_API_KEY']

## <b><font color='darkblue'>Tags & Extracts</font></b>

In [5]:
class Tagging(BaseModel):
    """Tag the piece of text with particular info."""
    sentiment: str = Field(description="sentiment of text, should be `pos`, `neg`, or `neutral`")
    language: str = Field(description="language of text (should be ISO 639-1 code)")

In [9]:
model = ChatOpenAI(temperature=0)
tagging_functions = [convert_to_openai_function(Tagging)]

In [10]:
prompt = ChatPromptTemplate.from_messages([
    ("system", "Think carefully, and then tag the text as instructed"),
    ("user", "{input}")
])

In [11]:
model_with_functions = model.bind(
    functions=tagging_functions,
    function_call={"name": "Tagging"}
)

In [12]:
tagging_chain = prompt | model_with_functions

In [13]:
tagging_chain.invoke({"input": "I love langchain"})

AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{"sentiment":"positive","language":"English"}', 'name': 'Tagging'}}, response_metadata={'token_usage': {'completion_tokens': 10, 'prompt_tokens': 75, 'total_tokens': 85}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-1894facf-303f-4d7d-b6f1-edaaa5ca2205-0', usage_metadata={'input_tokens': 75, 'output_tokens': 10, 'total_tokens': 85})

In [14]:
tagging_chain.invoke({"input": "non mi piace questo cibo"})

AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{"sentiment":"negative","language":"Italian"}', 'name': 'Tagging'}}, response_metadata={'token_usage': {'completion_tokens': 10, 'prompt_tokens': 78, 'total_tokens': 88}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-7b7f22f3-73c7-4181-83ea-c1cc4d730eab-0', usage_metadata={'input_tokens': 78, 'output_tokens': 10, 'total_tokens': 88})

### <b><font color='darkgreen'>Extraction</font></b>
Extraction is similar to tagging, but used for extracting multiple pieces of information.

In [27]:
class Person(BaseModel):
    """Information about a person."""
    name: str = Field(description="person's name")
    age: int = Field(description="person's age")

In [40]:
person_list_adapter = TypeAdapter(List[Person])

class Information(BaseModel):
    """Information to extract."""
    people: list[Person] = Field(description="List of info about people")

In [44]:
convert_pydantic_to_openai_function(Information)

  warn_deprecated(


{'name': 'Information',
 'description': 'Information to extract.',
 'parameters': {'$defs': {'Person': {'description': 'Information about a person.',
    'properties': {'name': {'description': "person's name", 'type': 'string'},
     'age': {'description': "person's age", 'type': 'integer'}},
    'required': ['name', 'age'],
    'type': 'object'}},
  'properties': {'people': {'description': 'List of info about people',
    'items': {'description': 'Information about a person.',
     'properties': {'name': {'description': "person's name", 'type': 'string'},
      'age': {'description': "person's age", 'type': 'integer'}},
     'required': ['name', 'age'],
     'type': 'object'},
    'type': 'array'}},
  'required': ['people'],
  'type': 'object'}}

In [46]:
extraction_functions = [convert_pydantic_to_openai_function(Information)]
extraction_model = model.bind(functions=extraction_functions, function_call={"name": "Information"})

In [47]:
extraction_model.invoke("Joe is 30, his mom is Martha")

AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{"people":[{"name":"Joe","age":30},{"name":"Martha","age":null}]}', 'name': 'Information'}}, response_metadata={'token_usage': {'completion_tokens': 21, 'prompt_tokens': 93, 'total_tokens': 114}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-fc8d1628-346b-4f6d-a55d-7c8c0d4c211e-0', usage_metadata={'input_tokens': 93, 'output_tokens': 21, 'total_tokens': 114})

In [48]:
prompt = ChatPromptTemplate.from_messages([
    ("system", "Extract the relevant information, if not explicitly provided do not guess. Extract partial info"),
    ("human", "{input}")
])

In [49]:
extraction_chain = prompt | extraction_model

In [50]:
extraction_chain.invoke({"input": "John is 30, his mom is Mary."})

AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{"people":[{"name":"John","age":30},{"name":"Mary"}]}', 'name': 'Information'}}, response_metadata={'token_usage': {'completion_tokens': 16, 'prompt_tokens': 111, 'total_tokens': 127}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-4c8d1fa8-ead9-4996-bfc4-a5c6c6382d58-0', usage_metadata={'input_tokens': 111, 'output_tokens': 16, 'total_tokens': 127})

In [53]:
extraction_chain = prompt | extraction_model | JsonOutputFunctionsParser()

In [54]:
extraction_chain.invoke({"input": "John is 30, his mom is Mary."})

{'people': [{'name': 'John', 'age': 30}, {'name': 'Mary'}]}

In [56]:
extraction_chain = prompt | extraction_model | JsonKeyOutputFunctionsParser(key_name="people")

In [57]:
extraction_chain.invoke({"input": "John is 30, his mom is Mary."})

[{'name': 'John', 'age': 30}, {'name': 'Mary'}]

### <b><font color='darkgreen'>Doing it for real</font></b>
<b><font size='3ptx'>We can apply tagging to a larger body of text.</font></b>

For example, let's load this blog post and extract tag information from a sub-set of the text.

In [87]:
import os
from langchain.document_loaders import WebBaseLoader
from langchain.schema.runnable import RunnableLambda
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [60]:
loader = WebBaseLoader("https://lilianweng.github.io/posts/2023-06-23-agent/")
documents = loader.load()

In [61]:
doc = documents[0]
page_content = doc.page_content[:10000]

In [83]:
def print_lines(content: str):
    for line in content.split('\n'):
        line = line.strip()
        if not line:
            continue
        print(line)

In [84]:
print_lines(page_content[:1000])

LLM Powered Autonomous Agents | Lil'Log
Lil'Log
Posts
Archive
Search
Tags
FAQ
emojisearch.app
LLM Powered Autonomous Agents
Date: June 23, 2023  |  Estimated Reading Time: 31 min  |  Author: Lilian Weng
Table of Contents
Agent System Overview
Component One: Planning
Task Decomposition
Self-Reflection
Component Two: Memory
Types of Memory
Maximum Inner Product Search (MIPS)
Component Three: Tool Use
Case Studies
Scientific Discovery Agent
Generative Agents Simulation
Proof-of-Concept Examples
Challenges
Citation
References
Building agents with LLM (large language model) as its core controller is a cool concept. Several proof-of-concepts demos, such as AutoGPT, GPT-Engineer and BabyAGI, serve as inspiring examples. The potentiality of LLM extends beyond generating well-written copies, stories, essays and programs; it can be framed as a powerful general


In [64]:
class Overview(BaseModel):
    """Overview of a section of text."""
    summary: str = Field(description="Provide a concise summary of the content.")
    language: str = Field(description="Provide the language that the content is written in.")
    keywords: str = Field(description="Provide keywords related to the content.")

In [68]:
overview_tagging_function = [convert_pydantic_to_openai_function(Overview)]

tagging_model = model.bind(
    functions=overview_tagging_function,
    function_call={"name":"Overview"})
tagging_chain = prompt | tagging_model | JsonOutputFunctionsParser()

In [66]:
tagging_chain.invoke({"input": page_content})

{'summary': 'This article discusses building autonomous agents powered by LLM (large language model) as the core controller. It covers components like planning, memory, and tool use, along with challenges and references.',
 'language': 'English',
 'keywords': 'LLM, autonomous agents, planning, memory, tool use, challenges, references'}

In [67]:
class Paper(BaseModel):
    """Information about papers mentioned."""
    title: str
    author: Optional[str]


class Info(BaseModel):
    """Information to extract"""
    papers: List[Paper]

In [69]:
paper_extraction_function = [
    convert_pydantic_to_openai_function(Info)
]
extraction_model = model.bind(
    functions=paper_extraction_function, 
    function_call={"name":"Info"}
)
extraction_chain = prompt | extraction_model | JsonKeyOutputFunctionsParser(key_name="papers")

In [70]:
extraction_chain.invoke({"input": page_content})

[{'title': 'LLM Powered Autonomous Agents', 'author': 'Lilian Weng'}]

In [71]:
template = """A article will be passed to you. Extract from it all papers that are mentioned by this article. 

Do not extract the name of the article itself. If no papers are mentioned that's fine - you don't need to extract any! Just return an empty list.

Do not make up or guess ANY extra information. Only extract what exactly is in the text."""

prompt = ChatPromptTemplate.from_messages([
    ("system", template),
    ("human", "{input}")
])

In [72]:
extraction_chain = prompt | extraction_model | JsonKeyOutputFunctionsParser(key_name="papers")

In [73]:
extraction_chain.invoke({"input": page_content})

[{'title': 'Chain of thought (CoT; Wei et al. 2022)', 'author': None},
 {'title': 'Tree of Thoughts (Yao et al. 2023)', 'author': None},
 {'title': 'LLM+P (Liu et al. 2023)', 'author': None},
 {'title': 'ReAct (Yao et al. 2023)', 'author': None},
 {'title': 'Reflexion (Shinn & Labash 2023)', 'author': None},
 {'title': 'Chain of Hindsight (CoH; Liu et al. 2023)', 'author': None},
 {'title': 'Algorithm Distillation (AD; Laskin et al. 2023)', 'author': None}]

In [74]:
extraction_chain.invoke({"input": "hi"})

[]

In [76]:
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=0)

In [77]:
splits = text_splitter.split_text(doc.page_content)

In [78]:
len(splits)

15

In [79]:
def flatten(matrix):
    flat_list = []
    for row in matrix:
        flat_list += row
    return flat_list

In [80]:
flatten([[1, 2], [3, 4]])

[1, 2, 3, 4]

In [86]:
print_lines(splits[0])

LLM Powered Autonomous Agents | Lil'Log
Lil'Log
Posts
Archive
Search
Tags
FAQ
emojisearch.app
LLM Powered Autonomous Agents
Date: June 23, 2023  |  Estimated Reading Time: 31 min  |  Author: Lilian Weng
Table of Contents
Agent System Overview
Component One: Planning
Task Decomposition
Self-Reflection
Component Two: Memory
Types of Memory
Maximum Inner Product Search (MIPS)
Component Three: Tool Use
Case Studies
Scientific Discovery Agent
Generative Agents Simulation
Proof-of-Concept Examples
Challenges
Citation
References
Building agents with LLM (large language model) as its core controller is a cool concept. Several proof-of-concepts demos, such as AutoGPT, GPT-Engineer and BabyAGI, serve as inspiring examples. The potentiality of LLM extends beyond generating well-written copies, stories, essays and programs; it can be framed as a powerful general problem solver.
Agent System Overview#
In a LLM-powered autonomous agent system, LLM functions as the agent’s brain, complemented by seve

In [88]:
prep = RunnableLambda(
    lambda x: [{"input": doc} for doc in text_splitter.split_text(x)]
)

In [89]:
prep.invoke("hi")

[{'input': 'hi'}]

In [90]:
chain = prep | extraction_chain.map() | flatten

In [91]:
chain.invoke(doc.page_content)

[{'title': 'AutoGPT', 'author': None},
 {'title': 'GPT-Engineer', 'author': None},
 {'title': 'BabyAGI', 'author': None},
 {'title': 'Chain of thought', 'author': 'Wei et al. 2022'},
 {'title': 'Tree of Thoughts', 'author': 'Yao et al. 2023'},
 {'title': 'LLM+P', 'author': 'Liu et al. 2023'},
 {'title': 'ReAct', 'author': 'Yao et al. 2023'},
 {'title': 'Reflexion', 'author': 'Shinn & Labash 2023'},
 {'title': 'Chain of Hindsight', 'author': 'Liu et al. 2023'},
 {'title': 'Algorithm Distillation', 'author': 'Laskin et al. 2023'},
 {'title': 'Laskin et al. 2023', 'author': None},
 {'title': 'Miller 1956', 'author': None},
 {'title': 'Duan et al. 2017', 'author': None},
 {'title': 'Google Blog', 'author': None},
 {'title': 'MRKL (Karpas et al. 2022)', 'author': None},
 {'title': 'TALM (Tool Augmented Language Models; Parisi et al. 2022)',
  'author': None},
 {'title': 'Toolformer (Schick et al. 2023)', 'author': None},
 {'title': 'HuggingGPT (Shen et al. 2023)', 'author': None},
 {'title'

## <b><font color='darkblue'>Supplement</font></b>
* [Deeplearning.ai - Functions, Tools and Agents with LangChain ch3: LangChain Expression Language (LCEL)](https://learn.deeplearning.ai/courses/functions-tools-agents-langchain/lesson/3/langchain-expression-language-(lcel))
*  [Deeplearning.ai - Functions, Tools and Agents with LangChain ch4: OpenAI functioning call in LangChain](https://github.com/johnklee/ml_articles/blob/master/deeplearning_ai/functions_tools_agents_langchain/ch4_openai-function-calling-in-langchain.ipynb)
*  [Deeplearning.ai - Functions, Tools and Agents with LangChain ch5: Tagging and Extraction](https://github.com/johnklee/ml_articles/blob/master/deeplearning_ai/functions_tools_agents_langchain/ch5_tagging_and_extraction.ipynb)
*  [Deeplearning.ai - Functions, Tools and Agents with LangChain ch6: Tooling and Routing](https://learn.deeplearning.ai/courses/functions-tools-agents-langchain/lesson/6/tools-and-routing)