Extract Structured data from unstructured text

Tagging
In tagging we pass some unstructured piece of text along with some structured description and then we use the llm to create a structurs piece of text

Tagging and Extraction using the openAI functions

In [1]:
import os
import openai
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file
openai.api_key = os.environ['OPENAI_API_KEY']

In [2]:
from typing import List
from pydantic import BaseModel, Field
from langchain.utils.openai_functions import convert_pydantic_to_openai_function

In [3]:
class Tagging(BaseModel):
    """Tag the piece of the text with particular info"""
    sentiment : str = Field(description = "sentiment of the text, it can be positive, negative nor neutral"),
    language : str = Field(description = "language of text (should be ISO 639-1 code)")

In [4]:
convert_pydantic_to_openai_function(Tagging)

  convert_pydantic_to_openai_function(Tagging)


{'name': 'Tagging',
 'description': 'Tag the piece of the text with particular info',
 'parameters': {'properties': {'sentiment': {'type': 'string'},
   'language': {'description': 'language of text (should be ISO 639-1 code)',
    'type': 'string'}},
  'required': ['language'],
  'type': 'object'}}

In [5]:
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI

In [8]:
# by using the temperature we determine the behavouir of the model
# temperature = 0 means that the model is deterministic and focused
# while the Higher Temperatures (>0, e.g., 0.7): Increases randomness and creativity. The model will explore less likely possibilities, resulting in more varied and creative responses.
model = ChatOpenAI(temperature = 0) #the range of the temperature of the model is within the randge of 0 and 2

  model = ChatOpenAI(temperature = 0)


In [9]:
tagging_fucntions = [convert_pydantic_to_openai_function(Tagging)]



In [10]:
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "Think carefully and then tag the text as instructed"),
        ("user","{input}")
    ]
)

In [13]:
model_with_functions = model.bind(functions = tagging_fucntions, function_call = {"name":"Tagging"})

In [14]:
tagging_chain = prompt | model_with_functions

In [15]:
tagging_chain.invoke("I love langchain")

AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{"sentiment":"positive","language":"en"}', 'name': 'Tagging'}}, response_metadata={'token_usage': {'completion_tokens': 11, 'prompt_tokens': 91, 'total_tokens': 102, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-dd3ed4de-26c8-494f-a55c-1d65e04c7032-0')

In [17]:
tagging_chain.invoke("Anand do not study")

AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{"sentiment":"negative","language":"en"}', 'name': 'Tagging'}}, response_metadata={'token_usage': {'completion_tokens': 11, 'prompt_tokens': 92, 'total_tokens': 103, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-b2e4fbba-c8d4-400a-bbe7-65e033e87e0f-0')

In [18]:
from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser #used to give the output as the json blob

In [19]:
tagging_chain = prompt | model_with_functions | JsonOutputFunctionsParser()

In [20]:
tagging_chain.invoke("anand do not study")

{'sentiment': 'negative', 'language': 'en'}

Extraction 
It is similat to tagging, but used for extracting the multiple pieces of the information

In [25]:
from typing import Optional
class Person(BaseModel):
    """Information about a person"""
    name : str = Field(description = "persons name ")
    age: Optional[int] = Field(description= "person's age")

In [26]:
class Information(BaseModel):
    """Information to extract"""
    people: List[Person] = Field(description= "List of info about the people")

In [27]:
convert_pydantic_to_openai_function(Information)

{'name': 'Information',
 'description': 'Information to extract',
 'parameters': {'properties': {'people': {'description': 'List of info about the people',
    'items': {'description': 'Information about a person',
     'properties': {'name': {'description': 'persons name ', 'type': 'string'},
      'age': {'anyOf': [{'type': 'integer'}, {'type': 'null'}],
       'description': "person's age"}},
     'required': ['name', 'age'],
     'type': 'object'},
    'type': 'array'}},
  'required': ['people'],
  'type': 'object'}}

In [29]:
extraction_functions = [convert_pydantic_to_openai_function(Information)]

In [38]:
extraction_model = model.bind(functions = extraction_functions, function_call ={"name":"Information"} )

In [39]:
extraction_model.invoke("Anand is 20")

AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{"people":[{"name":"Anand","age":20}]}', 'name': 'Information'}}, response_metadata={'token_usage': {'completion_tokens': 14, 'prompt_tokens': 91, 'total_tokens': 105, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-8032255e-f1c7-48f5-bbce-4acb7e3ae0cd-0')

In [40]:
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "extract the relevant information , if not expilicilty mentioned then assume it to be None"),
        ("human","{input}")
    ]
)

In [43]:
extraction_chain = prompt | extraction_model | JsonOutputFunctionsParser()

In [44]:
extraction_chain.invoke({"input":"Joe is 30, his mom is martha"})

{'people': [{'name': 'Joe', 'age': 30}, {'name': 'Martha', 'age': None}]}

Doing it for real 
we can apply tagging to a larger body of the text 

In [47]:
from langchain.document_loaders import WebBaseLoader

In [49]:
loader = WebBaseLoader("https://lilianweng.github.io/posts/2023-06-23-agent/")
documents = loader.load()

In [50]:
doc = documents[0]

In [51]:
page_content = doc.page_content[:10000]

In [52]:
print(page_content[:1000])







LLM Powered Autonomous Agents | Lil'Log







































Lil'Log

















|






Posts




Archive




Search




Tags




FAQ




emojisearch.app









      LLM Powered Autonomous Agents
    
Date: June 23, 2023  |  Estimated Reading Time: 31 min  |  Author: Lilian Weng


 


Table of Contents



Agent System Overview

Component One: Planning

Task Decomposition

Self-Reflection


Component Two: Memory

Types of Memory

Maximum Inner Product Search (MIPS)


Component Three: Tool Use

Case Studies

Scientific Discovery Agent

Generative Agents Simulation

Proof-of-Concept Examples


Challenges

Citation

References





Building agents with LLM (large language model) as its core controller is a cool concept. Several proof-of-concepts demos, such as AutoGPT, GPT-Engineer and BabyAGI, serve as inspiring examples. The potentiality of LLM extends beyond generating well-written copies, stories, essays and programs; it can be framed as a powerful gene

In [53]:
class Overview(BaseModel):
    """Overview of section of a text"""
    summary: str = Field(description= "Provide the summary of the given text")
    language: str = Field(description="Provid the language of the content provided")
    keywords: str = Field(description= "Provide keywords related to the article") 

In [55]:
overview_tagging_functions = [convert_pydantic_to_openai_function(Overview)]
tagging_model = model.bind(functions = overview_tagging_functions, function_call = {"name":"Overview"})

In [57]:
tagging_chain = prompt | tagging_model | JsonOutputFunctionsParser()

In [58]:
tagging_chain.invoke(page_content)

{'summary': 'This article discusses the concept of building autonomous agents powered by LLM (large language model) as the core controller. It explores the components of such agents, including planning, memory, and tool use. The article also delves into techniques like task decomposition, self-reflection, and challenges faced in developing LLM-powered autonomous agents.',
 'language': 'English',
 'keywords': 'LLM, autonomous agents, large language model, planning, memory, tool use, task decomposition, self-reflection, challenges'}

Extracting the papers mentioned in the blog 

In [59]:
class Paper(BaseModel):
    """Information about the papers mentioned"""
    title:str
    author: Optional[str]

In [60]:
class Info(BaseModel):
    """Information to extract"""
    papers: List[Paper]

In [61]:
paper_extraction_functions = [convert_pydantic_to_openai_function(Info)]
extraction_model = model.bind(functions  = paper_extraction_functions, function_call = {"name":"Info"})
extraction_chain = prompt | extraction_model | JsonOutputFunctionsParser()

In [62]:
extraction_chain.invoke(page_content)

{'papers': [{'title': 'LLM Powered Autonomous Agents',
   'author': 'Lilian Weng'}]}

A better prompt template

In [64]:
template = """A article will be passed to you. Extract from it all papers that are mentioned by this article follow by its author. 

Do not extract the name of the article itself. If no papers are mentioned that's fine - you don't need to extract any! Just return an empty list.

Do not make up or guess ANY extra information. Only extract what exactly is in the text."""

prompt = ChatPromptTemplate.from_messages([
    ("system", template),
    ("human", "{input}")
])

In [65]:
extraction_chain = prompt | extraction_model | JsonOutputFunctionsParser()

In [66]:
extraction_chain.invoke(page_content)

{'papers': [{'title': 'Chain of thought (CoT; Wei et al. 2022)',
   'author': 'Wei et al. 2022'},
  {'title': 'Tree of Thoughts (Yao et al. 2023)', 'author': 'Yao et al. 2023'},
  {'title': 'LLM+P (Liu et al. 2023)', 'author': 'Liu et al. 2023'},
  {'title': 'ReAct (Yao et al. 2023)', 'author': 'Yao et al. 2023'},
  {'title': 'Reflexion (Shinn & Labash 2023)',
   'author': 'Shinn & Labash 2023'},
  {'title': 'Chain of Hindsight (CoH; Liu et al. 2023)',
   'author': 'Liu et al. 2023'},
  {'title': 'Algorithm Distillation (AD; Laskin et al. 2023)',
   'author': 'Laskin et al. 2023'}]}