In [30]:
# Load environment variables through python-dotenv before any API client is created.
# The recorded output `True` is the return value of load_dotenv().
import os
from dotenv import load_dotenv
load_dotenv()

True

In [31]:
import warnings

# Mirror the credentials into os.environ for the libraries used below.
# os.getenv() returns None for an unset variable, and assigning None into
# os.environ raises "TypeError: str expected, not NoneType". A missing
# variable is therefore reported with a warning instead of crashing the cell.
for env_name in ("OPENAI_API_KEY", "LANGCHAIN_API_KEY", "LANGCHAIN_PROJECT"):
    env_value = os.getenv(env_name)
    if env_value is None:
        warnings.warn(f"{env_name} is not set; check your environment or .env file.")
        continue
    os.environ[env_name] = env_value

## Langsmith Tracking and tracing
os.environ["LANGCHAIN_TRACING_V2"] = "true"

In [3]:
# OpenAI Tools
# These output parsers extract tool calls from OpenAI's function-calling API responses, so they are only usable
# with models that support function calling, and specifically the latest `tools` and `tool_choice` parameters.
from langchain_core.prompts import ChatPromptTemplate
# NOTE(review): this import triggers the deprecation warning recorded in this cell's output, which recommends
# `from pydantic import BaseModel` (or `from pydantic.v1 import BaseModel` for code not yet upgraded to pydantic 2).
from langchain_core.pydantic_v1 import BaseModel, Field, validator
from langchain_openai import ChatOpenAI


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  exec(code_obj, self.user_global_ns, self.user_ns)


In [4]:
# Tool schema passed to bind_tools(). The class docstring and the Field descriptions appear as the
# 'description' entries of the generated tool definition, so treat them as runtime data rather than plain docs.
class Joke(BaseModel):
    """Joke to tell user."""

    setup: str = Field(description="question to set up a joke")
    punchline: str = Field(description="answer to resolve the joke")

In [5]:
# Bind Joke as a tool on a temperature-0 gpt-3.5-turbo chat model.
model = ChatOpenAI(model="gpt-3.5-turbo", temperature=0).bind_tools([Joke])

In [6]:
# Inspect the tool definition stored on the bound model after bind_tools().
model.kwargs["tools"]

[{'type': 'function',
  'function': {'name': 'Joke',
   'description': 'Joke to tell user.',
   'parameters': {'type': 'object',
    'properties': {'setup': {'description': 'question to set up a joke',
      'type': 'string'},
     'punchline': {'description': 'answer to resolve the joke',
      'type': 'string'}},
    'required': ['setup', 'punchline']}}}]

In [7]:
# Chat prompt: a fixed system message followed by a user message filled in from the "input" variable.
prompt = ChatPromptTemplate.from_messages(
    [("system", "You are helpful assistant"), ("user", "{input}")]
)

In [None]:
from langchain.output_parsers.openai_tools import JsonOutputToolsParser
# Parse the tool calls in the model response into a list of dicts.
parser = JsonOutputToolsParser()
# We can also select a single tool by name using key_name.
# NOTE(review): JsonOutputKeyToolsParser is not imported anywhere in this notebook; import it before uncommenting.
#parser = JsonOutputKeyToolsParser(key_name="Joke")

# Certain models can return multiple tool invocations per call, so by default the output is a list. If we just want to return the first tool invocation, we can specify first_tool_only=True.
#parser = JsonOutputKeyToolsParser(key_name="Joke", first_tool_only=True)
# Pipeline: prompt -> tool-bound chat model -> tool-call parser.
chain = prompt | model | parser

In [9]:
# Run the chain; the recorded result is a list with one dict ('args' and 'type') per tool call.
chain.invoke({"input": "tell me a joke"})

[{'args': {'setup': "Why couldn't the bicycle stand up by itself?",
   'punchline': 'Because it was two tired!'},
  'type': 'Joke'}]

In [10]:
# PydanticToolsParser
# This builds on top of JsonOutputToolsParser but passes the results to a Pydantic model, which allows for further validation should you choose.
from langchain.output_parsers.openai_tools import PydanticToolsParser

In [11]:
class Joke(BaseModel):
    """Joke to tell user."""

    # NOTE: the docstring above and the Field descriptions below are used as
    # the tool's descriptions, so they are runtime data and must stay as-is.
    setup: str = Field(description="question to set up a joke")
    punchline: str = Field(description="answer to resolve the joke")

    # You can add custom validation logic easily with Pydantic.
    @validator("setup")
    def question_ends_with_question_mark(cls, field):
        """Reject a setup that does not end with a question mark.

        Args:
            field: Candidate value for ``setup``.

        Returns:
            The value unchanged when it ends with "?".

        Raises:
            ValueError: If the value is empty or does not end with "?".
        """
        # str.endswith() also handles the empty string. Indexing field[-1]
        # would raise IndexError there, which is not a ValueError and so would
        # escape as a raw exception rather than a validation failure.
        if not field.endswith("?"):
            raise ValueError("Badly formed question!")
        return field


# Parser that converts the model's tool calls into validated Joke instances.
parser = PydanticToolsParser(tools=[Joke])

In [12]:
# Re-bind the tools so the model uses the Joke class that carries the validator,
# then rebuild the chain so it ends in the Pydantic parser.
model = ChatOpenAI(model="gpt-3.5-turbo", temperature=0).bind_tools([Joke])
chain = prompt | model | parser

In [13]:
# Run the chain; the recorded result is a list of Joke instances instead of raw dicts.
chain.invoke({"input": "tell me a joke"})

[Joke(setup="Why couldn't the bicycle stand up by itself?", punchline='Because it was two tired!')]

Pandas DataFrame Parser

A Pandas DataFrame is a popular data structure in the Python programming language, commonly used for data manipulation and analysis. It provides a comprehensive set of tools for working with structured data, making it a versatile option for tasks such as data cleaning, transformation, and analysis.

This output parser lets you specify an arbitrary Pandas DataFrame and ask an LLM to query it. The LLM replies with a formatted query, which the parser uses to extract the requested data from the DataFrame and return it as a dictionary. Keep in mind that large language models are leaky abstractions! You'll have to use an LLM with sufficient capacity to generate a well-formed query as per the defined format instructions.

Use Pandas' DataFrame object to declare the DataFrame you wish to perform queries on.

In [15]:
import pprint
from typing import Any, Dict

import pandas as pd
from langchain.output_parsers import PandasDataFrameOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI

In [16]:
# Replace the tool-bound model with a plain temperature-0 chat model; no model name is passed, so the library default is used.
model = ChatOpenAI(temperature=0)

In [17]:
# Solely for documentation purposes.
def format_parser_output(parser_output: Dict[str, Any]) -> None:
    """Pretty-print a parser result, showing pandas objects as plain dicts.

    Args:
        parser_output: Mapping returned by the parser. Values that expose
            ``to_dict()`` (e.g. a pandas Series) are converted for display;
            any other value is printed unchanged.

    Returns:
        None. The formatted mapping is written to stdout.
    """
    # Build a fresh dict instead of overwriting the caller's values. Mutating
    # the argument made a second call on the same result fail, because the
    # already-converted entries no longer have ``to_dict``.
    printable = {
        key: value.to_dict() if hasattr(value, "to_dict") else value
        for key, value in parser_output.items()
    }
    pprint.PrettyPrinter(width=4, compact=True).pprint(printable)

In [18]:
# Sample DataFrame that the parser will run queries against.
df = pd.DataFrame(
    dict(
        num_legs=[2, 4, 8, 0],
        num_wings=[2, 0, 0, 0],
        num_specimen_seen=[10, 2, 1, 8],
    )
)

# Set up a parser bound to df; its format instructions are injected into the prompt templates in the cells below.
parser = PandasDataFrameOutputParser(dataframe=df)

In [19]:
# Here's an example of a column operation being performed.
df_query = "Retrieve the num_wings column."

# Set up the prompt. The parser's format instructions are filled in up front,
# so only {query} has to be supplied at invoke time.
prompt = PromptTemplate(
    template="Answer the user query.\n{format_instructions}\n{query}\n",
    input_variables=["query"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

# Pipeline: prompt -> chat model -> DataFrame parser.
chain = prompt | model | parser
parser_output = chain.invoke({"query": df_query})

# Convert the returned pandas values to dicts and pretty-print them.
format_parser_output(parser_output)

{'num_wings': {0: 2,
               1: 0,
               2: 0,
               3: 0}}


In [20]:
# Here's an example of a row operation being performed.
df_query = "Retrieve the first row."

# Set up the prompt. The parser's format instructions are filled in up front,
# so only {query} has to be supplied at invoke time.
prompt = PromptTemplate(
    template="Answer the user query.\n{format_instructions}\n{query}\n",
    input_variables=["query"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

# Pipeline: prompt -> chat model -> DataFrame parser.
chain = prompt | model | parser
parser_output = chain.invoke({"query": df_query})

# The recorded output is keyed by the row label '0' and maps each column name to that row's value.
format_parser_output(parser_output)

{'0': {'num_legs': 2,
       'num_specimen_seen': 10,
       'num_wings': 2}}


In [21]:
# Here's an example of an aggregate Pandas DataFrame operation restricted to a range of rows.
df_query = "Retrieve the average of the num_legs column from rows 1 to 3."

# Set up the prompt. The parser's format instructions are filled in up front,
# so only {query} has to be supplied at invoke time.
prompt = PromptTemplate(
    template="Answer the user query.\n{format_instructions}\n{query}\n",
    input_variables=["query"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

# Pipeline: prompt -> chat model -> DataFrame parser.
chain = prompt | model | parser
parser_output = chain.invoke({"query": df_query})

# The recorded mean of 4.0 matches rows 1, 2 and 3 of num_legs (4, 8, 0), i.e. the row range is inclusive.
print(parser_output)

{'mean': np.float64(4.0)}


In [24]:
# Here's an example of an aggregate operation over a whole column.
# NOTE(review): this cell was originally labelled "a poorly formatted query", but the recorded
# output shows the query parsing successfully, so it does not demonstrate a parser failure.
df_query = "Retrieve the mean of the num_legs column."

# Set up the prompt. The parser's format instructions are filled in up front,
# so only {query} has to be supplied at invoke time.
prompt = PromptTemplate(
    template="Answer the user query.\n{format_instructions}\n{query}\n",
    input_variables=["query"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

# Pipeline: prompt -> chat model -> DataFrame parser.
chain = prompt | model | parser
parser_output = chain.invoke({"query": df_query})
# The recorded mean of 3.5 is the average of all four num_legs values (2, 4, 8, 0).
print(parser_output)

{'mean': np.float64(3.5)}


CSV parser
This output parser can be used when you want to return a list of comma-separated items.

In [25]:
from langchain.output_parsers import CommaSeparatedListOutputParser
# PromptTemplate and ChatOpenAI were already imported earlier in the notebook; they are repeated here.
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI

# Parser for comma-separated answers; the recorded result in the next cell is a Python list of strings.
output_parser = CommaSeparatedListOutputParser()

# The parser supplies the formatting instructions that are injected into the prompt.
format_instructions = output_parser.get_format_instructions()
prompt = PromptTemplate(
    template="List five {subject}.\n{format_instructions}",
    input_variables=["subject"],
    partial_variables={"format_instructions": format_instructions},
)

model = ChatOpenAI(temperature=0)

# Pipeline: prompt -> chat model -> list parser.
chain = prompt | model | output_parser

In [26]:
# Fill {subject} and run the chain; the recorded result is a list of five strings.
chain.invoke({"subject": "ice cream flavors"})

['Vanilla',
 'Chocolate',
 'Strawberry',
 'Mint Chocolate Chip',
 'Cookies and Cream']

XML parser
This output parser allows users to obtain results from an LLM in the popular XML format.

Keep in mind that large language models are leaky abstractions! You'll have to use an LLM with sufficient capacity to generate well-formed XML.

In the following example we use a Claude model (https://docs.anthropic.com/claude/docs), which works really well with XML tags.

In [27]:
from langchain.output_parsers import XMLOutputParser
from langchain_community.chat_models import ChatAnthropic
from langchain_core.prompts import PromptTemplate

In [None]:
# Fail fast with a clear message when the Anthropic credential is missing or empty.
anthropic_api_key = os.getenv("ANTHROPIC_API_KEY")
if anthropic_api_key is None or anthropic_api_key == "":
    raise ValueError("ANTHROPIC_API_KEY environment variable is not set.")

# Build the Claude chat model, passing the key explicitly.
model = ChatAnthropic(
    model="claude-2",
    max_tokens_to_sample=512,
    temperature=0.1,
    anthropic_api_key=anthropic_api_key,
)

# Use the model in a way that does not invoke unsupported methods:
# avoid components that require token counting or other unsupported features.

In [None]:
# Ask the model directly (no parser) and request XML-style tags in plain language.
actor_query = "Generate the shortened filmography for Tom Hanks."
output = model.invoke(
    f"""{actor_query}
Please enclose the movies in <movie></movie> tags"""
)
# Print only the text content of the response.
print(output.content)

In [None]:
# XML parser with default settings; its format instructions are appended to the query in the prompt.
parser = XMLOutputParser()

prompt = PromptTemplate(
    template="""{query}\n{format_instructions}""",
    input_variables=["query"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

# Pipeline: prompt -> Claude model -> XML parser.
chain = prompt | model | parser

# Reuses actor_query from the previous cell.
output = chain.invoke({"query": actor_query})
print(output)

In [None]:
# Same chain as above, but the parser is given an explicit list of tag names,
# which feeds into the format instructions placed in the prompt.
parser = XMLOutputParser(tags=["movies", "actor", "film", "name", "genre"])
prompt = PromptTemplate(
    template="""{query}\n{format_instructions}""",
    input_variables=["query"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)


# Pipeline: prompt -> Claude model -> XML parser.
chain = prompt | model | parser

# Reuses actor_query defined in an earlier cell.
output = chain.invoke({"query": actor_query})

print(output)