In [32]:
import getpass
import os

# Switch on LangChain's V2 tracing through its environment flag.
os.environ["LANGCHAIN_TRACING_V2"] = "true"

# Ask for each credential only when it is not already present in the
# environment, so re-running this cell (or starting Jupyter from a shell that
# already exports the keys) neither re-prompts nor overwrites a working key.
# getpass reads the value without echoing it into the notebook output.
if not os.environ.get("LANGCHAIN_API_KEY"):
    os.environ["LANGCHAIN_API_KEY"] = getpass.getpass("Enter your Lang API key: ")

if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

from langchain_openai import ChatOpenAI

# Chat model used by the graph nodes defined in this notebook.
model = ChatOpenAI(model="gpt-3.5-turbo-0125")

We need a persistence layer to store previously exchanged messages.

Wrapping our chat model in a minimal LangGraph application allows us to automatically persist the message history, simplifying the development of multi-turn applications.

LangGraph comes with a simple in-memory checkpointer, which we use below.

In [None]:
from langgraph.checkpoint.memory import MemorySaver
from langgraph.graph import START, MessagesState, StateGraph

# Start from an empty graph whose state schema is LangGraph's MessagesState.
workflow = StateGraph(state_schema=MessagesState)


def call_model(state: MessagesState):
    """Graph node: send the stored conversation to the chat model.

    Args:
        state: Current graph state; only its ``"messages"`` entry is read.

    Returns:
        A state update whose ``"messages"`` value is the model's reply.
    """
    return {"messages": model.invoke(state["messages"])}


# Register the single node, then route the graph's entry point straight to it.
workflow.add_node("model", call_model)
workflow.add_edge(START, "model")

# Compile with the in-memory checkpointer so the message history is persisted.
memory = MemorySaver()
app = workflow.compile(checkpointer=memory)


# A thread_id names one conversation, so a single application can serve several
# conversation threads at once (a common need when there are multiple users).
config = {"configurable": {"thread_id": "test"}}

In [9]:
from langchain_core.messages import HumanMessage

# Read one user turn from the keyboard.
query = input("Enter your query: ")

# Run the turn through the graph and print only the newest message (the reply).
input_messages = [HumanMessage(content=query)]
output = app.invoke({"messages": input_messages}, config)
output["messages"][-1].pretty_print()


Va bene, se hai altre domande o se c'è qualcos'altro di cui vorresti parlare, sentiti libero di chiedere.


Prompt templates help turn raw user input into a format the LLM can work with. So far the raw user input has been a single message passed straight to the LLM. Let's make that a bit more complicated: first, we add a system message with some custom instructions.

In [20]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

# Two-part template: fixed system instructions first, then the conversation.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "You talk like a pirate. Answer all questions to the best of your ability."),
        # Filled at invocation time with the list supplied under "messages".
        MessagesPlaceholder(variable_name="messages"),
    ]
)

In [21]:
# Second graph: same single-node layout, but the node goes through `prompt`.
workflow_2 = StateGraph(state_schema=MessagesState)


def call_model_2(state: MessagesState):
    """Graph node: render ``prompt`` from the state and query the chat model.

    Args:
        state: Current graph state, handed to the prompt as its input mapping
            (the prompt's placeholder consumes the ``"messages"`` entry).

    Returns:
        A state update whose ``"messages"`` value is the model's reply.
    """
    reply = (prompt | model).invoke(state)
    return {"messages": reply}


# Register the single node, then route the graph's entry point straight to it.
workflow_2.add_node("model", call_model_2)
workflow_2.add_edge(START, "model")

# This graph is compiled with its own MemorySaver instance.
memory_2 = MemorySaver()
app_2 = workflow_2.compile(checkpointer=memory_2)


# Thread id for the conversation held with app_2 (checkpointed in memory_2).
config = {"configurable": {"thread_id": "test"}}

In [22]:
query = "Hi! I'm Andrea."

# The list passed under "messages" is what fills the prompt's MessagesPlaceholder.
input_messages = [HumanMessage(content=query)]
output = app_2.invoke({"messages": input_messages}, config)
output["messages"][-1].pretty_print()


Ahoy, Andrea! Pleased to be makin' yer acquaintance, me hearty! How might this old sea dog be assistin' ye today?


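Now let's make the prompt slightly more complicated: the system message below takes a `language` input variable, so the graph state has to carry a `language` value alongside the messages.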


In [None]:
# The system message here carries an input variable of its own, {language},
# in addition to the conversation placeholder that follows it.
prompt_3 = ChatPromptTemplate.from_messages(
    [
        ("system", "You are a helpful assistant. Answer all questions to the best of your ability in {language}."),
        MessagesPlaceholder(variable_name="messages"),
    ]
)

from typing import Sequence

from langchain_core.messages import BaseMessage
from langgraph.graph.message import add_messages
from typing_extensions import Annotated, TypedDict

# The state now holds a language in addition to the messages.
class State(TypedDict):
    """Graph state carrying the conversation plus the reply language.

    Used as the ``state_schema`` of the language-aware graphs in this notebook.
    """

    # Conversation history; annotated with the add_messages reducer.
    messages: Annotated[Sequence[BaseMessage], add_messages]
    # Value substituted for {language} in the system prompt.
    language: str

# Third graph: same single-node layout, built on the custom State schema.
workflow_3 = StateGraph(state_schema=State)


def call_model_3(state: State):
    """Graph node: fill ``prompt_3`` from the state and query the chat model.

    Args:
        state: Current graph state, handed to the prompt as its input mapping;
            it supplies both ``"messages"`` and ``"language"``.

    Returns:
        A state update with the model's reply as a one-element ``"messages"`` list.
    """
    reply = (prompt_3 | model).invoke(state)
    return {"messages": [reply]}


# Register the single node, then route the graph's entry point straight to it.
workflow_3.add_node("model", call_model_3)
workflow_3.add_edge(START, "model")

memory_3 = MemorySaver()
app_3 = workflow_3.compile(checkpointer=memory_3)

In [None]:
# Thread id used for the conversation with app_3.
config = {"configurable": {"thread_id": "abc456"}}
query = "Come mi chiamo?"
language = "Italian"

# Besides the messages, the input now also carries the reply language.
input_messages = [HumanMessage(content=query)]
output = app_3.invoke({"messages": input_messages, "language": language}, config)
output["messages"][-1].pretty_print()


Mi dispiace, ma non ho accesso a informazioni personali su di te. Come posso aiutarti oggi?


It is important to add a step that limits the size of the messages you are passing in.

Importantly, __you will want to do this BEFORE the prompt template but AFTER you load previous messages from Message History.__

In [46]:
from langchain_core.messages import SystemMessage, trim_messages, AIMessage

# Trimmer configuration; the sample run in this cell shows its effect.
trimmer = trim_messages(
    strategy="last",        # keep the most recent messages
    max_tokens=65,          # token budget for the trimmed history
    token_counter=model,    # the chat model is used to count tokens
    include_system=True,    # the system message is retained
    start_on="human",       # the kept history begins on a human turn
    allow_partial=False,    # presumably: never keep a fragment of a message
)

# Sample conversation: one system message followed by alternating human/AI turns.
messages = [
    SystemMessage("you're a good assistant"),
    HumanMessage("hi! I'm bob"),
    AIMessage("hi!"),
    HumanMessage("I like vanilla ice cream"),
    AIMessage("nice"),
    HumanMessage("whats 2 + 2"),
    AIMessage("4"),
    HumanMessage("thanks"),
    AIMessage("no problem!"),
    HumanMessage("having fun?"),
    AIMessage("yes!"),
]

# Last expression of the cell, so the trimmed list is displayed as its output.
trimmer.invoke(messages)

[SystemMessage(content="you're a good assistant", additional_kwargs={}, response_metadata={}),
 HumanMessage(content='whats 2 + 2', additional_kwargs={}, response_metadata={}),
 AIMessage(content='4', additional_kwargs={}, response_metadata={}),
 HumanMessage(content='thanks', additional_kwargs={}, response_metadata={}),
 AIMessage(content='no problem!', additional_kwargs={}, response_metadata={}),
 HumanMessage(content='having fun?', additional_kwargs={}, response_metadata={}),
 AIMessage(content='yes!', additional_kwargs={}, response_metadata={})]

In [47]:
# Fourth graph: same State schema and template, with trimming added in the node.
workflow_4 = StateGraph(state_schema=State)
prompt_4 = prompt_3  # reuse the language-aware template unchanged


def call_model_4(state: State):
    """Graph node: trim the history, then prompt the chat model with it.

    Args:
        state: Current graph state; ``"messages"`` is trimmed before use and
            ``"language"`` is passed through unchanged.

    Returns:
        A state update with the model's reply as a one-element ``"messages"`` list.
    """
    # Trimming is applied to the messages held in the state, before the prompt
    # template is rendered.
    prompt_inputs = {
        "messages": trimmer.invoke(state["messages"]),
        "language": state["language"],
    }
    reply = (prompt_4 | model).invoke(prompt_inputs)
    return {"messages": [reply]}


# Register the single node, then route the graph's entry point straight to it.
workflow_4.add_node("model", call_model_4)
workflow_4.add_edge(START, "model")

memory_4 = MemorySaver()
app_4 = workflow_4.compile(checkpointer=memory_4)

In [49]:
config = {"configurable": {"thread_id": "test"}}
language = "English"
# Another probe worth trying here: "What is my name?"
query = "What math problem did I ask?"

# Put the sample conversation in front of the new question so the node has a
# history to trim.
input_messages = [*messages, HumanMessage(content=query)]
output = app_4.invoke({"messages": input_messages, "language": language}, config)
output["messages"][-1].pretty_print()


You asked "having fun?" which is not a math problem.


LLMs can sometimes take a while to respond, and so in order to improve the user experience one thing that most applications do is stream back each token as it is generated. This allows the user to see progress.

By default, `.stream` in our LangGraph application streams application steps -- in this case, the single step of the model response. Setting `stream_mode="messages"` allows us to stream output tokens instead:

In [52]:
config = {"configurable": {"thread_id": "test_test"}}
query = "Hi I'm Andrea, please tell me a joke."
language = "English"

input_messages = [HumanMessage(query)]
# Stream from app_4: its State schema has the "language" field supplied below and
# its node applies the language-aware prompt. The first graph, `app`, is built on
# MessagesState (no "language" field) and calls the model without any prompt, so
# it is the wrong target for this input.
# With stream_mode="messages" each iteration yields a (chunk, metadata) pair.
for chunk, metadata in app_4.stream(
    {"messages": input_messages, "language": language},
    config,
    stream_mode="messages",
):
    if isinstance(chunk, AIMessage):  # Filter to just model responses
        print(chunk.content, end="|")

|Hello| Andrea|!| Here|'s| a| joke| for| you|:| What| do| you| call| a| fake| nood|le|?| An| imp|asta|!| 😄||