# Agent Middleware Essentials

Add production-ready middleware for message management, limits, fallbacks, and dynamic prompts.

**What you'll learn:**
- Middleware adds production capabilities without changing agent logic
- Trimming messages keeps only the most recent messages so the conversation fits within the context window
- Deleting messages removes specific messages, or all of them, from state
- SummarizationMiddleware prevents context overflow with summaries
- Limits control costs and API usage
- Fallbacks improve reliability
- Dynamic prompts enable context-aware behavior

In [None]:
import sys
sys.path.append('../')

import os
from dotenv import load_dotenv
load_dotenv()

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.agents import create_agent
from langchain.messages import HumanMessage
from langgraph.checkpoint.sqlite import SqliteSaver
import sqlite3
from scripts import base_tools

In [None]:
# Chat model shared by the agents created throughout this notebook.
model = ChatGoogleGenerativeAI(model='gemini-2.5-flash')

# Setup checkpointer
# sqlite3.connect() creates a missing database *file* but not a missing parent
# directory; without "db/" it fails with "unable to open database file".
os.makedirs("db", exist_ok=True)
# check_same_thread=False allows the connection to be used from threads other
# than the one that created it.
conn = sqlite3.connect("db/middleware_agent.db", check_same_thread=False)
checkpointer = SqliteSaver(conn)
# NOTE(review): setup() presumably creates the checkpoint tables if they are
# missing -- confirm against the SqliteSaver documentation.
checkpointer.setup()

## Trim Messages

Keep the first message plus only the most recent messages so the conversation fits within the model's context window.

In [None]:
from langchain.messages import RemoveMessage
from langgraph.graph.message import REMOVE_ALL_MESSAGES
from langchain.agents import AgentState
from langchain.agents.middleware import before_model
from langgraph.runtime import Runtime
from typing import Any

@before_model
def trim_messages(state: AgentState, runtime: Runtime):
    """Shrink the history to its first message plus a short recent tail.

    Registered with ``@before_model``. Returns ``None`` when the history has
    three messages or fewer; otherwise returns a ``messages`` update made of a
    ``RemoveMessage(id=REMOVE_ALL_MESSAGES)`` marker followed by the messages
    to keep.
    """
    history = state["messages"]

    # Short conversations are passed through untouched.
    if len(history) <= 3:
        return None

    # Even-length histories keep the last three messages, odd-length ones the
    # last four; the very first message is always retained in front of them.
    tail_size = 3 if len(history) % 2 == 0 else 4
    kept = [history[0], *history[-tail_size:]]

    return {"messages": [RemoveMessage(id=REMOVE_ALL_MESSAGES), *kept]}

# Tool-less agent whose only middleware is the trimming hook; the shared
# checkpointer is passed so state is stored between invoke() calls.
agent = create_agent(
    model=model, tools=[], middleware=[trim_messages], checkpointer=checkpointer
)

# Every turn below uses the same thread id.
config = {"configurable": {"thread_id": "trim_session"}}

# Play the turns in order; `response` ends up holding the reply to the final
# question, which refers back to the very first message.
for prompt in (
    "hi, my name is bob",
    "write a short poem about cats",
    "now do the same but for dogs",
    "what's my name?",
):
    response = agent.invoke({"messages": prompt}, config)

response['messages'][-1].content

## Delete Messages

Remove specific messages or clear the entire history.

In [None]:
from langchain.agents.middleware import after_model

@after_model
def delete_old_messages(state: AgentState, runtime: Runtime):
    """Drop the two earliest messages once the history exceeds two entries.

    Registered with ``@after_model``. Returns ``None`` when there are two
    messages or fewer; otherwise returns a ``messages`` update containing one
    ``RemoveMessage`` per message to delete, addressed by message id.
    """
    history = state["messages"]

    # Nothing to prune yet.
    if len(history) <= 2:
        return None

    oldest_pair = history[:2]
    return {"messages": [RemoveMessage(id=msg.id) for msg in oldest_pair]}

# Agent with no tools; delete_old_messages is its only middleware and the
# shared checkpointer is passed so state is stored between invoke() calls.
agent = create_agent(
    model=model,
    tools=[],
    middleware=[delete_old_messages],
    checkpointer=checkpointer
)

# Thread id used for both turns of this demo.
config = {"configurable": {"thread_id": "delete_session"}}

# Two turns: an introduction, then a question that depends on it.
agent.invoke({"messages": "hi! I'm alice"}, config)
response = agent.invoke({"messages": "what's my name?"}, config)

# Content of the last message in the final response (the cell's output).
response['messages'][-1].content

## SummarizationMiddleware

Automatically compress long conversations using summaries.

In [None]:
from langchain.agents.middleware import SummarizationMiddleware

# Agent with the web_search tool and a SummarizationMiddleware; the
# middleware gets its own gemini-2.5-flash model instance.
agent = create_agent(
    model=model,
    tools=[base_tools.web_search],
    checkpointer=checkpointer,
    middleware=[
        SummarizationMiddleware(
            model=ChatGoogleGenerativeAI(model='gemini-2.5-flash'),
            trigger=[("messages", 15)],  # Message-count threshold (15) that triggers summarization
            keep=("messages", 5)  # Keep the last 5 messages unsummarized
        )
    ]
)

# Thread id for this demo's conversation.
config = {'configurable': {'thread_id': 'summary_session'}}
response = agent.invoke({
    'messages': [HumanMessage(
        "Search for Apple, Microsoft, and Tesla stock news"
    )]
}, config)

# Number of messages in the returned state (the cell's output).
len(response['messages'])

## ModelCallLimitMiddleware

Prevent runaway costs by limiting model calls.

In [None]:
from langchain.agents.middleware import ModelCallLimitMiddleware

# Agent with the web_search tool and a cap on model calls. No checkpointer is
# passed here, and invoke() below is called without a config.
agent = create_agent(
    model=model,
    tools=[base_tools.web_search],
    middleware=[
        ModelCallLimitMiddleware(
            run_limit=2,  # Max 2 model calls
            exit_behavior="end"  # Stop when limit reached
        )
    ]
)

# A request that asks for several searches, to exercise the limit.
response = agent.invoke({
    'messages': [HumanMessage("Search for news on 5 different companies")]
})

# Content of the last message in the response (the cell's output).
response['messages'][-1].content

## ToolCallLimitMiddleware

Limit tool executions to manage API usage.

In [None]:
from langchain.agents.middleware import ToolCallLimitMiddleware

# Agent with the web_search tool and a cap on tool calls. No checkpointer is
# passed here, and invoke() below is called without a config.
agent = create_agent(
    model=model,
    tools=[base_tools.web_search],
    middleware=[
        ToolCallLimitMiddleware(
            run_limit=2,  # Limit of 2 tool calls
            exit_behavior="continue"  # Continue without more tools
        )
    ]
)

# The prompt names three companies while the tool-call limit is two.
response = agent.invoke({
    'messages': [HumanMessage("Search for Apple, Microsoft, and Google news")]
})

# Content of the last message in the response (the cell's output).
response['messages'][-1].content

## ModelFallbackMiddleware

Fall back to an alternate model on failure or for cost optimization.

In [None]:
from langchain.agents.middleware import ModelFallbackMiddleware

# Secondary model handed to ModelFallbackMiddleware; `fallback_model` is also
# reused by the combined-middleware cell later in the notebook.
fallback_model = ChatGoogleGenerativeAI(model='gemini-2.0-flash-exp')

# Agent with the web_search tool and the fallback middleware. This cell only
# builds the agent; it does not invoke it.
agent = create_agent(
    model=model,
    tools=[base_tools.web_search],
    middleware=[ModelFallbackMiddleware(fallback_model)]
)

## Dynamic System Prompt

Modify the system prompt based on runtime context.

In [None]:
from typing import TypedDict
from langchain.agents.middleware import dynamic_prompt, ModelRequest

# Runtime context schema supplied to the agent: a single `user_role` string.
Context = TypedDict("Context", {"user_role": str})

@dynamic_prompt
def user_role_prompt(request: ModelRequest) -> str:
    """Build the system prompt from the ``user_role`` in the runtime context.

    Args:
        request: The model request; its ``runtime.context`` is read for an
            optional ``user_role`` entry.

    Returns:
        The base prompt, extended with role-specific guidance for the
        ``"expert"`` and ``"beginner"`` roles. Any other role, a missing
        role, or a missing context yields the base prompt alone.
    """
    # The runtime context is None when the agent is invoked without
    # `context=...`; fall back to an empty mapping so the "user" default
    # applies instead of raising AttributeError on `.get`.
    context = request.runtime.context or {}
    user_role = context.get("user_role", "user")
    base_prompt = "You are a helpful assistant."

    if user_role == "expert":
        return f"{base_prompt} Provide detailed technical responses."
    if user_role == "beginner":
        return f"{base_prompt} Explain concepts simply and avoid jargon."

    return base_prompt

# Agent with the web_search tool whose system prompt is produced by
# user_role_prompt; Context is passed as the schema for the `context=`
# argument given at invoke time.
agent = create_agent(
    model=model,
    tools=[base_tools.web_search],
    middleware=[user_role_prompt],
    context_schema=Context
)

In [None]:
# Test with expert context: user_role_prompt returns the base prompt plus
# "Provide detailed technical responses." for this role.
response = agent.invoke(
    {"messages": [{"role": "user", "content": "Explain machine learning"}]},
    context={"user_role": "expert"}
)

# Content of the last message in the response (the cell's output).
response['messages'][-1].content

In [None]:
# Test with beginner context: same question as the expert run, but
# user_role_prompt returns the base prompt plus "Explain concepts simply and
# avoid jargon." for this role.
response = agent.invoke(
    {"messages": [{"role": "user", "content": "Explain machine learning"}]},
    context={"user_role": "beginner"}
)

# Content of the last message in the response (the cell's output).
response['messages'][-1].content

## Combining Multiple Middleware

In [None]:
# Production agent with stacked middleware: summarization, a model-call
# limit, a tool-call limit, and a model fallback, all on one agent.
# Relies on `fallback_model` defined in the ModelFallbackMiddleware cell.
agent = create_agent(
    model=model,
    tools=[base_tools.web_search],
    checkpointer=checkpointer,
    middleware=[
        SummarizationMiddleware(
            model=ChatGoogleGenerativeAI(model='gemini-2.5-flash'),
            trigger=[("messages", 15)],  # Message-count threshold that triggers summarization
            keep=("messages", 5)  # Messages kept unsummarized
        ),
        ModelCallLimitMiddleware(run_limit=3, exit_behavior="end"),
        ToolCallLimitMiddleware(run_limit=3, exit_behavior="continue"),
        ModelFallbackMiddleware(fallback_model)
    ]
)

# Thread id for this demo's conversation.
config = {'configurable': {'thread_id': 'production'}}
response = agent.invoke({
    'messages': [HumanMessage("Analyze tech sector trends")]
}, config)

# Content of the last message in the response (the cell's output).
response['messages'][-1].content