# Agent Middleware Essentials

Add production-ready middleware for summarization, limits, fallbacks, and dynamic prompts.

In [None]:
import os
from dotenv import load_dotenv

load_dotenv()

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.agents import create_agent
from langchain.messages import HumanMessage
from langgraph.checkpoint.sqlite import SqliteSaver
import sqlite3
from scripts import base_tools

In [None]:
# Chat model shared by the agents built in the cells below.
model = ChatGoogleGenerativeAI(model='gemini-2.5-flash')

# Set up the SQLite-backed checkpointer.
# sqlite3.connect() creates a missing database file but NOT a missing parent
# directory, so make sure "data/" exists before opening the connection;
# otherwise a fresh checkout fails with "unable to open database file".
db_path = "data/middleware_agent.db"
os.makedirs(os.path.dirname(db_path), exist_ok=True)

# check_same_thread=False allows the connection to be used from threads other
# than the one that created it.
conn = sqlite3.connect(db_path, check_same_thread=False)
checkpointer = SqliteSaver(conn=conn)

In [None]:
# Reference agent with no middleware: web search + weather tools, with state
# saved through the shared checkpointer.
basic_agent = create_agent(
    checkpointer=checkpointer,
    tools=[base_tools.web_search, base_tools.get_weather],
    model=model,
)

## SummarizationMiddleware

Automatically compress long conversations when the history exceeds a threshold.

In [None]:
from langchain.agents.middleware import SummarizationMiddleware

# Agent whose history is compacted by SummarizationMiddleware, configured with
# a 15-message trigger and told to keep the 5 most recent messages as-is.
agent_summary = create_agent(
    model=model,
    tools=[base_tools.web_search],
    middleware=[
        SummarizationMiddleware(
            keep=("messages", 5),  # last 5 messages stay unsummarized
            trigger=[("messages", 15)],  # message-count threshold for summarizing
            model=ChatGoogleGenerativeAI(model="gemini-2.5-flash"),
        ),
    ],
    checkpointer=checkpointer,
)

# Run the agent under its own thread_id.
config = dict(configurable=dict(thread_id="summary_session"))
response = agent_summary.invoke(
    dict(
        messages=[
            HumanMessage(content="Search for Apple, Microsoft, and Tesla stock news"),
        ]
    ),
    config,
)

# Number of messages in the returned state (shown as the cell output).
len(response["messages"])

## ModelCallLimitMiddleware

Prevent runaway costs by limiting model calls.

In [None]:
from langchain.agents.middleware import ModelCallLimitMiddleware

# Agent capped at 2 model calls per run; exit_behavior="end" stops the run
# when that limit is reached.
agent_limit = create_agent(
    tools=[base_tools.web_search],
    model=model,
    middleware=[ModelCallLimitMiddleware(exit_behavior="end", run_limit=2)],
)

# Multi-company request used to exercise the model-call limit.
response = agent_limit.invoke(
    dict(messages=[HumanMessage(content="Search for news on 5 different companies")])
)

# Text of the last message in the returned state (shown as the cell output).
response["messages"][-1].text

## ToolCallLimitMiddleware

Limit tool executions to manage API usage.

In [None]:
from langchain.agents.middleware import ToolCallLimitMiddleware

# Agent capped at 2 tool calls per run; exit_behavior="continue" lets the run
# carry on without further tool calls once the limit is reached.
agent_tool_limit = create_agent(
    tools=[base_tools.web_search],
    model=model,
    middleware=[ToolCallLimitMiddleware(exit_behavior="continue", run_limit=2)],
)

# Three-company request used to exercise the tool-call limit.
response = agent_tool_limit.invoke(
    dict(messages=[HumanMessage(content="Search for Apple, Microsoft, and Google news")])
)

# Text of the last message in the returned state (shown as the cell output).
response["messages"][-1].text

## ModelFallbackMiddleware

Fall back to an alternate model on failure or for cost optimization.

In [None]:
from langchain.agents.middleware import ModelFallbackMiddleware

# Secondary model handed to ModelFallbackMiddleware.
fallback_model = ChatGoogleGenerativeAI(model="gemini-2.0-flash-exp")

# Same primary model and search tool as before, now with a fallback attached.
agent_fallback = create_agent(
    middleware=[ModelFallbackMiddleware(fallback_model)],
    tools=[base_tools.web_search],
    model=model,
)

## Dynamic System Prompt

Modify system prompt based on runtime context.

In [None]:
from typing import TypedDict
from langchain.agents.middleware import dynamic_prompt, ModelRequest

class Context(TypedDict):
    """Runtime context schema; passed to ``create_agent`` as ``context_schema``."""

    # Role label read by user_role_prompt: "expert" and "beginner" select
    # tailored prompts, any other value gets the base prompt.
    user_role: str

@dynamic_prompt
def user_role_prompt(request: ModelRequest) -> str:
    """Build the system prompt for a model call from the caller's role.

    Args:
        request: The pending model request. ``request.runtime.context`` is
            read for an optional ``"user_role"`` entry.

    Returns:
        The base prompt, extended with role-specific guidance when the role
        is ``"expert"`` or ``"beginner"``. Any other role, or a missing
        context, yields the base prompt unchanged.
    """
    # When the agent is invoked without `context=`, the runtime context can be
    # None; fall back to an empty mapping so the lookup cannot raise
    # AttributeError and the "user" default applies instead.
    context = request.runtime.context or {}
    user_role = context.get("user_role", "user")
    base_prompt = "You are a helpful assistant."

    if user_role == "expert":
        return f"{base_prompt} Provide detailed technical responses."
    if user_role == "beginner":
        return f"{base_prompt} Explain concepts simply and avoid jargon."

    return base_prompt

# Agent whose system prompt comes from user_role_prompt; Context is registered
# as the schema for the runtime context supplied at invoke time.
agent_dynamic = create_agent(
    context_schema=Context,
    middleware=[user_role_prompt],
    tools=[base_tools.web_search],
    model=model,
)

In [None]:
# Ask the role-aware agent a question with user_role="expert" in the context.
expert_response = agent_dynamic.invoke(
    dict(messages=[dict(role="user", content="Explain machine learning")]),
    context=dict(user_role="expert"),
)

# Text of the last message in the returned state (shown as the cell output).
expert_response["messages"][-1].text

In [None]:
# Ask the same question with user_role="beginner" in the context.
beginner_response = agent_dynamic.invoke(
    dict(messages=[dict(role="user", content="Explain machine learning")]),
    context=dict(user_role="beginner"),
)

# Text of the last message in the returned state (shown as the cell output).
beginner_response["messages"][-1].text

## Combining Multiple Middleware

In [None]:
# Agent combining all four middleware from the sections above, listed in the
# order: summarization, model-call limit, tool-call limit, model fallback.
agent_production = create_agent(
    model=model,
    tools=[base_tools.web_search],
    middleware=[
        SummarizationMiddleware(
            keep=("messages", 5),
            trigger=[("messages", 15)],
            model=ChatGoogleGenerativeAI(model="gemini-2.5-flash"),
        ),
        ModelCallLimitMiddleware(exit_behavior="end", run_limit=3),
        ToolCallLimitMiddleware(exit_behavior="continue", run_limit=3),
        ModelFallbackMiddleware(fallback_model),
    ],
    checkpointer=checkpointer,
)

# Run the agent under its own thread_id.
config = dict(configurable=dict(thread_id="production"))
response = agent_production.invoke(
    dict(messages=[HumanMessage(content="Analyze tech sector trends")]),
    config,
)

# Text of the last message in the returned state (shown as the cell output).
response["messages"][-1].text

## Key Takeaways

- Middleware adds production capabilities without changing agent logic
- SummarizationMiddleware prevents context overflow
- Limits control costs and API usage
- Fallbacks improve reliability
- Dynamic prompts enable context-aware behavior
- Middleware execution order matters

In [None]:
# Exercise: Create custom middleware configuration
