# 06. 持久化和检查点

## 课程目标
- 理解检查点（Checkpointer）机制
- 掌握 MemorySaver 的使用
- 学习持久化存储配置
- 实现状态恢复和回溯
- 掌握线程管理和配置
- 构建可靠的生产环境应用

## 核心概念

检查点是LangGraph中的核心特性，提供：
1. **状态持久化**：保存图执行过程中的状态
2. **故障恢复**：从检查点恢复执行
3. **状态回溯**：回到之前的执行状态
4. **并发控制**：通过 `thread_id` 隔离不同会话（线程）的状态，多个会话可以各自独立执行、互不干扰
5. **人机交互**：支持暂停和恢复执行

In [None]:
# 环境准备
from typing import TypedDict, Annotated, List, Dict, Any, Optional
from langgraph.graph import StateGraph, END, START
from langgraph.checkpoint.memory import MemorySaver
from langchain_core.runnables import RunnableConfig
import json
import time
import uuid
from datetime import datetime

print("环境准备完成")

## 1. MemorySaver 基础使用

MemorySaver 是LangGraph内置的内存检查点实现：

In [None]:
# 定义状态
class CounterState(TypedDict):
    """State schema shared by the nodes of the counter demo graph."""
    count: int  # current counter value
    operations: List[str]  # log messages appended by the nodes, oldest first
    last_operation: str  # tag written by the most recent node ("increment", "multiply", "log", ...)

# 定义节点函数
def increment(state: CounterState) -> CounterState:
    """Increase the counter by one and append the step to the history.

    A missing ``count`` is read as 0 and a missing ``operations`` list as
    empty. The step is printed as well. The input state is not mutated.

    Returns:
        The new ``count``, the extended ``operations`` list and
        ``last_operation`` set to ``"increment"``.
    """
    before = state.get("count", 0)
    after = before + 1
    entry = f"increment: {before} -> {after}"

    print(f"🔢 {entry}")

    history = state.get("operations", []) + [entry]
    return dict(count=after, operations=history, last_operation="increment")

def multiply(state: CounterState) -> CounterState:
    """Double the counter and append the step to the history.

    A missing ``count`` is read as 0 and a missing ``operations`` list as
    empty. The step is printed as well. The input state is not mutated.

    Returns:
        The new ``count``, the extended ``operations`` list and
        ``last_operation`` set to ``"multiply"``.
    """
    before = state.get("count", 0)
    after = before * 2
    entry = f"multiply: {before} * 2 = {after}"

    print(f"✖️ {entry}")

    history = state.get("operations", []) + [entry]
    return dict(count=after, operations=history, last_operation="multiply")

def log_state(state: CounterState) -> CounterState:
    """Print a one-line summary of the state and append it to the history.

    Only ``operations`` and ``last_operation`` are returned; ``count`` is
    read (defaulting to 0) but not written.
    """
    history = state.get("operations", [])
    summary = f"State logged: count={state.get('count', 0)}, operations={len(history)}"

    print(f"📝 {summary}")

    return dict(operations=history + [summary], last_operation="log")

# 创建带检查点的图
def create_checkpoint_graph():
    """Build and compile the counter graph with an in-memory checkpointer.

    The graph runs ``increment`` -> ``multiply`` -> ``log`` -> END and is
    compiled with a fresh ``MemorySaver``.
    """
    builder = StateGraph(CounterState)

    # Register the three nodes.
    for node_name, node_fn in (
        ("increment", increment),
        ("multiply", multiply),
        ("log", log_state),
    ):
        builder.add_node(node_name, node_fn)

    # Wire them into a straight line starting at "increment".
    builder.set_entry_point("increment")
    for source, target in (
        ("increment", "multiply"),
        ("multiply", "log"),
        ("log", END),
    ):
        builder.add_edge(source, target)

    # Passing the checkpointer at compile time enables persistence.
    return builder.compile(checkpointer=MemorySaver())

# Exercise the checkpoint feature
checkpoint_app = create_checkpoint_graph()

# First run: a random thread_id gives this run its own checkpoint thread
thread_id = str(uuid.uuid4())
config = {"configurable": {"thread_id": thread_id}}

print("=== 第一次执行 ===")
result1 = checkpoint_app.invoke(
    {"count": 5},
    config=config
)

print(f"\n第一次执行结果: count={result1['count']}")
print(f"操作历史: {len(result1['operations'])} 个操作")

# Invoke again on the same thread without new input.
# NOTE(review): the first run already reached END, so this call presumably
# returns the saved state without re-running any node — confirm.
print("\n=== 继续执行 ===")
result2 = checkpoint_app.invoke(
    None,  # no input: rely on the state stored in the checkpoint
    config=config
)

print(f"\n继续执行结果: count={result2['count']}")
print(f"操作历史: {len(result2['operations'])} 个操作")

# Print the complete operation history
print("\n=== 完整操作历史 ===")
for i, op in enumerate(result2['operations'], 1):
    print(f"{i}. {op}")

## 2. 检查点历史和状态回溯

检查点系统保存每个步骤的状态，支持历史查看和回溯：

In [None]:
# 查看检查点历史
def explore_checkpoints(app, config):
    """Print every checkpoint recorded for ``config`` and return them.

    Args:
        app: Object exposing ``get_state_history(config)``.
        config: Config identifying the thread to inspect.

    Returns:
        The snapshots as a list, in the order ``get_state_history`` yielded
        them. They are printed in the reverse of that order, numbered from 1.
    """
    print("=== 检查点历史 ===")

    history = list(app.get_state_history(config))

    print(f"总共有 {len(history)} 个检查点")

    for number, snapshot in enumerate(reversed(history), start=1):
        values, meta = snapshot.values, snapshot.metadata

        print(f"\n检查点 {number}:")
        print(f"  Count: {values.get('count', 'N/A')}")
        print(f"  Last Operation: {values.get('last_operation', 'N/A')}")
        print(f"  Metadata: {meta}")
        print(f"  Config: {snapshot.config}")

    return history

# Inspect the checkpoints written by the runs above
checkpoints = explore_checkpoints(checkpoint_app, config)

# Resume from a specific checkpoint
if len(checkpoints) >= 2:
    print("\n=== 从第2个检查点恢复 ===")
    
    # Take the config of the checkpoint to resume from.
    # NOTE(review): presumably the history is newest-first, so [-2] is the
    # second-oldest checkpoint ("checkpoint 2" in the printout) — confirm.
    second_checkpoint = checkpoints[-2]  # second-to-last list entry
    recovery_config = second_checkpoint.config
    
    print(f"恢复点状态: count={second_checkpoint.values.get('count')}")
    
    # Continue execution from that checkpoint (no new input)
    recovery_result = checkpoint_app.invoke(
        None,
        config=recovery_config
    )
    
    print(f"恢复后执行结果: count={recovery_result['count']}")

## 3. 交互式执行和人工干预

检查点支持暂停执行，等待人工干预：

In [None]:
# 定义需要人工确认的状态
class InteractiveState(TypedDict):
    """State schema for the approval-workflow demo graph."""
    value: int  # running result of the arithmetic operations
    operation_queue: List[str]  # operation names still to run, next one first
    requires_approval: bool  # set by execute_operation when the new value exceeds 30
    approval_message: str  # text describing the pending or finished approval
    execution_log: List[str]  # log lines appended by the nodes

# 定义交互式节点
def start_process(state: InteractiveState) -> InteractiveState:
    """Seed the workflow with its initial value, operation queue and log.

    The incoming state is ignored; ``approval_message`` is not set here.
    """
    print("🚀 开始处理流程")

    return dict(
        value=10,
        operation_queue=["add_5", "multiply_3", "subtract_2"],
        requires_approval=False,
        execution_log=["流程开始，初始值: 10"],
    )

def execute_operation(state: InteractiveState) -> InteractiveState:
    """Run the next queued operation and flag results that need approval.

    Pops the first name from ``operation_queue`` and applies it to ``value``
    (``add_5``, ``multiply_3`` or ``subtract_2``; any other name leaves the
    value unchanged and is logged as unknown). A result greater than 30 sets
    ``requires_approval`` and fills ``approval_message``.

    Returns:
        Only an extended ``execution_log`` when the queue is empty;
        otherwise the updated value, remaining queue, approval fields and
        extended log.
    """
    current = state.get("value", 0)
    pending = state.get("operation_queue", [])
    history = state.get("execution_log", [])

    # Nothing left to do: just record that fact.
    if not pending:
        return {"execution_log": history + ["没有更多操作"]}

    operation, remaining = pending[0], pending[1:]

    if operation == "add_5":
        result = current + 5
        message = f"执行加法: {current} + 5 = {result}"
    elif operation == "multiply_3":
        result = current * 3
        message = f"执行乘法: {current} * 3 = {result}"
    elif operation == "subtract_2":
        result = current - 2
        message = f"执行减法: {current} - 2 = {result}"
    else:
        result = current
        message = f"未知操作: {operation}"

    print(f"⚙️ {message}")

    # Results above the threshold of 30 must be approved before continuing.
    needs_approval = result > 30
    approval_note = ""
    if needs_approval:
        approval_note = f"值 {result} 超过阈值，需要人工审批"
        print(f"⚠️ {approval_note}")

    return dict(
        value=result,
        operation_queue=remaining,
        requires_approval=needs_approval,
        approval_message=approval_note,
        execution_log=history + [message],
    )

def check_approval(state: InteractiveState) -> InteractiveState:
    """Resolve a pending approval, or pass the state through untouched.

    When ``requires_approval`` is falsy the same state object is returned.
    Otherwise the approval is simulated (always "approved" here), the flag
    is cleared and the decision is appended to ``execution_log``.
    """
    if not state.get("requires_approval", False):
        print("✅ 无需审批，继续执行")
        return state

    print("🛑 流程暂停，等待人工审批...")
    # A real application would pause here and wait for human input;
    # this demo simulates an approval instead.
    approval_result = "approved"  # simulated decision

    history = state.get("execution_log", [])
    if approval_result == "approved":
        print("✅ 审批通过，继续执行")
        message = "审批已通过"
        log_entry = "人工审批: 通过"
    else:
        print("❌ 审批拒绝，流程终止")
        message = "审批被拒绝，流程终止"
        log_entry = "人工审批: 拒绝"

    return {
        "requires_approval": False,
        "approval_message": message,
        "execution_log": history + [log_entry],
    }

# 决策函数
def should_continue(state: InteractiveState) -> str:
    """Pick the next route: ``"approval"``, ``"continue"`` or ``"end"``.

    A pending approval takes priority; otherwise the route depends on
    whether any operations are still queued.
    """
    if state.get("requires_approval", False):
        return "approval"
    return "continue" if state.get("operation_queue", []) else "end"

# 创建交互式图
def create_interactive_graph():
    """Build and compile the approval-workflow graph with a ``MemorySaver``.

    ``start`` feeds ``execute``; after ``execute`` and after ``approval`` the
    ``should_continue`` router decides where to go next.
    """
    builder = StateGraph(InteractiveState)

    # Nodes
    builder.add_node("start", start_process)
    builder.add_node("execute", execute_operation)
    builder.add_node("approval", check_approval)

    # Fixed part of the flow
    builder.set_entry_point("start")
    builder.add_edge("start", "execute")

    # Routing tables for the conditional edges
    after_execute = {
        "approval": "approval",
        "continue": "execute",
        "end": END,
    }
    after_approval = {
        "continue": "execute",
        "end": END,
    }
    builder.add_conditional_edges("execute", should_continue, after_execute)
    builder.add_conditional_edges("approval", should_continue, after_approval)

    return builder.compile(checkpointer=MemorySaver())

# Exercise the interactive workflow
interactive_app = create_interactive_graph()
interactive_thread_id = str(uuid.uuid4())
interactive_config = {"configurable": {"thread_id": interactive_thread_id}}

print("=== 交互式执行演示 ===")

# Start the run; the "start" node supplies the initial state itself.
# NOTE(review): some LangGraph versions may reject an empty input dict — confirm.
interactive_result = interactive_app.invoke({}, config=interactive_config)

print("\n=== 执行完成 ===")
print(f"最终值: {interactive_result.get('value', 'N/A')}")
print(f"剩余操作: {interactive_result.get('operation_queue', [])}")
print(f"需要审批: {interactive_result.get('requires_approval', False)}")

print("\n=== 执行日志 ===")
for i, log_entry in enumerate(interactive_result.get('execution_log', []), 1):
    print(f"{i}. {log_entry}")

# Show the checkpoint history produced by the interactive run
print("\n=== 交互式执行的检查点历史 ===")
interactive_checkpoints = explore_checkpoints(interactive_app, interactive_config)

## 4. 流式执行和实时监控

结合检查点实现流式执行和实时监控：

In [None]:
# 定义流式处理状态
class StreamState(TypedDict):
    """State schema for the batch-processing (streaming) demo graph."""
    batch_id: str  # "batch_<unix seconds>", assigned by initialize_batch
    items: List[str]  # every item in the batch
    processed_items: List[str]  # items already handled, including failed ones
    current_item: str  # item handled by the latest process step; "" when none
    progress: float  # percentage of items handled (0.0 to 100.0)
    status: str  # "initialized", "processing" or "completed"
    errors: List[str]  # error messages collected while processing

# 流式处理节点
def initialize_batch(state: StreamState) -> StreamState:
    """Create a fresh five-item batch; the incoming state is ignored.

    The batch id is derived from the current Unix time in whole seconds.
    """
    new_batch_id = f"batch_{int(time.time())}"
    batch_items = [f"item_{number}" for number in range(1, 6)]  # five items

    print(f"📦 初始化批次: {new_batch_id}，包含 {len(batch_items)} 个项目")

    return dict(
        batch_id=new_batch_id,
        items=batch_items,
        processed_items=[],
        current_item="",
        progress=0.0,
        status="initialized",
        errors=[],
    )

def process_next_item(state: StreamState) -> StreamState:
    """Handle the first item that has not been processed yet.

    Sleeps 0.5 s to simulate work and fails at random about 20% of the time.
    A failed item is recorded in ``errors`` but still counted as processed,
    so it is skipped rather than retried.

    Returns:
        A "completed" marker when nothing is left to process; otherwise the
        updated ``processed_items``, ``current_item``, ``progress``,
        ``status`` and ``errors``.
    """
    import random

    all_items = state.get("items", [])
    done = state.get("processed_items", [])
    failures = state.get("errors", [])

    # Items that have not been handled yet, in batch order.
    remaining = [candidate for candidate in all_items if candidate not in done]

    if not remaining:
        return {"current_item": "", "progress": 100.0, "status": "completed"}

    current_item = remaining[0]

    print(f"⚙️ 处理项目: {current_item}...")
    time.sleep(0.5)  # simulated processing time

    # Simulate an occasional failure (20% error rate).
    if random.random() < 0.2:
        error_msg = f"处理 {current_item} 时发生错误"
        print(f"❌ {error_msg}")
        failures = failures + [error_msg]
    else:
        print(f"✅ {current_item} 处理成功")
    # Success or failure, the item counts as handled.
    done = done + [current_item]

    percent = (len(done) / len(all_items)) * 100
    status = "completed" if percent >= 100 else "processing"

    return {
        "processed_items": done,
        "current_item": current_item,
        "progress": round(percent, 1),
        "status": status,
        "errors": failures,
    }

def generate_progress_report(state: StreamState) -> StreamState:
    """Print a progress summary for the batch and return the state unchanged."""
    handled = len(state.get("processed_items", []))
    total = len(state.get("items", []))
    failures = len(state.get("errors", []))

    print(f"\n📊 进度报告 - {state.get('batch_id', '')}:")
    print(f"   进度: {state.get('progress', 0)}% ({handled}/{total})")
    print(f"   错误: {failures} 个")
    print(f"   状态: {state.get('status', 'unknown')}")

    return state

# 决策函数
def should_continue_processing(state: StreamState) -> str:
    """Route to ``"process"`` while work remains, otherwise to ``"report"``.

    Only the statuses "initialized" and "processing" keep the loop going;
    "completed" and any other or missing status lead to the report.
    """
    current_status = state.get("status", "")
    return "process" if current_status in ("initialized", "processing") else "report"

# 创建流式处理图
def create_stream_graph():
    """Build and compile the batch-processing graph with a ``MemorySaver``.

    ``init`` and ``process`` are both followed by the
    ``should_continue_processing`` router, which loops on ``process`` until
    the batch is done and then hands over to ``report`` -> END.
    """
    builder = StateGraph(StreamState)

    # Nodes
    builder.add_node("init", initialize_batch)
    builder.add_node("process", process_next_item)
    builder.add_node("report", generate_progress_report)

    builder.set_entry_point("init")

    # Both routing points share the same two targets.
    for source in ("init", "process"):
        builder.add_conditional_edges(
            source,
            should_continue_processing,
            {"process": "process", "report": "report"},
        )

    builder.add_edge("report", END)

    return builder.compile(checkpointer=MemorySaver())

# Exercise the streaming workflow
stream_app = create_stream_graph()
stream_thread_id = str(uuid.uuid4())
stream_config = {"configurable": {"thread_id": stream_thread_id}}

print("=== 流式处理演示 ===")

# Stream the run: each event maps a node name to the output that node produced
for step, event in enumerate(stream_app.stream({}, config=stream_config), 1):
    print(f"\n--- 步骤 {step} ---")
    for node_name, node_output in event.items():
        print(f"节点 [{node_name}] 完成")
        if "progress" in node_output:
            print(f"当前进度: {node_output['progress']}%")
        if "current_item" in node_output and node_output["current_item"]:
            print(f"当前处理: {node_output['current_item']}")

print("\n=== 流式处理完成 ===")

# Read the final state back from the checkpointer
final_state = stream_app.get_state(stream_config)
print(f"最终状态: {final_state.values.get('status')}")
print(f"处理项目: {len(final_state.values.get('processed_items', []))} 个")
print(f"错误数量: {len(final_state.values.get('errors', []))} 个")

## 5. 自定义检查点实现

创建自定义的检查点存储实现：

In [None]:
# 简单的文件系统检查点实现
import os
import pickle
from typing import Iterator, Optional, Tuple

from langgraph.checkpoint.base import (
    BaseCheckpointSaver,
    Checkpoint,
    CheckpointMetadata,
    CheckpointTuple,
)

class FileSystemCheckpointSaver(BaseCheckpointSaver):
    """Checkpoint saver that keeps every checkpoint as a pickle file on disk.

    Layout: ``<base_path>/<thread_id>/<checkpoint_id>.pkl``. Each file holds
    the checkpoint, its metadata, the checkpoint namespace and the id of the
    parent checkpoint, which is what is needed to rebuild the
    ``CheckpointTuple`` that ``get_tuple`` and ``list`` hand back.

    Limitations of this tutorial implementation:
    - pending writes are not persisted (``put_writes`` is a no-op);
    - ``checkpoint_ns`` is recorded but not used to partition files, so
      subgraph namespaces of one thread share a directory;
    - ``thread_id`` is used verbatim as a directory name;
    - files are read with ``pickle``, which can execute arbitrary code, so
      ``base_path`` must only ever point at a trusted directory.
    """

    _SUFFIX = ".pkl"

    def __init__(self, base_path: str = "./checkpoints"):
        """Create the saver and make sure ``base_path`` exists."""
        # Let the base class initialise its own state before adding ours.
        super().__init__()
        self.base_path = base_path
        os.makedirs(base_path, exist_ok=True)

    def _checkpoint_file(self, thread_id: str, checkpoint_id: str) -> str:
        """Return the file path of a checkpoint without touching the disk."""
        return os.path.join(
            self.base_path, thread_id, f"{checkpoint_id}{self._SUFFIX}"
        )

    def _get_checkpoint_path(self, thread_id: str, checkpoint_id: str) -> str:
        """Return the file path of a checkpoint, creating the thread directory."""
        os.makedirs(os.path.join(self.base_path, thread_id), exist_ok=True)
        return self._checkpoint_file(thread_id, checkpoint_id)

    def _checkpoint_ids(self, thread_id: str):
        """Return the ids of a thread's stored checkpoints, newest first.

        Ordering uses the ids themselves rather than file modification
        times, which can tie when several checkpoints are written quickly.
        This relies on LangGraph checkpoint ids sorting chronologically.
        """
        thread_dir = os.path.join(self.base_path, thread_id)
        if not os.path.isdir(thread_dir):
            return []
        suffix = self._SUFFIX
        return sorted(
            (
                name[: -len(suffix)]
                for name in os.listdir(thread_dir)
                if name.endswith(suffix)
            ),
            reverse=True,
        )

    def _load_tuple(self, thread_id: str, checkpoint_id: str) -> Optional[CheckpointTuple]:
        """Load one checkpoint file; return ``None`` if it does not exist."""
        try:
            with open(self._checkpoint_file(thread_id, checkpoint_id), "rb") as f:
                # SECURITY: unpickling runs code from the file; trusted data only.
                record = pickle.load(f)
        except FileNotFoundError:
            return None

        # ``.get`` keeps files written by the earlier record format readable.
        checkpoint_ns = record.get("checkpoint_ns", "")
        parent_id = record.get("parent_checkpoint_id")

        def as_config(target_id):
            return {
                "configurable": {
                    "thread_id": thread_id,
                    "checkpoint_ns": checkpoint_ns,
                    "checkpoint_id": target_id,
                }
            }

        return CheckpointTuple(
            config=as_config(checkpoint_id),
            checkpoint=record["checkpoint"],
            metadata=record["metadata"],
            parent_config=as_config(parent_id) if parent_id else None,
        )

    def put(self, config: RunnableConfig, checkpoint: Checkpoint,
            metadata: CheckpointMetadata, new_versions=None) -> RunnableConfig:
        """Persist a checkpoint and return the config that identifies it.

        Args:
            config: Run config; ``configurable.thread_id`` is required. A
                ``checkpoint_id`` already present is stored as the parent.
            checkpoint: Checkpoint to store; its ``"id"`` names the file.
            metadata: Metadata stored next to the checkpoint.
            new_versions: Accepted for compatibility with the saver
                interface; not used by this implementation.

        Returns:
            A config carrying the thread id, namespace and new checkpoint id.
        """
        configurable = config["configurable"]
        thread_id = configurable["thread_id"]
        checkpoint_ns = configurable.get("checkpoint_ns", "")
        checkpoint_id = checkpoint["id"]

        checkpoint_path = self._get_checkpoint_path(thread_id, checkpoint_id)

        # Store only plain identifiers from the config: the full runtime
        # config may hold objects that cannot be pickled.
        record = {
            "checkpoint": checkpoint,
            "metadata": metadata,
            "checkpoint_ns": checkpoint_ns,
            "parent_checkpoint_id": configurable.get("checkpoint_id"),
        }

        # Write to a temp file and rename so readers never see a partial file.
        temp_path = f"{checkpoint_path}.tmp"
        with open(temp_path, 'wb') as f:
            pickle.dump(record, f)
        os.replace(temp_path, checkpoint_path)

        print(f"💾 检查点已保存: {checkpoint_path}")

        return {
            "configurable": {
                "thread_id": thread_id,
                "checkpoint_ns": checkpoint_ns,
                "checkpoint_id": checkpoint_id,
            }
        }

    def put_writes(self, config: RunnableConfig, writes: list, task_id: str,
                   task_path: str = "") -> None:
        """Accept intermediate task writes without persisting them.

        Persisting pending writes could be implemented here; this tutorial
        saver deliberately drops them.
        """
        pass

    def get_tuple(self, config: RunnableConfig) -> Optional[CheckpointTuple]:
        """Return the requested checkpoint, or the thread's latest one.

        If ``configurable.checkpoint_id`` is present that exact checkpoint is
        loaded; otherwise the newest checkpoint of the thread is used.
        Returns ``None`` when nothing matching is stored.
        """
        configurable = config["configurable"]
        thread_id = configurable["thread_id"]
        checkpoint_id = configurable.get("checkpoint_id")

        if checkpoint_id is None:
            known_ids = self._checkpoint_ids(thread_id)
            if not known_ids:
                return None
            checkpoint_id = known_ids[0]

        loaded = self._load_tuple(thread_id, checkpoint_id)
        if loaded is not None:
            checkpoint_path = self._checkpoint_file(thread_id, checkpoint_id)
            print(f"📂 检查点已加载: {checkpoint_path}")
        return loaded

    def list(self, config: RunnableConfig, *, filter: Optional[dict] = None,
             before=None, limit: Optional[int] = None) -> Iterator[CheckpointTuple]:
        """Yield the thread's checkpoints, newest first.

        Args:
            config: Run config; ``configurable.thread_id`` selects the thread.
            filter: Optional metadata key/value pairs every result must match.
            before: Only yield checkpoints older than this one; either a
                checkpoint id or a config carrying ``checkpoint_id``.
            limit: Maximum number of checkpoints to yield (``None`` = all).
        """
        thread_id = config["configurable"]["thread_id"]

        if before is None or isinstance(before, str):
            before_id = before
        else:
            before_id = before.get("configurable", {}).get("checkpoint_id")

        yielded = 0
        for checkpoint_id in self._checkpoint_ids(thread_id):
            if limit is not None and yielded >= limit:
                break
            if before_id is not None and checkpoint_id >= before_id:
                continue

            entry = self._load_tuple(thread_id, checkpoint_id)
            if entry is None:
                # File vanished between listing and loading.
                continue
            if filter:
                stored_metadata = entry.metadata or {}
                if any(stored_metadata.get(key) != value for key, value in filter.items()):
                    continue

            yield entry
            yielded += 1

# 测试自定义检查点保存器
def test_custom_checkpointer():
    """Run the counter graph once with the file-system checkpoint saver.

    Returns:
        The compiled app and the config of the thread that was executed, so
        callers can continue working with the same thread.
    """
    checkpoint_root = "./test_checkpoints"
    saver = FileSystemCheckpointSaver(checkpoint_root)

    # Same structure as the earlier counter graph.
    builder = StateGraph(CounterState)
    for node_name, node_fn in (
        ("increment", increment),
        ("multiply", multiply),
        ("log", log_state),
    ):
        builder.add_node(node_name, node_fn)

    builder.set_entry_point("increment")
    for source, target in (
        ("increment", "multiply"),
        ("multiply", "log"),
        ("log", END),
    ):
        builder.add_edge(source, target)

    custom_app = builder.compile(checkpointer=saver)

    # Execute on a fresh thread.
    run_thread_id = str(uuid.uuid4())
    run_config = {"configurable": {"thread_id": run_thread_id}}

    print("=== 测试自定义检查点保存器 ===")

    result = custom_app.invoke({"count": 3}, config=run_config)

    print(f"\n执行结果: count={result['count']}")

    # Show which checkpoint files ended up on disk for this thread.
    thread_dir = os.path.join(checkpoint_root, run_thread_id)
    if os.path.exists(thread_dir):
        files = os.listdir(thread_dir)
        print(f"保存的检查点文件: {files}")

    return custom_app, run_config

# Run the custom-checkpointer demo
custom_app, custom_config = test_custom_checkpointer()

# Verify that state can be read back: invoke the same thread with no input
print("\n=== 验证检查点恢复 ===")
recovered_result = custom_app.invoke(None, config=custom_config)
print(f"恢复后的结果: count={recovered_result['count']}")

## 6. 生产环境最佳实践

生产环境中使用检查点的最佳实践：

In [None]:
# 生产级检查点管理
class ProductionCheckpointManager:
    """Wrap a compiled graph with retries, execution stats and a health probe.

    ``app`` only needs ``invoke(input, config=...)`` and ``get_state(config)``.
    """

    def __init__(self, app, max_retries=3, cleanup_interval=3600):
        """Store the app and settings.

        Args:
            app: The compiled graph (or compatible object) to manage.
            max_retries: Maximum number of attempts per execution.
            cleanup_interval: Stored for callers; nothing in this class
                reads it.
        """
        self.app = app
        self.max_retries = max_retries
        self.cleanup_interval = cleanup_interval
        self.execution_stats = {}

    def execute_with_retry(self, input_data: dict, config: dict) -> dict:
        """Invoke the app, retrying with exponential backoff on failure.

        At least one attempt is always made, even when ``max_retries`` is
        zero or negative. Between failed attempts the method sleeps 2, 4,
        8, ... seconds. The outcome is recorded in ``execution_stats`` under
        the config's ``thread_id``.

        Args:
            input_data: Input passed to ``app.invoke`` on every attempt.
            config: Run config; ``configurable.thread_id`` is required.

        Returns:
            The result of the first successful ``app.invoke`` call.

        Raises:
            Exception: Whatever the final failed attempt raised, re-raised
                unchanged.
        """
        thread_id = config["configurable"]["thread_id"]
        # Guard against max_retries <= 0, which would otherwise skip the
        # loop entirely and silently return None.
        max_attempts = max(1, self.max_retries)

        for attempt in range(1, max_attempts + 1):
            print(f"🔄 执行尝试 {attempt}/{max_attempts}")

            start_time = time.time()
            try:
                result = self.app.invoke(input_data, config=config)
            except Exception as e:
                print(f"❌ 执行失败: {str(e)}")

                if attempt >= max_attempts:
                    # Out of attempts: record the failure, then propagate.
                    self.execution_stats[thread_id] = {
                        "status": "failed",
                        "error": str(e),
                        "retry_count": attempt - 1,
                        "timestamp": datetime.now().isoformat()
                    }
                    # Bare raise keeps the original traceback intact.
                    raise

                wait_time = 2 ** attempt  # exponential backoff
                print(f"⏰ 等待 {wait_time} 秒后重试...")
                time.sleep(wait_time)
            else:
                execution_time = time.time() - start_time

                # Record the successful execution.
                self.execution_stats[thread_id] = {
                    "status": "success",
                    "execution_time": execution_time,
                    "retry_count": attempt - 1,
                    "timestamp": datetime.now().isoformat()
                }

                print(f"✅ 执行成功 (耗时: {execution_time:.2f}s)")
                return result

    def get_execution_stats(self) -> dict:
        """Return the per-thread execution statistics collected so far."""
        return self.execution_stats

    def health_check(self, config: dict) -> dict:
        """Probe the app's state for ``config`` and report the outcome.

        ``has_checkpoint`` is true only when the returned snapshot carries a
        ``checkpoint_id``: a snapshot object alone does not prove that a
        checkpoint was saved. Any exception from ``get_state`` is reported
        as ``"unhealthy"`` instead of being raised.
        """
        try:
            current_state = self.app.get_state(config)

            snapshot_config = getattr(current_state, "config", None) or {}
            checkpoint_id = snapshot_config.get("configurable", {}).get("checkpoint_id")

            return {
                "status": "healthy",
                "has_checkpoint": checkpoint_id is not None,
                "checkpoint_id": checkpoint_id,
                "timestamp": datetime.now().isoformat()
            }
        except Exception as e:
            return {
                "status": "unhealthy",
                "error": str(e),
                "timestamp": datetime.now().isoformat()
            }

    def cleanup_old_checkpoints(self, config: dict, max_age_hours: int = 24):
        """Placeholder for removing checkpoints older than ``max_age_hours``.

        No checkpoint is actually deleted: real cleanup depends on the
        checkpoint storage in use. Always returns 0.
        """
        print(f"🧹 开始清理超过 {max_age_hours} 小时的检查点...")

        # The actual cleanup logic belongs here, tailored to the concrete
        # checkpoint storage implementation.
        cleaned_count = 0  # simulated number of removed checkpoints

        print(f"✅ 清理完成，共清理 {cleaned_count} 个旧检查点")
        return cleaned_count

# 测试生产级管理器
def test_production_manager():
    """Drive ``ProductionCheckpointManager`` against a randomly failing graph.

    Builds a single-node graph whose node raises about 30% of the time, runs
    it through the manager with up to three attempts, then prints the
    collected statistics and a health check.

    Returns:
        The manager instance used for the run.
    """

    def unreliable_node(state: CounterState) -> CounterState:
        """Increment the counter, failing at random about 30% of the time."""
        import random

        if random.random() < 0.3:  # 30% failure rate
            raise Exception("模拟的处理失败")

        count = state.get("count", 0) + 1
        print(f"📈 计数增加到: {count}")

        history = state.get("operations", []) + [f"unreliable_increment: {count}"]
        return dict(
            count=count,
            operations=history,
            last_operation="unreliable_increment",
        )

    # Single-node graph around the unreliable node.
    saver = MemorySaver()
    builder = StateGraph(CounterState)

    builder.add_node("unreliable", unreliable_node)
    builder.set_entry_point("unreliable")
    builder.add_edge("unreliable", END)

    flaky_app = builder.compile(checkpointer=saver)

    manager = ProductionCheckpointManager(flaky_app, max_retries=3)

    # Execute on a fresh thread.
    run_thread_id = str(uuid.uuid4())
    run_config = {"configurable": {"thread_id": run_thread_id}}

    print("=== 生产级管理器测试 ===")

    try:
        result = manager.execute_with_retry({"count": 0}, run_config)
        print(f"\n最终结果: count={result['count']}")
    except Exception as e:
        print(f"\n最终执行失败: {str(e)}")

    # Execution statistics
    print(f"\n执行统计: {manager.get_execution_stats()}")

    # Health check
    print(f"\n健康检查: {manager.health_check(run_config)}")

    return manager

# Run the production-manager demo
prod_manager = test_production_manager()

## 7. 练习题

### 练习1：实现数据库检查点存储
创建一个基于数据库的检查点存储实现：

In [None]:
# Exercise 1: implement database-backed checkpoint storage
# TODO: implement checkpoint storage on top of SQLite or another database
# TODO: support querying, pagination, cleanup and similar features
# TODO: implement transaction safety and concurrency control

print("请实现数据库检查点存储")

### 练习2：构建分布式任务系统
使用检查点构建一个可恢复的分布式任务处理系统：

In [None]:
# Exercise 2: distributed task system
# TODO: implement task distribution and load balancing
# TODO: support automatic retry of failed tasks
# TODO: implement cluster state synchronisation
# TODO: add monitoring and alerting

print("请实现分布式任务系统")

## 总结

在本课中，我们深入学习了LangGraph的持久化和检查点机制：

### 关键要点：
1. **检查点机制**：自动保存和恢复执行状态
2. **MemorySaver**：内存级检查点存储
3. **状态恢复**：从任意检查点恢复执行
4. **交互式执行**：支持人工干预和审批
5. **流式监控**：实时追踪执行进度
6. **自定义存储**：实现自定义检查点存储
7. **生产实践**：企业级可靠性保证

### 最佳实践：
- **合理设置检查点**：平衡性能和可靠性
- **异常处理**：实现健壮的错误恢复机制
- **资源管理**：定期清理旧检查点
- **监控告警**：实时监控系统健康状态
- **测试验证**：充分测试恢复流程

### 应用场景：
- 长时间运行的批处理任务
- 需要人工审批的工作流
- 关键业务流程
- 分布式计算任务
- 数据处理流水线

## 下一课预告

在下一课《并行执行和Map-Reduce》中，我们将学习：
- 并行节点执行机制
- Map-Reduce 模式实现
- Send API 的使用
- 动态并行任务创建
- 结果聚合和同步策略
- 性能优化技巧