In [1]:
"""
changed part:
1.when dealing with the Pendulum task, won't get new policy if truncated.
2.Add core physical formulas into STATIC_KNOWLEDGE to improve policy generation.
3.Introduce weights for dynamic knowledge, and periodically filter out outdated or already-resolved entries.


"""

import inspect
import math
import os
import random
import time
from datetime import datetime

import gymnasium as gym
import numpy as np
import requests

# ----------------------------
# LLM HTTP call function
# ----------------------------
# Endpoint and headers used for every LLM request.
# SECURITY: the API key is read from the LLM_API_KEY environment variable so
# that no credential is stored in the source. Any key that was previously
# committed here should be treated as leaked and revoked.
LLM_URL = 'https://api.yesapikey.com/v1/chat/completions'
LLM_HEADERS = {
    'Content-Type': 'application/json',
    'Authorization': f"Bearer {os.environ.get('LLM_API_KEY', '')}",
}

def call_llm(prompt, model="gpt-4.1-2025-04-14", temperature=0.2, max_tokens=1024,
             max_retries=None, timeout=120, retry_delay=2):
    """Send a single-turn chat prompt to the LLM endpoint and return the reply text.

    Args:
        prompt: User message content.
        model: Model identifier placed in the request payload.
        temperature: Sampling temperature placed in the request payload.
        max_tokens: Completion token limit placed in the request payload.
        max_retries: Maximum number of attempts. ``None`` (the default) keeps
            retrying until a usable response arrives.
        timeout: Per-request timeout in seconds, so a stalled connection
            cannot block the training loop forever.
        retry_delay: Seconds to sleep between failed attempts.

    Returns:
        The ``content`` field of the first choice's message. This may be
        ``None`` if the response carries a choice without content.

    Raises:
        RuntimeError: If ``max_retries`` is set and every attempt failed.
    """
    payload = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": temperature,
        "max_tokens": max_tokens
    }
    attempt = 0
    while max_retries is None or attempt < max_retries:
        attempt += 1
        try:
            response = requests.post(LLM_URL, json=payload, headers=LLM_HEADERS, timeout=timeout)
            if response.status_code == 200:
                resp_json = response.json()
                if 'choices' in resp_json and resp_json['choices']:
                    return resp_json['choices'][0].get('message', {}).get('content')
                print("LLM call returned no choices; retrying.")
            else:
                # Surface HTTP failures (e.g. a missing or invalid API key)
                # instead of retrying silently.
                print(f"LLM call failed with HTTP status {response.status_code}; retrying.")
        except Exception as e:
            # Best-effort retry: network errors and malformed JSON are reported
            # and the request is attempted again.
            print("LLM call exception:", e)
        if max_retries is None or attempt < max_retries:
            time.sleep(retry_delay)
    raise RuntimeError(f"LLM call failed after {max_retries} attempts")

# ----------------------------
# Environment documentation mapping
# ----------------------------
# Official Gymnasium documentation page for each supported environment id.
ENV_DOC_URL = {
    "Acrobot-v1":
        "https://gymnasium.farama.org/environments/classic_control/acrobot/",
    "CartPole-v1":
        "https://gymnasium.farama.org/environments/classic_control/cart_pole/",
    "MountainCarContinuous-v0":
        "https://gymnasium.farama.org/environments/classic_control/mountain_car_continuous/",
    "MountainCar-v0":
        "https://gymnasium.farama.org/environments/classic_control/mountain_car/",
    "Pendulum-v1":
        "https://gymnasium.farama.org/environments/classic_control/pendulum/",
}


def get_env_doc_url(env_id: str) -> str:
    """Return the documentation URL for ``env_id``.

    Environment ids without an entry in ``ENV_DOC_URL`` map to the Gymnasium
    home page.
    """
    if env_id in ENV_DOC_URL:
        return ENV_DOC_URL[env_id]
    return "https://gymnasium.farama.org/"

# ----------------------------
# Static Knowledge (with physics formulas)
# ----------------------------
# Per-environment facts that are injected into the LLM prompts: observation
# layout and ranges, the action space, the success threshold on episode
# return, and human-readable physics formulas.
# For discrete environments "action_space" lists the valid actions; for
# continuous environments it holds the [low, high] bounds.
STATIC_KNOWLEDGE = {
    "CartPole-v1": {
        "state_dim": 4,
        "state_vars": ["cart_position", "cart_velocity", "pole_angle", "pole_velocity"],
        "state_ranges": [(-4.8, 4.8), (-float("inf"), float("inf")), (-0.418, 0.418), (-float("inf"), float("inf"))],
        "action_space": [0, 1],
        "reward_threshold": 475,
        "action_type": "discrete",
        "physics": {
            # The angular acceleration includes the inertia denominator; l is
            # the pole half-length.
            "theta_accel": "theta_accel = (g*sin(theta) + cos(theta)*(-F - m*l*theta_dot^2*sin(theta))/(M+m)) / (l*(4/3 - m*cos(theta)^2/(M+m)))",
            "cart_accel": "x_accel = (F + m*l*(theta_dot^2*sin(theta) - theta_accel*cos(theta))) / (M+m)"
        }
    },
    "Acrobot-v1": {
        "state_dim": 6,
        "state_vars": ["cos_theta1", "sin_theta1", "cos_theta2", "sin_theta2", "theta1_dot", "theta2_dot"],
        "state_ranges": [(-1, 1), (-1, 1), (-1, 1), (-1, 1), (-float("inf"), float("inf")), (-float("inf"), float("inf"))],
        "action_space": [0, 1, 2],
        "reward_threshold": -100,
        "action_type": "discrete",
        "physics": {
            "theta1_accel": "theta1_ddot = -(d2*(theta2_dot^2*sin(theta2)+g*sin(theta1+theta2)+...) ) / denominator",
            "theta2_accel": "theta2_ddot = formula depends on theta1, theta1_dot, theta2, theta2_dot, torque"
        }
    },
    "MountainCar-v0": {
        "state_dim": 2,
        "state_vars": ["position", "velocity"],
        "state_ranges": [(-1.2, 0.6), (-0.07, 0.07)],
        "action_space": [0, 1, 2],
        "reward_threshold": -110,
        "action_type": "discrete",
        "physics": {
            # Discrete actions 0/1/2 mean push left / no push / push right,
            # so the applied force is proportional to (action - 1).
            "velocity_update": "v_next = v + 0.001*(action - 1) - 0.0025*cos(3*position)",
            "position_update": "pos_next = pos + v_next"
        }
    },
    "MountainCarContinuous-v0": {
        "state_dim": 2,
        "state_vars": ["position", "velocity"],
        "state_ranges": [(-1.2, 0.6), (-0.07, 0.07)],
        "action_space": [-1.0, 1.0],
        "reward_threshold": 90,
        "action_type": "continuous",
        "physics": {
            # The continuous variant scales the force by a power of 0.0015.
            "velocity_update": "v_next = v + 0.0015*force - 0.0025*cos(3*position)",
            "position_update": "pos_next = pos + v_next"
        }
    },
    "Pendulum-v1": {
        "state_dim": 3,
        "state_vars": ["cos_theta", "sin_theta", "theta_dot"],
        "state_ranges": [(-1, 1), (-1, 1), (-float("inf"), float("inf"))],
        "action_space": [-2.0, 2.0],
        "reward_threshold": -200,
        "action_type": "continuous",
        "physics": {
            "theta_accel": "theta_ddot = (-3*g/(2*l)*sin(theta + pi) + 3./(m*l^2)*torque)"
        }
    }
}

# ----------------------------
# Knowledge module with timestamp, validity, weight
# ----------------------------
class Knowledge:
    """Static environment facts plus a growing list of dynamic observations.

    ``static_knowledge`` is the ``STATIC_KNOWLEDGE`` record of the loaded
    environment. ``dynamic_knowledge`` is a list of dict entries, each tagged
    with ``timestamp``, ``valid`` and ``weight`` when it is added.
    """

    def __init__(self):
        self.static_knowledge = {}
        self.dynamic_knowledge = []

    def load_static_knowledge(self, env_id):
        """Select the static record for ``env_id`` and reset dynamic knowledge.

        Raises:
            ValueError: If ``env_id`` has no entry in ``STATIC_KNOWLEDGE``.
        """
        if env_id not in STATIC_KNOWLEDGE:
            raise ValueError("Unsupported environment")
        self.static_knowledge = STATIC_KNOWLEDGE[env_id]
        self.dynamic_knowledge = []

    def add_dynamic_entry(self, entry, weight=1.0):
        """Append a copy of ``entry`` tagged with timestamp, validity and weight.

        The caller's dict is not modified.
        """
        entry_copy = entry.copy()
        entry_copy.update({
            "timestamp": datetime.now().isoformat(),
            "valid": True,
            "weight": weight
        })
        self.dynamic_knowledge.append(entry_copy)

    def filter_dynamic_knowledge(self, consecutive_inactive=3, current_iter=None):
        """Invalidate and drop entries not seen in the last rounds.

        Args:
            consecutive_inactive: An entry is considered outdated once
                ``current_iter - last_seen_iter`` reaches this value. ``None``
                or a non-positive value disables expiry.
            current_iter: Index of the current iteration. When omitted, the
                largest ``last_seen_iter`` among the stored entries is used.
        """
        # Entries without a usable iteration stamp are treated as seen at 0.
        for entry in self.dynamic_knowledge:
            if entry.get("last_seen_iter") is None:
                entry["last_seen_iter"] = 0

        if current_iter is None:
            current_iter = max(
                (entry["last_seen_iter"] for entry in self.dynamic_knowledge),
                default=0,
            )

        expiry_enabled = consecutive_inactive is not None and consecutive_inactive > 0
        kept = []
        for entry in self.dynamic_knowledge:
            age = current_iter - entry["last_seen_iter"]
            if expiry_enabled and age >= consecutive_inactive:
                entry["valid"] = False
            if entry.get("valid", True):
                kept.append(entry)
        self.dynamic_knowledge = kept

    def get_dynamic_guidance(self, env_id):
        """Ask the LLM for heuristic guidance based on the valid dynamic entries.

        Returns whatever ``call_llm`` returns for the guidance prompt.
        """
        valid_entries = [e for e in self.dynamic_knowledge if e.get("valid", True)]
        prompt = f"""
I am generating a policy in environment {env_id}.
Current valid dynamic knowledge entries: {valid_entries}

Focus on environment **principles, physics, and dynamics**, not superficial patterns.
Please provide concise heuristic suggestions for policy generation based on this knowledge, such as:
- State ranges to prioritize
- Common failing action patterns
- Recommended threshold adjustments

Return a short, structured bullet list (no prose).
"""
        guidance = call_llm(prompt)
        return guidance

# ----------------------------
# Memory module
# ----------------------------
class Memory:
    """Episode store: per-step transitions plus a per-episode summary."""

    def __init__(self):
        # Each episode is {"steps": [transition, ...], "summary": dict | None}.
        self.episodes = []

    def start_episode(self):
        """Open a new, empty episode; later steps are appended to it."""
        self.episodes.append({"steps": [], "summary": None})

    def add_step(self, s, a, r, done):
        """Record one transition in the most recently started episode.

        Raises:
            ValueError: If no episode has been started.
        """
        if not self.episodes:
            raise ValueError("Please call start_episode() before adding steps!")
        self.episodes[-1]["steps"].append({"s": s, "a": a, "r": r, "done": done})

    def add_episode_summary(self, env_id, policy_version):
        """Compute and store return and length for the latest episode.

        Raises:
            ValueError: If no episode has been started.
        """
        if not self.episodes:
            raise ValueError("No running episode!")
        steps = self.episodes[-1]["steps"]
        self.episodes[-1]["summary"] = {
            "env_id": env_id,
            "policy_version": policy_version,
            "return": sum(step["r"] for step in steps),
            "length": len(steps)
        }

    def get_recent_episodes(self, n=5):
        """Return the summaries of the last ``n`` summarized episodes.

        Episodes that have no summary yet are skipped. A non-positive ``n``
        yields an empty list.
        """
        # Guard needed because summaries[-0:] would be the whole list.
        if n <= 0:
            return []
        summaries = [ep["summary"] for ep in self.episodes if ep["summary"] is not None]
        return summaries[-n:]

# ----------------------------
# Reflection module
# ----------------------------
class Reflection:
    """Turns recent episode summaries into metrics and LLM-written feedback.

    LLM outputs are also stored as dynamic knowledge entries on the supplied
    ``Knowledge`` instance.
    """

    def __init__(self, knowledge: "Knowledge"):
        self.knowledge = knowledge

    def metrics(self, recent_episodes):
        """Return average return, average length and success rate.

        An episode counts as a success when its return is at least the
        environment's ``reward_threshold`` (0 if the threshold is missing).
        All three values are 0 for an empty input.
        """
        returns = [ep["return"] for ep in recent_episodes]
        lengths = [ep["length"] for ep in recent_episodes]
        avg_return = np.mean(returns) if returns else 0
        avg_length = np.mean(lengths) if lengths else 0
        threshold = self.knowledge.static_knowledge.get("reward_threshold", 0)
        success_count = sum(1 for ep in recent_episodes if ep["return"] >= threshold)
        success_rate = success_count / len(recent_episodes) if recent_episodes else 0
        return {"avg_return": avg_return, "avg_length": avg_length, "success_rate": success_rate}

    def failure_pattern(self, recent_episodes, env_id, iter_idx=None):
        """Ask the LLM for the dominant failure pattern and record it.

        Returns the stripped LLM answer, or an empty string if the LLM
        returned no content.
        """
        prompt = f"""
I have the following {env_id} environment episode summaries: {recent_episodes}
Please analyze the most common failure patterns, including state characteristics, action issues, and return patterns. Focus only on key points.
Return a concise paragraph.
"""
        # call_llm may return None when the response has no content.
        pattern = (call_llm(prompt) or "").strip()
        self.knowledge.add_dynamic_entry({"failure_pattern": pattern, "env_id": env_id, "last_seen_iter": iter_idx}, weight=1.0)
        return pattern

    def edit_suggestion(self, recent_episodes, env_id, iter_idx=None):
        """Ask the LLM for a single policy edit and record it.

        Returns the stripped LLM answer, or an empty string if the LLM
        returned no content.
        """
        prompt = f"""
Based on recent episode data from environment {env_id}: {recent_episodes}
Generate one policy editing suggestion in one of the following formats:
- add_rule(condition -> action)
- modify_threshold(variable, old_value, new_value)
- reprioritize(rule_i over rule_j)

Return exactly one line with one edit.
"""
        # call_llm may return None when the response has no content.
        suggestion = (call_llm(prompt) or "").strip()
        self.knowledge.add_dynamic_entry({"edit_suggestion": suggestion, "env_id": env_id, "last_seen_iter": iter_idx}, weight=1.0)
        return suggestion

# ----------------------------
# Safe policy call
# ----------------------------
def safe_policy_call(state, policy_fn, sk):
    """Evaluate ``policy_fn(state)`` and return an action the env can accept.

    Args:
        state: Observation passed to the policy.
        policy_fn: Callable producing an action; it may be faulty.
        sk: Static knowledge record providing ``action_type`` and
            ``action_space``.

    Returns:
        For discrete environments, an ``int`` from ``sk["action_space"]``.
        For continuous environments, a ``float`` clipped to the bounds.
        The fallback is the first discrete action or the midpoint of the
        continuous range. It is used when the policy raises, or when it
        returns something that is not a single finite-or-infinite number
        (``None``, NaN, text, a multi-element sequence) or, for discrete
        environments, a value outside the action set.
    """
    space = sk["action_space"]
    is_discrete = sk["action_type"] == "discrete"
    fallback = space[0] if is_discrete else (space[0] + space[1]) / 2.0

    try:
        raw = policy_fn(state)
        # Accept plain numbers as well as one-element arrays/lists.
        flat = np.asarray(raw, dtype=float).reshape(-1)
    except Exception:
        # Best-effort by design: a broken policy must not abort the rollout.
        return fallback

    if flat.size != 1 or np.isnan(flat[0]):
        return fallback

    value = float(flat[0])
    if is_discrete:
        # Reject values outside the action set so env.step does not fail.
        return int(value) if value in space else fallback
    return float(np.clip(value, space[0], space[1]))

# ----------------------------
# Safe step wrapper
# ----------------------------
def safe_step(env, action):
    """Step ``env`` with ``action``, clipping continuous actions to bounds.

    The static knowledge record is looked up by the environment's spec id.
    Continuous actions are clipped to the configured bounds and passed as an
    array with at least one dimension, so scalars and 0-d arrays are wrapped
    rather than forwarded as-is. Discrete actions are forwarded unchanged.

    Returns:
        Whatever ``env.step`` returns.
    """
    sk = STATIC_KNOWLEDGE[env.unwrapped.spec.id]
    if sk["action_type"] == "continuous":
        low, high = sk["action_space"][0], sk["action_space"][1]
        # atleast_1d wraps scalars and 0-d arrays into shape (1,).
        action = np.atleast_1d(np.clip(np.asarray(action), low, high))
    return env.step(action)

# ----------------------------
# Policy generation helper with rule-based constraint
# ----------------------------
def _action_constraints_text(static_knowledge: dict) -> str:
    a = static_knowledge["action_space"]
    if static_knowledge.get("action_type") == "discrete":
        return f"Discrete actions; valid actions are exactly the integers in {a}."
    else:
        lo, hi = a[0], a[1]
        return f"Continuous action; return a single float within [{lo}, {hi}]. Clip if necessary."

def _strip_code_fences(text):
    """Return LLM output without a surrounding Markdown code fence, if any."""
    if not isinstance(text, str):
        return ""
    if "```" not in text:
        return text
    fenced = text.split("```")[1]
    # Drop the optional language tag that follows the opening fence.
    tag, newline, rest = fenced.partition("\n")
    if newline and (not tag.strip() or tag.strip().isalnum()):
        fenced = rest
    return fenced


def _default_policy(sk):
    """Build the constant fallback policy: first action or range midpoint."""
    if sk["action_type"] == "discrete":
        action = sk["action_space"][0]
    else:
        lo, hi = sk["action_space"]
        action = (lo + hi) / 2.0

    def policy(state):
        return action

    policy.policy_source = "def policy(state):\n    return " + str(action)
    return policy


def _compile_policy(policy_code):
    """Exec LLM-written code and return its ``policy`` function, or None."""
    source = _strip_code_fences(policy_code)
    namespace = {"math": math, "np": np}
    try:
        # SECURITY: this executes model-generated code in-process with no
        # sandbox. Only run against a trusted endpoint.
        exec(source, namespace)
    except Exception as exc:
        print("Generated policy code could not be executed:", exc)
        return None
    policy_fn = namespace.get("policy")
    if not callable(policy_fn):
        return None
    try:
        # inspect.getsource cannot recover the text of exec-created functions,
        # so keep the source on the function for later edits.
        policy_fn.policy_source = source
    except AttributeError:
        pass
    return policy_fn


def _policy_source(policy_fn, sk):
    """Return the source text of ``policy_fn``, or of the fallback policy."""
    stored = getattr(policy_fn, "policy_source", None)
    if stored:
        return stored
    try:
        return inspect.getsource(policy_fn)
    except Exception:
        return _default_policy(sk).policy_source


def generate_base_policy(env_id, knowledge: "Knowledge"):
    """Ask the LLM for a fresh rule-based policy for ``env_id``.

    The prompt combines the static knowledge (state variables, physics,
    action constraints) with LLM-generated dynamic guidance.

    Returns:
        The ``policy`` function defined by the LLM's code. If that code
        fails to execute or defines no callable ``policy``, a constant
        fallback policy is returned instead (first discrete action, or the
        midpoint of the continuous range).
    """
    guidance = knowledge.get_dynamic_guidance(env_id) or ""
    doc_url = get_env_doc_url(env_id)
    sk = knowledge.static_knowledge
    action_desc = _action_constraints_text(sk)
    state_vars_text = "\n".join([f"- {name} in range {rng}" for name, rng in zip(sk["state_vars"], sk["state_ranges"])])
    physics_text = "\n".join([f"- {k}: {v}" for k,v in sk.get("physics", {}).items()])

    prompt = f"""
You are writing a deterministic, white-box **rule-based policy** for Gymnasium environment "{env_id}".
Focus on **environment principles, physics, and dynamics**, not superficial patterns.
The policy must be based on simple if-else statements or threshold comparisons using state variables.
Environment documentation: {doc_url}

Observation (state vector):
{state_vars_text}

Physics references:
{physics_text}

Action constraints:
- {action_desc}
- May import 'math' if needed
- Must be deterministic
- Do not use loops, functions, or external libraries except math
- Example: if state[2] > 0: return 1 else: return 0

Dynamic guidance:
{guidance}

Output requirements:
- Only one Python function: def policy(state): ...
- No explanations, no markdown, no print
- Returned action strictly satisfies constraints
"""
    policy_code = call_llm(prompt)
    return _compile_policy(policy_code) or _default_policy(sk)

def apply_edit(policy_fn, edit_text, knowledge: "Knowledge"):
    """Ask the LLM to revise ``policy_fn`` according to ``edit_text``.

    The current policy's source is taken from the text stored on the
    function when it was compiled, then from ``inspect.getsource``, and
    finally from the constant fallback policy.

    Returns:
        The revised policy function, or ``policy_fn`` unchanged when the
        LLM's code fails to execute or defines no callable ``policy``.
    """
    sk = knowledge.static_knowledge
    existing_src = _policy_source(policy_fn, sk)
    action_desc = _action_constraints_text(sk)

    # The environment id is recovered from the recorded dynamic entries;
    # an empty id maps to the generic documentation URL.
    env_id = next(
        (entry.get("env_id") for entry in knowledge.dynamic_knowledge if entry.get("env_id")),
        "",
    )
    doc_url = get_env_doc_url(env_id)
    prompt = f"""
Revise deterministic, **rule-based** policy for environment at {doc_url}.
Focus on physics, dynamics, and environment principles.
Constraints: {action_desc}
Current policy:
{existing_src}
Edit suggestion: {edit_text}
You may use 'math' module.
- Must remain if-else or threshold based

Output only a valid Python function def policy(state): ...
"""
    policy_code = call_llm(prompt)
    return _compile_policy(policy_code) or policy_fn

# ----------------------------
# Main closed-loop training
# ----------------------------
def run_env_loop(env_id, max_iters=10, episodes_per_iter=10, ma_window=3,
                 success_rate_threshold=0.8, rollback_window=3,
                 truncated_threshold=10):
    """Run the generate -> evaluate -> reflect -> edit loop for one environment.

    Args:
        env_id: Gymnasium environment id; must be a key of STATIC_KNOWLEDGE.
        max_iters: Maximum number of policy iterations.
        episodes_per_iter: Episodes rolled out per iteration.
        ma_window: Number of most recent episodes used for the metrics.
        success_rate_threshold: Minimum success rate required to record a
            best policy and to declare convergence.
        rollback_window: Number of consecutive iteration averages that must
            be strictly decreasing to trigger a rollback to the best policy.
        truncated_threshold: Number of consecutive iterations containing a
            truncated episode after which a new base policy is generated
            (not applied to "Pendulum-v1").
    """
    knowledge = Knowledge()
    knowledge.load_static_knowledge(env_id)
    memory = Memory()
    reflection = Reflection(knowledge)
    policy_version = 0
    policy_fn = None
    best_policy_fn = None
    best_metrics = {"avg_return": -np.inf, "success_rate": 0}
    recent_metrics_history = []
    consecutive_truncated = 0

    for iter_idx in range(max_iters):
        policy_version += 1
        print(f"=== Iteration {iter_idx+1} ===")

        if iter_idx == 0:
            policy_fn = generate_base_policy(env_id, knowledge)
        else:
            recent_episodes = memory.get_recent_episodes()
            suggestion = reflection.edit_suggestion(recent_episodes, env_id, iter_idx)
            policy_fn = apply_edit(policy_fn, suggestion, knowledge)

        env = gym.make(env_id)
        iteration_truncated = False
        try:
            for _ in range(episodes_per_iter):
                s, _info = env.reset()
                done = False
                memory.start_episode()

                while not done:
                    a = safe_policy_call(s, policy_fn, knowledge.static_knowledge)
                    s_next, r, terminated, truncated, info = safe_step(env, a)
                    if truncated:
                        iteration_truncated = True
                    done = terminated or truncated
                    memory.add_step(s, a, r, done)
                    s = s_next

                memory.add_episode_summary(env_id, policy_version)
        finally:
            # A new env is created every iteration; release it even on error.
            env.close()

        recent_episodes_ma = memory.get_recent_episodes(n=ma_window)
        metrics = reflection.metrics(recent_episodes_ma)
        print(f"Moving Avg Return: {metrics['avg_return']:.2f}, Success Rate: {metrics['success_rate']:.2f}")

        # Update failure pattern
        pattern = reflection.failure_pattern(recent_episodes_ma, env_id, iter_idx)
        print("Failure Pattern:", pattern)

        # Filter dynamic knowledge periodically
        knowledge.filter_dynamic_knowledge(consecutive_inactive=3)

        # Update best policy if improved
        if metrics["avg_return"] > best_metrics["avg_return"] and metrics["success_rate"] >= success_rate_threshold:
            best_metrics = metrics.copy()
            best_policy_fn = policy_fn

        recent_metrics_history.append(metrics["avg_return"])
        if len(recent_metrics_history) > rollback_window:
            recent_metrics_history.pop(0)

        # Check rollback: if the last rollback_window averages are strictly
        # decreasing, restore the best policy.
        if len(recent_metrics_history) == rollback_window:
            declining = all(
                later < earlier
                for earlier, later in zip(recent_metrics_history, recent_metrics_history[1:])
            )
            if declining:
                if best_policy_fn is not None:
                    print(f"Rollback triggered. Restoring previous best policy with Avg Return={best_metrics['avg_return']:.2f}")
                    policy_fn = best_policy_fn
                else:
                    # No policy has met the success threshold yet, so there
                    # is nothing to restore; replacing the current policy
                    # with None would discard it.
                    print("Rollback condition met, but no best policy is recorded yet. Keeping current policy.")

        # Check convergence based on moving average over ma_window
        if len(memory.episodes) >= ma_window * episodes_per_iter:
            if metrics["avg_return"] >= knowledge.static_knowledge.get("reward_threshold", 0) and metrics["success_rate"] >= success_rate_threshold:
                print(f"Converged! Stop training. Moving Avg Return={metrics['avg_return']:.2f}, Success Rate={metrics['success_rate']:.2f}")
                break

        # Truncated handling (skipped for Pendulum-v1): regenerate the policy
        # only after truncated_threshold consecutive truncated iterations.
        if env_id != "Pendulum-v1":
            if iteration_truncated:
                consecutive_truncated += 1
                if consecutive_truncated >= truncated_threshold:
                    print(f"Consecutive truncated >= {truncated_threshold}. Regenerating new policy via LLM.")
                    policy_fn = generate_base_policy(env_id, knowledge)
                    consecutive_truncated = 0
            else:
                consecutive_truncated = 0


# ----------------------------
# Run multiple control tasks
# ----------------------------
if __name__ == "__main__":
    env_list = [
        "Acrobot-v1",
        "CartPole-v1",
        "MountainCarContinuous-v0",
        "MountainCar-v0",
        "Pendulum-v1"
    ]
    for env_id in env_list:
        print(f"==== Running {env_id} ====")
        run_env_loop(env_id)


==== Running Acrobot-v1 ====
=== Iteration 1 ===
Moving Avg Return: -500.00, Success Rate: 0.00
Failure Pattern: The episode summaries for Acrobot-v1 show a consistent failure pattern: each episode ends with the minimum possible return of -500.0 and the maximum episode length of 500 steps. This indicates that the agent repeatedly fails to achieve the environment's goal (swinging the end-effector above a target height) within the allowed time. The uniformity suggests the policy is ineffective, likely resulting in repetitive or uninformative actions (e.g., always choosing the same action or random actions with no learning). State characteristics likely remain in low-energy or unproductive regions, with the agent unable to generate sufficient momentum. The return pattern confirms persistent failure without any sign of improvement or exploration.
=== Iteration 2 ===
Moving Avg Return: -500.00, Success Rate: 0.00
Failure Pattern: The episode summaries for Acrobot-v1 consistently show a retu