In [None]:
"""
Changes in this version:
1. Chinese to English: there seems to be little difference in reward.
"""
import os
import random
import re
import textwrap
import time

import gymnasium as gym
import numpy as np
import requests

# ----------------------------
# LLM HTTP call function
# ----------------------------
# Endpoint of the OpenAI-compatible chat-completions service used by call_llm.
LLM_URL = 'https://api.yesapikey.com/v1/chat/completions'

# SECURITY: the bearer token is read from the LLM_API_KEY environment variable
# so that no credential lives in source control. Any key that was previously
# committed in this file should be treated as leaked and rotated.
# If the variable is unset the header carries an empty token and the service
# is expected to reject the request.
LLM_HEADERS = {
    'Content-Type': 'application/json',
    'Authorization': f"Bearer {os.environ.get('LLM_API_KEY', '')}"
}

def call_llm(prompt, model="gpt-4.1-2025-04-14", temperature=0.7, max_tokens=1024,
             *, timeout=60, max_retries=None, retry_delay=2):
    """Send a single-turn chat request and return the reply text.

    The request is retried until a reply with non-empty ``content`` arrives.

    Args:
        prompt: User message sent as the only chat message.
        model: Model identifier forwarded to the service.
        temperature: Sampling temperature forwarded to the service.
        max_tokens: Completion token limit forwarded to the service.
        timeout: Seconds to wait for the HTTP request before treating the
            attempt as failed (without it a stalled connection blocks forever).
        max_retries: Maximum number of attempts; ``None`` (default) keeps
            retrying indefinitely, matching the historical behavior.
        retry_delay: Seconds to sleep between attempts.

    Returns:
        The ``content`` string of the first choice in the response.

    Raises:
        RuntimeError: Only when ``max_retries`` is set and every attempt failed.
    """
    data = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": temperature,
        "max_tokens": max_tokens
    }
    attempt = 0
    while True:
        attempt += 1
        try:
            response = requests.post(
                LLM_URL, json=data, headers=LLM_HEADERS, timeout=timeout
            )
            if response.status_code == 200:
                resp_json = response.json()
                if 'choices' in resp_json and resp_json['choices']:
                    content = resp_json['choices'][0].get('message', {}).get('content')
                    # A missing content field would surface as None in callers
                    # that immediately call .strip(); treat it as a failed attempt.
                    if content is not None:
                        return content
                print("LLM call returned no content, retrying")
            else:
                # Surface the status so persistent failures (e.g. bad
                # credentials) are visible instead of looping silently.
                print(f"LLM call failed with HTTP status {response.status_code}, retrying")
        except Exception as e:
            # Best-effort boundary: network errors, timeouts and malformed
            # payloads are all logged and retried.
            print("LLM call exception:", e)
        if max_retries is not None and attempt >= max_retries:
            raise RuntimeError(f"LLM call failed after {attempt} attempts")
        time.sleep(retry_delay)

# ----------------------------
# Knowledge module
# ----------------------------
class Knowledge:
    """Environment facts plus notes gathered while training.

    ``static_knowledge`` holds the fixed description of the selected
    environment; ``dynamic_knowledge`` is a list of entries appended by
    other components during the run.
    """

    def __init__(self):
        # Both stores start empty; load_static_knowledge fills the first one.
        self.static_knowledge, self.dynamic_knowledge = {}, []

    def load_static_knowledge(self, env_id):
        """Select the built-in description for ``env_id`` and reset the notes.

        Raises:
            ValueError: If ``env_id`` is not one of the supported environments.
        """
        # Built fresh on every call so each instance owns its own dict.
        supported = {
            "Acrobot-v1": {"state_dim": 6, "action_space": [0, 1, 2], "reward_threshold": -100},
            "CartPole-v1": {"state_dim": 4, "action_space": [0, 1], "reward_threshold": 475},
            "MountainCarContinuous-v0": {"state_dim": 2, "action_space": [-1.0, 1.0], "reward_threshold": 90},
            "MountainCar-v0": {"state_dim": 2, "action_space": [0, 1, 2], "reward_threshold": -110},
            "Pendulum-v1": {"state_dim": 3, "action_space": [-2.0, 2.0], "reward_threshold": -200},
        }
        spec = supported.get(env_id)
        if spec is None:
            raise ValueError("Unsupported environment")
        self.static_knowledge = spec
        # Notes from a previously loaded environment are discarded.
        self.dynamic_knowledge = []

    def add_dynamic_entry(self, entry):
        """Append one entry to the dynamic knowledge list."""
        self.dynamic_knowledge.append(entry)

    def get_dynamic_guidance(self, env_id):
        """Ask the LLM for heuristics based on the current dynamic knowledge.

        Returns whatever ``call_llm`` returns for the guidance prompt.
        """
        guidance_prompt = f"""
        I am generating a policy in environment {env_id}.
        Current known dynamic knowledge: {self.dynamic_knowledge}
        Please provide heuristic suggestions for policy generation based on this knowledge, such as:
        - State ranges to prioritize
        - Common failing action patterns
        - Recommended threshold adjustments
        Return in structured text.
        """
        return call_llm(guidance_prompt)

# ----------------------------
# Memory module
# ----------------------------
class Memory:
    """Stores every episode of a run as a list of steps plus a summary.

    Each element of ``episodes`` is a dict with a ``"steps"`` list and a
    ``"summary"`` that stays ``None`` until ``add_episode_summary`` is called.
    """

    def __init__(self):
        self.episodes = []

    def start_episode(self):
        """Open a new, empty episode; subsequent steps are recorded into it."""
        self.episodes.append({"steps": [], "summary": None})

    def add_step(self, s, a, r, done):
        """Record one transition in the most recently started episode.

        Raises:
            ValueError: If no episode has been started yet.
        """
        if not self.episodes:
            raise ValueError("Please call start_episode() before adding steps!")
        self.episodes[-1]["steps"].append({"s": s, "a": a, "r": r, "done": done})

    def add_episode_summary(self, env_id, policy_version):
        """Compute and store the summary of the most recent episode.

        The summary holds the environment id, the policy version, the sum of
        step rewards (``"return"``) and the number of steps (``"length"``).

        Raises:
            ValueError: If no episode has been started yet.
        """
        if not self.episodes:
            raise ValueError("No running episode!")
        steps = self.episodes[-1]["steps"]
        self.episodes[-1]["summary"] = {
            "env_id": env_id,
            "policy_version": policy_version,
            "return": sum(step["r"] for step in steps),
            "length": len(steps),
        }

    def get_recent_episodes(self, n=5):
        """Return the summaries of the last ``n`` summarized episodes.

        Episodes without a summary are skipped. A non-positive ``n`` yields an
        empty list.
        """
        # Guard explicitly: summaries[-0:] is the WHOLE list, and a negative n
        # would slice from the front instead of the back.
        if n <= 0:
            return []
        summaries = [ep["summary"] for ep in self.episodes if ep["summary"] is not None]
        return summaries[-n:]

# ----------------------------
# Reflection module
# ----------------------------
class Reflection:
    """Analyzes recent episodes and records the LLM's findings as knowledge."""

    def __init__(self, knowledge: "Knowledge"):
        # Shared store that receives every failure pattern / edit suggestion.
        self.knowledge = knowledge

    def metrics(self, recent_episodes):
        """Return the mean return and mean length of ``recent_episodes``.

        Args:
            recent_episodes: Iterable of summary dicts with ``"return"`` and
                ``"length"`` keys.

        Returns:
            ``{"avg_return": ..., "avg_length": ...}``; both are 0 when no
            episodes are given.
        """
        returns = [ep["return"] for ep in recent_episodes]
        lengths = [ep["length"] for ep in recent_episodes]
        avg_return = np.mean(returns) if returns else 0
        avg_length = np.mean(lengths) if lengths else 0
        return {"avg_return": avg_return, "avg_length": avg_length}

    def failure_pattern(self, recent_episodes, env_id):
        """Ask the LLM for the dominant failure patterns and store the answer.

        Returns:
            The stripped LLM reply (an empty string if the LLM returned
            nothing), also appended to the knowledge store under
            ``"failure_pattern"``.
        """
        prompt = f"""
        I have the following {env_id} environment episode data: {recent_episodes}
        Please analyze the most common failure patterns in detail, including state characteristics, action issues, and return patterns. Focus on the key points, minimize unimportant parts.
        """
        # call_llm may hand back None when the reply has no content; fall back
        # to an empty string instead of crashing on .strip().
        pattern = (call_llm(prompt) or "").strip()
        self.knowledge.add_dynamic_entry({"failure_pattern": pattern})
        return pattern

    def edit_suggestion(self, recent_episodes, env_id):
        """Ask the LLM for a structured policy edit and store the answer.

        Returns:
            The stripped LLM reply (an empty string if the LLM returned
            nothing), also appended to the knowledge store under
            ``"edit_suggestion"``.
        """
        prompt = f"""
        Based on the recent episode data from environment {env_id}: {recent_episodes}
        Generate a policy editing suggestion. The format must be:
        - add_rule(condition -> action)
        - modify_threshold(variable, old_value, new_value)
        - reprioritize(rule_i over rule_j)
        """
        # Same None guard as in failure_pattern.
        suggestion = (call_llm(prompt) or "").strip()
        self.knowledge.add_dynamic_entry({"edit_suggestion": suggestion})
        return suggestion

# ----------------------------
# Policy generation and refinement
# ----------------------------
# Matches a Markdown code fence (optionally tagged, e.g. ```python) and
# captures the code between the opening and closing fence.
_CODE_FENCE_RE = re.compile(r"```[A-Za-z0-9_+-]*[ \t]*\r?\n(.*?)```", re.DOTALL)

# Shown to the LLM when the current policy carries no recorded source.
_PLACEHOLDER_POLICY_SOURCE = 'def policy(state): pass'


def _extract_policy_source(llm_text):
    """Return the Python source contained in an LLM reply ('' if none)."""
    if not llm_text:
        return ""
    fenced_blocks = _CODE_FENCE_RE.findall(llm_text)
    if fenced_blocks:
        # Prefer the fenced block that actually defines the policy.
        source = next(
            (block for block in fenced_blocks if "def policy" in block),
            fenced_blocks[0],
        )
    else:
        source = llm_text
    return textwrap.dedent(source).strip()


def _compile_policy(llm_text):
    """Turn an LLM reply into a callable ``policy`` function, or None on failure."""
    source = _extract_policy_source(llm_text)
    if not source:
        return None
    # A single namespace serves as both globals and locals so that imports and
    # helpers defined at the top of the generated code remain visible inside
    # ``policy`` when it is called later.
    namespace = {}
    try:
        # SECURITY: this executes model-generated code with the full
        # privileges of this process. Only run against a trusted endpoint,
        # ideally inside a sandbox.
        exec(source, namespace)
    except Exception as exc:
        print("Generated policy code could not be executed:", exc)
        return None
    policy_fn = namespace.get("policy")
    if not callable(policy_fn):
        return None
    # Keep the source with the function so later edits can show it to the LLM.
    policy_fn.policy_source = source
    return policy_fn


def _random_policy(knowledge):
    """Build a fallback policy that picks uniformly from the listed actions."""
    def policy_fn(state):
        return random.choice(knowledge.static_knowledge['action_space'])

    # Standalone source equivalent to the fallback, for use in edit prompts.
    policy_fn.policy_source = (
        "import random\n\n"
        "def policy(state):\n"
        f"    return random.choice({knowledge.static_knowledge['action_space']!r})"
    )
    return policy_fn


def generate_base_policy(env_id, knowledge: "Knowledge"):
    """Ask the LLM for an initial ``policy(state)`` function and compile it.

    Args:
        env_id: Environment identifier placed in the prompt.
        knowledge: Knowledge store supplying the action space and guidance.

    Returns:
        The compiled policy function. If the reply contains no usable
        ``policy`` definition, a random policy over
        ``knowledge.static_knowledge['action_space']`` is returned instead.
    """
    guidance = knowledge.get_dynamic_guidance(env_id)
    prompt = f"""
    Generate an initial executable white-box policy function in Python code:
    def policy(state): ...
    Environment: {env_id}
    Action space: {knowledge.static_knowledge['action_space']}
    Guidance: {guidance}
    """
    policy_fn = _compile_policy(call_llm(prompt))
    if policy_fn is None:
        return _random_policy(knowledge)
    return policy_fn


def apply_edit(policy_fn, edit_text, knowledge: "Knowledge"):
    """Use LLM to rewrite the policy function according to edit_text.

    Args:
        policy_fn: Current policy; its recorded source (if any) is shown to
            the LLM.
        edit_text: Editing suggestion to apply.
        knowledge: Unused; kept for interface compatibility.

    Returns:
        The newly compiled policy, or ``policy_fn`` unchanged when the reply
        contains no usable ``policy`` definition.
    """
    # Interpolating policy_fn.__code__ only yields "<code object ...>", so the
    # source text recorded at compile time is used instead.
    current_source = (
        getattr(policy_fn, "policy_source", None) or _PLACEHOLDER_POLICY_SOURCE
    )
    # Indent continuation lines to line up with the prompt's own indentation.
    shown_source = textwrap.indent(current_source, "    ").lstrip()
    code_prompt = f"""
    Existing policy function:
    {shown_source}
    Editing suggestion: {edit_text}
    Please output a complete executable Python function: def policy(state): ...
    """
    new_policy_fn = _compile_policy(call_llm(code_prompt))
    if new_policy_fn is None:
        return policy_fn
    return new_policy_fn

# ----------------------------
# Main closed-loop training logic
# ----------------------------
def run_env_loop(env_id, max_iters=50, episodes_per_iter=5):
    """Run the generate -> evaluate -> reflect -> edit loop for one environment.

    Each iteration obtains a policy (generated on the first iteration, edited
    by the LLM afterwards), rolls it out for ``episodes_per_iter`` episodes,
    prints the average return and the LLM's failure analysis, and stops once
    the average return reaches the environment's ``reward_threshold``.

    Args:
        env_id: Gymnasium environment id; must be known to ``Knowledge``.
        max_iters: Maximum number of policy iterations.
        episodes_per_iter: Episodes rolled out (and evaluated) per iteration.
    """
    knowledge = Knowledge()
    knowledge.load_static_knowledge(env_id)
    memory = Memory()
    reflection = Reflection(knowledge)

    policy_fn = None

    # One environment serves the whole run and is closed on every exit path.
    env = gym.make(env_id)
    # Box (continuous) action spaces expect an array of the space's shape;
    # policies commonly return a bare float instead.
    needs_array_action = isinstance(env.action_space, gym.spaces.Box)
    try:
        for iter_idx in range(max_iters):
            policy_version = iter_idx + 1

            if iter_idx == 0:
                policy_fn = generate_base_policy(env_id, knowledge)
            else:
                # Refine policy using exactly the previous iteration's episodes.
                recent_episodes = memory.get_recent_episodes(episodes_per_iter)
                suggestion = reflection.edit_suggestion(recent_episodes, env_id)
                policy_fn = apply_edit(policy_fn, suggestion, knowledge)

            for _ in range(episodes_per_iter):
                s, _ = env.reset()
                done = False
                memory.start_episode()

                while not done:
                    a = policy_fn(s)
                    if needs_array_action:
                        a = np.asarray(a, dtype=env.action_space.dtype).reshape(
                            env.action_space.shape
                        )
                    s_next, r, terminated, truncated, _ = env.step(a)
                    done = terminated or truncated
                    memory.add_step(s, a, r, done)
                    s = s_next
                memory.add_episode_summary(env_id, policy_version)

            # Evaluate on this iteration's episodes only, whatever their count.
            recent_episodes = memory.get_recent_episodes(episodes_per_iter)
            metrics = reflection.metrics(recent_episodes)
            print(f"Iteration {iter_idx+1}, Avg Return: {metrics['avg_return']}")

            pattern = reflection.failure_pattern(recent_episodes, env_id)
            print("Failure Pattern:", pattern)

            threshold = knowledge.static_knowledge.get("reward_threshold", 0)
            if metrics["avg_return"] >= threshold:
                print(f"Convergence reached, stopping training, Avg Return={metrics['avg_return']}")
                break
    finally:
        env.close()

# ----------------------------
# Run the control tasks (Acrobot-v1 is currently commented out)
# ----------------------------
if __name__ == "__main__":
    # Environments are processed one after another, in the order listed.
    # Acrobot-v1 is disabled; uncomment its entry to include it in the run.
    env_list = [
        #"Acrobot-v1",
        "CartPole-v1", "MountainCarContinuous-v0",
        "MountainCar-v0", "Pendulum-v1",
    ]
    for env_id in env_list:
        # Banner separating the output of each environment's training run.
        print(f"==== Running {env_id} ====")
        run_env_loop(env_id)


==== Running CartPole-v1 ====
