In [1]:
"""
Enhanced Closed-loop RL:
- Policy Pool + UCB bandit selection
- Parallel multi-candidate policy generation & evaluation
- Safe 1-step MCTS (uses clone_state/restore_state if available)
- Truncated handling: Acrobot/Pendulum won't trigger regeneration

环境	                    核心问题
Acrobot-v1	                swing-up 动作协调差，容易被低能量摆动卡住；truncated 常触发
CartPole-v1	                已收敛，但对 edge-case 仍可能 early failure；truncated 事件较少
MountainCarContinuous-v0	完成任务，但动作效率不高，略显保守
MountainCar-v0	            动能累积不足，policy oscillation，探索不足
Pendulum-v1	                torque 不够，动作精度差，策略不连续，失败频繁


文件/函数	修改建议
generate_policy()	1. 在 rule 生成时加入 ε-greedy 或 Gaussian noise。
2. 连续环境动作幅度自适应。
Memory	1. 存储源码字符串 + 可解释规则。
2. 增加权重机制，定期清理老旧策略。
apply_edit()	1. 增加 truncated 智能处理：若 episode truncated → 调整 ε 或动作幅度，而不是直接替换 policy。
iteration loop	1. 延迟更新 best_policy。
2. MA return 收敛条件加入 SR > threshold 条件。
failure_pattern_analysis()	1. 增加对“动作幅度不足 / oscillation / momentum 不够”的识别。
"""

import inspect
import itertools
import linecache
import math
import os
import random
import re
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

import gymnasium as gym
import numpy as np
import requests

# ----------------------------
# LLM HTTP call function
# ----------------------------
LLM_URL = 'https://api.yesapikey.com/v1/chat/completions'
LLM_HEADERS = {
    'Content-Type': 'application/json',
    'Authorization': 'Bearer sk-nNlzbvBIDhXdy8i02f95Ac27Db33490bAbFc66E725C2B9E9'
}

def call_llm(prompt, model="gpt-4.1-2025-04-14", temperature=0.2, max_tokens=1024):
    data = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": temperature,
        "max_tokens": max_tokens
    }
    while True:
        try:
            response = requests.post(LLM_URL, json=data, headers=LLM_HEADERS, timeout=60)
            if response.status_code == 200:
                resp_json = response.json()
                if 'choices' in resp_json and resp_json['choices']:
                    content = resp_json['choices'][0].get('message', {}).get('content')
                    return content
            else:
                print("[LLM HTTP]", response.status_code, response.text[:200])
        except Exception as e:
            print("[LLM Exception]", e)
        time.sleep(2)

# ----------------------------
# Environment documentation mapping
# ----------------------------
ENV_DOC_URL = {
    "Acrobot-v1": "https://gymnasium.farama.org/environments/classic_control/acrobot/",
    "CartPole-v1": "https://gymnasium.farama.org/environments/classic_control/cart_pole/",
    "MountainCarContinuous-v0": "https://gymnasium.farama.org/environments/classic_control/mountain_car_continuous/",
    "MountainCar-v0": "https://gymnasium.farama.org/environments/classic_control/mountain_car/",
    "Pendulum-v1": "https://gymnasium.farama.org/environments/classic_control/pendulum/"
}

def get_env_doc_url(env_id: str) -> str:
    return ENV_DOC_URL.get(env_id, "https://gymnasium.farama.org/")

# ----------------------------
# Static Knowledge
# ----------------------------
STATIC_KNOWLEDGE = {
    "CartPole-v1": {
        "state_dim": 4,
        "state_vars": ["cart_position", "cart_velocity", "pole_angle", "pole_velocity"],
        "state_ranges": [(-4.8, 4.8), (-float("inf"), float("inf")), (-0.418, 0.418), (-float("inf"), float("inf"))],
        "action_space": [0, 1],
        "reward_threshold": 475,
        "action_type": "discrete"
    },
    "Acrobot-v1": {
        "state_dim": 6,
        "state_vars": ["cos_theta1", "sin_theta1", "cos_theta2", "sin_theta2", "theta1_dot", "theta2_dot"],
        "state_ranges": [(-1, 1), (-1, 1), (-1, 1), (-1, 1), (-float("inf"), float("inf")), (-float("inf"), float("inf"))],
        "action_space": [0, 1, 2],
        "reward_threshold": -100,
        "action_type": "discrete"
    },
    "MountainCar-v0": {
        "state_dim": 2,
        "state_vars": ["position", "velocity"],
        "state_ranges": [(-1.2, 0.6), (-0.07, 0.07)],
        "action_space": [0, 1, 2],
        "reward_threshold": -110,
        "action_type": "discrete"
    },
    "MountainCarContinuous-v0": {
        "state_dim": 2,
        "state_vars": ["position", "velocity"],
        "state_ranges": [(-1.2, 0.6), (-0.07, 0.07)],
        "action_space": [-1.0, 1.0],
        "reward_threshold": 90,
        "action_type": "continuous"
    },
    "Pendulum-v1": {
        "state_dim": 3,
        "state_vars": ["cos_theta", "sin_theta", "theta_dot"],
        "state_ranges": [(-1, 1), (-1, 1), (-float("inf"), float("inf"))],
        "action_space": [-2.0, 2.0],
        "reward_threshold": -200,
        "action_type": "continuous"
    }
}

# ----------------------------
# Knowledge module
# ----------------------------
class Knowledge:
    def __init__(self):
        self.static_knowledge = {}
        self.dynamic_knowledge = []

    def load_static_knowledge(self, env_id):
        if env_id not in STATIC_KNOWLEDGE:
            raise ValueError("Unsupported environment")
        self.static_knowledge = STATIC_KNOWLEDGE[env_id]
        self.dynamic_knowledge = []

    def add_dynamic_entry(self, entry):
        self.dynamic_knowledge.append(entry)

    def get_dynamic_guidance(self, env_id):
        prompt = f"""
I am generating a policy in environment {env_id}.
Current dynamic knowledge entries: {self.dynamic_knowledge}

Focus on environment **principles, physics, and dynamics**, not superficial patterns.
Please provide concise heuristic suggestions for policy generation based on this knowledge, such as:
- State ranges to prioritize
- Common failing action patterns
- Recommended threshold adjustments

Return a short, structured bullet list (no prose).
"""
        guidance = call_llm(prompt)
        return guidance

# ----------------------------
# Memory module
# ----------------------------
class Memory:
    def __init__(self):
        self.episodes = []

    def start_episode(self):
        self.episodes.append({"steps": [], "summary": None})

    def add_step(self, s, a, r, done):
        if not self.episodes:
            raise ValueError("Please call start_episode() before adding steps!")
        self.episodes[-1]["steps"].append({"s": s, "a": a, "r": r, "done": done})

    def add_episode_summary(self, env_id, policy_version):
        if not self.episodes:
            raise ValueError("No running episode!")
        steps = self.episodes[-1]["steps"]
        total_reward = sum(step["r"] for step in steps)
        length = len(steps)
        self.episodes[-1]["summary"] = {
            "env_id": env_id,
            "policy_version": policy_version,
            "return": total_reward,
            "length": length
        }

    def get_recent_episodes(self, n=5):
        summaries = [ep["summary"] for ep in self.episodes if ep["summary"] is not None]
        return summaries[-n:]

# ----------------------------
# Reflection module
# ----------------------------
class Reflection:
    def __init__(self, knowledge: Knowledge):
        self.knowledge = knowledge

    def metrics(self, recent_episodes):
        returns = [ep["return"] for ep in recent_episodes]
        lengths = [ep["length"] for ep in recent_episodes]
        avg_return = np.mean(returns) if returns else 0.0
        avg_length = np.mean(lengths) if lengths else 0.0
        threshold = self.knowledge.static_knowledge.get("reward_threshold", 0.0)
        success_count = sum(1 for ep in recent_episodes if ep["return"] >= threshold)
        success_rate = (success_count / len(recent_episodes)) if recent_episodes else 0.0
        return {"avg_return": float(avg_return), "avg_length": float(avg_length), "success_rate": float(success_rate)}

    def failure_pattern(self, recent_episodes, env_id):
        prompt = f"""
I have the following {env_id} environment episode summaries: {recent_episodes}
Please analyze the most common failure patterns, including state characteristics, action issues, and return patterns. Focus only on key points.
Return a concise paragraph.
"""
        pattern = call_llm(prompt).strip()
        self.knowledge.add_dynamic_entry({"env_id": env_id, "failure_pattern": pattern})
        return pattern

    def edit_suggestion(self, recent_episodes, env_id):
        prompt = f"""
Based on recent episode data from environment {env_id}: {recent_episodes}
Generate one policy editing suggestion in one of the following formats:
- add_rule(condition -> action)
- modify_threshold(variable, old_value, new_value)
- reprioritize(rule_i over rule_j)

Return exactly one line with one edit.
"""
        suggestion = call_llm(prompt).strip()
        self.knowledge.add_dynamic_entry({"env_id": env_id, "edit_suggestion": suggestion})
        return suggestion

# ----------------------------
# Policy pool for multi-strategy search
# ----------------------------
class PolicyPool:
    def __init__(self, max_size=5):
        # each: {"fn": callable, "version": int, "metrics": dict|None}
        self.policies = []
        self.max_size = max_size
        self.counts = []  # pulls for UCB
        self.values = []  # mean value for UCB (moving average)

    def add_policy(self, policy_fn, version, metrics=None):
        entry = {"fn": policy_fn, "version": version, "metrics": metrics}
        if len(self.policies) < self.max_size:
            self.policies.append(entry)
            self.counts.append(0)
            self.values.append(0.0)
        else:
            idx = self.get_worst_policy_idx()
            self.policies[idx] = entry
            self.counts[idx] = 0
            self.values[idx] = 0.0

    def select_policy_ucb(self, c=1.0):
        if not self.policies:
            raise RuntimeError("PolicyPool is empty")
        total_counts = sum(self.counts) + 1
        ucb_scores = []
        for i in range(len(self.policies)):
            if self.counts[i] == 0:
                ucb_scores.append(float('inf'))
            else:
                ucb_scores.append(self.values[i] + c * math.sqrt(math.log(total_counts)/self.counts[i]))
        idx = int(np.argmax(ucb_scores))
        self.counts[idx] += 1
        return self.policies[idx]["fn"], idx

    def update_policy_value(self, idx, reward):
        n = self.counts[idx]
        if n <= 0:
            self.values[idx] = reward
        else:
            # incremental mean
            self.values[idx] = ((n-1)/n)*self.values[idx] + (1/n)*reward

    def get_worst_policy_idx(self):
        # define worst by avg_return in metrics; if None -> very bad
        vals = []
        for p in self.policies:
            if p["metrics"] and "avg_return" in p["metrics"]:
                vals.append(p["metrics"]["avg_return"])
            else:
                vals.append(-float('inf'))
        return int(np.argmin(vals))

# ----------------------------
# Safe step wrapper
# ----------------------------
def safe_step(env, action):
    sk = STATIC_KNOWLEDGE[env.unwrapped.spec.id]
    if sk["action_type"] == "continuous":
        lo, hi = sk["action_space"]
        if np.isscalar(action):
            action = np.array([np.clip(action, lo, hi)], dtype=np.float32)
        else:
            action = np.clip(np.array(action, dtype=np.float32), lo, hi)
    return env.step(action)

# ----------------------------
# Safe policy call with optional 1-step MCTS
# ----------------------------
def safe_policy_call(state, policy_fn, sk, env):
    """
    - Calls rule policy to propose action.
    - For discrete envs, if env supports clone_state/restore_state, do 1-step lookahead
      over all actions & pick best immediate reward (does NOT pollute real trajectory).
    - If clone_state is not available, skip lookahead.
    """
    # Base action from rule
    try:
        a = policy_fn(state)
    except Exception as e:
        if sk["action_type"] == "discrete":
            a = random.choice(sk["action_space"])
        else:
            lo, hi = sk["action_space"]
            a = 0.5 * (lo + hi)

    # Clip for continuous now (final safety also happens in safe_step)
    if sk["action_type"] == "continuous":
        lo, hi = sk["action_space"]
        a = float(np.clip(a, lo, hi))
        return a

    # Discrete: try 1-step lookahead with clone_state
    if sk["action_type"] == "discrete":
        uw = getattr(env, "unwrapped", env)
        can_clone = hasattr(uw, "clone_state") and hasattr(uw, "restore_state")
        if not can_clone:
            return a  # graceful fallback

        best_a, best_r = a, -float('inf')
        try:
            snapshot = uw.clone_state()
        except Exception:
            return a
        for cand in sk["action_space"]:
            try:
                # step then restore
                obs2, r, term, trunc, _ = safe_step(env, cand)
                # restore to snapshot for next branch
                uw.restore_state(snapshot)
                # purely 1-step score
                if r > best_r:
                    best_r, best_a = r, cand
            except Exception:
                # if any branch fails, ignore that candidate
                try:
                    uw.restore_state(snapshot)
                except Exception:
                    pass
                continue
        return best_a

    return a

# ----------------------------
# Policy generation helper with rule-based constraint
# ----------------------------
def _action_constraints_text(static_knowledge: dict) -> str:
    a = static_knowledge["action_space"]
    if static_knowledge.get("action_type") == "discrete":
        return f"Discrete actions; valid actions are exactly the integers in {a}."
    else:
        lo, hi = a[0], a[1]
        return f"Continuous action; return a single float within [{lo}, {hi}]. Clip if necessary."

def generate_rule_policy_code(env_id, knowledge: Knowledge):
    sk = knowledge.static_knowledge
    guidance = knowledge.get_dynamic_guidance(env_id) or ""
    doc_url = get_env_doc_url(env_id)
    action_desc = _action_constraints_text(sk)
    state_vars_text = "\n".join([f"- {name} in range {rng}" for name, rng in zip(sk["state_vars"], sk["state_ranges"])])

    prompt = f"""
You are writing a deterministic, white-box **rule-based policy** for Gymnasium environment "{env_id}".
Focus on **environment principles, physics, and dynamics**, not superficial patterns.
The policy must be based on simple if-else statements or threshold comparisons using state variables.
Environment documentation: {doc_url}

Observation (state vector):
{state_vars_text}

Action constraints:
- {action_desc}
- May import 'math' if needed
- Must be deterministic
- Do not use loops, functions, or external libraries except math
- Example (discrete): if state[2] > 0: return 1 else: return 0
- Example (continuous): return max(min(k1*state[1]-k2*state[0], hi), lo)

Dynamic guidance:
{guidance}

Output requirements:
- Only one Python function: def policy(state): ...
- No explanations, no markdown, no print
- Returned action strictly satisfies constraints
"""
    code = call_llm(prompt)
    return code

def compile_policy_or_default(code, sk):
    local_vars = {"math": math, "np": np}
    try:
        exec(code, local_vars)
        policy_fn = local_vars.get("policy")
        if policy_fn is None:
            raise ValueError("No function 'policy' found")
        return policy_fn
    except Exception:
        if sk["action_type"] == "discrete":
            def policy(state): return sk["action_space"][0]
        else:
            lo, hi = sk["action_space"]
            def policy(state): return 0.5 * (lo + hi)
        return policy

def generate_base_policies(env_id, knowledge: Knowledge, n_candidates=3):
    """Return list[policy_fn] of length n_candidates."""
    sk = knowledge.static_knowledge
    fns = []
    for _ in range(n_candidates):
        code = generate_rule_policy_code(env_id, knowledge)
        fns.append(compile_policy_or_default(code, sk))
    return fns

def apply_edit(policy_fn, edit_text, knowledge: Knowledge, env_id: str):
    """Ask LLM to edit the current rule policy."""
    sk = knowledge.static_knowledge
    try:
        existing_src = inspect.getsource(policy_fn)
    except Exception:
        if sk["action_type"] == "discrete":
            existing_src = "def policy(state):\n    return " + str(sk["action_space"][0])
        else:
            lo, hi = sk["action_space"]
            existing_src = "def policy(state):\n    return " + str(0.5*(lo+hi))

    action_desc = _action_constraints_text(sk)
    doc_url = get_env_doc_url(env_id)

    prompt = f"""
Revise deterministic, **rule-based** policy for environment at {doc_url}.
Focus on physics, dynamics, and environment principles.
Constraints: {action_desc}
Current policy:
{existing_src}

Edit suggestion: {edit_text}

Rules:
- Keep it deterministic and rule-based (if-else / threshold)
- Only output a single valid Python function: def policy(state): ...
- You may use 'math'
"""
    code = call_llm(prompt)
    return compile_policy_or_default(code, sk)

# ----------------------------
# Evaluation helpers
# ----------------------------
def eval_policy_once(env_id, policy_fn, episodes=5, use_mcts=True, no_trunc_reset_for=None):
    """
    Evaluate a policy for given env_id.
    - use_mcts: if True, safe_policy_call attempts 1-step lookahead when supported
    - no_trunc_reset_for: set of env_ids that won't trigger regeneration on truncated (here just info)
    Returns dict: {"avg_return":..., "avg_length":..., "success_rate":...}
    """
    knowledge = Knowledge()
    knowledge.load_static_knowledge(env_id)
    sk = knowledge.static_knowledge
    mem = Memory()

    env = gym.make(env_id)
    trunc_sensitive = (env_id not in (no_trunc_reset_for or set()))

    for _ in range(episodes):
        s, _ = env.reset()
        done = False
        mem.start_episode()
        while not done:
            if use_mcts:
                a = safe_policy_call(s, policy_fn, sk, env)
            else:
                # plain policy call
                try:
                    a = policy_fn(s)
                except Exception:
                    if sk["action_type"] == "discrete":
                        a = random.choice(sk["action_space"])
                    else:
                        lo, hi = sk["action_space"]
                        a = 0.5 * (lo + hi)
            s_next, r, terminated, truncated, info = safe_step(env, a)
            done = terminated or truncated
            # truncated info retained—外层会根据需求决定是否重置策略，这里仅记录
            mem.add_step(s, a, r, done)
            s = s_next
        mem.add_episode_summary(env_id, policy_version=0)

    env.close()
    refl = Reflection(knowledge)
    return refl.metrics(mem.get_recent_episodes(n=episodes))

def parallel_eval_candidates(env_id, policy_fns, episodes_each=5):
    """
    Parallel evaluate candidate policies; return list of (policy_fn, metrics)
    """
    results = []
    no_trunc_reset_for = {"Acrobot-v1", "Pendulum-v1"}
    with ThreadPoolExecutor(max_workers=min(len(policy_fns), 3)) as ex:
        fut2fn = {
            ex.submit(eval_policy_once, env_id, fn, episodes_each, True, no_trunc_reset_for): fn
            for fn in policy_fns
        }
        for fut in as_completed(fut2fn):
            fn = fut2fn[fut]
            try:
                metrics = fut.result()
            except Exception as e:
                metrics = {"avg_return": -1e9, "avg_length": 0.0, "success_rate": 0.0}
                print("[Parallel Eval Exception]", e)
            results.append((fn, metrics))
    return results

# ----------------------------
# Main closed-loop training with Policy Pool
# ----------------------------
def run_env_loop(env_id, max_iters=10, episodes_per_iter=10, ma_window=3,
                 success_rate_threshold=0.8, pool_size=5, n_init_candidates=3):
    knowledge = Knowledge()
    knowledge.load_static_knowledge(env_id)
    memory = Memory()
    reflection = Reflection(knowledge)
    policy_version = 0
    policy_pool = PolicyPool(max_size=pool_size)

    # === Initial multi-candidate generation + parallel evaluation ===
    init_fns = generate_base_policies(env_id, knowledge, n_candidates=n_init_candidates)
    evaluated = parallel_eval_candidates(env_id, init_fns, episodes_each=max(2, episodes_per_iter//2))
    # pick top by avg_return to fill the pool (and keep all if pool bigger)
    evaluated.sort(key=lambda t: t[1]["avg_return"], reverse=True)
    for fn, m in evaluated[:pool_size]:
        policy_version += 1
        policy_pool.add_policy(fn, policy_version, metrics=m)

    # === Iterative loop ===
    for iter_idx in range(max_iters):
        print(f"=== Iteration {iter_idx+1} ===")

        # Select policy via UCB
        policy_fn, idx = policy_pool.select_policy_ucb()

        # Run selected policy for this iteration
        env = gym.make(env_id)
        iteration_returns = []
        truncated_seen = False
        for ep in range(episodes_per_iter):
            s, _ = env.reset()
            done = False
            memory.start_episode()
            while not done:
                a = safe_policy_call(s, policy_fn, knowledge.static_knowledge, env)
                s_next, r, terminated, truncated, info = safe_step(env, a)
                done = terminated or truncated
                if truncated and env_id not in ["Acrobot-v1", "Pendulum-v1"]:
                    truncated_seen = True
                memory.add_step(s, a, r, done)
                s = s_next
            memory.add_episode_summary(env_id, policy_version=policy_pool.policies[idx]["version"])
            iteration_returns.append(memory.episodes[-1]["summary"]["return"])
        env.close()

        # Compute moving metrics
        recent_ma = memory.get_recent_episodes(n=ma_window)
        metrics = reflection.metrics(recent_ma)
        policy_pool.policies[idx]["metrics"] = metrics
        policy_pool.update_policy_value(idx, metrics["avg_return"])

        print(f"[Selected idx {idx} v{policy_pool.policies[idx]['version']}] "
              f"MA Return={metrics['avg_return']:.2f}  SR={metrics['success_rate']:.2f}")

        # Reflection diagnostics & knowledge update
        pattern = reflection.failure_pattern(recent_ma, env_id)
        print("Failure Pattern:", pattern)
        edit = reflection.edit_suggestion(recent_ma, env_id)
        print("Edit Suggestion:", edit)

        # === Eliminate worst & refill with an improved/new policy ===
        worst_idx = policy_pool.get_worst_policy_idx()
        # Generate 3 new candidates (edited from current policy as seed for diversity)
        seed_fn = policy_fn
        # Try one edit from seed, plus 2 fresh policies
        new_candidates = []
        try:
            edited_fn = apply_edit(seed_fn, edit, knowledge, env_id)
            new_candidates.append(edited_fn)
        except Exception:
            pass
        fresh = generate_base_policies(env_id, knowledge, n_candidates=2)
        new_candidates.extend(fresh)

        # Evaluate candidates in parallel and pick the best to insert
        cand_eval = parallel_eval_candidates(env_id, new_candidates, episodes_each=max(2, episodes_per_iter//2))
        cand_eval.sort(key=lambda t: t[1]["avg_return"], reverse=True)
        best_new_fn, best_new_metrics = cand_eval[0]
        policy_version += 1
        policy_pool.policies[worst_idx] = {"fn": best_new_fn, "version": policy_version, "metrics": best_new_metrics}
        policy_pool.counts[worst_idx] = 0
        policy_pool.values[worst_idx] = 0.0
        print(f"Replaced worst idx {worst_idx} with new v{policy_version}: avg_return={best_new_metrics['avg_return']:.2f}")

        # === Convergence check on moving average ===
        if len(memory.episodes) >= ma_window * episodes_per_iter:
            thr = knowledge.static_knowledge.get("reward_threshold", 0.0)
            if metrics["avg_return"] >= thr and metrics["success_rate"] >= success_rate_threshold:
                print(f"Converged! MA Return={metrics['avg_return']:.2f}, SR={metrics['success_rate']:.2f}")
                break

        # Note: truncated_seen no longer regenerates for Acrobot/Pendulum by design
        if truncated_seen and env_id not in ["Acrobot-v1", "Pendulum-v1"]:
            print("[Info] Truncated observed (non-exempt env) — handled via pool search (no hard reset).")

# ----------------------------
# Run multiple control tasks
# ----------------------------
if __name__ == "__main__":
    env_list = [
        "Acrobot-v1",
        "CartPole-v1",
        "MountainCarContinuous-v0",
        "MountainCar-v0",
        "Pendulum-v1"
    ]
    for env_id in env_list:
        print(f"==== Running {env_id} ====")
        run_env_loop(env_id, max_iters=10, episodes_per_iter=8, ma_window=3,
                     success_rate_threshold=0.8, pool_size=5, n_init_candidates=3)


==== Running Acrobot-v1 ====
=== Iteration 1 ===
[Selected idx 0 v1] MA Return=-70.00  SR=1.00
Failure Pattern: The episode summaries for Acrobot-v1 show consistently negative returns (-72.0 and -66.0) and relatively short episode lengths (67–73 steps), indicating that the agent frequently fails to achieve the task objective, likely falling short of swinging the end-effector to the target height. The repeated returns and lengths suggest the agent is stuck in similar suboptimal behaviors, possibly due to poor exploration or ineffective action selection, such as failing to generate sufficient momentum or getting trapped in local oscillations. The lack of variation in outcomes points to a policy that does not adapt well to different state characteristics, leading to repeated early terminations and limited progress toward solving the environment.
Edit Suggestion: modify_threshold(length, 73, 67)
Replaced worst idx 2 with new v4: avg_return=-500.00
=== Iteration 2 ===
[Selected idx 1 v2] MA