In [1]:
import ale_py  # Ensure Atari environment is registered
from stable_baselines3.common.env_util import make_atari_env
from stable_baselines3.common.vec_env import VecFrameStack, VecTransposeImage
from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy

# ===== Configuration =====
ENV_ID      = "SeaquestNoFrameskip-v4"
MODEL_PATH  = "logs/dqn/SeaquestNoFrameskip-v4_1/best_model.zip"
SEED        = 0
EVAL_EPISODES = 5
# ==========================

# 1) Create Atari environment with preprocessing
env = make_atari_env(
    ENV_ID,
    n_envs=1,
    seed=SEED
)
env = VecFrameStack(env, n_stack=4)
env = VecTransposeImage(env)

# 4) Load the trained DQN model with minimal buffer
model = DQN.load(
    MODEL_PATH,
    env=env,
    buffer_size=1,
    custom_objects={
        "learning_rate": 0.0,
        "lr_schedule": lambda _: 0.0,
        "exploration_schedule": lambda _: 0.0,
    },
)
print("✅ Model loaded successfully.")

obs = env.reset()
done = False

rewards = []

while True:
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    rewards.append(reward)
    if done:
        break

env.close()

✅ Model loaded successfully.


In [19]:
import ale_py  # Ensure Atari environment is registered
from stable_baselines3.common.env_util import make_atari_env
from stable_baselines3.common.vec_env import VecFrameStack, VecTransposeImage
from stable_baselines3 import DQN
import numpy as np

# ===== Configuration =====
ENV_ID = "SeaquestNoFrameskip-v4"
MODEL_PATH = "logs/dqn/SeaquestNoFrameskip-v4_1/best_model.zip"
SEED = 0
EVAL_EPISODES = 5
# ==========================

# Create Atari environment with preprocessing
env = make_atari_env(
    ENV_ID,
    n_envs=1,
    seed=SEED
)
env = VecFrameStack(env, n_stack=4)
env = VecTransposeImage(env)

# Load the trained DQN model
model = DQN.load(
    MODEL_PATH,
    env=env,
    buffer_size=1,
    custom_objects={
        "learning_rate": 0.0,
        "lr_schedule": lambda _: 0.0,
        "exploration_schedule": lambda _: 0.0,
    },
)
print("✅ Model loaded successfully.\n")

# Run multiple episodes and collect step-wise rewards
for ep in range(1, EVAL_EPISODES + 1):
    obs = env.reset()
    done = False
    rewards = []

    while not done:
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        rewards.append(reward[0])  # reward is a list with one element (because n_envs=1)

    total_reward = sum(rewards)
    print(f"🎯 Episode {ep}: return = {total_reward:.1f}, steps = {len(rewards)}")
    print(f"   Step-wise rewards: {np.array(rewards)}\n")

env.close()


✅ Model loaded successfully.

🎯 Episode 1: return = 23.0, steps = 462
   Step-wise rewards: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0

In [4]:
sum(rewards)

array([23.], dtype=float32)

In [10]:
from rl_zoo3.utils import get_saved_hyperparams
import os
stats_path = os.path.join("logs/dqn/SeaquestNoFrameskip-v4_1", "SeaquestNoFrameskip-v4")
hyperparams, stats_path = get_saved_hyperparams(stats_path)
gamma = hyperparams.get("gamma", 0.99)  # 默认值是 0.99

In [11]:
print("Gamma =", gamma)

Gamma = 0.99


In [13]:
def compute_discounted_return(rewards, gamma):
    return sum(r * (gamma ** t) for t, r in enumerate(rewards))
compute_discounted_return(rewards, gamma)

array([3.4319015], dtype=float32)

In [39]:
from stable_baselines3.common.evaluation import evaluate_policy
rewards, _ = evaluate_policy(
        model,
        env,
        n_eval_episodes=EVAL_EPISODES,
        deterministic=True,
        return_episode_rewards=True,
    )

In [43]:
rewards

[8280.0, 8280.0, 8280.0, 6620.0, 8280.0]

In [16]:
import numpy as np
for i, r in enumerate(rewards, 1):
        print(f"Episode {i}: reward = {r}")
print(f"\n🎯 Mean reward over {EVAL_EPISODES} episodes: {np.mean(rewards):.2f}")

Episode 1: reward = 8280.0
Episode 2: reward = 8280.0
Episode 3: reward = 8280.0
Episode 4: reward = 6620.0
Episode 5: reward = 8280.0

🎯 Mean reward over 5 episodes: 7948.00


In [17]:
rewards2, _ = evaluate_policy(model, env, n_eval_episodes=10, deterministic=True, return_episode_rewards=True)

In [18]:
rewards2

[8280.0,
 6620.0,
 4200.0,
 4200.0,
 8280.0,
 4200.0,
 8280.0,
 5060.0,
 6620.0,
 4200.0]

In [25]:
import numpy as np
import torch
from stable_baselines3 import DQN
from rl_zoo3.utils import create_test_env, get_saved_hyperparams

# 配置
ENV_ID = "SeaquestNoFrameskip-v4"
MODEL_PATH = "logs/dqn/SeaquestNoFrameskip-v4_1/best_model.zip"
STATS_PATH = "logs/dqn/SeaquestNoFrameskip-v4_1/SeaquestNoFrameskip-v4"
SEED = 0

# 读取训练时使用的超参数和 normalize 状态
hyperparams, stats_path = get_saved_hyperparams(STATS_PATH)

# 创建与训练时一致的评估环境（包含 Monitor）
env = create_test_env(
    env_id=ENV_ID,
    n_envs=1,
    stats_path=stats_path,
    seed=SEED,
    log_dir=None,
    should_render=False,
    hyperparams=hyperparams,
)

# 加载训练好的 DQN 模型
model = DQN.load(
    MODEL_PATH,
    env=env,
    buffer_size=1,
    custom_objects={
        "learning_rate": 0.0,
        "lr_schedule": lambda _: 0.0,
        "exploration_schedule": lambda _: 0.0,
    },
)
print("✅ Model loaded.")

# 单回合评估，逐步输出每帧 reward
obs = env.reset()
done = False
rewards = []

while not done:
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    rewards.append(reward[0])

# 输出总得分和每一步得分
print(f"\n🎯 Total return: {np.sum(rewards)}")
print(f"📊 Step-wise rewards: {rewards}")
env.close()


Stacking 4 frames
Wrapping the env in a VecTransposeImage.
✅ Model loaded.

🎯 Total return: 23.0
📊 Step-wise rewards: [np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0

In [30]:
from rl_zoo3.utils import create_test_env, get_saved_hyperparams

env = create_test_env(
    env_id="SeaquestNoFrameskip-v4",
    n_envs=1,
    stats_path=None,
    seed=0,
    log_dir=None,
    should_render=False,
    hyperparams={
        "frame_stack": 4,
        "env_wrapper": ["stable_baselines3.common.atari_wrappers.AtariWrapper"],
        "clip_reward": False,  # 👈 关键点：关闭 reward clipping
    }
)
env = VecFrameStack(env, n_stack=4)
env = VecTransposeImage(env)

In [31]:
model = DQN.load(
    MODEL_PATH,
    env=env,
    buffer_size=1,
    custom_objects={
        "learning_rate": 0.0,
        "lr_schedule": lambda _: 0.0,
        "exploration_schedule": lambda _: 0.0,
    },
)
print("✅ Model loaded.")

# 单回合评估，逐步输出每帧 reward
obs = env.reset()
done = False
rewards = []

while not done:
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    rewards.append(reward[0])

# 输出总得分和每一步得分
print(f"\n🎯 Total return: {np.sum(rewards)}")
print(f"📊 Step-wise rewards: {rewards}")

✅ Model loaded.

🎯 Total return: 23.0
📊 Step-wise rewards: [np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.fl

In [34]:
import gymnasium as gym
import ale_py
from stable_baselines3.common.atari_wrappers import AtariWrapper
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecTransposeImage

def make_env_no_clip(env_id, seed=0):
    def _init():
        env = gym.make(env_id)
        env = Monitor(env)  # Track raw episode rewards
        env = AtariWrapper(env, clip_reward=False)  # 👈 关键：禁用 reward 裁剪
        return env
    return DummyVecEnv([_init])  # Vectorized env
env = make_env_no_clip("SeaquestNoFrameskip-v4", seed=0)
env = VecTransposeImage(env)  # 最后对齐 observation shape (4, 84, 84)

In [38]:
from stable_baselines3.common.atari_wrappers import AtariWrapper
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack, VecTransposeImage
from stable_baselines3.common.monitor import Monitor
import gymnasium as gym

def make_env_no_clip_and_stack(env_id, seed=0):
    def _init():
        env = gym.make(env_id)
        env = Monitor(env)  # 必须加 Monitor 才能提取 info["episode"]["r"]
        env = AtariWrapper(env, clip_reward=False)  # 禁用 reward 裁剪
        return env
    env = DummyVecEnv([_init])
    env = VecFrameStack(env, n_stack=4)  # 👈 与模型训练对齐
    env = VecTransposeImage(env)         # 👈 与模型 observation 维度对齐
    return env

env = make_env_no_clip_and_stack("SeaquestNoFrameskip-v4", seed=0)

model = DQN.load(
    MODEL_PATH,
    env=env,
    buffer_size=1,
    custom_objects={
        "learning_rate": 0.0,
        "lr_schedule": lambda _: 0.0,
        "exploration_schedule": lambda _: 0.0,
    },
)
print("✅ Model loaded.")

# 单回合评估，逐步输出每帧 reward
obs = env.reset()
done = False
rewards = []

while not done:
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    rewards.append(reward[0])

# 输出总得分和每一步得分
print(f"\n🎯 Total return: {np.sum(rewards)}")
print(f"📊 Step-wise rewards: {rewards}")

✅ Model loaded.

🎯 Total return: 460.0
📊 Step-wise rewards: [np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.f