Commit
[RLlib] Fix bad assertion error in PPO when use_kl_loss=False. (r…
simonsays1980 authored and harborn committed May 8, 2024
1 parent 287ecc0 commit f1e0590
Showing 4 changed files with 3 additions and 6 deletions.
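
The point of the change: with use_kl_loss=False the PPO loss does not contribute KL values, so the previously unconditional assert in the TF and Torch PPO learners could abort training even though the KL-coefficient update would be skipped anyway; the assert now lives inside the use_kl_loss branch. A hedged reproduction sketch, not part of this commit (PPOConfig usage is taken from RLlib's public API; the env name is an arbitrary choice):

from ray.rllib.algorithms.ppo import PPOConfig

config = (
    PPOConfig()
    .environment("CartPole-v1")       # any env; CartPole is just an example choice
    .training(use_kl_loss=False)      # the setting that used to trip the assertion
)
algo = config.build()
algo.train()  # before this fix this could raise: "Sampled KL values are empty."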
2 changes: 0 additions & 2 deletions rllib/algorithms/ppo/ppo_learner.py
@@ -216,13 +216,11 @@ def additional_update_for_module(
module_id: ModuleID,
config: "PPOConfig",
timestep: int,
- sampled_kl_values: dict,
) -> Dict[str, Any]:
results = super().additional_update_for_module(
module_id=module_id,
config=config,
timestep=timestep,
- sampled_kl_values=sampled_kl_values,
)

# Update entropy coefficient via our Scheduler.
3 changes: 1 addition & 2 deletions rllib/algorithms/ppo/tf/ppo_tf_learner.py
@@ -152,17 +152,16 @@ def additional_update_for_module(
timestep: int,
sampled_kl_values: dict,
) -> Dict[str, Any]:
- assert sampled_kl_values, "Sampled KL values are empty."

results = super().additional_update_for_module(
module_id=module_id,
config=config,
timestep=timestep,
- sampled_kl_values=sampled_kl_values,
)

# Update KL coefficient.
if config.use_kl_loss:
+ assert sampled_kl_values, "Sampled KL values are empty."
sampled_kl = sampled_kl_values[module_id]
curr_var = self.curr_kl_coeffs_per_module[module_id]
if sampled_kl > 2.0 * config.kl_target:
3 changes: 1 addition & 2 deletions rllib/algorithms/ppo/torch/ppo_torch_learner.py
@@ -142,17 +142,16 @@ def additional_update_for_module(
timestep: int,
sampled_kl_values: dict,
) -> Dict[str, Any]:
- assert sampled_kl_values, "Sampled KL values are empty."

results = super().additional_update_for_module(
module_id=module_id,
config=config,
timestep=timestep,
- sampled_kl_values=sampled_kl_values,
)

# Update KL coefficient.
if config.use_kl_loss:
+ assert sampled_kl_values, "Sampled KL values are empty."
sampled_kl = sampled_kl_values[module_id]
curr_var = self.curr_kl_coeffs_per_module[module_id]
if sampled_kl > 2.0 * config.kl_target:
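
For context, the KL-coefficient update guarded by use_kl_loss follows the usual adaptive schedule: raise the coefficient when the sampled KL overshoots the target, lower it when it undershoots. A standalone sketch of that rule (the 1.5 and 0.5 multipliers are the conventional choice and an assumption here, not values read from the truncated hunk):

def update_kl_coeff(kl_coeff: float, sampled_kl: float, kl_target: float) -> float:
    # Adaptive KL penalty: steer the coefficient so sampled_kl stays near kl_target.
    if sampled_kl > 2.0 * kl_target:
        kl_coeff *= 1.5   # policy moved too far -> penalize KL harder
    elif sampled_kl < 0.5 * kl_target:
        kl_coeff *= 0.5   # policy barely moved -> relax the penalty
    return kl_coeff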
1 change: 1 addition & 0 deletions rllib/tuned_examples/ppo/cartpole_ppo_envrunner.py
@@ -22,6 +22,7 @@
lr=0.0003,
num_sgd_iter=6,
vf_loss_coeff=0.01,
+ use_kl_loss=True,
)
.evaluation(
evaluation_num_env_runners=1,
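
The tuned CartPole example now enables the KL loss explicitly, so the guarded KL-coefficient path above is still exercised by this example. A hedged sketch of how the visible settings slot into a full PPOConfig chain (the import, env name, and build() call are assumptions; only the training arguments and evaluation_num_env_runners appear in the diff):

from ray.rllib.algorithms.ppo import PPOConfig

config = (
    PPOConfig()
    .environment("CartPole-v1")        # assumption: the env is not shown in this excerpt
    .training(
        lr=0.0003,
        num_sgd_iter=6,
        vf_loss_coeff=0.01,
        use_kl_loss=True,              # the line added by this commit
    )
    .evaluation(evaluation_num_env_runners=1)
)
algo = config.build()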
