In [1]:
# Import the RL algorithm (Trainer) we would like to use.
from ray.rllib.agents.ppo import PPOTrainer

# Configure the algorithm.
config = {
    # Environment (RLlib understands openAI gym registered strings).
    "env": "Taxi-v3",
    # Use 2 environment workers (aka "rollout workers") that parallelly
    # collect samples from their own environment clone(s).
    "num_workers": 2,
    # Change this to "framework: torch", if you are using PyTorch.
    # Also, use "framework: tf2" for tf2.x eager execution.
    "framework": "torch",
    # Tweak the default model provided automatically by RLlib,
    # given the environment's observation- and action spaces.
    "model": {
        "fcnet_hiddens": [64, 64],
        "fcnet_activation": "relu",
    },
    # Set up a separate evaluation worker set for the
    # `trainer.evaluate()` call after training (see below).
    "evaluation_num_workers": 1,
    # Only for evaluation runs, render the env.
    "evaluation_config": {
        "render_env": True,
    },
}

# Create our RLlib Trainer.
trainer = PPOTrainer(config=config)

# Run it for n training iterations. A training iteration includes
# parallel sample collection by the environment workers as well as
# loss calculation on the collected batch and a model update.
for _ in range(3):
    print(trainer.train())

# Evaluate the trained Trainer (and render each timestep to the shell's
# output).
trainer.evaluate()


2022-04-26 00:43:21,087	INFO services.py:1456 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


{'episode_reward_max': -459.0, 'episode_reward_min': -974.0, 'episode_reward_mean': -810.1, 'episode_len_mean': 196.0, 'episode_media': {}, 'episodes_this_iter': 20, 'policy_reward_min': {}, 'policy_reward_max': {}, 'policy_reward_mean': {}, 'custom_metrics': {}, 'hist_stats': {'episode_reward': [-929.0, -830.0, -459.0, -641.0, -821.0, -848.0, -893.0, -866.0, -884.0, -785.0, -974.0, -749.0, -803.0, -902.0, -839.0, -821.0, -821.0, -767.0, -794.0, -776.0], 'episode_lengths': [200, 200, 120, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200]}, 'sampler_perf': {'mean_raw_obs_processing_ms': 0.03383679130207236, 'mean_inference_ms': 0.2434526545473601, 'mean_action_processing_ms': 0.014960736051194374, 'mean_env_wait_ms': 0.017763316065355993, 'mean_env_render_ms': 0.0}, 'off_policy_estimator': {}, 'num_healthy_workers': 2, 'timesteps_total': 4000, 'timesteps_this_iter': 4000, 'agent_timesteps_total': 4000, 'timers': {'sample_time_ms': 699.18, 'sample_throu

{'evaluation': {'episode_reward_max': -452.0,
  'episode_reward_min': -659.0,
  'episode_reward_mean': -570.8,
  'episode_len_mean': 200.0,
  'episode_media': {},
  'episodes_this_iter': 10,
  'policy_reward_min': {},
  'policy_reward_max': {},
  'policy_reward_mean': {},
  'custom_metrics': {},
  'hist_stats': {'episode_reward': [-551.0,
    -641.0,
    -533.0,
    -533.0,
    -632.0,
    -659.0,
    -605.0,
    -452.0,
    -497.0,
    -605.0],
   'episode_lengths': [200, 200, 200, 200, 200, 200, 200, 200, 200, 200]},
  'sampler_perf': {'mean_raw_obs_processing_ms': 0.03042535624582728,
   'mean_inference_ms': 0.2325897750587597,
   'mean_action_processing_ms': 0.014216348208647141,
   'mean_env_wait_ms': 0.016747683897309157,
   'mean_env_render_ms': 0.023595098851026147},
  'off_policy_estimator': {},
  'timesteps_this_iter': 0}}


[2m[36m(RolloutWorker pid=3686)[0m |[35mY[0m| : |[34;1mB[0m: |
[2m[36m(RolloutWorker pid=3686)[0m +---------+
[2m[36m(RolloutWorker pid=3686)[0m   (South)
[2m[36m(RolloutWorker pid=3686)[0m +---------+
[2m[36m(RolloutWorker pid=3686)[0m |R: | : :G|
[2m[36m(RolloutWorker pid=3686)[0m | : | : : |
[2m[36m(RolloutWorker pid=3686)[0m |[43m [0m: : : : |
[2m[36m(RolloutWorker pid=3686)[0m | | : | : |
[2m[36m(RolloutWorker pid=3686)[0m |[35mY[0m| : |[34;1mB[0m: |
[2m[36m(RolloutWorker pid=3686)[0m +---------+
[2m[36m(RolloutWorker pid=3686)[0m   (North)
[2m[36m(RolloutWorker pid=3686)[0m +---------+
[2m[36m(RolloutWorker pid=3686)[0m |R: | : :G|
[2m[36m(RolloutWorker pid=3686)[0m | : | : : |
[2m[36m(RolloutWorker pid=3686)[0m |[43m [0m: : : : |
[2m[36m(RolloutWorker pid=3686)[0m | | : | : |
[2m[36m(RolloutWorker pid=3686)[0m |[35mY[0m| : |[34;1mB[0m: |
[2m[36m(RolloutWorker pid=3686)[0m +---------+
[2m[36m(RolloutWorker p