In [50]:
%pip install gymnasium stable-baselines3

Note: you may need to restart the kernel to use updated packages.


In [51]:
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy


In [65]:
env = gym.make('CartPole-v1', render_mode="rgb_array")
env = DummyVecEnv([lambda: env])  # Wrap the environment for Stable Baselines 3


In [66]:
model = PPO('MlpPolicy', env, verbose=1)


Using cpu device


In [67]:
model.learn(total_timesteps=20000)


-----------------------------
| time/              |      |
|    fps             | 4720 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 2998        |
|    iterations           | 2           |
|    time_elapsed         | 1           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008174533 |
|    clip_fraction        | 0.117       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance   | -0.00207    |
|    learning_rate        | 0.0003      |
|    loss                 | 6.96        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0206     |
|    value_loss           | 56.3        |
-----------------------------------------
----------------------------------

<stable_baselines3.ppo.ppo.PPO at 0x148250e10>

In [68]:
mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)
print(f"Mean reward: {mean_reward}, std. reward: {std_reward}")




Mean reward: 494.3, std. reward: 12.506398362438324


In [69]:
model.save('ppo_cartpole')
del model  # delete trained model to demonstrate loading
model = PPO.load('ppo_cartpole', env=env)


In [70]:
episodes = 5
for episode in range(1, episodes+1):
    obs = env.reset()
    done = False
    score = 0 
    
    while not done:
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        score += rewards
    print('Episode:{} Score:{}'.format(episode, score))


Episode:1 Score:[500.]
Episode:2 Score:[500.]
Episode:3 Score:[500.]
Episode:4 Score:[360.]
Episode:5 Score:[500.]


# look into env and model

In [71]:
# Print general info about the environment
print("Environment action space:", env.action_space)
print("Environment observation space:", env.observation_space)
print("Sample observation:", env.observation_space.sample())
print("Env metadata:" , env.metadata)

Environment action space: Discrete(2)
Environment observation space: Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)
Sample observation: [ 3.2387745e+00  3.3939743e+38 -2.6870859e-01 -1.0959583e+38]
Env metadata: {'render_modes': ['human', 'rgb_array'], 'render_fps': 50}


In [76]:
model.env

<stable_baselines3.common.vec_env.dummy_vec_env.DummyVecEnv at 0x148230210>

In [75]:
# Print the policy
print("Policy:", model.policy)
# Print the action space and observation space
print("Action space:", model.action_space)
print("Observation space:", model.observation_space)

Policy: ActorCriticPolicy(
  (features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (pi_features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (vf_features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (mlp_extractor): MlpExtractor(
    (policy_net): Sequential(
      (0): Linear(in_features=4, out_features=64, bias=True)
      (1): Tanh()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): Tanh()
    )
    (value_net): Sequential(
      (0): Linear(in_features=4, out_features=64, bias=True)
      (1): Tanh()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): Tanh()
    )
  )
  (action_net): Linear(in_features=64, out_features=2, bias=True)
  (value_net): Linear(in_features=64, out_features=1, bias=True)
)
Action space: Discrete(2)
Observation space: Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00

In [80]:
print("Model's policy net:", model.policy.action_net)
print("Model's value net:", model.policy.value_net)

Model's policy net: Linear(in_features=64, out_features=2, bias=True)
Model's value net: Linear(in_features=64, out_features=1, bias=True)


# TODO: visualization of V(s)