In [1]:
import gym
import numpy as np

from stable_baselines3.common.env_checker import check_env
from sb3_contrib import TQC, QRDQN

from envs.single_state_mdp import SingleStateMDP
from envs.debug_env import DebugVEnv

## Single State MDP

In [2]:
# Instantiate the single-state MDP and run the stable-baselines3 environment
# checker on it before handing it to a learner.
env = SingleStateMDP()
check_env(env)

# Short TQC training run on that environment; `env` and `model` are reused by
# the rollout cell that follows.
model = TQC(policy='MlpPolicy', env=env, verbose=1)
model.learn(total_timesteps=1000)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


<sb3_contrib.tqc.tqc.TQC at 0x7f27f5cd21d0>

In [3]:
# Roll the trained policy through the environment, logging every transition
# and collecting the per-step rewards in `reward_list`.
n_steps = 1000  # number of environment steps to roll out

reward_list = []
obs = env.reset()
for i in range(n_steps):
  action, _ = model.predict(obs)
  obs, reward, done, _ = env.step(action)
  reward_list.append(reward)
  print(f"Step {i}: action={action}, reward={reward}")
  if done:
    # Calling step() again after an episode has ended is undefined in the Gym
    # API, so start a fresh episode before the next prediction.
    obs = env.reset()

Step 0: action=[0.4524057], reward=-0.6115692257881165
Step 1: action=[-0.36012304], reward=-0.12171197682619095
Step 2: action=[-0.90004605], reward=-0.541496992111206
Step 3: action=[0.08630955], reward=0.8822512626647949
Step 4: action=[-0.14776468], reward=0.44912493228912354
Step 5: action=[0.8418217], reward=-0.4248230457305908
Step 6: action=[-0.44808793], reward=0.06362521648406982
Step 7: action=[-0.02108508], reward=0.7634142637252808
Step 8: action=[-0.37467015], reward=-0.03298266977071762
Step 9: action=[0.16759717], reward=0.28477585315704346
Step 10: action=[0.6594472], reward=-1.2232227325439453
Step 11: action=[-0.860998], reward=0.09507276117801666
Step 12: action=[-0.08539402], reward=0.644324541091919
Step 13: action=[0.44264066], reward=-0.9491301774978638
Step 14: action=[0.3518796], reward=-0.1872262805700302
Step 15: action=[-0.73603666], reward=-0.5828337669372559
Step 16: action=[-0.6081049], reward=-0.20572979748249054
Step 17: action=[0.04548478], reward=0.7

## Debug Env

In [4]:
# Instantiate the debug environment and run the stable-baselines3 environment
# checker on it before handing it to a learner.
env = DebugVEnv()
check_env(env)

# Short QR-DQN training run on that environment; `env` and `model` are reused
# by the rollout cell that follows.
model = QRDQN(policy='MlpPolicy', env=env, verbose=1)
model.learn(total_timesteps=1000)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 10       |
|    ep_rew_mean      | 10       |
|    exploration rate | 0.01     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 13764    |
|    time_elapsed     | 0        |
|    total_timesteps  | 40       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 10       |
|    ep_rew_mean      | 10       |
|    exploration rate | 0.01     |
| time/               |          |
|    episodes         | 8        |
|    fps              | 11995    |
|    time_elapsed     | 0        |
|    total_timesteps  | 80       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 10       |
|    ep_rew_mean      | 10    



<sb3_contrib.qrdqn.qrdqn.QRDQN at 0x7f2851acfeb8>

In [5]:
# Roll the trained policy through the environment, logging every transition
# and collecting the per-step rewards in `reward_list`.
n_steps = 1000  # number of environment steps to roll out

reward_list = []
obs = env.reset()
for i in range(n_steps):
  action, _ = model.predict(obs)
  obs, reward, done, _ = env.step(action)
  reward_list.append(reward)
  print(f"Step {i}: action={action}, reward={reward}")
  if done:
    # Calling step() again after an episode has ended is undefined in the Gym
    # API, so start a fresh episode before the next prediction.
    obs = env.reset()

Step 0: action=0, reward=1.0
Step 1: action=0, reward=1.0
Step 2: action=0, reward=1.0
Step 3: action=0, reward=1.0
Step 4: action=0, reward=1.0
Step 5: action=0, reward=1.0
Step 6: action=0, reward=1.0
Step 7: action=0, reward=1.0
Step 8: action=0, reward=1.0
Step 9: action=0, reward=1.0
Step 10: action=0, reward=1.0
Step 11: action=0, reward=1.0
Step 12: action=1, reward=1.0
Step 13: action=1, reward=1.0
Step 14: action=1, reward=1.0
Step 15: action=1, reward=1.0
Step 16: action=1, reward=1.0
Step 17: action=1, reward=1.0
Step 18: action=1, reward=1.0
Step 19: action=1, reward=1.0
Step 20: action=1, reward=1.0
Step 21: action=1, reward=1.0
Step 22: action=1, reward=1.0
Step 23: action=1, reward=1.0
Step 24: action=1, reward=1.0
Step 25: action=1, reward=1.0
Step 26: action=1, reward=1.0
Step 27: action=1, reward=1.0
Step 28: action=1, reward=1.0
Step 29: action=1, reward=1.0
Step 30: action=1, reward=1.0
Step 31: action=1, reward=1.0
Step 32: action=1, reward=1.0
Step 33: action=1, r