In [4]:
import gym
import warnings
warnings.filterwarnings('ignore')
from stable_baselines.deepq import MlpPolicy, CnnPolicy
from stable_baselines.common.policies import LstmPolicy
from stable_baselines.common.policies import MlpLstmPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import DQN, SAC, A2C, ACER
from stable_baselines.common.cmd_util import make_atari_env
from stable_baselines.common.vec_env import VecFrameStack

# Customize policy 

 1. policy_kwargs
 2. built-in policy를 부모 클래스로 하여 net_arch 설정
 3. ActorCriticPolicy를 부모 클래스로 하여 CNN, MLP층 조절

## 1. policy_kwargs
 + Mlp 계열 policy에만 사용 가능
 + Mlp layer의 노드 수, layer 수를 조정 가능

### MLP layer 노드 + layer 수 조절

In [6]:
from stable_baselines.sac import MlpPolicy

env = gym.make('Pendulum-v1')

# MlpPolicy의 노드 수가 각각 32, 64, 128개인 layer 3개로 설정
model = SAC(MlpPolicy, env, verbose = 1, policy_kwargs = dict(layers = [32, 64, 128]))
model.learn(1000)
params = model.get_parameter_list()
print(params)


Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:
Please use `layer.__call__` method instead.
Instructions for updating:
Use keras.layers.Dense instead.






Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where




-----------------------------------------
| current_lr              | 0.0003      |
| ent_coef                | 0.81156945  |
| ent_coef_loss           | -0.31727538 |
| entropy                 | 1.0635351   |
| episodes                | 4           |
| fps                     | 110         |
| mean 100 episode reward | -1.26e+03   |
| n_updates               | 701         |
| policy_loss             | 17.933578   |
| qf1_loss                | 0.15176228  |
| qf2_loss                | 0.14846855  |
| time_elapsed            | 7           |
| total timesteps         | 800         |
| value_loss              | 0.18106334  |
-----------------------------------------
[<tf.Variable 'model/pi/fc0/kern

### LSTM layer 추가

In [7]:
env = gym.make('CartPole-v1')

# MLP layer에 LSTM layer 추가
model = A2C('MlpLstmPolicy', env, verbose = 1, policy_kwargs = dict(net_arch = [32, 32, 32, 'lstm'])) # lstm 층의 노드 수: 256
model.learn(total_timesteps = 1000)
parms = model.get_parameter_list()
print(parms)

Wrapping the env in a DummyVecEnv.


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
---------------------------------
| explained_variance | -0.00624 |
| fps                | 7        |
| nupdates           | 1        |
| policy_entropy     | 0.693    |
| total_timesteps    | 5        |
| value_loss         | 10.7     |
---------------------------------
---------------------------------
| explained_variance | 0.00065  |
| fps                | 206      |
| nupdates           | 100      |
| policy_entropy     | 0.693    |
| total_timesteps    | 500      |
| value_loss         | 6.06     |
---------------------------------
---------------------------------
| explained_variance | 0.00252  |
| fps                | 239      |
| nupdates           | 200      |
| policy_entropy     | 0.692    |
| total_timesteps    | 1000     |
| value_loss         | 10.6     |
---------------------------------
[<tf.Variable 'model/shared_f

### Actor-Critic 
 + pi: actor network 
 + vf: critic network 

In [8]:
env = gym.make('CartPole-v1')
model = A2C('MlpLstmPolicy', env, verbose = 1, policy_kwargs = dict(net_arch = [32,32,32,'lstm',dict(pi = [64,64],vf = [128,128])]))
model.learn(total_timesteps = 1000)
parms = model.get_parameter_list()
print(parms)

Wrapping the env in a DummyVecEnv.
---------------------------------
| explained_variance | 0.00205  |
| fps                | 6        |
| nupdates           | 1        |
| policy_entropy     | 0.693    |
| total_timesteps    | 5        |
| value_loss         | 10.6     |
---------------------------------
---------------------------------
| explained_variance | -0.0192  |
| fps                | 192      |
| nupdates           | 100      |
| policy_entropy     | 0.692    |
| total_timesteps    | 500      |
| value_loss         | 3.69     |
---------------------------------
---------------------------------
| explained_variance | -0.00527 |
| fps                | 222      |
| nupdates           | 200      |
| policy_entropy     | 0.693    |
| total_timesteps    | 1000     |
| value_loss         | 10.7     |
---------------------------------
[<tf.Variable 'model/shared_fc0/w:0' shape=(4, 32) dtype=float32_ref>, <tf.Variable 'model/shared_fc0/b:0' shape=(32,) dtype=float32_ref>, <tf.Variab

## 2. net_arch 
 + CnnPolicy는 커스터마이징 불가 --> net_arch에 지정해도 nature_cnn으로 자동 설정

MLP층에 LSTM 층을 연결하는 policy

In [11]:
from stable_baselines.common.atari_wrappers import make_atari

class CustomPolicy(LstmPolicy): # 부모 클래스: LstmPolicy
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs, feature_extraction = 'mlp')

env = make_atari('MsPacmanNoFrameskip-v0')
print(env.observation_space)

agent = A2C(CustomPolicy, env)
agent.learn(1000)
params = agent.get_parameter_list()
print(params)

Box([[[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 ...

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]], [[[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 ...

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
 

부모 클래스를 LstmPolicy로 하는 Actor-critic 알고리즘
 + 구조: MLP(8개) - LSTM(256개) - actor: [128, 128, 128], critic: [128, 128, 128]

In [12]:
class CustomPolicy(LstmPolicy): # 부모 클래스: Lstm --> Actor-Critic 계열 알고리즘 구성
    def __init__(self, *args,**kwargs):
        super().__init__(*args,**kwargs, 
                        net_arch=[8,'lstm',dict(pi = [128,128,128],vf = [128,128,128])], feature_extraction='mlp')

env = gym.make('CartPole-v1')
   
model = A2C(CustomPolicy, env, verbose = 1)
model.learn(total_timesteps = 1000)
parms = model.get_parameter_list()
print(parms)

Wrapping the env in a DummyVecEnv.
---------------------------------
| explained_variance | 0.0124   |
| fps                | 6        |
| nupdates           | 1        |
| policy_entropy     | 0.693    |
| total_timesteps    | 5        |
| value_loss         | 10.8     |
---------------------------------
---------------------------------
| explained_variance | 0.0174   |
| fps                | 185      |
| nupdates           | 100      |
| policy_entropy     | 0.693    |
| total_timesteps    | 500      |
| value_loss         | 10.6     |
---------------------------------
---------------------------------
| explained_variance | 0.0148   |
| fps                | 213      |
| nupdates           | 200      |
| policy_entropy     | 0.693    |
| total_timesteps    | 1000     |
| value_loss         | 10.5     |
---------------------------------
[<tf.Variable 'model/shared_fc0/w:0' shape=(4, 8) dtype=float32_ref>, <tf.Variable 'model/shared_fc0/b:0' shape=(8,) dtype=float32_ref>, <tf.Variable