# ペッツを対象とした深層強化学習

## ライブラリのインストール

In [8]:
!pip install tensorflow
!pip install tf_agents



## ライブラリのインポート

In [9]:
import tensorflow as tf
from tensorflow import keras

from tf_agents.environments import py_environment, tf_py_environment
from tf_agents.agents.dqn import dqn_agent
from tf_agents.networks import network
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.policies import policy_saver
from tf_agents.trajectories import time_step as ts
from tf_agents.trajectories import trajectory
from tf_agents.specs import array_spec
from tf_agents.utils import common
from tf_agents.drivers import dynamic_step_driver

In [10]:
import numpy as np
import random

## 初期乱数の設定
この値を変えると別の学習結果が得られる

In [11]:
# Seed every random number generator used by this notebook (Python's `random`,
# NumPy and TensorFlow) with the same value.
# Changing `seed` leads to a different training result.
seed = 0
for _apply_seed in (random.seed, np.random.seed, tf.random.set_seed):
    _apply_seed(seed)

## 環境クラス
行動を受け取り状態を遷移させ、報酬を与える

In [12]:
class EnvironmentSimulater(py_environment.PyEnvironment):
    """Three-state environment: takes an action, moves to a new state, gives a reward.

    States (the single int32 value of the observation):
        0 -- closed
        1 -- open, a mint candy is available
        2 -- open, no mint candy left

    Actions (scalar int32):
        0 -- open
        1 -- close
        2 -- tilt

    Only tilting while in state 1 yields a reward (1). Every (state, action)
    pair that has no transition defined leaves the state as it is, reward 0.
    The environment never emits a terminal step by itself.
    """

    def __init__(self):
        super().__init__()
        # Observation: one integer in [0, 2] holding the current state.
        self._observation_spec = array_spec.BoundedArraySpec(
            shape=(1,), dtype=np.int32, minimum=0, maximum=2
        )
        # Action: one scalar integer in [0, 2].
        self._action_spec = array_spec.BoundedArraySpec(
            shape=(), dtype=np.int32, minimum=0, maximum=2
        )
        self._reset()

    def observation_spec(self):
        """Return the spec describing the observations of this environment."""
        return self._observation_spec

    def action_spec(self):
        """Return the spec describing the actions this environment accepts."""
        return self._action_spec

    def _reset(self):
        """Put the environment back into state 0 (closed) and return the first time step."""
        self._state = 0
        first_observation = np.array([self._state], dtype=np.int32)
        return ts.restart(first_observation)

    def _step(self, action):
        """Apply `action` to the current state and return the resulting transition.

        The returned time step carries the new state as observation, the reward
        earned by this action and a discount of 1.
        """
        previous = self._state
        gained = 0

        if previous == 0 and action == 0:
            # closed + open -> open with candy
            self._state = 1
        elif previous == 1 and action == 1:
            # open with candy + close -> closed
            self._state = 0
        elif previous == 1 and action == 2:
            # open with candy + tilt -> candy comes out, this is the rewarded move
            self._state = 2
            gained = 1
        elif previous not in (0, 1) and action == 1:
            # open without candy + close -> closed
            self._state = 0

        observation = np.array([self._state], dtype=np.int32)
        return ts.transition(observation, reward=gained, discount=1)


## ネットワークの設定

In [13]:
class MyQNetwork(network.Network):
    """Small fully connected Q-network: two tanh hidden layers and a linear output.

    Args:
        observation_spec: spec of the observations fed to the network.
        action_spec: bounded action spec; the number of outputs is
            `maximum - minimum + 1`, i.e. one output per action.
        n_hidden_channels: number of units in each of the two hidden layers.
        name: name given to the network.
    """

    def __init__(self, observation_spec, action_spec, n_hidden_channels=4, name='QNetwork'):
        super().__init__(input_tensor_spec=observation_spec, state_spec=(), name=name)

        # One output unit per possible action.
        n_action = action_spec.maximum - action_spec.minimum + 1

        hidden_layers = [
            keras.layers.Dense(n_hidden_channels, activation='tanh')
            for _ in range(2)
        ]
        output_layer = keras.layers.Dense(n_action)
        self.model = keras.Sequential(hidden_layers + [output_layer])

    def call(self, observation, step_type=None, network_state=(), training=True):
        """Run the model on `observation`.

        Returns:
            A tuple `(outputs, network_state)`: the output of the final dense
            layer and `network_state` passed through unchanged.
            `step_type` is not used.
        """
        q_values = self.model(observation, training=training)
        return q_values, network_state


## 深層強化学習の実行

In [14]:
# Build the Python environment and wrap it so that it can be driven with TF tensors.
env_py = EnvironmentSimulater()
env = tf_py_environment.TFPyEnvironment(env_py)

# Q-network trained by the agent.
primary_network = MyQNetwork(
    env.observation_spec(),
    env.action_spec()
)

n_step_update = 1
agent = dqn_agent.DqnAgent(
    env.time_step_spec(),
    env.action_spec(),
    q_network=primary_network,
    optimizer=keras.optimizers.Adam(learning_rate=1e-2),
    n_step_update=n_step_update,
    epsilon_greedy=1.0,
    target_update_tau=1.0,
    target_update_period=10,
    gamma=0.9,
    td_errors_loss_fn = common.element_wise_squared_loss,
    train_step_counter = tf.Variable(0)
)
agent.initialize()
# NOTE(review): the output saved with this notebook shows this training function
# failing with "JIT compilation failed" at a FloorMod node. That looks like an issue
# of the local TensorFlow installation rather than of this code -- TODO confirm by
# re-running in a clean environment.
agent.train = common.function(agent.train)

# Policy used to collect experience; its epsilon is annealed in the loop below.
policy = agent.collect_policy

replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=env.batch_size,
    max_length=10**6
)

# Each sample holds n_step_update + 1 consecutive steps.
dataset = replay_buffer.as_dataset(
    num_parallel_calls=3,
    sample_batch_size=32,
    num_steps=n_step_update+1
).prefetch(3)
iterator = iter(dataset)

# Warm-up: fill the replay buffer before training starts.
# `num_steps` has to be given explicitly: the driver's default is a single step,
# in which case `maximum_iterations` alone would not collect more than one step.
warmup_steps = 100
env.reset()
driver = dynamic_step_driver.DynamicStepDriver(
    env,
    policy,
    observers=[replay_buffer.add_batch],
    num_steps=warmup_steps,
)
driver.run(maximum_iterations=warmup_steps)

num_episodes = 100
steps_per_episode = 10
# Exploration rate decreases linearly from 1.0 (first episode) to 0.0 (last episode).
line_epsilon = np.linspace(start=1.0, stop=0.0, num=num_episodes)

for episode in range(num_episodes):
    episode_rewards = 0
    episode_average_loss = []
    # NOTE(review): this writes a private attribute of the policy object.
    policy._epsilon = line_epsilon[episode]
    time_step = env.reset()

    for t in range(steps_per_episode):
        # Act, then store the observed transition in the replay buffer.
        policy_step = policy.action(time_step)
        next_time_step = env.step(policy_step.action)

        traj =  trajectory.from_transition(time_step, policy_step, next_time_step)
        replay_buffer.add_batch(traj)

        # One training step on a batch sampled from the replay buffer.
        experience, _ = next(iterator)
        loss_info = agent.train(experience=experience)

        S = time_step.observation.numpy().tolist()[0]  # state
        A = policy_step.action.numpy().tolist()[0]  # action
        R = next_time_step.reward.numpy().astype('int').tolist()[0]  # reward
        print(S, A, R)
        episode_average_loss.append(loss_info.loss.numpy())  # loss of this step, averaged per episode below
        episode_rewards += R  # total reward of the episode

        time_step = next_time_step

    print(f'Episode:{episode+1}, Rewards:{episode_rewards}, Average Loss:{np.mean(episode_average_loss):.6f}, Current Epsilon:{policy._epsilon:.6f}')

# Export the agent's policy (`agent.policy`, not the collect policy) to ./policy.
tf_policy_saver = policy_saver.PolicySaver(policy=agent.policy)
tf_policy_saver.save(export_dir='policy')


Instructions for updating:
Use `as_dataset(..., single_deterministic_pass=False) instead.
Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.foldr(fn, elems, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.foldr(fn, elems))


UnknownError: Graph execution error:

Detected at node 'FloorMod' defined at (most recent call last):
    File "C:\Users\makino\anaconda3\lib\runpy.py", line 194, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "C:\Users\makino\anaconda3\lib\runpy.py", line 87, in _run_code
      exec(code, run_globals)
    File "C:\Users\makino\anaconda3\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
      app.launch_new_instance()
    File "C:\Users\makino\anaconda3\lib\site-packages\traitlets\config\application.py", line 845, in launch_instance
      app.start()
    File "C:\Users\makino\anaconda3\lib\site-packages\ipykernel\kernelapp.py", line 612, in start
      self.io_loop.start()
    File "C:\Users\makino\anaconda3\lib\site-packages\tornado\platform\asyncio.py", line 149, in start
      self.asyncio_loop.run_forever()
    File "C:\Users\makino\anaconda3\lib\asyncio\base_events.py", line 570, in run_forever
      self._run_once()
    File "C:\Users\makino\anaconda3\lib\asyncio\base_events.py", line 1859, in _run_once
      handle._run()
    File "C:\Users\makino\anaconda3\lib\asyncio\events.py", line 81, in _run
      self._context.run(self._callback, *self._args)
    File "C:\Users\makino\anaconda3\lib\site-packages\tornado\ioloop.py", line 690, in <lambda>
      lambda f: self._run_callback(functools.partial(callback, future))
    File "C:\Users\makino\anaconda3\lib\site-packages\tornado\ioloop.py", line 743, in _run_callback
      ret = callback()
    File "C:\Users\makino\anaconda3\lib\site-packages\tornado\gen.py", line 787, in inner
      self.run()
    File "C:\Users\makino\anaconda3\lib\site-packages\tornado\gen.py", line 748, in run
      yielded = self.gen.send(value)
    File "C:\Users\makino\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 365, in process_one
      yield gen.maybe_future(dispatch(*args))
    File "C:\Users\makino\anaconda3\lib\site-packages\tornado\gen.py", line 209, in wrapper
      yielded = next(result)
    File "C:\Users\makino\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 268, in dispatch_shell
      yield gen.maybe_future(handler(stream, idents, msg))
    File "C:\Users\makino\anaconda3\lib\site-packages\tornado\gen.py", line 209, in wrapper
      yielded = next(result)
    File "C:\Users\makino\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 543, in execute_request
      self.do_execute(
    File "C:\Users\makino\anaconda3\lib\site-packages\tornado\gen.py", line 209, in wrapper
      yielded = next(result)
    File "C:\Users\makino\anaconda3\lib\site-packages\ipykernel\ipkernel.py", line 306, in do_execute
      res = shell.run_cell(code, store_history=store_history, silent=silent)
    File "C:\Users\makino\anaconda3\lib\site-packages\ipykernel\zmqshell.py", line 536, in run_cell
      return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
    File "C:\Users\makino\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2877, in run_cell
      result = self._run_cell(
    File "C:\Users\makino\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2923, in _run_cell
      return runner(coro)
    File "C:\Users\makino\anaconda3\lib\site-packages\IPython\core\async_helpers.py", line 68, in _pseudo_sync_runner
      coro.send(None)
    File "C:\Users\makino\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3146, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "C:\Users\makino\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3338, in run_ast_nodes
      if (await self.run_code(code, result,  async_=asy)):
    File "C:\Users\makino\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3418, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "<ipython-input-14-cdf0815f98b5>", line 66, in <module>
      loss_info = agent.train(experience=experience)
    File "C:\Users\makino\anaconda3\lib\site-packages\tf_agents\agents\tf_agent.py", line 336, in train
      loss_info = self._train_fn(
    File "C:\Users\makino\anaconda3\lib\site-packages\tf_agents\utils\common.py", line 188, in with_check_resource_vars
      return fn(*fn_args, **fn_kwargs)
    File "C:\Users\makino\anaconda3\lib\site-packages\tf_agents\agents\dqn\dqn_agent.py", line 421, in _train
      self._update_target()
    File "C:\Users\makino\anaconda3\lib\site-packages\tf_agents\utils\common.py", line 491, in __call__
      return tf.distribute.get_replica_context().merge_call(call)
    File "C:\Users\makino\anaconda3\lib\site-packages\tf_agents\utils\common.py", line 484, in call
      remainder = tf.math.mod(self._counter.assign_add(1), period)
Node: 'FloorMod'
JIT compilation failed.
	 [[{{node FloorMod}}]] [Op:__inference_train_1808]