In [None]:
# 2020-11-02 created by Akson

In [None]:
# Code18.1
# Install the packages needed for rendering (intended to be run on Colab).
# NOTE(review): the pip packages are unpinned; later cells unpack env.step()
# into four values and call env.seed(), so a gym release that changed those
# APIs would break them — consider pinning versions.

try:
    # %tensorflow_version only exists in Colab.
    %tensorflow_version 2.x
    !apt update && apt install -y libpq-dev libsdl2-dev swig xorg-dev xvfb 
    # !pip install -q -U tf-agents-nightly pyvirtualdisplay gym[atari]
    !pip install -q -U tf-agents pyvirtualdisplay gym[atari]
    IS_COLAB = True
except Exception:
    # Any failure above is treated as "not running on Colab".
    IS_COLAB = False

In [None]:
# Code18.2
# import

import tensorflow as tf
from tensorflow import keras
import numpy as np

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

import matplotlib.animation as animation
mpl.rc('animation', html='jshtml')

import gym

# Start a 1400x900 virtual display when pyvirtualdisplay is installed
# (presumably so env.render() works on a headless machine — confirm).
try:
    import pyvirtualdisplay
    display = pyvirtualdisplay.Display(visible=0, size=(1400, 900)).start()
except ImportError:
    # pyvirtualdisplay is not installed: carry on without a virtual display.
    pass

In [None]:
# Code18.3
# gym test

# Create the CartPole environment, reset it, print the first observation
# and render the initial state.
env = gym.make('CartPole-v1')
obs = env.reset()
print(obs)
env.render()

In [None]:
# Code18.4
# plot image func

def plot_environment(env, figsize = (5, 4)):
    """Draw the current state of `env` in a new matplotlib figure.

    Args:
        env: environment whose `render(mode='rgb_array')` returns an image.
        figsize: size passed to `plt.figure`.

    Returns:
        The image returned by `env.render`.
    """
    plt.figure(figsize = figsize)
    rendered = env.render(mode = 'rgb_array')
    plt.imshow(rendered)
    # Hide the axes so only the rendered frame is shown.
    plt.axis('off')
    return rendered

In [None]:
# Code18.5
# plot env img

# Show the environment's current frame.
plot_environment(env)
plt.show()

In [None]:
# Code18.6
# other result

# Print the action space, then take a single step with action 1 and print
# each of the four values env.step() returns.
print(env.action_space)
action = 1
obs, reward, done, info = env.step(action)
print(obs)
print(reward)
print(done)
print(info)

In [None]:
# Code18.7
# basic_policy

def basic_policy(obs):
    """Hard-coded policy driven by the third observation component.

    Returns 0 when `obs[2]` is negative and 1 otherwise.
    (presumably obs[2] is the pole angle, as the original local name
    suggested — confirm against the environment's documentation.)
    """
    if obs[2] < 0:
        return 0
    return 1

# Evaluate the hand-written policy over 500 episodes, each capped at
# 200 steps, recording the total reward collected in every episode.
totals = []
for episode in range(500):
    episode_rewards = 0
    obs = env.reset()
    for step in range(200):
        action = basic_policy(obs)
        obs, reward, done, info = env.step(action)
        episode_rewards += reward
        if done:
            break
    totals.append(episode_rewards)

# Mean, standard deviation, minimum and maximum of the per-episode totals.
print(np.mean(totals), np.std(totals), np.min(totals), np.max(totals))

In [None]:
# Code18.8
# plot animation func

def update_scene(num, frames, patch):
    """Animation callback: show frame number `num` in `patch`.

    Args:
        num: index of the frame to display.
        frames: indexable collection of frames.
        patch: object exposing `set_data` (the image artist being animated).

    Returns:
        The same `patch` object, after its data has been replaced.
    """
    current_frame = frames[num]
    patch.set_data(current_frame)
    return patch

def plot_animation(frams, repeat = False, interval = 40):
    """Build an animation that plays the given frames in order.

    Fix: the body previously read the global `frames` instead of its own
    argument, so whatever was passed in was ignored. It now uses the
    argument. The parameter keeps its original spelling (`frams`) so the
    signature stays backward-compatible.

    Args:
        frams: non-empty sequence of images accepted by `plt.imshow`.
        repeat: forwarded to `FuncAnimation` (whether playback loops).
        interval: forwarded to `FuncAnimation` (delay between frames).

    Returns:
        The `matplotlib.animation.FuncAnimation` object.
    """
    fig = plt.figure()
    patch = plt.imshow(frams[0])
    plt.axis('off')
    anim = animation.FuncAnimation(
        fig,
        update_scene,
        fargs = (frams, patch),
        frames = len(frams),
        repeat = repeat,
        interval = interval,
    )
    # Close the figure; only the returned animation object is meant to be shown.
    plt.close()
    return anim

In [None]:
# Code18.9
# draw it

# Play one episode (at most 200 steps) with basic_policy, capturing a frame
# before each action so the run can be replayed as an animation.
frames = []
obs = env.reset()
for step in range(200):
    img = env.render(mode = 'rgb_array')
    frames.append(img)
    action = basic_policy(obs)

    obs, reward, done, info = env.step(action)
    if done:
        break

plot_animation(frames)

In [None]:
# Code18.10
# create NN model

# Reset Keras state and seed TensorFlow and NumPy.
keras.backend.clear_session()
tf.random.set_seed(42)
np.random.seed(42)

# Policy network: 4 inputs -> 5 ELU units -> 1 sigmoid output.
# Later cells treat that single output as `left_proba`.
n_inputs = 4
model = keras.models.Sequential([
    keras.layers.Dense(5, activation = 'elu', input_shape = [n_inputs]),
    keras.layers.Dense(1, activation = 'sigmoid')
])

In [None]:
# Code18.11
# Render one episode played by a policy network

def render_policy_net(model, n_max_steps = 200, seed = 42):
    """Play one CartPole-v1 episode with `model` as a stochastic policy.

    A fresh environment is created and seeded, and NumPy's global RNG is
    re-seeded with the same value. At every step the current frame is
    recorded, the model's single output is taken as the probability of
    action 0, and the action is sampled from it.

    Args:
        model: object whose `predict` returns exactly one value for an
            input of shape (1, n_features).
        n_max_steps: maximum number of steps to play.
        seed: seed for the environment and for `np.random`.

    Returns:
        List of rendered frames, one per step taken.
    """
    frames = []
    env = gym.make('CartPole-v1')
    try:
        env.seed(seed)
        np.random.seed(seed)
        obs = env.reset()

        for step in range(n_max_steps):
            frames.append(env.render(mode = 'rgb_array'))
            # Extract the scalar explicitly: calling int() on a (1, 1)
            # comparison array is deprecated in recent NumPy releases.
            left_proba = np.asarray(model.predict(obs.reshape(1, -1))).item()
            action = int(np.random.rand() > left_proba)
            obs, reward, done, info = env.step(action)

            if done:
                break
    finally:
        # Release the environment even if prediction or stepping fails.
        env.close()
    return frames

In [None]:
# Code18.12
# Render one episode with the policy network as it currently stands
# (when cells are run in order, `model` has not been trained yet).

frames = render_policy_net(model)
plot_animation(frames)

In [None]:
# Code18.13
# Helper functions for policy-gradient training

# Take a single step
def play_one_step(env, obs, model, loss_fn):
    """Sample one action from the policy, step the environment, return gradients.

    The gradients are those of the loss that treats the sampled action as
    if it were the correct one; they are returned rather than applied.

    Args:
        env: environment whose `step(action)` returns a 4-tuple.
        obs: current observation (indexed with `np.newaxis` to add a batch axis).
        model: callable returning the probability of action 0, shape (1, 1).
        loss_fn: loss called as `loss_fn(target, prediction)`.

    Returns:
        (next observation, reward, done flag, list of gradients).
    """
    with tf.GradientTape() as tape:
        p_left = model(obs[np.newaxis])
        # True means action 1 was sampled, False means action 0.
        go_right = tf.random.uniform([1, 1]) > p_left
        # Target probability of action 0 given the sampled action.
        target_p_left = tf.constant([[1.]]) - tf.cast(go_right, tf.float32)
        loss = tf.reduce_mean(loss_fn(target_p_left, p_left))
    grads = tape.gradient(loss, model.trainable_variables)
    chosen_action = int(go_right[0, 0].numpy())
    next_obs, reward, done, _ = env.step(chosen_action)

    return next_obs, reward, done, grads

# Play several episodes
def play_multiple_episodes(env, n_episodes, n_max_steps, model, loss_fn):
    """Play `n_episodes` episodes and collect per-step rewards and gradients.

    Each episode starts with `env.reset()` and ends when the environment
    reports done or after `n_max_steps` steps.

    Returns:
        (all_rewards, all_grads): two lists with one entry per episode,
        each entry being the list of rewards / gradients for every step.
    """
    all_rewards, all_grads = [], []
    for _ in range(n_episodes):
        episode_rewards, episode_grads = [], []
        obs = env.reset()
        for _ in range(n_max_steps):
            obs, reward, done, grads = play_one_step(env, obs, model, loss_fn)
            episode_rewards.append(reward)
            episode_grads.append(grads)
            if done:
                break
        all_rewards.append(episode_rewards)
        all_grads.append(episode_grads)
    return all_rewards, all_grads

# Compute the discounted return for every step of one episode
def discount_rewards(rewards, discount_rate):
    """Return the discounted cumulative reward at each step.

    Works backwards: result[t] = rewards[t] + discount_rate * result[t + 1].

    Fix: the working array is now always floating point. Previously an
    integer reward list produced an integer array, and each fractional
    discounted value was silently truncated when written back.

    Args:
        rewards: sequence of per-step rewards (not modified).
        discount_rate: discount factor applied per step.

    Returns:
        1-D float NumPy array of the same length as `rewards`.
    """
    discounted = np.array(rewards, dtype = float)
    for step in range(len(rewards) - 2, -1, -1):
        discounted[step] += discounted[step + 1] * discount_rate
    return discounted

# Discount and normalize the rewards of a whole batch of episodes
def discount_and_normalize_rewards(all_rewards, discount_rate):
    """Discount every episode's rewards, then standardize across the batch.

    The mean and standard deviation are computed over all discounted
    rewards of all episodes pooled together.

    Args:
        all_rewards: list of per-episode reward sequences.
        discount_rate: discount factor passed to `discount_rewards`.

    Returns:
        List with one array per episode: (discounted - mean) / std.
    """
    discounted_per_episode = []
    for episode_rewards in all_rewards:
        discounted_per_episode.append(discount_rewards(episode_rewards, discount_rate))

    pooled = np.concatenate(discounted_per_episode)
    mean, std = pooled.mean(), pooled.std()

    return [(episode - mean) / std for episode in discounted_per_episode]

# Quick sanity check of both helpers on small hand-made reward lists.
print(discount_rewards([10, 0, -50], discount_rate = 0.8))
print(discount_and_normalize_rewards([[10, 0, -50], [10, 20]], discount_rate = 0.8))

In [None]:
# Code18.14
# Prepare the training hyperparameters

n_iterations = 150            # number of training iterations (one update each)
n_episodes_per_update = 10    # episodes played before each update
n_max_steps = 200             # step cap per episode
discount_rate = 0.95          # discount factor for future rewards

# `learning_rate` replaces the deprecated `lr` alias, which newer Keras
# releases no longer accept.
optimizer = keras.optimizers.Adam(learning_rate = 0.01)
loss_fn = keras.losses.binary_crossentropy

In [None]:
# Code18.15
# create model

# Fresh policy network for training: 4 inputs -> 5 ELU units -> 1 sigmoid output.
model = keras.models.Sequential([
    keras.layers.Dense(5, activation = 'elu', input_shape = [4]),
    keras.layers.Dense(1, activation = 'sigmoid')
])

In [None]:
# Code18.16
# train it

env = gym.make('CartPole-v1')
env.seed(42)

for iteration in range(n_iterations):
    # Play a batch of episodes, keeping every step's reward and gradients.
    all_rewards, all_grads = play_multiple_episodes(env, n_episodes_per_update, n_max_steps, model, loss_fn)
    total_rewards = sum(map(sum, all_rewards))
    print('\rIteration: {}, mean rewards: {:.1f}'.format(iteration, total_rewards / n_episodes_per_update), end = '')
    # Turn raw rewards into discounted, normalized per-step weights.
    all_final_rewards = discount_and_normalize_rewards(all_rewards, discount_rate)
    all_mean_grads = []
    for var_index in range(len(model.trainable_variables)):
        # For this variable: weight each step's gradient by that step's
        # normalized return, then average over every step of every episode.
        mean_grads = tf.reduce_mean([final_reward * all_grads[episode_index][step][var_index] for episode_index, final_rewards in enumerate(all_final_rewards) for step, final_reward in enumerate(final_rewards)], axis = 0)
        all_mean_grads.append(mean_grads)
    optimizer.apply_gradients(zip(all_mean_grads, model.trainable_variables))

env.close()
# Replay one episode with the trained policy network.
frames = render_policy_net(model)
plot_animation(frames)

In [None]:
# Code18.17
# print markov chain

# Row s holds the probabilities of moving from state s to states 0..3.
# State 3 only transitions to itself.
transition_probabilities = [
    [0.7, 0.2, 0.0, 0.1],
    [0.0, 0.0, 0.9, 0.1],
    [0.0, 1.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 1.0]
]

# Maximum number of states printed per sequence.
n_max_steps = 50

def print_sequence():
    """Print one random walk through the Markov chain, starting at state 0.

    States are printed as they are visited. The walk stops after printing
    state 3; if `n_max_steps` states are printed without reaching it,
    '...' is appended instead. Reads the module-level
    `transition_probabilities` and `n_max_steps`.
    """
    print('States:', end = ' ')
    state = 0
    printed = 0
    reached_end = False
    while printed < n_max_steps:
        print(state, end = ' ')
        if state == 3:
            reached_end = True
            break
        state = np.random.choice(range(4), p = transition_probabilities[state])
        printed += 1
    if not reached_end:
        print('...', end = '')
    print()

# Print ten independent random walks.
for _ in range(10):
    print_sequence()

In [None]:
# Code18.18
# some dataset

# Markov decision process with 3 states and up to 3 actions per state.
# transition_probabilities[s][a][sp] is the probability of landing in state
# sp after taking action a in state s; None marks an action that is not
# listed for that state in possible_actions.
transition_probabilities = [
    [[0.7, 0.3, 0.0], [1.0, 0.0, 0.0], [0.8, 0.2, 0.0]],
    [[0.0, 1.0, 0.0], None, [0.0, 0.0, 1.0]],
    [None, [0.8, 0.1, 0.1], None]
]

# rewards[s][a][sp] is the reward for the transition s --a--> sp.
rewards = [
    [[10, 0, 0], [0, 0, 0], [0, 0, 0]],
    [[0, 0, 0], [0, 0, 0], [0, 0, -50]],
    [[0, 0, 0], [40, 0, 0], [0, 0, 0]]
]

# possible_actions[s] lists the actions available in state s.
possible_actions = [[0, 1, 2], [0, 2], [1]]

In [None]:
# Code18.19
# Q-iteration

def cal_Q_value(gamma, transition_probabilities, rewards):
    """Run 50 sweeps of Q-value iteration on the 3-state MDP.

    Reads the module-level `possible_actions` to know which (state, action)
    pairs exist; all other entries stay at -inf.

    Args:
        gamma: discount factor.
        transition_probabilities: nested list indexed [s][a][sp].
        rewards: nested list indexed [s][a][sp].

    Returns:
        (3, 3) array of Q-values indexed [state, action].
    """
    n_states = 3
    Q_values = np.full((n_states, 3), -np.inf)
    for state, actions in enumerate(possible_actions):
        Q_values[state, actions] = 0.0

    for _ in range(50):
        # Best value of every state under the previous sweep's estimates,
        # computed before any entry is overwritten in this sweep.
        best_prev = Q_values.max(axis = 1)
        for s in range(n_states):
            for a in possible_actions[s]:
                terms = []
                for sp in range(n_states):
                    terms.append(transition_probabilities[s][a][sp] * (rewards[s][a][sp] + gamma * best_prev[sp]))
                Q_values[s, a] = np.sum(terms)
    return Q_values

# Q-values and the highest-valued action per state with discount factor 0.95 ...
Q_values = cal_Q_value(0.95, transition_probabilities, rewards)
print(Q_values)
print(np.argmax(Q_values, axis = 1))

# ... and again with discount factor 0.90, for comparison.
Q_values = cal_Q_value(0.90, transition_probabilities, rewards)
print(Q_values)
print(np.argmax(Q_values, axis = 1))


In [None]:
# Code18.20
# Q-learning

def step(state, action):
    """Sample one MDP transition.

    Draws the next state from `transition_probabilities[state][action]`
    and looks up the matching entry of the module-level `rewards` table.

    Returns:
        (next_state, reward)
    """
    next_state = np.random.choice([0, 1, 2], p = transition_probabilities[state][action])
    return next_state, rewards[state][action][next_state]

def exploration_policy(state):
    """Pick uniformly at random among the actions listed for `state`."""
    candidates = possible_actions[state]
    return np.random.choice(candidates)

# Q-table: -inf for actions not listed in possible_actions, 0 for the rest.
Q_values = np.full((3, 3), -np.inf)
for state, actions in enumerate(possible_actions):
    Q_values[state][actions] = 0

alpha0 = 0.05   # initial learning rate
decay = 0.005   # learning-rate decay
gamma = 0.90    # discount factor
state = 0       # start in state 0

for iteration in range(10000):
    action = exploration_policy(state)
    next_state, reward = step(state, action)
    next_value = np.max(Q_values[next_state])
    # The learning rate shrinks as iterations accumulate.
    alpha = alpha0 / (1 + iteration * decay)
    # Blend the old estimate with reward + gamma * (best value of next state).
    Q_values[state, action] *= 1 - alpha
    Q_values[state, action] += alpha * (reward + gamma * next_value)
    state = next_state

print(Q_values)
print(np.argmax(Q_values, axis = 1)) 

In [None]:
# Code18.21
# DQN

env = gym.make('CartPole-v1')
# create model
input_shape = [4]   # size of one observation
n_outputs = 2       # one Q-value per action

# Q-network: two hidden layers of 32 ELU units, linear output per action.
model = keras.models.Sequential([
    keras.layers.Dense(32, activation = 'elu', input_shape = input_shape),
    keras.layers.Dense(32, activation = 'elu'),
    keras.layers.Dense(n_outputs)
])

# Action-selection policy
def epsilon_greedy_policy(state, epsilon = 0):
    """Choose an action epsilon-greedily using the module-level `model`.

    With probability `epsilon` a random action in {0, 1} is returned;
    otherwise the action with the largest predicted Q-value.

    Args:
        state: observation (a batch axis is added before prediction).
        epsilon: exploration probability; 0 means always greedy.
    """
    explore = np.random.rand() < epsilon
    if explore:
        # Random action.
        return np.random.randint(2)
    # Greedy action under the current Q-value estimates.
    q_row = model.predict(state[np.newaxis])[0]
    return np.argmax(q_row)

# Define the replay buffer: a deque bounded to 2000 entries.
from collections import deque
replay_memory = deque(maxlen = 2000)

# Draw a random batch of experiences from the replay buffer
def sample_experiences(batch_size):
    """Sample `batch_size` experiences (with replacement) from `replay_memory`.

    Each stored experience is a 5-tuple; the batch is regrouped into one
    array per field.

    Returns:
        (states, actions, rewards, next_states, dones) as NumPy arrays.
    """
    picks = np.random.randint(len(replay_memory), size = batch_size)
    batch = [replay_memory[pick] for pick in picks]
    fields = []
    for field_index in range(5):
        fields.append(np.array([experience[field_index] for experience in batch]))
    states, actions, rewards, next_states, dones = fields
    return states, actions, rewards, next_states, dones

# Act for one step
# NOTE(review): this redefines `play_one_step` from the policy-gradient
# section with a different signature; cells from here on use this version.
def play_one_step(env, state, epsilon):
    """Take one epsilon-greedy step and store the transition in `replay_memory`.

    Returns:
        (next_state, reward, done, info) exactly as unpacked from `env.step`.
    """
    action = epsilon_greedy_policy(state, epsilon)
    outcome = env.step(action)
    next_state, reward, done, info = outcome
    replay_memory.append((state, action, reward, next_state, done))
    return next_state, reward, done, info

In [None]:
# Code18.22
# Define the training hyperparameters and the training step

batch_size = 32        # experiences sampled per training step
discount_rate = 0.95   # discount factor for future rewards
# `learning_rate` replaces the deprecated `lr` alias, which newer Keras
# releases no longer accept.
optimizer = keras.optimizers.Adam(learning_rate = 1e-3)
loss_fn = keras.losses.mean_squared_error

def training_step(batch_size):
    """Run one DQN gradient step on a batch sampled from the replay buffer.

    Reads the module-level `model`, `optimizer`, `loss_fn`,
    `discount_rate` and `n_outputs`.
    """
    # Sample a batch of stored transitions.
    states, actions, rewards, next_states, dones = sample_experiences(batch_size)
    # Largest Q-value the current model predicts for each next state.
    max_next_Q_values = model.predict(next_states).max(axis = 1)
    # Target Q-value; (1 - dones) removes the future term once an episode ended.
    target_Q_values = rewards + (1 - dones) * discount_rate * max_next_Q_values
    # Reshape into a column to line up with the predicted Q-values.
    target_Q_values = target_Q_values.reshape(-1, 1)
    # One-hot mask selecting the action actually taken in each experience.
    action_mask = tf.one_hot(actions, n_outputs)
    with tf.GradientTape() as tape:
        # Q-values for every action ...
        predicted = model(states)
        # ... reduced to the Q-value of the action that was taken.
        taken_Q = tf.reduce_sum(predicted * action_mask, axis = 1, keepdims = True)
        loss = tf.reduce_mean(loss_fn(target_Q_values, taken_Q))
    # Compute the gradients and apply one optimizer step.
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))


In [None]:
# Code18.23
# train it!

# NOTE: this rebinds the global name `rewards`, which earlier held the MDP
# reward table used by step().
rewards = []
best_score = 0

for episode in range(600):
    obs = env.reset()
    for step in range(200):
        # Exploration rate decays linearly with the episode number, floored at 0.01.
        epsilon = max(1 - episode / 500, 0.01)
        obs, reward, done, info = play_one_step(env, obs, epsilon)
        if done:
            break
    # `step` is the zero-based index of the last step played, i.e. one less
    # than the step count printed below.
    rewards.append(step)
    # Remember the weights of the best episode seen so far.
    if step > best_score:
        best_weights = model.get_weights()
        best_score = step
    print('\rEpisode: {}, Steps: {}, eps: {:.3f}'.format(episode, step + 1, epsilon), end = '')
    # Only the first 51 episodes fill the replay buffer without training;
    # afterwards one training step runs per episode.
    if episode > 50:
        training_step(batch_size)

# Restore the weights from the best episode.
model.set_weights(best_weights)

In [None]:
# Code18.24
# draw it

# Plot the per-episode values recorded in `rewards` during training.
plt.figure(figsize = (8, 4))
plt.plot(rewards)
plt.xlabel('Episode', fontsize = 14)
plt.ylabel('Sum of rewards', fontsize = 14)
plt.show()

In [None]:
# Code18.25
# See how the trained agent does

state = env.reset()

frames = []

# Play greedily (epsilon defaults to 0) for at most 200 steps.
for step in range(200):
    action = epsilon_greedy_policy(state)
    state, reward, done, info = env.step(action)
    if done:
        break
    # Frames are captured after each step; the final (done) state is not rendered.
    img = env.render(mode="rgb_array")
    frames.append(img)
    
plot_animation(frames)

In [None]:
# Code18.25
# Double DQN

# Build the models that are needed
model = keras.models.Sequential([
    keras.layers.Dense(32, activation = 'elu', input_shape = [4]),
    keras.layers.Dense(32, activation = 'elu'),
    keras.layers.Dense(n_outputs)
])

# Second model with the same architecture, starting from the same weights.
target = keras.models.clone_model(model)
target.set_weights(model.get_weights())

In [None]:
# Code18.26
# Define the hyperparameters and the training function

batch_size = 32        # experiences sampled per training step
discount_rate = 0.95   # discount factor for future rewards
# `learning_rate` replaces the deprecated `lr` alias, which newer Keras
# releases no longer accept.
optimizer = keras.optimizers.Adam(learning_rate = 1e-3)
loss_fn = keras.losses.Huber()

def training_step(batch_size):
    """Run one Double-DQN gradient step on a sampled batch.

    The online `model` picks the best next action; the `target` model
    supplies the Q-value of that action. Reads the module-level `model`,
    `target`, `optimizer`, `loss_fn`, `discount_rate` and `n_outputs`.
    """
    states, actions, rewards, next_states, dones = sample_experiences(batch_size)

    # Online model selects the next action ...
    greedy_next_actions = model.predict(next_states).argmax(axis = 1)
    selector = tf.one_hot(greedy_next_actions, n_outputs).numpy()
    # ... and the target model evaluates it.
    next_best_Q_values = (target.predict(next_states) * selector).sum(axis = 1)

    target_Q_values = rewards + (1 - dones) * discount_rate * next_best_Q_values
    target_Q_values = target_Q_values.reshape(-1, 1)
    action_mask = tf.one_hot(actions, n_outputs)

    with tf.GradientTape() as tape:
        predicted = model(states)
        # Keep only the Q-value of the action actually taken.
        taken_Q = tf.reduce_sum(predicted * action_mask, axis = 1, keepdims = True)
        loss = tf.reduce_mean(loss_fn(target_Q_values, taken_Q))
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

In [None]:
# Code18.27
# train it

# Start from an empty replay buffer and fresh bookkeeping.
replay_memory = deque(maxlen = 2000)
rewards = []
best_score = 0

for episode in range(600):
    obs = env.reset()
    for step in range(200):
        # Exploration rate decays linearly with the episode number, floored at 0.01.
        epsilon = max(1 - episode / 500, 0.01)
        obs, reward, done, info = play_one_step(env, obs, epsilon)
        if done:
            break
    
    # `step` is the zero-based index of the last step played.
    rewards.append(step)
    if step > best_score:
        best_weights = model.get_weights()
        best_score = step
    print('\rEpisode: {}, Steps: {}, eps: {:.3f}'.format(episode, step + 1, epsilon), end = '')
    if episode > 50:
        training_step(batch_size)
    # Copy the online weights into the target model every 50 episodes.
    if episode % 50 == 0:
        target.set_weights(model.get_weights())

# Restore the weights from the best episode.
model.set_weights(best_weights)

plt.figure(figsize = (8, 4))
plt.plot(rewards)
plt.xlabel('Episode', fontsize = 14)
plt.ylabel('Sum of rewards', fontsize = 14)
plt.show()

In [None]:
# Code18.28
# See how the trained agent does

state = env.reset()

frames = []

# Play greedily (epsilon defaults to 0) for at most 200 steps.
for step in range(200):
    action = epsilon_greedy_policy(state)
    state, reward, done, info = env.step(action)
    if done:
        break
    img = env.render(mode="rgb_array")
    frames.append(img)
    
plot_animation(frames)

In [None]:
# Code18.29
# dueling double dqn

K = keras.backend

# input layer
input_states = keras.layers.Input(shape = [4])
# hidden layer
hidden1 = keras.layers.Dense(32, activation = 'elu')(input_states)
hidden2 = keras.layers.Dense(32, activation = 'elu')(hidden1)

# Two heads on the shared trunk: a single state value and one raw
# advantage per action.
state_values = keras.layers.Dense(1)(hidden2)
raw_advantages = keras.layers.Dense(n_outputs)(hidden2)
# Subtract each row's maximum so the largest advantage is 0.
advantages = raw_advantages - K.max(raw_advantages, axis = 1, keepdims = True)

# Q(s, a) = V(s) + A(s, a)
Q_values = state_values + advantages
model = keras.models.Model(inputs = [input_states], outputs = [Q_values])

# Target model with the same architecture and the same starting weights.
target = keras.models.clone_model(model)
target.set_weights(model.get_weights())

In [None]:
# Code18.30
# Hyperparameters and training function

batch_size = 32        # experiences sampled per training step
discount_rate = 0.95   # discount factor for future rewards
# `learning_rate` replaces the deprecated `lr` alias, which newer Keras
# releases no longer accept.
optimizer = keras.optimizers.Adam(learning_rate = 1e-2)
loss_fn = keras.losses.Huber()

def training_step(batch_size):
    """Run one Double-DQN gradient step for the dueling network.

    The online `model` picks the best next action and the `target` model
    supplies that action's Q-value. Reads the module-level `model`,
    `target`, `optimizer`, `loss_fn`, `discount_rate` and `n_outputs`.

    Fix: the reshaped targets were assigned to a misspelled name
    (`target_Q_valuse`), so the loss received 1-D targets against
    column-shaped predictions. The reshape now takes effect.
    """
    experiences = sample_experiences(batch_size)
    states, actions, rewards, next_states, dones = experiences

    # Online model selects the next action ...
    next_Q_values = model.predict(next_states)
    best_next_actions = np.argmax(next_Q_values, axis = 1)

    # ... and the target model evaluates it.
    next_mask = tf.one_hot(best_next_actions, n_outputs).numpy()
    next_best_Q_values = (target.predict(next_states) * next_mask).sum(axis = 1)

    # (1 - dones) removes the future term once an episode has ended.
    target_Q_values = (rewards + (1 - dones) * discount_rate * next_best_Q_values)
    # Column shape, matching Q_values computed with keepdims = True below.
    target_Q_values = target_Q_values.reshape(-1, 1)

    mask = tf.one_hot(actions, n_outputs)
    with tf.GradientTape() as tape:
        all_Q_values = model(states)
        # Keep only the Q-value of the action actually taken.
        Q_values = tf.reduce_sum(all_Q_values * mask, axis = 1, keepdims = True)
        loss = tf.reduce_mean(loss_fn(target_Q_values, Q_values))
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))


In [None]:
# Code18.31
# train it

# Start from an empty replay buffer and fresh bookkeeping.
replay_memory = deque(maxlen = 2000)

rewards = []
best_score = 0

for episode in range(600):
    obs = env.reset()
    for step in range(200):
        # Exploration rate decays linearly with the episode number, floored at 0.01.
        epsilon = max(1 - episode / 500, 0.01)
        obs, reward, done, info = play_one_step(env, obs, epsilon)
        if done:
            break

    # `step` is the zero-based index of the last step played.
    rewards.append(step)
    if step > best_score:
        best_weights = model.get_weights()
        best_score = step
    # Label fixed from 'wps' to 'eps', matching the earlier training loops.
    print('\rEpisode: {}, Steps: {}, eps: {:.3f}'.format(episode, step + 1, epsilon), end = '')

    if episode > 50:
        training_step(batch_size)
    # Copy the online weights into the target model every 200 episodes.
    if episode % 200 == 0:
        target.set_weights(model.get_weights())

# Restore the weights from the best episode.
model.set_weights(best_weights)

plt.plot(rewards)
plt.xlabel('Episode')
plt.ylabel('Sum of rewards')
plt.show()

In [None]:
# Code18.32
# try it

state = env.reset()
frames = []

# Play greedily (epsilon defaults to 0) for at most 200 steps.
for step in range(200):
    action = epsilon_greedy_policy(state)
    state, reward, done, info = env.step(action)
    if done:
        break
    
    img = env.render(mode = 'rgb_array')
    frames.append(img)

plot_animation(frames)


In [None]:
# Code18.33
# Done! Close the CartPole environment used by the DQN sections.

env.close()

In [None]:
# The parts below raised errors when run on Colab, so I ran them locally.
# Code18.34
# hello tf-agents

from tf_agents.environments import suite_gym

# Load Breakout through TF-Agents' gym suite, then print the wrapper, the
# underlying gym environment, the first time step and the result of one step.
env = suite_gym.load('Breakout-v4')
print(env)
print(env.gym)
print(env.reset())
print(env.step(np.array([1])))

In [None]:
# Code18.35
# Get some other information about the environment

print(env.observation_spec())
print(env.action_spec())
print(env.time_step_spec())
print(env.gym.get_action_meanings())

In [None]:
# Code18.36
# use some wrappers

from tf_agents.environments.wrappers import ActionRepeat

# Wrap the environment with ActionRepeat (times = 4).
repeating_env = ActionRepeat(env, times = 4)

print(repeating_env)
repeating_env.unwrapped

In [None]:
# Code18.37
# wrappers info
import tf_agents.environments.wrappers

# List every class in tf_agents.environments.wrappers that derives from
# PyEnvironmentBaseWrapper, with the first line of its docstring.
# NOTE(review): obj.__doc__.split would fail for a class without a docstring.
for name in dir(tf_agents.environments.wrappers):
    obj = getattr(tf_agents.environments.wrappers, name)
    if hasattr(obj, '__base__') and issubclass(obj, tf_agents.environments.wrappers.PyEnvironmentBaseWrapper):
        print('{:27s} {}'.format(name, obj.__doc__.split('\n')[0]))


In [None]:
# Code18.38

from functools import partial
from gym.wrappers import TimeLimit

# Load Breakout with a gym-level TimeLimit wrapper (10000 steps) and a
# TF-Agents-level ActionRepeat wrapper (times = 4).
limited_repeating_env = suite_gym.load('Breakout-v4', gym_env_wrappers = [partial(TimeLimit, max_episode_steps = 10000)], env_wrappers = [partial(ActionRepeat, times = 4)])

print(limited_repeating_env)
limited_repeating_env.unwrapped

In [None]:
# Code18.39

from tf_agents.environments import suite_atari
from tf_agents.environments.atari_preprocessing import AtariPreprocessing
from tf_agents.environments.atari_wrappers import FrameStack4

max_episode_steps = 27000
environment_name = 'BreakoutNoFrameskip-v4'

# Load the Atari environment with the preprocessing and 4-frame stacking wrappers.
env = suite_atari.load(environment_name, max_episode_steps = max_episode_steps, gym_env_wrappers = [AtariPreprocessing, FrameStack4])

# some example
env.seed(42)
env.reset()
# One step with action 1, then four steps with action 3
# (presumably FIRE and LEFT — confirm against get_action_meanings()).
time_step = env.step(np.array([1]))

for _ in range(4):
    time_step = env.step(np.array([3]))

# draw it
def plot_observation(obs):
    """Show a stacked-frame observation as a single colour image.

    The first three channels are used as the image. The amount by which the
    fourth channel exceeds their mean (clipped at 0) is added to channels 0
    and 2, so what is new in the fourth channel stands out. The result is
    scaled by 1/150 and clipped to [0, 1] before display.

    Args:
        obs: array whose last axis holds at least four channels.
    """
    as_float = obs.astype(np.float32)
    rgb = as_float[..., :3]
    # Computed before `rgb` is modified below.
    delta = np.maximum(as_float[..., 3] - rgb.mean(axis = -1), 0.)
    for channel in (0, 2):
        rgb[..., channel] += delta
    plt.imshow(np.clip(rgb / 150, 0, 1))
    plt.axis('off')

# Display the observation from the last step taken above.
plt.figure(figsize = (6, 6))
plot_observation(time_step.observation)
plt.show()

In [None]:
# Code18.40
#

from tf_agents.environments.tf_py_environment import TFPyEnvironment

# Wrap the Python environment so it can be used through the TFPyEnvironment API.
tf_env = TFPyEnvironment(env)

In [None]:
# Code18.41
# create DQN

from tf_agents.networks.q_network import QNetwork

# Cast observations to float32 and divide by 255.
preprocessing_layer = keras.layers.Lambda(lambda obs: tf.cast(obs, np.float32) / 255.0)
# One tuple per convolutional layer
# (presumably (filters, kernel size, stride) — confirm in the QNetwork docs).
conv_layer_params = [(32, (8, 8), 4), (64, (4, 4), 2), (64, (3, 3), 1)]
# One fully connected layer of 512 units.
fc_layer_params = [512]

# model DQN
q_net = QNetwork(
    #
    tf_env.observation_spec(),
    tf_env.action_spec(),
    preprocessing_layers = preprocessing_layer,
    conv_layer_params = conv_layer_params,
    fc_layer_params = fc_layer_params
)

In [None]:
# Code18.42
# Create DQN agent

from tf_agents.agents.dqn.dqn_agent import DqnAgent

# Counter handed to the agent as train_step_counter and read by epsilon_fn.
train_step = tf.Variable(0)
# Also used as num_steps of the collect driver in a later cell.
update_period = 4
optimizer = tf.compat.v1.train.RMSPropOptimizer(learning_rate = 2.5e-4, decay = 0.95, momentum = 0.0, epsilon = 0.00001, centered = True)

# A learning-rate schedule reused as an epsilon schedule: decays from 1.0
# to 0.01 over 250000 // update_period training steps.
epsilon_fn = keras.optimizers.schedules.PolynomialDecay(initial_learning_rate = 1.0, decay_steps = 250000 // update_period, end_learning_rate = 0.01)

agent = DqnAgent(
    tf_env.time_step_spec(),
    tf_env.action_spec(),
    q_network = q_net,
    optimizer = optimizer,
    target_update_period = 2000,
    td_errors_loss_fn = keras.losses.Huber(reduction = 'none'),
    gamma = 0.99,
    train_step_counter = train_step,
    # Evaluated lazily so epsilon follows the current value of train_step.
    epsilon_greedy = lambda: epsilon_fn(train_step)
)

agent.initialize()

In [None]:
# Code18.43
# Create the replay buffer

from tf_agents.replay_buffers import tf_uniform_replay_buffer

replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(data_spec = agent.collect_data_spec, batch_size = tf_env.batch_size, max_length = 1000000)

# Observer callable that the drivers use to write into the buffer.
replay_buffer_observer = replay_buffer.add_batch

In [None]:
# Code18.44
# create observer

class ShowProgress:
    """Observer that prints a running 'counter/total' progress line.

    The counter grows by one for every trajectory that is not a boundary.
    The line is printed on every call where the counter is a multiple of
    100, including calls that left the counter unchanged.
    """

    def __init__(self, total):
        self.total = total
        self.counter = 0

    def __call__(self, trajectory):
        # Boundary trajectories do not count as progress.
        self.counter += int(not trajectory.is_boundary())
        if self.counter % 100:
            return
        print('\r{}/{}'.format(self.counter, self.total), end = '')


In [None]:
# Code18.45
# create train metrics

from tf_agents.metrics import tf_metrics

train_metrics = [
    # number of episodes
    tf_metrics.NumberOfEpisodes(),
    # number of environment steps
    tf_metrics.EnvironmentSteps(),
    # average return per episode
    tf_metrics.AverageReturnMetric(),
    # average episode length
    tf_metrics.AverageEpisodeLengthMetric(),
]

# Current value of the first metric (the episode count).
train_metrics[0].result()

In [None]:
# Code18.46
# log

from tf_agents.eval.metric_utils import log_metrics
import logging

# Set the root logger to INFO, then log the metrics.
logging.getLogger().setLevel(logging.INFO)
log_metrics(train_metrics)

In [None]:
# Code18.47
# create step driver

from tf_agents.drivers.dynamic_step_driver import DynamicStepDriver

# Driver that runs the agent's collect policy for update_period steps per
# call, feeding each step to the replay buffer and to the training metrics.
collect_driver = DynamicStepDriver(
    tf_env,
    agent.collect_policy,
    observers = [replay_buffer_observer] + train_metrics,
    num_steps = update_period
)

In [None]:
# Code18.48
# 

from tf_agents.policies.random_tf_policy import RandomTFPolicy

# Fill the replay buffer with 20000 steps from a random policy before
# training, printing progress along the way.
initial_collect_policy = RandomTFPolicy(tf_env.time_step_spec(), tf_env.action_spec())
init_driver = DynamicStepDriver(
    tf_env,
    initial_collect_policy,
    observers = [replay_buffer.add_batch, ShowProgress(20000)],
    num_steps = 20000
)

final_time_step, final_policy_state = init_driver.run()

In [None]:
# Code18.49
# trajectories test

tf.random.set_seed(887)

# Sample 2 sub-episodes of 3 consecutive steps each from the replay buffer.
trajectories, buffer_info = replay_buffer.get_next(sample_batch_size = 2, num_steps = 3)

print(trajectories._fields)
print(trajectories.observation.shape)

In [None]:
# Code18.50

from tf_agents.trajectories.trajectory import to_transition

# Split the sampled trajectories into (time step, action, next time step).
time_steps, action_steps, next_time_steps = to_transition(trajectories)

print(time_steps.observation.shape)
print(trajectories.step_type.numpy())

# Show the sampled observations in a 2 x 3 grid: one row per sampled
# sub-episode, one column per step.
plt.figure(figsize = (10, 6.8))
for row in range(2):
    for col in range(3):
        plt.subplot(2, 3, row *3 + col + 1)
        plot_observation(trajectories.observation[row, col].numpy())
plt.subplots_adjust(left = 0, right = 1, bottom = 0, top = 1, hspace = 0, wspace = 0.02)
plt.show()

In [None]:
# Code18.50
# create dataset

# Dataset over the replay buffer: batches of 64 samples, 2 steps each, prefetched.
dataset = replay_buffer.as_dataset(sample_batch_size = 64, num_steps = 2, num_parallel_calls = 3).prefetch(3)

In [None]:
# Code18.51
# train it!

from tf_agents.utils.common import function

# Wrap the two hot paths with tf_agents.utils.common.function
# (presumably compiling them to TF graphs — confirm in the TF-Agents docs).
collect_driver.run = function(collect_driver.run)
agent.train = function(agent.train)

def train_agent(n_iterations):
    """Alternate between collecting experience and training the agent.

    Each iteration runs the collect driver once, pulls one batch from the
    dataset and performs one `agent.train` call. The loss is printed every
    iteration and the training metrics are logged every 1000 iterations.
    Reads the module-level `agent`, `tf_env`, `dataset`, `collect_driver`
    and `train_metrics`.
    """
    time_step = None
    policy_state = agent.collect_policy.get_initial_state(tf_env.batch_size)
    batches = iter(dataset)
    for iteration in range(n_iterations):
        # The driver resumes from where the previous call stopped.
        time_step, policy_state = collect_driver.run(time_step, policy_state)
        experience, _ = next(batches)
        loss_info = agent.train(experience)

        print('\r{} loss: {:.5f}'.format(iteration, loss_info.loss.numpy()), end = '')
        if not iteration % 1000:
            log_metrics(train_metrics)

# Run 1000 collect-and-train iterations.
train_agent(n_iterations = 1000)

In [None]:
# Code18.52
# draw it!

frames = []
def save_frames(trajectory):
    # Observer: append a rendered RGB frame of the wrapped environment on
    # every call. The trajectory argument itself is not used.
    global frames
    frames.append(tf_env.pyenv.envs[0].render(mode = 'rgb_array'))

# Life count when this cell starts; used below to detect a change.
prev_lives = tf_env.pyenv.envs[0].ale.lives()
def reset_and_fire_on_life_lost(trajectory):
    # Observer: when the life count differs from the last one seen, reset
    # the environment and send action 1 directly to the wrapped Python
    # environment (presumably FIRE, to relaunch the ball — confirm).
    global prev_lives
    lives = tf_env.pyenv.envs[0].ale.lives()
    if prev_lives != lives:
        tf_env.reset()
        tf_env.pyenv.envs[0].step(np.array([1]))
        prev_lives = lives

# Run the agent's policy (agent.policy, not the collect policy) for 1000
# steps, recording frames and printing progress.
watch_driver = DynamicStepDriver(
    tf_env,
    agent.policy,
    observers = [save_frames, reset_and_fire_on_life_lost, ShowProgress(1000)],
    num_steps = 1000
)

final_time_step, final_policy_state = watch_driver.run()

plot_animation(frames)
