# Lab15_108032053

## 1. Model & Training

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import moviepy.editor as mpy
import skimage.transform
from IPython.display import Image, display

import tensorflow as tf
import tensorflow_probability as tfp
import tensorflow.keras.losses as kls

2024-01-03 00:00:51.355861: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9360] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-03 00:00:51.355915: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-03 00:00:51.355942: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1537] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-03 00:00:51.365316: I tensorflow/core/platform/cpu_feature_guard.cc:183] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Configure GPU visibility and memory growth before any tensors are created.
gpus = tf.config.list_physical_devices("GPU") 
if gpus:
    try:
        # Restrict TensorFlow to only use the first detected GPU (index 0)
        tf.config.set_visible_devices(gpus[0], 'GPU')

        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        
        # Report how many physical devices were found vs. how many are usable.
        logical_gpus = tf.config.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

1 Physical GPUs, 1 Logical GPUs


2024-01-03 00:00:53.657005: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1883] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 31136 MB memory:  -> device: 0, name: Tesla V100-SXM2-32GB, pci bus id: 0000:b1:00.0, compute capability: 7.0


In [4]:
import os
# Select SDL's "dummy" video driver so no game window pops up.
# NOTE(review): kept ahead of the ple imports -- presumably it has to be set
# before the game library initialises its display; confirm before reordering.
os.environ["SDL_VIDEODRIVER"] = "dummy"  # this line make pop-out window not appear
from ple.games.flappybird import FlappyBird
from ple import PLE

# Training environment.
game = FlappyBird()
env = PLE(game, fps=30, display_screen=False)  # environment interface to game
env.reset_game()

# Separate environment instance used only for evaluation (see test_reward),
# so test games do not disturb the training environment's state.
test_game = FlappyBird()
test_env = PLE(test_game, fps=30, display_screen=False)
test_env.reset_game()

couldn't import doomish
Couldn't import doom




In [5]:
# Directory that receives the demo videos written during training.
path = './movie_f'
# exist_ok=True makes the call idempotent and avoids the check-then-create race.
os.makedirs(path, exist_ok=True)

In [6]:
# Hyper-parameters shared by the model, the PPO update and the training loop.
hparas = {
    'image_size': 84,  # side length (pixels) each frame is resized to in preprocess_screen
    'num_stack': 4,  # NOTE(review): not read by the visible code; frames_to_state hard-codes 4 frames
    'action_dim': len(env.getActionSet()),  # number of env actions; size of the actor's softmax output
    'hidden_size': 256,  # units of the dense embedding layer in ActorCriticNetwork
    'lr': 0.0001,  # learning rate of the Adam optimizer
    'gamma': 0.99,  # discount factor passed to compute_gae
    'lambda': 0.95,  # GAE lambda passed to compute_gae
    'clip_val': 0.2,  # PPO probability-ratio clip range (1 +/- clip_val)
    'ppo_epochs': 8,  # passes over the collected rollout per ppo_update call
    'test_epochs': 1,  # number of test games averaged per evaluation
    'num_steps': 512,  # environment steps collected per training iteration
    'mini_batch_size': 64,  # samples drawn per gradient step
    'target_reward': 200,  # NOTE(review): not read by the visible code
    'max_episode': 30000,  # upper bound on the number of training iterations
}

In [7]:
def make_anim(images, fps=60, true_image=False):
    """Wrap a sequence of frames in a moviepy ``VideoClip``.

    Args:
        images: Non-empty sequence of frames; each frame must support
            ``astype`` (e.g. a NumPy array).
        fps: Playback rate in frames per second; also stored on the clip.
        true_image: If True, frames are cast to ``uint8`` unchanged.
            Otherwise each frame is rescaled with ``(x + 1) / 2 * 255``
            before the cast (assumes values in [-1, 1] -- TODO confirm).

    Returns:
        The ``mpy.VideoClip`` whose duration is ``len(images) / fps`` seconds.

    Raises:
        ValueError: If ``images`` is empty (there would be nothing to render).
    """
    num_frames = len(images)
    if num_frames == 0:
        raise ValueError("make_anim needs at least one frame")
    duration = num_frames / fps

    def make_frame(t):
        # Map the timestamp to a frame index and clamp it to the last frame.
        # An explicit clamp replaces the former bare ``except:``, which also
        # swallowed unrelated errors (including KeyboardInterrupt).
        index = min(int(num_frames / duration * t), num_frames - 1)
        x = images[index]

        if true_image:
            return x.astype(np.uint8)
        else:
            return ((x + 1) / 2 * 255).astype(np.uint8)

    clip = mpy.VideoClip(make_frame, duration=duration)
    clip.fps = fps

    return clip

In [8]:
def preprocess_screen(screen):
    """Turn a raw grayscale game screen into a square float32 network input.

    The screen is rotated by -90 degrees (canvas resized to fit), cropped to
    its first 400 rows, and resized to
    ``(hparas['image_size'], hparas['image_size'], 1)``.
    """
    rotated = skimage.transform.rotate(screen, -90, resize=True)
    # Keep only the first 400 rows of the rotated image.
    cropped = rotated[:400, :]
    side = hparas['image_size']
    resized = skimage.transform.resize(cropped, [side, side, 1])
    return resized.astype(np.float32)

def frames_to_state(input_frames):
    """Stack the most recent frames along the last axis into one state.

    With four or more frames the last four are used. Shorter histories are
    padded by repetition so four frames are always concatenated:
      * 1 frame  -> f0, f0, f0, f0
      * 2 frames -> f0, f0, f1, f1
      * 3 frames -> f0, f1, f2, f2
    """
    count = len(input_frames)

    if count == 1:
        stack = [input_frames[0]] * 4
    elif count == 2:
        first, second = input_frames[0], input_frames[1]
        stack = [first, first, second, second]
    elif count == 3:
        stack = [input_frames[0], input_frames[1], input_frames[2], input_frames[2]]
    else:
        stack = input_frames[-4:]

    return np.concatenate(stack, axis=-1)

In [9]:
class ActorCriticNetwork(tf.keras.Model):
    """Actor-critic model with a shared convolutional feature extractor.

    The trunk is three Conv2D+ReLU stages followed by Flatten and a
    Dense+ReLU embedding of ``hparas['hidden_size']`` units. Two single Dense
    heads sit on top of it: a softmax policy head with
    ``hparas['action_dim']`` outputs and a linear scalar value head.
    """

    def __init__(self, hparas):
        super().__init__()

        # (filters, kernel_size, strides) of each convolutional stage.
        conv_stages = [(32, 8, 4), (64, 4, 2), (64, 3, 1)]

        trunk = []
        for filters, kernel_size, strides in conv_stages:
            trunk.append(tf.keras.layers.Conv2D(filters=filters, kernel_size=kernel_size, strides=strides))
            trunk.append(tf.keras.layers.ReLU())

        # Embedding layers
        trunk.append(tf.keras.layers.Flatten())
        trunk.append(tf.keras.layers.Dense(hparas['hidden_size']))
        trunk.append(tf.keras.layers.ReLU())

        self.feature_extractor = tf.keras.Sequential(trunk)

        # Policy head: outputs action probabilities (softmax), not raw logits.
        self.actor = tf.keras.layers.Dense(hparas['action_dim'], activation='softmax')
        # Value head: one unbounded scalar per input state.
        self.critic = tf.keras.layers.Dense(1, activation = None)

    def call(self, input):
        """Return ``(action_probabilities, state_value)`` for a batch of states."""
        features = self.feature_extractor(input)
        action_probs = self.actor(features)
        state_value = self.critic(features)
        return action_probs, state_value

In [10]:
class Agent():
    """PPO agent that owns the actor-critic network and its Adam optimizer.

    Required ``hparas`` keys: ``'gamma'``, ``'lr'``, ``'clip_val'`` plus the
    keys consumed by ``ActorCriticNetwork``. Optional keys:
      * ``'entropy_coef'`` -- weight of the entropy bonus (default 0.1)
      * ``'critic_coef'``  -- weight of the value loss (default 0.5)
    The defaults reproduce the values that used to be hard-coded.
    """

    def __init__(self, hparas):
        self.gamma = hparas['gamma']
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=hparas['lr'])
        self.actor_critic = ActorCriticNetwork(hparas)
        self.clip_pram = hparas['clip_val']
        # Loss weights are configurable but fall back to the former constants.
        self.entropy_coef = hparas.get('entropy_coef', 0.1)
        self.critic_coef = hparas.get('critic_coef', 0.5)

    def ppo_iter(self, mini_batch_size, states, actions, log_probs, returns, advantage):
        """Yield random mini-batches gathered from the rollout tensors.

        ``states.shape[0] // mini_batch_size`` batches are produced; indices
        are drawn with ``np.random.randint``, i.e. with replacement, so a
        sample may appear several times or not at all. If the rollout holds
        fewer than ``mini_batch_size`` samples nothing is yielded.

        Yields:
            Tuples ``(states, actions, log_probs, returns, advantage)`` each
            gathered at the same random indices.
        """
        batch_size = states.shape[0]
        for _ in range(batch_size // mini_batch_size):
            rand_ids = tf.convert_to_tensor(np.random.randint(0, batch_size, mini_batch_size), dtype=tf.int32)
            yield tf.gather(states, rand_ids), tf.gather(actions, rand_ids), tf.gather(log_probs, rand_ids), \
             tf.gather(returns, rand_ids), tf.gather(advantage, rand_ids)

    def ppo_update(self, ppo_epochs, mini_batch_size, states, actions, log_probs, discount_rewards, advantages):
        """Run ``ppo_epochs`` rounds of clipped-surrogate PPO updates.

        Args:
            ppo_epochs: Number of rounds of mini-batch sampling.
            mini_batch_size: Samples per gradient step.
            states, actions, log_probs: Rollout tensors; ``log_probs`` are
                the log-probabilities recorded when the actions were sampled.
            discount_rewards: Return targets for the critic.
            advantages: Advantage estimates weighting the policy objective.

        Returns:
            ``(total_actor_loss, total_critic_loss)``: the losses summed (not
            averaged) over every gradient step. Both stay the integer 0 when
            no mini-batch was produced.
        """
        total_actor_loss = 0
        total_critic_loss = 0
        for _ in range(ppo_epochs):
            for state, action, old_log_probs, reward, advantage in self.ppo_iter(mini_batch_size, states, actions, log_probs, discount_rewards, advantages):
                # Give the targets a trailing axis like the critic output.
                reward = tf.expand_dims(reward, axis=-1)

                with tf.GradientTape() as tape:
                    prob, value = self.actor_critic(state, training=True)
                    dist = tfp.distributions.Categorical(probs=prob, dtype=tf.float32)
                    entropy = tf.math.reduce_mean(dist.entropy())
                    new_log_probs = dist.log_prob(action)

                    # PPO ratio between the current and the sampling policy
                    ratio = tf.math.exp(new_log_probs - old_log_probs)
                    surr1 = ratio * advantage
                    surr2 = tf.clip_by_value(ratio, 1.0 - self.clip_pram, 1.0 + self.clip_pram) * advantage

                    # Maximise the clipped surrogate plus an entropy bonus.
                    actor_loss = tf.math.negative(tf.math.reduce_mean(tf.math.minimum(surr1, surr2))) - self.entropy_coef * entropy
                    critic_loss = self.critic_coef * tf.math.reduce_mean(kls.mean_squared_error(reward, value))

                    total_loss = actor_loss + critic_loss

                # single optimizer updates the shared trunk and both heads
                grads = tape.gradient(total_loss, self.actor_critic.trainable_variables)
                self.optimizer.apply_gradients(zip(grads, self.actor_critic.trainable_variables))

                total_actor_loss += actor_loss
                total_critic_loss += critic_loss
        return total_actor_loss, total_critic_loss

In [11]:
# https://arxiv.org/pdf/1506.02438.pdf
# Equation 16
def compute_gae(rewards, masks, values, gamma, LAMBDA):
    """Compute per-step return targets with Generalized Advantage Estimation.

    Args:
        rewards: Reward received at each step.
        masks: Per-step continuation flag; 0 cuts both the bootstrap value
            and the accumulated advantage at that step.
        values: Value estimates; must hold one more entry than ``rewards``
            (``values[i + 1]`` is read for every step ``i``).
        gamma: Discount factor.
        LAMBDA: GAE smoothing factor.

    Returns:
        A list, in chronological order, of ``advantage + value`` per step.
    """
    horizon = len(rewards)
    targets = [None] * horizon
    running_advantage = 0

    # Walk backwards so each step can reuse the advantage of its successor.
    for step in range(horizon - 1, -1, -1):
        td_error = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
        running_advantage = td_error + gamma * LAMBDA * masks[step] * running_advantage
        targets[step] = running_advantage + values[step]

    return targets

In [12]:
def test_reward(test_env, agent):
    """Play one game greedily and return the summed reward.

    Args:
        test_env: Environment exposing ``reset_game``, ``game_over``,
            ``getScreenGrayscale``, ``getActionSet`` and ``act``.
        agent: Object whose ``actor_critic`` maps a batched state to
            ``(action_probabilities, value)``.

    Returns:
        Sum of the rewards returned by ``test_env.act`` until the game ends.
    """
    total_reward = 0
    # Reset the environment
    test_env.reset_game()
    input_frames = [preprocess_screen(test_env.getScreenGrayscale())]

    while not test_env.game_over():

        state = frames_to_state(input_frames)
        state = tf.expand_dims(state, axis=0)
        prob, _ = agent.actor_critic(state)

        # Evaluation is deterministic: always take the most probable action.
        action = np.argmax(prob[0].numpy())
        reward = test_env.act(test_env.getActionSet()[action])
        total_reward += reward

        input_frames.append(preprocess_screen(test_env.getScreenGrayscale()))
        # frames_to_state only reads the last four frames, so older ones are
        # dropped to keep memory bounded during long games.
        del input_frames[:-4]

    return total_reward

In [13]:
# Build the agent and the schedule constants used by the training loop.
agent = Agent(hparas)
max_episode = hparas['max_episode']
test_per_n_episode = 10  # evaluate on test_env every N training iterations
force_save_per_n_episode = 1000  # always save model, checkpoint and a demo video every N iterations
early_stop_reward = 10  # training stops once the best test reward reaches this value

start_s = 0  # NOTE(review): not referenced by the visible training loop
best_reward = -5.0  # starting baseline for the best test reward seen so far

# Track the network and the optimizer together so both are written by
# checkpoint.save() and can be restored as a pair.
checkpoint = tf.train.Checkpoint(
    actor_critic = agent.actor_critic,
    optimizer = agent.optimizer,
)

# Load from old checkpoint
# checkpoint.restore('ckpt_dir/ckpt-?')

In [14]:
# Main PPO training loop: collect a fixed-length rollout, update the agent,
# periodically evaluate on test_env, and stop early once the best test reward
# reaches early_stop_reward.
ep_reward = []
total_avgr = []
early_stop = False
avg_rewards_list = []  # test reward recorded at every evaluation

env.reset_game()

for s in range(0, max_episode):
    if early_stop:
        break

    # Per-rollout buffers, one entry per environment step.
    rewards = []
    states = []
    actions = []
    log_probs = []
    masks = []
    values = []

    display_frames = [env.getScreenRGB()]
    input_frames = [preprocess_screen(env.getScreenGrayscale())]

    for step in range(hparas['num_steps']):

        state = frames_to_state(input_frames)
        state = tf.expand_dims(state, axis=0)
        prob, value = agent.actor_critic(state)

        # Sample from the policy while training (exploration).
        dist = tfp.distributions.Categorical(probs=prob[0], dtype=tf.float32)
        action = dist.sample(1)
        log_prob = dist.log_prob(action)

        # `action` has shape (1,); index it explicitly because calling int()
        # on a non-0-d NumPy array is deprecated.
        reward = env.act(env.getActionSet()[int(action.numpy()[0])])

        done = env.game_over()

        states.append(state)
        actions.append(action)
        values.append(value[0])
        log_probs.append(log_prob)
        rewards.append(tf.convert_to_tensor(reward, dtype=tf.float32))
        # mask is 0 on terminal steps so GAE does not bootstrap across games
        masks.append(tf.convert_to_tensor(1-int(done), dtype=tf.float32))

        display_frames.append(env.getScreenRGB())
        input_frames.append(preprocess_screen(env.getScreenGrayscale()))

        if done:
            env.reset_game()
            input_frames = [preprocess_screen(env.getScreenGrayscale())]

    # Bootstrap from the observation that FOLLOWS the last action. Reusing the
    # pre-action `state` here would just duplicate values[-1] and bias the
    # final TD error. If the last step was terminal its mask is 0, so the
    # value of the freshly reset game is ignored by compute_gae.
    next_state = tf.expand_dims(frames_to_state(input_frames), axis=0)
    _, next_value = agent.actor_critic(next_state)
    values.append(next_value[0])

    returns = compute_gae(rewards, masks, values, hparas['gamma'], hparas['lambda'])

    # Flatten the per-step lists into batch tensors.
    returns = tf.concat(returns, axis=0)
    log_probs = tf.concat(log_probs, axis=0)
    values = tf.concat(values, axis=0)
    states = tf.concat(states, axis=0)
    actions = tf.concat(actions, axis=0)
    # values carries one extra bootstrap entry; drop it to align with returns
    advantage = returns - values[:-1]

    a_loss, c_loss = agent.ppo_update(hparas['ppo_epochs'], hparas['mini_batch_size'], states, actions, log_probs, returns, advantage)
    print('[Episode %d]  Actor loss: %.5f, Critic loss: %.5f' % (s, a_loss, c_loss))

    if s % test_per_n_episode == 0:
        # test agent hparas['test_epochs'] times to get the average reward
        avg_reward = np.mean([test_reward(test_env, agent) for _ in range(hparas['test_epochs'])])
        print("Test average reward is %.1f, Current best average reward is %.1f\n" % (avg_reward, best_reward))
        avg_rewards_list.append(avg_reward)

        # Keep a snapshot whenever the test reward improves.
        if avg_reward > best_reward:
            best_reward = avg_reward
            agent.actor_critic.save('./save/Actor/model_actor_{}_{}'.format(s, avg_reward), save_format="tf")
            checkpoint.save(file_prefix = './save/checkpoints/ckpt')

    # Periodic unconditional save plus a demo video of the latest rollout.
    # avg_reward is always fresh here as long as force_save_per_n_episode is
    # a multiple of test_per_n_episode.
    if s % force_save_per_n_episode == 0:
        agent.actor_critic.save('./save/Actor/model_actor_{}_{}'.format(s, avg_reward), save_format="tf")
        checkpoint.save(file_prefix = './save/checkpoints/ckpt')
        clip = make_anim(display_frames, fps=60, true_image=True).rotate(-90)
        clip.write_videofile("movie_f/{}_demo-{}.webm".format('Lab15', s), fps=60)
        display(clip.ipython_display(fps=60, autoplay=1, loop=1, maxduration=120))

    if best_reward >= early_stop_reward:
        early_stop = True

2024-01-03 00:00:55.026636: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:442] Loaded cuDNN version 8906
2024-01-03 00:01:08.926586: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x56107e7a1f20 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-01-03 00:01:08.926623: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Tesla V100-SXM2-32GB, Compute Capability 7.0
2024-01-03 00:01:08.936553: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-01-03 00:01:09.017115: I ./tensorflow/compiler/jit/device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[Episode 0]  Actor loss: 86.21421, Critic loss: 59.75151
Test average reward is -5.0, Current best average reward is -5.0

INFO:tensorflow:Assets written to: ./save/Actor/model_actor_0_-5.0/assets


INFO:tensorflow:Assets written to: ./save/Actor/model_actor_0_-5.0/assets


Moviepy - Building video movie_f/Lab15_demo-0.webm.
Moviepy - Writing video movie_f/Lab15_demo-0.webm



                                                               

Moviepy - Done !
Moviepy - video ready movie_f/Lab15_demo-0.webm
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4



                                                               

Moviepy - Done !
Moviepy - video ready __temp__.mp4


[Episode 1]  Actor loss: 35.88290, Critic loss: 26.41533
[Episode 2]  Actor loss: 35.92574, Critic loss: 22.29245
[Episode 3]  Actor loss: 16.55899, Critic loss: 10.39134
[Episode 4]  Actor loss: 21.13348, Critic loss: 10.99999
[Episode 5]  Actor loss: 10.39707, Critic loss: 6.96987
[Episode 6]  Actor loss: 9.64706, Critic loss: 5.22790
[Episode 7]  Actor loss: 2.86627, Critic loss: 4.48298
[Episode 8]  Actor loss: -3.23313, Critic loss: 3.98757
[Episode 9]  Actor loss: 2.97530, Critic loss: 2.91045
[Episode 10]  Actor loss: -0.29919, Critic loss: 3.42077
Test average reward is -5.0, Current best average reward is -5.0

[Episode 11]  Actor loss: -3.25367, Critic loss: 3.21691
[Episode 12]  Actor loss: -0.21270, Critic loss: 2.88098
[Episode 13]  Actor loss: -2.39696, Critic loss: 2.14817
[Episode 14]  Actor loss: -7.43690, Critic loss: 1.75484
[Episode 15]  Actor loss: -5.80470, Critic loss: 2.36451
[Episode 16]  Actor loss: -4.33840, Critic loss: 2.18721
[Episode 17]  Actor loss: -3.5

INFO:tensorflow:Assets written to: ./save/Actor/model_actor_80_-4.0/assets


[Episode 81]  Actor loss: -1.90184, Critic loss: 0.75246
[Episode 82]  Actor loss: -4.45743, Critic loss: 0.79861
[Episode 83]  Actor loss: -3.91775, Critic loss: 0.35031
[Episode 84]  Actor loss: -4.08054, Critic loss: 1.29376
[Episode 85]  Actor loss: -5.07557, Critic loss: 0.60348
[Episode 86]  Actor loss: -5.30708, Critic loss: 0.85273
[Episode 87]  Actor loss: -1.61056, Critic loss: 0.72088
[Episode 88]  Actor loss: -4.68715, Critic loss: 0.45143
[Episode 89]  Actor loss: -3.65534, Critic loss: 0.53827
[Episode 90]  Actor loss: -4.60834, Critic loss: 0.45566
Test average reward is -5.0, Current best average reward is -4.0

[Episode 91]  Actor loss: -5.79653, Critic loss: 0.38631
[Episode 92]  Actor loss: -4.19550, Critic loss: 0.44622
[Episode 93]  Actor loss: -5.82714, Critic loss: 0.37591
[Episode 94]  Actor loss: -3.21290, Critic loss: 0.54959
[Episode 95]  Actor loss: -5.04592, Critic loss: 0.49039
[Episode 96]  Actor loss: -5.31943, Critic loss: 0.38310
[Episode 97]  Actor lo

INFO:tensorflow:Assets written to: ./save/Actor/model_actor_1000_-5.0/assets


Moviepy - Building video movie_f/Lab15_demo-1000.webm.
Moviepy - Writing video movie_f/Lab15_demo-1000.webm



                                                               

Moviepy - Done !
Moviepy - video ready movie_f/Lab15_demo-1000.webm
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4



                                                               

Moviepy - Done !
Moviepy - video ready __temp__.mp4




[Episode 1001]  Actor loss: -3.10149, Critic loss: 2.43227
[Episode 1002]  Actor loss: -3.43317, Critic loss: 0.96499
[Episode 1003]  Actor loss: -2.80017, Critic loss: 1.36634
[Episode 1004]  Actor loss: -7.80133, Critic loss: 1.37338
[Episode 1005]  Actor loss: 3.14537, Critic loss: 1.41205
[Episode 1006]  Actor loss: -5.91135, Critic loss: 2.28975
[Episode 1007]  Actor loss: -8.54807, Critic loss: 1.05255
[Episode 1008]  Actor loss: 4.08659, Critic loss: 1.53071
[Episode 1009]  Actor loss: -1.20922, Critic loss: 2.10645
[Episode 1010]  Actor loss: -1.69241, Critic loss: 0.87430
Test average reward is -5.0, Current best average reward is -4.0

[Episode 1011]  Actor loss: -3.97175, Critic loss: 1.89866
[Episode 1012]  Actor loss: -15.01127, Critic loss: 1.40603
[Episode 1013]  Actor loss: -2.20975, Critic loss: 1.28202
[Episode 1014]  Actor loss: -8.15239, Critic loss: 2.01755
[Episode 1015]  Actor loss: -7.60563, Critic loss: 1.18779
[Episode 1016]  Actor loss: -13.87050, Critic loss

INFO:tensorflow:Assets written to: ./save/Actor/model_actor_1420_0.0/assets


[Episode 1421]  Actor loss: -11.50538, Critic loss: 1.90567
[Episode 1422]  Actor loss: -11.32860, Critic loss: 3.37282
[Episode 1423]  Actor loss: -3.12509, Critic loss: 2.84699
[Episode 1424]  Actor loss: -9.71438, Critic loss: 2.47768
[Episode 1425]  Actor loss: -2.38902, Critic loss: 2.29618
[Episode 1426]  Actor loss: 6.87491, Critic loss: 4.43540
[Episode 1427]  Actor loss: 5.79751, Critic loss: 1.57903
[Episode 1428]  Actor loss: -6.04728, Critic loss: 1.70800
[Episode 1429]  Actor loss: -15.18020, Critic loss: 2.86276
[Episode 1430]  Actor loss: -10.48341, Critic loss: 1.84192
Test average reward is -5.0, Current best average reward is 0.0

[Episode 1431]  Actor loss: -4.08473, Critic loss: 1.34788
[Episode 1432]  Actor loss: -5.22561, Critic loss: 1.66149
[Episode 1433]  Actor loss: -2.57361, Critic loss: 1.25113
[Episode 1434]  Actor loss: -4.80865, Critic loss: 1.94058
[Episode 1435]  Actor loss: 0.68635, Critic loss: 1.44122
[Episode 1436]  Actor loss: -7.59023, Critic loss

INFO:tensorflow:Assets written to: ./save/Actor/model_actor_1830_9.0/assets


[Episode 1831]  Actor loss: 4.78524, Critic loss: 6.18305
[Episode 1832]  Actor loss: -15.92692, Critic loss: 2.85555
[Episode 1833]  Actor loss: -4.49189, Critic loss: 2.90900
[Episode 1834]  Actor loss: -9.98656, Critic loss: 2.14589
[Episode 1835]  Actor loss: -16.31160, Critic loss: 3.32817
[Episode 1836]  Actor loss: -36.33047, Critic loss: 5.87552
[Episode 1837]  Actor loss: -3.54860, Critic loss: 3.12959
[Episode 1838]  Actor loss: -16.52726, Critic loss: 4.41093
[Episode 1839]  Actor loss: -4.72720, Critic loss: 3.22068
[Episode 1840]  Actor loss: -16.04883, Critic loss: 3.97990
Test average reward is -5.0, Current best average reward is 9.0

[Episode 1841]  Actor loss: 2.57725, Critic loss: 6.08379
[Episode 1842]  Actor loss: 14.56127, Critic loss: 5.46941
[Episode 1843]  Actor loss: 16.08176, Critic loss: 3.23301
[Episode 1844]  Actor loss: 1.73449, Critic loss: 3.02099
[Episode 1845]  Actor loss: -2.92683, Critic loss: 3.32141
[Episode 1846]  Actor loss: -4.66050, Critic los

INFO:tensorflow:Assets written to: ./save/Actor/model_actor_2000_-3.0/assets


Moviepy - Building video movie_f/Lab15_demo-2000.webm.
Moviepy - Writing video movie_f/Lab15_demo-2000.webm



                                                               

Moviepy - Done !
Moviepy - video ready movie_f/Lab15_demo-2000.webm
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4



                                                               

Moviepy - Done !
Moviepy - video ready __temp__.mp4


[Episode 2001]  Actor loss: -17.88203, Critic loss: 5.83291
[Episode 2002]  Actor loss: -8.58813, Critic loss: 5.27269
[Episode 2003]  Actor loss: -8.64155, Critic loss: 6.73571
[Episode 2004]  Actor loss: -13.42996, Critic loss: 5.70772
[Episode 2005]  Actor loss: -1.58892, Critic loss: 8.06225
[Episode 2006]  Actor loss: -5.19632, Critic loss: 4.72737
[Episode 2007]  Actor loss: -4.77418, Critic loss: 5.30364
[Episode 2008]  Actor loss: 14.51168, Critic loss: 4.65980
[Episode 2009]  Actor loss: 11.42922, Critic loss: 2.54002
[Episode 2010]  Actor loss: -7.58777, Critic loss: 3.49441
Test average reward is -5.0, Current best average reward is 9.0

[Episode 2011]  Actor loss: -22.83396, Critic loss: 5.77822
[Episode 2012]  Actor loss: -4.20153, Critic loss: 4.68222
[Episode 2013]  Actor loss: -16.43517, Critic loss: 7.23150
[Episode 2014]  Actor loss: -24.01263, Critic loss: 3.65473
[Episode 2015]  Actor loss: -12.62490, Critic loss: 4.83064
[Episode 2016]  Actor loss: -4.93087, Critic

INFO:tensorflow:Assets written to: ./save/Actor/model_actor_2520_13.0/assets


## 2. Result

In [16]:
# Render the frames kept from the last training rollout (`display_frames`)
# as the final demo video and show it inline.
# NOTE(review): the episode number 2520 is hard-coded to match the logged run;
# update it when the notebook is re-run.
clip = make_anim(display_frames, fps=60, true_image=True).rotate(-90)
clip.write_videofile("movie_f/{}_demo-{}.webm".format('Lab15', 2520), fps=60)
display(clip.ipython_display(fps=60, autoplay=1, loop=1, maxduration=120))

Moviepy - Building video movie_f/Lab15_demo-2520.webm.
Moviepy - Writing video movie_f/Lab15_demo-2520.webm



                                                               

Moviepy - Done !
Moviepy - video ready movie_f/Lab15_demo-2520.webm
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4



                                                               

Moviepy - Done !
Moviepy - video ready __temp__.mp4


## 3. Report

在這份 PPO X GAE 的 notebook 中，model 主要有以下兩個部分。

- `ActorCriticNetwork` 是使用 deep neural network 來取代之前 state 的部分，它的 input 是由 `frames_to_state` function 將一串 frame 取最後 4 張、沿著 channel 維度 concatenate 起來而成的一張 image (frame 不足 4 張時會重複既有的 frame 補滿 4 張)。這個 image 首先會經過作為 feature extractor 的 CNN，再將 feature 分別交給 actor network 和 critic network，產生 action 的 softmax output 和 state value，這兩個 network 都只由一層 Dense 所組成。

- `Agent` 則是實作了 PPO 的演算法，包括 PPO 的 update (`ppo_update`) 和從已收集的 rollout 資料中隨機抽取 mini-batch (`ppo_iter`)，其中使用了剛剛的 `ActorCriticNetwork`，並用 `Adam` 做為 optimizer，在 update 時會去對 PPO ratio 做 clip。

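再來 GAE 的部分是實作在 `compute_gae` function 中，它會由最後一個 step 往前累積 GAE advantage，並回傳每個 step 的 return 估計值 (advantage 加上該 step 的 value)；training 時再將這個 return 減去 value 得到 advantage。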

最後是 training 的過程，在每個 epoch 會先跑過固定的 step，並對 PPO 做 update。另外在每十個 epoch 就會去 test 的環境做測試，計算出平均 reward，並記錄下最好的平均 reward，當這個 reward 大於我們設定的 10 時，就會提前結束 training 過程。

在我的測試中，actor loss 和 critic loss 大致上都會維持在同一個區間震盪，每過一段時間後又會到不同區間繼續震盪，除了一開始之外，沒有很明顯的下降趨勢。而測試的平均 reward 是在第 1830 epoch 時才從 0 衝到 9，最後在第 2520 epoch 超過 10，到達 13，這時的 flappy bird 可以通過最多 5 根管子。