## Code

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import moviepy.editor as mpy
import skimage.transform
from IPython.display import Image, display

import tensorflow as tf
import tensorflow_probability as tfp
import tensorflow.keras.losses as kls

2021-12-29 01:44:08.593510: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1


In [None]:
# Pick which physical GPU TensorFlow is allowed to see and enable on-demand
# memory allocation so the process does not reserve all GPU memory up front.
gpus = tf.config.list_physical_devices("GPU")
if gpus:
    try:
        # Restrict TensorFlow to the second GPU (index 1) when it exists;
        # on a single-GPU machine fall back to the only device instead of
        # failing with an IndexError that the handler below would not catch.
        gpu_index = 1 if len(gpus) > 1 else 0
        tf.config.set_visible_devices(gpus[gpu_index], 'GPU')

        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)

        logical_gpus = tf.config.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

2 Physical GPUs, 1 Logical GPUs


2021-12-29 01:44:09.435416: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-12-29 01:44:09.436006: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2021-12-29 01:44:09.460150: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-12-29 01:44:09.460346: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: NVIDIA GeForce RTX 2080 Ti computeCapability: 7.5
coreClock: 1.635GHz coreCount: 68 deviceMemorySize: 10.76GiB deviceMemoryBandwidth: 573.69GiB/s
2021-12-29 01:44:09.460395: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA 

In [None]:
import os
# Use SDL's "dummy" video driver so no pop-out game window appears.
# NOTE(review): this is set before the ple imports below, presumably because
# the driver is read when the game library initialises - confirm before reordering.
os.environ["SDL_VIDEODRIVER"] = "dummy"  # this line make pop-out window not appear
from ple.games.flappybird import FlappyBird
from ple import PLE

# Training environment: used by the rollout loop to collect experience.
game = FlappyBird()
env = PLE(game, fps=30, display_screen=False)  # environment interface to game
env.reset_game()

# Separate evaluation environment so test episodes (see test_reward) do not
# disturb the state of the training environment.
test_game = FlappyBird()
test_env = PLE(test_game, fps=30, display_screen=False)
test_env.reset_game()

couldn't import doomish
Couldn't import doom




In [None]:
# Hyperparameters shared by the network, the agent and the training loop.
hparas = {
    'image_size': 84,  # side length each frame is resized to in preprocess_screen
    'num_stack': 4,  # NOTE(review): not referenced in the visible code; frames_to_state hard-codes 4
    'action_dim': len(env.getActionSet()),  # number of actions; size of the actor output layer
    'hidden_size': 256,  # units of the Dense embedding layer after the conv stack
    'lr': 0.0001,  # learning rate of the Adam optimizer
    'gamma': 0.99,  # passed to compute_gae as `gamma`
    'lambda': 0.95,  # passed to compute_gae as `LAMBDA`
    'clip_val': 0.2,  # PPO ratio is clipped to [1 - clip_val, 1 + clip_val]
    'ppo_epochs': 8,  # passes over each rollout inside ppo_update
    'test_epochs': 1,  # number of test_reward runs averaged per evaluation
    'num_steps': 512,  # environment steps collected per training iteration
    'mini_batch_size': 64,  # samples drawn per gradient step in ppo_iter
    'target_reward': 200,  # NOTE(review): not referenced in the visible code
    'max_episode': 30000,  # upper bound on training iterations
}

In [None]:
# Please do not modify this method
def make_anim(images, fps=60, true_image=False):
    """Wrap a sequence of frames in a moviepy ``VideoClip``.

    Args:
        images: indexable sequence of frames (each must support ``astype``).
        fps: playback rate; the clip duration is ``len(images) / fps`` seconds.
        true_image: if True, frames are cast to uint8 unchanged; otherwise
            each frame is mapped through ``(x + 1) / 2 * 255`` before the cast
            (assumes values in [-1, 1] - TODO confirm with callers).

    Returns:
        A ``mpy.VideoClip`` with its ``fps`` attribute set to ``fps``.
    """
    duration = len(images) / fps

    def make_frame(t):
        # Convert time t into a frame index; len(images) / duration equals fps.
        try:
            x = images[int(len(images) / duration * t)]
        except:
            # Any failure in the lookup above falls back to the last frame.
            x = images[-1]

        if true_image:
            return x.astype(np.uint8)
        else:
            return ((x + 1) / 2 * 255).astype(np.uint8)

    clip = mpy.VideoClip(make_frame, duration=duration)
    clip.fps = fps
    
    return clip

In [None]:
def preprocess_screen(screen):
    """Turn a raw game screen into a float32 network input frame.

    The screen is rotated by -90 degrees (canvas resized to fit), cut down to
    its first 400 rows, and rescaled to
    ``(hparas['image_size'], hparas['image_size'], 1)``.
    """
    upright = skimage.transform.rotate(screen, -90, resize=True)
    # Keep only the top 400 rows (presumably drops the ground strip - verify).
    cropped = upright[:400, :]
    target_shape = [hparas['image_size'], hparas['image_size'], 1]
    resized = skimage.transform.resize(cropped, target_shape)
    return resized.astype(np.float32)

def frames_to_state(input_frames):
    """Stack frames along the last axis into a 4-frame state.

    With fewer than four frames available, existing frames are repeated:
    one frame -> [f0, f0, f0, f0], two -> [f0, f0, f1, f1],
    three -> [f0, f1, f2, f2]. With four or more, the last four are used.
    """
    count = len(input_frames)

    if count == 1:
        stacked = input_frames * 4
    elif count == 2:
        stacked = input_frames[0:1] * 2 + input_frames[1:] * 2
    elif count == 3:
        stacked = input_frames + input_frames[2:]
    else:
        stacked = input_frames[-4:]

    return np.concatenate(stacked, axis=-1)

In [None]:
class ActorCriticNetwork(tf.keras.Model):
    """Convolutional feature extractor shared by a policy head and a value head."""

    def __init__(self, hparas):
        """Build the layers; reads 'hidden_size' and 'action_dim' from ``hparas``."""
        super().__init__()

        layers = tf.keras.layers

        # Convolutional layers: (filters, kernel_size, strides), each followed by ReLU.
        conv_specs = [(32, 8, 4), (64, 4, 2), (64, 3, 1)]
        trunk = []
        for filters, kernel_size, strides in conv_specs:
            trunk.append(layers.Conv2D(filters=filters, kernel_size=kernel_size, strides=strides))
            trunk.append(layers.ReLU())

        # Embedding layers
        trunk.append(layers.Flatten())
        trunk.append(layers.Dense(hparas['hidden_size']))
        trunk.append(layers.ReLU())

        self.feature_extractor = tf.keras.Sequential(trunk)

        # Actor head: softmax over actions, so it outputs probabilities (not logits).
        self.actor = layers.Dense(hparas['action_dim'], activation='softmax')
        # Critic head: one unbounded scalar per sample.
        self.critic = layers.Dense(1, activation=None)

    def call(self, input):
        """Return ``(action_probabilities, value)`` for a batch of states."""
        features = self.feature_extractor(input)
        action_probs = self.actor(features)
        state_value = self.critic(features)
        return action_probs, state_value

In [None]:
class Agent():
    """PPO agent: owns the actor-critic network and a single Adam optimizer."""

    def __init__(self, hparas):
        """Read 'gamma', 'lr' and 'clip_val' from ``hparas`` and build the network."""
        self.gamma = hparas['gamma']
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=hparas['lr'])
        self.actor_critic = ActorCriticNetwork(hparas)
        self.clip_pram = hparas['clip_val']

    def ppo_iter(self, mini_batch_size, states, actions, log_probs, returns, advantage):
        """Yield random mini-batches drawn (with replacement) from the rollout.

        Produces ``states.shape[0] // mini_batch_size`` tuples of
        ``(states, actions, log_probs, returns, advantage)``, each gathered
        at the same ``mini_batch_size`` random indices.
        """
        rollout_len = states.shape[0]
        fields = (states, actions, log_probs, returns, advantage)
        for _ in range(rollout_len // mini_batch_size):
            picks = np.random.randint(0, rollout_len, mini_batch_size)
            index = tf.convert_to_tensor(picks, dtype=tf.int32)
            yield tuple(tf.gather(field, index) for field in fields)

    def ppo_update(self, ppo_epochs, mini_batch_size, states, actions, log_probs, discount_rewards, advantages):
        """Run ``ppo_epochs`` passes of clipped-surrogate PPO updates.

        Returns:
            ``(total_actor_loss, total_critic_loss)``: the losses summed
            (not averaged) over every mini-batch step that was taken.
        """
        actor_loss_sum = 0
        critic_loss_sum = 0
        clip_low = 1.0 - self.clip_pram
        clip_high = 1.0 + self.clip_pram

        for _ in range(ppo_epochs):
            batches = self.ppo_iter(mini_batch_size, states, actions, log_probs, discount_rewards, advantages)
            for mb_states, mb_actions, mb_old_log_probs, mb_returns, mb_advantage in batches:
                # Give the targets a trailing axis so they line up with the critic output.
                value_target = tf.expand_dims(mb_returns, axis=-1)

                with tf.GradientTape() as tape:
                    probs, value_pred = self.actor_critic(mb_states, training=True)
                    policy = tfp.distributions.Categorical(probs=probs, dtype=tf.float32)
                    mean_entropy = tf.math.reduce_mean(policy.entropy())
                    mb_new_log_probs = policy.log_prob(mb_actions)

                    # PPO ratio between the current policy and the one that collected the data
                    ratio = tf.math.exp(mb_new_log_probs - mb_old_log_probs)
                    unclipped = ratio * mb_advantage
                    clipped = tf.clip_by_value(ratio, clip_low, clip_high) * mb_advantage
                    surrogate = tf.math.reduce_mean(tf.math.minimum(unclipped, clipped))

                    # Entropy term (weight 0.1) is subtracted from the actor loss.
                    actor_loss = tf.math.negative(surrogate) - 0.1 * mean_entropy
                    critic_loss = 0.5 * tf.math.reduce_mean(kls.mean_squared_error(value_target, value_pred))

                    total_loss = actor_loss + critic_loss

                # single optimizer for both heads and the shared trunk
                variables = self.actor_critic.trainable_variables
                grads = tape.gradient(total_loss, variables)
                self.optimizer.apply_gradients(zip(grads, variables))

                actor_loss_sum += actor_loss
                critic_loss_sum += critic_loss

        return actor_loss_sum, critic_loss_sum

In [None]:
# https://arxiv.org/pdf/1506.02438.pdf
# Equation 16
def compute_gae(rewards, masks, values, gamma, LAMBDA):
    """Compute GAE-based return targets for one rollout.

    Args:
        rewards: per-step rewards, length T.
        masks: per-step continuation flags (0 where the episode ended), length T.
        values: value estimates, length T + 1 (last entry is the bootstrap value).
        gamma: discount factor.
        LAMBDA: GAE smoothing factor.

    Returns:
        List of length T whose i-th entry is ``advantage_i + values[i]``.
    """
    horizon = len(rewards)
    returns = [None] * horizon
    running_gae = 0

    # Walk backwards so each step can reuse the advantage of the step after it.
    for t in range(horizon - 1, -1, -1):
        bootstrap = gamma * values[t + 1] * masks[t]
        td_error = rewards[t] + bootstrap - values[t]
        running_gae = td_error + gamma * LAMBDA * masks[t] * running_gae
        returns[t] = running_gae + values[t]

    return returns

In [None]:
def test_reward(test_env, agent):
    """Play one greedy episode in ``test_env`` and return its total reward.

    The agent always takes the action with the highest probability (argmax),
    so no sampling is involved in choosing actions.

    Args:
        test_env: PLE environment used for evaluation; it is reset first.
        agent: object exposing ``actor_critic``, called on a batched state.

    Returns:
        Sum of the rewards returned by ``test_env.act`` until game over.
    """
    total_reward = 0
    # Reset the environment
    test_env.reset_game()
    input_frames = [preprocess_screen(test_env.getScreenGrayscale())]

    while not test_env.game_over():

        state = frames_to_state(input_frames)
        state = tf.expand_dims(state, axis=0)
        prob, _ = agent.actor_critic(state)

        action = np.argmax(prob[0].numpy())
        reward = test_env.act(test_env.getActionSet()[action])
        total_reward += reward

        input_frames.append(preprocess_screen(test_env.getScreenGrayscale()))
        # frames_to_state only ever reads the last four frames, so drop older
        # ones instead of letting the history grow for the whole episode.
        del input_frames[:-4]

    return total_reward

In [None]:
# Create the agent and the schedule constants used by the training loop.
agent = Agent(hparas)
max_episode = hparas['max_episode']
test_per_n_episode = 10  # evaluate with test_reward every 10 training iterations
force_save_per_n_episode = 1000  # save model, checkpoint and demo video every 1000 iterations
early_stop_reward = 10  # training stops once best_reward reaches this value

start_s = 0  # NOTE(review): not referenced in the visible code
best_reward = -5.0  # initial best score; a test must score above this to trigger a "best" save

# Checkpoint tracks both the network weights and the optimizer state.
checkpoint = tf.train.Checkpoint(
    actor_critic = agent.actor_critic,
    optimizer = agent.optimizer,
)

# Load from old checkpoint
# checkpoint.restore('ckpt_dir/ckpt-?')

In [None]:
# Main PPO training loop. Each iteration ("episode" in the log messages)
# collects hparas['num_steps'] environment steps, computes GAE returns, and
# runs a PPO update on that rollout.
ep_reward = []
total_avgr = []
early_stop = False
avg_rewards_list = []

env.reset_game()

for s in range(0, max_episode):
    if early_stop:
        break

    # Per-rollout buffers.
    rewards = []
    states = []
    actions = []
    log_probs = []
    masks = []
    values = []

    display_frames = [env.getScreenRGB()]
    input_frames = [preprocess_screen(env.getScreenGrayscale())]

    for step in range(hparas['num_steps']):

        state = frames_to_state(input_frames)
        state = tf.expand_dims(state, axis=0)
        prob, value = agent.actor_critic(state)

        # Sample an action from the current policy and remember its log-probability.
        dist = tfp.distributions.Categorical(probs=prob[0], dtype=tf.float32)
        action = dist.sample(1)
        log_prob = dist.log_prob(action)

        # `action` holds exactly one element; index it explicitly because
        # int() on a non-scalar ndarray is deprecated in recent NumPy.
        reward = env.act(env.getActionSet()[int(action.numpy()[0])])

        done = env.game_over()

        states.append(state)
        actions.append(action)
        values.append(value[0])
        log_probs.append(log_prob)
        rewards.append(tf.convert_to_tensor(reward, dtype=tf.float32))
        masks.append(tf.convert_to_tensor(1-int(done), dtype=tf.float32))

        display_frames.append(env.getScreenRGB())
        input_frames.append(preprocess_screen(env.getScreenGrayscale()))

        if done:
            # Start a fresh game and a fresh frame history within the same rollout.
            env.reset_game()
            input_frames = [preprocess_screen(env.getScreenGrayscale())]

    # Bootstrap from the observation that FOLLOWS the last action. Reusing
    # `state` here would evaluate the pre-action observation a second time and
    # bias the tail of the GAE estimate. If the last step ended the game its
    # mask is 0, so this value is ignored by compute_gae.
    next_state = tf.expand_dims(frames_to_state(input_frames), axis=0)
    _, next_value = agent.actor_critic(next_state)
    values.append(next_value[0])

    returns = compute_gae(rewards, masks, values, hparas['gamma'], hparas['lambda'])

    returns = tf.concat(returns, axis=0)
    log_probs = tf.concat(log_probs, axis=0)
    values = tf.concat(values, axis=0)
    states = tf.concat(states, axis=0)
    actions = tf.concat(actions, axis=0)
    # values has one extra (bootstrap) entry; drop it to align with returns.
    advantage = returns - values[:-1]

    a_loss, c_loss = agent.ppo_update(hparas['ppo_epochs'], hparas['mini_batch_size'], states, actions, log_probs, returns, advantage)
    print('[Episode %d]  Actor loss: %.5f, Critic loss: %.5f' % (s, a_loss, c_loss))

    if s % test_per_n_episode == 0:
        # test agent hparas['test_epochs'] times to get the average reward
        avg_reward = np.mean([test_reward(test_env, agent) for _ in range(hparas['test_epochs'])])
        print("Test average reward is %.1f, Current best average reward is %.1f\n" % (avg_reward, best_reward))
        avg_rewards_list.append(avg_reward)

        if avg_reward > best_reward:
            best_reward = avg_reward
            agent.actor_critic.save('./joshua/save/Actor/model_actor_{}_{}'.format(s, avg_reward), save_format="tf")
            checkpoint.save(file_prefix = './joshua/save/checkpoints/ckpt')

    # Forced periodic save; avg_reward is defined here because every multiple of
    # force_save_per_n_episode is also a multiple of test_per_n_episode.
    if s % force_save_per_n_episode == 0:
        agent.actor_critic.save('./joshua/save/Actor/model_actor_{}_{}'.format(s, avg_reward), save_format="tf")
        checkpoint.save(file_prefix = './joshua/save/checkpoints/ckpt')
        clip = make_anim(display_frames, fps=60, true_image=True).rotate(-90)
        clip.write_videofile("./joshua/movie_f/{}_demo-{}.webm".format('Lab17', s), fps=60)
        display(clip.ipython_display(fps=60, autoplay=1, loop=1, maxduration=120))

    if best_reward >= early_stop_reward:
        early_stop = True

2021-12-29 01:44:10.487002: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudnn.so.7
2021-12-29 01:44:11.260131: W tensorflow/stream_executor/gpu/asm_compiler.cc:63] Running ptxas --version returned 256
2021-12-29 01:44:11.293814: W tensorflow/stream_executor/gpu/redzone_allocator.cc:314] Internal: ptxas exited with non-zero error code 256, output: 
Relying on driver to perform ptx compilation. 
Modify $PATH to customize ptxas location.
This message will be only logged once.
2021-12-29 01:44:11.466949: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.10


[Episode 0]  Actor loss: 86.12639, Critic loss: 59.13999
Test average reward is -5.0, Current best average reward is -5.0



2021-12-29 01:44:22.336549: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: ./joshua/save/Actor/model_actor_0_-5.0/assets
Moviepy - Building video ./joshua/movie_f/Lab17_demo-0.webm.
Moviepy - Writing video ./joshua/movie_f/Lab17_demo-0.webm



                                                                                                                                                                                                                   

Moviepy - Done !
Moviepy - video ready ./joshua/movie_f/Lab17_demo-0.webm
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4



                                                                                                                                                                                                                   

Moviepy - Done !
Moviepy - video ready __temp__.mp4




[Episode 1]  Actor loss: 50.44118, Critic loss: 33.70451
[Episode 2]  Actor loss: 35.23173, Critic loss: 27.02555
[Episode 3]  Actor loss: 13.24104, Critic loss: 16.55406
[Episode 4]  Actor loss: 15.69803, Critic loss: 10.65388
[Episode 5]  Actor loss: 13.49687, Critic loss: 7.26399
[Episode 6]  Actor loss: 2.36203, Critic loss: 4.12656
[Episode 7]  Actor loss: 3.20218, Critic loss: 5.23790
[Episode 8]  Actor loss: 2.45117, Critic loss: 3.63215
[Episode 9]  Actor loss: 4.74494, Critic loss: 5.17919
[Episode 10]  Actor loss: -6.20561, Critic loss: 2.86974
Test average reward is -5.0, Current best average reward is -5.0

[Episode 11]  Actor loss: -2.15167, Critic loss: 3.02624
[Episode 12]  Actor loss: 2.39208, Critic loss: 2.66658
[Episode 13]  Actor loss: 3.40369, Critic loss: 2.78646
[Episode 14]  Actor loss: -1.27278, Critic loss: 2.27624
[Episode 15]  Actor loss: 1.26664, Critic loss: 2.61567
[Episode 16]  Actor loss: -1.21499, Critic loss: 3.05492
[Episode 17]  Actor loss: -10.4871

                                                                                                                                                                                                                   

Moviepy - Done !
Moviepy - video ready ./joshua/movie_f/Lab17_demo-1000.webm
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4



                                                                                                                                                                                                                   

Moviepy - Done !
Moviepy - video ready __temp__.mp4




[Episode 1001]  Actor loss: -7.48774, Critic loss: 1.08319
[Episode 1002]  Actor loss: -5.17924, Critic loss: 0.68768
[Episode 1003]  Actor loss: -6.08404, Critic loss: 1.06139
[Episode 1004]  Actor loss: -11.68556, Critic loss: 2.44728
[Episode 1005]  Actor loss: -3.19795, Critic loss: 1.23747
[Episode 1006]  Actor loss: -2.67115, Critic loss: 1.34947
[Episode 1007]  Actor loss: -6.60402, Critic loss: 0.79906
[Episode 1008]  Actor loss: -2.36966, Critic loss: 0.83904
[Episode 1009]  Actor loss: -3.69485, Critic loss: 0.62737
[Episode 1010]  Actor loss: -3.84475, Critic loss: 0.80898
Test average reward is -5.0, Current best average reward is -4.0

[Episode 1011]  Actor loss: -6.55192, Critic loss: 0.53732
[Episode 1012]  Actor loss: -6.64771, Critic loss: 0.79799
[Episode 1013]  Actor loss: -10.82406, Critic loss: 1.12898
[Episode 1014]  Actor loss: -8.79058, Critic loss: 1.61338
[Episode 1015]  Actor loss: -2.45898, Critic loss: 1.18086
[Episode 1016]  Actor loss: 1.02617, Critic los

                                                                                                                                                                                                                   

Moviepy - Done !
Moviepy - video ready ./joshua/movie_f/Lab17_demo-2000.webm
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4



                                                                                                                                                                                                                   

Moviepy - Done !
Moviepy - video ready __temp__.mp4




[Episode 2001]  Actor loss: -3.20429, Critic loss: 2.67749
[Episode 2002]  Actor loss: -10.36315, Critic loss: 3.55499
[Episode 2003]  Actor loss: -2.59683, Critic loss: 1.34200
[Episode 2004]  Actor loss: -13.74445, Critic loss: 3.74643
[Episode 2005]  Actor loss: -29.97130, Critic loss: 2.83775
[Episode 2006]  Actor loss: -22.08290, Critic loss: 1.89779
[Episode 2007]  Actor loss: -2.61021, Critic loss: 1.60459
[Episode 2008]  Actor loss: 0.21685, Critic loss: 2.13834
[Episode 2009]  Actor loss: -5.47336, Critic loss: 2.42205
[Episode 2010]  Actor loss: -13.37401, Critic loss: 2.47909
Test average reward is -4.0, Current best average reward is 0.0

[Episode 2011]  Actor loss: -2.38294, Critic loss: 1.14453
[Episode 2012]  Actor loss: -11.58186, Critic loss: 1.69857
[Episode 2013]  Actor loss: -8.18802, Critic loss: 1.60851
[Episode 2014]  Actor loss: 4.84961, Critic loss: 4.63530
[Episode 2015]  Actor loss: -11.08073, Critic loss: 2.65122
[Episode 2016]  Actor loss: -2.71037, Critic 

                                                                                                                                                                                                                   

Moviepy - Done !
Moviepy - video ready ./joshua/movie_f/Lab17_demo-3000.webm
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4



                                                                                                                                                                                                                   

Moviepy - Done !
Moviepy - video ready __temp__.mp4




[Episode 3001]  Actor loss: 11.43378, Critic loss: 8.93090
[Episode 3002]  Actor loss: 5.89410, Critic loss: 4.92078
[Episode 3003]  Actor loss: 0.02673, Critic loss: 8.01989
[Episode 3004]  Actor loss: 3.98381, Critic loss: 4.40331
[Episode 3005]  Actor loss: -10.64126, Critic loss: 5.79097
[Episode 3006]  Actor loss: -14.07008, Critic loss: 6.81745
[Episode 3007]  Actor loss: -9.15165, Critic loss: 10.03388
[Episode 3008]  Actor loss: -12.06127, Critic loss: 7.58984
[Episode 3009]  Actor loss: -22.96837, Critic loss: 3.98932
[Episode 3010]  Actor loss: 4.46391, Critic loss: 5.77890
Test average reward is -5.0, Current best average reward is 8.0

[Episode 3011]  Actor loss: -22.94950, Critic loss: 10.88746
[Episode 3012]  Actor loss: -5.48090, Critic loss: 6.37535
[Episode 3013]  Actor loss: -31.09271, Critic loss: 8.27125
[Episode 3014]  Actor loss: -14.64703, Critic loss: 6.54280
[Episode 3015]  Actor loss: -14.48070, Critic loss: 5.02845
[Episode 3016]  Actor loss: -22.06204, Criti

## Report

Based on my observation, function approximation is used to predict the value and the policy for states the agent has not seen before, allowing us to estimate the value or action for any given state. We train an agent that uses raw frames instead of hand-crafted features as input.

 