In [29]:
import tensorflow as tf
import numpy as np

In [30]:
import os
os.environ["SDL_VIDEODRIVER"] = "dummy"  # this line make pop-out window not appear
from ple.games.flappybird import FlappyBird
from ple import PLE

game = FlappyBird()
env = PLE(game, fps=30, display_screen=False)  # environment interface to game
env.reset_game()

In [31]:
import os
print(os.getcwd())
os.chdir('D:\python')
print(os.getcwd())

D:\python
D:\python


In [32]:
# Define Input Size
IMG_WIDTH = 84
IMG_HEIGHT = 84
NUM_STACK = 4
# For Epsilon-greedy
MIN_EXPLORING_RATE = 0.01

## Change the network

In [33]:
class Agent:
    def __init__(self, name, num_action, discount_factor=0.99):
        self.exploring_rate = 0.1
        self.discount_factor = discount_factor
        self.num_action = num_action
        self.model = self.build_model(name)

    def build_model(self, name):
        # input: state
        # output: each action's Q-value 
        screen_stack = tf.keras.Input(shape=(32,), dtype=tf.float32)
        x = tf.keras.layers.Dense(units=64)(screen_stack)
        x = tf.keras.layers.ReLU()(x)
        Q = tf.keras.layers.Dense(self.num_action)(x)

        model = tf.keras.Model(name=name, inputs=screen_stack, outputs=Q)

        return model
    
    def loss(self, state, action, reward, tar_Q, ternimal):
        # Q(s,a,theta) for all a, shape (batch_size, num_action)
        output = self.model(state)
        index = tf.stack([tf.range(tf.shape(action)[0]), action], axis=1)
        # Q(s,a,theta) for selected a, shape (batch_size, 1)
        Q = tf.gather_nd(output, index)
        
        # set tar_Q as 0 if reaching terminal state
        tar_Q *= ~np.array(terminal)

        # loss = E[r+max(Q(s',a',theta'))-Q(s,a,theta)]
        loss = tf.reduce_mean(tf.square(reward + self.discount_factor * tar_Q - Q))

        return loss
    
    def max_Q(self, state):
        # Q(s,a,theta) for all a, shape (batch_size, num_action)
        output = self.model(state)

        # max(Q(s',a',theta')), shape (batch_size, 1)
        return tf.reduce_max(output, axis=1)
    
    def select_action(self, state):
        # epsilon-greedy
        if np.random.rand() < self.exploring_rate:
            action = np.random.choice(self.num_action)  # Select a random action
        else:                                     
            state = np.expand_dims(state, axis = 0)
            # Q(s,a,theta) for all a, shape (batch_size, num_action)
            output = self.model(state)
            
            # select action with highest action-value
            action = tf.argmax(output, axis=1)[0]

        return action
    def update_parameters(self, episode):
        self.exploring_rate = max(MIN_EXPLORING_RATE, min(0.5, 0.99**((episode) / 30)))

    def shutdown_explore(self):
        # make action selection greedy
        self.exploring_rate = 0

In [34]:
# init agent
num_action = len(env.getActionSet())

# agent for frequently updating
online_agent = Agent('online', num_action)

# agent for slow updating
target_agent = Agent('target', num_action)
# synchronize target model's weight with online model's weight
target_agent.model.set_weights(online_agent.model.get_weights())

In [35]:
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
average_loss = tf.keras.metrics.Mean(name='loss')

@tf.function
def train_step(state, action, reward, next_state, ternimal):
    # Delayed Target Network
    tar_Q = target_agent.max_Q(next_state)
    with tf.GradientTape() as tape:
        loss = online_agent.loss(state, action, reward, tar_Q, ternimal)
    gradients = tape.gradient(loss, online_agent.model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, online_agent.model.trainable_variables))
    
    average_loss.update_state(loss)

In [36]:
class Replay_buffer():
    def __init__(self, buffer_size=50000):
        self.experiences = []
        self.buffer_size = buffer_size

    def add(self, experience):
        if len(self.experiences) >= self.buffer_size:
            self.experiences.pop(0)
        self.experiences.append(experience)

    def sample(self, size):
        """
        sample experience from buffer
        """
        if size > len(self.experiences):
            experiences_idx = np.random.choice(len(self.experiences), size=size)
        else:
            experiences_idx = np.random.choice(len(self.experiences), size=size, replace=False)

        # from all sampled experiences, extract a tuple of (s,a,r,s')
        states = []
        actions = []
        rewards = []
        states_prime = []
        terminal = []
        for i in range(size):
            states.append(self.experiences[experiences_idx[i]][0])
            actions.append(self.experiences[experiences_idx[i]][1])
            rewards.append(self.experiences[experiences_idx[i]][2])
            states_prime.append(self.experiences[experiences_idx[i]][3])
            terminal.append(self.experiences[experiences_idx[i]][4])

        return states, actions, rewards, states_prime, terminal

In [37]:
# init buffer
buffer = Replay_buffer()

In [38]:
import moviepy.editor as mpy

def make_anim(images, fps=60, true_image=False):
    duration = len(images) / fps

    def make_frame(t):
        try:
            x = images[int(len(images) / duration * t)]
        except:
            x = images[-1]

        if true_image:
            return x.astype(np.uint8)
        else:
            return ((x + 1) / 2 * 255).astype(np.uint8)

    clip = mpy.VideoClip(make_frame, duration=duration)
    clip.fps = fps
    return clip

In [39]:
def frames_to_state(input_frames):
    if(len(input_frames)//8 == 1):
        state = (input_frames*4)
    elif(len(input_frames)//8 == 2):
        state = (input_frames[0:8]*2 + input_frames[8:16]*2)
    elif(len(input_frames)//8 == 3):
        state = (input_frames + input_frames[16:24])
    else:
        state = (input_frames[-32:])
    return np.array(state)

## Train

In [42]:
from IPython.display import Image, display

update_every_iteration = 1000
print_every_episode = 500
save_video_every_episode = 5000
NUM_EPISODE = 20000
NUM_EXPLORE = 200
BATCH_SIZE = 32

iter_num = 0
for episode in range(0, NUM_EPISODE + 1):
    
    # Reset the environment
    env.reset_game()
    # record frame
    if episode % save_video_every_episode == 0:
        frames = [env.getScreenRGB()]
    
    # input frame
    input_frames = list(game.getGameState().values())
    
    # for every 500 episodes, shutdown exploration to see the performance of greedy action
    if episode % print_every_episode == 0:
        online_agent.shutdown_explore()
    
    # cumulate reward for this episode
    cum_reward = 0
    
    t = 0
    while not env.game_over():
        
        state = frames_to_state(input_frames)
        
        # feed current state and select an action
        action = online_agent.select_action(state)       
        

        # execute the action and get reward
        reward = env.act(env.getActionSet()[action])
        
        # record frame
        if episode % save_video_every_episode == 0:
            frames.append(env.getScreenRGB())
        
        # record input frame
        input_frames += list(game.getGameState().values())

        # cumulate reward
        cum_reward += reward
        
        # observe the result
        state_prime = frames_to_state(input_frames)  # get next state
        
        # append experience for this episode
        if episode % print_every_episode != 0:
            buffer.add((state, action, reward, state_prime, env.game_over()))
        
        # Setting up for the next iteration
        state = state_prime
        t += 1
        
        # update agent
        if episode > NUM_EXPLORE and episode % print_every_episode != 0:
            iter_num += 1
            train_states, train_actions, train_rewards, train_states_prime, terminal = buffer.sample(BATCH_SIZE)
            train_states = np.asarray(train_states)
            train_states_prime = np.asarray(train_states_prime)


            # convert Python object to Tensor to prevent graph re-tracing
            train_states = tf.convert_to_tensor(train_states, tf.float32)
            train_actions = tf.convert_to_tensor(train_actions, tf.int32)
            train_rewards = tf.convert_to_tensor(train_rewards, tf.float32)
            train_states_prime = tf.convert_to_tensor(train_states_prime, tf.float32)
            terminal = tf.convert_to_tensor(terminal, tf.bool)

            train_step(train_states, train_actions, train_rewards, train_states_prime, terminal)

        # synchronize target model's weight with online model's weight every 1000 iterations
        if iter_num % update_every_iteration == 0 and episode > NUM_EXPLORE and episode % print_every_episode != 0:
            target_agent.model.set_weights(online_agent.model.get_weights())

    # update exploring rate
    online_agent.update_parameters(episode)
    target_agent.update_parameters(episode)

    if episode % print_every_episode == 0 and episode > NUM_EXPLORE:
        print(
            "[{}] time live:{}, cumulated reward: {}, exploring rate: {}, average loss: {}".
            format(episode, t, cum_reward, online_agent.exploring_rate, average_loss.result()))
        average_loss.reset_states()

    if episode % save_video_every_episode == 0:  # for every 500 episode, record an animation
        clip = make_anim(frames, fps=60, true_image=True).rotate(-90)
        clip.write_videofile("movie_f/DQN_demo-{}.webm".format(episode), fps=60)
#         display(clip.ipython_display(fps=60, autoplay=1, loop=1, maxduration=120))

t:   3%|█▉                                                           | 2/63 [9:20:00<284:40:27, 16800.45s/it, now=None]
t:   0%|                                                                              | 0/69 [00:00<?, ?it/s, now=None][A
t:  12%|████████                                                              | 8/69 [00:00<00:00, 77.29it/s, now=None][A

Moviepy - Building video movie_f/DQN_demo-0.webm.
Moviepy - Writing video movie_f/DQN_demo-0.webm




t:  38%|██████████████████████████                                           | 26/69 [00:00<00:00, 92.12it/s, now=None][A
t:  57%|███████████████████████████████████████                              | 39/69 [00:00<00:00, 99.79it/s, now=None][A
t:  75%|███████████████████████████████████████████████████▏                | 52/69 [00:00<00:00, 105.97it/s, now=None][A
t:  97%|██████████████████████████████████████████████████████████████████  | 67/69 [00:00<00:00, 115.95it/s, now=None][A
t:   3%|█▉                                                           | 2/63 [9:20:01<284:40:44, 16800.74s/it, now=None][A

Moviepy - Done !
Moviepy - video ready movie_f/DQN_demo-0.webm
[500] time live:62, cumulated reward: -5.0, exploring rate: 0.5, average loss: 0.836884081363678
[1000] time live:62, cumulated reward: -5.0, exploring rate: 0.5, average loss: 1.2427473068237305
[1500] time live:72, cumulated reward: -4.0, exploring rate: 0.5, average loss: 1.1712510585784912
[2000] time live:62, cumulated reward: -5.0, exploring rate: 0.5, average loss: 1.0754032135009766
[2500] time live:62, cumulated reward: -5.0, exploring rate: 0.43277903725889943, average loss: 0.9798916578292847
[3000] time live:62, cumulated reward: -5.0, exploring rate: 0.3660323412732292, average loss: 1.0054103136062622
[3500] time live:62, cumulated reward: -5.0, exploring rate: 0.30957986252419073, average loss: 0.9445566534996033
[4000] time live:62, cumulated reward: -5.0, exploring rate: 0.26183394327157605, average loss: 0.9172793030738831
[4500] time live:62, cumulated reward: -5.0, exploring rate: 0.22145178723886091, av

t:   3%|█▉                                                          | 2/63 [10:05:53<307:59:56, 18177.00s/it, now=None]
t:   0%|                                                                              | 0/63 [00:00<?, ?it/s, now=None][A
t:  14%|██████████                                                            | 9/63 [00:00<00:00, 88.64it/s, now=None][A

[5000] time live:62, cumulated reward: -5.0, exploring rate: 0.18729769509073985, average loss: 0.8313552737236023
Moviepy - Building video movie_f/DQN_demo-5000.webm.
Moviepy - Writing video movie_f/DQN_demo-5000.webm




t:  41%|████████████████████████████                                        | 26/63 [00:00<00:00, 102.39it/s, now=None][A
t:  68%|██████████████████████████████████████████████▍                     | 43/63 [00:00<00:00, 114.86it/s, now=None][A
t:  97%|█████████████████████████████████████████████████████████████████▊  | 61/63 [00:00<00:00, 126.16it/s, now=None][A
t:   3%|█▉                                                          | 2/63 [10:05:54<308:00:10, 18177.23s/it, now=None][A

Moviepy - Done !
Moviepy - video ready movie_f/DQN_demo-5000.webm
[5500] time live:62, cumulated reward: -5.0, exploring rate: 0.15841112426184903, average loss: 0.8133313655853271
[6000] time live:62, cumulated reward: -5.0, exploring rate: 0.13397967485796172, average loss: 0.8427032232284546
[6500] time live:62, cumulated reward: -5.0, exploring rate: 0.11331624189077398, average loss: 0.8734960556030273
[7000] time live:62, cumulated reward: -5.0, exploring rate: 0.09583969128049684, average loss: 0.8245804309844971
[7500] time live:98, cumulated reward: -4.0, exploring rate: 0.08105851616218128, average loss: 0.8071820139884949
[8000] time live:68, cumulated reward: -4.0, exploring rate: 0.0685570138491429, average loss: 0.791480302810669
[8500] time live:98, cumulated reward: -4.0, exploring rate: 0.05798359469728905, average loss: 0.6963657736778259
[9000] time live:98, cumulated reward: -4.0, exploring rate: 0.04904089407128572, average loss: 0.6660878658294678
[9500] time live

t:   3%|█▉                                                          | 2/63 [11:23:33<347:28:23, 20506.62s/it, now=None]
t:   0%|                                                                             | 0/249 [00:00<?, ?it/s, now=None][A
t:   4%|██▋                                                                 | 10/249 [00:00<00:02, 97.13it/s, now=None][A

[10000] time live:247, cumulated reward: 0.0, exploring rate: 0.03508042658630376, average loss: 0.5969874858856201
Moviepy - Building video movie_f/DQN_demo-10000.webm.
Moviepy - Writing video movie_f/DQN_demo-10000.webm




t:  12%|███████▊                                                           | 29/249 [00:00<00:01, 113.63it/s, now=None][A
t:  20%|█████████████▋                                                     | 51/249 [00:00<00:01, 132.68it/s, now=None][A
t:  30%|███████████████████▉                                               | 74/249 [00:00<00:01, 150.50it/s, now=None][A
t:  37%|████████████████████████▍                                          | 91/249 [00:00<00:01, 155.46it/s, now=None][A
t:  44%|█████████████████████████████▏                                    | 110/249 [00:00<00:00, 164.02it/s, now=None][A
t:  53%|██████████████████████████████████▋                               | 131/249 [00:00<00:00, 171.70it/s, now=None][A
t:  63%|█████████████████████████████████████████▌                        | 157/249 [00:00<00:00, 190.78it/s, now=None][A
t:  72%|███████████████████████████████████████████████▍                  | 179/249 [00:00<00:00, 196.86it/s, now=None][A
t:  80%|███████

Moviepy - Done !
Moviepy - video ready movie_f/DQN_demo-10000.webm
[10500] time live:144, cumulated reward: -2.0, exploring rate: 0.029670038450977102, average loss: 0.46773022413253784
[11000] time live:21, cumulated reward: -5.0, exploring rate: 0.02509408428990297, average loss: 0.38049036264419556
[11500] time live:360, cumulated reward: 3.0, exploring rate: 0.021223870922486707, average loss: 0.3178632855415344
[12000] time live:62, cumulated reward: -5.0, exploring rate: 0.017950553275045137, average loss: 0.2949834167957306
[12500] time live:401, cumulated reward: 4.0, exploring rate: 0.015182073244652034, average loss: 0.28174707293510437
[13000] time live:147, cumulated reward: -2.0, exploring rate: 0.012840570676248398, average loss: 0.27888453006744385
[13500] time live:62, cumulated reward: -5.0, exploring rate: 0.010860193639877882, average loss: 0.30037030577659607
[14000] time live:62, cumulated reward: -5.0, exploring rate: 0.01, average loss: 0.2837352454662323
[14500]

t:   3%|█▉                                                          | 2/63 [13:31:04<412:17:36, 24332.08s/it, now=None]
t:   0%|                                                                             | 0/215 [00:00<?, ?it/s, now=None][A
t:   5%|███▏                                                                | 10/215 [00:00<00:02, 97.14it/s, now=None][A

[15000] time live:213, cumulated reward: -1.0, exploring rate: 0.01, average loss: 0.30391424894332886
Moviepy - Building video movie_f/DQN_demo-15000.webm.
Moviepy - Writing video movie_f/DQN_demo-15000.webm




t:  13%|████████▍                                                          | 27/215 [00:00<00:01, 109.97it/s, now=None][A
t:  18%|████████████▏                                                      | 39/215 [00:00<00:01, 111.56it/s, now=None][A
t:  26%|█████████████████▏                                                 | 55/215 [00:00<00:01, 121.88it/s, now=None][A
t:  32%|█████████████████████▌                                             | 69/215 [00:00<00:01, 125.46it/s, now=None][A
t:  42%|████████████████████████████                                       | 90/215 [00:00<00:00, 139.58it/s, now=None][A
t:  50%|████████████████████████████████▊                                 | 107/215 [00:00<00:00, 146.38it/s, now=None][A
t:  57%|█████████████████████████████████████▍                            | 122/215 [00:00<00:00, 137.73it/s, now=None][A
t:  63%|█████████████████████████████████████████▋                        | 136/215 [00:00<00:00, 135.61it/s, now=None][A
t:  74%|███████

Moviepy - Done !
Moviepy - video ready movie_f/DQN_demo-15000.webm
[15500] time live:211, cumulated reward: -1.0, exploring rate: 0.01, average loss: 0.27912119030952454
[16000] time live:62, cumulated reward: -5.0, exploring rate: 0.01, average loss: 0.28793662786483765
[16500] time live:437, cumulated reward: 5.0, exploring rate: 0.01, average loss: 0.2789861261844635
[17000] time live:175, cumulated reward: -2.0, exploring rate: 0.01, average loss: 0.2545432150363922
[17500] time live:175, cumulated reward: -2.0, exploring rate: 0.01, average loss: 0.27497902512550354
[18000] time live:288, cumulated reward: 1.0, exploring rate: 0.01, average loss: 0.2693193554878235
[18500] time live:62, cumulated reward: -5.0, exploring rate: 0.01, average loss: 0.268561452627182
[19000] time live:98, cumulated reward: -4.0, exploring rate: 0.01, average loss: 0.2587844729423523
[19500] time live:62, cumulated reward: -5.0, exploring rate: 0.01, average loss: 0.2610747516155243


t:   3%|█▉                                                          | 2/63 [15:50:25<483:07:44, 28512.54s/it, now=None]
t:   0%|                                                                              | 0/99 [00:00<?, ?it/s, now=None][A
t:  10%|██████▉                                                              | 10/99 [00:00<00:00, 96.19it/s, now=None][A

[20000] time live:98, cumulated reward: -4.0, exploring rate: 0.01, average loss: 0.2719392478466034
Moviepy - Building video movie_f/DQN_demo-20000.webm.
Moviepy - Writing video movie_f/DQN_demo-20000.webm




t:  30%|████████████████████▌                                               | 30/99 [00:00<00:00, 113.75it/s, now=None][A
t:  49%|█████████████████████████████████▋                                  | 49/99 [00:00<00:00, 126.00it/s, now=None][A
t:  68%|██████████████████████████████████████████████                      | 67/99 [00:00<00:00, 138.16it/s, now=None][A
t:  90%|█████████████████████████████████████████████████████████████▏      | 89/99 [00:00<00:00, 154.55it/s, now=None][A
t:   3%|█▉                                                          | 2/63 [15:50:25<483:08:02, 28512.83s/it, now=None][A

Moviepy - Done !
Moviepy - video ready movie_f/DQN_demo-20000.webm


In [43]:
from moviepy.editor import *
clip = VideoFileClip("movie_f/DQN_demo-10000.webm")
display(clip.ipython_display(fps=60, autoplay=1, loop=1, maxduration=120))

t:   3%|█▉                                                          | 2/63 [19:35:04<597:19:35, 35252.06s/it, now=None]
t:   0%|                                                                             | 0/250 [00:00<?, ?it/s, now=None][A
t:   8%|█████▋                                                             | 21/250 [00:00<00:01, 206.00it/s, now=None][A

Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4




t:  41%|██████████████████████████▉                                       | 102/250 [00:00<00:00, 264.86it/s, now=None][A
t:  71%|██████████████████████████████████████████████▉                   | 178/250 [00:00<00:00, 328.80it/s, now=None][A
t:   3%|█▉                                                          | 2/63 [19:35:04<597:19:50, 35252.31s/it, now=None][A

Moviepy - Done !
Moviepy - video ready __temp__.mp4
