In [1]:
from collections import deque
import math
import random

import keras
import gym
from matplotlib import pyplot as plt

env_name = 'CartPole-v1'
env = gym.make(env_name).env
REPLAY_MEMORY_SIZE = 1e5
OBSERVATION_SHAPE = env.observation_space.shape
NUM_ACTIONS = env.action_space.n
DISCOUNT_RATE = 0.99
EPSILON_MAX = 1.0
EPSILON_MIN = .01
ANNEALING_CONST = .001  # aka Lambda
MINIBATCH_SIZE = 64


class Memory(deque):
    def __init__(self, size):
        self.size = size

    def append(self, thing):
        if len(self) > self.size - 1:
            self.popleft()
        return super().append(thing)

    def sample(self, batch_size):
        batch_size = min(len(self), batch_size)
        return random.sample(self, batch_size)

def make_network():
    q_network = keras.Sequential()
    q_network.add(keras.layers.Dense(64, activation='relu', input_shape=OBSERVATION_SHAPE))
    q_network.add(keras.layers.Dense(NUM_ACTIONS, activation='linear'))    
    q_network.compile(optimizer=keras.optimizers.Adam(), loss='mse')
    # q_network.compile(optimizer=keras.optimizers.RMSprop(), loss='mse')
    return q_network

q_network = make_network()
replay_memory = Memory(REPLAY_MEMORY_SIZE)

  from ._conv import register_converters as _register_converters
Using Theano backend.


[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [18]:
episode_rewards = [] 
q_hist_target_net2 = []
q_hist_primary_net2 = []
steps = 0

lunar = 'LunarLander-v2'
cartpole = 'CartPole-v1'
env = gym.make(cartpole).env
REPLAY_MEMORY_SIZE = 1e5
OBSERVATION_SHAPE = env.observation_space.shape
NUM_ACTIONS = env.action_space.n
SAVE_EVERY = 2000  # steps
RENDER_EVERY = 100


def double_dqn():
    
    replay_memory_with_ddqn = Memory(REPLAY_MEMORY_SIZE)
    RENDER = True
    NUM_EPISODES = 500
    REPORT_EVERY= 10
    UPDATE_TARGET_EVERY = 100
    MAX_EPISODE_LEN = 2000
    USE_DDQN = True
    online_net = make_network()
    target_net = make_network()
    online_net.load_weights(env_name + 'online-net.ht')
    target_net.load_weights(env_name + 'target-net.ht')
    replay_memory = Memory(REPLAY_MEMORY_SIZE)


    def replay():
        # Bec of the way we've set up the online_net, it gives us Vals for all actions available in one forward 
        # pass.    
        batch = replay_memory_with_ddqn.sample(MINIBATCH_SIZE)  # (batchsize x observation_shape)

        states = np.array([exp[0] for exp in batch])
        y = target_net.predict(states)  # (batch x num-actions)

        terminal_state = np.zeros(OBSERVATION_SHAPE)
        states_ = np.array([exp[3] if exp[3] is not None else terminal_state for exp in batch])
        online_predicted_actions = online_net.predict(states).argmax(axis=1)
        y_ = target_net.predict(states_)

        for idx, exp in enumerate(batch):
            state, action, reward, state_, terminal = exp        
            if state_ is None:
                y[idx, action] = reward
            else:                
                if USE_DDQN:
                    best_action = online_predicted_actions[idx]
                    y[idx, action] = reward + DISCOUNT_RATE * y_[idx, best_action]
                else:
                    y[idx, action] = reward + DISCOUNT_RATE * np.amax(y_[idx])                    
                # t[a] = r + GAMMA * pTarget_[i][ numpy.argmax(p_[i]) ]
        online_net.fit(states, y, batch_size=64, epochs=1, verbose=0)  # REMEBER, Q is a func from (state, action) pairs to values. 
            
    def replay_vectorized():
        batch = replay_memory_with_ddqn.sample(MINIBATCH_SIZE)
        # unpack all the replay memories into arrays:
        states = np.array([transition[0] for transition in batch])  # (batch x state-size)
        transition_actions = np.array([transition[1] for transition in batch])
        rewards = np.array([transition[2] for transition in batch])  # (batch x 1)
        terminal_mask = np.array([True if transition[3] is None else False for transition in batch])  # (batch x 1)
        terminal_state = np.zeros(OBSERVATION_SHAPE)
        states_ = np.array([transition[3] if transition[3] is not None else terminal_state for transition in batch])  # (batch x state-size)

        y = target_net.predict(states)  # (batch x num-actions)            
        online_predicted_actions = online_net.predict(states).argmax(axis=1)  # batch x num-action
        y_ = target_net.predict(states_)  # (batch x num-actions)
        
        # set y = r for terminal states:
        terminal_state_actions = transition_actions[terminal_mask]
        y[terminal_mask, terminal_state_actions] = rewards[terminal_mask]

        # DDQN update:
        # set y = r + gamma * Q_hat(s', argmax Q(s', a'))
        non_terminal_mask = ~terminal_mask
        best_actions = online_predicted_actions[non_terminal_mask]
        non_terminal_actions = transition_actions[non_terminal_mask]
        y[non_terminal_mask, non_terminal_actions] = rewards[non_terminal_mask] + DISCOUNT_RATE * y_[non_terminal_mask, best_actions] 
        online_net.fit(states, y, batch_size=MINIBATCH_SIZE, epochs=1, verbose=0)  # REMEBER, Q is a func from (state, action) pairs to values.         


    def Q_val_one(net, state):
        return net.predict(state.reshape((1, OBSERVATION_SHAPE[0]))).flatten()


    def main_loop():
        """ 
        Target network + DDQN.
        """
        global steps, reward_hist_target_net, q_hist_target_net2
        
        for episode_count in range(int(NUM_EPISODES)):
            episode_done = False
            episode_reward = 0
            state = env.reset()
            q_hist_target_net2.append(Q_val_one(target_net, state).max())
            q_hist_primary_net2.append(Q_val_one(online_net, state).max())
            episode_trace = []

            if episode_count % REPORT_EVERY == 0:
                print('Starting episode %s ' % episode_count, end='', flush=False)
                print('Steps: %d' % steps, end='', flush=False)
            if steps % SAVE_EVERY == 0:                
                online_net.save(env_name + 'online-net.ht')
                target_net.save(env_name + 'target-net.ht')
            episode_len = 0
            while not episode_done:
                episode_len += 1
                EPSILON = EPSILON_MIN + (EPSILON_MAX - EPSILON_MIN) * math.exp(-ANNEALING_CONST * steps)
                steps += 1

                if random.random() < EPSILON:
                    action = random.randint(0, NUM_ACTIONS-1)
                else:
                    action = Q_val_one(online_net, state).argmax()

                state_, reward, episode_done, _ = env.step(action)
                episode_reward += reward

                if RENDER and episode_count % RENDER_EVERY == 0:
                    env.render()
                if episode_done:
                    state_ = None

                replay_memory_with_ddqn.append((state, action, reward, state_, episode_done))
                replay_vectorized()
                state = state_

                if steps % UPDATE_TARGET_EVERY == 0:
                    target_net.set_weights(online_net.get_weights())
                if episode_len > MAX_EPISODE_LEN:
                    episode_done = True

                if episode_done:
                    print(episode_reward, end='', flush=False)            
                    episode_rewards.append(episode_reward)
    main_loop()


double_dqn()

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
Starting episode 0 Steps: 020.014.013.013.011.012.017.023.037.011.0Starting episode 10 Steps: 17123.018.050.017.013.015.024.010.08.013.0Starting episode 20 Steps: 36225.045.011.011.011.08.028.021.012.016.0Starting episode 30 Steps: 5509.09.017.09.012.011.022.013.017.015.0Starting episode 40 Steps: 68413.012.013.012.011.09.09.030.019.015.0Starting episode 50 Steps: 82710.011.09.011.013.011.012.014.011.016.0Starting episode 60 Steps: 94515.013.011.019.012.021.011.011.010.010.0Starting episode 70 Steps: 107818.019.014.019.014.011.013.011.013.010.0Starting episode 80 Steps: 122012.011.024.015.013.011.015.010.012.011.0Starting episode 90 Steps: 13549.012.012.013.016.020.025.017.029.010.0Starting episode 100 Steps: 151723.016.018.010.013.024.051.020.011.021.0Starting episode 110 Steps: 172413.017.024.021.016.044.024.018.082.018.0Starting episode 120 Steps: 200134.029.022.0136.028.075.0

KeyboardInterrupt: 

In [19]:
env.close()