In [1]:
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import load_model
import numpy as np

class DeulingDeepQNetwork(keras.Model):
    def __init__(self, n_actions, fc1_dims, fc2_dims):
        super(DeulingDeepQNetwork, self).__init__()
        self.dense1 = keras.layers.Dense(fc1_dims, activation= 'relu')
        self.dense2 = keras.layers.Dense(fc2_dims, activation= 'relu')
        self.V = keras.layers.Dense(1, activation = None)
        self.A = keras.layers.Dense(n_actions, activation = None)
        
    def call(self, state):
        x = self.dense1(state)
        x = self.dense2(x)
        V = self.V(x)
        A = self.A(x)
    
    #May be unnecessary, please experiement with call()   
    def  advantage(self, state):
        x = self.dense1(state)
        x = self.dense2(x)
        A = self.A(x)
        
        return A
    
class ReplayBuffer():
    def __init__(self, max_size, input_shape):
        self.mem_size = max_size
        self.mem_cntr = 0
        
        self.state_memory = np.zeros((self.mem_size, *input_shape), dtype=float32)
        self.new_state_memory = np.zeros((self.mem_size, *input_shape), dtype=np.float32)
        self.action_memory = np.zeros(self.mem_size, dtype=np.int32)
        self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
        self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool)
        
    def store_transition(self, state, action, reward, state_, done):
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state
        self.new_state_memory[index] = state_
        self.action_memory[index] = action
        self.reward_memory[index] = reward
        
        self.mem_cntr += 1
        
    def sample_buffer(self, batch_size):
        max_mem = min(self.mem_cntr, self.mem_size)
        batch = np.random.choice(max_mem, batch_size, replace=False)
        
        states = self.state_memory[batch]
        new_states = self.new_state_memory[batch]
        actions = self.action_memory[batch]
        rewards = self.reward_memory[batch]
        dones = self.terminal_memory[batch]
        
        return states, actions, rewards, new_states, dones

In [2]:
class Agent():
    def __init__(self, lr, gamma, n_actions, epsilon, batch_size,
                input_dims, epsilon_dec=1e-3, eps_end=0.01,
                mem_size=100000, fname='deuling_dqn.h5', fc1_dims=128,
                fc2_dims=128, replace=100):                           #helps changes values from online to target network
        self.action_space = [i for i in range(n_actions)]
        self.gamma = gamma
        self.epsilon = epsilon
        self.eps_dec = epsilon_dec
        self.eps_min = eps_end
        self.fname = fname
        self.replace = replace
        self.batch_size = batch_size
        
        self.learn_step_counter = 0
        self.memory = ReplayBuffer(mem_size, input_dims)
        self.q_eval = DeulingDeepQNetwork(n_actions, fc1_dims, fc2_dims)# Online Network
        self.q_next = DeulingDeepQNetwork(n_actions, fc1_dims, fc2_dims)# Target Network for the cost function
        
        self.q_eval.compile(optimizer=Adam(learning_rate = lr), loss = 'mean_squared_error')
        self.q_next.compile(optimizer=Adam(learning_rate = lr), loss = 'mean_squared_error')
        
    def store_transition(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)
        
    def choose_action(self, observation):
        if np.random.random() < self.epsilon:
            action = np.random.choice(self.action_space)
        else:
            state = self.array([observation])
            actions = self.q_eval.advantage(state)
            action = tf.math.argmax(actions, axis=1).numpy()[0]
        return action
    
    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return
        
        if self.learn_step_counter % self.replace == 0:
            selfq_next.set_weights(self.q_eval.get_weights())
            
        states, actions, rewards, states_, dones = self.memory.sample_buffer(self.batch_size)
        
        q_pred = self.q_eval(states)
        q_next = tf.math.reduce_max(self.q_next(states_), axis=1, keepdims=True).numpy()
        q_target = np.copy(q_pred)
        
        #improve on the solution
        for idx, terminal in enumerate(dones):
            if terminal:
                q_next[idx] = 0.0
            q_target[idx,actions[idx]] = rewards[idx] + self.gamma * q_next[idx]
            
        self.q_eval.train_on_batch(states, q_target)
        
        self.epsilon = self.epsilon - self.eps_dec if self.epsilon > self.eps_min else self.eps_min
        
        self.learn_step_counter += 1
        
        def save_model(self):
            self.q_eval.save(self.model_file)
            
        def load_model(self):
            self.q_eval = load_model(self.model_file)

In [3]:
from dueling_dqn_keras import Agent
import numpy as np
from utils import plot_learning_curve

if __name__ = '__main__':
    env = gym.make('LunarLander-v2')
    n_games = 400
    agent  = Agent(gamma=0.99, epsilon=1, lr=1e-3, input_dims=[8],epsilon_dec=1e-3,
                  mem_size=100000, batch_size=64, eps_end=0.01, fc1_dims=128, fc2_dims=128, replace=100,n_action=4 )
    
    scores, eps_history = [], []
    
    for i in range(n_games):
        done = False
        score = 0
        observation = env.reset()
        while not done:
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            score += reward
            agent.store_transition(observation, action, reward, observation_, done)
            observation = observation_
            agent.learn()
        eps_history.append(agent.epsilon)
        scores.append(score)
        
        avg_score = np.mean(scores[-100:])
        print('Episode', i, 'score %.1f' % score,
             'Average score %.1f' % avg_score,
             'epsilon %.2f' % agent.epsilon)
        
    filename = 'keras_lunar_lander.png'
    x = [i + 1 for i in range(n_games)]
    plot_learning_curve(x, scores, eps_history, filename)

SyntaxError: invalid syntax (<ipython-input-3-97b2cd87cd9f>, line 5)