In [1]:
from keras.models import Sequential
from keras.layers import Activation
from keras.layers.core import Dense
from keras import backend as K
import numpy as np
import tensorflow as tf
import random
import gym

Using TensorFlow backend.


# TD(lambda) with Q lookup table
Useful reference: https://keon.io/deep-q-learning/ (it uses experience replay rather than eligibility traces, but its agent is nicely structured in an object-oriented style).

We will use TD(lambda) here instead.

In [3]:
# from gym.envs.registration import register
# register(
#     id='FrozenLakeNotSlippery-v0',
#     entry_point='gym.envs.toy_text:FrozenLakeEnv',
#     kwargs={'map_name' : '4x4', 'is_slippery': False},
#     max_episode_steps=100,
#     reward_threshold=0.78, # optimum = .8196
# )

# env = gym.make('FrozenLakeNotSlippery-v0')

[2017-08-17 20:04:18,729] Making new env: FrozenLakeNotSlippery-v0


In [4]:
# Setting up relevant variables/parameters
# The cell that used to create `env` is commented out, so on a fresh kernel this
# cell raised NameError. Create the environment here so it runs top-to-bottom.
# 'FrozenLake-v0' is the environment the training cell uses.
env = gym.make('FrozenLake-v0')
n_actions = env.action_space.n
episodes = 1000

In [5]:
def convert_to_one_hot(state_number, n_states):
    """Encode a discrete state index as a one-hot row vector.

    Args:
        state_number: index of the active state.
        n_states: total number of states (length of the encoding).

    Returns:
        A float numpy array of shape (1, n_states) that is zero everywhere
        except for a 1 at column `state_number`.
    """
    one_hot = np.zeros((1, n_states))
    one_hot[0, state_number] = 1
    return one_hot

In [6]:
# If an earlier run of this notebook left a `session` object behind, close it
# before building a new graph. Does nothing when no such name exists.
if locals().get('session', None) is not None:
    print('Close interactive session')
    session.close()

In [32]:
class DQNAgent:
    """Q-learning agent with a linear Q-function and eligibility traces (TD(lambda)).

    The Q-function is a single weight matrix of shape (n_states, n_actions)
    multiplied by a (1, n_states) state vector. With one-hot states this is
    equivalent to a Q lookup table.

    Args:
        n_actions: number of discrete actions.
        n_states: length of the state vector fed to the model.
        discount: discount factor, used in the TD target and in trace decay.
        alpha: learning rate of the gradient-descent optimizer.
        epsilon: initial exploration probability for e-greedy action selection.
        epsilon_decay: factor epsilon is multiplied by after every `learn` call.
        lamb: eligibility-trace decay parameter (lambda).
    """

    def __init__(self, n_actions, n_states, discount=0.9, alpha=0.1, epsilon=1, epsilon_decay=0.995, lamb=0.5):
        self.n_actions = n_actions
        self.n_states = n_states
        self.discount = discount
        self.alpha = alpha
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.lamb = lamb

        tf.reset_default_graph()
        self.state_tensor, self.Q_values_tensor, self.chosen_value_tensor, self.opt, self.weight1 = self._build_model()
        self.gradients = self._get_gradients(self.opt)
        self.e_trace = self._get_eligibility_trace(self.gradients)
        # Build the weight-update op once. Creating placeholders/ops inside
        # `learn` would add new nodes to the graph on every step.
        self.grad_placeholders, self.apply_update_op = self._build_update_op(self.opt, self.gradients)
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

    def _build_model(self):
        """Create the graph for the linear Q-function.

        Also stores `self.action_tensor`, a scalar int32 placeholder for the
        action whose Q-value is differentiated.

        Returns:
            (state_tensor, Q_values_tensor, chosen_value_tensor, opt, weight1)
        """
        state_tensor = tf.placeholder(tf.float32, shape=(1, self.n_states))
        self.action_tensor = tf.placeholder(tf.int32, shape=())
        weight1 = tf.Variable(tf.zeros(shape=(self.n_states, self.n_actions)))
        # No bias term: the previous bias variable was never connected to the
        # output, so it only contributed a missing (None) gradient.
        Q_values_tensor = tf.matmul(state_tensor, weight1)
        # Q(state, action) for the action that was actually taken, shape (1, 1).
        # Differentiating the greedy (argmax) value instead would update the
        # wrong weight whenever an exploratory action is taken.
        action_column = tf.reshape(
            tf.one_hot(self.action_tensor, self.n_actions, dtype=tf.float32),
            (self.n_actions, 1))
        chosen_value_tensor = tf.matmul(Q_values_tensor, action_column)
        opt = tf.train.GradientDescentOptimizer(self.alpha)

        return state_tensor, Q_values_tensor, chosen_value_tensor, opt, weight1

    def _get_gradients(self, opt):
        """Return (gradient, variable) pairs of the chosen Q-value w.r.t. the trainable variables."""
        trainable_variables = tf.trainable_variables()
        gradients = opt.compute_gradients(self.chosen_value_tensor, trainable_variables)
        # Drop variables the chosen value does not depend on (gradient is None);
        # they cannot be fetched or updated.
        return [(grad, var) for grad, var in gradients if grad is not None]

    def _get_eligibility_trace(self, gradients):
        """Return a list of zero arrays, one per variable, shaped like that variable."""
        e_trace = []
        for gradient in gradients:
            e = np.zeros(gradient[1].get_shape().as_list())
            e_trace.append(e)
        return e_trace

    def _build_update_op(self, opt, gradients):
        """Create placeholders for externally computed updates and the op that applies them.

        Returns:
            (placeholders, apply_op): one float32 placeholder per variable in
            `gradients`, and an `apply_gradients` op that consumes them.
        """
        placeholders = [tf.placeholder(tf.float32, shape=var.get_shape()) for _, var in gradients]
        pairs = [(placeholder, var) for placeholder, (_, var) in zip(placeholders, gradients)]
        return placeholders, opt.apply_gradients(pairs)

    def predict_Q_values(self, state):
        """Return the model's Q-values for `state` (one row, one column per action)."""
        Q_values = self.sess.run(self.Q_values_tensor, feed_dict={self.state_tensor: state})
        return Q_values

    def get_max_Q_value(self, state):
        """Return the largest Q-value over all actions in `state`."""
        Q_values = self.predict_Q_values(state)
        return np.max(Q_values)

    def get_Q_value(self, state, action):
        """Return Q(state, action)."""
        Q_values = self.predict_Q_values(state)
        return Q_values[0][action]

    def get_best_action(self, state):
        """Return the index of the action with the largest Q-value in `state`."""
        Q_values = self.predict_Q_values(state)
        return np.argmax(Q_values)

    def get_e_greedy_action(self, state):
        """Return a uniformly random action with probability epsilon, else the greedy action."""
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.n_actions)
        else:
            return self.get_best_action(state)

    def compute_e_trace(self, evaluated_gradients, e_trace):
        """Decay each trace by discount*lambda and add the current gradient.

        Updates the entries of `e_trace` in place and returns the same list.
        """
        for i in range(len(e_trace)):
            e_trace[i] = self.discount*self.lamb*e_trace[i] + evaluated_gradients[i]
            assert(e_trace[i].shape == evaluated_gradients[i].shape)
        return e_trace

    def reset(self):
        """Zero the eligibility traces (call between episodes)."""
        self.e_trace = self._get_eligibility_trace(self.gradients)

    def close(self):
        """Close the TensorFlow session held by this agent."""
        self.sess.close()

    def print_weights(self):
        """Print the current value of the weight matrix."""
        w1 = self.sess.run(self.weight1)
        print (w1)

    def print_Q_values(self, state):
        """Print the Q-values predicted for `state`."""
        print (self.predict_Q_values(state))

    def learn(self, state, action, next_state, reward):
        """Perform one TD(lambda) update from a single transition.

        Args:
            state: state vector in which `action` was taken.
            action: index of the action taken.
            next_state: state vector observed after the action.
            reward: reward received for the transition.
        """
        # Q-learning target. The bootstrapped value must be discounted.
        target = reward + self.discount * self.get_max_Q_value(next_state)
        old_Q = self.get_Q_value(state, action)

        delta = target - old_Q

        # Gradient of Q(state, action) w.r.t. each variable.
        grads_and_vars = self.sess.run(self.gradients,
                                       feed_dict={self.state_tensor: state,
                                                  self.action_tensor: action})
        evaluated_gradients = [gv[0] for gv in grads_and_vars]

        self.e_trace = self.compute_e_trace(evaluated_gradients, self.e_trace)

        # The optimizer subtracts alpha * (fed value) from the weights, so feed
        # -delta * trace to obtain the update  w += alpha * delta * trace.
        change = [-delta * e for e in self.e_trace]
        assert (len(change) == len(self.grad_placeholders))

        feed_dict = dict(zip(self.grad_placeholders, change))
        self.sess.run(self.apply_update_op, feed_dict=feed_dict)

        # Decay epsilon
        self.epsilon = self.epsilon*self.epsilon_decay

### Letting the agent play and learn

In [38]:
env = gym.make('FrozenLake-v0')
n_actions = env.action_space.n

# Spaces that expose `.n` give the state count directly; otherwise fall back to
# the length of an observation. Catch only the missing-attribute case so
# unrelated errors are not silently swallowed.
try:
    n_states = env.observation_space.n
except AttributeError:
    obs = env.reset()
    n_states = len(obs)

# Defined here so the loop bound and the progress message always agree and the
# cell does not depend on a value set in another cell.
episodes = 1000
max_steps = 500

agent = DQNAgent(n_actions=n_actions, n_states=n_states)

# Iterate the game
for e in range(episodes):
    state = env.reset()
#     env.render()
    # convert_to_one_hot already returns a (1, n_states) array, so no reshape is needed.
    state = convert_to_one_hot(state, n_states)

    total_reward = 0
    for time_t in range(max_steps):
        action = agent.get_e_greedy_action(state)

        next_state, reward, done, _ = env.step(action)
#         env.render()
        next_state = convert_to_one_hot(next_state, n_states)

        # Tweaking the reward to help the agent learn faster:
        # small penalty per step, large penalty for ending without reward 1.
        if reward == 0:
            reward = -0.01
        if done:
            if reward < 1:
                reward = -1
            else:
                print ('FOUND GOAL!')
        agent.learn(state, action, next_state, reward)

        state = next_state
        total_reward += reward

        if done:
            print("episode: {}/{}, score: {}".format(e, episodes, total_reward))
            break

    # Clear the eligibility traces so they do not carry over into the next episode.
    agent.reset()
# env.close()

[2017-08-17 20:43:51,750] Making new env: FrozenLake-v0


episode: 0/1000, score: -1.01
episode: 1/1000, score: -1.15
episode: 2/1000, score: -1.03
episode: 3/1000, score: -1.05
episode: 4/1000, score: -1.07
episode: 5/1000, score: -1.08
episode: 6/1000, score: -1.22
episode: 7/1000, score: -1.1
episode: 8/1000, score: -1.01
episode: 9/1000, score: -1.05
episode: 10/1000, score: -1.01
episode: 11/1000, score: -1.09
episode: 12/1000, score: -1.05
episode: 13/1000, score: -1.07
episode: 14/1000, score: -1.01
episode: 15/1000, score: -1.1099999999999999
episode: 16/1000, score: -1.02
episode: 17/1000, score: -1.08
episode: 18/1000, score: -1.09
episode: 19/1000, score: -1.1199999999999999
episode: 20/1000, score: -1.02
episode: 21/1000, score: -1.06
episode: 22/1000, score: -1.09
episode: 23/1000, score: -1.19
episode: 24/1000, score: -1.06
episode: 25/1000, score: -1.19
episode: 26/1000, score: -1.03
episode: 27/1000, score: -1.07
episode: 28/1000, score: -1.14
episode: 29/1000, score: -1.02
episode: 30/1000, score: -1.32
episode: 31/1000, scor

KeyboardInterrupt: 