In [1]:
import random
import time
import numpy as np
import gym
import tensorflow as tf
from collections import deque
from IPython.display import clear_output

In [2]:
from gym.envs.registration import registry, register

# Register a non-slippery 4x4 FrozenLake variant under its own id.
#
# Registration stays best-effort so that re-executing this cell in a live
# kernel (which attempts to register the same id again) does not abort the
# notebook.  Unlike the previous bare `except: pass`, the failure is now
# reported, and KeyboardInterrupt / SystemExit are no longer swallowed, so a
# genuine mistake (bad entry point, bad kwargs, ...) is visible to the reader.
try:
    register(
        id='FrozenLakeNoSlip-v0',
        entry_point='gym.envs.toy_text:FrozenLakeEnv',
        kwargs={'map_name': '4x4', 'is_slippery': False},
        max_episode_steps=100,
        reward_threshold=0.78,  # optimum = .8196
    )
except Exception as exc:
    # Most commonly: the id was already registered by an earlier run.
    print('Skipping registration of FrozenLakeNoSlip-v0: {!r}'.format(exc))

In [3]:
class GeneralRandomAgent():
    """Baseline agent that ignores the state and samples actions at random.

    Two kinds of action space are handled:

    * exactly ``gym.spaces.discrete.Discrete`` -- an integer action is drawn
      from ``range(action_space.n)``;
    * anything else -- the space is expected to expose ``low``, ``high`` and
      ``shape``, and an action is drawn uniformly between the bounds.
    """

    def __init__(self, env):
        """Inspect ``env.action_space`` and cache what sampling needs.

        Args:
            env: environment object exposing an ``action_space`` attribute.
        """
        space = env.action_space
        self.is_discrete = type(space) == gym.spaces.discrete.Discrete

        if not self.is_discrete:
            # Non-discrete space: remember the bounds and shape to sample from.
            self.action_low = space.low
            self.action_high = space.high
            self.action_shape = space.shape
            print('Action range: ', self.action_low, self.action_high)
            return

        self.action_size = space.n
        print('Action space size: ', self.action_size)

    def get_action(self, state):
        """Return a random action; ``state`` is accepted but not used.

        Returns:
            An int in ``range(self.action_size)`` for discrete spaces,
            otherwise a NumPy array of shape ``self.action_shape``.
        """
        if not self.is_discrete:
            return np.random.uniform(self.action_low, self.action_high, self.action_shape)
        return random.choice(range(self.action_size))

In [4]:
class QAgent(GeneralRandomAgent):
    """Tabular epsilon-greedy Q-learning agent.

    Q-values live in a ``[state_size, action_size]`` NumPy array.  With
    probability ``epsilon`` the agent explores with a random action from the
    base class; otherwise it takes the argmax action for the current state.
    ``epsilon`` is multiplied by ``epsilon_decay`` every time ``train`` is
    given a terminal transition.
    """

    def __init__(self, env, discount_rate = 0.97, learning_rate = 0.05, epsilon = 1.0,
                 epsilon_decay = 0.99):
        """Create the agent and its Q-table.

        Args:
            env: environment with a discrete ``action_space`` and an
                ``observation_space`` exposing ``n``.
            discount_rate: factor applied to the bootstrapped next-state value.
            learning_rate: step size of the tabular update.
            epsilon: initial exploration probability.
            epsilon_decay: multiplier applied to ``epsilon`` at the end of
                each episode (previously hard-coded to 0.99).
        """
        super().__init__(env)

        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.discount_rate = discount_rate
        self.learning_rate = learning_rate

        self.state_size = env.observation_space.n
        print('State size: ', self.state_size)
        self.build_model()

    def build_model(self):
        """Allocate the Q-table with tiny random values and print it."""
        # Values are drawn below 1e-4 rather than set to exactly zero --
        # presumably so argmax does not always resolve ties to action 0.
        self.q_table = 1e-4*np.random.random([self.state_size, self.action_size])
        print(self.q_table)

    def get_action(self, state):
        """Return an epsilon-greedy action for ``state``."""
        q_state = self.q_table[state]
        policy_action = np.argmax(q_state)
        # The random action is always drawn, even when it ends up unused, so
        # the sequence of random draws matches the original implementation.
        random_action = super().get_action(state)

        return policy_action if random.random() > self.epsilon else random_action

    def train(self, experience):
        """Apply one Q-learning update.

        Args:
            experience: tuple ``(state, action, next_state, reward, done)``.
        """
        state, action, next_state, reward, done = experience

        # A terminal transition has no future value to bootstrap from.
        max_q_next = 0.0 if done else np.max(self.q_table[next_state])
        q_target = reward + self.discount_rate*max_q_next

        td_error = q_target - self.q_table[state, action]
        self.q_table[state, action] += self.learning_rate*td_error

        if done:
            self.epsilon *= self.epsilon_decay  # exponential decay per episode

In [5]:
class QNAgent(GeneralRandomAgent):
    """Epsilon-greedy Q-learning agent whose Q-function is one dense TF1 layer.

    The integer state is one-hot encoded and fed through a single dense layer
    named ``'q_table'`` with one output per action.  Training is online: each
    call to ``train`` performs one Adam step on a single transition.

    NOTE(review): ``build_model`` resets TensorFlow's default graph.  The
    TensorFlow documentation describes calling ``reset_default_graph`` while
    a session is active as undefined behaviour, so building a second
    graph-based agent while this one is still alive deserves a second look.
    """

    def __init__(self, env, discount_rate = 0.97, learning_rate = 0.01, epsilon = 1.0,
                 epsilon_decay = 0.99):
        """Build the graph and open a session with initialised variables.

        Args:
            env: environment with a discrete ``action_space`` and an
                ``observation_space`` exposing ``n``.
            discount_rate: factor applied to the bootstrapped next-state value.
            learning_rate: learning rate handed to the Adam optimizer.
            epsilon: initial exploration probability.
            epsilon_decay: multiplier applied to ``epsilon`` at the end of
                each episode (previously hard-coded to 0.99).
        """
        super().__init__(env)

        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.discount_rate = discount_rate
        self.learning_rate = learning_rate

        self.state_size = env.observation_space.n
        print('State size: ', self.state_size)
        self.build_model()

        self.sess = tf.compat.v1.Session()
        self.sess.run(tf.compat.v1.global_variables_initializer())

    def __del__(self):
        # __init__ can fail before `sess` exists (e.g. the observation space
        # has no `.n`); guard so the finalizer does not raise a second error.
        sess = getattr(self, 'sess', None)
        if sess is not None:
            sess.close()

    def build_model(self):
        """Define placeholders, the Q layer, the squared-error loss and the
        Adam training op in a freshly reset default graph."""
        tf.compat.v1.reset_default_graph()
        tf.compat.v1.disable_eager_execution()
        # All three inputs are fixed to a batch of exactly one element.
        self.state_in = tf.compat.v1.placeholder(tf.int32, shape=[1])
        self.action_in = tf.compat.v1.placeholder(tf.int32, shape=[1])
        self.target_in = tf.compat.v1.placeholder(tf.float32, shape=[1])

        self.state = tf.one_hot(self.state_in, depth=self.state_size)
        self.action = tf.one_hot(self.action_in, depth=self.action_size)

        # The layer name matters: notebook cells read the weights back through
        # variable_scope('q_table') / get_variable('kernel').
        self.q_state = tf.compat.v1.layers.dense(self.state, units=self.action_size, name='q_table')
        # The one-hot action mask keeps only the Q-value of the action taken.
        self.q_action = tf.reduce_sum(tf.multiply(self.q_state, self.action), axis=1)

        self.loss = tf.reduce_sum(tf.square(self.target_in - self.q_action))
        self.optimizer = tf.compat.v1.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

    def get_action(self, state):
        """Return an epsilon-greedy action for the integer ``state``."""
        q_state = self.sess.run(self.q_state, feed_dict = {self.state_in: [state]})
        policy_action = np.argmax(q_state)
        # Always drawn, even if unused, to keep the random-draw sequence
        # identical to the original implementation.
        random_action = super().get_action(state)
        return policy_action if random.random() > self.epsilon else random_action

    def train(self, experience):
        """Run one optimisation step on a single transition.

        Args:
            experience: tuple ``(state, action, next_state, reward, done)``.
        """
        state, action, next_state, reward, done = experience

        if done:
            # A terminal transition has no future value to bootstrap from.
            max_q_next = 0.0
        else:
            q_next = self.sess.run(self.q_state, feed_dict = {self.state_in: [next_state]})
            max_q_next = np.max(q_next)
        q_target = reward + self.discount_rate*max_q_next

        feed = {self.state_in: [state], self.action_in: [action], self.target_in: [q_target]}
        self.sess.run(self.optimizer, feed_dict = feed)

        if done:
            self.epsilon *= self.epsilon_decay  # exponential decay per episode


In [6]:

class QNExperienceReplayAgent(GeneralRandomAgent):
    """Epsilon-greedy Q-network agent trained on replayed mini-batches.

    The network is the same one-hot -> single dense layer (``'q_table'``)
    model as the online variant, but the placeholders accept any batch size.
    Every ``train`` call stores the new transition in a bounded replay buffer
    and fits on that transition plus a random sample from the buffer.

    NOTE(review): ``build_model`` resets TensorFlow's default graph.  The
    TensorFlow documentation describes calling ``reset_default_graph`` while
    a session is active as undefined behaviour, so building this agent while
    another graph-based agent is still alive deserves a second look.
    """

    def __init__(self, env, discount_rate = 0.97, learning_rate = 0.001, epsilon = 1.0,
                 epsilon_decay = 0.99, buffer_size = 1000):
        """Build the graph, open a session and create the replay buffer.

        Args:
            env: environment with a discrete ``action_space`` and an
                ``observation_space`` exposing ``n``.
            discount_rate: factor applied to the bootstrapped next-state value.
            learning_rate: learning rate handed to the Adam optimizer.
            epsilon: initial exploration probability.
            epsilon_decay: multiplier applied to ``epsilon`` at the end of
                each episode (previously hard-coded to 0.99).
            buffer_size: maximum number of transitions kept for replay
                (previously hard-coded to 1000); the oldest are dropped first.
        """
        super().__init__(env)

        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.discount_rate = discount_rate
        self.learning_rate = learning_rate

        self.state_size = env.observation_space.n
        print('State size: ', self.state_size)
        self.build_model()

        self.sess = tf.compat.v1.Session()
        self.sess.run(tf.compat.v1.global_variables_initializer())
        self.replay_buffer = deque(maxlen=buffer_size)

    def __del__(self):
        # __init__ can fail before `sess` exists (e.g. the observation space
        # has no `.n`); guard so the finalizer does not raise a second error.
        sess = getattr(self, 'sess', None)
        if sess is not None:
            sess.close()

    def build_model(self):
        """Define batch-sized placeholders, the Q layer, the squared-error
        loss and the Adam training op in a freshly reset default graph."""
        tf.compat.v1.reset_default_graph()
        tf.compat.v1.disable_eager_execution()
        # shape=[None] lets the same graph serve single states and batches.
        self.state_in = tf.compat.v1.placeholder(tf.int32, shape=[None])
        self.action_in = tf.compat.v1.placeholder(tf.int32, shape=[None])
        self.target_in = tf.compat.v1.placeholder(tf.float32, shape=[None])

        self.state = tf.one_hot(self.state_in, depth=self.state_size)
        self.action = tf.one_hot(self.action_in, depth=self.action_size)

        # The layer name matters: notebook cells read the weights back through
        # variable_scope('q_table') / get_variable('kernel').
        self.q_state = tf.compat.v1.layers.dense(self.state, units=self.action_size, name='q_table')
        # The one-hot action mask keeps only the Q-value of the action taken.
        self.q_action = tf.reduce_sum(tf.multiply(self.q_state, self.action), axis=1)

        self.loss = tf.reduce_sum(tf.square(self.target_in - self.q_action))
        self.optimizer = tf.compat.v1.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

    def get_action(self, state):
        """Return an epsilon-greedy action for the integer ``state``."""
        q_state = self.sess.run(self.q_state, feed_dict = {self.state_in: [state]})
        policy_action = np.argmax(q_state)
        # Always drawn, even if unused, to keep the random-draw sequence
        # identical to the original implementation.
        random_action = super().get_action(state)
        return policy_action if random.random() > self.epsilon else random_action

    def train(self, experience, batch_size=50):
        """Store ``experience`` and run one optimisation step on a mini-batch.

        The batch is the new transition followed by ``batch_size`` transitions
        drawn from the buffer with replacement, i.e. ``batch_size + 1`` rows.

        Args:
            experience: tuple ``(state, action, next_state, reward, done)``.
            batch_size: number of replayed transitions to sample.
        """
        self.replay_buffer.append(experience)
        batch = [experience] + random.choices(self.replay_buffer, k=batch_size)
        # Transpose the list of transitions into one list per field.
        state, action, next_state, reward, done = (list(col) for col in zip(*batch))

        q_next = self.sess.run(self.q_state, feed_dict = {self.state_in: next_state})
        # Terminal transitions have no future value to bootstrap from.
        max_q_next = np.where(np.asarray(done, dtype=bool), 0.0, np.max(q_next, axis=1))
        q_target = np.asarray(reward, dtype=np.float32) + self.discount_rate*max_q_next

        feed = {self.state_in: state, self.action_in: action, self.target_in: q_target}
        self.sess.run(self.optimizer, feed_dict = feed)

        if experience[4]:  # `done` flag of the newly observed transition
            self.epsilon *= self.epsilon_decay

In [7]:
# Environment used by every experiment in this notebook.
# Two ids are available: the stock 'FrozenLake-v0', and 'FrozenLakeNoSlip-v0',
# which the registration cell defines with is_slippery=False.  Exactly one
# environment is created; previously the no-slip env was built and then
# immediately replaced by the second assignment without being closed.
ENV_ID = 'FrozenLake-v0'  # switch to 'FrozenLakeNoSlip-v0' for the no-slip map
env = gym.make(ENV_ID)
print('Observation space: ', env.observation_space)
print('Action space: ', env.action_space)

Observation space:  Discrete(16)
Action space:  Discrete(4)


In [8]:
# Random agent: roll out a single episode with uniformly random actions and
# render every step, as a visual sanity check of the environment.
number_of_episodes = 1
number_of_steps = 200  # upper bound on steps per episode
# NOTE(review): this reset is repeated at the top of the episode loop below.
state = env.reset()
agent = GeneralRandomAgent(env)
for episode in range(number_of_episodes):
    state = env.reset()
    for t in range(number_of_steps):
        action = agent.get_action(state)
        state, reward, done, info = env.step(action)
        print('state: ', state, 'action: ', action)
        env.render()
        time.sleep(0.05)  # slow down so individual frames are visible
        # Clear before the `done` check so the final message below is not
        # wiped by a later clear.
        clear_output(wait=True)
        if done:
            print("Episode finished after {} timesteps".format(t+1))
            break
# NOTE(review): `env` is closed here but reused by the cells that follow.
env.close()

Episode finished after 13 timesteps


In [9]:
# QAgent: configuration and construction for the tabular Q-learning run.
number_of_episodes = 500
number_of_steps = 15  # per-episode step cap used by the training loop
total_reward = 0      # running sum of rewards over all episodes
# NOTE(review): `observation` is not read by any visible cell; the training
# loop resets the environment itself at the start of each episode.
observation = env.reset()
qagent = QAgent(env)

Action space size:  4
State size:  16
[[8.52031783e-05 4.04003146e-05 1.62513941e-05 4.67634648e-05]
 [6.24834357e-05 3.48961037e-05 7.65619419e-06 6.70175986e-05]
 [7.13002283e-05 9.73329650e-05 3.41126782e-05 5.72855621e-05]
 [8.49058995e-05 6.66800762e-05 3.55445526e-05 8.96287442e-05]
 [4.13613649e-05 3.73168962e-05 6.45307524e-05 8.27441677e-05]
 [5.89259235e-06 4.28746706e-05 1.86141501e-05 9.91424535e-05]
 [7.18325905e-05 3.65172434e-05 1.11177817e-05 8.19409923e-05]
 [6.13798518e-05 7.88240154e-05 2.07678242e-06 6.78809995e-05]
 [9.61886538e-05 3.92107337e-06 4.04109845e-05 3.69788425e-05]
 [8.28349969e-05 7.96075889e-05 1.92481893e-05 1.62620270e-07]
 [1.22495295e-05 1.87755264e-05 6.34672954e-05 2.87457019e-05]
 [8.68717981e-05 4.44431857e-05 3.56530692e-06 4.19831817e-05]
 [6.21860525e-05 8.59431660e-06 9.18272274e-05 8.82287819e-05]
 [7.33702057e-05 7.82821917e-05 4.88438024e-05 5.37394641e-05]
 [7.42634132e-05 2.20169385e-06 4.24566738e-05 8.17093335e-05]
 [1.79803778e-05 

In [15]:
# Train the tabular QAgent: act, step the environment, update the Q-table on
# the observed transition, and render progress after every step.
#
# NOTE(review): this cell's execution count (15) is out of order relative to
# the neighbouring cells, so the saved output comes from a later re-run.
# `total_reward` is only zeroed in the setup cell, so re-running this cell
# alone keeps adding to the previous total.
for episode in range(number_of_episodes):
    state = env.reset()
    for step in range(number_of_steps):
        action = qagent.get_action(state)
        next_state, reward, done, info = env.step(action)
        qagent.train((state, action, next_state, reward, done))
        state = next_state
        total_reward += reward
        print('state: ', state, 'action: ', action)
        print("Episode: {}, Step: {}, Total Reward: {}, Epsilon: {}".format(episode, step, total_reward, qagent.epsilon))
        time.sleep(0.01)
        env.render()
        if done:
            #print("Episode finished after {} timesteps".format(step+1))
            # Break before clear_output so the last frame of the episode
            # is not cleared here.
            break
        
        clear_output(wait=True)
env.close()

state:  11 action:  2
Episode: 499, Step: 6, Total Reward: 57.0, Epsilon: 0.0028531095691707925
  (Right)
SFFF
FHFH
FFF[41mH[0m
HFFG


In [11]:
# QNAgent: configuration and construction for the online Q-network run.
number_of_episodes = 500
number_of_steps = 15  # per-episode step cap used by the training loop
total_reward = 0      # running sum of rewards over all episodes
# NOTE(review): `observation` is not read by any visible cell; the training
# loop resets the environment itself at the start of each episode.
observation = env.reset()
qnagent = QNAgent(env)

Action space size:  4
State size:  16
Instructions for updating:
Use keras.layers.Dense instead.
Instructions for updating:
Please use `layer.__call__` method instead.
Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [12]:
# Train the QNAgent one transition at a time and, after every step, print the
# current weights of its 'q_table' layer followed by the rendered board.
# NOTE(review): `total_reward` is only zeroed in the setup cell, so re-running
# this cell alone keeps adding to the previous total.
for episode in range(number_of_episodes):
    state = env.reset()
    for step in range(number_of_steps):
        action = qnagent.get_action(state)
        next_state, reward, done, info = env.step(action)
        qnagent.train((state, action, next_state, reward, done))
        state = next_state
        total_reward += reward
        
        
        # Look up the layer's 'kernel' variable by name in the default graph
        # (reuse=True fetches the existing variable) and evaluate it.
        with tf.compat.v1.variable_scope('q_table', reuse=True):
            weights = qnagent.sess.run(tf.compat.v1.get_variable('kernel'))
            clear_output(wait=True)
            print('state: ', state, 'action: ', action)
            print("Episode: {}, Step: {}, Total Reward: {}, Epsilon: {}".format(episode, step, total_reward, qnagent.epsilon))
            print(weights)
        env.render()
        if done:
            #print("Episode finished after {} timesteps".format(step+1))
            break
        
        #time.sleep(0.05)
env.close()

state:  4 action:  0
Episode: 499, Step: 14, Total Reward: 19.0, Epsilon: 0.04758433047647449
[[ 0.1165266   0.06946613  0.14192423 -0.17370005]
 [ 0.01617921 -0.03987114 -0.00797266 -0.19320562]
 [ 0.05848814 -0.04552825  0.06245172 -0.22826032]
 [-0.1060167  -0.14021339 -0.10871245 -0.15403204]
 [ 0.19001442  0.0217197   0.14077489 -0.11223559]
 [ 0.14217412  0.52027154  0.18515062  0.31784374]
 [-0.04663343  0.03964523  0.02694805 -0.29334894]
 [-0.43584025 -0.19452131 -0.43871474  0.48712683]
 [ 0.18359174 -0.03271065 -0.01928588  0.04506766]
 [ 0.05202064  0.3106228   0.13075025 -0.2250963 ]
 [ 0.18033391  0.0652004  -0.05814102 -0.2860504 ]
 [ 0.17683744 -0.00725192 -0.04753661 -0.2003015 ]
 [ 0.24951577 -0.11414146  0.1632188   0.03431988]
 [-0.05584126  0.17742342  0.49143094  0.03646352]
 [ 0.20404454  0.7697846   0.3460671  -0.06766621]
 [ 0.5047077  -0.1758818  -0.07844076 -0.49205545]]
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG


In [13]:
# QNExperienceReplayAgent: configuration and construction for the
# experience-replay Q-network run.
number_of_episodes = 500
number_of_steps = 15  # per-episode step cap used by the training loop
total_reward = 0      # running sum of rewards over all episodes
# NOTE(review): `observation` is not read by any visible cell; the training
# loop resets the environment itself at the start of each episode.
observation = env.reset()
qneragent = QNExperienceReplayAgent(env)

Action space size:  4
State size:  16


In [14]:
# Train the experience-replay agent: each step stores the new transition and
# fits on a replayed mini-batch (train() is called with its default
# batch_size), then prints the 'q_table' layer weights and the rendered board.
# NOTE(review): `total_reward` is only zeroed in the setup cell, so re-running
# this cell alone keeps adding to the previous total.
for episode in range(number_of_episodes):
    state = env.reset()
    for step in range(number_of_steps):
        action = qneragent.get_action(state)
        next_state, reward, done, info = env.step(action)
        qneragent.train((state, action, next_state, reward, done))
        state = next_state
        total_reward += reward
        
        # Look up the layer's 'kernel' variable by name in the default graph
        # (reuse=True fetches the existing variable) and evaluate it.
        with tf.compat.v1.variable_scope('q_table', reuse=True):
            weights = qneragent.sess.run(tf.compat.v1.get_variable('kernel'))
            clear_output(wait=True)
            print('state: ', state, 'action: ', action)
            print("Episode: {}, Step: {}, Total Reward: {}, Epsilon: {}".format(episode, step, total_reward, qneragent.epsilon))
            print(weights)
        env.render()
        if done:
            #print("Episode finished after {} timesteps".format(step+1))
            break
        
        #time.sleep(0.05)
env.close()

state:  14 action:  2
Episode: 499, Step: 14, Total Reward: 25.0, Epsilon: 0.0835397296732052
[[-0.00531529  0.01306844 -0.00291092  0.07747389]
 [-0.07648093 -0.14665034 -0.16744182  0.0746096 ]
 [-0.06550104 -0.07793216  0.02711069  0.05727452]
 [-0.17472988 -0.22903581 -0.046034    0.03538208]
 [ 0.01942207  0.02628988  0.02246995  0.02805051]
 [-0.1158613  -0.30250338 -0.3896829  -0.43672696]
 [-0.1744681  -0.20901743  0.04858912 -0.10944855]
 [-0.4681601  -0.28647718  0.22151786  0.3469417 ]
 [ 0.00166311  0.05843698  0.01437457  0.15765597]
 [-0.15570353  0.21437898  0.11401512  0.1630933 ]
 [ 0.2518926  -0.12850088 -0.26514766 -0.01809256]
 [ 0.3655262   0.41264993 -0.29228467 -0.1266354 ]
 [ 0.06771696  0.28759098 -0.31337425 -0.18128303]
 [ 0.10479593  0.17632948  0.33746034  0.27126125]
 [ 0.27620307  0.23278955  0.5835002   0.30110642]
 [-0.447088    0.52529216  0.3574468   0.45807624]]
  (Right)
SFFF
FHFH
FFFH
HF[41mF[0mG
