In [1]:
import random
import time
import numpy as np
import gym
import tensorflow as tf
from collections import deque
from IPython.display import clear_output

In [2]:
from gym.envs.registration import registry, register

# Register a deterministic (non-slippery) 4x4 FrozenLake variant under its own id.
# Only gym's own error type is suppressed: gym releases that refuse to
# re-register an existing id raise gym.error.Error when this cell is run a
# second time in the same kernel. Anything else (bad kwargs, interrupts, typos)
# is allowed to surface instead of being silently swallowed.
try:
    register(
        id='FrozenLakeNoSlip-v0',
        entry_point='gym.envs.toy_text:FrozenLakeEnv',
        kwargs={'map_name' : '4x4', 'is_slippery': False},
        max_episode_steps=100,
        reward_threshold=0.78, # optimum = .8196
    )
except gym.error.Error:
    # The id is already registered from an earlier run of this cell.
    pass

In [3]:
class GeneralRandomAgent():
    """Baseline agent that ignores the state and samples actions uniformly at random.

    Works with a discrete action space (returns an integer in ``[0, n)``) or
    with a space exposing ``low``/``high``/``shape`` (returns a uniform sample
    of that shape between the bounds).
    """

    def __init__(self, env):
        """Cache what is needed to sample from ``env.action_space``.

        Args:
            env: environment object exposing an ``action_space`` attribute.
        """
        # isinstance instead of an exact type comparison, so subclasses of
        # Discrete are also recognised as discrete spaces.
        self.is_discrete = isinstance(env.action_space, gym.spaces.discrete.Discrete)

        if self.is_discrete:
            self.action_size = env.action_space.n
            print('Action space size: ', self.action_size)
        else:
            self.action_low = env.action_space.low
            self.action_high = env.action_space.high
            self.action_shape = env.action_space.shape
            print('Action range: ', self.action_low, self.action_high)

    def get_action(self, state):
        """Return a uniformly random action; ``state`` is accepted but not used.

        Returns:
            An ``int`` in ``[0, action_size)`` for discrete spaces, otherwise a
            numpy array of shape ``action_shape`` drawn between the bounds.
        """
        if self.is_discrete:
            return random.randrange(self.action_size)
        return np.random.uniform(self.action_low, self.action_high, self.action_shape)

In [4]:
class QAgent(GeneralRandomAgent):
    """Tabular Q-learning agent with an epsilon-greedy policy.

    Keeps one Q-value per (state, action) pair in ``self.q_table`` and shrinks
    ``epsilon`` by a factor of 0.99 every time a terminal transition is trained on.
    """

    def __init__(self, env, discount_rate = 0.97, learning_rate = 0.05, epsilon = 1.0):
        """Store the hyper-parameters, read the state count and create the table.

        Args:
            env: environment with discrete ``observation_space`` (``.n`` is read).
            discount_rate: weight applied to the best next-state value.
            learning_rate: step size of each table update.
            epsilon: initial probability of taking a random action.
        """
        super().__init__(env)

        self.learning_rate = learning_rate
        self.discount_rate = discount_rate
        self.epsilon = epsilon

        self.state_size = env.observation_space.n
        print('State size: ', self.state_size)
        self.build_model()

    def build_model(self):
        """Create the Q-table with small random values and print it."""
        table_shape = [self.state_size, self.action_size]
        self.q_table = 1e-4*np.random.random(table_shape)
        print(self.q_table)

    def get_action(self, state):
        """Pick the greedy action, or a random one with probability ``epsilon``."""
        greedy_choice = np.argmax(self.q_table[state])
        # The random candidate is always drawn, before the exploration roll.
        exploratory_choice = super().get_action(state)

        if random.random() > self.epsilon:
            return greedy_choice
        return exploratory_choice

    def train(self, experience):
        """Apply one Q-learning update from a single transition.

        Args:
            experience: tuple ``(state, action, next_state, reward, done)``.
        """
        state, action, next_state, reward, done = experience

        # A terminal transition has no future value to bootstrap from.
        if done:
            next_values = np.zeros([self.action_size])
        else:
            next_values = self.q_table[next_state]
        target = reward + self.discount_rate*np.max(next_values)

        td_error = target - self.q_table[state, action]
        self.q_table[state, action] += self.learning_rate*td_error

        if done:
            # Exponential decay, once per finished episode.
            self.epsilon *= 0.99

In [5]:
class QNAgent(GeneralRandomAgent):
    """Epsilon-greedy Q-learning agent whose Q-values come from one dense layer.

    The state index is one-hot encoded and passed through a single dense layer
    with ``action_size`` units, built as a graph and evaluated through a
    ``tf.compat.v1.Session``. All placeholders have shape ``[1]``, i.e. one
    transition is processed per call.
    """

    def __init__(self, env, discount_rate = 0.97, learning_rate = 0.01, epsilon = 1.0):
        """Build the graph, then open a session and initialise its variables.

        Args:
            env: environment with discrete ``observation_space`` (``.n`` is read).
            discount_rate: weight applied to the best next-state value.
            learning_rate: learning rate handed to the Adam optimizer.
            epsilon: initial probability of taking a random action.
        """
        super().__init__(env)

        self.epsilon = epsilon
        self.discount_rate = discount_rate
        self.learning_rate = learning_rate

        self.state_size = env.observation_space.n
        print('State size: ', self.state_size)
        self.build_model()

        self.sess = tf.compat.v1.Session()
        self.sess.run(tf.compat.v1.global_variables_initializer())

    def __del__(self):
        """Close the session if one was created."""
        # __init__ can raise before self.sess is assigned (for example when the
        # observation space has no `.n`), so the attribute may not exist here.
        sess = getattr(self, 'sess', None)
        if sess is not None:
            sess.close()

    def build_model(self):
        """Reset the default graph and define placeholders, Q-values, loss and optimizer."""
        tf.compat.v1.reset_default_graph()
        tf.compat.v1.disable_eager_execution()
        self.state_in = tf.compat.v1.placeholder(tf.int32, shape=[1])
        self.action_in = tf.compat.v1.placeholder(tf.int32, shape=[1])
        self.target_in = tf.compat.v1.placeholder(tf.float32, shape=[1])

        self.state = tf.one_hot(self.state_in, depth=self.state_size)
        self.action = tf.one_hot(self.action_in, depth=self.action_size)

        # One dense layer from the one-hot state to one value per action.
        self.q_state = tf.compat.v1.layers.dense(self.state, units=self.action_size, name='q_table')
        # The one-hot action masks out everything but the value of the taken action.
        self.q_action = tf.reduce_sum(tf.multiply(self.q_state, self.action), axis=1)

        self.loss = tf.reduce_sum(tf.square(self.target_in - self.q_action))
        self.optimizer = tf.compat.v1.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

    def get_action(self, state):
        """Pick the greedy action, or a random one with probability ``epsilon``."""
        q_state = self.sess.run(self.q_state, feed_dict = {self.state_in: [state]})
        policy_action = np.argmax(q_state)
        random_action = super().get_action(state)
        action = policy_action if random.random() > self.epsilon else random_action
        return action

    def train(self, experience):
        """Run one optimizer step on a single transition.

        Args:
            experience: tuple ``(state, action, next_state, reward, done)``.
        """
        # Wrap every field in a one-element list to match the [1]-shaped placeholders.
        state, action, next_state, reward, done = ([exp] for exp in experience)

        q_next = self.sess.run(self.q_state, feed_dict = {self.state_in: next_state})
        # Boolean mask: zero the next-state values when the transition was terminal.
        q_next[done] = np.zeros([self.action_size])
        q_target = reward + self.discount_rate*np.max(q_next)

        feed = {self.state_in: state, self.action_in: action, self.target_in: q_target}
        self.sess.run(self.optimizer, feed_dict = feed)

        if experience[4]: # done is at index 4 of the experience
            self.epsilon *= 0.99


In [6]:

class QNExperienceReplayAgent(GeneralRandomAgent):
    """Dense-layer Q-learning agent that trains on batches drawn from a replay buffer.

    The graph matches a single dense layer over a one-hot state, but the
    placeholders have shape ``[None]`` so a whole batch of transitions can be
    fed at once. Past transitions are kept in a bounded deque (1000 entries).
    """

    def __init__(self, env, discount_rate = 0.97, learning_rate = 0.001, epsilon = 1.0):
        """Build the graph, open a session and create the empty replay buffer.

        Args:
            env: environment with discrete ``observation_space`` (``.n`` is read).
            discount_rate: weight applied to the best next-state value.
            learning_rate: learning rate handed to the Adam optimizer.
            epsilon: initial probability of taking a random action.
        """
        super().__init__(env)

        self.epsilon = epsilon
        self.discount_rate = discount_rate
        self.learning_rate = learning_rate

        self.state_size = env.observation_space.n
        print('State size: ', self.state_size)
        self.build_model()

        self.sess = tf.compat.v1.Session()
        self.sess.run(tf.compat.v1.global_variables_initializer())
        self.replay_buffer = deque(maxlen=1000)

    def __del__(self):
        """Close the session if one was created."""
        # __init__ can raise before self.sess is assigned (for example when the
        # observation space has no `.n`), so the attribute may not exist here.
        sess = getattr(self, 'sess', None)
        if sess is not None:
            sess.close()

    def build_model(self):
        """Reset the default graph and define placeholders, Q-values, loss and optimizer."""
        tf.compat.v1.reset_default_graph()
        tf.compat.v1.disable_eager_execution()
        self.state_in = tf.compat.v1.placeholder(tf.int32, shape=[None])
        self.action_in = tf.compat.v1.placeholder(tf.int32, shape=[None])
        self.target_in = tf.compat.v1.placeholder(tf.float32, shape=[None])

        self.state = tf.one_hot(self.state_in, depth=self.state_size)
        self.action = tf.one_hot(self.action_in, depth=self.action_size)

        # One dense layer from the one-hot state to one value per action.
        self.q_state = tf.compat.v1.layers.dense(self.state, units=self.action_size, name='q_table')
        # The one-hot action masks out everything but the value of the taken action.
        self.q_action = tf.reduce_sum(tf.multiply(self.q_state, self.action), axis=1)

        self.loss = tf.reduce_sum(tf.square(self.target_in - self.q_action))
        self.optimizer = tf.compat.v1.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

    def get_action(self, state):
        """Pick the greedy action, or a random one with probability ``epsilon``."""
        q_state = self.sess.run(self.q_state, feed_dict = {self.state_in: [state]})
        policy_action = np.argmax(q_state)
        random_action = super().get_action(state)
        action = policy_action if random.random() > self.epsilon else random_action
        return action

    def train(self, experience, batch_size=50):
        """Store a transition and run one optimizer step on it plus a sampled batch.

        Args:
            experience: tuple ``(state, action, next_state, reward, done)``.
            batch_size: number of stored transitions drawn (with replacement)
                from the replay buffer in addition to ``experience``.
        """
        self.replay_buffer.append(experience)
        samples = random.choices(self.replay_buffer, k=batch_size)
        # Transpose [experience, *samples] into five columns of batch_size + 1 entries.
        state, action, next_state, reward, done = (list(col) for col in zip(experience, *samples))

        q_next = self.sess.run(self.q_state, feed_dict = {self.state_in: next_state})
        # Boolean mask over rows: terminal transitions bootstrap from zero.
        q_next[done] = np.zeros([self.action_size])
        q_target = reward + self.discount_rate*np.max(q_next, axis=1)

        feed = {self.state_in: state, self.action_in: action, self.target_in: q_target}
        self.sess.run(self.optimizer, feed_dict = feed)

        if experience[4]: # done is at index 4 of the experience
            self.epsilon *= 0.99

In [7]:
# Create the deterministic FrozenLake variant and show its observation/action spaces.
# NOTE(review): the link below points at CartPole, not FrozenLake -- presumably a leftover.
# http://gym.openai.com/envs/CartPole-v1/
env = gym.make('FrozenLakeNoSlip-v0')
# Alternative environment id: env = gym.make('FrozenLake-v0')
print('Observation space: ', env.observation_space)
print('Action space: ', env.action_space)

Observation space:  Discrete(16)
Action space:  Discrete(4)


In [8]:
# Random agent
# Roll out a purely random policy and animate it in the cell output.
number_of_episodes = 1
number_of_steps = 200
agent = GeneralRandomAgent(env)
for episode in range(number_of_episodes):
    # Each episode starts from its own reset; no extra reset is needed before the loop.
    state = env.reset()
    for t in range(number_of_steps):
        action = agent.get_action(state)
        state, reward, done, info = env.step(action)
        print('state: ', state, 'action: ', action)
        env.render()
        time.sleep(0.5)
        # wait=True defers the clear until the next output arrives (less flicker).
        clear_output(wait=True)
        if done:
            print("Episode finished after {} timesteps".format(t+1))
            break
env.close()

Episode finished after 4 timesteps


In [9]:
# QAgent
# Training budget, reward counter and a fresh tabular agent.
number_of_episodes = 500
number_of_steps = 15
total_reward = 0
# NOTE(review): `observation` is not read in any of the visible cells.
observation = env.reset()
qagent = QAgent(env)

Action space size:  4
State size:  16
[[3.80767860e-05 8.48120021e-05 8.88274926e-05 1.44231660e-05]
 [2.18007381e-05 2.81960527e-05 5.29814187e-05 8.30885320e-05]
 [7.92400876e-05 3.91778205e-05 5.27979597e-05 5.25919940e-05]
 [1.88857636e-06 7.95557036e-05 5.22169807e-05 6.51422110e-05]
 [1.92941378e-05 3.35424430e-05 2.99086205e-05 7.55679299e-06]
 [4.94655693e-05 6.08995236e-05 4.79416054e-05 8.39894331e-05]
 [7.04345375e-05 8.22543873e-05 6.39729931e-05 8.43384446e-06]
 [9.12131106e-05 3.01368553e-05 4.26339953e-05 9.10282554e-05]
 [5.48595226e-05 3.58145893e-06 9.17796175e-05 7.74578432e-05]
 [4.57323630e-05 9.57635816e-05 3.13077221e-05 5.05074834e-05]
 [6.30439502e-05 1.75580043e-05 1.71392273e-05 5.20715881e-05]
 [4.42025477e-05 8.20495556e-05 7.03925908e-05 4.10983490e-06]
 [3.61483999e-06 4.73015748e-06 2.88123763e-05 1.83733536e-05]
 [3.24311542e-06 1.58570168e-05 4.39610720e-05 3.15871387e-05]
 [6.10736304e-05 1.99396361e-05 7.17621734e-06 7.95476412e-06]
 [5.48765939e-05 

In [10]:
# Train the tabular agent: act, learn from the transition, then refresh the live view.
for episode in range(number_of_episodes):
    state = env.reset()
    for step in range(number_of_steps):
        action = qagent.get_action(state)
        next_state, reward, done, info = env.step(action)
        qagent.train((state, action, next_state, reward, done))
        state = next_state
        # Running sum over all episodes; it is not reset per episode.
        total_reward += reward
        print('state: ', state, 'action: ', action)
        print(qagent.q_table)
        print("Episode: {}, Step: {}, Total Reward: {}, Epsilon: {}".format(episode, step, total_reward, qagent.epsilon))
        time.sleep(0.01)
        env.render()
        if done:
            # Leave before clear_output so this iteration does not schedule
            # the terminal frame for clearing.
            break
        
        # wait=True defers the clear until the next output arrives.
        clear_output(wait=True)
env.close()

state:  15 action:  2
[[1.52847179e-01 8.61400463e-03 8.58650182e-01 1.88153007e-01]
 [7.08592178e-02 3.27024083e-06 8.85271119e-01 1.38060371e-01]
 [4.66226141e-02 9.12668330e-01 1.96854751e-02 2.01271690e-01]
 [1.95891114e-01 5.27788792e-05 5.56611145e-05 6.36281271e-05]
 [5.27152002e-05 1.08656630e-02 1.18801323e-05 1.19665987e-01]
 [4.94655693e-05 6.08995236e-05 4.79416054e-05 8.39894331e-05]
 [4.21717594e-05 9.40899218e-01 3.11980281e-05 1.25613662e-01]
 [9.12131106e-05 3.01368553e-05 4.26339953e-05 9.10282554e-05]
 [6.11884629e-05 2.63270142e-06 8.14903934e-02 9.15475262e-05]
 [4.85398734e-04 1.29229178e-02 4.22081748e-01 4.33038536e-05]
 [8.07026606e-02 9.69999913e-01 1.02618885e-05 2.76987453e-01]
 [4.42025477e-05 8.20495556e-05 7.03925908e-05 4.10983490e-06]
 [3.61483999e-06 4.73015748e-06 2.88123763e-05 1.83733536e-05]
 [3.08095965e-06 4.33052110e-03 2.57071854e-01 1.38274666e-04]
 [2.92252272e-02 3.37718387e-01 9.99999995e-01 2.72748015e-01]
 [5.48765939e-05 4.26917562e-05 5

In [11]:
# QNAgent
# Training budget, reward counter and a fresh dense-layer agent.
number_of_episodes = 500
number_of_steps = 15
total_reward = 0
# NOTE(review): `observation` is not read in any of the visible cells.
observation = env.reset()
qnagent = QNAgent(env)

Action space size:  4
State size:  16
Instructions for updating:
Use keras.layers.Dense instead.
Instructions for updating:
Please use `layer.__call__` method instead.
Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [12]:
# The dense layer's kernel is the same graph variable on every step, so it is
# looked up once here instead of re-entering the variable scope per step.
with tf.compat.v1.variable_scope('q_table', reuse=True):
    q_kernel = tf.compat.v1.get_variable('kernel')

# Train the dense-layer agent, showing the current kernel after every step.
for episode in range(number_of_episodes):
    state = env.reset()
    for step in range(number_of_steps):
        action = qnagent.get_action(state)
        next_state, reward, done, info = env.step(action)
        qnagent.train((state, action, next_state, reward, done))
        state = next_state
        # Running sum over all episodes; it is not reset per episode.
        total_reward += reward

        weights = qnagent.sess.run(q_kernel)
        # wait=True defers the clear until the prints below produce new output.
        clear_output(wait=True)
        print('state: ', state, 'action: ', action)
        print("Episode: {}, Step: {}, Total Reward: {}, Epsilon: {}".format(episode, step, total_reward, qnagent.epsilon))
        print(weights)
        env.render()
        if done:
            break
env.close()

state:  15 action:  2
Episode: 499, Step: 5, Total Reward: 298.0, Epsilon: 0.010327936091511133
[[ 0.07632102  0.17970124  0.01175249  0.00893372]
 [ 0.04334284 -0.7133101  -0.07980934  0.0093018 ]
 [ 0.10582779  0.00243173 -0.11538547  0.01017686]
 [ 0.07644816 -0.4263432  -0.10869314 -0.04261433]
 [ 0.12278821  0.23501751 -0.68490344  0.11678237]
 [-0.38648638  0.18289542 -0.14773777  0.38919908]
 [-0.3749835   0.03801651 -0.4780062   0.04116749]
 [ 0.4600129   0.08391863  0.48174596  0.32973498]
 [ 0.14467894 -0.6885422   0.25971818  0.18881004]
 [ 0.21829142  0.22419757  0.23214693 -0.68399894]
 [ 0.22886972  0.24281915 -0.44329125  0.06139692]
 [-0.35076216  0.34161556 -0.19978437  0.42577302]
 [-0.14341938 -0.46603203 -0.34229043 -0.19877377]
 [-0.6207238   0.21034077  0.3035936   0.22067715]
 [ 0.23644921  0.2640022   0.33495903  0.2618231 ]
 [-0.5012219   0.2703408   0.5410551   0.4204687 ]]
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m


In [13]:
# QNExperienceReplayAgent
# Training budget, reward counter and a fresh experience-replay agent.
number_of_episodes = 500
number_of_steps = 15
total_reward = 0
# NOTE(review): `observation` is not read in any of the visible cells.
observation = env.reset()
qneragent = QNExperienceReplayAgent(env)

Action space size:  4
State size:  16


In [14]:
# The dense layer's kernel is the same graph variable on every step, so it is
# looked up once here instead of re-entering the variable scope per step.
with tf.compat.v1.variable_scope('q_table', reuse=True):
    q_kernel = tf.compat.v1.get_variable('kernel')

# Train the experience-replay agent, showing the current kernel after every step.
for episode in range(number_of_episodes):
    state = env.reset()
    for step in range(number_of_steps):
        action = qneragent.get_action(state)
        next_state, reward, done, info = env.step(action)
        qneragent.train((state, action, next_state, reward, done))
        state = next_state
        # Running sum over all episodes; it is not reset per episode.
        total_reward += reward

        weights = qneragent.sess.run(q_kernel)
        # wait=True defers the clear until the prints below produce new output.
        clear_output(wait=True)
        print('state: ', state, 'action: ', action)
        print("Episode: {}, Step: {}, Total Reward: {}, Epsilon: {}".format(episode, step, total_reward, qneragent.epsilon))
        print(weights)
        env.render()
        if done:
            break
env.close()

state:  15 action:  2
Episode: 499, Step: 5, Total Reward: 183.0, Epsilon: 0.025517964452291122
[[ 0.08939223  0.30631933  0.28621385  0.32313994]
 [ 0.09580315 -0.47827035  0.31279135  0.35013384]
 [ 0.11861119  0.37433052  0.28995806  0.37548012]
 [ 0.1432212  -0.21134509  0.28714278  0.26522642]
 [ 0.10608572  0.33678457 -0.50874966  0.32313508]
 [ 0.15293556  0.2891879  -0.3575045   0.48467863]
 [-0.7438443   0.40255308 -0.5723404   0.37545744]
 [ 0.28464335 -0.29901147  0.25744754  0.12317991]
 [ 0.15657745 -0.5220305   0.33948773  0.32331985]
 [-0.11438273  0.3494544   0.36844945 -0.24062574]
 [ 0.10693756  0.43168625 -0.4788931   0.40310675]
 [ 0.4856392   0.28456712  0.5380231   0.4463485 ]
 [ 0.19428432 -0.5198695   0.5333235   0.04380703]
 [-0.331699   -0.28733137 -0.29034948  0.40263277]
 [ 0.14095995  0.31973946  0.42770645  0.4311563 ]
 [-0.0565781   0.24739611  0.04011744  0.210989  ]]
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m
