In [15]:
import random
import time
import numpy as np
import gym
import tensorflow as tf
from collections import deque
from IPython.display import clear_output

In [16]:
from gym.envs.registration import registry, register

try:
    register(
        id='FrozenLakeNoSlip-v0',
        entry_point='gym.envs.toy_text:FrozenLakeEnv',
        kwargs={'map_name' : '4x4', 'is_slippery': False},
        max_episode_steps=100,
        reward_threshold=0.78, # optimum = .8196
    )
except:
    pass

In [17]:
class GeneralRandomAgent():
    def __init__(self, env):
        self.is_discrete = type(env.action_space) == gym.spaces.discrete.Discrete
        
        if self.is_discrete:
            self.action_size = env.action_space.n
            print('Action space size: ', self.action_size)
        else: 
            self.action_low = env.action_space.low
            self.action_high = env.action_space.high
            self.action_shape = env.action_space.shape
            print('Action range: ', self.action_low, self.action_high)
    
    def get_action(self, state):
        if self.is_discrete:
            action = random.choice(range(self.action_size))
        else:
            action = np.random.uniform(self.action_low, self.action_high, self.action_shape)
        return action

In [18]:
class QAgent(GeneralRandomAgent):
    def __init__(self, env, discount_rate = 0.97, learning_rate = 0.05, epsilon = 1.0):
        super().__init__(env)
        
        self.epsilon = epsilon
        self.discount_rate = discount_rate
        self.learning_rate = learning_rate
        
        self.state_size = env.observation_space.n
        print('State size: ', self.state_size)
        self.build_model()
    
    def build_model(self):
        self.q_table = 1e-4*np.random.random([self.state_size, self.action_size])
        #self.q_table = np.zeros([self.state_size, self.action_size])
        print(self.q_table)
        
    def get_action(self, state):
        q_state = self.q_table[state]
        policy_action = np.argmax(q_state)
        random_action = super().get_action(state)
        
        action = policy_action if random.random() > self.epsilon else random_action
        #print(q_state)
        #print(self.q_table)
        return action
    
    def train(self, experience):
        state, action, next_state, reward, done = experience
        
        q_next = self.q_table[next_state] if not(done) else np.zeros([self.action_size]) # checks if state is terminal
        q_target = reward + self.discount_rate*np.max(q_next)
        
        q_update = q_target - self.q_table[state, action]
        self.q_table[state, action] += self.learning_rate*q_update
        
        if done:
            self.epsilon *= 0.99 # Exponential decay

In [19]:
class QNAgent(GeneralRandomAgent):
    def __init__(self, env, discount_rate = 0.97, learning_rate = 0.01, epsilon = 1.0):
        super().__init__(env)
        
        self.epsilon = epsilon
        self.discount_rate = discount_rate
        self.learning_rate = learning_rate
        
        self.state_size = env.observation_space.n
        print('State size: ', self.state_size)
        self.build_model()
        
        self.sess = tf.compat.v1.Session()
        self.sess.run(tf.compat.v1.global_variables_initializer())
        
    def __del__(self):
        self.sess.close()
    
    def build_model(self):
        tf.compat.v1.reset_default_graph()
        tf.compat.v1.disable_eager_execution()
        self.state_in = tf.compat.v1.placeholder(tf.int32, shape=[1])
        self.action_in = tf.compat.v1.placeholder(tf.int32, shape=[1])
        self.target_in = tf.compat.v1.placeholder(tf.float32, shape=[1])
        
        self.state = tf.one_hot(self.state_in, depth=self.state_size)
        self.action = tf.one_hot(self.action_in, depth=self.action_size)
        
        self.q_state = tf.compat.v1.layers.dense(self.state, units=self.action_size, name='q_table')
        self.q_action = tf.reduce_sum(tf.multiply(self.q_state, self.action), axis=1)
        
        self.loss = tf.reduce_sum(tf.square(self.target_in - self.q_action))
        self.optimizer = tf.compat.v1.train.AdamOptimizer(self.learning_rate).minimize(self.loss)
        
    def get_action(self, state):
        q_state = self.sess.run(self.q_state, feed_dict = {self.state_in: [state]})
        policy_action = np.argmax(q_state)
        random_action = super().get_action(state)
        action = policy_action if random.random() > self.epsilon else random_action
        return action
    
    def train(self, experience):
        state, action, next_state, reward, done = ([exp] for exp in experience)
        
        q_next = self.sess.run(self.q_state, feed_dict = {self.state_in: next_state})
        q_next[done] = np.zeros([self.action_size])
        q_target = reward + self.discount_rate*np.max(q_next)
        
        feed = {self.state_in: state, self.action_in: action, self.target_in: q_target}
        self.sess.run(self.optimizer, feed_dict = feed)
        
        if experience[4]: # done is at index 4 of the experience
            self.epsilon *= 0.99


In [32]:
class QNExperienceReplayAgent(GeneralRandomAgent):
    def __init__(self, env, discount_rate = 0.97, learning_rate = 0.001, epsilon = 1.0):
        super().__init__(env)
        
        self.epsilon = epsilon
        self.discount_rate = discount_rate
        self.learning_rate = learning_rate
        
        self.state_size = env.observation_space.n
        print('State size: ', self.state_size)
        self.build_model()
        
        self.sess = tf.compat.v1.Session()
        self.sess.run(tf.compat.v1.global_variables_initializer())
        self.replay_buffer = deque(maxlen=1000)
        
    def __del__(self):
        self.sess.close()
    
    def build_model(self):
        tf.compat.v1.reset_default_graph()
        tf.compat.v1.disable_eager_execution()
        self.state_in = tf.compat.v1.placeholder(tf.int32, shape=[None])
        self.action_in = tf.compat.v1.placeholder(tf.int32, shape=[None])
        self.target_in = tf.compat.v1.placeholder(tf.float32, shape=[None])
        
        self.state = tf.one_hot(self.state_in, depth=self.state_size)
        self.action = tf.one_hot(self.action_in, depth=self.action_size)
        
        self.q_state = tf.compat.v1.layers.dense(self.state, units=self.action_size, name='q_table')
        self.q_action = tf.reduce_sum(tf.multiply(self.q_state, self.action), axis=1)
        
        self.loss = tf.reduce_sum(tf.square(self.target_in - self.q_action))
        self.optimizer = tf.compat.v1.train.AdamOptimizer(self.learning_rate).minimize(self.loss)
        
    def get_action(self, state):
        q_state = self.sess.run(self.q_state, feed_dict = {self.state_in: [state]})
        policy_action = np.argmax(q_state)
        random_action = super().get_action(state)
        action = policy_action if random.random() > self.epsilon else random_action
        return action
    
    def train(self, experience, batch_size=50):
        self.replay_buffer.append(experience)
        samples = random.choices(self.replay_buffer, k=batch_size)
        state, action, next_state, reward, done = (list(col) for col in zip(experience, *samples))
#         state, action, next_state, reward, done = ([exp] for exp in experience)
        
        q_next = self.sess.run(self.q_state, feed_dict = {self.state_in: next_state})
        q_next[done] = np.zeros([self.action_size])
        q_target = reward + self.discount_rate*np.max(q_next, axis=1)
        
        feed = {self.state_in: state, self.action_in: action, self.target_in: q_target}
        self.sess.run(self.optimizer, feed_dict = feed)
        
        if experience[4]: # done is at index 4 of the experience
            self.epsilon *= 0.99

In [21]:
# http://gym.openai.com/envs/CartPole-v1/
env = gym.make('FrozenLakeNoSlip-v0')
#env = gym.make('FrozenLake-v0')
print('Observation space: ', env.observation_space)
print('Action space: ', env.action_space)

Observation space:  Discrete(16)
Action space:  Discrete(4)


In [23]:
# Random agent
number_of_episodes = 15
number_of_steps = 200
state = env.reset()
agent = GeneralRandomAgent(env)
for episode in range(number_of_episodes):
    state = env.reset()
    for t in range(number_of_steps):
        action = agent.get_action(state)
        state, reward, done, info = env.step(action)
        print('state: ', state, 'action: ', action)
        env.render()
        time.sleep(0.5)
        clear_output(wait=True)
        if done:
            print("Episode finished after {} timesteps".format(t+1))
            break
env.close()

Episode finished after 4 timesteps


In [28]:
# QAgent
number_of_episodes = 100
number_of_steps = 15
total_reward = 0
observation = env.reset()
qagent = QAgent(env)

Action space size:  4
State size:  16
[[5.64941237e-05 4.65033434e-05 1.58906307e-06 4.53422743e-06]
 [2.20176581e-05 6.40402591e-05 5.98423484e-06 9.64008630e-05]
 [2.79459850e-05 1.23997290e-05 8.04509285e-06 3.35024318e-06]
 [9.85673341e-05 7.25849662e-05 9.10091126e-05 6.05474899e-05]
 [3.11890767e-05 7.10356968e-05 4.31764141e-05 8.04062483e-05]
 [1.56603470e-05 4.82322323e-05 8.67871013e-05 2.95462762e-05]
 [8.56626690e-05 2.80938766e-05 8.32629503e-05 9.43663390e-05]
 [4.53163544e-05 9.53091676e-06 9.17560645e-05 9.67503059e-05]
 [8.28902561e-05 4.56778884e-05 7.00796043e-05 4.53161856e-05]
 [7.63152001e-05 6.14328332e-05 5.69231307e-05 1.24464660e-05]
 [5.06427716e-05 8.23982362e-05 2.26098005e-05 3.52182310e-05]
 [9.42448217e-05 1.34769063e-05 1.18518001e-05 6.09191467e-06]
 [4.44169290e-05 2.57198513e-06 7.88524738e-05 7.42287215e-05]
 [8.07016629e-05 2.42310607e-05 5.94629050e-05 3.44006004e-05]
 [9.96754390e-05 3.45108024e-05 9.48479896e-05 5.18222253e-05]
 [7.80842520e-05 

In [26]:
for episode in range(number_of_episodes):
    state = env.reset()
    for step in range(number_of_steps):
        action = qagent.get_action(state)
        next_state, reward, done, info = env.step(action)
        qagent.train((state, action, next_state, reward, done))
        state = next_state
        total_reward += reward
        print('state: ', state, 'action: ', action)
        print(qagent.q_table)
        print("Episode: {}, Step: {}, Total Reward: {}, Epsilon: {}".format(episode, step, total_reward, qagent.epsilon))
        time.sleep(0.01)
        env.render()
        if done:
            #print("Episode finished after {} timesteps".format(step+1))
            break
        
        clear_output(wait=True)
env.close()

state:  15 action:  2
[[9.53777164e-02 8.58734026e-01 3.46034740e-03 5.64445712e-02]
 [9.23905748e-02 4.12237553e-05 6.71154408e-05 3.38308406e-05]
 [6.73823518e-05 6.73480571e-05 6.57841259e-05 6.32303905e-05]
 [8.31758889e-05 5.85122946e-05 6.71246538e-05 3.34661427e-05]
 [1.03162114e-01 8.85292810e-01 3.11012947e-06 1.33884327e-01]
 [4.84187374e-05 7.11973232e-06 6.27477539e-05 9.25268982e-05]
 [3.33427051e-05 6.33566391e-05 5.67264539e-05 6.73803147e-05]
 [7.87416657e-06 5.96217051e-05 2.69546775e-05 8.22648508e-05]
 [1.38719012e-01 2.74008606e-07 9.12673000e-01 1.20558593e-01]
 [8.12081463e-02 9.40900000e-01 5.84895280e-02 2.60858390e-05]
 [5.68370508e-05 3.70386151e-01 2.36217943e-05 8.20899098e-05]
 [3.25309362e-05 5.00361626e-05 1.30302330e-06 4.06842194e-05]
 [1.36844226e-05 7.61741396e-05 7.60349590e-05 1.70238594e-05]
 [6.71526611e-06 2.37355706e-01 9.70000000e-01 6.82970998e-02]
 [1.39506563e-01 3.69401455e-01 1.00000000e+00 2.40255498e-02]
 [6.39090137e-05 1.58260145e-05 1

In [29]:
# QNAgent
number_of_episodes = 500
number_of_steps = 15
total_reward = 0
observation = env.reset()
qnagent = QNAgent(env)

Action space size:  4
State size:  16


In [31]:
for episode in range(number_of_episodes):
    state = env.reset()
    for step in range(number_of_steps):
        action = qnagent.get_action(state)
        next_state, reward, done, info = env.step(action)
        qnagent.train((state, action, next_state, reward, done))
        state = next_state
        total_reward += reward
        
        
        with tf.compat.v1.variable_scope('q_table', reuse=True):
            weights = qnagent.sess.run(tf.compat.v1.get_variable('kernel'))
            clear_output(wait=True)
            print('state: ', state, 'action: ', action)
            print("Episode: {}, Step: {}, Total Reward: {}, Epsilon: {}".format(episode, step, total_reward, qnagent.epsilon))
            print(weights)
        env.render()
        if done:
            #print("Episode finished after {} timesteps".format(step+1))
            break
        
        #time.sleep(0.05)
env.close()

state:  15 action:  2
Episode: 499, Step: 5, Total Reward: 845.0, Epsilon: 5.439827419996685e-05
[[-0.11353138  0.3026645  -0.08388886  0.00874332]
 [ 0.19759448 -0.61010665  0.01294092 -0.01451255]
 [ 0.02953254  0.14976345 -0.02624523  0.1244484 ]
 [ 0.00986244 -0.45339635 -0.29157433 -0.04377493]
 [-0.07174112  0.3744143  -0.60003525  0.05704692]
 [ 0.12638491 -0.08479851 -0.1730896   0.27620023]
 [-0.4763186  -0.2942671  -0.62303746  0.00105364]
 [-0.1430591  -0.30655393 -0.3905269   0.47696114]
 [ 0.15187502 -0.6287654   0.4051334   0.1601466 ]
 [ 0.17623964  0.3326224   0.26564372 -0.67298615]
 [ 0.19514348  0.25047514 -0.17080365  0.0487534 ]
 [ 0.38052273 -0.11120182 -0.21112823 -0.11281523]
 [-0.05955234  0.53874755 -0.45393848 -0.1798228 ]
 [-0.683561    0.19446652  0.39479434  0.23349886]
 [ 0.2114212   0.07370419  0.4312942   0.24862446]
 [-0.2798764   0.23501915 -0.5188889  -0.06278816]]
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m


In [33]:
# QNExperienceReplayAgent
number_of_episodes = 500
number_of_steps = 15
total_reward = 0
observation = env.reset()
qneragent = QNExperienceReplayAgent(env)

Action space size:  4
State size:  16


In [35]:
for episode in range(number_of_episodes):
    state = env.reset()
    for step in range(number_of_steps):
        action = qneragent.get_action(state)
        next_state, reward, done, info = env.step(action)
        qneragent.train((state, action, next_state, reward, done))
        state = next_state
        total_reward += reward
        
        with tf.compat.v1.variable_scope('q_table', reuse=True):
            weights = qneragent.sess.run(tf.compat.v1.get_variable('kernel'))
            clear_output(wait=True)
            print('state: ', state, 'action: ', action)
            print("Episode: {}, Step: {}, Total Reward: {}, Epsilon: {}".format(episode, step, total_reward, qneragent.epsilon))
            print(weights)
        env.render()
        if done:
            #print("Episode finished after {} timesteps".format(step+1))
            break
        
        #time.sleep(0.05)
env.close()

state:  15 action:  2
Episode: 499, Step: 5, Total Reward: 779.0, Epsilon: 8.050308666708495e-05
[[ 0.20255187  0.25056434  0.22962482  0.10157435]
 [ 0.20788176 -0.60815537  0.25618353  0.12405635]
 [ 0.2192079   0.30450344  0.10467924  0.05697221]
 [ 0.1729581  -0.60592556  0.07862183 -0.01653947]
 [ 0.22193362  0.27712327 -0.62254286  0.09341645]
 [-0.32965118  0.1217761   0.01705599 -0.5333879 ]
 [-0.5849208   0.3327304  -0.41542938  0.14338332]
 [-0.24792641 -0.2691514  -0.34157532  0.47875667]
 [ 0.21437193 -0.6081718   0.28356367  0.14349379]
 [ 0.24117659  0.3133603   0.3117907  -0.7371238 ]
 [ 0.2825509   0.36183044 -0.6250388   0.17336299]
 [ 0.21073818 -0.43384254  0.19345558  0.18745261]
 [ 0.4930879  -0.10579994 -0.32171196 -0.53229254]
 [-0.5683287   0.23685841  0.33868006  0.17987706]
 [ 0.30814162  0.36115944  0.37089068  0.1964099 ]
 [-0.06892592  0.02590877  0.04803693  0.24937773]]
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m
