In [11]:
import gym
import random
import numpy as np
import time
#import tensorflow as tf
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
from gym.envs.registration import register
from IPython.display import clear_output
from collections import deque
# Report the library versions in use so they are recorded alongside the cell output.
print("Using OpenAI Gym:", gym.__version__)
print("Using Tensorflow:", tf.__version__)

Using OpenAI Gym: 0.17.1
Using Tensorflow: 2.1.0


In [12]:
# Register a deterministic (non-slippery) 4x4 FrozenLake variant under its own id.
try:
    register(
        id='FrozenLakeNoSlip-v0',
        entry_point='gym.envs.toy_text:FrozenLakeEnv',
        kwargs={'map_name' : '4x4', 'is_slippery':False},
        max_episode_steps=100,
        reward_threshold=0.78, # optimum = .8196
    )
except gym.error.Error:
    # Re-running this cell tries to register the same id again, which gym
    # rejects. The earlier registration is still in place, so it is safe to
    # continue. Any other exception is a real problem and is left to propagate.
    pass

# Use "FrozenLake-v0" here instead to train on the stock gym environment id.
env_name = "FrozenLakeNoSlip-v0"
env = gym.make(env_name)
print("Observation space:", env.observation_space)
print("Action space:", env.action_space)
# Last expression of the cell: displays the concrete class of the action space.
type(env.action_space)

Observation space: Discrete(16)
Action space: Discrete(4)


gym.spaces.discrete.Discrete

In [13]:
class Agent():
    """Baseline agent that ignores the state and samples actions at random.

    Works with either a discrete action space (an integer action is drawn)
    or a box-like one exposing ``low``, ``high`` and ``shape`` (an array is
    drawn uniformly between the bounds).
    """

    def __init__(self, env):
        """Read the action-space description from ``env`` and print a summary.

        Args:
            env: environment object exposing an ``action_space`` attribute.
        """
        space = env.action_space
        self.is_discrete = type(space) == gym.spaces.discrete.Discrete

        if not self.is_discrete:
            # Continuous case: remember the bounds and shape used for sampling.
            self.action_low = space.low
            self.action_high = space.high
            self.action_shape = space.shape
            print("Action range:", self.action_low, self.action_high)
            return

        # Discrete case: only the number of available actions is needed.
        self.action_size = space.n
        print("Action size:", self.action_size)

    def get_action(self, state):
        """Return a random action; ``state`` is accepted but not used.

        Returns:
            An int in ``range(self.action_size)`` for discrete spaces,
            otherwise an array of shape ``self.action_shape`` drawn uniformly
            between ``self.action_low`` and ``self.action_high``.
        """
        if not self.is_discrete:
            return np.random.uniform(self.action_low,
                                     self.action_high,
                                     self.action_shape)
        return random.choice(range(self.action_size))

In [14]:
class QNAgent(Agent):
    """Epsilon-greedy Q-learning agent backed by a one-layer TensorFlow network.

    The network is a single dense layer (named ``"q_table"``) applied to the
    one-hot encoded state, producing one Q-value per action. Training uses
    experience replay: every call to :meth:`train` stores the new transition
    and fits the network on that transition plus a random batch drawn from
    the replay buffer.
    """

    def __init__(self, env, discount_rate=0.97, learning_rate=0.001,
                 buffer_size=1000, eps_decay=0.99):
        """Build the graph, open a session and initialise the replay buffer.

        Args:
            env: environment with discrete observation and action spaces
                (``env.observation_space.n`` is read for the state count).
            discount_rate: discount factor applied to the next state's best
                Q-value when forming the training target.
            learning_rate: learning rate passed to the Adam optimizer.
            buffer_size: maximum number of transitions kept for replay.
            eps_decay: factor by which the exploration rate is multiplied
                each time a training transition ends an episode.
        """
        # Defined up front so __del__ is safe even if construction fails
        # before the session is created.
        self.sess = None

        super().__init__(env)
        self.state_size = env.observation_space.n
        print("State size:", self.state_size)

        self.eps = 1.0  # start fully exploratory
        self.eps_decay = eps_decay
        self.discount_rate = discount_rate
        self.learning_rate = learning_rate
        self.build_model()

        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
        self.replay_buffer = deque(maxlen=buffer_size)

    def build_model(self):
        """Create the placeholders, Q-value layer, loss and training op.

        Resets the default graph first, so any previously built graph in the
        default graph is discarded.
        """
        tf.reset_default_graph()
        self.state_in = tf.placeholder(tf.int32, shape=[None])
        self.action_in = tf.placeholder(tf.int32, shape=[None])
        self.target_in = tf.placeholder(tf.float32, shape=[None])

        self.state = tf.one_hot(self.state_in, depth=self.state_size)
        self.action = tf.one_hot(self.action_in, depth=self.action_size)

        # One Q-value per action for each state in the batch.
        self.q_state = tf.layers.dense(self.state, units=self.action_size, name="q_table")
        # Mask with the one-hot action to pick out the Q-value of the action taken.
        self.q_action = tf.reduce_sum(tf.multiply(self.q_state, self.action), axis=1)

        self.loss = tf.reduce_sum(tf.square(self.target_in - self.q_action))
        self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

    def get_action(self, state):
        """Return an epsilon-greedy action for a single ``state``.

        With probability ``self.eps`` the random action from the base class
        is returned; otherwise the action with the highest predicted Q-value.
        """
        q_state = self.sess.run(self.q_state, feed_dict={self.state_in: [state]})
        action_greedy = np.argmax(q_state)
        action_random = super().get_action(state)
        return action_random if random.random() < self.eps else action_greedy

    def train(self, experience, batch_size=50):
        """Store ``experience`` and run one optimisation step on a replay batch.

        Args:
            experience: ``(state, action, next_state, reward, done)`` tuple.
            batch_size: number of transitions sampled (with replacement) from
                the replay buffer; the new ``experience`` is always added to
                the batch as well.
        """
        self.replay_buffer.append(experience)
        samples = random.choices(self.replay_buffer, k=batch_size)
        # zip(experience, *samples) transposes [experience] + samples into one
        # column per field, so the newest transition is always trained on.
        state, action, next_state, reward, done = (
            list(col) for col in zip(experience, *samples)
        )

        q_next = self.sess.run(self.q_state, feed_dict={self.state_in: next_state})
        # Transitions that ended an episode must not bootstrap from next_state.
        terminal = np.asarray(done, dtype=bool)
        q_next[terminal] = 0.0
        q_target = np.asarray(reward) + self.discount_rate * np.max(q_next, axis=1)

        feed = {self.state_in: state, self.action_in: action, self.target_in: q_target}
        self.sess.run(self.optimizer, feed_dict=feed)

        # Decay exploration once per finished episode (done flag of the new transition).
        if experience[4]:
            self.eps = self.eps * self.eps_decay

    def __del__(self):
        """Close the TensorFlow session if one was opened."""
        sess = getattr(self, "sess", None)
        if sess is not None:
            sess.close()
        
# Instantiate the agent for `env`; the constructors print the action and state sizes.
agent = QNAgent(env)

Action size: 4
State size: 16


In [15]:
NUM_EPISODES = 100   # number of episodes to train for
STEP_DELAY_S = 0.05  # pause after each step so the rendered output is watchable

# Handle to the weight matrix of the agent's "q_table" dense layer. The lookup
# does not change between steps, so do it once instead of on every step.
with tf.variable_scope("q_table", reuse=True):
    q_table_kernel = tf.get_variable("kernel")

total_reward = 0
for ep in range(NUM_EPISODES):
    state = env.reset()
    done = False
    while not done:
        # Act, observe the transition, and train on it immediately.
        action = agent.get_action(state)
        next_state, reward, done, info = env.step(action)
        agent.train((state,action,next_state,reward,done))
        state = next_state
        total_reward += reward

        # Live view: latest step, running totals, the board and current weights.
        print("s:", state, "a:", action)
        print("Episode: {}, Total reward: {}, eps: {}".format(ep,total_reward,agent.eps))
        env.render()
        weights = agent.sess.run(q_table_kernel)
        print(weights)
        time.sleep(STEP_DELAY_S)
        # wait=True defers clearing until new output arrives.
        clear_output(wait=True)

s: 5 a: 2
Episode: 99, Total reward: 1.0, eps: 0.36603234127322926
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
[[ 1.42852990e-02  1.44654587e-01  1.93341866e-01  1.32229239e-01]
 [ 2.82647703e-02 -2.18661115e-01  1.99018955e-01  1.41097531e-01]
 [ 2.08945479e-02  1.03275232e-01  2.09479615e-01  1.47364572e-01]
 [ 3.09499707e-02 -2.14844987e-01  1.93277285e-01  1.39716119e-01]
 [ 7.99230300e-03  1.33829847e-01 -1.93122506e-01  1.37275010e-01]
 [-2.68103689e-01  3.11622441e-01 -3.33666801e-04  2.86950827e-01]
 [-3.20300132e-01  1.09595485e-01  5.06092748e-03 -1.71940044e-01]
 [-1.78681612e-01  1.44309223e-01 -5.13570070e-01  1.25464976e-01]
 [-2.03515843e-01 -1.99462757e-01 -8.49921852e-02  1.26064643e-01]
 [-5.95885003e-03  1.88965365e-01 -4.56754446e-01 -1.23095676e-01]
 [-1.54008074e-02 -2.71095306e-01  3.68575356e-03 -5.80745041e-02]
 [-5.45153320e-02  2.23692417e-01  2.16812313e-01 -1.92574382e-01]
 [ 4.03243065e-01 -3.30478191e-01  8.52910876e-02 -2.62093693e-01]
 [-2.80162841e-02  1.75