In [1]:
import gym
import random
import numpy as np
import time
from gym.envs.registration import register
from IPython.display import clear_output


In [2]:
# Environment selection. Alternatives kept for quick switching:
# env_name = "CartPole-v1"
# env_name = "Acrobot-v1"
# env_name = "MountainCar-v0"
# env_name = "MountainCarContinuous-v0"
# env_name = "FrozenLake-v0"

# Register a 4x4 FrozenLake variant with slipping disabled.
try:
    register(
        id='FrozenLakeNoSlip-v0',
        entry_point='gym.envs.toy_text:FrozenLakeEnv',
        kwargs={'map_name' : '4x4', 'is_slippery':False},
        max_episode_steps=100,
        reward_threshold=0.78, # optimum = .8196
    )
except gym.error.Error:
    # Re-running this cell tries to register the same id a second time;
    # only that gym-level error is ignored so that unrelated failures
    # (typos, interrupts, import problems) still surface.
    pass

env_name = "FrozenLakeNoSlip-v0"
env = gym.make(env_name)

In [3]:
class Agent():
    """Baseline agent that samples actions uniformly at random.

    Attributes set by ``__init__``:
        is_discrete: True when ``env.action_space`` is a gym ``Discrete`` space.
        action_size: number of actions (only set for discrete spaces).
        action_low, action_high, action_shape: bounds and shape read from the
            action space (only set for non-discrete spaces).
    """

    def __init__(self, env):
        """Read the action space of ``env`` and cache what sampling needs.

        Args:
            env: environment object exposing an ``action_space`` attribute.
        """
        # isinstance (instead of an exact type comparison) also accepts
        # subclasses of Discrete.
        self.is_discrete = isinstance(env.action_space,
                                      gym.spaces.discrete.Discrete)
        if self.is_discrete:
            self.action_size = env.action_space.n
            print("Action size:", self.action_size)
        else:
            self.action_low = env.action_space.low
            self.action_high = env.action_space.high
            self.action_shape = env.action_space.shape
            print("Action range:", self.action_low, self.action_high)

    def get_random_action(self, state):
        """Return an action drawn uniformly at random.

        Args:
            state: current observation; not used by the random policy.

        Returns:
            For discrete spaces, an int in ``[0, action_size)``.
            Otherwise an array of shape ``action_shape`` sampled uniformly
            between ``action_low`` and ``action_high``.
        """
        if self.is_discrete:
            return random.randrange(self.action_size)
        return np.random.uniform(self.action_low,
                                 self.action_high,
                                 self.action_shape)

In [4]:
class QAgent(Agent):
    """Tabular Q-learning agent with epsilon-greedy exploration.

    Requires an environment whose observation space exposes ``n`` and whose
    action space is discrete (the Q-table is sized by ``state_size`` x
    ``action_size``).
    """

    def __init__(self, env, discount_rate=0.97, learning_rate=0.01,
                 eps=1.0, eps_decay=0.99):
        """Create the agent and initialise its Q-table.

        Args:
            env: environment with ``observation_space.n`` and an action space
                accepted by ``Agent.__init__``.
            discount_rate: factor applied to the best next-state value.
            learning_rate: step size of each Q-table update.
            eps: initial probability of taking a random action.
            eps_decay: factor ``eps`` is multiplied by after every transition
                that has ``done`` set.
        """
        super().__init__(env)
        self.state_size = env.observation_space.n
        print("State size:", self.state_size)
        self.eps = eps
        self.eps_decay = eps_decay
        self.discount_rate = discount_rate
        self.learning_rate = learning_rate
        self.build_model()

    def build_model(self):
        """Allocate the Q-table: one row per state, one column per action.

        Entries start as small random values in ``[0, 1e-4)``.
        """
        self.q_table = 1e-4 * np.random.random(
            [self.state_size, self.action_size])

    def get_action(self, state):
        """Pick an action epsilon-greedily.

        With probability ``eps`` a random action is returned; otherwise the
        action with the highest Q-value for ``state``.
        """
        # Only sample a random action when it will actually be used.
        if random.random() < self.eps:
            return super().get_random_action(state)
        return np.argmax(self.q_table[state])

    def train(self, experience):
        """Apply one Q-learning update from a single transition.

        Args:
            experience: tuple ``(state, action, next_state, reward, done)``.

        Update rule:
            new_q = old_q + learning_rate * (target - old_q)
            target = reward + discount_rate * max_a Q(next_state, a)
        The bootstrap term is zero when ``done`` is set.
        """
        state, action, next_state, reward, done = experience
        # NOTE(review): if `done` can also be raised by a step limit rather
        # than a true terminal state, the bootstrap is dropped there too.
        next_value = 0.0 if done else np.max(self.q_table[next_state])
        q_target = reward + self.discount_rate * next_value
        td_error = q_target - self.q_table[state, action]
        self.q_table[state, action] += self.learning_rate * td_error
        if done:
            # Explore less as more episodes finish.
            self.eps *= self.eps_decay
# Build the Q-learning agent for `env` with the default hyperparameters.
agent = QAgent(env)

Action size: 4
State size: 16


In [9]:

# Train `agent` for 100 episodes, redrawing the environment and the Q-table
# after every single step so learning can be watched live.
total_reward = 0
for ep in range(100):
    state = env.reset()
    done = False
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, info = env.step(action)

        # Learn from the transition that was just observed.
        transition = (state, action, next_state, reward, done)
        agent.train(transition)

        total_reward += reward
        state = next_state

        # Live diagnostics; `state` printed here is the post-step state.
        print("s:", state, "a:", action)
        print("Episode: {}, Total reward: {}, eps: {}".format(ep,total_reward,agent.eps))
        env.render()
        print(agent.q_table)
        time.sleep(0.05)
        # Replace the previous frame once the next one is ready.
        clear_output(wait=True)

s: 15 a: 2
Episode: 99, Total reward: 97.0, eps: 0.017950553275045134
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m
[[4.82714312e-04 6.17349265e-05 3.73497174e-02 8.20783082e-05]
 [3.72034121e-04 2.44487082e-05 9.67766353e-02 6.42099760e-04]
 [1.06172318e-04 2.19670923e-01 2.09812483e-05 4.12148067e-04]
 [3.16227708e-04 4.43374669e-05 7.34923701e-05 9.04727185e-05]
 [6.80306509e-05 2.52738682e-05 3.10423641e-06 4.26597114e-05]
 [8.66396713e-06 1.12264212e-05 2.97729382e-05 8.11264117e-05]
 [3.38148140e-05 4.23478138e-01 5.52245123e-05 3.98014819e-04]
 [9.44875383e-05 1.95083053e-05 5.70656125e-05 9.24598366e-05]
 [4.56456041e-05 4.51921923e-05 3.48984802e-05 4.02808347e-05]
 [8.13269374e-06 2.90543010e-05 2.51395661e-05 6.26010990e-05]
 [7.13753565e-05 6.79971643e-01 2.67909497e-05 1.19426438e-02]
 [2.30445329e-05 9.17830291e-05 1.93496430e-05 2.37162361e-05]
 [5.63236019e-05 1.82585499e-05 7.87834316e-05 7.26613158e-05]
 [9.67451605e-06 7.33534101e-05 2.95443079e-02 2.77962266e-05]
 [5.63828

In [6]:
# Show the `done` flag currently held in the kernel (assigned inside the
# episode loop from env.step).
done

True