In [3]:
!pip install numpy
!pip install gym

Collecting gym
  Downloading gym-0.26.2.tar.gz (721 kB)
     ---------------------------------------- 0.0/721.7 kB ? eta -:--:--
      --------------------------------------- 10.2/721.7 kB ? eta -:--:--
      --------------------------------------- 10.2/721.7 kB ? eta -:--:--
     ---- -------------------------------- 92.2/721.7 kB 581.0 kB/s eta 0:00:02
     ------ ----------------------------- 122.9/721.7 kB 654.9 kB/s eta 0:00:01
     ------- ---------------------------- 143.4/721.7 kB 607.9 kB/s eta 0:00:01
     -------- --------------------------- 174.1/721.7 kB 615.9 kB/s eta 0:00:01
     ---------- ------------------------- 204.8/721.7 kB 621.6 kB/s eta 0:00:01
     ----------- ------------------------ 235.5/721.7 kB 600.7 kB/s eta 0:00:01
     ------------- ---------------------- 276.5/721.7 kB 655.8 kB/s eta 0:00:01
     --------------- -------------------- 317.4/721.7 kB 655.0 kB/s eta 0:00:01
     ----------------- ------------------ 358.4/721.7 kB 696.3 kB/s eta 0:00:01
   

In [1]:
import gym
import random
import numpy as np
import time
from gym.envs.registration import register
from IPython.display import clear_output

In [2]:
# Register environment: a 4x4 FrozenLake variant with slipping disabled.
try:
    register(
        id='FrozenLakeNoSlip-v0',
        entry_point='gym.envs.toy_text:FrozenLakeEnv',
        kwargs={'map_name' : '4x4', 'is_slippery':False},
        max_episode_steps=100,
        reward_threshold=0.78, 
    )
except gym.error.Error:
    # Re-running this cell registers the same id again; gym releases that
    # forbid re-registration report that as gym.error.Error, which is safe to
    # ignore. Any other failure (bad kwargs, bad entry point, Ctrl-C) is no
    # longer swallowed and will surface here instead of later in gym.make().
    pass

env_name = "FrozenLakeNoSlip-v0"
# Create environment
env = gym.make(env_name)
print("Observation space:", env.observation_space)
print("Action space:", env.action_space)
# Last expression of the cell: displays the class of the action space.
type(env.action_space)

Observation space: Discrete(16)
Action space: Discrete(4)


gym.spaces.discrete.Discrete

In [3]:
# Agent
class Agent():
    """Baseline agent that ignores the state and acts uniformly at random."""

    def __init__(self, env):
        """Remember how many discrete actions ``env`` offers.

        Reads ``env.action_space.n``, stores it as ``self.action_size`` and
        prints it.
        """
        n_actions = env.action_space.n
        self.action_size = n_actions
        print("Action size:", self.action_size)

    def get_action(self, state):
        """Return a random action index in ``[0, self.action_size)``.

        ``state`` is accepted for interface compatibility but is not used.
        """
        candidates = range(self.action_size)
        return random.choice(candidates)

In [4]:
# Agent Q-learning
class QL_Agent(Agent):
    """Tabular Q-learning agent with epsilon-greedy exploration.

    Keeps a ``state_size x action_size`` table of Q-values, explores with
    probability ``eps`` and multiplies ``eps`` by ``eps_decay`` every time an
    episode ends.
    """

    def __init__(self, env, discount_rate=0.97, learning_rate=0.01,
                 eps=1.0, eps_decay=0.99):
        """Set up hyper-parameters and the Q-table.

        Args:
            env: environment exposing ``observation_space.n`` and
                ``action_space.n``.
            discount_rate: factor applied to the best next-state Q-value when
                building the update target.
            learning_rate: step size applied to the temporal-difference error.
            eps: initial probability of taking a random action.
            eps_decay: multiplier applied to ``eps`` after each finished
                episode.
        """
        super().__init__(env)
        self.state_size = env.observation_space.n
        print("State size:", self.state_size)
        self.eps = eps
        self.eps_decay = eps_decay
        self.discount_rate = discount_rate
        self.learning_rate = learning_rate
        self.init_Q_table()
        
    def init_Q_table(self):
        """(Re)initialise the Q-table with small random values in [0, 1e-4)."""
        self.q_table = 1e-4*np.random.random([self.state_size, self.action_size])
    
    def get_action(self, state):
        """Pick an action for ``state`` using the epsilon-greedy rule.

        With probability ``self.eps`` a uniformly random action (from the base
        class) is returned; otherwise the action with the highest Q-value for
        ``state`` is returned.
        """
        # Decide first, then compute only the branch that is actually needed.
        if random.random() < self.eps:
            return super().get_action(state)
        return np.argmax(self.q_table[state])
    
    def train(self, experience):
        """Apply one Q-learning update from a single transition.

        Args:
            experience: tuple ``(state, action, next_state, reward, done)``.
                When ``done`` is true the next state contributes no future
                value to the target and the exploration rate is decayed.
        """
        state, action, next_state, reward, done = experience
        
        # A finished episode has no future return to bootstrap from.
        best_next = 0.0 if done else np.max(self.q_table[next_state])
        q_target = reward + self.discount_rate * best_next
        
        # Temporal-difference error between the target and the current estimate.
        td_error = q_target - self.q_table[state,action]
        self.q_table[state,action] += self.learning_rate * td_error
        
        if done:
            # Decay exploration once per finished episode.
            self.eps = self.eps * self.eps_decay
        
# Build the Q-learning agent for the environment created earlier in the
# notebook; the constructor prints the action and state sizes it reads from it.
agent = QL_Agent(env)

Action size: 4
State size: 16


In [6]:
def _reset_env(env):
    """Reset ``env`` and return only the initial observation.

    Accepts both reset conventions: a bare observation, or an
    ``(observation, info)`` pair as returned by newer gym releases.
    """
    result = env.reset()
    if isinstance(result, tuple) and len(result) == 2 and isinstance(result[1], dict):
        return result[0]
    return result


def _step_env(env, action):
    """Step ``env`` and always return ``(next_state, reward, done, info)``.

    Newer gym releases return a 5-tuple with separate ``terminated`` and
    ``truncated`` flags; they are merged into a single ``done`` flag so the
    loop below sees the same shape as with the older 4-tuple API.
    """
    result = env.step(action)
    if len(result) == 5:
        next_state, reward, terminated, truncated, info = result
        return next_state, reward, terminated or truncated, info
    return result


# Number of training episodes to run.
NUM_EPISODES = 600

# Reward accumulated over all episodes of this run.
total_reward = 0
for i in range(NUM_EPISODES):
    state = _reset_env(env)
    done = False
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, info = _step_env(env, action)
        agent.train((state,action,next_state,reward,done))

        state = next_state
        total_reward += reward
        print("state:", state, "action:", action)
        print(f"Epi: {i}, Total reward: {total_reward}, eps: {agent.eps}")
        # Render the current state of the environment. Some render modes
        # return the frame as text instead of printing it themselves.
        frame = env.render()
        if isinstance(frame, str):
            print(frame)
        print(agent.q_table)
        # Pause briefly, then wipe the cell output so the next step replaces it.
        time.sleep(0.0005)
        clear_output(wait=True)

state: 15 action: 2
Epi: 599, Total reward: 600.0, eps: 5.784069691292563e-06
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m
[[5.15658150e-05 5.15465936e-05 5.10592147e-01 5.15508010e-05]
 [5.22086823e-05 5.11224445e-05 6.62569539e-01 5.22046788e-05]
 [5.25093650e-05 7.94972729e-01 5.24980331e-05 5.16834404e-05]
 [5.25528620e-05 5.53758907e-06 4.67136468e-05 4.50667890e-05]
 [5.14719450e-05 5.03645460e-05 4.52841769e-06 2.82080445e-04]
 [5.69536075e-05 1.11663522e-05 2.47261473e-05 5.14765236e-05]
 [3.00038727e-05 8.93012150e-01 5.08411092e-05 5.20720670e-05]
 [7.14208256e-05 4.74147695e-05 1.81176496e-05 7.04379104e-06]
 [6.21222639e-05 4.27941806e-05 6.25375317e-05 6.63711866e-05]
 [1.45230152e-05 4.10699254e-05 4.31795411e-05 4.07701579e-05]
 [6.24889222e-05 9.56730836e-01 3.59510122e-05 6.26212985e-05]
 [2.30306483e-05 8.26036591e-05 6.50437685e-05 3.86002757e-05]
 [6.64501378e-05 4.56855793e-05 3.39967502e-05 6.09526957e-05]
 [2.73572671e-05 2.75624322e-05 9.30193097e-06 3.99285435e-05]
 