In [1]:

# Virtual display
from pyvirtualdisplay import Display
import gym
import pygame
import numpy as np
import random
from tqdm import trange, tqdm
import time

pygame 2.1.3 (SDL 2.28.4, Python 3.9.16)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [2]:
# Deterministic (non-slippery) FrozenLake on the 4x4 map.
env = gym.make("FrozenLake-v1", map_name="4x4", is_slippery=False)

# Action space: number of discrete moves available to the agent.
action_space = env.action_space.n
print("Action space ", action_space)
print("Action sample ", env.action_space.sample())

# State space: number of discrete observations, one per cell of the 4x4 grid.
state_space = env.observation_space.n
print("State space ", state_space)
print("State sample ", env.observation_space.sample())

Action space  4
Action sample  3
State space  16
State sample  1


In [3]:
def init_qtable(state_space, action_space):
    """Build an all-zero Q-table.

    Args:
        state_space: Number of rows (one per state).
        action_space: Number of columns (one per action).

    Returns:
        A NumPy array of zeros with shape ``(state_space, action_space)``.
    """
    table_shape = (state_space, action_space)
    return np.zeros(table_shape)

def epsilon_greedy_policy(qtable, env, state, epsilon):
    """Choose an action for ``state`` with an epsilon-greedy rule.

    Args:
        qtable: Q-table indexed as ``qtable[state]`` -> per-action values.
        env: Environment whose ``action_space.sample()`` supplies random actions.
        state: Index of the current state (row of ``qtable``).
        epsilon: Exploration threshold; the random action is taken whenever
            the uniform draw does not exceed it.

    Returns:
        The greedy action (``np.argmax`` of the state's row) when the draw is
        above ``epsilon``, otherwise a sample from ``env.action_space``.
    """
    # Draw between 0 and 1 so that epsilon acts as the exploration probability;
    # a draw over (-1, 1) would explore far more often than epsilon says.
    random_init = random.uniform(0, 1)
    if random_init > epsilon:
        action = np.argmax(qtable[state])
    else:
        # The sampler lives on `action_space` (there is no `action_state`).
        action = env.action_space.sample()
    return action

def greedy_policy(qtable, state):
    """Return the index of the highest-valued action in ``qtable[state]``."""
    return np.argmax(qtable[state])


In [4]:
# Fresh all-zero Q-table sized from the environment's state/action counts.
qtable = init_qtable(state_space=state_space, action_space=action_space)
print(qtable.shape)

(16, 4)


In [37]:
# --- Reproducibility ---
# Seed the exploration coin-flip (`random`) up front; the environment's
# action sampler is seeded below once the env exists.
SEED = 42
random.seed(SEED)

# --- Training hyperparameters ---
n_training_eps = 10000   # number of training episodes passed to train()
learning_rate = 0.07     # step size of the Q-learning update (read by train())
n_eval_eps = 100         # NOTE(review): not used in the visible cells; presumably evaluation episodes

# --- Environment parameters ---
env_id = 'FrozenLake-v1'
max_steps = 99           # cap on steps per training episode
gamma = 0.95             # discount factor of the Q-learning update (read by train())
eval_seed = []           # NOTE(review): not used in the visible cells; presumably evaluation seeds

# --- Exploration schedule: epsilon decays from max_epsilon towards min_epsilon ---
max_epsilon = 1.0
min_epsilon = 0.005
decay_rate = 0.0005

env = gym.make(env_id, map_name="4x4", is_slippery=False)
# Make env.action_space.sample() (used for exploration) repeatable as well.
env.action_space.seed(SEED)
state_space = env.observation_space.n
action_space = env.action_space.n

# Start training from a fresh all-zero Q-table.
qtable = init_qtable(state_space, action_space)
print(qtable.shape)

env.render()

(16, 4)

[41mS[0mFFF
FHFH
FFFH
HFFG


In [38]:
def epsilon_greedy_policy(qtable, env, state, epsilon):
    """Epsilon-greedy action selection.

    Draws one uniform number between 0 and 1. If the draw is above
    ``epsilon`` the greedy action for ``state`` is returned; otherwise a
    random action is sampled from ``env.action_space``.
    """
    draw = random.uniform(0, 1)
    exploit = draw > epsilon
    if not exploit:
        return env.action_space.sample()
    return np.argmax(qtable[state])

def greedy_policy(qtable, state):
    """Pick the action with the largest Q-value for ``state``."""
    action_values = qtable[state]
    return np.argmax(action_values)

    
def train(n_training_eps, min_epsilon, max_epsilon, decay_rate, env, max_steps, qtable,
          learning_rate=None, gamma=None):
    """Run tabular Q-learning on ``env``, updating ``qtable`` in place.

    Expects the classic Gym API: ``env.reset()`` returns the observation and
    ``env.step(action)`` returns ``(new_state, reward, done, info)``.

    Args:
        n_training_eps: Number of training episodes.
        min_epsilon: Floor of the exploration rate.
        max_epsilon: Exploration rate at episode 0.
        decay_rate: Exponential decay rate applied per episode.
        env: Environment to interact with.
        max_steps: Maximum number of steps per episode.
        qtable: Q-table indexed ``[state][action]``; modified in place.
        learning_rate: Step size of the update. When ``None`` the
            notebook-level ``learning_rate`` variable is used.
        gamma: Discount factor. When ``None`` the notebook-level ``gamma``
            variable is used.

    Returns:
        None. The learned values are written into ``qtable``.
    """
    # Fall back to the notebook-level hyperparameters so existing calls that
    # do not pass these arguments behave exactly as before.
    if learning_rate is None:
        learning_rate = globals()["learning_rate"]
    if gamma is None:
        gamma = globals()["gamma"]

    # tqdm takes its length from the range directly; no list copy is needed.
    loop = tqdm(range(n_training_eps))
    print('qtable shape =', qtable.shape)

    for ep in loop:
        # Exploration rate decays exponentially from max_epsilon to min_epsilon.
        epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * ep)
        state = env.reset()

        for _ in range(max_steps):
            action = epsilon_greedy_policy(qtable, env, state, epsilon)
            new_state, reward, done, info = env.step(action)

            # Q(s, a) <- Q(s, a) + lr * (r + gamma * max_a' Q(s', a') - Q(s, a))
            td_target = reward + gamma * np.max(qtable[new_state])
            qtable[state][action] = qtable[state][action] + \
                learning_rate * (td_target - qtable[state][action])

            if done:
                break

            state = new_state

        loop.set_description(f"ep = {ep}, eposilon = {epsilon:.2f}")

# Learn the Q-table in place using the hyperparameters defined in the config cell.
train(
    n_training_eps=n_training_eps,
    min_epsilon=min_epsilon,
    max_epsilon=max_epsilon,
    decay_rate=decay_rate,
    env=env,
    max_steps=max_steps,
    qtable=qtable,
)



ep = 164, eposilon = 0.92:   1%|          | 82/10000 [00:00<00:12, 811.53it/s]

qtable shape = (16, 4)


ep = 9999, eposilon = 0.01: 100%|██████████| 10000/10000 [00:10<00:00, 985.22it/s]


In [39]:
# Inspect the learned Q-table: one row per state, one column per action.
print(qtable)

[[0.73509189 0.77378094 0.6983373  0.73509189]
 [0.73509189 0.         0.64252666 0.68996204]
 [0.69451544 0.37632231 0.18956545 0.40422976]
 [0.43732229 0.         0.00238698 0.00489795]
 [0.77378094 0.81450625 0.         0.73509189]
 [0.         0.         0.         0.        ]
 [0.         0.86608611 0.         0.3246072 ]
 [0.         0.         0.         0.        ]
 [0.81450625 0.         0.857375   0.77378094]
 [0.81450625 0.9025     0.9025     0.        ]
 [0.8509721  0.95       0.         0.75349534]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.9025     0.95       0.857375  ]
 [0.9025     0.95       1.         0.9025    ]
 [0.         0.         0.         0.        ]]


In [40]:
# Begin a new episode; reset() hands back the starting observation.
state = env.reset()

In [41]:
# Show the observation currently held in `state`.
print(state)

0


In [42]:
# Draw the current grid as text.
env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG


In [43]:
# Replay one episode with the learned greedy policy.
# Reset here so the cell is self-contained: re-running it must not continue
# from the terminal state left behind by a previous rollout.
state = env.reset()
done = False
print("Initial state")
env.render()
while not done:
    # Always take the best-known action; no exploration during the replay.
    action = greedy_policy(qtable, state)
    state, reward, done, info = env.step(action)
    env.render()
    time.sleep(1)  # pause so each rendered move can be followed by eye

print("DONE")

Initial state

[41mS[0mFFF
FHFH
FFFH
HFFG
  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Down)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Right)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG
  (Right)
SFFF
FHFH
FFFH
HF[41mF[0mG
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m
DONE
