In [1]:
import numpy as np
import gym
import random
import time


In [88]:
# Build the 4x4 Frozen Lake environment with slipping disabled.
env = gym.make(
    "FrozenLake-v1",
    map_name="4x4",
    is_slippery=False,
)

## Environment

![Frozen Lake](Frozen%20Lake.png)

In [89]:
# Display the map layout: a 2-D array holding one byte-encoded tile letter per grid cell.
env.desc

array([[b'S', b'F', b'F', b'F'],
       [b'F', b'H', b'F', b'H'],
       [b'F', b'F', b'F', b'H'],
       [b'H', b'F', b'F', b'G']], dtype='|S1')

## Actions
0: Move left <br/>
1: Move down <br/>
2: Move right <br/>
3: Move up <br/>

In [90]:
# Read the number of discrete actions and states from the environment's spaces.
action_size, state_size = env.action_space.n, env.observation_space.n
# Show the state count as the cell's output.
state_size

16

In [91]:
# --- Training schedule ---
total_episodes = 15_000   # number of training episodes
max_steps = 99            # cap on the steps taken within one episode

# --- Q-learning update ---
learning_rate = 0.8       # step size applied to each Q-value correction
gamma = 0.95              # discount factor for future reward

# --- Epsilon-greedy exploration ---
max_epsilon = 1.0         # exploration probability at the start of training
min_epsilon = 0.01        # floor that the exploration probability decays towards
decay_rate = 0.005        # rate of the exponential epsilon decay
epsilon = max_epsilon     # current exploration rate; begins at the maximum

In [92]:
# Tabular Q-learning with an epsilon-greedy behaviour policy.
# Produces `qtable` (learned action values) and `rewards` (total reward per episode).

# Total reward collected in each episode, in training order.
rewards = []
# Start from an all-zero Q-table: one row per state, one column per action.
qtable = np.zeros((state_size, action_size))
# Restart the exploration schedule together with the Q-table. Without this,
# re-running the cell would begin with the small epsilon left over from the
# previous run and exploit an empty table almost exclusively.
epsilon = max_epsilon

for episode in range(total_episodes):
    # Begin every episode from a freshly reset environment.
    state, info = env.reset()
    state = int(state)
    done = False
    total_rewards = 0

    for step in range(max_steps):
        # Epsilon-greedy choice: draw a number in [0, 1] and compare it with epsilon.
        exp_exp_tradeoff = random.uniform(0, 1)

        if exp_exp_tradeoff > epsilon:
            # Exploitation: take the action with the largest Q-value in this state.
            action = np.argmax(qtable[state, :])
        else:
            # Exploration: take a uniformly sampled action.
            action = env.action_space.sample()

        # Apply the action and observe the next state, the reward and both end-of-episode flags.
        new_state, reward, terminated, truncated, info = env.step(action)

        # Q(s,a) := Q(s,a) + lr * [R(s,a) + gamma * max_a' Q(s',a') - Q(s,a)]
        qtable[state, action] = qtable[state, action] + learning_rate * (
            reward + gamma * np.max(qtable[new_state, :]) - qtable[state, action]
        )
        total_rewards += reward

        # The observed state becomes the current state for the next step.
        state = new_state

        # Stop on a terminal state or when the environment cuts the episode short.
        done = bool(terminated or truncated)
        if done:
            break

    # Decay epsilon exponentially towards min_epsilon: less exploration as training progresses.
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)
    rewards.append(total_rewards)



  if not isinstance(terminated, (bool, np.bool8)):


In [82]:
# Walk a hand-picked action sequence from the reset state and print every transition.
# Per the action legend: 1 = down, 2 = right.
state, info = env.reset()
for chosen_action in (1, 1, 2, 1, 2, 2):
    new_state, reward, done, _, info = env.step(chosen_action)
    print(f"New State:{new_state}  , Reward:{reward}  , Done:{done}")

New State:4  , Reward:0.0  , Done:False
New State:8  , Reward:0.0  , Done:False
New State:9  , Reward:0.0  , Done:False
New State:13  , Reward:0.0  , Done:False
New State:14  , Reward:0.0  , Done:False
New State:15  , Reward:1.0  , Done:True


In [97]:
# Replay the learned greedy policy in a human-rendered environment.
# NOTE: this rebinds `env` to the rendering environment, which is closed at the
# end of the cell; re-create the training environment before training again.
env = gym.make('FrozenLake-v1', map_name="4x4", is_slippery=False, render_mode="human")
try:
    state, _ = env.reset()
    time.sleep(2)  # brief pause before the rollout starts
    done = False
    while not done:
        env.render()  # Render the environment
        # Always act greedily with respect to the learned Q-values.
        action = np.argmax(qtable[state])
        state, _, terminated, truncated, _ = env.step(action)
        # End on a terminal state OR a cut-off episode; watching only the
        # terminal flag loops forever when the policy never reaches one.
        done = bool(terminated or truncated)
finally:
    # Release the rendering window even if the rollout raises.
    env.close()
