In [2]:
import gym
import math
import numpy as np
 
env=gym.make("FrozenLake-v1", render_mode="human", is_slippery=False)
env.reset()
# render the environment
env.render()

In [3]:
# observation space - states 
env.observation_space

Discrete(16)

In [4]:
actions = {0: "left",
          1: "down",
          2: "right",
          3: "up"}

def is_square(x):
    square_root = math.sqrt(x)
    return False if square_root - (square_root // 1) > 0 else True

def visualize_actions(prev_a, next_a, action):
    if not is_square(env.observation_space.n):
        return Exception("Observation space is not a square grid")
    else:
        n = int(math.sqrt(env.observation_space.n))
    
    next_obs = next_a
    prev_obs = prev_a if prev_a else None
    
    grid = []
    for i in range(n):
        inner_grid = []
        for j in range(n):
            curr_tile = (i * 4 + j)
            to_append = ""
            if next_obs == curr_tile:
                to_append = " X "
            if prev_obs == curr_tile:
                if action == 0:
                    to_append = " ← "
                if action == 1:
                    to_append = " ↓ "
                if action == 2:
                    to_append = " → "
                if action == 3:
                    to_append = " ↑ "
            if next_obs != curr_tile and prev_obs != curr_tile:
                to_append = " O "
            inner_grid.append(to_append)
                    
        grid.append(inner_grid)
        print(inner_grid)
    
    return grid

In [5]:
# actions: left: 0, down: 1, right: 2, up: 3
# Discrete(N) generates a sequence of integers from 0 to N-1 with a step of 1
# Therefore, Discrete(4) is equivalant to {0, 1, 2, 3}
print(env.action_space)

Discrete(4)


In [6]:
env.reset()
# .sample() can be used on both Discrete and Space objects to choose a sample randomly
# accepts "mask" as a parameter which blocks out some options for sampling eg. [0, 1, 1] blocks the 1st option

# format of returnValue is (observation,reward, terminated, truncated, info)
# observation (object)  - observed state
# reward (float)        - reward that is the result of taking the action
# terminated (bool)     - is it a terminal state
# truncated (bool)      - it is not important in our case
# info (dictionary)     - in our case transition probability

prev_action = None
for i in range(50):
    print(f"----------STEP {i}-----------")
    if i > 0:
        prev_action = return_value
    random_action= env.action_space.sample()
    return_value = env.step(random_action)
    visualize_actions(prev_action, return_value, random_action)
    if return_value[2]:

        if return_value[1] == 0:
            print("DEATH")
        else:
            print("VICTORY")
        break
 
env.render()
return_value

----------STEP 0-----------


  if not isinstance(terminated, (bool, np.bool8)):


[' O ', ' O ', ' O ', ' O ']
[' O ', ' O ', ' O ', ' O ']
[' O ', ' O ', ' O ', ' O ']
[' O ', ' O ', ' O ', ' O ']
----------STEP 1-----------
[' O ', ' O ', ' O ', ' O ']
[' O ', ' O ', ' O ', ' O ']
[' O ', ' O ', ' O ', ' O ']
[' O ', ' O ', ' O ', ' O ']
DEATH


(5, 0.0, True, False, {'prob': 1.0})

In [7]:
import numpy as np

In [8]:
class QTableAgent:

    def __init__(self, n_states, n_actions, epsilon, learning_rate, discount_value):
        self.learning_rate = learning_rate
        self.discount_value = discount_value
        self.n_states = n_states
        self.n_actions = n_actions
        self.epsilon = epsilon
        self.initialize_q_table(n_states, n_actions)
    
    
    # This is the policy
    def choose_best_action(self, state):
        '''parameters: state 
        returns: action'''
        # Always need choose_action
        return np.argmax(self.q_table[state]) # <-- returns the index with the highest action value for that state
    
    def choose_action(self, state): # Policy
        if np.random.rand(1) < self.epsilon:
            action = np.random.randint(0, self.n_actions)
        else:
            action = self.choose_best_action(state)
            
        return action
    
        
    def initialize_q_table(self, n_states, n_actions):
        self.q_table = np.zeros((n_states, n_actions))
        
       
    
    # HOMEWORK
    def bellman_equation(self, state, action, next_state, reward):
        current_q = self.q_table[state][action]
        max_q_next_state = np.max(self.q_table[next_state])
        new_q =  current_q + self.learning_rate * (reward + (self.discount_value * max_q_next_state) - current_q)
        # single step forecasting?
        # value of state = current value + immediate reward + negative or positive expected reward for that state
        # reward - immediate reward for moving to next_state? 0 for everything except the goal node (1)
        # max_q_next_state - max value of state for all of its actions?
        return new_q
        
    
    def update_q_table(self, state, action, next_state, reward):
        new_q = self.bellman_equation(state, action, next_state, reward)
        self.q_table[state][action] = new_q
        
    
        


In [31]:
env.P

{0: {0: [(1.0, 0, 0.0, False)],
  1: [(1.0, 4, 0.0, False)],
  2: [(1.0, 1, 0.0, False)],
  3: [(1.0, 0, 0.0, False)]},
 1: {0: [(1.0, 0, 0.0, False)],
  1: [(1.0, 5, 0.0, True)],
  2: [(1.0, 2, 0.0, False)],
  3: [(1.0, 1, 0.0, False)]},
 2: {0: [(1.0, 1, 0.0, False)],
  1: [(1.0, 6, 0.0, False)],
  2: [(1.0, 3, 0.0, False)],
  3: [(1.0, 2, 0.0, False)]},
 3: {0: [(1.0, 2, 0.0, False)],
  1: [(1.0, 7, 0.0, True)],
  2: [(1.0, 3, 0.0, False)],
  3: [(1.0, 3, 0.0, False)]},
 4: {0: [(1.0, 4, 0.0, False)],
  1: [(1.0, 8, 0.0, False)],
  2: [(1.0, 5, 0.0, True)],
  3: [(1.0, 0, 0.0, False)]},
 5: {0: [(1.0, 5, 0, True)],
  1: [(1.0, 5, 0, True)],
  2: [(1.0, 5, 0, True)],
  3: [(1.0, 5, 0, True)]},
 6: {0: [(1.0, 5, 0.0, True)],
  1: [(1.0, 10, 0.0, False)],
  2: [(1.0, 7, 0.0, True)],
  3: [(1.0, 2, 0.0, False)]},
 7: {0: [(1.0, 7, 0, True)],
  1: [(1.0, 7, 0, True)],
  2: [(1.0, 7, 0, True)],
  3: [(1.0, 7, 0, True)]},
 8: {0: [(1.0, 8, 0.0, False)],
  1: [(1.0, 12, 0.0, True)],
  2: [(

In [1]:
episodes = 100000

start_epsilon = 0.5
min_epsilon = 0.1
epsilon_reduction = 0.1
epsilon_decay = 1000


learning_rate = 0.1
discount_rate = 0.95

render_decay = 10000

agent = QTableAgent(env.observation_space.n, env.action_space.n, start_epsilon, learning_rate, discount_rate)

for episode in range(episodes):
    
    if (episode + 1) % render_decay == 0:
        env=gym.make("FrozenLake-v1", render_mode="human", is_slippery=False)
        visualize_actions
    else:
        env=gym.make("FrozenLake-v1", render_mode="rgb_array", is_slippery=False)
        
    if (episode + 1) % epsilon_decay == 0 and (agent.epsilon - epsilon_reduction) > min_epsilon:
        agent.epsilon -= epsilon_reduction
    
    state, _ = env.reset()
    terminated, truncated = False, False
    
    while not terminated and not truncated:
        
      
        action = agent.choose_action(state)
    
        new_state, reward, terminated, truncated, info = env.step(action)
        agent.update_q_table(state, action, new_state, reward)
        
        if episode == episodes - 1:
            visualize_actions(state, new_state, action)
            print("----------------")
            
        state = new_state
            
        
    
    

NameError: name 'QTableAgent' is not defined

In [12]:
agent.q_table

array([[0.73509189, 0.77378094, 0.77378094, 0.73509189],
       [0.73509189, 0.        , 0.81450625, 0.77378094],
       [0.77378094, 0.857375  , 0.77378094, 0.81450625],
       [0.81450625, 0.        , 0.73333693, 0.73457744],
       [0.77378094, 0.81450625, 0.        , 0.73509189],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.9025    , 0.        , 0.81450625],
       [0.        , 0.        , 0.        , 0.        ],
       [0.81450625, 0.        , 0.857375  , 0.77378094],
       [0.81450625, 0.9025    , 0.9025    , 0.        ],
       [0.857375  , 0.95      , 0.        , 0.857375  ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.9025    , 0.95      , 0.857375  ],
       [0.9025    , 0.95      , 1.        , 0.9025    ],
       [0.        , 0.        , 0.        , 0.        ]])

In [35]:
env.step(action)

(1, 0.0, False, False, {'prob': 1.0})

In [None]:
# For large state-action spaces, deep learning must be used to estimate the value function