# Manually updating the Q value

In [1]:
import gymnasium as gym
import numpy as np

In [2]:
env = gym.make("MountainCar-v0")  # a very simple application
done = False

# Resets the state of the environment and returns an initial observation.
env.reset()  # 2 observables - position and velocity

# number of allowed actions  - 3: left push, no movement, right push
# print(env.action_space.n)

# there are only two observables - position and velocity
# print(env.observation_space.high)  # the high values of the observations
# print(env.observation_space.low)  # the low values

# the range of values for observation 1 is 0.6 to -1.2
# and similarly for observation 2 its 0.07 to -0.07
# we can segregate the values in 20 chunks (can be any value)

DISCRETE_OS_SIZE = [20] * len(env.observation_space.high)  # will give out 20*20 list

# see how big is the range for each of the 20 different buckets
discrete_os_win_size = (env.observation_space.high -
                        env.observation_space.low) / DISCRETE_OS_SIZE

LEARNING_RATE = 0.01
DISCOUNT = 0.95  # how important we find the new future actions are ; future reward over current reward
EPISODES = 2000
render = True

# even though the solution might have been found, we still wish to look for other solutions
epsilon = 0.5  # 0-1 ; higher it is, more likely for it to perform something random action
START_EPSILON_DECAYING = 1
# python2 style division - gives only int values
END_EPSILON_DECAYING = EPISODES // 2
epsilon_decay_value = epsilon / (END_EPSILON_DECAYING - START_EPSILON_DECAYING)

# Q learning
# so we will have now a table such that each row will have 400 (20*20) rows for the possible state the agent can be in
# and 3 columns for the 3 possible actions
# the agent will see which state it is in and take the action corresponding to the largest Q value

# Create a randomised q_table and agent will update it after exploring the environment
q_table = np.random.uniform(
    low=-2, high=0, size=(DISCRETE_OS_SIZE + [env.action_space.n]))



In [3]:
def get_discrete_state(state):
    """Discretizes continuous state into discrete bins."""
    
    discrete_state = (state - env.observation_space.low) / discrete_os_win_size
    discrete_state = tuple(discrete_state.astype(int))  # use int instead of np.int
    
    assert all(0 <= s < DISCRETE_OS_SIZE[i] for i, s in enumerate(discrete_state)), f"State {discrete_state} out of bounds"
    return discrete_state

In [4]:
# modified version

for ep in range(EPISODES):
    done = False
    state, _ = env.reset()
    discrete_state = get_discrete_state(state)

    while not done:
        if np.random.random() > epsilon:
            action = np.argmax(q_table[discrete_state])  # Exploit
        else:
            action = np.random.randint(0, env.action_space.n)  # Explore

        # Sanity check for action
        assert 0 <= action < env.action_space.n, f"Action {action} out of bounds"

        # Step through the environment
        #print(env.step(action) ) 
        
        new_state, reward, done, info , _  = env.step(action)
        new_state = new_state
        new_discrete_state = get_discrete_state(new_state)

        # Sanity check for discrete states and new discrete state
        assert all(0 <= s < DISCRETE_OS_SIZE[i] for i, s in enumerate(new_discrete_state)), f"New state {new_discrete_state} out of bounds"

        if not done:
            max_future_q = np.max(q_table[new_discrete_state])
            current_q = q_table[discrete_state + (action,)]
            new_q = (1 - LEARNING_RATE) * current_q + LEARNING_RATE * (reward + DISCOUNT * max_future_q)
            q_table[discrete_state + (action,)] = new_q
        elif new_state[0] >= env.goal_position:
            q_table[discrete_state + (action,)] = 0

        discrete_state = new_discrete_state

    if END_EPSILON_DECAYING >= ep >= START_EPSILON_DECAYING:
        epsilon -= epsilon_decay_value

  logger.warn(
