In [1]:
import gym

# Create the MountainCar environment and report the size of its discrete action space.
env = gym.make('MountainCar-v0')

n_actions = env.action_space.n
print(n_actions)

3


In [2]:
# Each observation is [position, velocity]. The Q-table below is indexed by
# integers, so every observation dimension is split into 20 equal-width buckets.
DISCRETE_OS_SIZE = [20 for _ in env.observation_space.high]

# Width of one bucket along each observation dimension.
discrete_os_win_size = (env.observation_space.high - env.observation_space.low) / DISCRETE_OS_SIZE
print(discrete_os_win_size)

[0.09  0.007]


In [3]:
import numpy as np

# Q-table with one entry per (position bucket, velocity bucket, action),
# i.e. shape [20, 20, 3], initialised uniformly at random in [-2, 0).
q_table = np.random.uniform(-2, 0, (*DISCRETE_OS_SIZE, env.action_space.n))

print(q_table.shape)

(20, 20, 3)


In [4]:
# --- Q-learning hyperparameters ---
LEARNING_RATE = 0.1   # weight of the new estimate in each Q-value update
DISCOUNT = 0.95       # factor applied to the best future Q-value
EPISODES = 4_100      # number of training episodes
SHOW_EVERY = 1_000    # an episode is rendered when its index is a multiple of this

# --- Exploration settings ---
epsilon = 0.5         # exploration rate compared against a random draw when picking actions
START_EPSILON_DECAY, END_EPSILON_DECAY = 1, EPISODES // 2
# Initial epsilon divided by the number of episodes between the two decay bounds.
epsilon_decay_value = epsilon / (END_EPSILON_DECAY - START_EPSILON_DECAY)

In [5]:
def get_discrete_state(state):
    """Map a continuous observation to its bucket indices in the Q-table.

    Args:
        state: Array-like observation with one value per observation
            dimension (here ``[position, velocity]``).

    Returns:
        A tuple of plain Python ints, one per observation dimension, each in
        ``[0, DISCRETE_OS_SIZE[i] - 1]`` so it can index ``q_table`` directly
        or be extended with an action via ``discrete_state + (action,)``.
    """
    scaled = (state - env.observation_space.low) / discrete_os_win_size
    # `np.int` was removed in NumPy 1.24; the builtin `int` is the drop-in replacement.
    bucket_indices = scaled.astype(int)
    # The bucket width is (high - low) / DISCRETE_OS_SIZE, so an observation on
    # the upper bound maps to index DISCRETE_OS_SIZE[i] -- one past the end of
    # that Q-table axis. Clip so boundary observations land in the last bucket
    # (and anything below the lower bound lands in the first).
    highest_bucket = np.asarray(DISCRETE_OS_SIZE) - 1
    bucket_indices = np.clip(bucket_indices, 0, highest_bucket)
    return tuple(int(index) for index in bucket_indices)

In [6]:
# Sanity check: discretise the first observation of a freshly reset episode.
initial_observation = env.reset()
discrete_state = get_discrete_state(initial_observation)
discrete_state

(6, 10)

In [7]:
# Index of the highest-valued action for the current discrete state.
print(q_table[discrete_state].argmax())

1


In [8]:
# Tabular Q-learning training loop with epsilon-greedy exploration.
#
# The exploration rate is tracked in a local copy so that re-running this cell
# restarts the decay schedule from the configured `epsilon` rather than
# continuing from an already-decayed value.
current_epsilon = epsilon

for episode in range(EPISODES):

    # Rendering is slow, so only show episodes whose index is a multiple of SHOW_EVERY.
    render = episode % SHOW_EVERY == 0

    discrete_state = get_discrete_state(env.reset())
    done = False
    while not done:

        # =================== exploration =================
        # Exploit the best known action when the random draw exceeds
        # current_epsilon; otherwise explore with a uniformly random action.
        # (The chosen action must not be overwritten afterwards, or the agent
        # never explores.)
        if np.random.random() > current_epsilon:
            action = np.argmax(q_table[discrete_state])
        else:
            action = np.random.randint(0, env.action_space.n)
        # ================================================

        new_state, reward, done, _ = env.step(action)
        new_discrete_state = get_discrete_state(new_state)

        if render:
            env.render()

        if not done:
            # Standard Q-learning update: blend the old estimate with the
            # reward plus the discounted best value reachable from the new state.
            max_future_q = np.max(q_table[new_discrete_state])
            current_q = q_table[discrete_state + (action,)]
            new_q = (1 - LEARNING_RATE) * current_q + LEARNING_RATE * (reward + DISCOUNT * max_future_q)
            q_table[discrete_state + (action,)] = new_q

        elif new_state[0] >= env.goal_position:
            # Episode ended at or beyond the goal position: there is no further
            # cost from here, so pin the value of the final action to 0.
            # `>=` mirrors the environment's own "position >= goal" termination test.
            q_table[discrete_state + (action,)] = 0
        # Otherwise the episode ended without reaching the goal (presumably the
        # step limit -- confirm against the env wrapper); no update is made.

        discrete_state = new_discrete_state

    # Linearly anneal exploration over the configured episode window, never
    # letting the rate drop below zero.
    if START_EPSILON_DECAY <= episode <= END_EPSILON_DECAY:
        current_epsilon = max(0.0, current_epsilon - epsilon_decay_value)

env.close()