In [2]:
import gym
from gym import wrappers
import numpy as np
import io, base64
from IPython.display import HTML

In [12]:
# Record a short random-policy rollout of Space Invaders.
# The Monitor wrapper writes its recording under ./gym-results
# (force=True clears previous results in that directory).
env = gym.make('SpaceInvaders-v0')
env = wrappers.Monitor(env, "./gym-results", force=True)
try:
    env.reset()
    for _ in range(1000):
        action = env.action_space.sample()  # uniformly random action
        observation, reward, done, info = env.step(action)
        if done:
            break
finally:
    # Always close the environment so the recording is finalized on disk
    # before it is read back, even if a step raises.
    env.close()


# Read the recorded episode and embed it inline as a base64 data URI.
video_path = './gym-results/openaigym.video.%s.video000000.mp4' % env.file_infix
# Read-only binary mode is sufficient; the context manager closes the handle.
with io.open(video_path, 'rb') as video_file:
    video = video_file.read()
encoded = base64.b64encode(video)
# Keep the HTML object as the cell's last expression so the notebook renders it.
HTML(data='''
    <video width="360" height="auto" alt="test" controls><source src="data:video/mp4;base64,{0}" type="video/mp4" /></video>'''
.format(encoded.decode('ascii')))

# Built-in Gym environments

## Mountain Car

In [9]:
# --- Run length / reporting ---
EPISODES = 2_000      # total number of training episodes
SHOW_EVERY = 500      # report progress (and render) every this many episodes

# --- Q-learning update ---
LR = 0.1              # learning rate used when blending old and new Q-values
DISCOUNT = 0.95       # discount factor applied to the best future Q-value

# --- Epsilon-greedy exploration ---
EPSILON = 0.5                      # initial probability of taking a random action
START_EPSILON_DECAY = 1            # episode index at which epsilon decay is meant to begin
END_EPSILON_DECAY = EPISODES // 2  # episode index at which epsilon decay stops

In [None]:
# Reference tutorial:
# https://pythonprogramming.net/q-learning-reinforcement-learning-python-tutorial/
env = gym.make('MountainCar-v0')

# Show the observation bounds and the number of discrete actions.
obs_space = env.observation_space
print(obs_space.high)
print(obs_space.low)
print(env.action_space.n)

# Discretize every observation dimension into 20 bins.
n_states = [20 for _ in obs_space.high]
# One Q-value per (bin, ..., bin, action), initialised uniformly in [-2, 0).
q_table = np.random.uniform(low=-2, high=0, size=[*n_states, env.action_space.n])

def get_discrete_state(state, low=None, high=None, bins=None):
    """Map a continuous observation to a tuple of bin indices for q_table.

    Args:
        state: observation vector (one value per observation dimension).
        low: per-dimension lower bounds; defaults to env.observation_space.low.
        high: per-dimension upper bounds; defaults to env.observation_space.high.
        bins: number of bins per dimension; defaults to n_states.

    Returns:
        Tuple of integer indices, each clipped to [0, bins[i] - 1], so the
        result is always a valid index into a table with `bins` entries
        per dimension.
    """
    low = env.observation_space.low if low is None else np.asarray(low)
    high = env.observation_space.high if high is None else np.asarray(high)
    bins = np.asarray(n_states if bins is None else bins)

    scaled = (np.asarray(state) - low) / (high - low) * bins
    # Builtin int replaces the np.int alias, which was removed in NumPy 1.24.
    # Clipping keeps an observation sitting exactly on the upper bound
    # (which scales to `bins`) inside the last bin instead of overflowing.
    indices = np.clip(scaled.astype(int), 0, bins - 1)
    return tuple(indices)  # usable to index q_table

# Tabular Q-learning with an epsilon-greedy policy.
# Work on a local copy of EPSILON so the config constant is not mutated and
# re-running this cell always starts from the configured exploration rate.
epsilon = EPSILON
# Linear decay: epsilon falls by a fixed step each episode inside
# [START_EPSILON_DECAY, END_EPSILON_DECAY) and reaches 0 at the end of it.
decay_span = max(1, END_EPSILON_DECAY - START_EPSILON_DECAY)
epsilon_step = EPSILON / decay_span

try:
    for e in range(EPISODES):
        show = e % SHOW_EVERY == 0
        if show:
            print(e)
        state = env.reset()                     # returns the initial state
        dis_state = get_discrete_state(state)
        done = False
        while not done:
            # Exploit the current table, or explore with probability epsilon.
            if np.random.random() > epsilon:
                action = np.argmax(q_table[dis_state])
            else:
                action = np.random.randint(0, env.action_space.n)
            ind = dis_state + (action,)         # (state bins..., action) index
            new_state, reward, done, _ = env.step(action)
            new_dis_state = get_discrete_state(new_state)
            if show:
                env.render()                    # render only on reporting episodes

            if not done:
                # Q-learning update toward reward + discounted best next value.
                max_future_q = np.max(q_table[new_dis_state])
                cur_q = q_table[ind]
                new_q = (1 - LR) * cur_q + LR * (reward + DISCOUNT * max_future_q)
                q_table[ind] = new_q
            elif new_state[0] >= env.goal_position:
                # Episode ended at the goal: set this state-action value to 0.
                print(f'succedded at {e}')
                q_table[ind] = 0

            dis_state = new_dis_state

        if START_EPSILON_DECAY <= e < END_EPSILON_DECAY:
            epsilon = max(0.0, epsilon - epsilon_step)
finally:
    # Release the environment (and any render window) even on error/interrupt.
    env.close()