In [1]:
import os
import gym
import numpy as np
import matplotlib.pyplot as plt

In [None]:
seed = 123
# Disable hash randomization so that experiments are reproducible.
# NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so this assignment
# only reaches processes spawned from this kernel, not the kernel itself.
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)
# No env.seed(...) here: `env` is only created further down (gym.make in the
# environment cells), so seeding it at this point raised NameError on a fresh
# kernel. Every environment cell seeds its own `env` right after creating it.

# FrozenLake-v0

[介绍](https://gym.openai.com/envs/FrozenLake-v0/)

## Q-learning

In [22]:
# Learning-rule switch: True -> Q-learning, False -> SARSA.
off_policy = True

# Create the Gym environment that the cells below train and evaluate on.
name = "FrozenLake-v0"
env = gym.make(name)


t_max = 10000


def run_episode(env, policy=None, render=False, discount=None, max_steps=None):
    """Play a single episode and return its discounted return.

    Parameters
    ----------
    env : Gym-style environment
        Needs ``reset()``, ``step(action)`` returning ``(obs, reward, done, info)``,
        ``action_space.sample()`` and, when ``render`` is true, ``render()``.
    policy : indexable, optional
        ``policy[obs]`` is the action taken in observation ``obs``. With ``None``
        a random action is drawn from ``env.action_space`` at every step.
    render : bool, optional
        When true, ``env.render()`` is called before each step.
    discount : float, optional
        Discount factor. Defaults to the module-level ``gamma`` looked up at call
        time (the original behaviour); pass it explicitly to avoid depending on
        whichever cell assigned ``gamma`` last.
    max_steps : int, optional
        Hard cap on the number of steps. Defaults to the module-level ``t_max``.

    Returns
    -------
    The sum over steps k = 0, 1, ... of ``discount**k * reward_k``.
    """
    if discount is None:
        discount = gamma
    if max_steps is None:
        max_steps = t_max

    obs = env.reset()
    total_reward = 0
    for step in range(max_steps):
        if render:
            # Rendering happens before the step, so the state reached by the
            # final step is not drawn. To show only that last state (goal or
            # hole), move env.render() into the `if done:` branch instead.
            env.render()
        action = env.action_space.sample() if policy is None else policy[obs]
        obs, reward, done, _ = env.step(action)
        total_reward += discount ** step * reward
        if done:
            break
    return total_reward


# --- hyper-parameters ---
iter_max = 10 ** 6                 # upper bound on the number of training episodes
initial_lr, min_lr = 1.0, 0.003    # learning-rate start value and floor
gamma = 1.0                        # discount factor
eps = 0.1                          # probability of taking a random action

# Seed the environment and NumPy's global generator.
env.seed(123)
np.random.seed(123)

print('----- using Q Learning -----' if off_policy == True else '------ using SARSA Learning ---')

# One row per discrete state, one column per action: (16, 4) here.
q_table = np.zeros(shape=(env.observation_space.n, env.action_space.n))

def _epsilon_greedy(state):
    """Return a random action with probability `eps`, else the greedy one for `state`."""
    if np.random.uniform(0, 1) < eps:
        return np.random.choice(env.action_space.n)
    return np.argmax(q_table[state])


# Tabular TD control. `off_policy` selects the bootstrap term:
#   True  -> Q-learning: max over the next state's action values
#   False -> SARSA: value of the action that will actually be taken next
returns = []  # undiscounted return of every episode that ended with done=True
for i in range(iter_max):
    obs = env.reset()
    total_reward = 0
    # eta: learning rate, multiplied by 0.85 every 100 episodes, floored at min_lr
    eta = max(min_lr, initial_lr * (0.85**(i//100)))

    # SARSA only: the action picked while building the previous target. It must
    # be the one executed next, otherwise the update is not on-policy.
    next_action = None

    for _step in range(t_max):
        action = _epsilon_greedy(obs) if next_action is None else next_action

        obs_tp1, reward, done, _ = env.step(action)
        total_reward += reward

        # update q table
        if off_policy:
            # q-learning
            bootstrap = np.max(q_table[obs_tp1])
        else:
            # Sarsa
            next_action = _epsilon_greedy(obs_tp1)
            bootstrap = q_table[obs_tp1][next_action]
        # (1 - done) drops the bootstrap term on the transition that ends the episode
        target = reward + gamma * bootstrap * (1 - done)
        q_table[obs][action] += eta * (target - q_table[obs][action])

        obs = obs_tp1

        if done:
            returns.append(total_reward)
            break

    if i % 200 == 0:
        print('Iteration #%d -- Total reward = %d.' %(i+1, total_reward))

    # Stop once the mean return of the last 100 episodes exceeds 0.7. The length
    # check keeps a single lucky early episode from ending training and avoids
    # taking the mean of an empty list.
    if len(returns) >= 100 and np.mean(returns[-100:]) > 0.7:
        break
        
    
# Greedy policy: the best-valued action of every state.
solution_policy = q_table.argmax(axis=1)

# Score the policy over 100 evaluation episodes.
solution_policy_scores = []
for _ in range(100):
    solution_policy_scores.append(run_episode(env, solution_policy, False))
average_score = np.mean(solution_policy_scores)
print("Average score of solution = ", average_score)

# Animate two episodes, but only when the policy clears the 0.7 bar.
if average_score > 0.7:
    for i in range(2):
        print('episode {}'.format(i))
        run_episode(env, solution_policy, True)
env.close()

----- using Q Learning -----
Iteration #1 -- Total reward = 0.
Iteration #201 -- Total reward = 0.
Iteration #401 -- Total reward = 0.
Iteration #601 -- Total reward = 0.
Iteration #801 -- Total reward = 0.
Iteration #1001 -- Total reward = 0.
Iteration #1201 -- Total reward = 0.
Iteration #1401 -- Total reward = 0.
Iteration #1601 -- Total reward = 0.
Iteration #1801 -- Total reward = 0.
Iteration #2001 -- Total reward = 0.
Iteration #2201 -- Total reward = 0.
Iteration #2401 -- Total reward = 0.
Iteration #2601 -- Total reward = 0.
Iteration #2801 -- Total reward = 0.
Iteration #3001 -- Total reward = 0.
Iteration #3201 -- Total reward = 0.
Iteration #3401 -- Total reward = 0.
Iteration #3601 -- Total reward = 0.
Iteration #3801 -- Total reward = 0.
Iteration #4001 -- Total reward = 0.
Iteration #4201 -- Total reward = 0.
Iteration #4401 -- Total reward = 0.
Iteration #4601 -- Total reward = 0.
Iteration #4801 -- Total reward = 0.
Iteration #5001 -- Total reward = 0.
Iteration #5201 

Iteration #43801 -- Total reward = 0.
Iteration #44001 -- Total reward = 0.
Iteration #44201 -- Total reward = 0.
Iteration #44401 -- Total reward = 0.
Iteration #44601 -- Total reward = 0.
Iteration #44801 -- Total reward = 0.
Iteration #45001 -- Total reward = 0.
Iteration #45201 -- Total reward = 0.
Iteration #45401 -- Total reward = 0.
Iteration #45601 -- Total reward = 0.
Iteration #45801 -- Total reward = 0.
Iteration #46001 -- Total reward = 0.
Iteration #46201 -- Total reward = 0.
Iteration #46401 -- Total reward = 0.
Iteration #46601 -- Total reward = 0.
Iteration #46801 -- Total reward = 0.
Iteration #47001 -- Total reward = 0.
Iteration #47201 -- Total reward = 0.
Iteration #47401 -- Total reward = 0.
Iteration #47601 -- Total reward = 0.
Iteration #47801 -- Total reward = 0.
Iteration #48001 -- Total reward = 0.
Iteration #48201 -- Total reward = 0.
Iteration #48401 -- Total reward = 0.
Iteration #48601 -- Total reward = 0.
Iteration #48801 -- Total reward = 0.
Iteration #4

Iteration #87001 -- Total reward = 1.
Iteration #87201 -- Total reward = 1.
Iteration #87401 -- Total reward = 0.
Iteration #87601 -- Total reward = 0.
Iteration #87801 -- Total reward = 0.
Iteration #88001 -- Total reward = 0.
Iteration #88201 -- Total reward = 1.
Iteration #88401 -- Total reward = 0.
Iteration #88601 -- Total reward = 1.
Iteration #88801 -- Total reward = 1.
Iteration #89001 -- Total reward = 0.
Iteration #89201 -- Total reward = 0.
Iteration #89401 -- Total reward = 0.
Iteration #89601 -- Total reward = 0.
Iteration #89801 -- Total reward = 0.
Iteration #90001 -- Total reward = 0.
Iteration #90201 -- Total reward = 0.
Iteration #90401 -- Total reward = 0.
Iteration #90601 -- Total reward = 0.
Iteration #90801 -- Total reward = 0.
Iteration #91001 -- Total reward = 0.
Iteration #91201 -- Total reward = 0.
Iteration #91401 -- Total reward = 1.
Iteration #91601 -- Total reward = 0.
Iteration #91801 -- Total reward = 0.
Iteration #92001 -- Total reward = 0.
Iteration #9

Iteration #129601 -- Total reward = 0.
Iteration #129801 -- Total reward = 0.
Iteration #130001 -- Total reward = 0.
Iteration #130201 -- Total reward = 0.
Iteration #130401 -- Total reward = 1.
Iteration #130601 -- Total reward = 0.
Iteration #130801 -- Total reward = 0.
Iteration #131001 -- Total reward = 1.
Iteration #131201 -- Total reward = 0.
Iteration #131401 -- Total reward = 0.
Iteration #131601 -- Total reward = 1.
Iteration #131801 -- Total reward = 0.
Iteration #132001 -- Total reward = 0.
Iteration #132201 -- Total reward = 0.
Iteration #132401 -- Total reward = 1.
Iteration #132601 -- Total reward = 0.
Iteration #132801 -- Total reward = 0.
Iteration #133001 -- Total reward = 0.
Iteration #133201 -- Total reward = 0.
Iteration #133401 -- Total reward = 0.
Iteration #133601 -- Total reward = 1.
Iteration #133801 -- Total reward = 0.
Iteration #134001 -- Total reward = 1.
Iteration #134201 -- Total reward = 0.
Iteration #134401 -- Total reward = 0.
Iteration #134601 -- Tota

Iteration #171801 -- Total reward = 1.
Iteration #172001 -- Total reward = 0.
Iteration #172201 -- Total reward = 1.
Iteration #172401 -- Total reward = 0.
Iteration #172601 -- Total reward = 1.
Iteration #172801 -- Total reward = 0.
Iteration #173001 -- Total reward = 0.
Iteration #173201 -- Total reward = 0.
Iteration #173401 -- Total reward = 0.
Iteration #173601 -- Total reward = 1.
Iteration #173801 -- Total reward = 0.
Iteration #174001 -- Total reward = 0.
Iteration #174201 -- Total reward = 0.
Iteration #174401 -- Total reward = 0.
Iteration #174601 -- Total reward = 1.
Iteration #174801 -- Total reward = 0.
Iteration #175001 -- Total reward = 0.
Iteration #175201 -- Total reward = 0.
Iteration #175401 -- Total reward = 0.
Iteration #175601 -- Total reward = 0.
Iteration #175801 -- Total reward = 0.
Iteration #176001 -- Total reward = 0.
Iteration #176201 -- Total reward = 0.
Iteration #176401 -- Total reward = 1.
Iteration #176601 -- Total reward = 1.
Iteration #176801 -- Tota

Iteration #214201 -- Total reward = 1.
Iteration #214401 -- Total reward = 0.
Iteration #214601 -- Total reward = 0.
Iteration #214801 -- Total reward = 0.
Iteration #215001 -- Total reward = 1.
Iteration #215201 -- Total reward = 0.
Iteration #215401 -- Total reward = 1.
Iteration #215601 -- Total reward = 0.
Iteration #215801 -- Total reward = 0.
Iteration #216001 -- Total reward = 0.
Iteration #216201 -- Total reward = 0.
Iteration #216401 -- Total reward = 1.
Iteration #216601 -- Total reward = 1.
Iteration #216801 -- Total reward = 1.
Iteration #217001 -- Total reward = 0.
Iteration #217201 -- Total reward = 0.
Iteration #217401 -- Total reward = 0.
Iteration #217601 -- Total reward = 1.
Iteration #217801 -- Total reward = 0.
Iteration #218001 -- Total reward = 1.
Iteration #218201 -- Total reward = 1.
Iteration #218401 -- Total reward = 0.
Iteration #218601 -- Total reward = 0.
Iteration #218801 -- Total reward = 0.
Iteration #219001 -- Total reward = 1.
Iteration #219201 -- Tota

Iteration #256601 -- Total reward = 0.
Iteration #256801 -- Total reward = 0.
Iteration #257001 -- Total reward = 0.
Iteration #257201 -- Total reward = 0.
Iteration #257401 -- Total reward = 1.
Iteration #257601 -- Total reward = 0.
Iteration #257801 -- Total reward = 0.
Iteration #258001 -- Total reward = 1.
Iteration #258201 -- Total reward = 1.
Iteration #258401 -- Total reward = 0.
Iteration #258601 -- Total reward = 0.
Iteration #258801 -- Total reward = 1.
Iteration #259001 -- Total reward = 1.
Iteration #259201 -- Total reward = 1.
Iteration #259401 -- Total reward = 0.
Iteration #259601 -- Total reward = 0.
Iteration #259801 -- Total reward = 0.
Iteration #260001 -- Total reward = 0.
Iteration #260201 -- Total reward = 0.
Iteration #260401 -- Total reward = 0.
Iteration #260601 -- Total reward = 0.
Iteration #260801 -- Total reward = 0.
Iteration #261001 -- Total reward = 1.
Iteration #261201 -- Total reward = 0.
Iteration #261401 -- Total reward = 1.
Iteration #261601 -- Tota

Iteration #299001 -- Total reward = 1.
Iteration #299201 -- Total reward = 0.
Iteration #299401 -- Total reward = 0.
Iteration #299601 -- Total reward = 1.
Iteration #299801 -- Total reward = 0.
Iteration #300001 -- Total reward = 1.
Iteration #300201 -- Total reward = 0.
Iteration #300401 -- Total reward = 0.
Iteration #300601 -- Total reward = 1.
Iteration #300801 -- Total reward = 1.
Iteration #301001 -- Total reward = 0.
Iteration #301201 -- Total reward = 0.
Iteration #301401 -- Total reward = 1.
Iteration #301601 -- Total reward = 1.
Iteration #301801 -- Total reward = 0.
Iteration #302001 -- Total reward = 1.
Iteration #302201 -- Total reward = 1.
Iteration #302401 -- Total reward = 0.
Iteration #302601 -- Total reward = 1.
Iteration #302801 -- Total reward = 0.
Iteration #303001 -- Total reward = 0.
Iteration #303201 -- Total reward = 0.
Iteration #303401 -- Total reward = 1.
Iteration #303601 -- Total reward = 0.
Iteration #303801 -- Total reward = 0.
Iteration #304001 -- Tota

Iteration #341201 -- Total reward = 0.
Iteration #341401 -- Total reward = 0.
Iteration #341601 -- Total reward = 1.
Iteration #341801 -- Total reward = 1.
Iteration #342001 -- Total reward = 0.
Iteration #342201 -- Total reward = 0.
Iteration #342401 -- Total reward = 1.
Iteration #342601 -- Total reward = 0.
Iteration #342801 -- Total reward = 0.
Iteration #343001 -- Total reward = 0.
Iteration #343201 -- Total reward = 1.
Iteration #343401 -- Total reward = 1.
Iteration #343601 -- Total reward = 1.
Iteration #343801 -- Total reward = 0.
Iteration #344001 -- Total reward = 0.
Iteration #344201 -- Total reward = 1.
Iteration #344401 -- Total reward = 0.
Iteration #344601 -- Total reward = 0.
Iteration #344801 -- Total reward = 0.
Iteration #345001 -- Total reward = 1.
Iteration #345201 -- Total reward = 0.
Iteration #345401 -- Total reward = 0.
Iteration #345601 -- Total reward = 1.
Iteration #345801 -- Total reward = 1.
Iteration #346001 -- Total reward = 1.
Iteration #346201 -- Tota

Iteration #383401 -- Total reward = 1.
Iteration #383601 -- Total reward = 0.
Iteration #383801 -- Total reward = 1.
Iteration #384001 -- Total reward = 0.
Iteration #384201 -- Total reward = 1.
Iteration #384401 -- Total reward = 1.
Iteration #384601 -- Total reward = 0.
Iteration #384801 -- Total reward = 0.
Iteration #385001 -- Total reward = 0.
Iteration #385201 -- Total reward = 1.
Iteration #385401 -- Total reward = 0.
Iteration #385601 -- Total reward = 0.
Iteration #385801 -- Total reward = 0.
Iteration #386001 -- Total reward = 0.
Iteration #386201 -- Total reward = 1.
Iteration #386401 -- Total reward = 0.
Iteration #386601 -- Total reward = 1.
Iteration #386801 -- Total reward = 0.
Iteration #387001 -- Total reward = 0.
Iteration #387201 -- Total reward = 1.
Iteration #387401 -- Total reward = 0.
Iteration #387601 -- Total reward = 0.
Iteration #387801 -- Total reward = 0.
Iteration #388001 -- Total reward = 1.
Iteration #388201 -- Total reward = 0.
Iteration #388401 -- Tota

Iteration #425801 -- Total reward = 1.
Iteration #426001 -- Total reward = 1.
Iteration #426201 -- Total reward = 0.
Iteration #426401 -- Total reward = 0.
Iteration #426601 -- Total reward = 0.
Iteration #426801 -- Total reward = 0.
Iteration #427001 -- Total reward = 1.
Iteration #427201 -- Total reward = 0.
Iteration #427401 -- Total reward = 0.
Iteration #427601 -- Total reward = 1.
Iteration #427801 -- Total reward = 1.
Iteration #428001 -- Total reward = 0.
Iteration #428201 -- Total reward = 0.
Iteration #428401 -- Total reward = 0.
Iteration #428601 -- Total reward = 0.
Iteration #428801 -- Total reward = 0.
Iteration #429001 -- Total reward = 1.
Iteration #429201 -- Total reward = 0.
Iteration #429401 -- Total reward = 0.
Iteration #429601 -- Total reward = 0.
Iteration #429801 -- Total reward = 1.
Iteration #430001 -- Total reward = 0.
Iteration #430201 -- Total reward = 1.
Iteration #430401 -- Total reward = 1.
Iteration #430601 -- Total reward = 0.
Iteration #430801 -- Tota

Iteration #468201 -- Total reward = 0.
Iteration #468401 -- Total reward = 0.
Iteration #468601 -- Total reward = 0.
Iteration #468801 -- Total reward = 1.
Iteration #469001 -- Total reward = 0.
Iteration #469201 -- Total reward = 0.
Iteration #469401 -- Total reward = 0.
Iteration #469601 -- Total reward = 0.
Iteration #469801 -- Total reward = 1.
Iteration #470001 -- Total reward = 0.
Iteration #470201 -- Total reward = 1.
Iteration #470401 -- Total reward = 0.
Iteration #470601 -- Total reward = 0.
Iteration #470801 -- Total reward = 0.
Iteration #471001 -- Total reward = 0.
Iteration #471201 -- Total reward = 1.
Iteration #471401 -- Total reward = 0.
Iteration #471601 -- Total reward = 1.
Iteration #471801 -- Total reward = 0.
Iteration #472001 -- Total reward = 1.
Iteration #472201 -- Total reward = 1.
Iteration #472401 -- Total reward = 0.
Iteration #472601 -- Total reward = 1.
Iteration #472801 -- Total reward = 0.
Iteration #473001 -- Total reward = 0.
Iteration #473201 -- Tota

Iteration #510401 -- Total reward = 1.
Iteration #510601 -- Total reward = 1.
Iteration #510801 -- Total reward = 0.
Iteration #511001 -- Total reward = 1.
Iteration #511201 -- Total reward = 0.
Iteration #511401 -- Total reward = 0.
Iteration #511601 -- Total reward = 0.
Iteration #511801 -- Total reward = 1.
Iteration #512001 -- Total reward = 1.
Iteration #512201 -- Total reward = 0.
Iteration #512401 -- Total reward = 0.
Iteration #512601 -- Total reward = 0.
Iteration #512801 -- Total reward = 1.
Iteration #513001 -- Total reward = 0.
Iteration #513201 -- Total reward = 0.
Iteration #513401 -- Total reward = 1.
Iteration #513601 -- Total reward = 0.
Iteration #513801 -- Total reward = 1.
Iteration #514001 -- Total reward = 1.
Iteration #514201 -- Total reward = 1.
Iteration #514401 -- Total reward = 0.
Iteration #514601 -- Total reward = 0.
Iteration #514801 -- Total reward = 1.
Iteration #515001 -- Total reward = 0.
Iteration #515201 -- Total reward = 1.
Iteration #515401 -- Tota

Iteration #552601 -- Total reward = 0.
Iteration #552801 -- Total reward = 1.
Iteration #553001 -- Total reward = 0.
Iteration #553201 -- Total reward = 1.
Iteration #553401 -- Total reward = 1.
Iteration #553601 -- Total reward = 1.
Iteration #553801 -- Total reward = 1.
Iteration #554001 -- Total reward = 0.
Iteration #554201 -- Total reward = 1.
Iteration #554401 -- Total reward = 0.
Iteration #554601 -- Total reward = 1.
Iteration #554801 -- Total reward = 0.
Iteration #555001 -- Total reward = 0.
Iteration #555201 -- Total reward = 1.
Iteration #555401 -- Total reward = 0.
Iteration #555601 -- Total reward = 1.
Iteration #555801 -- Total reward = 1.
Iteration #556001 -- Total reward = 1.
Iteration #556201 -- Total reward = 0.
Iteration #556401 -- Total reward = 1.
Iteration #556601 -- Total reward = 1.
Iteration #556801 -- Total reward = 0.
Iteration #557001 -- Total reward = 1.
Iteration #557201 -- Total reward = 0.
Iteration #557401 -- Total reward = 0.
Iteration #557601 -- Tota

Iteration #594801 -- Total reward = 1.
Iteration #595001 -- Total reward = 1.
Iteration #595201 -- Total reward = 0.
Iteration #595401 -- Total reward = 0.
Iteration #595601 -- Total reward = 0.
Iteration #595801 -- Total reward = 0.
Iteration #596001 -- Total reward = 0.
Iteration #596201 -- Total reward = 0.
Iteration #596401 -- Total reward = 0.
Iteration #596601 -- Total reward = 1.
Iteration #596801 -- Total reward = 0.
Iteration #597001 -- Total reward = 1.
Iteration #597201 -- Total reward = 1.
Iteration #597401 -- Total reward = 0.
Iteration #597601 -- Total reward = 0.
Iteration #597801 -- Total reward = 1.
Iteration #598001 -- Total reward = 1.
Iteration #598201 -- Total reward = 0.
Iteration #598401 -- Total reward = 1.
Iteration #598601 -- Total reward = 0.
Iteration #598801 -- Total reward = 0.
Iteration #599001 -- Total reward = 0.
Iteration #599201 -- Total reward = 0.
Iteration #599401 -- Total reward = 0.
Iteration #599601 -- Total reward = 0.
Iteration #599801 -- Tota

Iteration #637201 -- Total reward = 1.
Iteration #637401 -- Total reward = 1.
Iteration #637601 -- Total reward = 1.
Iteration #637801 -- Total reward = 0.
Iteration #638001 -- Total reward = 1.
Iteration #638201 -- Total reward = 0.
Iteration #638401 -- Total reward = 0.
Iteration #638601 -- Total reward = 1.
Iteration #638801 -- Total reward = 0.
Iteration #639001 -- Total reward = 0.
Iteration #639201 -- Total reward = 0.
Iteration #639401 -- Total reward = 0.
Iteration #639601 -- Total reward = 1.
Iteration #639801 -- Total reward = 0.
Iteration #640001 -- Total reward = 0.
Iteration #640201 -- Total reward = 0.
Iteration #640401 -- Total reward = 0.
Iteration #640601 -- Total reward = 0.
Iteration #640801 -- Total reward = 1.
Iteration #641001 -- Total reward = 0.
Iteration #641201 -- Total reward = 0.
Iteration #641401 -- Total reward = 1.
Iteration #641601 -- Total reward = 0.
Iteration #641801 -- Total reward = 0.
Iteration #642001 -- Total reward = 0.
Iteration #642201 -- Tota

Iteration #679401 -- Total reward = 0.
Iteration #679601 -- Total reward = 0.
Iteration #679801 -- Total reward = 0.
Iteration #680001 -- Total reward = 0.
Iteration #680201 -- Total reward = 0.
Iteration #680401 -- Total reward = 0.
Iteration #680601 -- Total reward = 0.
Iteration #680801 -- Total reward = 0.
Iteration #681001 -- Total reward = 0.
Iteration #681201 -- Total reward = 1.
Iteration #681401 -- Total reward = 1.
Iteration #681601 -- Total reward = 0.
Iteration #681801 -- Total reward = 1.
Iteration #682001 -- Total reward = 0.
Iteration #682201 -- Total reward = 1.
Iteration #682401 -- Total reward = 1.
Iteration #682601 -- Total reward = 1.
Iteration #682801 -- Total reward = 0.
Iteration #683001 -- Total reward = 0.
Iteration #683201 -- Total reward = 1.
Iteration #683401 -- Total reward = 1.
Iteration #683601 -- Total reward = 0.
Iteration #683801 -- Total reward = 1.
Iteration #684001 -- Total reward = 1.
Iteration #684201 -- Total reward = 0.
Iteration #684401 -- Tota

Iteration #721801 -- Total reward = 0.
Iteration #722001 -- Total reward = 1.
Iteration #722201 -- Total reward = 0.
Iteration #722401 -- Total reward = 0.
Iteration #722601 -- Total reward = 1.
Iteration #722801 -- Total reward = 0.
Iteration #723001 -- Total reward = 0.
Iteration #723201 -- Total reward = 0.
Iteration #723401 -- Total reward = 0.
Iteration #723601 -- Total reward = 0.
Iteration #723801 -- Total reward = 0.
Iteration #724001 -- Total reward = 1.
Iteration #724201 -- Total reward = 1.
Iteration #724401 -- Total reward = 0.
Iteration #724601 -- Total reward = 1.
Iteration #724801 -- Total reward = 0.
Iteration #725001 -- Total reward = 0.
Iteration #725201 -- Total reward = 1.
Iteration #725401 -- Total reward = 1.
Iteration #725601 -- Total reward = 1.
Iteration #725801 -- Total reward = 1.
Iteration #726001 -- Total reward = 0.
Iteration #726201 -- Total reward = 0.
Iteration #726401 -- Total reward = 0.
Iteration #726601 -- Total reward = 0.
Iteration #726801 -- Tota

Iteration #764001 -- Total reward = 1.
Iteration #764201 -- Total reward = 0.
Iteration #764401 -- Total reward = 0.
Iteration #764601 -- Total reward = 1.
Iteration #764801 -- Total reward = 0.
Iteration #765001 -- Total reward = 0.
Iteration #765201 -- Total reward = 0.
Iteration #765401 -- Total reward = 1.
Iteration #765601 -- Total reward = 0.
Iteration #765801 -- Total reward = 0.
Iteration #766001 -- Total reward = 0.
Iteration #766201 -- Total reward = 0.
Iteration #766401 -- Total reward = 0.
Iteration #766601 -- Total reward = 0.
Iteration #766801 -- Total reward = 1.
Iteration #767001 -- Total reward = 0.
Iteration #767201 -- Total reward = 1.
Iteration #767401 -- Total reward = 1.
Iteration #767601 -- Total reward = 0.
Iteration #767801 -- Total reward = 1.
Iteration #768001 -- Total reward = 0.
Iteration #768201 -- Total reward = 0.
Iteration #768401 -- Total reward = 0.
Iteration #768601 -- Total reward = 0.
Iteration #768801 -- Total reward = 1.
Iteration #769001 -- Tota

Iteration #806401 -- Total reward = 0.
Iteration #806601 -- Total reward = 1.
Iteration #806801 -- Total reward = 0.
Iteration #807001 -- Total reward = 1.
Iteration #807201 -- Total reward = 1.
Iteration #807401 -- Total reward = 0.
Iteration #807601 -- Total reward = 0.
Iteration #807801 -- Total reward = 1.
Iteration #808001 -- Total reward = 0.
Iteration #808201 -- Total reward = 0.
Iteration #808401 -- Total reward = 0.
Iteration #808601 -- Total reward = 0.
Iteration #808801 -- Total reward = 0.
Iteration #809001 -- Total reward = 0.
Iteration #809201 -- Total reward = 0.
Iteration #809401 -- Total reward = 0.
Iteration #809601 -- Total reward = 1.
Iteration #809801 -- Total reward = 0.
Iteration #810001 -- Total reward = 0.
Iteration #810201 -- Total reward = 1.
Iteration #810401 -- Total reward = 1.
Iteration #810601 -- Total reward = 1.
Iteration #810801 -- Total reward = 1.
Iteration #811001 -- Total reward = 1.
Iteration #811201 -- Total reward = 0.
Iteration #811401 -- Tota

Iteration #848801 -- Total reward = 1.
Iteration #849001 -- Total reward = 0.
Iteration #849201 -- Total reward = 1.
Iteration #849401 -- Total reward = 0.
Iteration #849601 -- Total reward = 0.
Iteration #849801 -- Total reward = 1.
Iteration #850001 -- Total reward = 1.
Iteration #850201 -- Total reward = 0.
Iteration #850401 -- Total reward = 1.
Iteration #850601 -- Total reward = 0.
Iteration #850801 -- Total reward = 0.
Iteration #851001 -- Total reward = 1.
Iteration #851201 -- Total reward = 0.
Iteration #851401 -- Total reward = 0.
Iteration #851601 -- Total reward = 1.
Iteration #851801 -- Total reward = 0.
Iteration #852001 -- Total reward = 1.
Iteration #852201 -- Total reward = 1.
Iteration #852401 -- Total reward = 1.
Iteration #852601 -- Total reward = 1.
Iteration #852801 -- Total reward = 0.
Iteration #853001 -- Total reward = 0.
Iteration #853201 -- Total reward = 0.
Iteration #853401 -- Total reward = 0.
Iteration #853601 -- Total reward = 0.
Iteration #853801 -- Tota

Iteration #891201 -- Total reward = 1.
Iteration #891401 -- Total reward = 1.
Iteration #891601 -- Total reward = 0.
Iteration #891801 -- Total reward = 0.
Iteration #892001 -- Total reward = 0.
Iteration #892201 -- Total reward = 1.
Iteration #892401 -- Total reward = 1.
Iteration #892601 -- Total reward = 0.
Iteration #892801 -- Total reward = 0.
Iteration #893001 -- Total reward = 0.
Iteration #893201 -- Total reward = 1.
Iteration #893401 -- Total reward = 0.
Iteration #893601 -- Total reward = 1.
Iteration #893801 -- Total reward = 1.
Iteration #894001 -- Total reward = 1.
Iteration #894201 -- Total reward = 1.
Iteration #894401 -- Total reward = 0.
Iteration #894601 -- Total reward = 1.
Iteration #894801 -- Total reward = 0.
Iteration #895001 -- Total reward = 0.
Iteration #895201 -- Total reward = 1.
Iteration #895401 -- Total reward = 0.
Iteration #895601 -- Total reward = 1.
Iteration #895801 -- Total reward = 0.
Iteration #896001 -- Total reward = 0.
Iteration #896201 -- Tota

Iteration #933401 -- Total reward = 0.
Iteration #933601 -- Total reward = 1.
Iteration #933801 -- Total reward = 0.
Iteration #934001 -- Total reward = 0.
Iteration #934201 -- Total reward = 1.
Iteration #934401 -- Total reward = 0.
Iteration #934601 -- Total reward = 1.
Iteration #934801 -- Total reward = 0.
Iteration #935001 -- Total reward = 1.
Iteration #935201 -- Total reward = 0.
Iteration #935401 -- Total reward = 0.
Iteration #935601 -- Total reward = 0.
Iteration #935801 -- Total reward = 1.
Iteration #936001 -- Total reward = 1.
Iteration #936201 -- Total reward = 0.
Iteration #936401 -- Total reward = 1.
Iteration #936601 -- Total reward = 0.
Iteration #936801 -- Total reward = 1.
Iteration #937001 -- Total reward = 1.
Iteration #937201 -- Total reward = 1.
Iteration #937401 -- Total reward = 0.
Iteration #937601 -- Total reward = 0.
Iteration #937801 -- Total reward = 0.
Iteration #938001 -- Total reward = 0.
Iteration #938201 -- Total reward = 1.
Iteration #938401 -- Tota

Iteration #975801 -- Total reward = 1.
Iteration #976001 -- Total reward = 1.
Iteration #976201 -- Total reward = 0.
Iteration #976401 -- Total reward = 1.
Iteration #976601 -- Total reward = 0.
Iteration #976801 -- Total reward = 1.
Iteration #977001 -- Total reward = 1.
Iteration #977201 -- Total reward = 0.
Iteration #977401 -- Total reward = 1.
Iteration #977601 -- Total reward = 0.
Iteration #977801 -- Total reward = 1.
Iteration #978001 -- Total reward = 1.
Iteration #978201 -- Total reward = 0.
Iteration #978401 -- Total reward = 0.
Iteration #978601 -- Total reward = 0.
Iteration #978801 -- Total reward = 0.
Iteration #979001 -- Total reward = 1.
Iteration #979201 -- Total reward = 0.
Iteration #979401 -- Total reward = 1.
Iteration #979601 -- Total reward = 0.
Iteration #979801 -- Total reward = 1.
Iteration #980001 -- Total reward = 0.
Iteration #980201 -- Total reward = 1.
Iteration #980401 -- Total reward = 0.
Iteration #980601 -- Total reward = 1.
Iteration #980801 -- Tota

TypeError: '>' not supported between instances of 'list' and 'float'

In [34]:
# Mean return of the greedy policy over another 100 episodes (shown as the cell output).
rerun_scores = [run_episode(env, solution_policy, False) for _ in range(100)]
np.mean(rerun_scores)

0.71

### test

In [None]:
# Replace the in-memory Q-table with one stored on disk.
# NOTE(review): no cell in the visible part of this notebook writes this file —
# confirm where it is produced, otherwise this cell fails on a fresh checkout.
q_table = np.load('frozenlake-v0_Qtable_Qlearning.npy')

def run_episode(env, policy=None, render=False, discount=None, max_steps=None):
    """Play a single episode and return its discounted return.

    Parameters
    ----------
    env : Gym-style environment
        Needs ``reset()``, ``step(action)`` returning ``(obs, reward, done, info)``,
        ``action_space.sample()`` and, when ``render`` is true, ``render()``.
    policy : indexable, optional
        ``policy[obs]`` is the action taken in observation ``obs``. With ``None``
        a random action is drawn from ``env.action_space`` at every step.
    render : bool, optional
        When true, ``env.render()`` is called before each step.
    discount : float, optional
        Discount factor. Defaults to the module-level ``gamma`` looked up at call
        time (the original behaviour); pass it explicitly to avoid depending on
        whichever cell assigned ``gamma`` last.
    max_steps : int, optional
        Hard cap on the number of steps. Defaults to the module-level ``t_max``.

    Returns
    -------
    The sum over steps k = 0, 1, ... of ``discount**k * reward_k``.
    """
    if discount is None:
        discount = gamma
    if max_steps is None:
        max_steps = t_max

    obs = env.reset()
    total_reward = 0
    for step in range(max_steps):
        if render:
            # Rendering happens before the step, so the state reached by the
            # final step is not drawn. To show only that last state (goal or
            # hole), move env.render() into the `if done:` branch instead.
            env.render()
        action = env.action_space.sample() if policy is None else policy[obs]
        obs, reward, done, _ = env.step(action)
        total_reward += discount ** step * reward
        if done:
            break
    return total_reward


# Greedy policy derived from the loaded Q-table.
solution_policy = np.argmax(q_table, axis=1)

# Mean return over 100 evaluation episodes.
test_scores = [run_episode(env, solution_policy, False) for _ in range(100)]
print(np.mean(test_scores))

# One rendered episode; its return becomes the cell's displayed value.
run_episode(env, solution_policy, True)

## SARSA

In [None]:
# Switch the learning rule to SARSA and restart from the same seeds.
off_policy = False
env.seed(123)
np.random.seed(123)

print('----- using Q Learning -----' if off_policy == True else '------ using SARSA Learning ---')

# Fresh value table, one row per state and one column per action: (16, 4) here.
q_table = np.zeros(shape=(env.observation_space.n, env.action_space.n))
def _epsilon_greedy(state):
    """Return a random action with probability `eps`, else the greedy one for `state`."""
    if np.random.uniform(0, 1) < eps:
        return np.random.choice(env.action_space.n)
    return np.argmax(q_table[state])


# Tabular TD control. `off_policy` selects the bootstrap term:
#   True  -> Q-learning: max over the next state's action values
#   False -> SARSA: value of the action that will actually be taken next
returns = []  # undiscounted return of every episode that ended with done=True
for i in range(iter_max):
    obs = env.reset()
    total_reward = 0
    # eta: learning rate, multiplied by 0.85 every 100 episodes, floored at min_lr
    eta = max(min_lr, initial_lr * (0.85**(i//100)))

    # SARSA only: the action picked while building the previous target. It must
    # be the one executed next, otherwise the update is not on-policy.
    next_action = None

    for _step in range(t_max):
        action = _epsilon_greedy(obs) if next_action is None else next_action

        obs_tp1, reward, done, _ = env.step(action)
        total_reward += reward

        # update q table
        if off_policy:
            # q-learning
            bootstrap = np.max(q_table[obs_tp1])
        else:
            # Sarsa
            next_action = _epsilon_greedy(obs_tp1)
            bootstrap = q_table[obs_tp1][next_action]
        # (1 - done) drops the bootstrap term on the transition that ends the episode
        target = reward + gamma * bootstrap * (1 - done)
        q_table[obs][action] += eta * (target - q_table[obs][action])

        obs = obs_tp1
        if done:
            returns.append(total_reward)
            break

    if i % 200 == 0:
        print('Iteration #%d -- Total reward = %d.' %(i+1, total_reward))

    # Stop once the mean return of the last 100 episodes exceeds 0.7. The length
    # check keeps a single lucky early episode from ending training and avoids
    # taking the mean of an empty list.
    if len(returns) >= 100 and np.mean(returns[-100:]) > 0.7:
        break
        
        
# Greedy policy: the best-valued action of every state.
solution_policy = q_table.argmax(axis=1)

# Score the policy over 100 evaluation episodes.
solution_policy_scores = []
for _ in range(100):
    solution_policy_scores.append(run_episode(env, solution_policy, False))
average_score = np.mean(solution_policy_scores)
print("Average score of solution = ", average_score)

# Animate two episodes, but only when the policy clears the 0.7 bar.
if average_score > 0.7:
    for i in range(2):
        print('episode {}'.format(i))
        run_episode(env, solution_policy, True)
env.close()

# MountainCar-v0

[介绍](https://github.com/openai/gym/wiki/MountainCar-v0)

**MountainCar-v0 defines "solving" as getting average reward of -110.0 over 100 consecutive trials.**

In [5]:
# MountainCar-v0 at a glance (translated from the original note):
#   * state:  2-D continuous vector [position, velocity]
#   * action: discrete, 0 (push left), 1 (no push), 2 (push right)
#   * the original note says only reaching the yellow flag on the right
#     hilltop yields 0.5 and every other state yields 0.
#     NOTE(review): the Gym description gives a reward of -1 per step, with the
#     flag at position 0.5 — confirm which one is meant.
#
# Expressions worth evaluating to inspect the spaces:
#   env.observation_space, env.action_space
#   env.observation_space.sample(), env.action_space.sample()
#   env.observation_space.high, env.observation_space.low

n_states = 40       # number of bins per observation dimension (used by obs_to_state)
off_policy = True   # True -> Q-learning, False -> SARSA

name = "MountainCar-v0"
env = gym.make(name)
def obs_to_state(env, obs, n_bins=None):
    """Discretize a continuous observation into a pair of bin indices.

    Each of the first two observation components is mapped onto ``n_bins``
    equal-width bins spanning ``env.observation_space.low`` to
    ``env.observation_space.high``.

    Parameters
    ----------
    env : environment exposing ``observation_space.low`` / ``.high``
    obs : sequence whose first two entries are the values to discretize
    n_bins : int, optional
        Number of bins per dimension. Defaults to the module-level ``n_states``.

    Returns
    -------
    (a, b) : tuple of int, each in ``[0, n_bins - 1]``.
    """
    if n_bins is None:
        n_bins = n_states
    low = env.observation_space.low
    high = env.observation_space.high
    bin_width = (high - low) / n_bins

    def _bin_index(dim):
        raw = int((obs[dim] - low[dim]) / bin_width[dim])
        # An observation sitting exactly on the upper bound would land in bin
        # `n_bins`, one past the end of a table with n_bins entries per axis;
        # clamp so boundary values fall into the outermost valid bins.
        return min(max(raw, 0), n_bins - 1)

    return _bin_index(0), _bin_index(1)



t_max = 10000


def run_episode(env, policy=None, render=False, discount=None, max_steps=None):
    """Play a single episode and return its discounted return.

    Parameters
    ----------
    env : Gym-style environment
        Needs ``reset()``, ``step(action)`` returning ``(obs, reward, done, info)``,
        ``action_space.sample()`` and, when ``render`` is true, ``render()``.
    policy : 2-D indexable, optional
        ``policy[a][b]`` is the action for the discretized state ``(a, b)``
        returned by ``obs_to_state(env, obs)``. With ``None`` a random action is
        drawn from ``env.action_space`` at every step.
    render : bool, optional
        When true, ``env.render()`` is called before each step.
    discount : float, optional
        Discount factor. Defaults to the module-level ``gamma`` looked up at call
        time (the original behaviour); pass it explicitly to avoid depending on
        whichever cell assigned ``gamma`` last.
    max_steps : int, optional
        Hard cap on the number of steps. Defaults to the module-level ``t_max``.

    Returns
    -------
    The sum over steps k = 0, 1, ... of ``discount**k * reward_k``.
    """
    if discount is None:
        discount = gamma
    if max_steps is None:
        max_steps = t_max

    obs = env.reset()
    total_reward = 0
    for step in range(max_steps):
        if render:
            env.render()
        if policy is None:
            action = env.action_space.sample()
        else:
            a, b = obs_to_state(env, obs)
            action = policy[a][b]
        obs, reward, done, _ = env.step(action)
        total_reward += discount ** step * reward
        if done:
            break
    return total_reward



# Training hyper-parameters.
iter_max = int(1e6)   # maximum number of training episodes
initial_lr = 1.0      # learning rate before any decay
min_lr = 0.003        # lower bound of the decayed learning rate
gamma = 1.0           # discount factor (run_episode reads this global too)
eps = 0.1             # probability of a random action (epsilon-greedy)
env.seed(0)
np.random.seed(0)

if off_policy:
    print ('----- using Q Learning -----')
else:
    print('------ using SARSA Learning ---')

# One action value per (first-dimension bin, second-dimension bin, action).
# The action count comes from the environment rather than a literal 3.
q_table = np.zeros((n_states, n_states, env.action_space.n))

returns = []  # undiscounted total reward of every finished training episode
for i in range(iter_max):
    obs = env.reset()
    total_reward = 0
    # eta: learning rate, multiplied by 0.85 every 100 episodes and floored at min_lr
    eta = max(min_lr, initial_lr * (0.85**(i//100)))

    for j in range(t_max):
        a, b = obs_to_state(env, obs)
        # epsilon-greedy behaviour policy
        if np.random.uniform(0, 1) < eps:
            action = np.random.choice(env.action_space.n)
        else:
            action = np.argmax(q_table[a][b])

        obs, reward, done, _ = env.step(action)
        total_reward += reward

        # update q table
        a_, b_ = obs_to_state(env, obs)
        if off_policy:
            # q-learning: bootstrap from the greedy value of the next state
            bootstrap = np.max(q_table[a_][b_])
        else:
            # Sarsa: bootstrap from an epsilon-greedy action in the next state.
            # Note that this sampled action is only used for the target; the
            # action actually executed is drawn again at the top of the loop.
            if np.random.uniform(0, 1) < eps:
                action_ = np.random.choice(env.action_space.n)
            else:
                action_ = np.argmax(q_table[a_][b_])
            bootstrap = q_table[a_][b_][action_]
        # (1 - done) drops the bootstrap term on the last transition of an episode.
        # NOTE(review): if the environment also reports a time-limit cut-off
        # through `done`, a timeout is treated like a true terminal state here
        # -- confirm against the installed gym version.
        target = reward + gamma * bootstrap * (1 - done)
        q_table[a][b][action] = q_table[a][b][action] + eta * (target - q_table[a][b][action])

        if done:
            returns.append(total_reward)
            break

    if i % 200 == 0:
        print('Iteration #%d -- Total reward = %d.' %(i+1, total_reward))

    # "Solved" means an average of at least -110 over 100 consecutive episodes,
    # so wait until 100 episodes are recorded and use the same >= comparison
    # as the evaluation step that follows training.
    if len(returns) >= 100 and np.mean(returns[-100:]) >= -110:
        break
    
# Greedy policy: best action for every (a, b) bin pair of the Q-table.
solution_policy = q_table.argmax(axis=2)

# Score the greedy policy over 100 episodes with rendering switched off.
solution_policy_scores = [
    run_episode(env, solution_policy, render=False)
    for _ in range(100)
]
print("Average score of solution = ", np.mean(solution_policy_scores))

# Animate it -- two rendered episodes, only when the mean score reaches -110.
if np.mean(solution_policy_scores) >= -110:
    for _ in range(2):
        run_episode(env, solution_policy, render=True)
env.close()

----- using Q Learning -----
Iteration #1 -- Total reward = -200.
Iteration #201 -- Total reward = -200.
Iteration #401 -- Total reward = -200.
Iteration #601 -- Total reward = -200.
Iteration #801 -- Total reward = -200.
Iteration #1001 -- Total reward = -200.
Iteration #1201 -- Total reward = -200.
Iteration #1401 -- Total reward = -200.
Iteration #1601 -- Total reward = -200.
Iteration #1801 -- Total reward = -200.
Iteration #2001 -- Total reward = -160.
Iteration #2201 -- Total reward = -191.
Iteration #2401 -- Total reward = -156.
Iteration #2601 -- Total reward = -200.
Iteration #2801 -- Total reward = -154.
Iteration #3001 -- Total reward = -151.
Iteration #3201 -- Total reward = -158.
Iteration #3401 -- Total reward = -158.
Iteration #3601 -- Total reward = -157.
Iteration #3801 -- Total reward = -191.
Iteration #4001 -- Total reward = -200.
Iteration #4201 -- Total reward = -200.
Iteration #4401 -- Total reward = -200.
Iteration #4601 -- Total reward = -163.
Iteration #4801 --

Iteration #40201 -- Total reward = -200.
Iteration #40401 -- Total reward = -151.
Iteration #40601 -- Total reward = -200.
Iteration #40801 -- Total reward = -153.
Iteration #41001 -- Total reward = -200.
Iteration #41201 -- Total reward = -200.
Iteration #41401 -- Total reward = -198.
Iteration #41601 -- Total reward = -200.
Iteration #41801 -- Total reward = -193.
Iteration #42001 -- Total reward = -200.
Iteration #42201 -- Total reward = -191.
Iteration #42401 -- Total reward = -200.
Iteration #42601 -- Total reward = -200.
Iteration #42801 -- Total reward = -186.
Iteration #43001 -- Total reward = -193.
Iteration #43201 -- Total reward = -196.
Iteration #43401 -- Total reward = -200.
Iteration #43601 -- Total reward = -200.
Iteration #43801 -- Total reward = -200.
Iteration #44001 -- Total reward = -200.
Iteration #44201 -- Total reward = -200.
Iteration #44401 -- Total reward = -200.
Iteration #44601 -- Total reward = -190.
Iteration #44801 -- Total reward = -200.
Iteration #45001

Iteration #80201 -- Total reward = -199.
Iteration #80401 -- Total reward = -158.
Iteration #80601 -- Total reward = -192.
Iteration #80801 -- Total reward = -152.
Iteration #81001 -- Total reward = -161.
Iteration #81201 -- Total reward = -155.
Iteration #81401 -- Total reward = -155.
Iteration #81601 -- Total reward = -157.
Iteration #81801 -- Total reward = -196.
Iteration #82001 -- Total reward = -158.
Iteration #82201 -- Total reward = -165.
Iteration #82401 -- Total reward = -197.
Iteration #82601 -- Total reward = -158.
Iteration #82801 -- Total reward = -191.
Iteration #83001 -- Total reward = -153.
Iteration #83201 -- Total reward = -158.
Iteration #83401 -- Total reward = -158.
Iteration #83601 -- Total reward = -154.
Iteration #83801 -- Total reward = -192.
Iteration #84001 -- Total reward = -162.
Iteration #84201 -- Total reward = -200.
Iteration #84401 -- Total reward = -162.
Iteration #84601 -- Total reward = -196.
Iteration #84801 -- Total reward = -155.
Iteration #85001

Iteration #119801 -- Total reward = -116.
Iteration #120001 -- Total reward = -118.
Iteration #120201 -- Total reward = -149.
Iteration #120401 -- Total reward = -155.
Iteration #120601 -- Total reward = -125.
Iteration #120801 -- Total reward = -186.
Iteration #121001 -- Total reward = -157.
Iteration #121201 -- Total reward = -198.
Iteration #121401 -- Total reward = -158.
Iteration #121601 -- Total reward = -193.
Iteration #121801 -- Total reward = -159.
Iteration #122001 -- Total reward = -150.
Iteration #122201 -- Total reward = -119.
Iteration #122401 -- Total reward = -161.
Iteration #122601 -- Total reward = -132.
Iteration #122801 -- Total reward = -162.
Iteration #123001 -- Total reward = -154.
Iteration #123201 -- Total reward = -156.
Iteration #123401 -- Total reward = -197.
Iteration #123601 -- Total reward = -184.
Iteration #123801 -- Total reward = -164.
Iteration #124001 -- Total reward = -183.
Iteration #124201 -- Total reward = -179.
Iteration #124401 -- Total reward 

Iteration #159001 -- Total reward = -118.
Iteration #159201 -- Total reward = -120.
Iteration #159401 -- Total reward = -200.
Iteration #159601 -- Total reward = -156.
Iteration #159801 -- Total reward = -189.
Iteration #160001 -- Total reward = -156.
Iteration #160201 -- Total reward = -122.
Iteration #160401 -- Total reward = -155.
Iteration #160601 -- Total reward = -192.
Iteration #160801 -- Total reward = -139.
Iteration #161001 -- Total reward = -121.
Iteration #161201 -- Total reward = -154.
Iteration #161401 -- Total reward = -115.
Iteration #161601 -- Total reward = -118.
Iteration #161801 -- Total reward = -113.
Iteration #162001 -- Total reward = -164.
Iteration #162201 -- Total reward = -154.
Iteration #162401 -- Total reward = -157.
Iteration #162601 -- Total reward = -170.
Iteration #162801 -- Total reward = -150.
Iteration #163001 -- Total reward = -158.
Iteration #163201 -- Total reward = -158.
Iteration #163401 -- Total reward = -154.
Iteration #163601 -- Total reward 

Iteration #198201 -- Total reward = -115.
Iteration #198401 -- Total reward = -148.
Iteration #198601 -- Total reward = -121.
Iteration #198801 -- Total reward = -118.
Iteration #199001 -- Total reward = -181.
Iteration #199201 -- Total reward = -115.
Iteration #199401 -- Total reward = -115.
Iteration #199601 -- Total reward = -154.
Iteration #199801 -- Total reward = -152.
Iteration #200001 -- Total reward = -154.
Iteration #200201 -- Total reward = -146.
Iteration #200401 -- Total reward = -156.
Iteration #200601 -- Total reward = -146.
Iteration #200801 -- Total reward = -148.
Iteration #201001 -- Total reward = -121.
Iteration #201201 -- Total reward = -152.
Iteration #201401 -- Total reward = -156.
Iteration #201601 -- Total reward = -118.
Iteration #201801 -- Total reward = -155.
Iteration #202001 -- Total reward = -120.
Iteration #202201 -- Total reward = -119.
Iteration #202401 -- Total reward = -153.
Iteration #202601 -- Total reward = -153.
Iteration #202801 -- Total reward 

Iteration #237401 -- Total reward = -153.
Iteration #237601 -- Total reward = -118.
Iteration #237801 -- Total reward = -120.
Iteration #238001 -- Total reward = -149.
Iteration #238201 -- Total reward = -119.
Iteration #238401 -- Total reward = -119.
Iteration #238601 -- Total reward = -150.
Iteration #238801 -- Total reward = -118.
Iteration #239001 -- Total reward = -154.
Iteration #239201 -- Total reward = -148.
Iteration #239401 -- Total reward = -157.
Iteration #239601 -- Total reward = -123.
Iteration #239801 -- Total reward = -119.
Iteration #240001 -- Total reward = -119.
Iteration #240201 -- Total reward = -157.
Iteration #240401 -- Total reward = -150.
Iteration #240601 -- Total reward = -91.
Iteration #240801 -- Total reward = -158.
Iteration #241001 -- Total reward = -119.
Iteration #241201 -- Total reward = -146.
Iteration #241401 -- Total reward = -154.
Iteration #241601 -- Total reward = -154.
Iteration #241801 -- Total reward = -154.
Iteration #242001 -- Total reward =

Iteration #276601 -- Total reward = -157.
Iteration #276801 -- Total reward = -116.
Iteration #277001 -- Total reward = -161.
Iteration #277201 -- Total reward = -144.
Iteration #277401 -- Total reward = -159.
Iteration #277601 -- Total reward = -116.
Iteration #277801 -- Total reward = -114.
Iteration #278001 -- Total reward = -154.
Iteration #278201 -- Total reward = -150.
Iteration #278401 -- Total reward = -153.
Iteration #278601 -- Total reward = -122.
Iteration #278801 -- Total reward = -115.
Iteration #279001 -- Total reward = -146.
Iteration #279201 -- Total reward = -153.
Iteration #279401 -- Total reward = -155.
Iteration #279601 -- Total reward = -118.
Iteration #279801 -- Total reward = -157.
Iteration #280001 -- Total reward = -139.
Iteration #280201 -- Total reward = -149.
Iteration #280401 -- Total reward = -154.
Iteration #280601 -- Total reward = -153.
Iteration #280801 -- Total reward = -116.
Iteration #281001 -- Total reward = -153.
Iteration #281201 -- Total reward 

Iteration #315801 -- Total reward = -116.
Iteration #316001 -- Total reward = -160.
Iteration #316201 -- Total reward = -123.
Iteration #316401 -- Total reward = -120.
Iteration #316601 -- Total reward = -118.
Iteration #316801 -- Total reward = -160.
Iteration #317001 -- Total reward = -151.
Iteration #317201 -- Total reward = -139.
Iteration #317401 -- Total reward = -142.
Iteration #317601 -- Total reward = -147.
Iteration #317801 -- Total reward = -121.
Iteration #318001 -- Total reward = -156.
Iteration #318201 -- Total reward = -160.
Iteration #318401 -- Total reward = -149.
Iteration #318601 -- Total reward = -143.
Iteration #318801 -- Total reward = -164.
Iteration #319001 -- Total reward = -147.
Iteration #319201 -- Total reward = -144.
Iteration #319401 -- Total reward = -171.
Iteration #319601 -- Total reward = -161.
Iteration #319801 -- Total reward = -123.
Iteration #320001 -- Total reward = -118.
Iteration #320201 -- Total reward = -155.
Iteration #320401 -- Total reward 

Iteration #355001 -- Total reward = -155.
Iteration #355201 -- Total reward = -152.
Iteration #355401 -- Total reward = -152.
Iteration #355601 -- Total reward = -144.
Iteration #355801 -- Total reward = -149.
Iteration #356001 -- Total reward = -111.
Iteration #356201 -- Total reward = -152.
Iteration #356401 -- Total reward = -150.
Iteration #356601 -- Total reward = -149.
Iteration #356801 -- Total reward = -143.
Iteration #357001 -- Total reward = -150.
Iteration #357201 -- Total reward = -150.
Iteration #357401 -- Total reward = -152.
Iteration #357601 -- Total reward = -156.
Iteration #357801 -- Total reward = -157.
Iteration #358001 -- Total reward = -150.
Iteration #358201 -- Total reward = -149.
Iteration #358401 -- Total reward = -148.
Iteration #358601 -- Total reward = -155.
Iteration #358801 -- Total reward = -150.
Iteration #359001 -- Total reward = -154.
Iteration #359201 -- Total reward = -184.
Iteration #359401 -- Total reward = -153.
Iteration #359601 -- Total reward 

Iteration #394201 -- Total reward = -116.
Iteration #394401 -- Total reward = -164.
Iteration #394601 -- Total reward = -147.
Iteration #394801 -- Total reward = -108.
Iteration #395001 -- Total reward = -136.
Iteration #395201 -- Total reward = -129.
Iteration #395401 -- Total reward = -156.
Iteration #395601 -- Total reward = -158.
Iteration #395801 -- Total reward = -158.
Iteration #396001 -- Total reward = -145.
Iteration #396201 -- Total reward = -112.
Iteration #396401 -- Total reward = -117.
Iteration #396601 -- Total reward = -135.
Iteration #396801 -- Total reward = -148.
Iteration #397001 -- Total reward = -111.
Iteration #397201 -- Total reward = -119.
Iteration #397401 -- Total reward = -112.
Iteration #397601 -- Total reward = -110.
Iteration #397801 -- Total reward = -163.
Iteration #398001 -- Total reward = -193.
Iteration #398201 -- Total reward = -114.
Iteration #398401 -- Total reward = -192.
Iteration #398601 -- Total reward = -150.
Iteration #398801 -- Total reward 

Iteration #433401 -- Total reward = -159.
Iteration #433601 -- Total reward = -148.
Iteration #433801 -- Total reward = -149.
Iteration #434001 -- Total reward = -162.
Iteration #434201 -- Total reward = -153.
Iteration #434401 -- Total reward = -144.
Iteration #434601 -- Total reward = -152.
Iteration #434801 -- Total reward = -153.
Iteration #435001 -- Total reward = -153.
Iteration #435201 -- Total reward = -122.
Iteration #435401 -- Total reward = -163.
Iteration #435601 -- Total reward = -155.
Iteration #435801 -- Total reward = -146.
Iteration #436001 -- Total reward = -146.
Iteration #436201 -- Total reward = -142.
Iteration #436401 -- Total reward = -162.
Iteration #436601 -- Total reward = -153.
Iteration #436801 -- Total reward = -133.
Iteration #437001 -- Total reward = -146.
Iteration #437201 -- Total reward = -147.
Iteration #437401 -- Total reward = -142.
Iteration #437601 -- Total reward = -137.
Iteration #437801 -- Total reward = -149.
Iteration #438001 -- Total reward 

Iteration #472601 -- Total reward = -117.
Iteration #472801 -- Total reward = -148.
Iteration #473001 -- Total reward = -103.
Iteration #473201 -- Total reward = -118.
Iteration #473401 -- Total reward = -152.
Iteration #473601 -- Total reward = -155.
Iteration #473801 -- Total reward = -157.
Iteration #474001 -- Total reward = -162.
Iteration #474201 -- Total reward = -117.
Iteration #474401 -- Total reward = -163.
Iteration #474601 -- Total reward = -111.
Iteration #474801 -- Total reward = -120.
Iteration #475001 -- Total reward = -98.
Iteration #475201 -- Total reward = -162.
Iteration #475401 -- Total reward = -147.
Iteration #475601 -- Total reward = -122.
Iteration #475801 -- Total reward = -91.
Iteration #476001 -- Total reward = -155.
Iteration #476201 -- Total reward = -151.
Iteration #476401 -- Total reward = -157.
Iteration #476601 -- Total reward = -96.
Iteration #476801 -- Total reward = -142.
Iteration #477001 -- Total reward = -160.
Iteration #477201 -- Total reward = -

Iteration #511801 -- Total reward = -145.
Iteration #512001 -- Total reward = -156.
Iteration #512201 -- Total reward = -137.
Iteration #512401 -- Total reward = -151.
Iteration #512601 -- Total reward = -150.
Iteration #512801 -- Total reward = -161.
Iteration #513001 -- Total reward = -148.
Iteration #513201 -- Total reward = -112.
Iteration #513401 -- Total reward = -146.
Iteration #513601 -- Total reward = -149.
Iteration #513801 -- Total reward = -153.
Iteration #514001 -- Total reward = -148.
Iteration #514201 -- Total reward = -143.
Iteration #514401 -- Total reward = -155.
Iteration #514601 -- Total reward = -156.
Iteration #514801 -- Total reward = -148.
Iteration #515001 -- Total reward = -158.
Iteration #515201 -- Total reward = -157.
Iteration #515401 -- Total reward = -152.
Iteration #515601 -- Total reward = -158.
Iteration #515801 -- Total reward = -185.
Iteration #516001 -- Total reward = -154.
Iteration #516201 -- Total reward = -144.
Iteration #516401 -- Total reward 

Iteration #551001 -- Total reward = -101.
Iteration #551201 -- Total reward = -150.
Iteration #551401 -- Total reward = -147.
Iteration #551601 -- Total reward = -178.
Iteration #551801 -- Total reward = -93.
Iteration #552001 -- Total reward = -154.
Iteration #552201 -- Total reward = -95.
Iteration #552401 -- Total reward = -114.
Iteration #552601 -- Total reward = -129.
Iteration #552801 -- Total reward = -163.
Iteration #553001 -- Total reward = -148.
Iteration #553201 -- Total reward = -148.
Iteration #553401 -- Total reward = -157.
Iteration #553601 -- Total reward = -117.
Iteration #553801 -- Total reward = -97.
Iteration #554001 -- Total reward = -141.
Iteration #554201 -- Total reward = -152.
Iteration #554401 -- Total reward = -136.
Iteration #554601 -- Total reward = -139.
Iteration #554801 -- Total reward = -162.
Iteration #555001 -- Total reward = -114.
Iteration #555201 -- Total reward = -111.
Iteration #555401 -- Total reward = -99.
Iteration #555601 -- Total reward = -1

Iteration #590201 -- Total reward = -154.
Iteration #590401 -- Total reward = -148.
Iteration #590601 -- Total reward = -128.
Iteration #590801 -- Total reward = -156.
Iteration #591001 -- Total reward = -163.
Iteration #591201 -- Total reward = -157.
Iteration #591401 -- Total reward = -144.
Iteration #591601 -- Total reward = -121.
Iteration #591801 -- Total reward = -156.
Iteration #592001 -- Total reward = -111.
Iteration #592201 -- Total reward = -200.
Iteration #592401 -- Total reward = -161.
Iteration #592601 -- Total reward = -172.
Iteration #592801 -- Total reward = -110.
Iteration #593001 -- Total reward = -114.
Iteration #593201 -- Total reward = -154.
Iteration #593401 -- Total reward = -146.
Iteration #593601 -- Total reward = -148.
Iteration #593801 -- Total reward = -160.
Iteration #594001 -- Total reward = -126.
Iteration #594201 -- Total reward = -135.
Iteration #594401 -- Total reward = -140.
Iteration #594601 -- Total reward = -146.
Iteration #594801 -- Total reward 

Iteration #629401 -- Total reward = -158.
Iteration #629601 -- Total reward = -141.
Iteration #629801 -- Total reward = -139.
Iteration #630001 -- Total reward = -96.
Iteration #630201 -- Total reward = -116.
Iteration #630401 -- Total reward = -154.
Iteration #630601 -- Total reward = -157.
Iteration #630801 -- Total reward = -153.
Iteration #631001 -- Total reward = -153.
Iteration #631201 -- Total reward = -175.
Iteration #631401 -- Total reward = -143.
Iteration #631601 -- Total reward = -146.
Iteration #631801 -- Total reward = -113.
Iteration #632001 -- Total reward = -121.
Iteration #632201 -- Total reward = -159.
Iteration #632401 -- Total reward = -148.
Iteration #632601 -- Total reward = -146.
Iteration #632801 -- Total reward = -165.
Iteration #633001 -- Total reward = -118.
Iteration #633201 -- Total reward = -139.
Iteration #633401 -- Total reward = -152.
Iteration #633601 -- Total reward = -118.
Iteration #633801 -- Total reward = -136.
Iteration #634001 -- Total reward =

Iteration #668601 -- Total reward = -144.
Iteration #668801 -- Total reward = -145.
Iteration #669001 -- Total reward = -112.
Iteration #669201 -- Total reward = -108.
Iteration #669401 -- Total reward = -146.
Iteration #669601 -- Total reward = -143.
Iteration #669801 -- Total reward = -110.
Iteration #670001 -- Total reward = -144.
Iteration #670201 -- Total reward = -144.
Iteration #670401 -- Total reward = -111.
Iteration #670601 -- Total reward = -146.
Iteration #670801 -- Total reward = -145.
Iteration #671001 -- Total reward = -115.
Iteration #671201 -- Total reward = -144.
Iteration #671401 -- Total reward = -118.
Iteration #671601 -- Total reward = -116.
Iteration #671801 -- Total reward = -116.
Iteration #672001 -- Total reward = -114.
Iteration #672201 -- Total reward = -112.
Iteration #672401 -- Total reward = -119.
Iteration #672601 -- Total reward = -137.
Iteration #672801 -- Total reward = -142.
Iteration #673001 -- Total reward = -137.
Iteration #673201 -- Total reward 

Iteration #707801 -- Total reward = -110.
Iteration #708001 -- Total reward = -148.
Iteration #708201 -- Total reward = -160.
Iteration #708401 -- Total reward = -141.
Iteration #708601 -- Total reward = -162.
Iteration #708801 -- Total reward = -148.
Iteration #709001 -- Total reward = -97.
Iteration #709201 -- Total reward = -120.
Iteration #709401 -- Total reward = -112.
Iteration #709601 -- Total reward = -115.
Iteration #709801 -- Total reward = -150.
Iteration #710001 -- Total reward = -112.
Iteration #710201 -- Total reward = -111.
Iteration #710401 -- Total reward = -115.
Iteration #710601 -- Total reward = -143.
Iteration #710801 -- Total reward = -112.
Iteration #711001 -- Total reward = -89.
Iteration #711201 -- Total reward = -90.
Iteration #711401 -- Total reward = -160.
Iteration #711601 -- Total reward = -92.
Iteration #711801 -- Total reward = -172.
Iteration #712001 -- Total reward = -150.
Iteration #712201 -- Total reward = -111.
Iteration #712401 -- Total reward = -1

Iteration #747001 -- Total reward = -179.
Iteration #747201 -- Total reward = -150.
Iteration #747401 -- Total reward = -145.
Iteration #747601 -- Total reward = -98.
Iteration #747801 -- Total reward = -112.
Iteration #748001 -- Total reward = -111.
Iteration #748201 -- Total reward = -143.
Iteration #748401 -- Total reward = -107.
Iteration #748601 -- Total reward = -90.
Iteration #748801 -- Total reward = -177.
Iteration #749001 -- Total reward = -110.
Iteration #749201 -- Total reward = -111.
Iteration #749401 -- Total reward = -160.
Iteration #749601 -- Total reward = -118.
Iteration #749801 -- Total reward = -112.
Iteration #750001 -- Total reward = -109.
Iteration #750201 -- Total reward = -112.
Iteration #750401 -- Total reward = -118.
Iteration #750601 -- Total reward = -143.
Iteration #750801 -- Total reward = -144.
Iteration #751001 -- Total reward = -151.
Iteration #751201 -- Total reward = -151.
Iteration #751401 -- Total reward = -116.
Iteration #751601 -- Total reward = 

Iteration #786201 -- Total reward = -112.
Iteration #786401 -- Total reward = -156.
Iteration #786601 -- Total reward = -122.
Iteration #786801 -- Total reward = -120.
Iteration #787001 -- Total reward = -103.
Iteration #787201 -- Total reward = -176.
Iteration #787401 -- Total reward = -120.
Iteration #787601 -- Total reward = -124.
Iteration #787801 -- Total reward = -159.
Iteration #788001 -- Total reward = -121.
Iteration #788201 -- Total reward = -158.
Iteration #788401 -- Total reward = -190.
Iteration #788601 -- Total reward = -125.
Iteration #788801 -- Total reward = -123.
Iteration #789001 -- Total reward = -149.
Iteration #789201 -- Total reward = -101.
Iteration #789401 -- Total reward = -148.
Iteration #789601 -- Total reward = -97.
Iteration #789801 -- Total reward = -94.
Iteration #790001 -- Total reward = -127.
Iteration #790201 -- Total reward = -149.
Iteration #790401 -- Total reward = -145.
Iteration #790601 -- Total reward = -144.
Iteration #790801 -- Total reward = 

Iteration #825401 -- Total reward = -143.
Iteration #825601 -- Total reward = -156.
Iteration #825801 -- Total reward = -115.
Iteration #826001 -- Total reward = -157.
Iteration #826201 -- Total reward = -116.
Iteration #826401 -- Total reward = -128.
Iteration #826601 -- Total reward = -153.
Iteration #826801 -- Total reward = -154.
Iteration #827001 -- Total reward = -151.
Iteration #827201 -- Total reward = -150.
Iteration #827401 -- Total reward = -153.
Iteration #827601 -- Total reward = -116.
Iteration #827801 -- Total reward = -152.
Iteration #828001 -- Total reward = -148.
Iteration #828201 -- Total reward = -149.
Iteration #828401 -- Total reward = -153.
Iteration #828601 -- Total reward = -150.
Iteration #828801 -- Total reward = -149.
Iteration #829001 -- Total reward = -144.
Iteration #829201 -- Total reward = -151.
Iteration #829401 -- Total reward = -149.
Iteration #829601 -- Total reward = -153.
Iteration #829801 -- Total reward = -146.
Iteration #830001 -- Total reward 

Iteration #864601 -- Total reward = -112.
Iteration #864801 -- Total reward = -145.
Iteration #865001 -- Total reward = -153.
Iteration #865201 -- Total reward = -107.
Iteration #865401 -- Total reward = -143.
Iteration #865601 -- Total reward = -116.
Iteration #865801 -- Total reward = -146.
Iteration #866001 -- Total reward = -153.
Iteration #866201 -- Total reward = -109.
Iteration #866401 -- Total reward = -144.
Iteration #866601 -- Total reward = -142.
Iteration #866801 -- Total reward = -144.
Iteration #867001 -- Total reward = -119.
Iteration #867201 -- Total reward = -111.
Iteration #867401 -- Total reward = -156.
Iteration #867601 -- Total reward = -145.
Iteration #867801 -- Total reward = -144.
Iteration #868001 -- Total reward = -147.
Iteration #868201 -- Total reward = -150.
Iteration #868401 -- Total reward = -145.
Iteration #868601 -- Total reward = -151.
Iteration #868801 -- Total reward = -147.
Iteration #869001 -- Total reward = -142.
Iteration #869201 -- Total reward 

Iteration #903801 -- Total reward = -148.
Iteration #904001 -- Total reward = -114.
Iteration #904201 -- Total reward = -120.
Iteration #904401 -- Total reward = -112.
Iteration #904601 -- Total reward = -113.
Iteration #904801 -- Total reward = -112.
Iteration #905001 -- Total reward = -113.
Iteration #905201 -- Total reward = -116.
Iteration #905401 -- Total reward = -99.
Iteration #905601 -- Total reward = -118.
Iteration #905801 -- Total reward = -153.
Iteration #906001 -- Total reward = -119.
Iteration #906201 -- Total reward = -147.
Iteration #906401 -- Total reward = -115.
Iteration #906601 -- Total reward = -114.
Iteration #906801 -- Total reward = -114.
Iteration #907001 -- Total reward = -144.
Iteration #907201 -- Total reward = -115.
Iteration #907401 -- Total reward = -144.
Iteration #907601 -- Total reward = -114.
Iteration #907801 -- Total reward = -114.
Iteration #908001 -- Total reward = -141.
Iteration #908201 -- Total reward = -145.
Iteration #908401 -- Total reward =

Iteration #943001 -- Total reward = -163.
Iteration #943201 -- Total reward = -156.
Iteration #943401 -- Total reward = -147.
Iteration #943601 -- Total reward = -110.
Iteration #943801 -- Total reward = -151.
Iteration #944001 -- Total reward = -181.
Iteration #944201 -- Total reward = -92.
Iteration #944401 -- Total reward = -114.
Iteration #944601 -- Total reward = -167.
Iteration #944801 -- Total reward = -95.
Iteration #945001 -- Total reward = -92.
Iteration #945201 -- Total reward = -108.
Iteration #945401 -- Total reward = -117.
Iteration #945601 -- Total reward = -158.
Iteration #945801 -- Total reward = -89.
Iteration #946001 -- Total reward = -110.
Iteration #946201 -- Total reward = -150.
Iteration #946401 -- Total reward = -89.
Iteration #946601 -- Total reward = -109.
Iteration #946801 -- Total reward = -148.
Iteration #947001 -- Total reward = -155.
Iteration #947201 -- Total reward = -147.
Iteration #947401 -- Total reward = -112.
Iteration #947601 -- Total reward = -14

Iteration #982201 -- Total reward = -111.
Iteration #982401 -- Total reward = -113.
Iteration #982601 -- Total reward = -111.
Iteration #982801 -- Total reward = -109.
Iteration #983001 -- Total reward = -144.
Iteration #983201 -- Total reward = -177.
Iteration #983401 -- Total reward = -145.
Iteration #983601 -- Total reward = -123.
Iteration #983801 -- Total reward = -111.
Iteration #984001 -- Total reward = -109.
Iteration #984201 -- Total reward = -112.
Iteration #984401 -- Total reward = -118.
Iteration #984601 -- Total reward = -145.
Iteration #984801 -- Total reward = -172.
Iteration #985001 -- Total reward = -142.
Iteration #985201 -- Total reward = -141.
Iteration #985401 -- Total reward = -160.
Iteration #985601 -- Total reward = -149.
Iteration #985801 -- Total reward = -146.
Iteration #986001 -- Total reward = -158.
Iteration #986201 -- Total reward = -144.
Iteration #986401 -- Total reward = -109.
Iteration #986601 -- Total reward = -126.
Iteration #986801 -- Total reward 

In [6]:
# Re-evaluate the greedy policy: mean of run_episode's result over 100 more
# episodes with rendering disabled. The value is shown as the cell output.
np.mean([run_episode(env, solution_policy, False) for _ in range(100)])

-112.7

## test

In [None]:
import gym
import numpy as np

# Q-table read from disk; the file name suggests it was saved from a
# Q-learning run on MountainCar-v0.
# NOTE(review): no np.save call is visible in this part of the notebook --
# confirm which step writes this file, and that the 'mountiancar' spelling
# matches the file actually on disk.
q_table = np.load('mountiancar-v0_Qtable_Qlearning.npy')

# Fresh environment instance used for the evaluation below.
name = "MountainCar-v0"
env = gym.make(name)

n_states = 40  # number of bins per observation dimension
def obs_to_state(env, obs):
    """
    Map a continuous observation to a pair of discrete bin indices.

    Each of the two observation dimensions is split into ``n_states``
    equal-width bins between ``env.observation_space.low`` and
    ``env.observation_space.high``.

    Args:
        env: environment whose ``observation_space`` exposes ``low`` and
            ``high`` arrays with (at least) two entries.
        obs: observation; ``obs[0]`` and ``obs[1]`` are discretised.

    Returns:
        Tuple ``(a, b)`` of Python ints, each in ``[0, n_states - 1]``.
    """
    env_low = env.observation_space.low
    env_high = env.observation_space.high
    env_dx = (env_high - env_low) / n_states
    a = int((obs[0] - env_low[0])/env_dx[0])
    b = int((obs[1] - env_low[1])/env_dx[1])
    # An observation sitting exactly on the upper bound divides out to
    # n_states, one past the last valid bin; clamp so the result can always
    # index an (n_states, n_states, ...) table. Values below the lower bound
    # are clamped to 0 instead of silently wrapping as negative indices.
    a = min(max(a, 0), n_states - 1)
    b = min(max(b, 0), n_states - 1)
    return a, b

def run_episode(env, policy=None, render=False, gamma=1.0, t_max=10000):
    """
    Play a single episode and return its discounted return.

    ``gamma`` and ``t_max`` are keyword parameters so that this cell no longer
    depends on globals that only the training cell defines; their defaults
    equal the values used during training.

    Args:
        env: environment providing ``reset``, ``step``, ``render`` and
            ``action_space.sample``.
        policy: table indexed as ``policy[a][b]`` with the bin indices from
            ``obs_to_state``; when ``None`` actions are sampled at random.
        render: if true, ``env.render()`` is called before every step.
        gamma: discount factor applied as ``gamma**t`` to the reward of step t.
        t_max: maximum number of steps before the episode is cut off.

    Returns:
        Sum over the episode of ``gamma**t * reward_t``.
    """
    obs = env.reset()
    total_reward = 0
    # The loop index doubles as the exponent of the discount factor.
    for step in range(t_max):
        if render:
            env.render()
        if policy is None:
            action = env.action_space.sample()
        else:
            a, b = obs_to_state(env, obs)
            action = policy[a][b]
        obs, reward, done, _ = env.step(action)
        total_reward += gamma**step * reward

        if done:
            break
    return total_reward

# Greedy policy: best action for every (a, b) bin pair of the loaded Q-table.
solution_policy = np.argmax(q_table, axis=2)

# Mean result over 100 episodes with rendering switched off.
print(np.mean([
    run_episode(env, solution_policy, render=False)
    for _ in range(100)
]))

# One rendered episode; its result is the cell's displayed output.
run_episode(env, solution_policy, render=True)