In [1]:
import os

import numpy as np


# Discount factor read by the value-iteration and policy code in this notebook.
gamma = 0.8

# maze
# Each state maps to the moves available from it, written as
# (next_state, reward) pairs.
G = {
    1: [(2, 0)],
    2: [(1, 0), (3, 0), (4, 0)],
    3: [(5, 50)],
    4: [(6, 100)],
    5: [(4, 0), (7, 0)],
    6: [(6, 0)],
    7: [(6, 100)],
}

# state-value function
# v - old state-value function, s - state
def v_pi(s, v, graph=None, discount=None):
    """Return the one-step optimal backup ``max_a [r + discount * v[s']]`` for ``s``.

    Args:
        s: State being backed up; must be a key of the graph.
        v: Mapping from state to its current (old) value estimate.
        graph: Optional mapping ``state -> [(next_state, reward), ...]``.
            Defaults to the notebook-level maze ``G``.
        discount: Optional discount factor. Defaults to the notebook-level
            ``gamma`` as it is bound at call time.

    Returns:
        The largest backed-up value over the moves available in ``s``, or
        ``0.0`` when ``s`` has no outgoing moves.
    """
    if graph is None:
        graph = G
    if discount is None:
        discount = gamma

    # Maximise over the actual action values instead of seeding the running
    # maximum with 0, which would clamp negative returns to 0.
    return max((r + discount * v[s_new] for s_new, r in graph[s]), default=0.0)


def compute_state_value_function(iter_cnt, graph=None, discount=None):
    """Run ``iter_cnt`` synchronous value-iteration sweeps starting from all zeros.

    Args:
        iter_cnt: Number of full sweeps over all states.
        graph: Optional mapping ``state -> [(next_state, reward), ...]``.
            Defaults to the notebook-level maze ``G``.
        discount: Optional discount factor. Defaults to the notebook-level
            ``gamma`` as it is bound at call time.

    Returns:
        Dict mapping every state of the graph to its value after the last
        sweep (all zeros when ``iter_cnt`` is 0).
    """
    if graph is None:
        graph = G
    if discount is None:
        discount = gamma

    values = {s: 0 for s in graph}
    for _ in range(iter_cnt):
        # Build a fresh dict every sweep so each backup reads only the
        # previous sweep's values; reusing one dict would silently turn this
        # into an in-place update whose result depends on state order.
        values = {s: v_pi(s, values, graph, discount) for s in graph}
    return values

a) Compute the $V^*$ values for each state with discount factor $\gamma = 0.8$.

In [2]:
# Part (a): state values after 20 sweeps, using the current notebook-level gamma.
v = compute_state_value_function(iter_cnt=20)
print(v)

{1: 72.96000000000001, 2: 91.2, 3: 114.0, 4: 100.0, 5: 80.0, 6: 0, 7: 100.0}


In [3]:
def compute_optimal_policy(v, graph=None, discount=None):
    """Return the greedy policy with respect to the state values ``v``.

    Args:
        v: Mapping from state to its value estimate.
        graph: Optional mapping ``state -> [(next_state, reward), ...]``.
            Defaults to the notebook-level maze ``G``.
        discount: Optional discount factor. Defaults to the notebook-level
            ``gamma`` as it is bound at call time.

    Returns:
        Dict mapping each state to the successor state of its best move,
        where a move scores ``reward + discount * v[next_state]``. Ties go to
        the move listed first; states without moves are left out.
    """
    if graph is None:
        graph = G
    if discount is None:
        discount = gamma

    pi = {}
    for s, moves in graph.items():
        if not moves:
            continue
        # max() returns the first maximal element, so ties resolve to the
        # earliest listed move.
        best_next, _ = max(moves, key=lambda move: move[1] + discount * v[move[0]])
        pi[s] = best_next
    return pi

b) What is the optimal policy when $\gamma = 0.8$?

In [4]:
# Part (b): greedy policy for the state values `v` computed in the earlier cell.
pi = compute_optimal_policy(v=v)
print(pi)

{1: 2, 2: 3, 3: 5, 4: 6, 5: 4, 6: 6, 7: 6}


c) Does the optimal policy change if $ \gamma $ is set to 0.5 instead? If yes, give the new
policy. If not, explain.

In [5]:
# Part (c): repeat the analysis with a discount factor of 0.5.
# This rebinds the notebook-level gamma, so every later cell that reads it
# sees 0.5 until it is reassigned.
gamma = 0.5

v = compute_state_value_function(iter_cnt=20)
print(v)

pi = compute_optimal_policy(v=v)
print(pi)

{1: 25.0, 2: 50.0, 3: 75.0, 4: 100.0, 5: 50.0, 6: 0, 7: 100.0}
{1: 2, 2: 4, 3: 5, 4: 6, 5: 4, 6: 6, 7: 6}


$Q(s,a) = r(s,a) + \gamma V^{*}(\delta(s,a))$

d) Compute the $Q(s,a)$ values for the following state action pairs: (S2,West),
(S6,Stay), (S3, North). Let $\gamma = 0.8$ and $\alpha = 1$.

In [7]:
# Part (d): restore the discount factor and recompute the state values.
gamma = 0.8
alpha = 1.

v = compute_state_value_function(iter_cnt=20)

# (label, immediate reward, successor state) for each requested pair;
# each value is printed as reward + gamma * v[successor].
queries = [
    ("Q(S2,West)", 0, 3),
    ("Q(S6, Stay)", 0, 6),
    ("Q(S3, North)", 50, 5),
]
for _label, reward, s_next in queries:
    print(reward + gamma * v[s_next])

91.2
0.0
114.0


e) Consider applying the $Q$-learning algorithm to the "treasure-hunting" game.
Let $Q'$ be the estimate of $Q$. Initially all $Q'$ values are set to 0, and $\gamma = 0.8$ and
$\alpha = 1$. Assume that the agent moves from state S1, via states S2, S3, S5, and
S7, to state S6. Show how the $Q'$ values are updated during this episode.
Repeat the same episode twice more and show how the $Q'$ values are revised
during each episode.

In [8]:
# Part (e): one episode as (state, next_state, reward) steps.
episode = [
    (1, 2, 0),
    (2, 3, 0),
    (3, 5, 50),
    (5, 7, 0),
    (7, 6, 100),
]

# The estimate table is keyed by state only (one entry per state of G),
# all starting at 0.
Q = dict.fromkeys(G, 0)

# Replay the same episode three times, printing the table after each pass.
for _ in range(3):
    for s, s_new, r in episode:
        target = r + gamma * Q[s_new]
        Q[s] = (1. - alpha) * Q[s] + alpha * target
    print(Q)

{1: 0.0, 2: 0.0, 3: 50.0, 4: 0, 5: 0.0, 6: 0, 7: 100.0}
{1: 0.0, 2: 40.0, 3: 50.0, 4: 0, 5: 80.0, 6: 0, 7: 100.0}
{1: 32.0, 2: 40.0, 3: 114.0, 4: 0, 5: 80.0, 6: 0, 7: 100.0}


In [8]:
import gym

env = gym.make('CartPole-v0')
try:
    # Run 20 episodes of at most 100 steps each, taking a randomly sampled
    # action at every step and printing the observation before acting.
    for i_episode in range(20):
        # reset() starts each episode, so no separate reset is needed up front.
        observation = env.reset()
        for t in range(100):
            env.render()
            print(observation)
            action = env.action_space.sample()
            observation, reward, done, info = env.step(action)
            if done:
                print("Episode finished after {} timesteps".format(t+1))
                break
finally:
    # Always release the environment (and its render window), even when the
    # loop is interrupted or raises.
    env.close()

[2017-04-04 20:05:42,433] Making new env: CartPole-v0


[ 0.0331188   0.03650003 -0.00068797 -0.04183878]
[ 0.0338488  -0.15861205 -0.00152474  0.25062701]
[ 0.03067656  0.03653164  0.0034878  -0.04253646]
[ 0.0314072   0.23160341  0.00263707 -0.33411692]
[ 0.03603926  0.03644402 -0.00404527 -0.04060356]
[ 0.03676815  0.23162375 -0.00485734 -0.33456006]
[ 0.04140062  0.42681449 -0.01154854 -0.62877077]
[ 0.04993691  0.62209569 -0.02412396 -0.92506824]
[ 0.06237882  0.81753502 -0.04262532 -1.22523369]
[ 0.07872952  1.01317914 -0.06712999 -1.53096135]
[ 0.09899311  1.20904307 -0.09774922 -1.8438169 ]
[ 0.12317397  1.40509764 -0.13462556 -2.16518745]
[ 0.15127592  1.21152446 -0.17792931 -1.91691541]
Episode finished after 13 timesteps
[-0.04338556 -0.02279493 -0.03550886  0.04585677]
[-0.04384146  0.17281773 -0.03459172 -0.25781467]
[-0.0403851  -0.02179373 -0.03974801  0.02375995]
[-0.04082098 -0.21632378 -0.03927281  0.30364172]
[-0.04514745 -0.41086467 -0.03319998  0.5836848 ]
[-0.05336475 -0.60550618 -0.02152628  0.86572717]
[-0.06547487 -

In [9]:
from gym import envs

# Print every environment spec currently held by gym's registry.
registered_specs = envs.registry.all()
print(registered_specs)

dict_values([EnvSpec(Zaxxon-ramNoFrameskip-v0), EnvSpec(Venture-v3), EnvSpec(Solaris-ramDeterministic-v3), EnvSpec(PhoenixDeterministic-v3), EnvSpec(CrazyClimber-ram-v3), EnvSpec(MsPacman-ramNoFrameskip-v0), EnvSpec(Roulette-v0), EnvSpec(WizardOfWorNoFrameskip-v0), EnvSpec(BeamRiderNoFrameskip-v3), EnvSpec(JourneyEscape-v3), EnvSpec(Seaquest-ramDeterministic-v0), EnvSpec(Assault-ramDeterministic-v3), EnvSpec(BipedalWalker-v2), EnvSpec(ChopperCommandDeterministic-v3), EnvSpec(NameThisGame-ram-v3), EnvSpec(FishingDerby-v0), EnvSpec(Centipede-ramNoFrameskip-v3), EnvSpec(Assault-ramNoFrameskip-v0), EnvSpec(Enduro-ram-v3), EnvSpec(Alien-ram-v3), EnvSpec(PitfallDeterministic-v0), EnvSpec(Seaquest-ramNoFrameskip-v3), EnvSpec(Bowling-ramDeterministic-v3), EnvSpec(Boxing-ramDeterministic-v3), EnvSpec(Asteroids-ramNoFrameskip-v3), EnvSpec(SemisuperPendulumRandom-v0), EnvSpec(Pitfall-ram-v3), EnvSpec(NameThisGameNoFrameskip-v3), EnvSpec(JamesbondNoFrameskip-v3), EnvSpec(Centipede-v0), EnvSpec(Rob

In [11]:
from gym import wrappers

env = gym.make('CartPole-v0')
try:
    # Wrap the environment so the run is recorded under the given directory.
    # NOTE(review): re-running this cell against an existing results
    # directory may be rejected by Monitor -- confirm for the installed gym
    # version before relying on Restart & Run All.
    env = wrappers.Monitor(env, '/tmp/cartpole-experiment-1')

    # Run 20 episodes of at most 100 steps each with randomly sampled actions.
    for i_episode in range(20):
        observation = env.reset()
        for t in range(100):
            env.render()
            print(observation)
            action = env.action_space.sample()
            observation, reward, done, info = env.step(action)
            if done:
                print("Episode finished after {} timesteps".format(t+1))
                break
finally:
    # Close whichever environment object is bound (wrapped or not), even
    # when the loop is interrupted or raises.
    env.close()

[2017-04-04 20:06:47,912] Making new env: CartPole-v0
[2017-04-04 20:06:47,916] Creating monitor directory /tmp/cartpole-experiment-1
[2017-04-04 20:06:47,920] Starting new video recorder writing to /tmp/cartpole-experiment-1/openaigym.video.0.25983.video000000.mp4


[-0.00933297 -0.01626376  0.0467672  -0.01452368]
[-0.00965824  0.17815738  0.04647672 -0.29209197]
[-0.0060951   0.3725869   0.04063488 -0.56976209]
[ 0.00135664  0.17691932  0.02923964 -0.26455963]
[ 0.00489503 -0.0186075   0.02394845  0.03720047]
[ 0.00452288  0.17616298  0.02469246 -0.24783131]
[ 0.00804614  0.37092374  0.01973583 -0.53262455]
[ 0.01546461  0.17552986  0.00908334 -0.23378884]
[ 0.01897521  0.37052085  0.00440756 -0.5235928 ]
[ 0.02638563  0.5655805  -0.00606429 -0.81488361]
[ 0.03769724  0.76078496 -0.02236196 -1.10946781]
[ 0.05291294  0.95619351 -0.04455132 -1.40908122]
[ 0.07203681  1.15183888 -0.07273295 -1.71535184]
[ 0.09507358  1.34771596 -0.10703998 -2.02975528]


[2017-04-04 20:06:48,788] Starting new video recorder writing to /tmp/cartpole-experiment-1/openaigym.video.0.25983.video000001.mp4


[ 0.1220279   1.15384991 -0.14763509 -1.77203024]
[ 0.1451049   0.96066985 -0.18307569 -1.52865956]
Episode finished after 16 timesteps
[ 0.02570834 -0.04372991 -0.02990138 -0.02519717]
[ 0.02483375 -0.23841058 -0.03040533  0.25790359]
[ 0.02006553 -0.04286803 -0.02524726 -0.04421244]
[ 0.01920817 -0.23761902 -0.0261315   0.24039904]
[ 0.01445579 -0.43235813 -0.02132352  0.52472618]
[ 0.00580863 -0.23694269 -0.010829    0.22540101]
[ 0.00106978 -0.04166766 -0.00632098 -0.07067802]
[  2.36423551e-04   1.53544344e-01  -7.73454037e-03  -3.65348526e-01]
[ 0.00330731  0.34877535 -0.01504151 -0.66046021]
[ 0.01028282  0.54410336 -0.02825072 -0.95784111]
[ 0.02116488  0.34937243 -0.04740754 -0.67416588]
[ 0.02815233  0.54512008 -0.06089085 -0.98139006]
[ 0.03905473  0.74100288 -0.08051866 -1.29256057]
[ 0.05387479  0.9370507  -0.10636987 -1.60932646]
[ 0.07261581  0.74333427 -0.1385564  -1.35160794]
[ 0.08748249  0.5501975  -0.16558856 -1.10528456]
[ 0.09848644  0.74706323 -0.18769425 -1.4450

[2017-04-04 20:06:51,861] Starting new video recorder writing to /tmp/cartpole-experiment-1/openaigym.video.0.25983.video000008.mp4


[-0.02110659 -0.37544105  0.08361532  0.63647549]
[-0.02861541 -0.57162345  0.09634483  0.95427524]
[-0.04004788 -0.76790024  0.11543033  1.27560662]
[-0.05540588 -0.9642895   0.14094246  1.60209137]
[-0.07469167 -0.77108941  0.17298429  1.3564649 ]
[-0.09011346 -0.57850761  0.20011359  1.1225076 ]
Episode finished after 12 timesteps
[ 0.00648941  0.04587614 -0.00935272  0.04915516]
[ 0.00740693 -0.14911046 -0.00836961  0.33887261]
[ 0.00442472  0.04612958 -0.00159216  0.04356217]
[ 0.00534731 -0.14896951 -0.00072092  0.33574233]
[ 0.00236792  0.0461627   0.00599393  0.04283215]
[ 0.00329118  0.24119819  0.00685057 -0.24795363]
[ 0.00811514  0.43622164  0.0018915  -0.53846788]
[ 0.01683957  0.63131694 -0.00887786 -0.83055422]
[ 0.02946591  0.82655911 -0.02548894 -1.12601599]
[ 0.0459971   1.02200564 -0.04800926 -1.42658349]
[ 0.06643721  1.21768679 -0.07654093 -1.73387632]
[ 0.09079094  1.41359408 -0.11121846 -2.04935794]
[ 0.11906283  1.60966644 -0.15220562 -2.37428022]
[ 0.15125615  

[2017-04-04 20:06:56,549] Finished writing results. You can upload them to the scoreboard via gym.upload('/tmp/cartpole-experiment-1')


[-0.12150053 -1.20432376  0.20711966  2.08241597]
Episode finished after 15 timesteps


In [12]:
# Never commit API keys to a notebook: read the key from the environment.
# A missing variable raises KeyError here instead of uploading with a bad key.
# NOTE(review): the key that was previously hard-coded in this cell should be
# treated as leaked and revoked.
api_key = os.environ["OPENAI_GYM_API_KEY"]
gym.upload('/tmp/cartpole-experiment-1', api_key=api_key)

[2017-04-04 20:07:10,117] [CartPole-v0] Uploading 20 episodes of training data
[2017-04-04 20:07:12,793] [CartPole-v0] Uploading videos of 3 training episodes (8220 bytes)
[2017-04-04 20:07:13,288] [CartPole-v0] Creating evaluation object from /tmp/cartpole-experiment-1 with learning curve and training video
[2017-04-04 20:07:13,695] 
****************************************************
You successfully uploaded your evaluation on CartPole-v0 to
OpenAI Gym! You can find it at:

    https://gym.openai.com/evaluations/eval_7FQjiPtTE6OfOLXr9lWjQ

****************************************************
