In [1]:
import gym
import numpy as np
from envs.gridworld import GridworldEnv
from envs.windy_gridworld import WindyGridworldEnv

# Policy Evaluation

In [2]:
def evaluate_policy(env, V, pi, gamma, theta):
    while True:
        delta = 0
        for s in range(env.observation_space.n):
            v = V[s]
            bellman_update(env, V, pi, s, gamma)
            delta = max(delta, abs(v - V[s]))
        if delta <= theta:
            break
    return V

def bellman_update(env, V, pi, s, gamma):
    v = 0
    for action, p_pi in enumerate(pi[s]): 
        transitions = env.P[s][action]
        for p, s_, r, _ in transitions:
            v += p_pi * p * (r + gamma*V[s_])
    
    V[s] = v
    
def q_greedify_policy(env, V, pi, s, gamma):
    q_max = -float('inf')
    a_max = 0
    for action, _ in enumerate(pi[s]):
        q = 0
        pi[s][action] = 0
        transitions = env.P[s][action]
        
        for p, s_, r, _ in transitions:
            q += p * (r + gamma*V[s_])
            
        if (q > q_max):
            q_max = q
            a_max = action
            
    pi[s][a_max] = 1.0
    
def improve_policy(env, V, pi, gamma):
    policy_stable = True
    for s in range(env.observation_space.n):
        old = pi[s].copy()
        q_greedify_policy(env, V, pi, s, gamma)
        if not np.array_equal(pi[s], old):
            policy_stable = False
    return pi, policy_stable

def policy_iteration(env, gamma, theta):
    V = np.zeros(env.observation_space.n)
    pi = np.ones((env.observation_space.n, env.action_space.n)) / env.action_space.n
    policy_stable = False
    while not policy_stable:
        V = evaluate_policy(env, V, pi, gamma, theta)
        pi, policy_stable = improve_policy(env, V, pi, gamma)
    return V, pi

def value_iteration(env, gamma, theta):
    V = np.zeros(env.observation_space.n)
    while True:
        delta = 0
        for s in range(env.observation_space.n):
            v = V[s]
            bellman_optimality_update(env, V, s, gamma)
            delta = max(delta, abs(v - V[s]))
        if delta <= theta:
            break
    pi = np.ones((env.observation_space.n, env.action_space.n)) / env.action_space.n
    for s in range(env.observation_space.n):
        q_greedify_policy(env, V, pi, s, gamma)
    return V, pi

def bellman_optimality_update(env, V, s, gamma):
    v_max = -float('inf')
    for action in range(env.action_space.n):
        v = 0
        transitions = env.P[s][action]
        
        for p, s_, r, _ in transitions:
            v += p * (r + gamma*V[s_])
        
        if (v > v_max):
            v_max = v
    
    V[s] = v_max

def run_policy(env, pi, episodes):
    rewards = 0
    for episode in range(episodes):
        state = env.reset()
        done = False
        episode_rewards = 0
        while not done:
            state, reward, done, _ = env.step(np.argmax(pi[state]))
            episode_rewards += reward

        rewards += episode_rewards
    return (rewards / episodes)

# Frozen Lake

In [3]:
env_fl = gym.make('FrozenLake-v0', map_name="4x4", is_slippery=False).env
env_fl_slippery = gym.make('FrozenLake-v0', map_name="4x4", is_slippery=True).env

gamma_fl = 0.999
theta_fl = 0
shape_fl = (4,4)

In [4]:
V_fl = np.zeros(env_fl.observation_space.n)
V_fl_slippery = np.zeros(env_fl_slippery.observation_space.n)

pi_fl = np.ones((env_fl.observation_space.n, env_fl.action_space.n)) / env_fl.action_space.n
pi_fl_slippery = np.ones((env_fl_slippery.observation_space.n, env_fl_slippery.action_space.n)) / env_fl_slippery.action_space.n

V_fl, pi_fl = policy_iteration(env_fl, gamma_fl, theta_fl)
V_fl_slippery, pi_fl_slippery = policy_iteration(env_fl_slippery, gamma_fl, theta_fl)

In [5]:
print(V_fl.reshape(shape_fl))
print(np.argmax(pi_fl, axis=1).reshape(shape_fl))

[[0.99500999 0.996006   0.997003   0.996006  ]
 [0.996006   0.         0.998001   0.        ]
 [0.997003   0.998001   0.999      0.        ]
 [0.         0.999      1.         0.        ]]
[[1 2 1 0]
 [1 0 1 0]
 [2 1 1 0]
 [0 2 2 0]]


In [6]:
print(V_fl_slippery.reshape(shape_fl))
print(np.argmax(pi_fl_slippery, axis=1).reshape(shape_fl))

[[0.78553326 0.77855409 0.77391292 0.77159582]
 [0.78789222 0.         0.50573092 0.        ]
 [0.79261722 0.79972245 0.74479855 0.        ]
 [0.         0.86415315 0.93117891 0.        ]]
[[0 3 3 3]
 [0 0 0 0]
 [3 1 0 0]
 [0 2 1 0]]


In [7]:
episodes = 10000
print("env (pi) = {}".format(run_policy(env_fl, pi_fl, episodes)))
print("env_slippery (pi_slippery) = {}".format(run_policy(env_fl_slippery, pi_fl_slippery, episodes)))
print("env_slippery (pi) = {}".format(run_policy(env_fl_slippery, pi_fl, episodes)))

env (pi) = 1.0
env_slippery (pi_slippery) = 0.8241
env_slippery (pi) = 0.045


In [8]:
V_fl_slippery = np.zeros(env_fl_slippery.observation_space.n)
pi_fl_slippery = np.ones((env_fl_slippery.observation_space.n, env_fl_slippery.action_space.n)) / env_fl_slippery.action_space.n

V_fl_slippery, pi_fl_slippery = value_iteration(env_fl_slippery, gamma_fl, theta_fl)

print(V_fl_slippery.reshape(shape_fl))
print(np.argmax(pi_fl_slippery, axis=1).reshape(shape_fl))

[[0.78553326 0.77855409 0.77391292 0.77159582]
 [0.78789222 0.         0.50573092 0.        ]
 [0.79261722 0.79972245 0.74479855 0.        ]
 [0.         0.86415315 0.93117891 0.        ]]
[[0 3 3 3]
 [0 0 0 0]
 [3 1 0 0]
 [0 2 1 0]]


# Grid World

In [9]:
env_grid = GridworldEnv()

gamma_grid = 1
theta_grid = 0
shape_grid=(4, 4)

V_grid = np.zeros(env_grid.observation_space.n)
pi_grid = np.ones((env_grid.observation_space.n, env_grid.action_space.n)) / env_grid.action_space.n

In [10]:
V_grid=evaluate_policy(env_grid, V_grid, pi_grid, gamma_grid, theta_grid)
pi_grid, _ = improve_policy(env_grid, V_grid, pi_grid, gamma_grid)
print(V_grid.reshape(shape_grid))
print(np.argmax(pi_grid, axis=1).reshape(shape_grid))

[[  0. -14. -20. -22.]
 [-14. -18. -20. -20.]
 [-20. -20. -18. -14.]
 [-22. -20. -14.   0.]]
[[0 3 3 3]
 [0 0 2 2]
 [0 1 2 2]
 [0 1 1 0]]


# Cliff Walk

In [30]:
env_cliff = gym.make('CliffWalking-v0')

gamma_cliff = 0.9
theta_cliff = 0.01
shape_cliff=(4, 12)

V_cliff = np.zeros(env_cliff.observation_space.n)
pi_cliff = np.ones((env_cliff.observation_space.n, env_cliff.action_space.n)) / env_cliff.action_space.n

In [34]:
print(pi_cliff)


V_cliff = evaluate_policy(env_cliff, V_cliff, pi_cliff, gamma_cliff, theta_cliff)
pi_cliff, policy_stable = improve_policy(env_cliff, V_cliff, pi_cliff, gamma_cliff)
print(V_cliff.reshape(shape_cliff))
print(np.argmax(pi_cliff, axis=1).reshape(shape_cliff))
print(policy_stable)

[[0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]]
[[-10.02436955 -10.0219326  -10.01973934 -10.0177654  -10.01598886
  -10.01438998 -10.01295098 -10.01165588 -10.03715712 -10.03344141
  -10.03344141 -10.03344141]
 [-10.0219326  -10.01973934 -10.0177654  -10.01598886 -10.01438998
  -10.01295098 -10.01165588 -10.01049029 -10.0334

In [35]:
V_cliff, pi_cliff = policy_iteration(env_cliff, gamma_cliff, theta_cliff)
print(V_cliff.reshape(shape_cliff))
print(np.argmax(pi_cliff, axis=1).reshape(shape_cliff))

[[-10. -10. -10. -10. -10. -10. -10. -10. -10. -10. -10. -10.]
 [-10. -10. -10. -10. -10. -10. -10. -10. -10. -10. -10. -10.]
 [-10. -10. -10. -10. -10. -10. -10. -10. -10. -10. -10. -10.]
 [-10. -10. -10. -10. -10. -10. -10. -10. -10. -10. -10. -10.]]
[[0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]]


# Windy Grid World

In [18]:
env_windy = WindyGridworldEnv()

gamma_windy = 0.9
theta_windy = 0
shape_windy=(7, 10)

V_windy = np.zeros(env_windy.observation_space.n)
pi_windy = np.ones((env_windy.observation_space.n, env_windy.action_space.n)) / env_windy.action_space.n

In [19]:
V_windy = evaluate_policy(env_windy, V_windy, pi_windy, gamma_windy, theta_windy)
pi_windy, policy_stable = improve_policy(env_windy, V_windy, pi_windy, gamma_windy)
print(V_windy.reshape(shape_windy))
print(np.argmax(pi_windy, axis=1).reshape(shape_windy))

[[-10. -10. -10. -10. -10. -10. -10. -10. -10. -10.]
 [-10. -10. -10. -10. -10. -10. -10. -10. -10. -10.]
 [-10. -10. -10. -10. -10. -10. -10. -10. -10. -10.]
 [-10. -10. -10. -10. -10. -10. -10. -10. -10. -10.]
 [-10. -10. -10. -10. -10. -10. -10. -10. -10. -10.]
 [-10. -10. -10. -10. -10. -10. -10. -10. -10. -10.]
 [-10. -10. -10. -10. -10. -10. -10. -10. -10. -10.]]
[[0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]]


In [20]:
V_windy, pi_windy = policy_iteration(env_windy, gamma_windy, theta_windy)
print(V_windy.reshape(shape_windy))
print(np.argmax(pi_windy, axis=1).reshape(shape_windy))

[[-10. -10. -10. -10. -10. -10. -10. -10. -10. -10.]
 [-10. -10. -10. -10. -10. -10. -10. -10. -10. -10.]
 [-10. -10. -10. -10. -10. -10. -10. -10. -10. -10.]
 [-10. -10. -10. -10. -10. -10. -10. -10. -10. -10.]
 [-10. -10. -10. -10. -10. -10. -10. -10. -10. -10.]
 [-10. -10. -10. -10. -10. -10. -10. -10. -10. -10.]
 [-10. -10. -10. -10. -10. -10. -10. -10. -10. -10.]]
[[0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]]
