In [71]:
import random
import numpy as np
import gym
from pprint import pprint

import sys
if "../" not in sys.path:
    sys.path.append("../") 
from envs.classic_gridworld import *

# Policy Evaluation

In [5]:
def policy_evaluation(policy, env, discount_factor=1.0, theta=1e-5):
    # Start with a random (all 0) value function
    V = np.zeros(env.nS)
    while True:
        delta = 0
        for s in range(env.nS-1):
            v = 0
            # Look at the possible next actions
            for a, action_prob in enumerate(policy[s]):
                # For each action, look at the possible next states...
                for  prob, next_state, reward, done in env.P[s][a]:
                    v += action_prob * prob * (reward + discount_factor * V[next_state])               
            # How much our value function changed (across any states)
            delta = max(delta, np.abs(v - V[s]))
            V[s] = v
        # Stop evaluating once our value function change is below a threshold
        if delta < theta:
            break
    return np.array(V)

### Example: Norvig and Russel Chapters 17 and 21

In [51]:
env = ClassicGridEnv3x4(nrew=-0.04)
policy = np.array([
    [0.0, 0.0, 1.0, 0.0],
    [0.0, 0.0, 1.0, 0.0],
    [0.0, 0.0, 1.0, 0.0],
    [0.0, 0.0, 1.0, 0.0],

    [0.0, 0.0, 0.0, 1.0],
    [0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 1.0],
    [0.0, 0.0, 0.0, 1.0],

    [0.0, 0.0, 0.0, 1.0],
    [1.0, 0.0, 0.0, 0.0],
    [1.0, 0.0, 0.0, 0.0],
    [1.0, 0.0, 0.0, 0.0],
])

print(policy_evaluation(policy, env)[:-1].reshape((3,4)))

[[ 0.81155813  0.86780822  0.91780822  1.        ]
 [ 0.76155803  0.          0.66027397 -1.        ]
 [ 0.70530727  0.65530642  0.61141351  0.38792265]]


### Example: CS229 Lecture 16 (32:00)

In [37]:
env = ClassicGridEnv3x4(nrew=-0.02)
policy = np.array([
    [0.0, 0.0, 1.0, 0.0],
    [0.0, 0.0, 1.0, 0.0],
    [0.0, 0.0, 1.0, 0.0],
    [0.0, 0.0, 1.0, 0.0],

    [0.0, 1.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 1.0, 0.0],
    [0.0, 0.0, 0.0, 1.0],

    [0.0, 0.0, 1.0, 0.0],
    [0.0, 0.0, 1.0, 0.0],
    [0.0, 0.0, 0.0, 1.0],
    [0.0, 0.0, 0.0, 1.0],
])

print(policy_evaluation(policy, env, 0.99)[:-1].reshape((3,4))) 
V = policy_evaluation(policy, env, 0.99)

[[ 0.52265345  0.73215214  0.76664901  1.        ]
 [-0.89853063  0.         -0.82069941 -1.        ]
 [-0.88462558 -0.86880462 -0.85452187 -0.99511395]]


# Policy Improvement

### Example: CS229 Lecture 16 (32:00)

In [69]:
# Create environment
env = ClassicGridEnv3x4(nrew=-0.02)

# set policy
policy = np.array([
    [0.0, 0.0, 1.0, 0.0],
    [0.0, 0.0, 1.0, 0.0],
    [0.0, 0.0, 1.0, 0.0],
    [0.0, 0.0, 1.0, 0.0],

    [0.0, 1.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 1.0, 0.0],
    [0.0, 0.0, 0.0, 1.0],

    [0.0, 0.0, 1.0, 0.0],
    [0.0, 0.0, 1.0, 0.0],
    [0.0, 0.0, 0.0, 1.0],
    [0.0, 0.0, 0.0, 1.0],
])

# Evaluate policy
discount_factor = 0.99
V = policy_evaluation(policy, env, discount_factor)
print('Original policy \n', policy, '\n')
# print(policy, '\n')
print('State Values \n', V[:-1].reshape((3,4)), '\n')
# Improve policy
new_policy = np.zeros((env.nS, env.nA))
for state in env.P:
    action_returns = np.zeros(len(env.P[state]))
    for i, action in enumerate(env.P[state]):
        a = 0.0
        for prob, next_state, reward, done in env.P[state][action]:
            a += prob * (reward + discount_factor * V[next_state])
        action_returns[i] = a
    new_action = int(np.argmax(action_returns))
    new_policy[state][new_action] = 1.0

# Evaluate new policy
new_V = policy_evaluation(new_policy, env, discount_factor)

print('New policy')
print(new_policy, '\n')
print('State Values')
print(new_V[:-1].reshape((3,4))) 
print()

optimal_policy = np.array([
    [0.0, 0.0, 1.0, 0.0],
    [0.0, 0.0, 1.0, 0.0],
    [0.0, 0.0, 1.0, 0.0],
    [0.0, 0.0, 1.0, 0.0],

    [0.0, 0.0, 0.0, 1.0],
    [0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 1.0],
    [0.0, 0.0, 0.0, 1.0],

    [0.0, 0.0, 0.0, 1.0],
    [1.0, 0.0, 0.0, 0.0],
    [1.0, 0.0, 0.0, 0.0],
    [1.0, 0.0, 0.0, 0.0],
])
print('Optimal policy')
print(optimal_policy)
optimal_V = policy_evaluation(policy, env)
print()
print('State Values')
print(optimal_V[:-1].reshape((3,4)))

Original policy 
 [[0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]] 

State Values 
 [[ 0.52265345  0.73215214  0.76664901  1.        ]
 [-0.89853063  0.         -0.82069941 -1.        ]
 [-0.88462558 -0.86880462 -0.85452187 -0.99511395]] 

New policy
[[0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]] 

State Values
[[ 0.85530046  0.89580324  0.93236641  1.        ]
 [ 0.8196974   0.          0.68749634 -1.        ]
 [ 0.58789645  0.59159775  0.6243202   0.41671652]]

Optimal policy
[[0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]
 [0. 0. 0. 0.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]]

State Values
[[ 0.53806204  0.7494399   0.7744399   1.

# Policy Iteration

In [100]:
def policy_improvement(env, V, discount_factor):
    new_policy = np.zeros((env.nS, env.nA))
    for state in env.P:
        action_returns = np.zeros(len(env.P[state]))
        for i, action in enumerate(env.P[state]):
            a = 0.0
            for prob, next_state, reward, done in env.P[state][action]:
                a += prob * (reward + discount_factor * V[next_state])
            action_returns[i] = a
        new_action = int(np.argmax(action_returns))
        new_policy[state][new_action] = 1.0   
    return new_policy

def policy_evaluation(policy, env, discount_factor=1.0, theta=1e-5):
    # Start with a random (all 0) value function
    V = np.zeros(env.nS)
    while True:
        delta = 0
        for s in range(env.nS-1):
            v = 0
            # Look at the possible next actions
            for a, action_prob in enumerate(policy[s]):
                # For each action, look at the possible next states...
                for  prob, next_state, reward, done in env.P[s][a]:
                    v += action_prob * prob * (reward + discount_factor * V[next_state])               
            # How much our value function changed (across any states)
            delta = max(delta, np.abs(v - V[s]))
            V[s] = v
        # Stop evaluating once our value function change is below a threshold
        if delta < theta:
            break
    return np.array(V)

def init_random_policy(evn):
    policy = np.zeros((env.nS, env.nA))
    for state in range(env.nS):
        policy[state][random.randrange(env.nA)] = 1.0
    return policy

def policy_iteration(env, discount_factor=1.0, theta=1e-5):
    # Initiate random policy
    policy = init_random_policy(env)
    while True:
        # Evaluate policy
        V = policy_evaluation(policy, env, discount_factor, theta)
        
        # Improve policy
        new_policy = policy_improvement(env, V, discount_factor)
        
        # Differences between policies
        num_differences = np.sum(np.abs(policy - new_policy))
        if num_differences == 0:
            policy = new_policy
            break
        policy = new_policy
    return policy

In [110]:
opt_policy = policy_iteration(env, 0.99)[:-1]
print(np.sum(np.abs(opt_policy[[0,1,2,4,6,8,9,10,11]] - optimal_policy[[0,1,2,4,6,8,9,10,11]])))

0.0


# Value Iteration

In [None]:
def value_iteration(env):
    pass

# Sarsa

In [None]:
def sarsa(env):
    pass

# Q-learning

In [None]:
def q_learning(env):
    pass