In [1]:
import numpy as np
import pprint
import sys
if "../" not in sys.path:
  sys.path.append("../") 
from lib.envs.gridworld import GridworldEnv
import gym
from gym import wrappers
from gym.envs.toy_text.frozen_lake import FrozenLakeEnv

In [2]:
pp = pprint.PrettyPrinter(indent=2)
env = GridworldEnv()

In [3]:
def value_iteration(env, theta=0.0001, discount_factor=1.0):
    """
    Value Iteration Algorithm.
    
    Args:
        env: OpenAI environment. env.P represents the transition probabilities of the environment.
        theta: Stopping threshold. If the value of all states changes less than theta
            in one iteration we are done.
        discount_factor: lambda time discount factor.
        
    Returns:
        A tuple (policy, V) of the optimal policy and the optimal value function.        
    """
    
    # Value iteration
    V = np.zeros(env.nS)
    while True:
        delta = 0
        for s in range(env.nS):
            action_values = np.zeros(env.nA)
            for a in range(env.nA):
                for prob, successor, reward, done in env.P[s][a]:
                    action_values[a] += prob*(reward + discount_factor*V[successor])
            best_av = np.max(action_values)
            delta = max(delta, np.abs(best_av-V[s]))
            V[s] = best_av
        if delta<theta:
            break
    
    # Greedy policy
    policy = np.zeros([env.nS, env.nA])
    for s in range(env.nS):
        action_values = np.zeros(env.nA)
        for a in range(env.nA):
            for prob, successor, reward, done in env.P[s][a]:
                action_values[a] += prob*(reward + discount_factor*V[successor])
        best_action = np.argmax(action_values)
        policy[s,best_action] = 1
    
    return policy, V

In [4]:
policy, v = value_iteration(env)

print("Policy Probability Distribution:")
print(policy)
print("")

print("Reshaped Grid Policy (0=up, 1=right, 2=down, 3=left):")
print(np.reshape(np.argmax(policy, axis=1), env.shape))
print("")

print("Value Function:")
print(v)
print("")

print("Reshaped Grid Value Function:")
print(v.reshape(env.shape))
print("")

Policy Probability Distribution:
[[ 1.  0.  0.  0.]
 [ 0.  0.  0.  1.]
 [ 0.  0.  0.  1.]
 [ 0.  0.  1.  0.]
 [ 1.  0.  0.  0.]
 [ 1.  0.  0.  0.]
 [ 1.  0.  0.  0.]
 [ 0.  0.  1.  0.]
 [ 1.  0.  0.  0.]
 [ 1.  0.  0.  0.]
 [ 0.  1.  0.  0.]
 [ 0.  0.  1.  0.]
 [ 1.  0.  0.  0.]
 [ 0.  1.  0.  0.]
 [ 0.  1.  0.  0.]
 [ 1.  0.  0.  0.]]

Reshaped Grid Policy (0=up, 1=right, 2=down, 3=left):
[[0 3 3 2]
 [0 0 0 2]
 [0 0 1 2]
 [0 1 1 0]]

Value Function:
[ 0. -1. -2. -3. -1. -2. -3. -2. -2. -3. -2. -1. -3. -2. -1.  0.]

Reshaped Grid Value Function:
[[ 0. -1. -2. -3.]
 [-1. -2. -3. -2.]
 [-2. -3. -2. -1.]
 [-3. -2. -1.  0.]]



In [5]:
# Test the value function
expected_v = np.array([ 0, -1, -2, -3, -1, -2, -3, -2, -2, -3, -2, -1, -3, -2, -1,  0])
np.testing.assert_array_almost_equal(v, expected_v, decimal=2)

Here, we train a new policy on the FrozenLake environment via value iteration.  FrozenLake is similar to Gridworld, except that certain cells in the grid are designated "holes". Falling into one of these holes automatically terminates the episode with reward 0.  Additionally, the  lake is slippery, so that actions are not completely deterministic. This means that taking an action of Right may actually move the agent in a direction other than Right with some slight probability.

In [6]:
env = FrozenLakeEnv()
policy, v = value_iteration(env)

max_time_steps = 100000
n_episode = 1

env = gym.make('FrozenLake-v0')
env = wrappers.Monitor(env, './frozen-lake-experiment', video_callable=lambda x: True, force=True)

average_reward = 0
for i_episode in range(n_episode):

    observation = env.reset() #reset environment to beginning 
    episode_reward = 0
    #run for several time-steps
    for t in range(max_time_steps): 
        #display experiment
        #env.render() 

        #sample a random action 
        action = np.argmax(policy[observation])

        #observe next step and get reward 
        observation, reward, done, info = env.step(action)
        episode_reward += reward
        
        if done:
            #env.render() 
            print("Simulation finished after {0} timesteps".format(t))
            print("Episode reward was {0}.".format(episode_reward))
            break
    average_reward += episode_reward

average_reward /= n_episode
print("Average episode reward was {0}.".format(average_reward))
env.render(close=True)

[2017-05-02 12:07:39,997] Making new env: FrozenLake-v0
[2017-05-02 12:07:40,006] Clearing 22 monitor files from previous run (because force=True was provided)
[2017-05-02 12:07:40,010] Starting new video recorder writing to /Users/jasonmancuso/reinforcement-learning/DP/frozen-lake-experiment/openaigym.video.0.15582.video000000.json
[2017-05-02 12:07:40,016] Starting new video recorder writing to /Users/jasonmancuso/reinforcement-learning/DP/frozen-lake-experiment/openaigym.video.0.15582.video000001.json
[2017-05-02 12:07:40,021] Starting new video recorder writing to /Users/jasonmancuso/reinforcement-learning/DP/frozen-lake-experiment/openaigym.video.0.15582.video000002.json
[2017-05-02 12:07:40,030] Starting new video recorder writing to /Users/jasonmancuso/reinforcement-learning/DP/frozen-lake-experiment/openaigym.video.0.15582.video000003.json
[2017-05-02 12:07:40,034] Starting new video recorder writing to /Users/jasonmancuso/reinforcement-learning/DP/frozen-lake-experiment/openai

Simulation finished after 70 timesteps
Episode reward was 1.0.
Simulation finished after 21 timesteps
Episode reward was 1.0.
Simulation finished after 99 timesteps
Episode reward was 0.0.
Simulation finished after 29 timesteps
Episode reward was 1.0.
Simulation finished after 18 timesteps
Episode reward was 1.0.
Simulation finished after 15 timesteps
Episode reward was 0.0.
Simulation finished after 95 timesteps
Episode reward was 1.0.
Simulation finished after 99 timesteps
Episode reward was 0.0.
Simulation finished after 99 timesteps
Episode reward was 0.0.
Simulation finished after 43 timesteps
Episode reward was 1.0.
Average episode reward was 0.6.
