In [1]:
import os
import pprint
import sys

import numpy as np

import gym
from gym import wrappers
from gym import spaces
from gym import envs

In [2]:
class DP(object):
    """Policy iteration agent for a finite MDP with a known model.

    The environment must expose the transition model directly:
    ``env.nS`` (number of states, an int), ``env.nA`` (number of actions,
    an int) and ``env.P[s][a]``, an iterable of
    ``(prob, next_state, reward, done)`` tuples.
    """

    def __init__(self, env, discount_factor = 1, theta = 0.000001):
        """Start from the uniform random policy.

        Args:
            env: environment exposing ``nS``, ``nA`` and ``P`` (see class doc).
            discount_factor: weight applied to the value of the next state.
            theta: policy evaluation stops once the largest per-state value
                change in a sweep falls below this threshold.
        """
        # policy[s][a] is the probability of taking action a in state s.
        self.policy = np.ones([env.nS, env.nA]) / env.nA
        self.env = env
        self.discount_factor = discount_factor
        self.theta = theta

    def _action_values(self, s, V):
        """Return the one-step lookahead value of every action in state ``s``."""
        q_values = np.zeros(self.env.nA)
        for a in range(self.env.nA):
            for prob, next_state, reward, done in self.env.P[s][a]:
                # A transition flagged ``done`` ends the episode, so nothing
                # can be collected afterwards: do not bootstrap from
                # V[next_state], whose entry in P may describe a state that
                # is not absorbing.
                future = 0.0 if done else V[next_state]
                q_values[a] += prob * (reward + self.discount_factor * future)
        return q_values

    def policy_eval(self):
        """Evaluate the current policy in place, sweep by sweep.

        Returns:
            numpy array of length ``env.nS`` with the value of each state
            under ``self.policy``.
        """
        V = np.zeros(self.env.nS)
        while True:
            delta = 0
            for s in range(self.env.nS):
                # Expected lookahead value under the policy's action distribution.
                v = np.dot(self.policy[s], self._action_values(s, V))
                delta = max(delta, np.abs(V[s] - v))
                V[s] = v

            if delta < self.theta:
                break

        return V

    def policy_iteration(self):
        """Alternate evaluation and greedy improvement until the policy is stable.

        ``self.policy`` is overwritten with a deterministic (one-hot) policy.
        A progress line is written to stdout after every improvement step.
        """
        one_hot = np.eye(self.env.nA)  # row a is the deterministic choice of action a
        iteration = 0
        while True:
            iteration += 1
            changed = 0
            V = self.policy_eval()
            for s in range(self.env.nS):
                chosen_a = np.argmax(self.policy[s])
                best_a = np.argmax(self._action_values(s, V))
                if chosen_a != best_a:
                    changed += 1
                self.policy[s] = one_hot[best_a]

            print("\rPolicy Improvement iteration {}. Number of instable choice: {}".format(iteration, changed), end="")
            sys.stdout.flush()
            if changed == 0:
                return

    def act(self, ob, reward, done):
        """Return the most probable action for observation ``ob``.

        ``reward`` and ``done`` are accepted for interface compatibility and
        are not used.
        """
        return np.argmax(self.policy[ob])
        

In [3]:
if __name__ == '__main__':
    env = gym.make('Taxi-v1')

    # Directory the monitor writes to. It may already exist (force=True below
    # clears monitor files from a previous run). Built from the user's home
    # directory so the notebook is not tied to one machine; tempfile.mkdtemp()
    # is an alternative if the results need not be kept.
    outdir = os.path.join(os.path.expanduser('~'), 'openaigym', 'project', 'DP', 'results')
    env = wrappers.Monitor(env, outdir, force=True)
    try:
        env.seed(0)
        # Planning needs the raw model (nS, nA, P), so hand the agent the
        # unwrapped environment rather than the Monitor wrapper.
        agent = DP(env.unwrapped, discount_factor=0.9)

        agent.policy_iteration()

        episode_count = 100

        for i in range(episode_count):
            ob = env.reset()
            # Start every episode from a clean reward/done signal instead of
            # carrying over the values from the previous episode's last step.
            reward = 0
            done = False
            while not done:
                action = agent.act(ob, reward, done)
                ob, reward, done, _ = env.step(action)
                # Note there's no env.render() here. The monitor may still
                # call env.render('rgb_array') itself to record video; video
                # is not recorded every episode, see
                # capped_cubic_video_schedule for details.
    finally:
        # Close the env and write monitor result info to disk, even if
        # planning or the rollout raised.
        env.close()

[2017-01-20 16:42:51,490] Making new env: Taxi-v1
[2017-01-20 16:42:51,548] Clearing 12 monitor files from previous run (because force=True was provided)


Policy Improvement iteration 11. Number of instable choice: 0

[2017-01-20 16:43:01,019] Starting new video recorder writing to /Users/jacopo/openaigym/project/DP/results/openaigym.video.0.6625.video000000.json
[2017-01-20 16:43:01,025] Starting new video recorder writing to /Users/jacopo/openaigym/project/DP/results/openaigym.video.0.6625.video000001.json
[2017-01-20 16:43:01,041] Starting new video recorder writing to /Users/jacopo/openaigym/project/DP/results/openaigym.video.0.6625.video000008.json
[2017-01-20 16:43:01,075] Starting new video recorder writing to /Users/jacopo/openaigym/project/DP/results/openaigym.video.0.6625.video000027.json
[2017-01-20 16:43:01,103] Starting new video recorder writing to /Users/jacopo/openaigym/project/DP/results/openaigym.video.0.6625.video000064.json
[2017-01-20 16:43:01,122] Finished writing results. You can upload them to the scoreboard via gym.upload('/Users/jacopo/openaigym/project/DP/results')


In [4]:
# The scoreboard API key is a secret: read it from the environment instead of
# committing it with the notebook. A missing variable raises KeyError here,
# before any upload is attempted.
# NOTE(review): the key that was previously hard-coded in this cell has been
# published with the notebook and should be revoked.
gym.upload(outdir, api_key=os.environ['OPENAI_GYM_API_KEY'])

[2017-01-20 16:43:04,157] [Taxi-v1] Uploading 100 episodes of training data
[2017-01-20 16:43:06,360] [Taxi-v1] Uploading videos of 5 training episodes (1119 bytes)
[2017-01-20 16:43:06,808] [Taxi-v1] Creating evaluation object from /Users/jacopo/openaigym/project/DP/results with learning curve and training video
[2017-01-20 16:43:07,175] 
****************************************************
You successfully uploaded your evaluation on Taxi-v1 to
OpenAI Gym! You can find it at:

    https://gym.openai.com/evaluations/eval_r9HLCyRa2kMrVULI6Ig

****************************************************
