In [1]:
#http://sawcordwell.github.io/mdp/conservation/2015/01/10/possingham1997-1/
import numpy as np
import pandas as pd
import random
from hiive.visualization import mdpviz
from time import time
import itertools
import gym
from gym.envs.toy_text.frozen_lake import generate_random_map
from gym.envs.registration import register
from gym import wrappers
from hiive.mdptoolbox import mdp
from collections import defaultdict
import sys
from collections import namedtuple
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
class FireManagementSpec:

    def __init__(self, population_classes=30, fire_classes= 30, seed=1234, verbose=True):
        self.seed = seed
        self.verbose = verbose
        self.population_classes = population_classes
        self.fire_classes = fire_classes
        self.states = {}

        self.spec = mdpviz.MDPSpec()

        self._action_do_nothing = self.spec.action('do_nothing')
        self._action_burn = self.spec.action('burn')

        self._probabilities = {}
        self.name = f'fire_management_{population_classes}_{fire_classes}_{seed}'
        self.n_actions = 2
        self.n_states = self.fire_classes * self.population_classes

        self.reset()

    def reset(self):
        np.random.seed(self.seed)
        self._setup_mdp()

    def _reset_state_probabilities(self):
        self._probabilities = {}

    def _get_probability_for_state(self, pc, fc):
        state_name = self._get_state_name(pc, fc)
        if state_name not in self._probabilities:
            return None
        return self._probabilities[state_name]

    def _set_probability_for_state(self, pc, fc, p):
        state_name = self._get_state_name(pc, fc)
        if state_name not in self._probabilities:
            self._probabilities[state_name] = 0.
        self._probabilities[state_name] += p
        return self._probabilities[state_name]

    @staticmethod
    def _is_terminal(s):
        return False  # s == 0

    @staticmethod
    def get_habitat_suitability(years):
        if years < 0:
            msg = "Invalid years '%s', it should be positive." % str(years)
            raise ValueError(msg)
        if years <= 5:
            return 0.2 * years
        elif 5 <= years <= 10:
            return -0.1 * years + 1.5
        else:
            return 0.5

    @staticmethod
    def _get_state_name(pc, fc):
        return f'pc:{pc}, fc:{fc}'

    def _get_state(self, pc, fc):
        state_name = self._get_state_name(pc, fc)
        is_terminal = self._is_terminal(pc)
        if state_name not in self.states:
            state = self.spec.state(name=state_name, terminal_state=is_terminal)
            self.states[state_name] = state
        # print(f'{state_name} : {is_terminal}')
        state = self.states[state_name]
        return state

    def _add_state_transition_and_reward(self, pc, fc, action):
        cs = self._get_state(pc, fc)
        results = self._get_reward_and_new_state_values(pc, fc, action)
        for reward, npc, nfc, tp in results:
            ns = self._get_state(npc, nfc)
            ns = mdpviz.NextState(state=ns, weight=tp)
            self.spec.transition(state=cs, action=action, outcome=ns)
            self.spec.transition(state=cs, action=action, outcome=mdpviz.Reward(reward))
            if self.verbose:
                print(f'[state:action]: [{(pc, fc)} : {action.name}] -> new state: {(npc, nfc)}, '
                      f'p(t): {tp}, reward: {reward} ')

    def transition_fire_class(self, fc, action):
        if action == self._action_do_nothing:
            return (fc + 1) if fc < self.fire_classes - 1 else fc
        elif action == self._action_burn:
            return 0
        return fc

    def _get_reward_and_new_state_values(self, pc, fc, action, default_p=0.5):
        pop_change_down = -1
        pop_change_same = 0

        self._probabilities = {}
        transition_probability_up = None
#        if pc == 1 and fc == 0 and action == self._action_burn:
#            print()

        r = self.get_habitat_suitability(fc)
        fc = self.transition_fire_class(fc, action)
        if pc == 0:
            # dead
            return [[0.0, 0, fc, 1.0]]  # stays in same state
        if pc == self.population_classes - 1:
            pop_change_up = 0
            if action == self._action_burn:
                pop_change_same -= 1
                pop_change_down -= 1

            tsd = self._set_probability_for_state(pc=pc + pop_change_down,
                                                  fc=fc,
                                                  p=(1.0 - default_p) * (1.0 - r))
            tss = self._set_probability_for_state(pc=pc + pop_change_same,
                                                  fc=fc,
                                                  p=1.0 - tsd)
        else:
            # Population abundance class can stay the same, transition up, or
            # transition down.
            pop_change_same = 0
            pop_change_up = 1
            pop_change_down = -1

            # If action 1 is taken, then the patch is burned so the population
            # abundance moves down a class.
            if action == self._action_burn:
                pop_change_same -= 1
                pop_change_up -= 1
                pop_change_down -= (1 if pop_change_down > 0 else 0)

            tss = self._set_probability_for_state(pc=pc + pop_change_same,
                                                  fc=fc,
                                                  p=default_p)

            tsu = self._set_probability_for_state(pc=pc + pop_change_up,
                                                  fc=fc,
                                                  p=(1 - default_p)*r)
            # In the case when transition_down = 0 before the effect of an action
            # is applied, then the final state is going to be the same as that for
            # transition_same, so we need to add the probabilities together.
            tsd = self._set_probability_for_state(pc=pc + pop_change_down,
                                                  fc=fc,
                                                  p=(1 - default_p)*(1 - r))

        # build results
        results = []

        npc_up = pc + pop_change_up
        npc_down = pc + pop_change_down
        npc_same = pc + pop_change_same

        transition_probabilities = {
            (npc_up, self._get_probability_for_state(npc_up, fc)),
            (npc_down, self._get_probability_for_state(npc_down, fc)),
            (npc_same, self._get_probability_for_state(npc_same, fc))
        }

        for npc, probability in transition_probabilities:
            if probability is not None and probability > 0.0:
                reward = int(npc > 0)
                results.append((reward, npc, fc, probability))

        return results

    # noinspection PyStatementEffect
    def _setup_mdp(self):
        # build transitions
        for pc in range(0, self.population_classes):
            if self._is_terminal(pc):
                continue
            for fc in range(0, self.fire_classes):
                # actions
                self._add_state_transition_and_reward(pc=pc, fc=fc, action=self._action_do_nothing)
                self._add_state_transition_and_reward(pc=pc, fc=fc, action=self._action_burn)
                if self.verbose:
                    print()

    def get_transition_and_reward_arrays(self, p_default=0.5):
        return self.spec.get_transition_and_reward_arrays(p_default)

    def to_graph(self):
        return self.spec.to_graph()

    def to_env(self):
        return self.spec.to_discrete_env()
    
    def print_policy(self,policy):

        p = np.array(policy).reshape(self.population_classes, self.fire_classes)
        print("    " + " ".join("%2d" % f for f in range(self.fire_classes)))
        print("    " + "---" * self.fire_classes)
        for x in range(self.population_classes):
            print(" %2d|" % x + " ".join("%2d" % p[x, f] for f in
                                     range(self.fire_classes)))

In [3]:
fm_spec = FireManagementSpec()
envFM = fm_spec.to_env()

[state:action]: [(0, 0) : do_nothing] -> new state: (0, 1), p(t): 1.0, reward: 0.0 
[state:action]: [(0, 0) : burn] -> new state: (0, 0), p(t): 1.0, reward: 0.0 

[state:action]: [(0, 1) : do_nothing] -> new state: (0, 2), p(t): 1.0, reward: 0.0 
[state:action]: [(0, 1) : burn] -> new state: (0, 0), p(t): 1.0, reward: 0.0 

[state:action]: [(0, 2) : do_nothing] -> new state: (0, 3), p(t): 1.0, reward: 0.0 
[state:action]: [(0, 2) : burn] -> new state: (0, 0), p(t): 1.0, reward: 0.0 

[state:action]: [(0, 3) : do_nothing] -> new state: (0, 4), p(t): 1.0, reward: 0.0 
[state:action]: [(0, 3) : burn] -> new state: (0, 0), p(t): 1.0, reward: 0.0 

[state:action]: [(0, 4) : do_nothing] -> new state: (0, 5), p(t): 1.0, reward: 0.0 
[state:action]: [(0, 4) : burn] -> new state: (0, 0), p(t): 1.0, reward: 0.0 

[state:action]: [(0, 5) : do_nothing] -> new state: (0, 6), p(t): 1.0, reward: 0.0 
[state:action]: [(0, 5) : burn] -> new state: (0, 0), p(t): 1.0, reward: 0.0 

[state:action]: [(0, 6


[state:action]: [(12, 22) : do_nothing] -> new state: (12, 23), p(t): 0.5, reward: 1 
[state:action]: [(12, 22) : do_nothing] -> new state: (13, 23), p(t): 0.25, reward: 1 
[state:action]: [(12, 22) : do_nothing] -> new state: (11, 23), p(t): 0.25, reward: 1 
[state:action]: [(12, 22) : burn] -> new state: (12, 0), p(t): 0.25, reward: 1 
[state:action]: [(12, 22) : burn] -> new state: (11, 0), p(t): 0.75, reward: 1 

[state:action]: [(12, 23) : do_nothing] -> new state: (12, 24), p(t): 0.5, reward: 1 
[state:action]: [(12, 23) : do_nothing] -> new state: (13, 24), p(t): 0.25, reward: 1 
[state:action]: [(12, 23) : do_nothing] -> new state: (11, 24), p(t): 0.25, reward: 1 
[state:action]: [(12, 23) : burn] -> new state: (12, 0), p(t): 0.25, reward: 1 
[state:action]: [(12, 23) : burn] -> new state: (11, 0), p(t): 0.75, reward: 1 

[state:action]: [(12, 24) : do_nothing] -> new state: (12, 25), p(t): 0.5, reward: 1 
[state:action]: [(12, 24) : do_nothing] -> new state: (13, 25), p(t): 0

[state:action]: [(21, 29) : do_nothing] -> new state: (22, 29), p(t): 0.25, reward: 1 
[state:action]: [(21, 29) : burn] -> new state: (20, 0), p(t): 0.75, reward: 1 
[state:action]: [(21, 29) : burn] -> new state: (21, 0), p(t): 0.25, reward: 1 

[state:action]: [(22, 0) : do_nothing] -> new state: (21, 1), p(t): 0.5, reward: 1 
[state:action]: [(22, 0) : do_nothing] -> new state: (22, 1), p(t): 0.5, reward: 1 
[state:action]: [(22, 0) : burn] -> new state: (21, 0), p(t): 1.0, reward: 1 

[state:action]: [(22, 1) : do_nothing] -> new state: (23, 2), p(t): 0.1, reward: 1 
[state:action]: [(22, 1) : do_nothing] -> new state: (21, 2), p(t): 0.4, reward: 1 
[state:action]: [(22, 1) : do_nothing] -> new state: (22, 2), p(t): 0.5, reward: 1 
[state:action]: [(22, 1) : burn] -> new state: (21, 0), p(t): 0.9, reward: 1 
[state:action]: [(22, 1) : burn] -> new state: (22, 0), p(t): 0.1, reward: 1 

[state:action]: [(22, 2) : do_nothing] -> new state: (23, 3), p(t): 0.2, reward: 1 
[state:actio

In [4]:
fm_spec.reset()
fm_spec._setup_mdp()

[state:action]: [(0, 0) : do_nothing] -> new state: (0, 1), p(t): 1.0, reward: 0.0 
[state:action]: [(0, 0) : burn] -> new state: (0, 0), p(t): 1.0, reward: 0.0 

[state:action]: [(0, 1) : do_nothing] -> new state: (0, 2), p(t): 1.0, reward: 0.0 
[state:action]: [(0, 1) : burn] -> new state: (0, 0), p(t): 1.0, reward: 0.0 

[state:action]: [(0, 2) : do_nothing] -> new state: (0, 3), p(t): 1.0, reward: 0.0 
[state:action]: [(0, 2) : burn] -> new state: (0, 0), p(t): 1.0, reward: 0.0 

[state:action]: [(0, 3) : do_nothing] -> new state: (0, 4), p(t): 1.0, reward: 0.0 
[state:action]: [(0, 3) : burn] -> new state: (0, 0), p(t): 1.0, reward: 0.0 

[state:action]: [(0, 4) : do_nothing] -> new state: (0, 5), p(t): 1.0, reward: 0.0 
[state:action]: [(0, 4) : burn] -> new state: (0, 0), p(t): 1.0, reward: 0.0 

[state:action]: [(0, 5) : do_nothing] -> new state: (0, 6), p(t): 1.0, reward: 0.0 
[state:action]: [(0, 5) : burn] -> new state: (0, 0), p(t): 1.0, reward: 0.0 

[state:action]: [(0, 6

[state:action]: [(11, 15) : do_nothing] -> new state: (12, 16), p(t): 0.25, reward: 1 
[state:action]: [(11, 15) : do_nothing] -> new state: (10, 16), p(t): 0.25, reward: 1 
[state:action]: [(11, 15) : do_nothing] -> new state: (11, 16), p(t): 0.5, reward: 1 
[state:action]: [(11, 15) : burn] -> new state: (10, 0), p(t): 0.75, reward: 1 
[state:action]: [(11, 15) : burn] -> new state: (11, 0), p(t): 0.25, reward: 1 

[state:action]: [(11, 16) : do_nothing] -> new state: (12, 17), p(t): 0.25, reward: 1 
[state:action]: [(11, 16) : do_nothing] -> new state: (10, 17), p(t): 0.25, reward: 1 
[state:action]: [(11, 16) : do_nothing] -> new state: (11, 17), p(t): 0.5, reward: 1 
[state:action]: [(11, 16) : burn] -> new state: (10, 0), p(t): 0.75, reward: 1 
[state:action]: [(11, 16) : burn] -> new state: (11, 0), p(t): 0.25, reward: 1 

[state:action]: [(11, 17) : do_nothing] -> new state: (12, 18), p(t): 0.25, reward: 1 
[state:action]: [(11, 17) : do_nothing] -> new state: (10, 18), p(t): 0

[state:action]: [(20, 13) : do_nothing] -> new state: (21, 14), p(t): 0.25, reward: 1 
[state:action]: [(20, 13) : do_nothing] -> new state: (19, 14), p(t): 0.25, reward: 1 
[state:action]: [(20, 13) : burn] -> new state: (20, 0), p(t): 0.25, reward: 1 
[state:action]: [(20, 13) : burn] -> new state: (19, 0), p(t): 0.75, reward: 1 

[state:action]: [(20, 14) : do_nothing] -> new state: (20, 15), p(t): 0.5, reward: 1 
[state:action]: [(20, 14) : do_nothing] -> new state: (21, 15), p(t): 0.25, reward: 1 
[state:action]: [(20, 14) : do_nothing] -> new state: (19, 15), p(t): 0.25, reward: 1 
[state:action]: [(20, 14) : burn] -> new state: (20, 0), p(t): 0.25, reward: 1 
[state:action]: [(20, 14) : burn] -> new state: (19, 0), p(t): 0.75, reward: 1 

[state:action]: [(20, 15) : do_nothing] -> new state: (20, 16), p(t): 0.5, reward: 1 
[state:action]: [(20, 15) : do_nothing] -> new state: (21, 16), p(t): 0.25, reward: 1 
[state:action]: [(20, 15) : do_nothing] -> new state: (19, 16), p(t): 0

[state:action]: [(29, 25) : do_nothing] -> new state: (29, 26), p(t): 0.75, reward: 1 
[state:action]: [(29, 25) : burn] -> new state: (28, 0), p(t): 0.75, reward: 1 
[state:action]: [(29, 25) : burn] -> new state: (27, 0), p(t): 0.25, reward: 1 

[state:action]: [(29, 26) : do_nothing] -> new state: (28, 27), p(t): 0.25, reward: 1 
[state:action]: [(29, 26) : do_nothing] -> new state: (29, 27), p(t): 0.75, reward: 1 
[state:action]: [(29, 26) : burn] -> new state: (28, 0), p(t): 0.75, reward: 1 
[state:action]: [(29, 26) : burn] -> new state: (27, 0), p(t): 0.25, reward: 1 

[state:action]: [(29, 27) : do_nothing] -> new state: (28, 28), p(t): 0.25, reward: 1 
[state:action]: [(29, 27) : do_nothing] -> new state: (29, 28), p(t): 0.75, reward: 1 
[state:action]: [(29, 27) : burn] -> new state: (28, 0), p(t): 0.75, reward: 1 
[state:action]: [(29, 27) : burn] -> new state: (27, 0), p(t): 0.25, reward: 1 

[state:action]: [(29, 28) : do_nothing] -> new state: (28, 29), p(t): 0.25, reward

[state:action]: [(9, 20) : do_nothing] -> new state: (9, 21), p(t): 0.5, reward: 1 
[state:action]: [(9, 20) : do_nothing] -> new state: (10, 21), p(t): 0.25, reward: 1 
[state:action]: [(9, 20) : do_nothing] -> new state: (8, 21), p(t): 0.25, reward: 1 
[state:action]: [(9, 20) : burn] -> new state: (9, 0), p(t): 0.25, reward: 1 
[state:action]: [(9, 20) : burn] -> new state: (8, 0), p(t): 0.75, reward: 1 

[state:action]: [(9, 21) : do_nothing] -> new state: (9, 22), p(t): 0.5, reward: 1 
[state:action]: [(9, 21) : do_nothing] -> new state: (10, 22), p(t): 0.25, reward: 1 
[state:action]: [(9, 21) : do_nothing] -> new state: (8, 22), p(t): 0.25, reward: 1 
[state:action]: [(9, 21) : burn] -> new state: (9, 0), p(t): 0.25, reward: 1 
[state:action]: [(9, 21) : burn] -> new state: (8, 0), p(t): 0.75, reward: 1 

[state:action]: [(9, 22) : do_nothing] -> new state: (9, 23), p(t): 0.5, reward: 1 
[state:action]: [(9, 22) : do_nothing] -> new state: (10, 23), p(t): 0.25, reward: 1 
[state

[state:action]: [(21, 19) : do_nothing] -> new state: (20, 20), p(t): 0.25, reward: 1 
[state:action]: [(21, 19) : do_nothing] -> new state: (21, 20), p(t): 0.5, reward: 1 
[state:action]: [(21, 19) : do_nothing] -> new state: (22, 20), p(t): 0.25, reward: 1 
[state:action]: [(21, 19) : burn] -> new state: (20, 0), p(t): 0.75, reward: 1 
[state:action]: [(21, 19) : burn] -> new state: (21, 0), p(t): 0.25, reward: 1 

[state:action]: [(21, 20) : do_nothing] -> new state: (20, 21), p(t): 0.25, reward: 1 
[state:action]: [(21, 20) : do_nothing] -> new state: (21, 21), p(t): 0.5, reward: 1 
[state:action]: [(21, 20) : do_nothing] -> new state: (22, 21), p(t): 0.25, reward: 1 
[state:action]: [(21, 20) : burn] -> new state: (20, 0), p(t): 0.75, reward: 1 
[state:action]: [(21, 20) : burn] -> new state: (21, 0), p(t): 0.25, reward: 1 

[state:action]: [(21, 21) : do_nothing] -> new state: (20, 22), p(t): 0.25, reward: 1 
[state:action]: [(21, 21) : do_nothing] -> new state: (21, 22), p(t): 0

[state:action]: [(29, 4) : do_nothing] -> new state: (28, 5), p(t): 0.09999999999999998, reward: 1 
[state:action]: [(29, 4) : burn] -> new state: (27, 0), p(t): 0.09999999999999998, reward: 1 
[state:action]: [(29, 4) : burn] -> new state: (28, 0), p(t): 0.9, reward: 1 

[state:action]: [(29, 5) : do_nothing] -> new state: (29, 6), p(t): 1.0, reward: 1 
[state:action]: [(29, 5) : burn] -> new state: (28, 0), p(t): 1.0, reward: 1 

[state:action]: [(29, 6) : do_nothing] -> new state: (28, 7), p(t): 0.050000000000000044, reward: 1 
[state:action]: [(29, 6) : do_nothing] -> new state: (29, 7), p(t): 0.95, reward: 1 
[state:action]: [(29, 6) : burn] -> new state: (28, 0), p(t): 0.95, reward: 1 
[state:action]: [(29, 6) : burn] -> new state: (27, 0), p(t): 0.050000000000000044, reward: 1 

[state:action]: [(29, 7) : do_nothing] -> new state: (28, 8), p(t): 0.10000000000000003, reward: 1 
[state:action]: [(29, 7) : do_nothing] -> new state: (29, 8), p(t): 0.8999999999999999, reward: 1 
[sta

In [5]:
# print the state space and action space
print(envFM.observation_space)
print(envFM.action_space)

Discrete(900)
Discrete(2)


In [6]:
P, R = fm_spec.get_transition_and_reward_arrays(p_default=0.5)

In [7]:
print(P,R)

[[[0.   1.   0.   ... 0.   0.   0.  ]
  [0.   0.   1.   ... 0.   0.   0.  ]
  [0.   0.   0.   ... 0.   0.   0.  ]
  ...
  [0.   0.   0.   ... 0.75 0.   0.  ]
  [0.   0.   0.   ... 0.   0.   0.5 ]
  [0.   0.   0.   ... 0.   0.   0.  ]]

 [[1.   0.   0.   ... 0.   0.   0.  ]
  [1.   0.   0.   ... 0.   0.   0.  ]
  [1.   0.   0.   ... 0.   0.   0.  ]
  ...
  [0.   0.   0.   ... 0.   0.   0.  ]
  [0.   0.   0.   ... 0.   0.   0.  ]
  [0.   0.   0.   ... 0.   0.   0.  ]]] [[0.  0. ]
 [0.  0. ]
 [0.  0. ]
 ...
 [0.5 0.5]
 [0.5 0.5]
 [0.5 0.5]]


In [8]:
discount_factor = .90

In [9]:
pi = mdp.PolicyIteration(P, R, discount_factor, policy0=None, max_iter=10**4, eval_type=0)

In [10]:
start = time()
statsPI = pi.run()
stop = time()
totalTime = stop-start
print('Time to train: ', totalTime)

Time to train:  0.05672287940979004


In [11]:
statsPI

[{'State': None,
  'Action': None,
  'Reward': 6.007385915149921,
  'Error': 0.3797686967279379,
  'Time': 0.019702911376953125,
  'V[0]': -4.811959615916733e-18,
  'Max V': 6.007385915149921,
  'Mean V': 5.52874624126733,
  'Iteration': 1},
 {'State': None,
  'Action': None,
  'Reward': 6.016090629230573,
  'Error': 0.1396743231949067,
  'Time': 0.031484127044677734,
  'V[0]': -1.644774851296528e-15,
  'Max V': 6.016090629230573,
  'Mean V': 5.550482410339146,
  'Iteration': 2},
 {'State': None,
  'Action': None,
  'Reward': 6.01622187528806,
  'Error': 2.6645352591003757e-15,
  'Time': 0.043612003326416016,
  'V[0]': -1.875674183446222e-17,
  'Max V': 6.01622187528806,
  'Mean V': 5.562195199968543,
  'Iteration': 3},
 {'State': None,
  'Action': None,
  'Reward': 6.01622187528806,
  'Error': 2.6645352591003757e-15,
  'Time': 0.05644583702087402,
  'V[0]': 2.1930331350620375e-15,
  'Max V': 6.01622187528806,
  'Mean V': 5.562195199968543,
  'Iteration': 4}]

In [12]:
dfPI = pd.DataFrame(statsPI)
dfPI.to_csv('pi_large.csv')

In [13]:
fm_spec.print_policy(pi.policy)

     0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29
    ------------------------------------------------------------------------------------------
  0| 1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1| 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  2| 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  3| 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  4| 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  5| 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  6| 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  7| 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  8| 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  

In [14]:
vi = mdp.ValueIteration(P, R, discount_factor, epsilon=0.01, max_iter=10**4, initial_value=0)

In [15]:
start = time()
statsVI = vi.run()
stop = time()
totalTime = stop-start
print('Time to train: ', totalTime)

Time to train:  0.022237062454223633


In [16]:
statsVI

[{'State': None,
  'Action': None,
  'Reward': 0.6666666666666666,
  'Error': 0.6666666666666666,
  'Time': 0.0011000633239746094,
  'Max V': 0.6666666666666666,
  'Mean V': 0.5690370370370369,
  'Iteration': 1},
 {'State': None,
  'Action': None,
  'Reward': 1.2066666666666666,
  'Error': 0.6000000000000001,
  'Time': 0.001912832260131836,
  'Max V': 1.2066666666666666,
  'Mean V': 1.0827903703703707,
  'Iteration': 2},
 {'State': None,
  'Action': None,
  'Reward': 1.6926666666666668,
  'Error': 0.54,
  'Time': 0.0025548934936523438,
  'Max V': 1.6926666666666668,
  'Mean V': 1.541943970370371,
  'Iteration': 3},
 {'State': None,
  'Action': None,
  'Reward': 2.130066666666667,
  'Error': 0.4860000000000002,
  'Time': 0.003177165985107422,
  'Max V': 2.130066666666667,
  'Mean V': 1.953505416703704,
  'Iteration': 4},
 {'State': None,
  'Action': None,
  'Reward': 2.5237266666666667,
  'Error': 0.43740000000000023,
  'Time': 0.003693819046020508,
  'Max V': 2.5237266666666667,
  'Mea

In [17]:
dfVI = pd.DataFrame(statsVI)
dfVI.to_csv('VI_large.csv')

In [18]:
vi.policy

(0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,


In [19]:
fm_spec.print_policy(vi.policy)

     0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29
    ------------------------------------------------------------------------------------------
  0| 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  1| 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  2| 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  3| 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  4| 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  5| 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  6| 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  7| 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  8| 0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  

In [20]:
# Check converge

expected = pi.policy
all(expected[k] - vi.V[k] < 1e-12 for k in range(len(expected)))

False

In [21]:
#Check sizes
action_space_size = envFM.action_space.n
state_space_size = envFM.observation_space.n

In [22]:
print('Action space: ', action_space_size)
print('State space: ', state_space_size)

Action space:  2
State space:  900


In [23]:
Q = mdp.QLearning(P, R, discount_factor, alpha=0.30, alpha_decay=0.95, alpha_min=0.01,
                 epsilon=.30, epsilon_min=.01, epsilon_decay=0.01,
                 n_iter=10**4, skip_check=False, iter_callback=None)

In [24]:
start = time()
statsQ = Q.run()
stop = time()
totalTime = stop-start
print('Time to train: ', totalTime)

Time to train:  2.436034917831421


In [25]:
fm_spec.print_policy(Q.policy)

     0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29
    ------------------------------------------------------------------------------------------
  0| 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  1| 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  2| 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  3| 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  4| 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  5| 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  6| 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  7| 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  8| 0  0  0  0  0  0  0  0  0  0  0  0  1  0  0  0  0  0  

In [26]:
statsQ

[{'State': 815,
  'Action': 0,
  'Reward': 0.6,
  'Error': 0.18,
  'Time': 0.0006489753723144531,
  'Alpha': 0.3,
  'Epsilon': 0.3,
  'Gamma': 0.9,
  'V[0]': 0.0,
  'Max V': 0.18,
  'Mean V': 0.00019999999999999998,
  'Iteration': 1},
 {'State': 846,
  'Action': 0,
  'Reward': 0.6,
  'Error': 0.17099999999999999,
  'Time': 0.0011608600616455078,
  'Alpha': 0.285,
  'Epsilon': 0.01,
  'Gamma': 0.9,
  'V[0]': 0.0,
  'Max V': 0.18,
  'Mean V': 0.00039,
  'Iteration': 2},
 {'State': 877,
  'Action': 0,
  'Reward': 0.5,
  'Error': 0.135375,
  'Time': 0.0016820430755615234,
  'Alpha': 0.27075,
  'Epsilon': 0.01,
  'Gamma': 0.9,
  'V[0]': 0.0,
  'Max V': 0.18,
  'Mean V': 0.0005404166666666667,
  'Iteration': 3},
 {'State': 848,
  'Action': 0,
  'Reward': 0.6,
  'Error': 0.15432749999999998,
  'Time': 0.002183198928833008,
  'Alpha': 0.25721249999999996,
  'Epsilon': 0.01,
  'Gamma': 0.9,
  'V[0]': 0.0,
  'Max V': 0.18,
  'Mean V': 0.0007118916666666666,
  'Iteration': 4},
 {'State': 879,
  '

In [27]:
# Save for graphs
dfFMQ = pd.DataFrame(statsQ)
dfFMQ.to_csv('Q_large.csv')

In [28]:
Q

P: 
array([[0.  , 1.  , 0.  , ..., 0.  , 0.  , 0.  ],
       [0.  , 0.  , 1.  , ..., 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
       ...,
       [0.  , 0.  , 0.  , ..., 0.75, 0.  , 0.  ],
       [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.5 ],
       [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ]])
array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

R: 
array([0., 0.])
array([0., 0.])