In [1]:
import numpy as np

# Print floating-point array entries with 3 digits of precision.
np.set_printoptions(precision=3)

# Number of cells in the 1-D environment; used as the default `size` of Env.
SIZE = 9

In [2]:
class Env():
    """One-dimensional corridor environment with a terminal cell at the right end.

    Every cell has a payoff of -1 except the last one (index ``size - 1``),
    whose payoff is 0. The reward for a move is the payoff of the cell that
    the move lands on.
    """

    def __init__(self, size=SIZE, prob=1):
        """Build the payoff table.

        size : Int, size of array
        prob : Float, probability of stochastic movement
               (stored and exposed via ``get_prob``; ``feedback`` does not
               read it, so movement is deterministic)
        """
        self.size = size
        self.prob = prob
        self.payoff = np.full(size, -1.0)
        self.payoff[size-1] = 0 # value of terminal state

    def feedback(self, curr, action):
        """Apply one move and report its outcome.

        :: current_state, action -> reward, next_state

        curr   : Int, index of the current cell
        action : 0 moves one cell to the left, any other value moves one
                 cell to the right

        Moves that would leave the array are clamped to the nearest end, so
        the returned state is always a valid index into ``payoff``.
        """
        step = -1 if action == 0 else 1
        # Clamp on BOTH sides: without the upper bound, stepping right from
        # the last cell would index payoff[size] and raise IndexError.
        next_state = int(min(self.size - 1, max(0, curr + step)))
        reward = self.payoff[next_state]

        return reward, next_state

    def get_payoff(self):
        """Return the per-cell payoff array (the internal array, not a copy)."""
        return self.payoff

    def get_prob(self):
        """Return the ``prob`` value given at construction."""
        return self.prob

In [3]:
env = Env()
env.get_payoff()

# The agent traverses this 1-D array; each entry is the payoff for landing on that cell
# (-1 everywhere except the terminal cell at the right end, which is 0).
# NOTE(review): the original note says the agent starts on the 2nd element, but no start state is set here -- confirm.
# The goal is to reach the right end as fast as possible.

array([-1., -1., -1., -1., -1., -1., -1., -1.,  0.])

In [6]:
class Iterative_DP():
    """Tabular dynamic-programming agent for a 1-D, two-action environment.

    Holds a state-value table and a greedy policy (0 = left, 1 = right) and
    alternates in-place policy evaluation with greedy policy improvement.
    The last state (index ``env.size - 1``) is never updated: its value and
    policy entry stay at their initial 0.
    """

    def __init__(self, env, gamma=0.99):
        """Initialise zero values and the greedy policy w.r.t. those values.

        env   : object exposing ``size`` and
                ``feedback(state, action) -> (reward, next_state)``
        gamma : Float, discount factor applied to the successor's value
        """
        # Number of states that get updated (everything except the last one).
        self.size = env.size-1
        self.gamma = gamma

        # Both tables have one entry per environment state, including the last.
        # The policy is kept as a float array (np.zeros default dtype).
        self.values = np.zeros(self.size+1)
        self.policy = np.zeros(self.size+1)
        self.update_policy(env)

    def sweep(self, env, n_eval=1):
        """Run policy evaluation followed by one policy-improvement step.

        env    : environment queried through ``feedback``
        n_eval : Int, number of evaluation passes over the states before the
                 policy is improved (default 1, the original behaviour)

        Values are updated in place in ascending state order, so a state's
        backup already sees the new values of lower-indexed states from the
        same pass.
        """
        for _ in range(n_eval):
            for s in range(self.size):
                r, s_ = env.feedback(s, self.policy[s])
                self.values[s] = r + self.gamma * self.values[s_]

        self.update_policy(env)

    def update_policy(self, env):
        """Make the policy greedy with respect to the current values.

        For each updated state, compares the one-step lookahead value of
        moving left (action 0) and right (action 1). Right is chosen unless
        left is strictly better, so ties go to the right.
        """
        for s in range(self.size):
            # state after left/right movement
            r_l, s_l = env.feedback(s, 0)
            r_r, s_r = env.feedback(s, 1)

            v_l = r_l + self.gamma * self.values[s_l]
            v_r = r_r + self.gamma * self.values[s_r]

            self.policy[s] = 0 if (v_l > v_r) else 1

In [9]:
# Build the agent; its constructor already derives an initial greedy policy from all-zero values.
agent1 = Iterative_DP(env)

# Run 10 sweeps, printing the value table after each one.
# The [:-1] slice drops the last entry, which belongs to the terminal state and is never updated.
for _ in range(10):
    agent1.sweep(env)
    print(agent1.values[:-1])

# Final policy for the same states (0 = left, 1 = right).
print(agent1.policy[:-1])

[-1. -1. -1. -1. -1. -1. -1.  0.]
[-1.99 -1.99 -1.99 -1.99 -1.99 -1.99 -1.    0.  ]
[-2.97 -2.97 -2.97 -2.97 -2.97 -1.99 -1.    0.  ]
[-3.94 -3.94 -3.94 -3.94 -2.97 -1.99 -1.    0.  ]
[-4.901 -4.901 -4.901 -3.94  -2.97  -1.99  -1.     0.   ]
[-5.852 -5.852 -4.901 -3.94  -2.97  -1.99  -1.     0.   ]
[-6.793 -5.852 -4.901 -3.94  -2.97  -1.99  -1.     0.   ]
[-6.793 -5.852 -4.901 -3.94  -2.97  -1.99  -1.     0.   ]
[-6.793 -5.852 -4.901 -3.94  -2.97  -1.99  -1.     0.   ]
[-6.793 -5.852 -4.901 -3.94  -2.97  -1.99  -1.     0.   ]
[1. 1. 1. 1. 1. 1. 1. 1.]
