In [2]:
import numpy as np
import sys

## Define Environment

In [5]:
class Environment:
    """Grid-world MDP solved by in-place value iteration.

    Every cell (i, j) of the reward matrix is a state. The agent picks one of
    four moves; ``P_actionSuccess`` gives, for each chosen move, the
    probability of actually moving in each of the four directions. A move
    that would leave the grid contributes a flat -1 instead of
    ``reward + gamma * V(successor)``.

    Attributes:
        R: m x n reward matrix.
        P_actionSuccess: dict mapping an action name to a list of four
            probabilities, ordered like ``actionList``.
        m, n: number of rows / columns of ``R``.
        gamma: discount factor applied to successor values.
        V: m x n array of current state-value estimates (starts at zero).
        policy: m x n array of action names (starts as all "right").
        actionList: the action names, in the order used by the
            probability lists.
    """

    def __init__(self, R, P_actionSuccess, gamma):
        """Store the model and allocate value/policy tables matching ``R``.

        Args:
            R: 2-D reward matrix (ndarray or nested sequence).
            P_actionSuccess: per-action outcome probabilities, see class doc.
            gamma: discount factor.
        """
        # asarray returns the same object for an ndarray input, so callers
        # that keep a reference to R still see the matrix stored here.
        R = np.asarray(R)
        self.R = R   # m*n reward matrix
        self.P_actionSuccess = P_actionSuccess
        self.m = R.shape[0]
        self.n = R.shape[1]
        self.gamma = gamma
        self.V = np.zeros(R.shape)
        # Size the initial policy from R rather than hard-coding 4x4, so the
        # class works for any grid. The inferred dtype is '<U5', wide enough
        # for every name in actionList and for the "None" placeholder.
        self.policy = np.full(R.shape, "right")
        self.actionList = ["left", "right", "up", "down"]

    def getSuccessor(self, i, j, action):
        """Return the current value of the cell reached from (i, j).

        Returns:
            ``V`` of the neighbouring cell in direction ``action``, or
            ``None`` when the move would leave the grid or the action name
            is not recognised.
        """
        if action == "left" and j > 0:
            return self.V[i][j-1]
        elif action == "right" and j < self.n-1:
            return self.V[i][j+1]
        elif action == "up" and i > 0:
            return self.V[i-1][j]
        elif action == "down" and i < self.m-1:
            return self.V[i+1][j]
        return None

    def Bellmann(self, i, j):
        """Apply one Bellman optimality backup to state (i, j).

        Returns:
            ``(max_val, bestAction)``: the largest expected value over all
            actions and the name of the action achieving it. Ties keep the
            earlier action in ``actionList``.
        """
        bestAction = "None"
        max_val = float("-inf")
        reward = self.R[i][j]
        for action in self.actionList:
            val = 0
            # Index k of the probability list is the direction actually
            # taken, expressed as an index into actionList.
            for k, p in enumerate(self.P_actionSuccess[action]):
                V_successor = self.getSuccessor(i, j, self.actionList[k])
                if V_successor is not None:
                    # Use the instance's discount factor, not a global.
                    val += p * (reward + self.gamma * V_successor)
                else:
                    val += p * -1  # leaving the grid: flat reward of -1
            if val > max_val:
                max_val = val
                bestAction = action
        return max_val, bestAction

    def ValueIteration(self, max_iter=1000, tol=0.01):
        """Sweep the grid until the values stop changing.

        ``V`` and ``policy`` are updated in place, cell by cell, so later
        cells in a sweep already see the new values of earlier ones.

        Args:
            max_iter: maximum number of full sweeps.
            tol: stop once the largest per-cell change in a sweep is
                below this threshold.
        """
        itr = 0
        delta = 0  # defined up front so the final print works if max_iter <= 0
        while itr < max_iter:
            itr += 1
            delta = 0
            for i in range(self.m):
                for j in range(self.n):
                    v = self.V[i][j]
                    self.V[i][j], self.policy[i][j] = self.Bellmann(i, j)
                    delta = max(delta, abs(v - self.V[i][j]))
            if delta < tol:
                print("Values converged....")
                break
        print("(itr, delta) = ", (itr, delta))

## Driver Program

In [9]:
# Outcome probabilities for each chosen action. Entry k of a list is the
# probability that the agent actually moves in direction
# ["left", "right", "up", "down"][k].
P_actionSuccess = {
    "left": [0.8, 0, 0.1, 0.1],
    "right": [0, 0.8, 0.1, 0.1],
    "up": [0.1, 0.1, 0.8, 0],
    "down": [0.1, 0.1, 0, 0.8],
}

# Reward matrix used for this run: one entry per grid cell.
R = np.array([
    [0, 0.45, 1, 0.9],
    [0.23, 1.25, 0, 0],
    [0, 0.45, 0.75, 0],
    [0.85, 1.5, 2.5, 0.85],
])

# Alternative reward matrices left disabled; swap one in for R to experiment.
# R = np.array([[0.4, 0.1, 0.5, 0.8],
#               [0.2, 0.6, 0.19, 0.3],
#               [0.11, 0.26, 0.6, 0.7],
#               [0.10, 0.2, 0.3, 0.4]])
#
# R = np.array([[4, 1, 5, 8],
#               [2, 6, 19, 3],
#               [11, 26, 6, 7],
#               [10, 2, 3, 4]])

# Discount factor passed to the environment.
gamma = 0.98

envObj = Environment(R, P_actionSuccess, gamma)

print("_________________Value Iteration_________________\n")
print("Rewards\n", R)

# Updates envObj.V and envObj.policy in place and prints convergence info.
envObj.ValueIteration()

print("Optimal Value\n", envObj.V)
print("Optimal Policy\n", envObj.policy)

_________________Value Iteration_________________

Rewards
 [[0.   0.45 1.   0.9 ]
 [0.23 1.25 0.   0.  ]
 [0.   0.45 0.75 0.  ]
 [0.85 1.5  2.5  0.85]]
Values converged....
(itr, delta) =  (116, 0.009599949467087754)
Optimal Value
 [[46.03452537 52.34481561 52.49861186 46.92055101]
 [52.08833756 53.88631892 53.28901991 51.55621296]
 [52.12638615 53.97629378 54.80036568 52.85971308]
 [48.08469127 53.97737426 55.59698689 49.4332896 ]]
Optimal Policy
 [['right' 'down' 'down' 'left']
 ['right' 'down' 'down' 'left']
 ['right' 'right' 'down' 'left']
 ['right' 'up' 'up' 'left']]
