In [2]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

**Iterative Policy Evaluation in Grid-World**
- the state-transition probability matrix S is fixed
- the reward of each grid cell is fixed at -1
- the policy is equiprobable (every action is chosen with the same probability)

In [1]:
class GridWorld():
    """Iterative policy evaluation for an equiprobable policy on a square grid.

    The top-left and bottom-right cells are treated as terminal: their
    instant reward and their value are pinned to 0. The value function lives
    in ``large_Vk``, a ``(size + 2, size + 2)`` array whose outer ring is
    padding. After every sweep the padding mirrors the adjacent border cells,
    so an action that would leave the grid reads the value of the cell the
    agent started in.

    Usage: ``g = GridWorld(4, -1, 100); g.ReadyPlayer1()`` and then read the
    state values from ``g.large_Vk[1:-1, 1:-1]``.
    """

    def __init__(self, size, reward, itera):
        """Store the configuration and build the instant-reward grid.

        Args:
            size: Side length of the square grid.
            reward: Reward assigned to every non-terminal cell.
            itera: Number of sweeps performed by ``ReadyPlayer1``.
        """
        self.size = size
        self.iter = itera

        # Instant reward: the first part of the Bellman equation,
        # sum_{a} pi(a|s) * sum_{s',r} p(s',r|s,a) * r, with r = reward.
        self.instant_r = reward * np.ones((self.size, self.size))
        # The two corner cells are terminal and yield no reward.
        self.instant_r[0, 0] = 0
        self.instant_r[-1, -1] = 0

    def __DataStruct(self):
        """(Re)initialise the policy, the value array and the neighbour indices."""
        self.actions = ['up', 'down', 'left', 'right']

        # Equiprobable policy: the probability depends on the number of
        # actions, not on the grid size, so that the probabilities sum to 1.
        action_prob = 1 / len(self.actions)
        self.policy = {a: action_prob for a in self.actions}

        # Value-function array with a one-cell padding ring around the grid.
        self.large_Vk = np.ones((self.size + 2, self.size + 2)) * (-1)
        self.large_Vk[1, 1] = 0
        self.large_Vk[self.size, self.size] = 0

        # Per-action index offsets (one entry per grid cell).
        self.actions_move = {'up': (-1) * np.ones((self.size, self.size)),
                             'left': (-1) * np.ones((self.size, self.size)),
                             'down': np.ones((self.size, self.size)),
                             'right': np.ones((self.size, self.size))}

        # Coordinates of the real grid cells inside the padded array.
        # np.meshgrid's default 'xy' indexing returns column indices first
        # and row indices second.
        self.cod = np.meshgrid(list(range(1, self.size + 1)),
                               list(range(1, self.size + 1)))
        cols, rows = self.cod[0], self.cod[1]

        # [row indices, column indices] of the cell reached by each action.
        self.move_left = [rows, (cols + self.actions_move['left']).astype(int)]
        self.move_right = [rows, (cols + self.actions_move['right']).astype(int)]
        self.move_up = [(rows + self.actions_move['up']).astype(int), cols]
        self.move_down = [(rows + self.actions_move['down']).astype(int), cols]

        self.actions_cod = {'left': self.move_left,
                            'right': self.move_right,
                            'up': self.move_up,
                            'down': self.move_down}

    def Act_Update(self):
        """Perform one synchronous sweep of policy evaluation over the grid.

        Every cell's new value is the instant reward plus the policy-weighted
        sum of its neighbours' current values. All neighbour values are read
        before any cell is overwritten.
        """
        # self.Vk_dir is not used after this method; it is kept for debugging.
        self.Vk_dir = {}
        sum_temp_vk = 0
        for a in self.actions:
            target_rows, target_cols = self.actions_cod[a]
            self.Vk_dir[a] = self.large_Vk[target_rows, target_cols]
            sum_temp_vk += self.policy[a] * self.Vk_dir[a]

        sum_temp_vk += self.instant_r
        self.large_Vk[self.cod[1], self.cod[0]] = sum_temp_vk

        # Terminal cells keep a value of 0.
        self.large_Vk[1, 1] = 0
        self.large_Vk[self.size, self.size] = 0

        # Refresh the padding ring from the adjacent border cells.
        self.large_Vk[0, :] = self.large_Vk[1, :]
        self.large_Vk[:, -1] = self.large_Vk[:, -2]
        self.large_Vk[-1, :] = self.large_Vk[-2, :]
        self.large_Vk[:, 0] = self.large_Vk[:, 1]

    def ReadyPlayer1(self):
        """Reset the data structures and run ``self.iter`` evaluation sweeps."""
        self.__DataStruct()

        for _ in range(self.iter):
            self.Act_Update()