In [1]:
import numpy as np

In [None]:
class GridWorld_PolicyIteration():
    """Evaluate a stochastic policy on a ``size`` x ``size`` grid world.

    The top-left and bottom-right grid cells are terminal: their value is
    pinned to 0 after every sweep.  The value function lives in a padded
    ``(size+2) x (size+2)`` array so that every neighbour lookup is a plain
    index.  The padding ring mirrors the adjacent grid cell, which makes a
    move off the grid equivalent to staying in place.  No discount factor is
    applied.

    Policy improvement is not implemented (``Policy_Impr`` is a stub), so
    ``ReadyPlayer1`` only evaluates the initial uniform random policy.
    """

    def __init__(self, size, reward_value, delta):
        """Store the problem settings.

        Args:
            size: Number of cells along each side of the grid.
            reward_value: Reward added for every non-terminal state; a
                scalar (int, float, numpy scalar) or an array broadcastable
                to ``(size, size)``.
            delta: Convergence threshold on the per-state change of the
                value function between two consecutive sweeps.
        """
        self.size = size
        self.reward_value = reward_value
        self.actions = ['up', 'down', 'left', 'right']
        self.n_action = len(self.actions)
        self.delta = np.ones((size, size)) * delta

    def InitializePolicy(self):
        """Create the uniform random policy.

        ``self.policy`` is an ``n_action x N x N`` array; ``policy[a]`` holds
        the probability of taking action ``a`` (index into ``self.actions``)
        in every grid cell.
        """
        self.policy = np.ones((self.n_action, self.size, self.size)) * 0.25

    def InitializeValueFunction(self):
        """Create the zero-initialised, padded ``(N+2) x (N+2)`` value array."""
        self.large_Vk = np.zeros((self.size + 2, self.size + 2))

    def Coordinates(self):
        """Build the index arrays used to address cells and their neighbours.

        ``row_cod`` / ``col_cod`` are the row and column indices of every
        grid cell inside the padded value array (1..N).  ``move[a]`` is the
        ``[rows, cols]`` index pair of the cell reached by action ``a``,
        keyed by the action's position in ``self.actions``.
        """
        axis = np.arange(1, self.size + 1)
        cols, rows = np.meshgrid(axis, axis)
        self.row_cod = rows
        self.col_cod = cols

        # Row 0 is the top of the grid: 'up' decreases the row index and
        # 'left' decreases the column index.
        self.move_up = [rows - 1, cols]
        self.move_down = [rows + 1, cols]
        self.move_left = [rows, cols - 1]
        self.move_right = [rows, cols + 1]

        self.move = {0: self.move_up,
                     1: self.move_down,
                     2: self.move_left,
                     3: self.move_right}

    def Reward(self):
        """Build the ``N x N`` instant-reward matrix (first Bellman term).

        Any scalar or broadcastable array is accepted, so a float reward such
        as ``-1.0`` no longer leaves ``reward_matrix`` undefined.  The two
        terminal cells always get reward 0.
        """
        self.reward_matrix = np.full((self.size, self.size),
                                     self.reward_value, dtype=float)
        self.reward_matrix[0, 0] = 0
        self.reward_matrix[-1, -1] = 0

    def Policy_Eval(self):
        """Run one synchronous sweep of iterative policy evaluation."""
        expected_next = 0
        for a in range(self.n_action):
            rows, cols = self.move[a]
            expected_next = expected_next + self.policy[a] * self.large_Vk[rows, cols]

        self.large_Vk[self.row_cod, self.col_cod] = expected_next + self.reward_matrix

        # Terminal states keep value 0.
        self.large_Vk[1, 1] = 0
        self.large_Vk[self.size, self.size] = 0

        # Refresh the padding ring from the adjacent grid cells so that the
        # next sweep treats an off-grid move as "stay where you are".
        self.large_Vk[0, :] = self.large_Vk[1, :]
        self.large_Vk[:, -1] = self.large_Vk[:, -2]
        self.large_Vk[-1, :] = self.large_Vk[-2, :]
        self.large_Vk[:, 0] = self.large_Vk[:, 1]

    def Policy_Impr(self):
        """Policy improvement step (not implemented yet)."""
        pass

    def ReadyPlayer1(self):
        """Initialise everything and sweep until the value function settles.

        Sweeping stops once every state changed by less than ``delta``
        between two consecutive sweeps.  The result is left in
        ``self.large_Vk``.
        """
        self.InitializePolicy()
        self.InitializeValueFunction()
        self.Coordinates()
        self.Reward()

        this_Vk = self.large_Vk[self.row_cod, self.col_cod]
        # Start from +inf so at least one sweep runs whatever delta is.
        last_Vk = np.full((self.size, self.size), np.inf)

        while not (np.abs(this_Vk - last_Vk) < self.delta).all():
            # Fancy indexing returns a fresh copy, so this snapshot is not
            # affected by the in-place update done in Policy_Eval.
            last_Vk = this_Vk
            self.Policy_Eval()
            this_Vk = self.large_Vk[self.row_cod, self.col_cod]

In [19]:
def InitializePolicy(size):
    """Return a uniform random policy for a ``size`` x ``size`` grid.

    Args:
        size: Number of cells along each side of the grid.

    Returns:
        A dict mapping every 1-based ``(i, j)`` coordinate (both running
        from 1 to ``size``) to its own list of four action probabilities,
        each 0.25.
    """
    # Each state gets a fresh list so that editing one state's probabilities
    # never leaks into another state.
    return {(i, j): [0.25, 0.25, 0.25, 0.25]
            for i in range(1, size + 1)
            for j in range(1, size + 1)}

In [22]:
# Scratch check: a plain integer literal is an instance of `int`.
a = 2
isinstance(a, int)

True

In [None]:
class GridWorld_PolicyEvaluation():
    """Fixed-iteration policy evaluation on a ``size`` x ``size`` grid world.

    The top-left and bottom-right grid cells are terminal: their value is
    pinned to 0.  The value function lives in a padded ``(size+2) x (size+2)``
    array whose outer ring mirrors the adjacent grid cell, so a move off the
    grid is equivalent to staying in place.  No discount factor is applied.
    """

    def __init__(self, size, reward, itera, policy):
        """Store the problem settings.

        Args:
            size: Number of cells along each side of the grid.
            reward: Instant reward for every non-terminal state.
            itera: Number of evaluation sweeps ``ReadyPlayer1`` performs.
            policy: Mapping indexed by action name ('up', 'down', 'left',
                'right') giving the probability of that action; each value
                is multiplied with an ``N x N`` array, so a scalar or an
                ``N x N`` array both work.
        """
        self.size = size
        self.iter = itera

        # Instant reward: the first part of the Bellman equation,
        # sum_{a} pi(a|s) * sum_{s',r} p(s',r|s,a) * r, with r = reward.
        self.instant_r = reward * np.ones((self.size, self.size))
        self.instant_r[0, 0] = 0
        self.instant_r[-1, -1] = 0

        self.policy = policy

        # Index offset applied by each action along the axis it moves on:
        # 'up'/'down' shift the row, 'left'/'right' shift the column.
        # This attribute was previously read but never defined.
        self.actions_move = {'up': -1, 'down': 1, 'left': -1, 'right': 1}

    def __DataStruct(self):
        """Build the value array and the neighbour index arrays."""
        self.actions = ['up', 'down', 'left', 'right']

        # Value function starts at -1 everywhere (padding included) except
        # the two terminal states.
        self.large_Vk = np.ones((self.size + 2, self.size + 2)) * (-1)
        self.large_Vk[1, 1] = 0
        self.large_Vk[self.size, self.size] = 0

        # cod[0] holds the column index and cod[1] the row index of every
        # grid cell inside the padded array (1..N).
        self.cod = np.meshgrid(list(range(1, self.size + 1)),
                               list(range(1, self.size + 1)))
        rows, cols = self.cod[1], self.cod[0]

        # [rows, cols] index pair of the neighbour reached by each action.
        self.move_left = [rows, (cols + self.actions_move['left']).astype(int)]
        self.move_right = [rows, (cols + self.actions_move['right']).astype(int)]
        self.move_up = [(rows + self.actions_move['up']).astype(int), cols]
        self.move_down = [(rows + self.actions_move['down']).astype(int), cols]

        self.actions_cod = {'left': self.move_left,
                            'right': self.move_right,
                            'up': self.move_up,
                            'down': self.move_down}

    def Act_Update(self):
        """Run one synchronous sweep of iterative policy evaluation."""
        # Vk_dir is not used after being filled; it is kept for debugging.
        self.Vk_dir = {}
        sum_temp_vk = 0
        for a in self.actions:
            rows, cols = self.actions_cod[a]
            self.Vk_dir[a] = self.large_Vk[rows, cols]
            sum_temp_vk = sum_temp_vk + self.policy[a] * self.Vk_dir[a]

        sum_temp_vk = sum_temp_vk + self.instant_r
        self.large_Vk[self.cod[1], self.cod[0]] = sum_temp_vk

        # Terminal states keep value 0.
        self.large_Vk[1, 1] = 0
        self.large_Vk[self.size, self.size] = 0

        # Refresh the padding ring from the adjacent grid cells so that the
        # next sweep treats an off-grid move as "stay where you are".
        self.large_Vk[0, :] = self.large_Vk[1, :]
        self.large_Vk[:, -1] = self.large_Vk[:, -2]
        self.large_Vk[-1, :] = self.large_Vk[-2, :]
        self.large_Vk[:, 0] = self.large_Vk[:, 1]

    def ReadyPlayer1(self):
        """Initialise the data structures and run ``itera`` sweeps.

        The result is left in ``self.large_Vk``.
        """
        self.__DataStruct()

        for _ in range(self.iter):
            self.Act_Update()

In [None]:
class Policy_Iteration():
    
    def __init__(self,S,A):
        
        self.states = S
        self.actions = A
        
    def Reward(self):
        
        pass
    
    def State_Trans_Prob(self):
        
        