In [1]:
import numpy as np

# Purpose of the game: teach the mouse to avoid the traps and find the exit from the cage

class Mouse_and_Cage:
    """Grid-world cage in which a mouse learns a path to the exit by policy iteration.

    The cage is a ``height`` x ``width`` grid. The mouse starts in the top-left
    cell (0, 0); the exit is the bottom-right cell. Entering an ordinary cell
    yields ``reward``, entering a trap yields -5.0 and entering the exit
    yields 5.0.

    Moves are encoded as integers: 0 = up, 1 = right, 2 = down, 3 = left.

    Parameters
    ----------
    height, width : int
        Size of the cage.
    cage_trap_position_y, cage_trap_position_x : sequence of int
        Row and column coordinates of the traps. They are paired with ``zip``,
        so surplus entries of the longer sequence are ignored. With the
        defaults (14 rows, 15 columns) the trap wall on row 1 therefore stops
        at column 13 and cell (1, 14) stays free.
    reward : float
        Reward for entering a cell that is neither a trap nor the exit.
    gamma : float
        Discount factor applied to the value of the next cell.
    seed : int or None
        Seed for the random initial policy. ``None`` (default) draws from
        NumPy's global random state, exactly as before this parameter existed.
    """

    # Move index -> (row offset, column offset); single source of truth for
    # the move encoding used by every method of the class.
    _MOVE_OFFSETS = ((-1, 0), (0, 1), (1, 0), (0, -1))

    def __init__(self, height=5, width=15, cage_trap_position_y = [1,1,1,1,1,1,1,1,1,1,1,1,1,1], 
                 cage_trap_position_x = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14], reward = -0.1, gamma=0.1,
                 seed=None):

        # size of the cage and position of the exit (bottom-right corner)
        self.height = height
        self.width = width
        self.y_final = self.height - 1
        self.x_final = self.width - 1

        # positions of the traps; copied so that the shared default lists (or
        # the caller's lists) are never aliased by the instance
        self.y_traps = list(cage_trap_position_y)
        self.x_traps = list(cage_trap_position_x)

        # reward received when ENTERING each cell
        self.reward = reward
        self.cage_rewards = np.ones(shape=(self.height, self.width)) * self.reward
        self.cage_rewards[self.y_final, self.x_final] = 5.0
        for y_trap, x_trap in zip(self.y_traps, self.x_traps):
            self.cage_rewards[y_trap, x_trap] = -5.0

        # number of possible moves, random initial policy and its (zero) values
        self.moves = len(self._MOVE_OFFSETS)
        rng = np.random if seed is None else np.random.RandomState(seed)
        self.policy = rng.randint(0, self.moves, size=(self.height, self.width)).astype(np.uint8)
        self.cage_values = np.zeros(shape=(self.height, self.width))

        # discount factor for the value of the next cell
        self.gamma = gamma

    def _next_position(self, y, x, move):
        """Return the cell reached from (y, x) with ``move``; walls keep the mouse in place."""
        dy, dx = self._MOVE_OFFSETS[int(move)]
        next_y = min(max(y + dy, 0), self.height - 1)
        next_x = min(max(x + dx, 0), self.width - 1)
        return next_y, next_x

    def policy_evaluation(self):
        """Run one synchronous evaluation sweep of the current policy.

        Each cell's value becomes the reward of the cell the policy moves into
        plus ``gamma`` times that cell's value from before the sweep. The
        pre-sweep values are kept in ``self.old_cage_values``. Trap and exit
        cells are swept like every other cell.
        """
        self.old_cage_values = self.cage_values.copy()
        for i in range(self.height):
            for j in range(self.width):
                # y, x is the position of the mouse after following the policy
                y, x = self._next_position(i, j, self.policy[i, j])
                self.cage_values[i, j] = self.cage_rewards[y, x] + self.gamma * self.old_cage_values[y, x]

    def is_final(self, y, x):
        """Return True if (y, x) is the exit or one of the configured traps.

        The check is side-effect free: it does not touch ``self.y`` / ``self.x``,
        which hold the mouse position used by ``find_escape``.
        """
        if (y, x) == (self.y_final, self.x_final):
            return True
        return (y, x) in zip(self.y_traps, self.x_traps)

    def policy_improvement(self):
        """Make the policy greedy with respect to the current cell values.

        For every non-final cell the move with the highest
        ``reward + gamma * value`` of the target cell is selected. Moves that
        would leave the cage are excluded (scored -inf); ties go to the lowest
        move index. Final cells (traps, exit) keep their current policy entry.
        """
        for i in range(self.height):
            for j in range(self.width):
                if self.is_final(i, j):
                    continue
                # score of every possible move; -inf marks moves through a wall
                values = np.full(len(self._MOVE_OFFSETS), -np.inf)
                for move, (dy, dx) in enumerate(self._MOVE_OFFSETS):
                    y, x = i + dy, j + dx
                    if 0 <= y < self.height and 0 <= x < self.width:
                        values[move] = self.cage_rewards[y, x] + self.gamma * self.cage_values[y, x]
                # select the move that maximizes the expected reward
                self.policy[i, j] = np.argmax(values)

    def train(self, epochs=1000, tolerance=1e-5):
        """Alternate policy evaluation and policy improvement until the policy is stable.

        Every epoch performs one evaluation sweep. When the mean absolute
        change of the values drops below ``tolerance`` the policy is improved;
        training stops as soon as an improvement leaves the policy unchanged,
        or after ``epochs`` sweeps. The number of sweeps performed is left in
        ``self.e``.
        """
        self.epochs = epochs
        self.tolerance = tolerance
        self.e = 0

        while self.e < self.epochs:
            self.e += 1
            # policy_evaluation() snapshots the previous values itself
            self.policy_evaluation()
            mean_change = np.mean(np.abs(self.cage_values - self.old_cage_values))
            if mean_change < self.tolerance:
                self.old_policy = self.policy.copy()
                self.policy_improvement()
                # explicit element-wise comparison instead of summing uint8 differences
                if np.array_equal(self.policy, self.old_policy):
                    break

    def find_escape(self):
        """Follow the current policy from (0, 0) until the exit is reached.

        Visited cells are marked with 1 in ``self.steps`` (a ``height`` x
        ``width`` list of lists); the mouse position is tracked in
        ``self.y`` / ``self.x``. Traps do not stop the walk.

        Raises
        ------
        RuntimeError
            If the policy leads the mouse back to a cell it has already
            visited. The policy is deterministic, so a revisit means the mouse
            would walk the same loop forever and never reach the exit.
            ``self.steps`` still holds the cells visited so far.
        """
        self.x = 0
        self.y = 0
        # map used to track the steps of the mouse in the cage
        self.steps = [[0 for _ in range(self.width)] for _ in range(self.height)]
        self.steps[0][0] = 1
        while True:
            self.y, self.x = self._next_position(self.y, self.x, self.policy[self.y, self.x])
            if (self.y, self.x) == (self.y_final, self.x_final):
                self.steps[self.y][self.x] = 1
                break
            if self.steps[self.y][self.x]:
                raise RuntimeError(
                    "the policy loops at cell (%d, %d) and never reaches the exit; "
                    "train the policy (or check the trap layout) first" % (self.y, self.x)
                )
            self.steps[self.y][self.x] = 1

    def show_the_path_to_escape(self):
        """Return the visit map built by the last ``find_escape`` call (1 = visited)."""
        return self.steps

    def show_rewards(self):
        """Return the reward grid floored to integers (the default -0.1 is shown as -1)."""
        return np.floor(self.cage_rewards).astype(int)

In [2]:
# Default cage, with the tunable settings spelled out so the configuration of
# this run is visible in the cell (the trap layout is left at its default).
rlf = Mouse_and_Cage(height=5, width=15, reward=-0.1, gamma=0.1)
rlf.train(epochs=1000, tolerance=1e-5)

In [3]:
# Walk the trained policy from the top-left corner, then display the visit map
# (1 = cell visited by the mouse, 0 = untouched) as the cell output.
rlf.find_escape()
escape_path = rlf.show_the_path_to_escape()
escape_path

[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]]

In [4]:
# Trap coordinates: y[k] (row) is paired with x[k] (column).
# Two traps stacked in column 4, at (0, 4) and (1, 4). This is the layout the
# path shown below this cell was produced with.
# Keep both lists the same length and do not end either line with a comma: a
# trailing comma turns the list into a one-element tuple, and the class pairs
# the coordinates with zip(), which silently drops unmatched entries.
# NOTE(review): a longer trap wall along row 1 must leave a gap reachable from
# (0, 0), otherwise the mouse has no trap-free route to the exit.
y = [0, 1]
x = [4, 4]

rlf2 = Mouse_and_Cage(cage_trap_position_y=y, cage_trap_position_x=x)
rlf2.train()

rlf2.find_escape()
rlf2.show_the_path_to_escape()

[[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]]