In [53]:
import copy
from typing import Any, Tuple, Dict, Optional
import numpy as np
import gymnasium as gym 
from gymnasium import spaces

In [54]:
def clamp(v, minimal_value, maximal_value):
    """Restrict ``v`` to the range ``[minimal_value, maximal_value]``.

    The lower bound is applied first and the upper bound second, so if the
    bounds are inverted (``minimal_value > maximal_value``) the upper bound
    wins.
    """
    # Raise v to the lower bound when it falls below it.
    floored = minimal_value if minimal_value > v else v
    # Then cap the result at the upper bound.
    return maximal_value if maximal_value < floored else floored

In [55]:
class GridWorldEnv(gym.Env):
    def __init__(self):
        self.map = [
            list("s   "),
            list("    "),
            list("    "),
            list("gt g"),
        ]
        
        self.action_space = spaces.Discrete(4) # defining the action space (0 , 1, 2, 3)
        self.observation_space = spaces.Box(low=0, high=4, shape=(2,), dtype=np.int32) # defining the observation space with minimum value of 0 and maximum value of 4
        self.agent_position = [0, 0] # as mentioned in the problem statement, the agent starts at S which is at position [0, 0]

    def reset(self,
              *, 
              seed: Optional[int] = None,
              options : Optional[dict] = None,
    ):
        super().reset(seed=seed)
        
        self.agent_position = [0, 0] # Resets the position of the agent to [0, 0], indicating that the agent starts at the top-left corner of the grid.

        return self._observe(), {}
    
    def step(self, action: int) -> Tuple[np.ndarray, float, bool, bool, Dict[str, Any]]:
        '''This is a method definition for the step function of the GridWorldEnv.
           It takes an argument action, which represents the action the agent 
           wants to take, and returns a tuple containing the next observation, 
           reward, whether the episode is done, and additional information.'''
        
        reward = None
        done = None

        assert self.action_space.contains(action) # Asserts that the action taken by the agent is valid, ensuring it lies within the action space defined for the environment.
        match action:  # Modifies the agent's position based on the action taken: up, right, down, or left.
            case 0: # up
                self.agent_position[0] -=1
            case 1: # right
                self.agent_position[1] +=1
            case 2: # down
                self.agent_position[0] +=1
            case _: # left
                self.agent_position[1] -=1

        # Ensures that the agent's position remains within the bounds of the grid by clamping the position values between 0 and 3.
        self.agent_position[0] = clamp(self.agent_position[0], 0, 3) 
        self.agent_position[1] = clamp(self.agent_position[1], 0, 3)
        
        # Calls the _observe method to get the observation of the environment after the agent's action.
        observation = self._observe()

        reward = 0
        done = False 

        #Checks if the agent has landed on a trap or the goal.
        if 't' == self.map[self.agent_position[0]][self.agent_position[1]]:  # If the agent lands on a trap ('t'), it receives a reward of -1 and the episode terminates.
            reward = -1
            done = True
        
        if 'g' == self.map[self.agent_position[0]][self.agent_position[1]]:  # If the agent reaches the goal ('g'), it receives a reward of +1 and the episode terminates.
            reward = +1
            done = True
        
        return observation, reward, done, False, {}
    
    def render(self):
        '''visualizes the current state in a human- or agent-readable way'''
        rendered_map = copy.deepcopy(self.map)
        rendered_map[self.agent_position[0]][self.agent_position[1]] = "A"
        print("_________")
        for row in rendered_map:
            print('|', end = '')
            for cell in row:
                print("{}|".format(cell), end='')
            print()
        print("---------")
        return None
    
    def close(self):
        pass

    def _observe(self):
        '''the observation returned is a NumPy array representing the current position of the agent in the grid world. This method appears to be a simple way of representing 
        the environment's state by providing the coordinates of the agent's position.'''
        return np.array(self.agent_position)


In [56]:
import time 
# Human-readable labels for the discrete actions; the list index is the action id.
actions = [
    "Up",     # 0
    "Right",  # 1
    "Down",   # 2
    "Left",   # 3
]

In [57]:
# Create the environment and draw its initial state ("A" marks the agent, which starts at [0, 0]).

env = GridWorldEnv()
env.render()

_________
|A| | | |
| | | | |
| | | | |
|g|t| |g|
---------


In [58]:
# Reset the environment. reset() returns an (observation, info) pair, so unpack
# both instead of binding the whole tuple to `obs`.
obs, info = env.reset()
env.render()


_________
|A| | | |
| | | | |
| | | | |
|g|t| |g|
---------


In [59]:
# Draw one random action id (an integer 0-3) from the discrete action space.
env.action_space.sample()

0

In [60]:
# Show the declared observation space (a Box holding two int32 coordinates).
env.observation_space

Box(0, 4, (2,), int32)

In [61]:
action = env.action_space.sample() # draw a random action id from the action space
print("action: ",action, actions[action]) # print the action id and its human-readable label
obs, reward, terminated, truncated, info = env.step(action)  # apply the action and unpack the observation, reward, terminated, truncated and info values
print("observation: ",obs, ",reward: ",reward, ",terminated: ",terminated,",truncated: ",truncated,",Info: ",info) # print the step results
env.render() # visualize the grid after the move

# The action is sampled at random, so the output differs between runs. In the run recorded below the action was Right, so "A" moved from [0,0] to [0,1].

action:  1 Right
observation:  [0 1] ,reward:  0 ,terminated:  False ,truncated:  False ,Info:  {}
_________
|s|A| | |
| | | | |
| | | | |
|g|t| |g|
---------


In [62]:
# Take 50 random actions, starting a new episode whenever the current one ends.

for i in range(50):
    action = env.action_space.sample()
    print(actions[action])
    obs, reward, terminated, truncated, info = env.step(action)
    env.render()

    # Uncomment this to enable slow motion mode
    #time.sleep(3.0)
    # An episode is over when either flag is set, so check both before resetting.
    if terminated or truncated:
        print('Reset environment')
        obs, info = env.reset()
        env.render()
env.close()

Down
_________
|s| | | |
| |A| | |
| | | | |
|g|t| |g|
---------
Down
_________
|s| | | |
| | | | |
| |A| | |
|g|t| |g|
---------
Right
_________
|s| | | |
| | | | |
| | |A| |
|g|t| |g|
---------
Up
_________
|s| | | |
| | |A| |
| | | | |
|g|t| |g|
---------
Right
_________
|s| | | |
| | | |A|
| | | | |
|g|t| |g|
---------
Up
_________
|s| | |A|
| | | | |
| | | | |
|g|t| |g|
---------
Left
_________
|s| |A| |
| | | | |
| | | | |
|g|t| |g|
---------
Up
_________
|s| |A| |
| | | | |
| | | | |
|g|t| |g|
---------
Down
_________
|s| | | |
| | |A| |
| | | | |
|g|t| |g|
---------
Down
_________
|s| | | |
| | | | |
| | |A| |
|g|t| |g|
---------
Down
_________
|s| | | |
| | | | |
| | | | |
|g|t|A|g|
---------
Left
_________
|s| | | |
| | | | |
| | | | |
|g|A| |g|
---------
Reset environment
_________
|A| | | |
| | | | |
| | | | |
|g|t| |g|
---------
Right
_________
|s|A| | |
| | | | |
| | | | |
|g|t| |g|
---------
Left
_________
|A| | | |
| | | | |
| | | | |
|g|t| |g|
---------
Right
_________