## Maze Problem

In [5]:
import numpy as np
import pygame

import random

# import gymnasium as gym
# from gymnasium import spaces
import gym
from gym import error, spaces, utils
from gym.utils import seeding


import traceback
import os

from stable_baselines3 import PPO, DQN, A2C, SAC
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.evaluation import evaluate_policy

In [6]:
class Agent:
    def __init__(self, x=0, y=0):
        self.name = 'MazeAgent'
        self.x = x
        self.y = y
        self.val = 6 # this is to denote that cell contains the agent
        self.action_happened = set()
        self.last_action = ''
    
    def __repr__(self):
        return f"Agent:- move: ({self.x} , {self.y}) ~ move happened: {self.action_happened} ~ last action: {self.last_action}"

In [16]:
class MazeEnv(gym.Env):
    
    # method -1
    '''
    The init method intialize all the variables needed.
    '''
    def __init__(self, conf):
        try:
            
            self.conf = conf
            
            # We have 4 actions, corresponding to "right", "up", "left", "down"
            # It describes the numerical structure of the legitimate actions that can be applied to the environment.
            self.action_space = spaces.Discrete(4)
            
            rows = self.conf['env']['rows']
            cols = self.conf['env']['cols']
            
            # observation is the x, y coordinate of the grid - agent's current cell position
            # for 4x4 maze, low pos: [0, 0] high pos: [3, 3]
            low = np.array([0, 0], dtype=np.int64)
            high = np.array([rows-1, cols-1], dtype=np.int64)
            self.observation_space = spaces.Box(low, high, shape=(2,), dtype=np.int64)

            
            # generate environment
            self.maze = np.zeros((rows, cols))

            # generate dummy env for tracking agent's visited cels
            self.visited = np.zeros((rows, cols))

            # generate Agent, Agent will always start from (0,0) cell
            self.agent = Agent(0, 0)
            self._updt_agent_pos(self.agent)

            # generate initial state of maze and agent
            self._gen_init_cnst_state()
            
        except Exception as e:
            raise e
    
    # method -2
    '''
    The reset method will be called to initiate a new episode. 
    '''
    def reset(self):
        try:
            rows = self.conf['env']['rows']
            cols = self.conf['env']['cols']
            
            # re-initializing the visited grid
            self.visited = np.zeros((rows, cols))
           
            # replacing agent at (0, 0) cell
            self._updt_agent_pos(Agent(0, 0))
            
            observation = self._get_obs()
            
            return observation
        except Exception as e:
            raise e
     
    # method -3
    '''
    The step method takes an action as an input and applies it to the environment, 
    which leads to the environment transitioning to a new state.
    action: 'up', 'down', 'right', 'left'
    '''
    def step(self, action):
        try:
            # get the direction where agent should move
            dir_num, dir_pos = self._action_to_direction(action)

            # Whether the episode has been terminated
            terminated = False
            # The reward that you can get from the environment after executing the action 
            # that was given as the input to the step function.
            reward = 0

            # agent's new position
            new_x = self.agent.x + dir_pos[0]
            new_y = self.agent.y + dir_pos[1]

            if not self._chk_pos_validity(new_x, new_y):
                reward = -1 
                self.agent.action_happened.add(action)
                if len(self.agent.action_happened) == 4: # agent can not move any more
                    terminated = True   
            else:
                self._updt_agent_pos(Agent(new_x, new_y))
                
                terminated = True if self._win() else False
                reward = 1 if terminated else 0.001

            
            self.agent.last_action = dir_num
            
            # The observation of the state of the environment.
            observation = self._get_obs()
            
            # This provides additional information depending on the environment.
            info = self._get_info()
            
            return observation, reward, terminated, info
        except Exception as e:
            raise e
            
    
    # method -4
    '''
    The render method is for rendering the environment
    '''
    def render(self):
        try:
            self._visualize()
        except Exception as e:
            raise e
    
    # method -5
    '''
    The close method should close any open resources that were used by the environment.
    '''
    def close(self):
        pass
    
    # translates the environment’s state into an observation
    def _get_obs(self):
        try:
            return np.array([self.agent.x, self.agent.y], dtype=np.int64)
        except Exception as e:
            raise e
    
    # auxiliary information
    def _get_info(self):
        try:
            return {"visited": self.visited}
        except Exception as e:
            raise e
    
    # updating agent position in maze
    def _updt_agent_pos(self, agent):
        try:
            self.agent = agent

            self.visited[self.agent.x, self.agent.y] = 1
        except Exception as e:
            raise e
    
    # checking agent position is valid or not in maze
    def _chk_pos_validity(self,  x, y):
        try:
            rows = self.conf['env']['rows']
            cols = self.conf['env']['cols']

            # agent can not visit out of bound, obstacles, already visited cells
            if x<0 or y<0 or x>=rows or y>=cols or self.maze[x][y] == -1 or self.visited[x][y] == 1:
                return False
            return True
        except Exception as e:
            raise e
    
    # condition for wining the game
    def _win(self):
        try:
            if self.maze[self.agent.x, self.agent.y] == 1: # agent reached last cell
                return True     
            return False
        except Exception as e:
            raise e
    
    # It is for initializing maze and agent position
    def _gen_init_cnst_state(self):
        try:
            
            rows = self.conf['env']['rows']
            cols = self.conf['env']['cols']
            
            # target will be always at the last cell
            self.maze[-1, -1] = 1

            # placing obstacles in maze
            self.maze[0, 1:3] = -1
            self.maze[1, 2:] = -1
            self.maze[2, 0] = -1
            self.maze[3, 0:2] = -1

        except Exception as e:
            raise e
        
    # It is for visualizing the maze's current position
    def _visualize(self):
        try:
            maze = self.maze.copy()
            maze[self.agent.x, self.agent.y] = self.agent.val
            print(self.agent)
            print(maze)
        except Exception as e:
            raise e
            
    # converting action to direction
    def _action_to_direction(self, action_num):
        try:
            ACTION = ["down", "right", "up", "left"]
            
            
            action = {
                'down': np.array([1, 0]),
                'right': np.array([0, 1]),
                'up': np.array([-1, 0]),
                'left': np.array([0, -1]),
            }
            
            return ACTION[action_num], action[ACTION[action_num]]
            
        except Exception as e:
            raise e

In [17]:
def get_conf():
    try:
        conf = {
            'env':{
                    'rows': 4,
                    'cols': 4
                },
            "model_path": "/Users/jaydeepchakraborty/JC/git-projects/model_util/Models/RL/maze_v1/",
            "model_nm": "maze_v1_gym_stable_baselines3",
            "log_path": "/Users/jaydeepchakraborty/JC/git-projects/model_util/logs/RL/maze_v1"
        }       
        return conf
    except Exception as e:
        raise e

In [18]:
def test_env(env):
    try:
        # checking using in-built method
        check_env(env)
        
        # checking using manual check
        test_episodes = 2

        for episode in range(1, test_episodes+1):

            obs = env.reset() # initial set of observation
            terminated = False
            score = 0

            env.render()
            while not terminated:
                action = env.action_space.sample()
                nxt_obs, reward, terminated, info = env.step(action)
                print(f"nxt_obs:- {nxt_obs}, reward:- {reward}")
                score += reward
                env.render()
            
            print(f"after episode:- {episode}, score:- {score}")

        env.close()
    except Exception as e:
        raise e

In [19]:
def train_model(env, conf):
    try:
        print("In Train method")
        train_episodes = 15
        log_path = conf["log_path"] 
        # MlpPolicy ~ multilayer perceptron policy
        model = PPO('MlpPolicy', env, verbose=0, tensorboard_log=log_path)
        for ep in range(train_episodes):
            model.learn(total_timesteps=10000, reset_num_timesteps=False, tb_log_name="PPO")
        return model
    except Exception as e:
        raise e

In [20]:
def save_model(model, conf):
    try:
        print("In Save method")
        model_path = conf["model_path"] + conf["model_nm"] 
        model.save(model_path)
    except Exception as e:
        raise e

In [21]:
def eval_model(conf):
    try:
        print("In Eval method")
        env = MazeEnv(conf)
        
        model_path = conf["model_path"] + conf["model_nm"] + ".zip"
        model = PPO.load(model_path, env=env) 
        
        evaluate_policy(model, env, n_eval_episodes=10, render=False)
    except Exception as e:
        raise e

In [22]:
def inference_model(conf):
    try:
        print("In Inference method")
        env = MazeEnv(conf)
        
        model_path = conf["model_path"] + conf["model_nm"] + ".zip"
        model = PPO.load(model_path, env=env)    

        terminated = False
        
        obs = env.reset()
        while not terminated:
            env.render()
            action, _states = model.predict(obs)
            action = action.item(0) # predict returns ndarray but model action is dicrete ~ int
            obs, rewards, terminated, info = env.step(action)

        env.render()
        
    except Exception as e:
        raise e

In [23]:
def main():
    try:
        # get config
        conf = get_conf()
        
        # Create the environment ~ MAZE
        m = MazeEnv(conf)
        
        # checking whether environment is working or not
        # test_env(m) 
        
        # model: train
        model = train_model(m, conf)
        
        # model: save
        save_model(model, conf)
        
        # model: evaluation
        eval_model(conf)
        
        # model: inference
        inference_model(conf)
        
        print("DONE")
        
    except Exception as e:
        traceback.print_exc()


if __name__ == '__main__':
    main()

In Train method
In Save method
In Eval method
In Inference method
Agent:- move: (0 , 0) ~ move happened: set() ~ last action: 
[[ 6. -1. -1.  0.]
 [ 0.  0. -1. -1.]
 [-1.  0.  0.  0.]
 [-1. -1.  0.  1.]]
Agent:- move: (1 , 0) ~ move happened: set() ~ last action: down
[[ 0. -1. -1.  0.]
 [ 6.  0. -1. -1.]
 [-1.  0.  0.  0.]
 [-1. -1.  0.  1.]]
Agent:- move: (1 , 1) ~ move happened: set() ~ last action: right
[[ 0. -1. -1.  0.]
 [ 0.  6. -1. -1.]
 [-1.  0.  0.  0.]
 [-1. -1.  0.  1.]]
Agent:- move: (2 , 1) ~ move happened: set() ~ last action: down
[[ 0. -1. -1.  0.]
 [ 0.  0. -1. -1.]
 [-1.  6.  0.  0.]
 [-1. -1.  0.  1.]]
Agent:- move: (2 , 2) ~ move happened: set() ~ last action: right
[[ 0. -1. -1.  0.]
 [ 0.  0. -1. -1.]
 [-1.  0.  6.  0.]
 [-1. -1.  0.  1.]]
Agent:- move: (2 , 3) ~ move happened: set() ~ last action: right
[[ 0. -1. -1.  0.]
 [ 0.  0. -1. -1.]
 [-1.  0.  0.  6.]
 [-1. -1.  0.  1.]]
Agent:- move: (3 , 3) ~ move happened: set() ~ last action: down
[[ 0. -1. -1.  0.]



# Resources
1) https://blog.paperspace.com/getting-started-with-openai-gym/
2) https://stable-baselines3.readthedocs.io/en/master/
3) https://www.youtube.com/watch?v=uKnjGn8fF70&list=PLQVvvaa0QuDf0O2DWwLZBfJeYY-JOeZB1&index=1
4) https://www.youtube.com/watch?v=Mut_u40Sqz4
5) https://www.youtube.com/watch?v=psDlXfbe6ok&list=PLd_Oyt6lAQ8RNofJqUduCqC3O0mxcyArM&index=6