# Install and Import Dependencies

In [11]:
# install modules stable-baselines3 includes gym
!pip install stable-baselines3[extra] 



In [3]:
# !pip install tensorflow gym keras keras-rl2

In [4]:
# DQN = Deep-Q-Network - maximize bellman equation, MLP Policy for model
import gym 
from gym import Env
from gym.spaces import Discrete, Box, Tuple
import numpy as np
import os
from stable_baselines3 import DQN
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_checker import check_env

# Building the Environment

In [5]:
# class for maze env where agent will learn
class MazeEnv(Env):
    
    # maze member_variables & action / observation space
    def __init__(self, DIM_COL_ROW, STARTING_CELL, WALLS):
        # declare num_col & num_row; boards are square
        self.dim_row_col = DIM_COL_ROW
        # declare starting cell
        self.starting_cell = STARTING_CELL
        # actions we can take, up, right, down, left ; NESW
        self.action_space = Discrete(4) # 0,1,2,3
        # observation 
        # option1 - discrete(36) w/ % to make 6x6 grid
        # option2 - Tuple((Discrete(DIM_COL_ROW), Discrete(DIM_COL_ROW))
        # taking option 2 bc tuple exists; vscode says stable not supported from check_env, but jupyter no error, curious
        self.observation_space = Tuple((Discrete(DIM_COL_ROW), Discrete(DIM_COL_ROW)))
        # set starting cell - note self.state updates per step
        self.state = STARTING_CELL
        # set max_steps to prevent infinite searching in maze
        self.max_steps = 1000 # 1000 baseline, will change
        # set current step for iterating action steps
        self.current_step = 0
        # set value for cell in maze that ends episode
        self.end_cell = (DIM_COL_ROW,DIM_COL_ROW)
        # set episode termination variable to false
        self.episode_terminated = False
        # set walls for maze below
        self.Walls = WALLS
        
    # note - impossible move below pass increments; optimize by logic on step
    # how about step incrementer in else (non-pass steps)
        
    # moves agent around env; how actions change states
    def step(self, action):
        # action discrete values defined below
        # 0 is down 
        # 1 is left
        # 2 is up
        # 3 is right
        
        # take action & change state cell; passes prevent action in else
        if (action == 0 or action == 2):
            # pass if at bottom and go down
            if (self.state[0] == 0 and action == 0):
                pass
            # pass if at top and go up
            elif (self.state[0] == self.dim_row_col and action == 2):
                pass
            # take action up or down when not on border
            else:
                # 2 - 1 => go up 1; 0 - 1 => go down 1 
                self.state = (self.state[0] + (action - 1), self.state[1])
        # else action = 1 or 3
        else:
            # pass if at left and go left
            if (self.state[1] == 0 and action == 1):
                pass
            # pass if at right and go right
            elif (self.state[1] == self.dim_row_col and action == 3):
                pass
            # take action up or down when not on border
            else:
                # 3 - 2 => go right 1; 1 - 2 => go left 1 
                self.state = (self.state[0], self.state[1] + (action -2))
                
        # increment max_steps as action was taken - note pass above
        self.current_step += 1
        
        # calculate reward & check if at end-condition
        if (self.state == self.end_cell):
            self.episode_terminated = True
            reward = 100
        else:
            # incentive to keep moving; reach end quickly
            reward = -1/(self.dim_row_col*self.dim_row_col)
            
        # end-condition w/ out reward is too many steps taken
        if (self.current_step >= self.max_steps):
            self.episode_terminated = True
            
        # info {} must be returned for step() for Env class; dk why
        info = {}
        
        # return step information
        return self.state, reward, self.episode_terminated, info
                
    # implements visuals of learning process
    # COULD USE PYGAME ??? - See mattchan maze_view_2d.py
    def render(self):
        # implement viz
        pass
    
    # reset sets env's params to starting values
    def reset(self):
        # reset state to starting cell
        self.state = self.starting_cell
        # reset episode_terminated to episode running
        self.episode_terminated = False
        # reset current step to no steps taken
        self.current_step = 0
        # return state to exploit; model.predict(env.reset()) in test model
        return self.state
        

In [6]:
# set dimensions of square maze & starting cell in maze
DIM_COL_ROW = 6
STARTING_CELL = (0,0)
# walls is subject to dim; hardcode changes on input
# even x indices are vert walls; up & down actions permittable if val = 0
# odd x indices are horz walls; right and left actions permittable if val = 0
# end-points of x indices are the edges of the grid
# val = 2 => noise in wall grid (wall N/A; neither T/F)
# specifically, val = 2 => vert & edges-grid top of grid
# for Walls, each row is a set of edges-grid + horz + vert borders
# for Walls, num_col = dim_col_row+1; border numbers
WALLS = np.array([
                [1,1,1,1,1,1,2],
                [1,0,0,0,0,0,1],
                [1,0,1,0,1,0,2],
                [1,1,1,1,1,1,1],
                [0,0,0,0,0,0,2],
                [1,0,1,1,1,1,1],
                [1,1,0,0,1,0,2],
                [1,0,0,0,0,0,1],
                [1,1,1,0,1,1,2],
                [1,0,0,0,1,1,1],
                [0,1,1,1,0,0,2],
                [1,0,0,0,0,0,1],
                [1,1,1,1,1,1,2]
                ])
# declare environment
env = MazeEnv(DIM_COL_ROW, STARTING_CELL, WALLS)
# ensure env functions with stable_baselines well
check_env(env, warn=True)



# Test Environment

In [12]:
episodes = 500
average_score = 0
average_steps = 0
# iterate through simulated episodes of the env with random actions
for episode in range(1, episodes+1):
    # make state at starting cell
    state = env.reset()
    # reset boolean for simulation
    done = False
    # score is metric for rewards
    score = 0 
    # logic for scores
    # simulate a singular episode with random action
    while not done:
        # render env
        env.render()
        # random actions hence .sample()
        action = env.action_space.sample()
        # step from state with action & capture step info
        n_state, reward, done, info = env.step(action)
        # increment reward metric
        score+=reward
    # display simulation results
    print('Episode:{} Score:{} Steps:{}'.format(episode, score, int((100-score)*(DIM_COL_ROW*DIM_COL_ROW))))
    # update average_score & average_steps
    average_score += score
    average_steps += int((100-score)*(DIM_COL_ROW*DIM_COL_ROW))
# display simulation average score & steps
print("--------------------------------------------------")
print("Average Score:{} Average Steps:{}".format(average_score/episodes, average_steps/episodes))
# env.close() - needed when render implemented

Episode:1 Score:96.97222222222223 Steps:108
Episode:2 Score:92.88888888888891 Steps:255
Episode:3 Score:97.61111111111111 Steps:85
Episode:4 Score:86.30555555555542 Steps:493
Episode:5 Score:92.16666666666669 Steps:281
Episode:6 Score:92.2777777777778 Steps:277
Episode:7 Score:93.33333333333336 Steps:239
Episode:8 Score:87.38888888888879 Steps:454
Episode:9 Score:86.7222222222221 Steps:478
Episode:10 Score:95.05555555555557 Steps:177
Episode:11 Score:83.80555555555534 Steps:583
Episode:12 Score:98.19444444444444 Steps:65
Episode:13 Score:93.86111111111113 Steps:220
Episode:14 Score:77.19444444444404 Steps:821
Episode:15 Score:99.08333333333333 Steps:33
Episode:16 Score:96.22222222222223 Steps:135
Episode:17 Score:95.75000000000001 Steps:152
Episode:18 Score:98.27777777777779 Steps:61
Episode:19 Score:80.52777777777747 Steps:701
Episode:20 Score:90.05555555555553 Steps:358
Episode:21 Score:97.72222222222223 Steps:81
Episode:22 Score:95.33333333333334 Steps:167
Episode:23 Score:89.833333