# Install and Import Dependencies

In [7]:
# install modules stable-baselines3 includes gym
!pip install stable-baselines3[extra] 

Collecting stable-baselines3[extra]
  Downloading stable_baselines3-1.5.0-py3-none-any.whl (177 kB)
Collecting torch>=1.8.1
  Downloading torch-1.11.0-cp39-cp39-win_amd64.whl (157.9 MB)
Collecting gym==0.21
  Downloading gym-0.21.0.tar.gz (1.5 MB)
Collecting opencv-python
  Downloading opencv_python-4.5.5.64-cp36-abi3-win_amd64.whl (35.4 MB)
Collecting ale-py~=0.7.4
  Downloading ale_py-0.7.4-cp39-cp39-win_amd64.whl (904 kB)
Collecting tensorboard>=2.2.0
  Downloading tensorboard-2.8.0-py3-none-any.whl (5.8 MB)
Collecting autorom[accept-rom-license]~=0.4.2
  Downloading AutoROM-0.4.2-py3-none-any.whl (16 kB)
Collecting importlib-resources
  Downloading importlib_resources-5.7.0-py3-none-any.whl (28 kB)
Collecting importlib-metadata>=4.10.0
  Downloading importlib_metadata-4.11.3-py3-none-any.whl (18 kB)
Collecting AutoROM.accept-rom-license
  Downloading AutoROM.accept-rom-license-0.4.2.tar.gz (9.8 kB)
  Installing build dependencies: started
  Installing build dependencies: finished w

In [9]:
# !pip install tensorflow gym keras keras-rl2

In [42]:
# DQN = Deep-Q-Network - maximize bellman equation, MLP Policy for model
import gym 
from gym import Env
from gym.spaces import Discrete, Box, Tuple
import numpy as np
import os
from stable_baselines3 import DQN
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_checker import check_env

# Building the Environment

In [46]:
# class for maze env where agent will learn
class MazeEnv(Env):
    
    # maze member_variables & action / observation space
    def __init__(self, DIM_COL_ROW, STARTING_CELL):
        # declare num_col & num_row; boards are square
        self.dim_row_col = DIM_COL_ROW
        # declare starting cell
        self.starting_cell = STARTING_CELL
        # actions we can take, up, right, down, left ; NESW
        self.action_space = Discrete(4) # 0,1,2,3
        # observation 
        # option1 - discrete(36) w/ % to make 6x6 grid
        # option2 - Tuple((Discrete(DIM_COL_ROW), Discrete(DIM_COL_ROW))
        # taking option 2 bc tuple exists; vscode says stable not supported from check_env, but jupyter no error, curious
        self.observation_space = Tuple((Discrete(DIM_COL_ROW), Discrete(DIM_COL_ROW)))
        # set starting cell - note self.state updates per step
        self.state = STARTING_CELL
        # set max_steps to prevent infinite searching in maze
        self.max_steps = 1000 # 1000 baseline, will change
        # set current step for iterating action steps
        self.current_step = 0
        # set value for cell in maze that ends episode
        self.end_cell = (DIM_COL_ROW,DIM_COL_ROW)
        # set episode termination variable to false
        self.episode_terminated = False
        
    # note - impossible move below pass increments; optimize by logic on step
    # how about step incrementer in else (non-pass steps)
        
    # moves agent around env; how actions change states
    def step(self, action):
        # action discrete values defined below
        # 0 is down 
        # 1 is left
        # 2 is up
        # 3 is right
        
        # take action & change state cell; passes prevent action in else
        if (action == 0 or action == 2):
            # pass if at bottom and go down
            if (self.state[0] == 0 and action == 0):
                pass
            # pass if at top and go up
            elif (self.state[0] == self.dim_row_col and action == 2):
                pass
            # take action up or down when not on border
            else:
                # 2 - 1 => go up 1; 0 - 1 => go down 1 
                self.state = (self.state[0] + (action - 1), self.state[1])
        # else action = 1 or 3
        else:
            # pass if at left and go left
            if (self.state[1] == 0 and action == 1):
                pass
            # pass if at right and go right
            elif (self.state[1] == self.dim_row_col and action == 3):
                pass
            # take action up or down when not on border
            else:
                # 3 - 2 => go right 1; 1 - 2 => go left 1 
                self.state = (self.state[0], self.state[1] + (action -2))
                
        # increment max_steps as action was taken - note pass above
        self.current_step += 1
        
        # calculate reward & check if at end-condition
        if (self.state == self.end_cell):
            self.episode_terminated = True
            reward = 100
        else:
            # incentive to keep moving; reach end quickly
            reward = -1/(self.dim_row_col*self.dim_row_col)
            
        # end-condition w/ out reward is too many steps taken
        if (self.current_step >= self.max_steps):
            self.episode_terminated = True
            
        # info {} must be returned for step() for Env class; dk why
        info = {}
        
        # return step information
        return self.state, reward, self.episode_terminated, info
                
    # implements visuals of learning process
    # COULD USE PYGAME ??? - See mattchan maze_view_2d.py
    def render(self):
        # implement viz
        pass
    
    # reset sets env's params to starting values
    def reset(self):
        # reset state to starting cell
        self.state = self.starting_cell
        # reset episode_terminated to episode running
        self.episode_terminated = False
        # reset current step to no steps taken
        self.current_step = 0
        # return state to exploit; model.predict(env.reset()) in test model
        return self.state
        

In [47]:
# declare environment
DIM_COL_ROW = 6
STARTING_CELL = (0,0)
env = MazeEnv(DIM_COL_ROW, STARTING_CELL)
# ensure env functions with stable_baselines well
check_env(env, warn=True)

# Test Environment

In [54]:
episodes = 5
# iterate through simulated episodes of the env with random actions
for episode in range(1, episodes+1):
    # make state at starting cell
    state = env.reset()
    # reset boolean for simulation
    done = False
    # score is metric for rewards
    score = 0 
    
    # simulate a singular episode with random action
    while not done:
        # render env
        env.render()
        # random actions hence .sample()
        action = env.action_space.sample()
        # step from state with action & capture step info
        n_state, reward, done, info = env.step(action)
        # increment reward metric
        score+=reward
    # display simulation results
    print('Episode:{} Score:{} Steps:{}'.format(episode, score, int((100-score)*(DIM_COL_ROW*DIM_COL_ROW))))
# env.close() - needed when render implemented

Episode:1 Score:98.19444444444444 Steps:65
Episode:2 Score:98.38888888888889 Steps:58
Episode:3 Score:92.91666666666669 Steps:254
Episode:4 Score:86.44444444444431 Steps:488
Episode:5 Score:97.69444444444444 Steps:83
