# Install and Import Dependencies

In [103]:
# install modules stable-baselines3 includes gym
# !pip install stable-baselines3[extra] 

In [104]:
# !pip install tensorflow gym keras keras-rl2

In [105]:
# DQN = Deep-Q-Network - maximize bellman equation, MLP Policy for model
import gym 
from gym import Env
from gym.spaces import Discrete, Box, Tuple
import numpy as np
import os
from stable_baselines3 import DQN
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

# Building the Environment

In [106]:
# class for maze env where agent will learn
class MazeEnv(Env):
    """Square grid maze in which an agent walks from a start cell to a goal cell.

    Observation: the agent's cell as a length-2 ``int64`` array
    ``(state[0], state[1])``, each coordinate in ``[0, DIM_COL_ROW - 1]``.

    Actions (``Discrete(4)``), exactly as implemented by :meth:`step`:
        0 -- "down":  ``state[1] - 1``
        1 -- "left":  ``state[0] - 1``
        2 -- "up":    ``state[1] + 1``
        3 -- "right": ``state[0] + 1``

    A move is carried out only if the matching entry of ``Walls`` equals 0;
    otherwise the agent stays where it is.

    Reward: 100 when the agent is on the goal cell after the step, otherwise
    ``-1 / DIM_COL_ROW**2`` (a small per-step penalty to encourage short paths).
    """

    def __init__(self, DIM_COL_ROW, STARTING_CELL, WALLS, END_CELL=(5, 2), MAX_STEPS=100000):
        """Store the maze configuration and declare the gym spaces.

        Args:
            DIM_COL_ROW: side length of the square board.
            STARTING_CELL: ``(state[0], state[1])`` cell every episode starts in.
            WALLS: 2-D array of wall flags, indexed by :meth:`step` as
                ``WALLS[2*s0 + 1, s1 + action//2]`` for actions 0/2 and
                ``WALLS[2*(s0 + action//2), s1]`` for actions 1/3
                (0 means the move is allowed).
            END_CELL: goal cell that terminates the episode with the +100
                reward. Defaults to ``(5, 2)``, the value that used to be
                hard-coded.
            MAX_STEPS: maximum number of ``step`` calls per episode before the
                episode is ended without the goal reward.
        """
        # number of rows == number of columns; boards are square
        self.dim_row_col = DIM_COL_ROW
        self.starting_cell = STARTING_CELL
        # four movement actions: 0, 1, 2, 3 (see class docstring)
        self.action_space = Discrete(4)
        # the observation is the (state[0], state[1]) cell, each in [0, DIM_COL_ROW - 1]
        self.observation_space = Box(
            low=np.zeros(2, dtype=np.int64),
            high=np.full(2, DIM_COL_ROW - 1, dtype=np.int64),
            dtype=np.int64,
        )
        # current cell; stays None until reset() (or the first step()) sets it
        self.state = None
        # upper bound on step() calls per episode, prevents endless wandering
        self.max_steps = MAX_STEPS
        # number of successful (non-blocked) moves in the current episode
        self.current_step = 0
        # number of step() calls in the current episode, blocked moves included
        self.attempted_steps = 0
        # cell that ends the episode with the goal reward
        self.end_cell = np.array(END_CELL, dtype=np.int64)
        self.episode_terminated = False
        # asarray keeps an ndarray as-is and lets nested lists be indexed with [i, j]
        self.Walls = np.asarray(WALLS)

    def step(self, action):
        """Apply one action and return ``(observation, reward, done, info)``.

        Args:
            action: integer in ``{0, 1, 2, 3}`` (numpy integer scalars and 0-d
                arrays are accepted and converted with ``int``).

        Returns:
            A 4-tuple of the new cell as an ``int64`` array, the reward, the
            episode-finished flag and an empty info dict.

        Raises:
            ValueError: if ``action`` is not one of 0, 1, 2, 3.
        """
        # step() before reset(): start from the configured starting cell.
        # (The former `except ValueError` guard never fired, because indexing
        # None raises TypeError.)
        if self.state is None:
            self.state = np.array(self.starting_cell, dtype=np.int64)

        action = int(action)
        if action not in (0, 1, 2, 3):
            raise ValueError("invalid action {!r}; expected 0, 1, 2 or 3".format(action))

        x, y = int(self.state[0]), int(self.state[1])

        # Every call counts towards the episode limit. Counting only successful
        # moves lets a policy that keeps pushing into a wall run forever.
        self.attempted_steps += 1

        if action in (0, 2):
            # down/up: walls for column x live in the odd row 2*x + 1;
            # index y is the wall below cell y, index y + 1 the wall above it
            blocked = self.Walls[2 * x + 1, y + action // 2] != 0
            if not blocked:
                y += action - 1
        else:
            # left/right: the wall between x - 1 and x lives in the even row 2*x
            blocked = self.Walls[2 * (x + action // 2), y] != 0
            if not blocked:
                x += action - 2

        if not blocked:
            # always keep the state as an int64 array (same type as after reset())
            self.state = np.array((x, y), dtype=np.int64)
            self.current_step += 1

        # reward & goal check
        if np.array_equal(self.state, self.end_cell):
            self.episode_terminated = True
            reward = 100
        else:
            # small penalty per step: incentive to reach the goal quickly
            reward = -1 / (self.dim_row_col * self.dim_row_col)

        # end the episode without the goal reward once the step budget is used up
        if self.attempted_steps >= self.max_steps:
            self.episode_terminated = True

        # gym's step() contract requires an info dict, even if it is empty
        info = {}

        return np.array(self.state, dtype=np.int64), reward, self.episode_terminated, info

    def render(self, mode="human"):
        """Visualisation hook; not implemented yet, so this is a no-op."""
        pass

    def reset(self):
        """Start a new episode and return the starting cell as an ``int64`` array."""
        self.state = np.array(self.starting_cell, dtype=np.int64)
        self.episode_terminated = False
        self.current_step = 0
        self.attempted_steps = 0
        # return a copy so callers cannot modify the internal state through it
        return self.state.copy()
        

In [107]:
# Side length of the square maze and the cell every episode starts in.
DIM_COL_ROW = 6
STARTING_CELL = (0,1)

# WALLS encodes the borders between cells; its shape depends on DIM_COL_ROW:
# (2*DIM_COL_ROW + 1) rows by (DIM_COL_ROW + 1) columns.
# A value of 0 means "no wall, the move is allowed"; any other value blocks it.
# How MazeEnv.step reads it, for a cell (s0, s1):
#   - odd rows  (2*s0 + 1): gate actions 0/2, which change s1.
#       entry [2*s0 + 1, s1]     -> border crossed by action 0 (s1 - 1)
#       entry [2*s0 + 1, s1 + 1] -> border crossed by action 2 (s1 + 1)
#     the first and last entry of every odd row are the outer edges of the grid
#   - even rows (2*s0): gate actions 1/3, which change s0.
#       entry [2*s0, s1]       -> border crossed by action 1 (s0 - 1)
#       entry [2*(s0 + 1), s1] -> border crossed by action 3 (s0 + 1)
#     the first and last even rows are the outer edges of the grid
#   - value 2 is padding: even rows only need DIM_COL_ROW entries, so their
#     last column is never read by step and just keeps the array rectangular.
#
# Alternative layout with no interior walls (kept for reference):
# WALLS = np.array([
#                 [1,1,1,1,1,1,2],
#                 [1,0,0,0,0,0,1],
#                 [0,0,0,0,0,0,2],
#                 [1,0,0,0,0,0,1],
#                 [0,0,0,0,0,0,2],
#                 [1,0,0,0,0,0,1],
#                 [0,0,0,0,0,0,2],
#                 [1,0,0,0,0,0,1],
#                 [0,0,0,0,0,0,2],
#                 [1,0,0,0,0,0,1],
#                 [0,0,0,0,0,0,2],
#                 [1,0,0,0,0,0,1],
#                 [1,1,1,1,1,1,2]
#                 ])
WALLS = np.array([
                [1,1,1,1,1,1,2],
                [1,0,0,0,0,0,1],
                [1,0,1,0,1,0,2],
                [1,1,1,1,1,1,1],
                [0,0,0,0,0,0,2],
                [1,0,1,1,1,1,1],
                [1,1,0,0,1,0,2],
                [1,0,0,0,0,0,1],
                [1,1,1,0,1,1,2],
                [1,0,0,0,1,1,1],
                [0,1,1,1,0,0,2],
                [1,0,0,0,0,0,1],
                [1,1,1,1,1,1,2]
                ])

# WALLS is typed in by hand, so fail early if it no longer matches DIM_COL_ROW.
assert WALLS.shape == (2 * DIM_COL_ROW + 1, DIM_COL_ROW + 1), (
    "WALLS has shape {}, expected {}".format(
        WALLS.shape, (2 * DIM_COL_ROW + 1, DIM_COL_ROW + 1)
    )
)


# Instantiate the maze from the constants defined in this cell, then let
# stable-baselines3 verify that it follows the gym interface it expects.
env = MazeEnv(
    DIM_COL_ROW=DIM_COL_ROW,
    STARTING_CELL=STARTING_CELL,
    WALLS=WALLS,
)
check_env(env, warn=True)

# Test Environment

In [108]:
# Disabled sanity check: random-action rollouts over 50 episodes.
# The whole cell is a bare triple-quoted string, so none of it runs; the
# notebook only echoes the string's repr as the cell output.
# NOTE(review): "Steps" below is derived from the score rather than counted.
# Given the rewards in MazeEnv.step (100 on the goal step, -1/DIM^2 otherwise)
# it looks off by one on success and meaningless when an episode ends through
# max_steps -- confirm before re-enabling.
"""
episodes = 50
average_score = 0
average_steps = 0
# iterate through simulated episodes of the env with random actions
for episode in range(1, episodes+1):
    # make state at starting cell
    state = env.reset()
    # reset boolean for simulation
    done = False
    # score is metric for rewards
    score = 0 
    # logic for scores
    # simulate a singular episode with random action
    while not done:
        # render env
        env.render()
        # random actions hence .sample()
        action = env.action_space.sample()
        # step from state with action & capture step info
        n_state, reward, done, info = env.step(action)
        # increment reward metric
        score+=reward
    # display simulation results
    print('Episode:{} Score:{} Steps:{}'.format(episode, score, int((100-score)*(DIM_COL_ROW*DIM_COL_ROW))))
    # update average_score & average_steps
    average_score += score
    average_steps += int((100-score)*(DIM_COL_ROW*DIM_COL_ROW))
# display simulation average score & steps
print("--------------------------------------------------")
print("Average Score:{} Average Steps:{}".format(average_score/episodes, average_steps/episodes))
# env.close() - needed when render implemented
"""

'\nepisodes = 50\naverage_score = 0\naverage_steps = 0\n# iterate through simulated episodes of the env with random actions\nfor episode in range(1, episodes+1):\n    # make state at starting cell\n    state = env.reset()\n    # reset boolean for simulation\n    done = False\n    # score is metric for rewards\n    score = 0 \n    # logic for scores\n    # simulate a singular episode with random action\n    while not done:\n        # render env\n        env.render()\n        # random actions hence .sample()\n        action = env.action_space.sample()\n        # step from state with action & capture step info\n        n_state, reward, done, info = env.step(action)\n        # increment reward metric\n        score+=reward\n    # display simulation results\n    print(\'Episode:{} Score:{} Steps:{}\'.format(episode, score, int((100-score)*(DIM_COL_ROW*DIM_COL_ROW))))\n    # update average_score & average_steps\n    average_score += score\n    average_steps += int((100-score)*(DIM_COL_ROW*DI

## Train an RL Model ##

In [109]:
# Output locations under ./Training: saved model files and TensorBoard logs.
save_path, log_path = (
    os.path.join('Training', leaf) for leaf in ('Saved Models', 'Logs')
)

In [110]:
# Log directory of one specific run; in the visible cells it is only
# referenced by the (commented-out) tensorboard shell command.
training_log_path = os.path.join(
    log_path,
    'DQN_7',
)

In [111]:
# Running the line below inside the notebook only reports "requirement already
# satisfied" for the Anaconda location; installing from the command line (CMD)
# instead performed an actual, successful install.
#!pip install tensorboard

In [112]:
# Likewise, launch TensorBoard from the command line rather than from the notebook:
#!tensorboard --logdir={training_log_path}

In [113]:
# Evaluate on a dedicated environment instance. Re-using the training `env`
# would let every evaluation reset and advance the very environment the agent
# is collecting training experience from.
eval_env = MazeEnv(DIM_COL_ROW, STARTING_CELL, WALLS)

# Stops training once the evaluation's mean episode reward reaches the
# threshold. It is a lower bound on the *mean* reward, checked each time
# EvalCallback finds a new best model; it is not a per-step or maximum reward.
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=50, verbose=1)

# Every 10000 training steps: run evaluation episodes, save the best model so
# far to save_path, and give stop_callback the chance to end training.
eval_callback = EvalCallback(
    eval_env,
    callback_on_new_best=stop_callback,
    eval_freq=10000,
    best_model_save_path=save_path,
    verbose=1,
)

In [114]:
# DQN agent with an MLP policy. The raw env is passed in directly:
# stable-baselines3 wraps it in Monitor and DummyVecEnv itself (see the cell
# output), so the manual DummyVecEnv([lambda: env]) wrapping is not needed.
model = DQN(
    policy='MlpPolicy',
    env=env,
    verbose=1,
    tensorboard_log=log_path,
)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [115]:
# Train without callbacks. Passing the evaluation callback, i.e.
#   model.learn(total_timesteps=20000, callback=eval_callback)
# raised "IndexError: invalid index to scalar variable." with a
# non-DummyVecEnv env. A longer run is kept here for reference:
#   model.learn(total_timesteps=750000)
model.learn(total_timesteps=250_000)

Logging to Training\Logs\DQN_14
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 739      |
|    ep_rew_mean      | 79.5     |
|    exploration_rate | 0.888    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 7738     |
|    time_elapsed     | 0        |
|    total_timesteps  | 2956     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 958      |
|    ep_rew_mean      | 73.4     |
|    exploration_rate | 0.709    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 6432     |
|    time_elapsed     | 1        |
|    total_timesteps  | 7667     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.08e+03 |
|    ep_rew_mean      | 70.2     |
|    exploration_rate | 0.51     |
| time/               |

<stable_baselines3.dqn.dqn.DQN at 0x186bf5250a0>

In [116]:
# File (without extension) the trained DQN model is saved to and loaded from.
dqn_path = os.path.join(
    os.path.join('Training', 'Saved Models'),
    'DQN_model',
)

In [117]:
# Persist the trained agent to disk.
model.save(path=dqn_path)

In [118]:
# Reload the saved agent and attach it to the maze environment again.
model = DQN.load(
    path=dqn_path,
    env=env,
)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [119]:
# Roll out one episode with the trained agent, printing every visited cell
# (the observation before each step, then the final cell once the episode ends).
obs = env.reset()
done = False
while not done:
    action, _states = model.predict(obs)
    print("obs: ", obs)
    obs, rewards, done, info = env.step(action)
print('obs: ', obs)

obs:  [0 1]
obs:  [0 2]
obs:  [0 3]
obs:  [1 3]
obs:  [2 3]
obs:  [3 3]
obs:  [4 3]
obs:  [4 2]
obs:  [4 1]
obs:  [4 0]
obs:  [5 0]
obs:  [5 1]
obs:  [5 2]


In [120]:
# Score the agent over a single evaluation episode; returns (mean reward, std).
# reward_threshold makes evaluate_policy fail if the mean reward is below 70.
evaluate_policy(
    model,
    env,
    n_eval_episodes=1,
    render=False,
    reward_threshold=70,
)

(99.69444444216788, 0.0)