In [1]:
from vizdoom import DoomGame  
import random
import time
import numpy as np
import gym
from gym import Env
from gym.spaces import Discrete, Box
import cv2
from matplotlib import pyplot as plt
import os
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3 import PPO

In [2]:
###################
##   CONSTANTS   ##
###################
ACTION_NUM = 3
EPISODES_NUM = 10
AGENT_MODEL_PATH_PREFIX = './agents/agent_for_'
TENSORBOARD_LOG_PATH_PREFIX = './logs/logs_for_'
# Index into `configurations` below (valid values: 0, 1, 2).
# 2 selects 'deadly_corridor'; the former value 3 pointed past the end of the
# list and made every `configurations[CURRENT_CONFIGURATION_INDEX]` lookup fail.
CURRENT_CONFIGURATION_INDEX = 2

# One entry per scenario:
#   name                   - suffix used for the agent and log directories
#   scenarioConfigFilePath - scenario .cfg file handed to the game
#   actionNumber           - size of the discrete action space
configurations = [
  {
    'name': 'basic',
    'scenarioConfigFilePath': 'VizDoom/scenarios/basic.cfg',
    'actionNumber': 3,
  },
  {
    'name': 'defend_the_center',
    'scenarioConfigFilePath': 'VizDoom/scenarios/defend_the_center.cfg',
    'actionNumber': 3,
  },
  {
    'name': 'deadly_corridor',
    'scenarioConfigFilePath': 'VizDoom/scenarios/deadly_corridor.cfg',
    'actionNumber': 20,
  },
]

# Fail fast, with a readable message, if the index is edited out of range.
assert 0 <= CURRENT_CONFIGURATION_INDEX < len(configurations), \
  f"CURRENT_CONFIGURATION_INDEX must be in 0..{len(configurations) - 1}"


In [3]:
class VizDoomGym(Env):
  """Gym environment wrapping a ViZDoom scenario with a shaped reward.

  Observations are single-channel 100x160 uint8 frames. Actions are discrete
  indices that are converted to one-hot vectors before being sent to the game.
  On top of the reward returned by the game, step() adds terms derived from
  the change in the damage-taken, hit-count and ammo game variables since the
  previous step.

  Args:
    envConfig: dict providing 'scenarioConfigFilePath' (passed to
      DoomGame.load_config) and 'actionNumber' (size of the action space).
    render: whether the game window is made visible.
  """

  def __init__(self, envConfig, render=False):
    super().__init__()
    self.game = DoomGame()
    self.game.load_config(envConfig["scenarioConfigFilePath"])
    self.game.set_window_visible(render)
    self.game.init()

    self.action_number = envConfig["actionNumber"]
    self.action_space = Discrete(self.action_number)
    self.observation_space = Box(0, 255, [100, 160, 1], np.uint8)

    # One-hot action table, built once instead of on every step.
    self._actions = np.identity(self.action_number, dtype=np.uint8)
    # Make the counters exist even if step() is called before reset().
    self.rewardsObject = self._fresh_counters()

  def _fresh_counters(self):
    """Return the per-episode baseline the reward deltas are measured against."""
    return {
      'damage_taken': 0,
      'hitcount': 0,
      # NOTE(review): 52 is presumably the starting ammo of the scenario - confirm.
      'ammo': 52
    }

  def close(self):
    """Close the underlying game instance."""
    self.game.close()

  def step(self, action):
    """Apply `action` and return (observation, reward, done, info).

    Args:
      action: index into the discrete action space.

    Returns:
      observation: (100, 160, 1) uint8 frame; all zeros when no state is available.
      reward: game reward plus the shaped terms (game reward only when no
        state is available).
      done: whether the episode has finished.
      info: dict with 'damage_taken', 'hitcount' and 'ammo' deltas for this step.
    """
    actionReward = self.game.make_action(self._actions[action], 5)

    done = self.game.is_episode_finished()
    state = self.game.get_state()

    if not state:
      # np.zeros defaults to float64; use the dtype the observation space declares.
      blank = np.zeros(self.observation_space.shape, dtype=self.observation_space.dtype)
      return blank, actionReward, done, {"damage_taken": 0, "hitcount": 0, "ammo": 0}

    # The first game variable (health) is not used by the reward.
    _, damage_taken, hitcount, ammo = state.game_variables

    # Change since the previous step; the damage delta is negated so that
    # taking damage lowers the reward.
    deltasObject = {
      'damage_taken': -damage_taken + self.rewardsObject["damage_taken"],
      'hitcount': hitcount - self.rewardsObject["hitcount"],
      'ammo': ammo - self.rewardsObject["ammo"]
    }

    self.rewardsObject["damage_taken"] = damage_taken
    self.rewardsObject["hitcount"] = hitcount
    self.rewardsObject["ammo"] = ammo

    reward = (
      actionReward
      + deltasObject["damage_taken"] * 10
      + deltasObject["hitcount"] * 200
      + deltasObject["ammo"] * 5
    )

    img = self.grayscale(state.screen_buffer)
    return img, reward, done, deltasObject

  def reset(self):
    """Start a new episode, reset the reward counters and return the first frame."""
    self.game.new_episode()
    state = self.game.get_state()
    self.rewardsObject = self._fresh_counters()
    return self.grayscale(state.screen_buffer)

  def render(self, mode="human"):
    """No-op; window visibility is chosen through the `render` constructor argument."""
    pass

  def grayscale(self, observation):
    """Convert a channel-first screen buffer to a (100, 160, 1) grayscale frame."""
    grayscaled = cv2.cvtColor(np.moveaxis(observation, 0, -1), cv2.COLOR_BGR2GRAY)
    # The third positional parameter of cv2.resize is `dst`, so the
    # interpolation flag has to be passed by keyword to take effect.
    resized = cv2.resize(grayscaled, (160, 100), interpolation=cv2.INTER_CUBIC)
    return np.reshape(resized, (100, 160, 1))
  

In [4]:
class AgentCallback(BaseCallback):
  """Callback that saves the model every `check_freq` calls.

  Args:
    check_freq: save whenever the callback's call counter is a multiple of this.
    save_path: directory the checkpoints are written to; None disables saving.
    verbose: verbosity level forwarded to BaseCallback.
  """

  def __init__(self, check_freq, save_path, verbose=1):
    super().__init__(verbose)
    self.check_freq = check_freq
    self.save_path = save_path

  def _init_callback(self):
    """Create the checkpoint directory.

    Must use a single leading underscore: the former `__init_callback` was
    name-mangled to `_AgentCallback__init_callback`, so it never overrode the
    BaseCallback hook and was never invoked.
    """
    if self.save_path is not None:
      os.makedirs(self.save_path, exist_ok=True)

  def _on_step(self):
    """Save a checkpoint named `model_<n_calls>` when due; always continue training."""
    # Skip saving when no path was given, mirroring the check in _init_callback.
    if self.save_path is not None and self.n_calls % self.check_freq == 0:
      model_path = os.path.join(self.save_path, f"model_{self.n_calls}")
      self.model.save(model_path)
    return True
  
  
# Checkpoint callback for the selected scenario: saves into that scenario's
# agent directory with a check frequency of 10000.
agentCallback = AgentCallback(
  save_path=f"{AGENT_MODEL_PATH_PREFIX}{configurations[CURRENT_CONFIGURATION_INDEX]['name']}",
  check_freq=10_000,
)

In [None]:

# Build the environment for the selected scenario without showing the game window.
env = VizDoomGym(
  render=False,
  envConfig=configurations[CURRENT_CONFIGURATION_INDEX],
)

# Fresh PPO agent with a CNN policy; TensorBoard logs go to the per-scenario folder.
model = PPO(
  'CnnPolicy',
  env,
  verbose=1,
  n_steps=8192,
  clip_range=0.1,
  learning_rate=1e-6,
  tensorboard_log=f"{TENSORBOARD_LOG_PATH_PREFIX}{configurations[CURRENT_CONFIGURATION_INDEX]['name']}",
)
print("Model created")

# Train, letting the callback write periodic checkpoints.
model.learn(callback=agentCallback, total_timesteps=500_000)
print("Model trained")

In [5]:
# Environment for the selected scenario, game window hidden.
env = VizDoomGym(
  envConfig=configurations[CURRENT_CONFIGURATION_INDEX],
  render=False,
)

In [7]:
from stable_baselines3.common.evaluation import evaluate_policy

# Load the checkpoint named model_120000 from the selected scenario's agent folder.
model = PPO.load(
  f"{AGENT_MODEL_PATH_PREFIX}{configurations[CURRENT_CONFIGURATION_INDEX]['name']}/model_120000"
)

In [9]:
# Attach the environment to the loaded model, then continue training with
# the same checkpoint callback.
model.set_env(env)
model.learn(callback=agentCallback, total_timesteps=500_000)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.
Logging to ./logs/logs_for_deadly_corridor/PPO_8
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 22.4     |
|    ep_rew_mean     | -767     |
| time/              |          |
|    fps             | 236      |
|    iterations      | 1        |
|    time_elapsed    | 34       |
|    total_timesteps | 8192     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 22.7        |
|    ep_rew_mean          | -762        |
| time/                   |             |
|    fps                  | 56          |
|    iterations           | 2           |
|    time_elapsed         | 287         |
|    total_timesteps      | 16384       |
| train/                  |             |
|    approx_kl            | 0.001040458 |
|    clip_fraction        | 0.0

<stable_baselines3.ppo.ppo.PPO at 0x7fbdae413310>

In [None]:
# Evaluate over the number of episodes configured in the constants cell
# (EPISODES_NUM) instead of repeating the literal here.
evaluate_policy(model, env, n_eval_episodes=EPISODES_NUM)

In [None]:
from IPython.display import clear_output

# Replay EPISODES_NUM episodes with the current model, drawing each frame inline
# and reporting the reward collected in every episode.
for episode in range(EPISODES_NUM):
    obs = env.reset()
    done = False
    total_reward = 0
    while not done:
        action, _ = model.predict(obs)
        obs, reward, done, info = env.step(action)
        # Accumulate the episode return (it was initialised but never updated).
        total_reward += reward
        # wait=True postpones clearing until the next frame is ready, which
        # avoids flicker between frames.
        clear_output(wait=True)
        # Observations are (100, 160, 1); drop the channel axis and show as grayscale.
        plt.imshow(obs[:, :, 0], cmap="gray")
        plt.show()
        time.sleep(0.1)
    print(f"Episode {episode}: total reward = {total_reward}")
    time.sleep(1)

In [10]:
# Close the environment; VizDoomGym.close() closes the underlying game instance.
env.close()