# Create Env
This script is heavily inspired by the [policy gradient implementation](https://github.com/simoninithomas/Deep_reinforcement_learning_Course/blob/master/Policy%20Gradients/Doom/Doom%20REINFORCE%20Monte%20Carlo%20Policy%20gradients.ipynb) in Thomas Simonini's RL course.

In [None]:
import tensorflow as tf
import numpy as np
from vizdoom import *
import random
import time
from skimage import transform
from datetime import datetime

from collections import deque
import matplotlib.pyplot as plt

import warnings

warnings.filterwarnings('ignore')

In [None]:
def create_environment(visible=False, config_path="../scenarios/deadly_corridor.cfg"):
    """Create and initialise a ViZDoom game together with its action set.

    Args:
        visible: Passed to ``game.set_window_visible``; whether the game
            window is shown.
        config_path: Scenario configuration file passed to
            ``game.load_config``. Defaults to the deadly-corridor scenario.

    Returns:
        A ``(game, possible_actions)`` tuple. ``possible_actions`` is a list
        of one-hot lists (rows of an identity matrix), one per button the
        loaded scenario makes available.
    """
    game = DoomGame()
    game.load_config(config_path)
    game.set_window_visible(visible)
    game.init()

    # Size the one-hot action set from the loaded scenario rather than a
    # hard-coded 7, so it always agrees with get_available_buttons_size().
    num_buttons = game.get_available_buttons_size()
    possible_actions = np.identity(num_buttons, dtype=int).tolist()

    return game, possible_actions

In [None]:
# Module-level game instance (window hidden) and its one-hot action list,
# used by the setup cells below.
game, possible_actions = create_environment(visible=False)

In [None]:
# --- Input specification ---
STACK_SIZE = 4          # number of frames stacked into one state
FRAME_SIZE = [84, 84]   # size each frame is resized to

# --- Environment-derived sizes ---
# State shape is the frame size followed by the stack depth.
STATE_SIZE = FRAME_SIZE + [STACK_SIZE]
ACTION_SIZE = game.get_available_buttons_size()

# --- Training hyperparameters ---
LEARNING_RATE = 0.002
NUM_EPOCHS = 500000

BATCH_SIZE = 1000
GAMMA = 0.95  # discounting rate

# Test env

In [None]:
def test_environment(episodes):
    """Smoke-test the environment by playing random actions in a visible window.

    A separate game instance is created for the test and is always closed
    afterwards, even if an episode raises.

    Args:
        episodes: Number of episodes to play.
    """
    game, possible_actions = create_environment(visible=True)
    try:
        for _ in range(episodes):
            game.new_episode()
            while not game.is_episode_finished():
                game.make_action(random.choice(possible_actions))
                # Short pause between actions so the window can be followed.
                time.sleep(0.007)
            print("Result:", game.get_total_reward())
            time.sleep(1)
    finally:
        # Release the game even when something above fails.
        game.close()

In [None]:
#test_environment(2)

# Define preprocessing functions

In [None]:
def preprocess_frame(frame):
    """Crop, rescale and resize a single screen frame.

    The first 80 rows (axis 0) are dropped, values are divided by 255.0, and
    the result is resized to ``FRAME_SIZE`` with ``skimage.transform.resize``.

    Args:
        frame: Screen buffer array. Presumably a 2-D grayscale image with
            values in 0..255 -- TODO confirm against the scenario config.

    Returns:
        The resized frame as returned by ``transform.resize``.
    """
    scaled = frame[80:, :] / 255.0
    return transform.resize(scaled, FRAME_SIZE)

In [None]:
def init_deque():
    """Return a fresh frame stack filled with blank frames.

    Returns:
        A ``deque`` with ``maxlen=STACK_SIZE`` holding ``STACK_SIZE`` all-zero
        arrays of shape ``FRAME_SIZE``.
    """
    # ``np.int`` was only an alias for the builtin ``int`` and has been removed
    # from NumPy (1.24+); the builtin yields the same dtype on every version.
    blank_frames = (np.zeros(FRAME_SIZE, dtype=int) for _ in range(STACK_SIZE))
    return deque(blank_frames, maxlen=STACK_SIZE)

In [None]:
# Module-level frame stack, initialised with blank (all-zero) frames.
stacked_frames = init_deque()

In [None]:
def stack_frames(state, is_new_episode, stacked_frames=None):
    """Preprocess a raw frame and push it onto the rolling frame stack.

    Args:
        state: Raw screen buffer for the current step.
        is_new_episode: When true, the stack is reset to blank frames before
            the current frame is pushed.
        stacked_frames: The deque returned by the previous call. May be
            omitted (``None``), in which case a fresh stack is created.

    Returns:
        A ``(stacked_state, stacked_frames)`` tuple: the frames stacked along
        axis 2, and the updated deque to pass to the next call.
    """
    frame = preprocess_frame(state)

    # Start from a blank stack at episode start, or when the caller has no
    # stack yet (previously this raised AttributeError on ``None``).
    if is_new_episode or stacked_frames is None:
        stacked_frames = init_deque()

    # Always push the current frame. Previously the first frame of every
    # episode was preprocessed and then discarded, so the first state handed
    # to the policy was entirely zeros.
    stacked_frames.append(frame)

    stacked_state = np.stack(stacked_frames, axis=2)

    return stacked_state, stacked_frames

In [None]:
# Take the current screen buffer and build a stacked state from it, starting
# a fresh frame stack (is_new_episode=True).
state = game.get_state().screen_buffer
state, stacked_frames = stack_frames(state, is_new_episode=True)

In [None]:
# Notebook sanity check: display the shape of the stacked state.
state.shape

# Create Policy Gradient Network model

In [None]:
# A class is used so that the placeholders and output tensors built in
# __init__ stay reachable as attributes (e.g. ``action_distribution``).
class PGNetwork:
    """Convolutional policy-gradient network (TensorFlow 1.x graph API).

    Constructing an instance adds the whole model to the current default
    graph: three conv + batch-norm + ELU stages, a 512-unit dense layer, a
    logits layer with ``ACTION_SIZE`` units, a softmax policy head, the
    reward-weighted cross-entropy loss and an RMSProp training op.

    Attributes:
        inputs_: float32 placeholder of shape ``[None] + STATE_SIZE``.
        actions: int32 placeholder of shape ``[None, ACTION_SIZE]`` used as
            the labels of the cross-entropy term.
        discounted_episode_rewards_: float32 placeholder of shape ``[None]``
            that weights the per-sample negative log-probability.
        mean_reward_: float32 placeholder intended for TensorBoard logging.
        action_distribution: softmax over ``logits``.
        loss: mean of ``neg_log_prob * discounted_episode_rewards_``.
        train_opt: RMSProp op minimising ``loss`` with ``LEARNING_RATE``.
    """

    def __init__(self, name='PGNetwork'):
        """Build the graph under ``tf.variable_scope(name)``.

        Args:
            name: Variable scope that prefixes every variable of the network.
        """
        with tf.variable_scope(name):
            with tf.name_scope("inputs"):
                # Batch dimension is left open; the rest comes from STATE_SIZE.
                input_shape = [None] + list(STATE_SIZE)

                self.inputs_ = tf.placeholder(tf.float32, input_shape, name="inputs_")
                self.actions = tf.placeholder(tf.int32, [None, ACTION_SIZE], name="actions")
                self.discounted_episode_rewards_ = tf.placeholder(tf.float32, [None, ], name="discounted_episode_rewards_")

                # Variable for tensorboard
                self.mean_reward_ = tf.placeholder(tf.float32, name="mean_reward")

            # NOTE(review): every batch-norm layer below is built with
            # training=True, so batch statistics are used whenever the graph
            # is run, including at inference time -- confirm this is intended.

            with tf.name_scope("conv1"):
                # First conv stage: CNN -> BatchNormalization -> ELU
                # Input [84, 84, 4] (for the 84x84x4 STATE_SIZE)
                self.conv1 = tf.layers.conv2d(inputs=self.inputs_,
                                              filters=32,
                                              kernel_size=[8, 8],
                                              strides=[4, 4],
                                              padding="VALID",
                                              kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                              name="conv1")

                self.conv1_batchnorm = tf.layers.batch_normalization(self.conv1,
                                                                     training=True,
                                                                     epsilon=1e-5,
                                                                     name='batch_norm1')

                self.conv1_out = tf.nn.elu(self.conv1_batchnorm, name="conv1_out")
                # Output [20, 20, 32]

            with tf.name_scope("conv2"):
                # Second conv stage: CNN -> BatchNormalization -> ELU
                self.conv2 = tf.layers.conv2d(inputs=self.conv1_out,
                                              filters=64,
                                              kernel_size=[4, 4],
                                              strides=[2, 2],
                                              padding="VALID",
                                              kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                              name="conv2")

                self.conv2_batchnorm = tf.layers.batch_normalization(self.conv2,
                                                                     training=True,
                                                                     epsilon=1e-5,
                                                                     name='batch_norm2')

                self.conv2_out = tf.nn.elu(self.conv2_batchnorm, name="conv2_out")
                # Output [9, 9, 64]

            with tf.name_scope("conv3"):
                # Third conv stage: CNN -> BatchNormalization -> ELU
                self.conv3 = tf.layers.conv2d(inputs=self.conv2_out,
                                              filters=128,
                                              kernel_size=[4, 4],
                                              strides=[2, 2],
                                              padding="VALID",
                                              kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                              name="conv3")

                self.conv3_batchnorm = tf.layers.batch_normalization(self.conv3,
                                                                     training=True,
                                                                     epsilon=1e-5,
                                                                     name='batch_norm3')

                self.conv3_out = tf.nn.elu(self.conv3_batchnorm, name="conv3_out")
                # Output [3, 3, 128]

            with tf.name_scope("flatten"):
                self.flatten = tf.layers.flatten(self.conv3_out)
                # Output [1152]

            with tf.name_scope("fc1"):
                self.fc = tf.layers.dense(inputs=self.flatten,
                                          units=512,
                                          activation=tf.nn.elu,
                                          kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                          name="fc1")

            with tf.name_scope("logits"):
                # One unnormalised score per available action.
                self.logits = tf.layers.dense(inputs=self.fc,
                                              units=ACTION_SIZE,
                                              activation=None,
                                              kernel_initializer=tf.contrib.layers.xavier_initializer())

            with tf.name_scope("softmax"):
                self.action_distribution = tf.nn.softmax(self.logits)

            with tf.name_scope("loss"):
                # Cross-entropy against the fed action labels, weighted per
                # sample by the discounted episode reward and then averaged.
                self.neg_log_prob = tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.logits, labels=self.actions)
                self.loss = tf.reduce_mean(self.neg_log_prob * self.discounted_episode_rewards_)

            with tf.name_scope("train"):
                self.train_opt = tf.train.RMSPropOptimizer(LEARNING_RATE).minimize(self.loss)

In [None]:
# Reset the graph
tf.reset_default_graph()

# Instantiate the PGNetwork
# NOTE(review): this rebinds the name ``PGNetwork`` from the class to the
# instance, so the class cannot be instantiated again without re-running its
# defining cell. Other cells read attributes through this name.
PGNetwork = PGNetwork()

# Initialize Session
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

## Step 8: Watch our Agent play 👀
Now that we have trained our agent, we can test it.

In [None]:
# Number of episodes the trained agent plays in the visible window.
num_of_episodes = 3

with tf.Session() as sess:
    game, possible_actions = create_environment(visible=True)

    # try/finally so the game is closed even if restoring the checkpoint or
    # running the policy raises.
    try:
        # Load the trained weights into this session.
        tf.train.Saver().restore(sess, "../trained-models/modelPGN.ckpt")

        for i in range(num_of_episodes):
            game.new_episode()

            state = game.get_state().screen_buffer
            state, stacked_frames = stack_frames(state, is_new_episode=True)

            while not game.is_episode_finished():

                # Run state through the policy to get action probabilities
                # (``PGNetwork`` here is the network instance).
                action_probability_distribution = sess.run(PGNetwork.action_distribution,
                                                           feed_dict={PGNetwork.inputs_: state.reshape(1, *STATE_SIZE)})

                # Sample action with respect to action probabilities
                action = np.random.choice(range(action_probability_distribution.shape[1]),
                                          p=action_probability_distribution.ravel())
                action = possible_actions[action]

                # Perform action
                reward = game.make_action(action)
                done = game.is_episode_finished()
                time.sleep(0.007)
                if done:
                    break

                # Not done: the next state becomes the current state
                next_state = game.get_state().screen_buffer
                next_state, stacked_frames = stack_frames(next_state, is_new_episode=False, stacked_frames=stacked_frames)
                state = next_state
            time.sleep(1)

            print("Score for episode ", i, " :", game.get_total_reward())
    finally:
        game.close()