# Create Env
This script is heavily inspired by the [policy gradient implementation](https://github.com/simoninithomas/Deep_reinforcement_learning_Course/blob/master/Policy%20Gradients/Doom/Doom%20REINFORCE%20Monte%20Carlo%20Policy%20gradients.ipynb) in Thomas Simonini's RL course.

In [None]:
import tensorflow as tf
import numpy as np
from vizdoom import *
import random
import time
from skimage import transform

from collections import deque
import matplotlib.pyplot as plt

import warnings

warnings.filterwarnings('ignore')

In [None]:
def create_environment(visible=False):
    """Create and initialise a ViZDoom game from the deadly-corridor config.

    Args:
        visible: Forwarded to ``set_window_visible``; defaults to False.

    Returns:
        The initialised ``DoomGame`` instance.
    """
    doom = DoomGame()
    doom.load_config("scenarios/deadly_corridor.cfg")
    doom.set_window_visible(visible)
    doom.init()
    return doom

In [None]:
# Module-level game instance (window hidden by default). It is read below to
# derive ACTION_SIZE and is the environment that make_batch() plays in.
game = create_environment()

In [None]:
# INPUT SPECS
STACK_SIZE = 4  # number of preprocessed frames stacked into one state
FRAME_SIZE = [84,84]  # size each frame is resized to by preprocess_frame()

# ENV HYPERPARAMS
STATE_SIZE = [*FRAME_SIZE, STACK_SIZE]  # shape of one network input: frame dims + stack depth
ACTION_SIZE = game.get_available_buttons_size()  # one action per available button
POSSIBLE_ACTIONS = np.identity(ACTION_SIZE, dtype=int).tolist()  # one-hot list per action

# TRAINING HYPERPARAMS
LEARNING_RATE = 0.002  # passed to the RMSProp optimizer in PGNetwork
NUM_EPOCHS = 2  # number of collect-batch / update iterations in the training loop

BATCH_SIZE = 1000  # make_batch() stops after the episode that pushes the reward count above this
GAMMA = 0.95 # Discounting rate used by discount_and_normalize_rewards()

# Test env

In [None]:
def test_environment(episodes):
    """Play ``episodes`` episodes with uniformly random actions in a visible window.

    After each episode the total reward is printed. The game is always closed
    on exit, including when an error interrupts an episode.

    Args:
        episodes: Number of episodes to play.
    """
    game = create_environment(visible=True)
    try:
        for _ in range(episodes):
            game.new_episode()
            while not game.is_episode_finished():
                game.make_action(random.choice(POSSIBLE_ACTIONS))
                # Short pause so the game is watchable.
                time.sleep(0.007)
            print ("Result:", game.get_total_reward())
            time.sleep(1)
    finally:
        # Release the game window even if an episode raised.
        game.close()

In [None]:
#test_environment(3)

# Define preprocessing functions

In [None]:
def preprocess_frame(frame):
    """Crop, rescale and resize a single frame.

    The first 80 rows are dropped, values are divided by 255.0, and the
    result is resized to FRAME_SIZE with ``skimage.transform.resize``.

    Args:
        frame: Array-like image indexed as ``frame[row, col]``.

    Returns:
        The resized frame.
    """
    without_top_rows = frame[80:, :]
    return transform.resize(without_top_rows / 255.0, FRAME_SIZE)

In [None]:
def init_deque():
    """Return a frame stack pre-filled with STACK_SIZE all-zero frames.

    Returns:
        A ``deque`` with ``maxlen=STACK_SIZE`` holding integer zero arrays of
        shape FRAME_SIZE.
    """
    # The builtin ``int`` replaces ``np.int``, an alias that was deprecated and
    # then removed from NumPy (it raises AttributeError on NumPy >= 1.24).
    blank_frames = (np.zeros(FRAME_SIZE, dtype=int) for _ in range(STACK_SIZE))
    return deque(blank_frames, maxlen=STACK_SIZE)

In [None]:
# Module-level frame stack, initially all zeros.
# NOTE(review): make_batch() keeps its own local stack and the evaluation cell
# rebinds this name, so this initial value does not appear to be read in the
# code visible here.
stacked_frames = init_deque()

In [None]:
def stack_frames(state, is_new_episode, stacked_frames=None):
    """Preprocess a raw frame and push it onto the frame stack.

    Args:
        state: Raw frame, passed to ``preprocess_frame``.
        is_new_episode: When true, any existing stack is discarded and a new
            one is filled with STACK_SIZE copies of the preprocessed frame.
        stacked_frames: The deque returned by the previous call. It is
            mutated in place when continuing an episode. If it is ``None`` a
            fresh stack is started, exactly as for a new episode.

    Returns:
        A ``(stacked_state, stacked_frames)`` tuple: the frames stacked along
        a new last axis (axis=2), and the updated deque to pass to the next
        call.
    """
    frame = preprocess_frame(state)

    if is_new_episode or stacked_frames is None:
        # No history yet: repeat the first frame to fill the whole stack.
        # Treating a missing stack as a new episode avoids an opaque
        # AttributeError from calling ``None.append``.
        stacked_frames = deque([frame] * STACK_SIZE, maxlen=STACK_SIZE)
    else:
        # maxlen makes the deque drop its oldest frame automatically.
        stacked_frames.append(frame)

    return np.stack(stacked_frames, axis=2), stacked_frames

# Discount_and_normalize_rewards
This function is important, because we are in a Monte Carlo situation. <br>

We need to **discount the rewards at the end of the episode**. This function takes the episode's rewards, discounts them, and **then normalizes them** (to avoid large variability in the rewards).

In [None]:
def discount_and_normalize_rewards(episode_rewards, gamma=None):
    """Compute discounted returns for one episode and standardise them.

    For every step ``i`` the return is
    ``r[i] + gamma * r[i+1] + gamma**2 * r[i+2] + ...``. The returns are then
    shifted to zero mean and scaled to unit standard deviation.

    Args:
        episode_rewards: Sequence of per-step rewards for a single episode.
        gamma: Discount factor. Defaults to the module-level ``GAMMA``.

    Returns:
        A float64 NumPy array with the same length as ``episode_rewards``.
        An empty input gives an empty array. If every return is identical
        (for example a one-step episode) the result is all zeros rather
        than NaN.
    """
    if gamma is None:
        gamma = GAMMA

    # Force a float array: with integer rewards, ``zeros_like`` would create
    # an integer buffer and silently truncate the discounted values.
    rewards = np.asarray(episode_rewards, dtype=np.float64)
    discounted = np.zeros_like(rewards)

    # Accumulate from the last step backwards so each return reuses the next.
    running_return = 0.0
    for step in reversed(range(len(rewards))):
        running_return = running_return * gamma + rewards[step]
        discounted[step] = running_return

    if discounted.size == 0:
        return discounted

    centered = discounted - np.mean(discounted)
    std = np.std(discounted)
    # A zero spread would turn the whole batch entry into NaN; leave the
    # (already all-zero) centered values untouched in that case.
    if std > 0:
        centered = centered / std
    return centered

# Create Policy Gradient Network model

In [None]:
# The class only has a constructor: it is used as a namespace that builds the
# TensorFlow graph once and exposes the resulting tensors/ops as attributes.
class PGNetwork:
    """Policy-gradient network: three conv blocks, one dense layer, action logits.

    Constructing an instance adds placeholders, layers, the loss and the
    training op to the current default graph. Callers feed ``inputs_``,
    ``actions`` and ``discounted_episode_rewards_`` and fetch
    ``action_distribution``, ``loss`` or ``train_opt``.
    """
    def __init__(self, name='PGNetwork'):
        """Build the graph under the variable scope ``name``."""
        
        with tf.variable_scope(name):
            with tf.name_scope("inputs"):
                # Batch of stacked-frame states, shape [batch, *STATE_SIZE].
                self.inputs_= tf.placeholder(tf.float32, [None, *STATE_SIZE], name="inputs_")
                # Actions taken, one row of length ACTION_SIZE per step
                # (the rows of POSSIBLE_ACTIONS are fed here).
                self.actions = tf.placeholder(tf.int32, [None, ACTION_SIZE], name="actions")
                # One discounted, normalized return per step.
                self.discounted_episode_rewards_ = tf.placeholder(tf.float32, [None, ], name="discounted_episode_rewards_")
            
                
                # Placeholder used only to log the batch's mean reward to TensorBoard
                self.mean_reward_ = tf.placeholder(tf.float32, name="mean_reward")
                
            # NOTE(review): every batch_normalization layer below is built with
            # training = True, so batch statistics are also used when a single
            # state is fed for action selection, and no UPDATE_OPS dependency
            # is attached to train_opt. Confirm this is intended.
            with tf.name_scope("conv1"):
                """
                First convnet:
                CNN
                BatchNormalization
                ELU
                """
                # Input [84, 84, 4]
                self.conv1 = tf.layers.conv2d(inputs = self.inputs_,
                                             filters = 32,
                                             kernel_size = [8,8],
                                             strides = [4,4],
                                             padding = "VALID",
                                              kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                             name = "conv1")

                self.conv1_batchnorm = tf.layers.batch_normalization(self.conv1,
                                                       training = True,
                                                       epsilon = 1e-5,
                                                         name = 'batch_norm1')

                self.conv1_out = tf.nn.elu(self.conv1_batchnorm, name="conv1_out")
                # Output [20, 20, 32]
            
            with tf.name_scope("conv2"):
                """
                Second convnet:
                CNN
                BatchNormalization
                ELU
                """
                self.conv2 = tf.layers.conv2d(inputs = self.conv1_out,
                                     filters = 64,
                                     kernel_size = [4,4],
                                     strides = [2,2],
                                     padding = "VALID",
                                    kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                     name = "conv2")

                self.conv2_batchnorm = tf.layers.batch_normalization(self.conv2,
                                                       training = True,
                                                       epsilon = 1e-5,
                                                         name = 'batch_norm2')

                self.conv2_out = tf.nn.elu(self.conv2_batchnorm, name="conv2_out")
                # Output [9, 9, 64]
            
            with tf.name_scope("conv3"):
                """
                Third convnet:
                CNN
                BatchNormalization
                ELU
                """
                self.conv3 = tf.layers.conv2d(inputs = self.conv2_out,
                                     filters = 128,
                                     kernel_size = [4,4],
                                     strides = [2,2],
                                     padding = "VALID",
                                     kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                     name = "conv3")

                self.conv3_batchnorm = tf.layers.batch_normalization(self.conv3,
                                                       training = True,
                                                       epsilon = 1e-5,
                                                       name = 'batch_norm3')

                self.conv3_out = tf.nn.elu(self.conv3_batchnorm, name="conv3_out")
                # Output [3, 3, 128]
            
            with tf.name_scope("flatten"):
                self.flatten = tf.layers.flatten(self.conv3_out)
                # Output [1152]
            
            with tf.name_scope("fc1"):
                self.fc = tf.layers.dense(inputs = self.flatten,
                                          units = 512,
                                          activation = tf.nn.elu,
                                          kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                          name="fc1")
            
            with tf.name_scope("logits"):
                # One unnormalized score per action; no activation here because
                # both the softmax and the loss below consume raw logits.
                self.logits = tf.layers.dense(inputs = self.fc,
                                              units = ACTION_SIZE, 
                                              activation=None,
                                              kernel_initializer=tf.contrib.layers.xavier_initializer())
            
            with tf.name_scope("softmax"):
                # Action probabilities that the sampling code draws from.
                self.action_distribution = tf.nn.softmax(self.logits)

            with tf.name_scope("loss"):
                # Cross-entropy between the policy and the fed action rows;
                # weighting it by the discounted return and averaging gives
                # the loss minimized by the optimizer.
                self.neg_log_prob = tf.nn.softmax_cross_entropy_with_logits_v2(logits = self.logits, labels = self.actions)
                self.loss = tf.reduce_mean(self.neg_log_prob * self.discounted_episode_rewards_) 
    
            with tf.name_scope("train"):
                self.train_opt = tf.train.RMSPropOptimizer(LEARNING_RATE).minimize(self.loss)

In [None]:
# Reset the graph so re-running this cell does not add duplicate ops/variables
tf.reset_default_graph()

# Instantiate the PGNetwork.
# NOTE(review): this rebinds the name PGNetwork from the class to the
# instance; every later reference (PGNetwork.loss, PGNetwork.inputs_, ...)
# means the instance, and the class can no longer be instantiated by name.
PGNetwork = PGNetwork()

# Initialize Session and all variables created by the network above
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

# Setup Tensorboard

In [None]:
# Setup TensorBoard Writer
# NOTE(review): the log directory is an absolute path under the filesystem
# root; confirm this is intended rather than a path relative to the notebook.
writer = tf.summary.FileWriter("/tensorboard/pg/test")

## Losses
tf.summary.scalar("Loss", PGNetwork.loss)

## Reward mean (fed through the mean_reward_ placeholder by the training loop)
tf.summary.scalar("Reward_mean", PGNetwork.mean_reward_ )

# Single op that evaluates every summary registered above
write_op = tf.summary.merge_all()

# Batch Function

Here we'll create batches.<br>
These batches contain whole episodes; **how many episodes fit in a batch depends on how many rewards (steps) each one yields**. For instance, if every episode yields only 10 rewards, a batch holds about BATCH_SIZE/10 episodes.
<br>
* Make a batch
    * For each step:
        * Choose action a
        * Perform action a
        * Store s, a, r
        * **If** done:
            * Calculate sum reward
            * Calculate the discounted return Gt (discounted with GAMMA)

In [None]:
def make_batch():
    """Collect one training batch by playing whole episodes with the current policy.

    Episodes are played on the module-level ``game``; actions are sampled from
    the network's action distribution. Collection stops at the end of the first
    episode after which more than BATCH_SIZE steps have been gathered, so a
    batch always contains complete episodes only.

    Returns:
        A tuple ``(states, actions, rewards, discounted_rewards, episode_count)``:
        the stacked states, the one-hot actions taken, the raw rewards and the
        per-episode discounted/normalized rewards (all concatenated over the
        episodes in the batch), and the number of episodes played.
    """
    states, actions, rewards_of_batch, discounted_rewards = [], [], [], []
    rewards_of_episode = []

    episode_num = 1

    game.new_episode()
    state, stacked_frames = stack_frames(game.get_state().screen_buffer, is_new_episode=True)

    while True:
        # Run state through policy and calculate action
        action_probability_distribution = sess.run(
            PGNetwork.action_distribution,
            feed_dict={PGNetwork.inputs_: state.reshape(1, *STATE_SIZE)})

        # Sample an action with respect to the action probabilities
        action_index = np.random.choice(range(action_probability_distribution.shape[1]),
                                        p=action_probability_distribution.ravel())
        action = POSSIBLE_ACTIONS[action_index]

        # Perform sampled action
        reward = game.make_action(action)

        # Store results
        states.append(state)
        actions.append(action)
        rewards_of_episode.append(reward)

        if game.is_episode_finished():
            # The episode is over, so there is no next state to build. (The
            # previous version stacked a dummy zero frame here that was never
            # used and relied on the removed ``np.int`` alias.)
            rewards_of_batch.append(rewards_of_episode)
            discounted_rewards.append(discount_and_normalize_rewards(rewards_of_episode))

            # Exactly one state is stored per reward, so len(states) is the
            # number of rewards collected so far. The check is only made at
            # episode boundaries because Monte Carlo needs whole episodes.
            if len(states) > BATCH_SIZE:
                break

            # Start the next episode with a fresh reward list and frame stack
            rewards_of_episode = []
            episode_num += 1

            game.new_episode()
            state, stacked_frames = stack_frames(game.get_state().screen_buffer, is_new_episode=True)
        else:
            # If not done, the next state becomes the current state
            state, stacked_frames = stack_frames(game.get_state().screen_buffer,
                                                 is_new_episode=False,
                                                 stacked_frames=stacked_frames)

    return (np.array(states),
            np.array(actions),
            np.concatenate(rewards_of_batch),
            np.concatenate(discounted_rewards),
            episode_num)

* Create the Neural Network
* Initialize the weights
* Init the environment
* all_rewards = 0 # Keep track of maximum reward
* **For** epochs in range(NUM_EPOCHS):
    * Get batches
    * Optimize

# Train agent

In [None]:
# Imported here because only this cell needs it (to create the checkpoint directory)
import os

# Keep track of all rewards total for each batch
all_rewards = []

total_rewards = 0
maximum_reward_recorded = 0
mean_reward_total = []  # mean reward per episode of every batch, one entry per epoch
epoch = 1
average_reward = []

saver = tf.train.Saver()

checkpoint_path = "./models/policy_model.ckpt"
# Saver.save() fails if the parent directory of the checkpoint does not exist
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

# Load the model
#saver.restore(sess, "./models/policy_model.ckpt")

while epoch < NUM_EPOCHS + 1:
    # Gather training data
    states_mb, actions_mb, rewards_of_batch, discounted_rewards_mb, nb_episodes_mb = make_batch()

    ### This part is used for analytics
    # Calculate the total reward of the batch
    total_reward_of_batch = np.sum(rewards_of_batch)
    all_rewards.append(total_reward_of_batch)

    # Calculate the mean reward of the batch
    # Total rewards of batch / nb episodes in that batch
    mean_reward_of_batch = np.divide(total_reward_of_batch, nb_episodes_mb)
    mean_reward_total.append(mean_reward_of_batch)

    # Calculate the average reward of all training
    # sum of the per-batch means / number of epochs so far
    average_reward_of_all_training = np.divide(np.sum(mean_reward_total), epoch)

    # Calculate maximum reward recorded
    maximum_reward_recorded = np.amax(all_rewards)

    print("==========================================")
    print("Epoch: ", epoch, "/", NUM_EPOCHS)
    print("-----------")
    print("Number of training episodes: {}".format(nb_episodes_mb))
    print("Total reward: {}".format(total_reward_of_batch))
    print("Mean Reward of that batch {}".format(mean_reward_of_batch))
    print("Average Reward of all training: {}".format(average_reward_of_all_training))
    print("Max reward for a batch so far: {}".format(maximum_reward_recorded))

    # Feedforward, gradient and backpropagation. The summaries are fetched in
    # the same run, so the logged loss is the one that was printed and the
    # batch is only pushed through the network once.
    feed_dict = {
        PGNetwork.inputs_: states_mb.reshape((len(states_mb), *STATE_SIZE)),
        PGNetwork.actions: actions_mb,
        PGNetwork.discounted_episode_rewards_: discounted_rewards_mb,
        PGNetwork.mean_reward_: mean_reward_of_batch,
    }
    loss_, _, summary = sess.run([PGNetwork.loss, PGNetwork.train_opt, write_op],
                                 feed_dict=feed_dict)

    print("Training Loss: {}".format(loss_))

    # Write TF Summaries
    writer.add_summary(summary, epoch)
    writer.flush()

    # Save Model every 5 epochs, and always after the final epoch so that a
    # checkpoint exists to restore from even when NUM_EPOCHS < 5.
    if epoch % 5 == 0 or epoch == NUM_EPOCHS:
        saver.save(sess, checkpoint_path)
        print("Model saved")
    epoch += 1

## Step 8: Watch our Agent play 👀
Now that we have trained our agent, we can test it.

In [None]:
# Restore the saved policy into a fresh session and watch it play a few
# episodes in a visible game window.
saver = tf.train.Saver()
num_of_episodes = 3

with tf.Session() as sess:
    game = create_environment(visible=True)

    try:
        saver.restore(sess, "./models/policy_model.ckpt")

        for i in range(num_of_episodes):
            game.new_episode()

            state = game.get_state().screen_buffer
            state, stacked_frames = stack_frames(state, is_new_episode=True)

            while not game.is_episode_finished():

                # Run state through policy and calculate action
                action_probability_distribution = sess.run(
                    PGNetwork.action_distribution,
                    feed_dict={PGNetwork.inputs_: state.reshape(1, *STATE_SIZE)})

                # Sample action with respect to action probabilities
                action = np.random.choice(range(action_probability_distribution.shape[1]),
                                          p=action_probability_distribution.ravel())
                action = POSSIBLE_ACTIONS[action]

                # Perform action
                reward = game.make_action(action)
                done = game.is_episode_finished()
                time.sleep(0.007)

                # Only read the next frame while the episode is running; once
                # it has finished the loop condition ends the episode.
                if not done:
                    state, stacked_frames = stack_frames(game.get_state().screen_buffer,
                                                         is_new_episode=False,
                                                         stacked_frames=stacked_frames)
            time.sleep(1)

            print("Score for episode ", i, " :", game.get_total_reward())
    finally:
        # Close the game window even if restoring or playing failed
        game.close()