# HSKA AI-Lab RL: Assignment

## Mount Google Drive as folder

In [None]:
# Mount Google Drive inside the Colab VM under /content/drive.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
# Switch the working directory to the course folder on the mounted drive.
# NOTE(review): presumably this is where the project-local modules imported
# further down (loggers, plot_utils, abstract_agent, atari_helpers) live - confirm.
%cd /content/drive/My\ Drive/ai-lab/rl

Im abschließenden Assignment soll nun ein Agent für [Atari Space Invaders](https://github.com/openai/gym/blob/4ede9280f9c477f1ca09929d10cdc1e1ba1129f1/gym/envs/atari/atari_env.py) implementiert werden. Space Invaders zählt zu den klassischen Atari-2600-Spielen und wird hier nahezu unverändert genutzt. Für Details siehe das [Paper von DeepMind](https://storage.googleapis.com/deepmind-media/dqn/DQNNaturePaper.pdf).

### Atari Space Invaders Environment vorbereiten

In [None]:
%tensorflow_version 1.x
%pip install --upgrade pip
%pip install gym[atari]==0.12.5
%pip install pyglet==1.3.2

import gym

import random
from collections import deque
from typing import Tuple
import time
from datetime import datetime
from contextlib import suppress

import numpy as np
import tensorflow as tf
# Fail fast when the runtime has no GPU: the notebook refuses to continue
# unless TensorFlow reports exactly the first GPU device.
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))
from tensorflow.keras.layers import Conv2D, Flatten, Dense, Lambda, multiply, Input
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.losses import huber_loss
from tensorflow.keras.backend import set_session
from loggers import TensorBoardLogger, tf_summary_image

%pip install matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

from plot_utils import plot_statistics
from abstract_agent import AbstractAgent
from atari_helpers import LazyFrames, wrap_deepmind, make_atari

!apt-get install -y xvfb python-opengl
!python -m pip install pyvirtualdisplay
from pyvirtualdisplay import Display
# Start a virtual display (pyvirtualdisplay on top of the xvfb package installed
# above). NOTE(review): presumably needed so that env.render() works on a
# machine without a screen - confirm.
#
# The instance is kept under its own name: on inline backends the name
# `display` is rebound to the IPython.display module below, which previously
# discarded the only handle that allows stopping the virtual display again.
virtual_display = Display(visible=0, size=(1400, 900))
virtual_display.start()
# Keep the historical binding so that `display` holds the same object as
# before whenever the IPython import below is skipped.
display = virtual_display

# With an inline matplotlib backend, `display` becomes the IPython.display
# module; later cells call display.Image / display.display / display.clear_output.
is_ipython = 'inline' in plt.get_backend()
if is_ipython:
    from IPython import display
    from IPython.display import SVG

# Interactive mode, so figures update without blocking the cell.
plt.ion()

In [None]:
# Create the Space Invaders environment and wrap it in a single expression:
# make_atari builds the base environment (skip=3 is forwarded as-is;
# presumably the frame-skip count - confirm in atari_helpers), wrap_deepmind
# then adds its preprocessing with frame stacking switched on.
env = wrap_deepmind(
    make_atari('SpaceInvadersNoFrameskip-v4', skip=3),
    frame_stack=True,
)

### Assignment: Atari Space Invaders

Für das Assignment soll der `Agent` mit den Methoden `act`, `train`, `_build_model` und `_replay` implementiert werden. Das Modell ist hier nicht mehr vorgegeben und kann entweder aus dem Notebook `4_atari_pong_dqn` übernommen oder selbst implementiert werden.

In [None]:
class Agent(AbstractAgent):
    """Assignment skeleton of the Space Invaders agent.

    Only the constructor is implemented. `_build_model`, `_replay`, `act` and
    `train` raise `NotImplementedError` until they are filled in. The
    constructor creates neither a model nor a replay memory; a complete
    implementation has to add both itself.
    """

    def __init__(self, action_size: int, state_size: int,
                 gamma: float, epsilon: float, epsilon_decay: float, epsilon_min: float,
                 alpha: float):
        """Store the problem dimensions and hyperparameters on the instance.

        Args:
            action_size [int]: Stored as `self.action_size`.
            state_size [int]: Stored as `self.state_size`.
            gamma [float]: Stored as `self.gamma`.
            epsilon [float]: Stored as `self.epsilon`; `act` documents it as the
                probability of choosing a random action.
            epsilon_decay [float]: Stored as `self.epsilon_decay`.
            epsilon_min [float]: Stored as `self.epsilon_min`.
            alpha [float]: Stored as `self.alpha`.
        """
        # Problem dimensions.
        self.action_size, self.state_size = action_size, state_size

        # Hyperparameters, assigned in the same order as the signature.
        self.gamma = gamma
        self.epsilon, self.epsilon_decay, self.epsilon_min = epsilon, epsilon_decay, epsilon_min
        self.alpha = alpha

    def _build_model(self):
        """Create the model the agent learns with (e.g. for action-values).

        Returns:
            model [Model]
        """
        raise NotImplementedError()

    def _replay(self) -> None:
        """Draw random experiences from memory and use them for a batch update.

        Returns:
            None
        """
        raise NotImplementedError()

    def act(self, state: LazyFrames) -> int:
        """Choose the action to execute in the given state.

        The intended strategy is epsilon-greedy exploration: with probability
        epsilon a random action is picked.

        Args:
            state [LazyFrames]: State built from 4 stacked observations (images).

        Returns:
            action [int]
        """
        raise NotImplementedError()

    def train(self, experience: Tuple[LazyFrames, int, LazyFrames, float, bool]) -> None:
        """Remember the experience and, once memory is full, train via replay.

        Args:
            experience [tuple]: (state, action, next state, reward, done).

        Returns:
            None
        """
        raise NotImplementedError()

In [None]:
def interact_with_environment(env, agent, n_episodes=600, max_steps=1000000, train=True, verbose=True):
    """Let the agent play episode after episode and collect one result per episode.

    Scalars and, for the first episode, images are written through a
    TensorBoardLogger created under ./logs. A KeyboardInterrupt ends the run
    early without raising; the results gathered so far are still returned.

    Args:
        env: Environment offering `reset()` and `step(action)`; `step` must
            return a 4-tuple (next state, reward, done, extra).
        agent: Object offering `act(state)`, `train(experience)` and an
            `epsilon` attribute.
        n_episodes [int]: Maximum number of episodes to play.
        max_steps [int]: Step budget over all episodes. It is only checked
            after an episode has finished, so the total can exceed it.
        train [bool]: If true, every transition is handed to `agent.train`.
        verbose [bool]: If true, a progress line is printed every 10th episode.

    Returns:
        statistics [list]: One dict per finished episode with the keys
            'episode', 'score' and 'steps'.
    """
    history = []
    logger = TensorBoardLogger(f'./logs/run-{datetime.now().strftime("%Y-%m-%d_%H:%M:%S")}')

    with suppress(KeyboardInterrupt):
        total_step = 0
        for episode in range(n_episodes):
            state = env.reset()
            episode_start_time = time.time()
            done, episode_reward, episode_step = False, 0, 0

            # for debug purpose every state of the first episode is logged
            log_frames = episode == 0

            while not done:
                action = agent.act(state)
                next_state, reward, done, _ = env.step(action)

                if train:
                    transition = (state, action, next_state, reward, done)
                    agent.train(transition)

                if log_frames:
                    # total_step only advances after an episode, so every image
                    # of this episode shares one global_step; the tag carries
                    # the step index within the episode instead.
                    for frame in state:
                        image = tf_summary_image(np.array(frame, copy=False))
                        logger.log_image(f'state_t{episode_step}:', image,
                                         global_step=total_step)

                state = next_state
                episode_reward += reward
                episode_step += 1

            total_step += episode_step

            if not episode % 10:
                elapsed = time.time() - episode_start_time
                speed = episode_step / elapsed
                scalars = (
                    ('score', episode_reward),
                    ('epsilon', agent.epsilon),
                    ('speed', speed),
                )
                for tag, value in scalars:
                    logger.log_scalar(tag, value, global_step=total_step)
                if verbose:
                    print(f'episode: {episode}/{n_episodes}, score: {episode_reward}, steps: {episode_step}, '
                          f'total steps: {total_step}, e: {agent.epsilon:.3f}, speed: {speed:.2f} steps/s')

            history.append(dict(episode=episode, score=episode_reward, steps=episode_step))

            if total_step >= max_steps:
                break

    return history

In [None]:
# Problem dimensions as reported by the wrapped environment.
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

# Hyperparams (adjust to your own needs!)
gamma = 0.99
alpha = 0.0001
epsilon = 1.0
epsilon_min = 0.01
annealing_steps = 100000  # not episodes!
# Constant decrement that takes epsilon from its start value down to
# epsilon_min when applied once per step for annealing_steps steps; how it is
# actually applied is up to the Agent implementation.
epsilon_decay = (epsilon - epsilon_min) / annealing_steps

agent = Agent(
    action_size=action_size,
    state_size=state_size,
    gamma=gamma,
    epsilon=epsilon,
    epsilon_decay=epsilon_decay,
    epsilon_min=epsilon_min,
    alpha=alpha,
)

# Train for at most 400 episodes, then close the environment and plot the
# per-episode statistics.
statistics = interact_with_environment(env, agent, n_episodes=400, verbose=True)
env.close()
plot_statistics(statistics)

#### Aufbau Keras Model
Der Aufbau des Keras-Modells kann zur Verdeutlichung nochmals geplottet werden.

In [None]:
# Write a diagram of the agent's Keras model (including shapes) to a PNG file.
# NOTE(review): the Agent skeleton's __init__ in this file does not set
# `agent.model`; the completed implementation has to assign it.
tf.keras.utils.plot_model(agent.model, to_file='keras_plot_model.png', show_shapes=True)
# Last expression of the cell, so the image written above becomes the cell
# output. `display` must be the IPython.display module here, which is only the
# case when the setup cell detected an inline matplotlib backend.
display.Image('keras_plot_model.png')

#### Performanceauswertung (Video)
Der folgende Code dient zur Performancebewertung des Agenten. Der (hoffentlich) trainierte Agent wird bei seiner Ausführung Frame für Frame im Notebook dargestellt, trainiert aber nicht weiter. Gezeigt werden drei Episoden mit jeweils maximal 200 Schritten.

In [None]:
# Play three episodes without training and show every rendered frame inline.
# NOTE(review): the training cell already called env.close(); confirm that the
# environment can still be reset and rendered afterwards.
for i in range(3):
    state = env.reset()
    # One image artist per episode; its pixel data is swapped on every step.
    img = plt.imshow(env.render(mode='rgb_array'))
    # At most 200 steps are shown per episode.
    for j in range(200):
        action = agent.act(state)
        # The frame is drawn before the action is applied, so the state reached
        # by the final step of an episode is never displayed.
        img.set_data(env.render(mode='rgb_array')) 
        plt.axis('off')
        # Show the current figure, then mark the output for clearing;
        # wait=True postpones the clearing until new output is available.
        display.display(plt.gcf())
        display.clear_output(wait=True)
        state, reward, done, _ = env.step(action)
        if done:
            break 
            
env.close()