Copyright 2021 DeepMind Technologies Limited.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use
this file except in compliance with the License. You may obtain a copy of the
License at

[https://www.apache.org/licenses/LICENSE-2.0](https://www.apache.org/licenses/LICENSE-2.0)

Unless required by applicable law or agreed to in writing, software distributed
under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

# RL Unplugged: Offline R2D2 - DeepMind Lab

## A Colab example of an Acme R2D2 agent on DeepMind Lab data.
# <a href="https://colab.research.google.com/github/deepmind/deepmind_research/blob/master/rl_unplugged/dmlab_r2d2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


## Installation


### External dependencies

In [None]:
!apt-get install libsdl2-dev
!apt-get install libosmesa6-dev
!apt-get install libffi-dev
!apt-get install gettext
!apt-get install python3-numpy-dev python3-dev

### Bazel

In [None]:
# Bazel release to install; interpolated into the shell commands below.
BAZEL_VERSION = '3.6.0'
# Download the Linux x86_64 installer script for that release from GitHub.
!wget https://github.com/bazelbuild/bazel/releases/download/{BAZEL_VERSION}/bazel-{BAZEL_VERSION}-installer-linux-x86_64.sh
# Make the installer executable and run it.
!chmod +x bazel-{BAZEL_VERSION}-installer-linux-x86_64.sh
!./bazel-{BAZEL_VERSION}-installer-linux-x86_64.sh
# Print the installed version as a sanity check.
!bazel --version

### DeepMind Lab

In [None]:
# Clone the DeepMind Lab sources into ./lab. No branch or revision is pinned,
# so this fetches whatever the repository's default branch currently holds.
!git clone https://github.com/deepmind/lab.git

In [None]:
%%writefile lab/bazel/python.BUILD

# Description:
#   Build rule for Python and Numpy.
#   This rule works for Debian and Ubuntu. Other platforms might keep the
#   headers in different places, cf. 'How to build DeepMind Lab' in build.md.
#
#   NOTE: the header locations below are hard-coded for Python 3.6
#   (`python3.6m` and `python3.6/dist-packages`). They must be adjusted if the
#   runtime ships a different Python version.
#   NOTE: only the PY3 condition is provided in each select(); any other
#   configuration triggers the `no_match_error` message.

# Exposes the CPython and NumPy C headers as a cc_library.
cc_library(
    name = "python",
    hdrs = select(
        {
            "@bazel_tools//tools/python:PY3": glob([
                "usr/include/python3.6m/*.h",
                "usr/local/lib/python3.6/dist-packages/numpy/core/include/numpy/*.h",
            ]),
        },
        no_match_error = "Internal error, Python version should be one of PY2 or PY3",
    ),
    # Include search paths matching the header globs above.
    includes = select(
        {
            "@bazel_tools//tools/python:PY3": [
                "usr/include/python3.6m",
                "usr/local/lib/python3.6/dist-packages/numpy/core/include",
            ],
        },
        no_match_error = "Internal error, Python version should be one of PY2 or PY3",
    ),
    visibility = ["//visibility:public"],
)

# Second name for the same target.
alias(
    name = "python_headers",
    actual = ":python",
    visibility = ["//visibility:public"],
)

In [None]:
# Build the `build_pip_package` target in optimized mode (`-c opt`) for
# Python 3.
!cd lab && bazel build -c opt --python_version=PY3 //python/pip_package:build_pip_package

In [None]:
# Run the script built above with /tmp/dmlab_pkg as its argument; the next
# cell installs the wheel from that directory.
!cd lab && ./bazel-bin/python/pip_package/build_pip_package /tmp/dmlab_pkg

In [None]:
# Install the DeepMind Lab wheel. `--force-reinstall` reinstalls the package
# even if a copy is already present. The wheel file name is hard-coded here.
!pip install /tmp/dmlab_pkg/deepmind_lab-1.0-py3-none-any.whl --force-reinstall

### Python dependencies

In [None]:
# Python packages used by the cells below: the dm_env interface, Acme with its
# Reverb and TensorFlow extras, and Sonnet.
!pip install dm_env
!pip install dm-acme[reverb]
!pip install dm-acme[tf]
!pip install dm-sonnet

In [None]:
# Upgrade Acme to a specific pinned commit from GitHub so that the R2D2
# learner used below is available. This replaces the release installed by the
# previous cell.
!pip install --upgrade git+https://github.com/deepmind/acme.git@3dfda9d392312d948906e6c567c7f56d8c911de5

## Imports and Utils

In [None]:
# @title Imports
import copy
import functools

from acme import environment_loop
from acme import specs
from acme.adders import reverb as acme_reverb
from acme.agents.tf import actors
from acme.agents.tf.r2d2 import learning as r2d2
from acme.tf import utils as tf_utils
from acme.tf import networks
from acme.utils import loggers
from acme.wrappers import observation_action_reward
import tree

import deepmind_lab
import dm_env
import numpy as np
import reverb
import sonnet as snt
import tensorflow as tf
import trfl

In [None]:
# @title Environment

# Maps each discrete agent action id (0..14) to the 7-component integer action
# vector passed to `deepmind_lab.Lab.step`.
# NOTE(review): the component order presumably follows DeepMind Lab's action
# spec (look left/right, look down/up, strafe left/right, move back/forward,
# fire, jump, crouch) -- TODO confirm against the DeepMind Lab documentation.
_ACTION_MAP = {
    0: (0, 0, 0, 1, 0, 0, 0),
    1: (0, 0, 0, -1, 0, 0, 0),
    2: (0, 0, -1, 0, 0, 0, 0),
    3: (0, 0, 1, 0, 0, 0, 0),
    4: (-10, 0, 0, 0, 0, 0, 0),
    5: (10, 0, 0, 0, 0, 0, 0),
    6: (-60, 0, 0, 0, 0, 0, 0),
    7: (60, 0, 0, 0, 0, 0, 0),
    8: (0, 10, 0, 0, 0, 0, 0),
    9: (0, -10, 0, 0, 0, 0, 0),
    10: (-10, 0, 0, 1, 0, 0, 0),
    11: (10, 0, 0, 1, 0, 0, 0),
    12: (-60, 0, 0, 1, 0, 0, 0),
    13: (60, 0, 0, 1, 0, 0, 0),
    14: (0, 0, 0, 0, 1, 0, 0),
}

class DeepMindLabEnvironment(dm_env.Environment):
  """DeepMind Lab level exposed through the `dm_env.Environment` interface.

  Agent actions are integer ids that index `_ACTION_MAP`. Observations are
  `OAR` tuples bundling the current RGB frame with the action and reward that
  preceded it.
  """

  def __init__(self, level_name: str, action_repeats: int = 4):
    """Construct environment.

    Args:
      level_name: DeepMind Lab level name (e.g. 'rooms_watermaze').
      action_repeats: Number of times the same action is repeated on every
        step().
    """
    config = dict(fps='30',
                  height='72',
                  width='96',
                  maxAltCameraHeight='1',
                  maxAltCameraWidth='1',
                  hasAltCameras='false')

    # seekavoid_arena_01 is not part of dmlab30.
    if level_name != 'seekavoid_arena_01':
      level_name = 'contributed/dmlab30/{}'.format(level_name)

    self._lab = deepmind_lab.Lab(level_name, ['RGB_INTERLEAVED'], config)
    self._action_repeats = action_repeats
    # Previous action / reward reported inside the OAR observation.
    self._action = 0
    self._reward = 0

  def _observation(self):
    """Builds, caches and returns the OAR observation for the current frame."""
    self._last_observation = observation_action_reward.OAR(
        observation=self._lab.observations()['RGB_INTERLEAVED'],
        action=np.array(self._action, dtype=np.int64),
        reward=np.array(self._reward, dtype=np.float32))
    return self._last_observation

  def reset(self):
    """Starts a new episode and returns its first `TimeStep`."""
    self._lab.reset()
    # Clear the previous-action / previous-reward fields so that the first
    # observation of a new episode does not carry over values from the last
    # step of the previous episode.
    self._action = 0
    self._reward = 0
    return dm_env.restart(self._observation())

  def step(self, action):
    """Applies `action` and returns the resulting `TimeStep`.

    Args:
      action: Scalar numpy integer holding a key of `_ACTION_MAP`.

    Returns:
      A FIRST time step if the episode was not running (the action is ignored
      and the environment is reset), a LAST time step if the episode ended
      during this call, and a MID time step otherwise.

    Raises:
      ValueError: If `action` is not a key of `_ACTION_MAP`.
    """
    if not self._lab.is_running():
      # reset() already returns a restart TimeStep; wrapping it in another
      # dm_env.restart() would nest a TimeStep inside the observation field.
      return self.reset()

    action_id = action.item()
    if action_id not in _ACTION_MAP:
      raise ValueError('Action not available')
    # Only record the action once it is known to be valid.
    self._action = action_id
    lab_action = np.array(_ACTION_MAP[action_id], dtype=np.intc)
    self._reward = self._lab.step(lab_action, num_steps=self._action_repeats)

    if self._lab.is_running():
      return dm_env.transition(self._reward, self._observation())
    # The episode has ended, so the most recently cached observation is
    # reused for the terminal time step instead of querying the lab again.
    return dm_env.termination(self._reward, self._last_observation)

  def observation_spec(self):
    """Returns the OAR spec: 72x96 RGB frame, scalar action, scalar reward."""
    return observation_action_reward.OAR(
        observation=dm_env.specs.Array(shape=(72, 96, 3), dtype=np.uint8),
        action=dm_env.specs.Array(shape=(), dtype=np.int64),
        reward=dm_env.specs.Array(shape=(), dtype=np.float32))

  def action_spec(self):
    """Returns a discrete spec with one value per `_ACTION_MAP` entry."""
    return dm_env.specs.DiscreteArray(num_values=len(_ACTION_MAP),
                                      dtype=np.int64)

In [None]:
# @title Dataset

def _decode_images(pngs):
  """Decodes a tensor of PNG-encoded strings into a batch of RGB images.

  Args:
    pngs: Tensor of PNG-encoded strings; its leading dimension is the number
      of images.

  Returns:
    A uint8 tensor whose static shape is set to [N, 72, 96, 3].
  """
  def _decode_one(png):
    # Force three channels (RGB) regardless of how the PNG was encoded.
    return tf.io.decode_png(png, channels=3)

  num_images = pngs.shape[0]
  decoded = tf.map_fn(_decode_one, pngs, dtype=tf.uint8,
                      parallel_iterations=10)
  # map_fn cannot infer the per-image shape, so pin it: [N, 72, 96, 3].
  decoded.set_shape((num_images, 72, 96, 3))
  return decoded

def _tf_example_to_step_ds(tf_example: tf.Tensor,
                           episode_length: int) -> tf.data.Dataset:
  """Converts one serialized episode into a dataset of per-step replay samples.

  Args:
    tf_example: Serialized `tf.train.Example` holding one full episode (it is
      parsed with `tf.io.parse_single_example`).
    episode_length: Number of steps stored in the episode; every sequence
      feature is parsed with this fixed leading dimension.

  Returns:
    A `tf.data.Dataset` with `episode_length` elements, each a
    `reverb.ReplaySample` describing a single step.
  """

  # Parse tf.Example.
  def sequence_feature(shape, dtype=tf.float32):
    return tf.io.FixedLenFeature(shape=[episode_length] + shape, dtype=dtype)

  feature_description = {
      'episode_id': tf.io.FixedLenFeature([], tf.int64),
      'start_idx': tf.io.FixedLenFeature([], tf.int64),
      'episode_return': tf.io.FixedLenFeature([], tf.float32),
      'observations_pixels': sequence_feature([], tf.string),
      'observations_reward': sequence_feature([]),
      # actions are one-hot arrays.
      'observations_action': sequence_feature([15]),
      'actions': sequence_feature([], tf.int64),
      'rewards': sequence_feature([]),
      'discounted_rewards': sequence_feature([]),
      'discounts': sequence_feature([]),
  }

  parsed = tf.io.parse_single_example(tf_example, feature_description)
  pixels = _decode_images(parsed['observations_pixels'])

  # The stored previous action is one-hot; convert it back to an integer id.
  observation = observation_action_reward.OAR(
      observation=pixels,
      action=tf.argmax(parsed['observations_action'],
                       axis=1, output_type=tf.int64),
      reward=parsed['observations_reward'])

  steps = acme_reverb.Step(
      observation=observation,
      action=parsed['actions'],
      reward=parsed['rewards'],
      discount=parsed['discounts'],
      start_of_episode=tf.zeros((episode_length,), tf.bool),
      extras={})

  # Keys are all zero and probabilities are all one.
  info = reverb.SampleInfo(key=tf.zeros((episode_length,), tf.int64),
                           probability=tf.ones((episode_length,), tf.float32),
                           table_size=tf.zeros((episode_length,), tf.int64),
                           priority=tf.ones((episode_length,), tf.float32))
  sample = reverb.ReplaySample(info=info, data=steps)
  # Slice along the leading (time) dimension: one dataset element per step.
  return tf.data.Dataset.from_tensor_slices(sample)

def subsequences(step_ds: tf.data.Dataset,
                 length: int, shift: int = 1
                 ) -> tf.data.Dataset:
  """Dataset of contiguous subsequences from a dataset of episode steps.

  Args:
    step_ds: Dataset whose elements are the consecutive steps of one episode.
    length: Number of steps in every emitted subsequence.
    shift: Offset, in steps, between the starts of consecutive subsequences.

  Returns:
    A dataset in which each element stacks `length` consecutive steps along a
    new leading dimension. Trailing windows shorter than `length` are dropped.
  """
  def _window_to_sequence(window):
    # Each window arrives as a nest of datasets; zip it back into a dataset of
    # steps and stack exactly this window's steps into a single element.
    return _nest_ds(window).batch(length, drop_remainder=True)

  # Batching happens per window, so one output sequence can never contain
  # steps from two different windows. Flattening all windows first and
  # batching afterwards does not guarantee that: `interleave` may alternate
  # between windows, and incomplete trailing windows would be concatenated.
  window_ds = step_ds.window(length, shift=shift, stride=1,
                             drop_remainder=True)
  return window_ds.flat_map(_window_to_sequence)


def _nest_ds(nested_ds: tf.data.Dataset) -> tf.data.Dataset:
  """Turns a nest of equally sized datasets into a single dataset of nests.

  Args:
    nested_ds: Nested structure whose leaves are datasets.

  Returns:
    A dataset whose elements have the same structure as `nested_ds`, with
    each leaf taken from the corresponding leaf dataset.
  """
  def _repack(*leaves):
    # Restore the original nesting around the zipped leaf values.
    return tree.unflatten_as(nested_ds, leaves)

  leaf_datasets = tuple(tree.flatten(nested_ds))
  return tf.data.Dataset.zip(leaf_datasets).map(_repack)


def make_dataset(path: str,
                 episode_length: int,
                 sequence_length: int,
                 sequence_shift: int,
                 num_shards: int = 500) -> tf.data.Dataset:
  """Builds an endless dataset of fixed-length DeepMind Lab step sequences.

  Args:
    path: Directory (or bucket prefix) containing the TFRecord shards.
    episode_length: Number of steps stored in every episode.
    sequence_length: Number of steps in each emitted sequence.
    sequence_shift: Offset, in steps, between consecutive sequences of the
      same episode.
    num_shards: Total number of shard files under `path`.

  Returns:
    A dataset of training sequences.
  """

  def _read_shard(filename):
    # Shards are GZIP-compressed TFRecord files.
    return tf.data.TFRecordDataset(filename, compression_type='GZIP')

  def _to_steps(serialized_episode):
    return _tf_example_to_step_ds(serialized_episode,
                                  episode_length=episode_length)

  def _to_sequences(step_ds):
    return subsequences(step_ds, length=sequence_length, shift=sequence_shift)

  shard_paths = []
  for shard in range(num_shards):
    shard_paths.append(f'{path}/tfrecord-{shard:05d}-of-{num_shards:05d}')

  # Cycle over the shard names forever, shuffling within a buffer that holds
  # `num_shards` names.
  shard_ds = tf.data.Dataset.from_tensor_slices(shard_paths)
  shard_ds = shard_ds.repeat().shuffle(num_shards)

  # Serialized tf.Examples, each containing one full episode.
  serialized_ds = shard_ds.interleave(_read_shard)

  # Episodes, each represented as a dataset of steps.
  episodes = serialized_ds.map(
      _to_steps, num_parallel_calls=tf.data.experimental.AUTOTUNE)

  # Sequences drawn from those episodes.
  return episodes.interleave(_to_sequences)

## Experiment

In [None]:
# task                            | episode length | run
# ----------------------------------------------------------------------------
# seekavoid_arena_01              | 301            | training_{0..2}
# seekavoid_arena_01              | 301            | snapshot_{0..1}_eps_0.0
# seekavoid_arena_01              | 301            | snapshot_{0..1}_eps_0.01
# seekavoid_arena_01              | 301            | snapshot_{0..1}_eps_0.1
# seekavoid_arena_01              | 301            | snapshot_{0..1}_eps_0.25
# explore_object_rewards_few      | 1351           | training_{0..2}
# explore_object_rewards_many     | 1801           | training_{0..2}
# rooms_select_nonmatching_object | 181            | training_{0..2}
# rooms_watermaze                 | 1801           | training_{0..2}

# Episode length of every task listed in the table above. EPISODE_LENGTH is
# looked up from here so that it cannot drift out of sync with TASK.
_EPISODE_LENGTHS = {
    'seekavoid_arena_01': 301,
    'explore_object_rewards_few': 1351,
    'explore_object_rewards_many': 1801,
    'rooms_select_nonmatching_object': 181,
    'rooms_watermaze': 1801,
}

# Change TASK / RUN to any combination from the table above.
TASK = 'seekavoid_arena_01'
RUN = 'training_0'
EPISODE_LENGTH = _EPISODE_LENGTHS[TASK]
BATCH_SIZE = 1
DATASET_PATH = f'gs://rl_unplugged/dmlab/{TASK}/{RUN}'

In [None]:
# Environment for the selected task. Every agent action is repeated for two
# simulator steps (the class default is four).
environment = DeepMindLabEnvironment(TASK, action_repeats=2)

In [None]:
# Sequences of 120 steps whose starts are 40 steps apart within an episode.
dataset = make_dataset(DATASET_PATH, num_shards=500,
                       episode_length=EPISODE_LENGTH,
                       sequence_length=120,
                       sequence_shift=40)
# Group sequences into batches of BATCH_SIZE, dropping any incomplete batch.
dataset = dataset.padded_batch(BATCH_SIZE, drop_remainder=True)

### Learning

In [None]:
# Create network.
def process_observations(x):
  return x._replace(observation=tf.image.convert_image_dtype(x.observation, tf.float32))

environment_spec = specs.make_environment_spec(environment)
num_actions = environment_spec.actions.maximum + 1
network = snt.DeepRNN([
    process_observations,
    networks.R2D2AtariNetwork(num_actions=num_actions)
])
tf_utils.create_variables(network, [environment_spec.observations])

In [None]:
# Create a logger.
logger = loggers.TerminalLogger(label='learner', time_delta=1.)

# Create the R2D2 learner.
learner = r2d2.R2D2Learner(
    environment_spec=environment_spec,
    network=network,
    target_network=copy.deepcopy(network),
    discount=0.99,
    learning_rate=1e-4,
    importance_sampling_exponent=0.2,
    target_update_period=100,
    burn_in_length=0,
    sequence_length=120,
    store_lstm_state=False,
    dataset=dataset,
    logger=logger)

In [None]:
# Run five learner updates. This is only enough to exercise the pipeline;
# increase the count for actual training.
for _ in range(5):
  learner.step()

### Evaluation

In [None]:
# Create a logger.
logger = loggers.TerminalLogger(label='evaluator', time_delta=1.)

# Create evaluation loop.
eval_network = snt.DeepRNN([
    network,
    lambda q: trfl.epsilon_greedy(q, epsilon=0.4**8).sample(),
])
eval_loop = environment_loop.EnvironmentLoop(
    environment=environment,
    actor=actors.RecurrentActor(policy_network=eval_network),
    logger=logger)

In [None]:
# Run two evaluation episodes in the live environment.
eval_loop.run(2)