In [1]:
import math
import time
from pathlib import Path
from typing import Dict, Tuple, Union

import numpy as np
import tensorflow as tf
from gym import spaces
from stable_baselines import PPO2
from stable_baselines.common.callbacks import CheckpointCallback
from stable_baselines.common.policies import CnnLnLstmPolicy, LstmPolicy
from stable_baselines.common.policies import nature_cnn
from stable_baselines.common.vec_env import SubprocVecEnv, DummyVecEnv

from tesse.msgs import *
from tesse.msgs import Camera, Channels, Compression, DataRequest, DataResponse, ObjectsRequest, RemoveObjectsRequest
from tesse_gym import get_network_config
from tesse_gym.core.utils import set_all_camera_params
from tesse_gym.tasks.goseek import GoSeekFullPerception
from tesse_gym.tasks.goseek.goseek import GoSeek

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


# Update simulator class

In [2]:
# update expected observation shape
class GoSeekUpdatedResolution(GoSeekFullPerception):
    """ GoSeek task with 120x160 observations and a shaped reward.

    Overrides the observation space, the observation encoding, and the reward
    computation of `GoSeekFullPerception` for images of shape (120, 160).
    """

    # (height, width, channels) of the image part of one observation
    shape = (120, 160, 5)

    @property
    def observation_space(self) -> spaces.Box:
        """ Define the flat observation space: stacked image channels plus pose.

        Because Stable Baselines (the baseline PPO library) does not support dictionary
        spaces, the image channels and the pose vector are combined into one vector.
        `form_agent_observation` emits five (120, 160) channels followed by a (3,) pose,
        thus the total shape is (120 * 160 * 5 + 3,).
        """
        # The lower bound must be -inf; it was previously +inf, identical to the upper bound.
        # NOTE(review): models saved while the bounds were (+inf, +inf) recorded that space;
        # stable-baselines compares spaces when an env is attached, so such checkpoints may
        # be rejected on load — confirm before resuming from old files.
        return spaces.Box(-np.inf, np.inf, shape=(120 * 160 * 5 + 3,))

    def form_agent_observation(self, tesse_data: DataResponse) -> np.ndarray:
        """ Create the agent's observation from a TESSE data response.

        The RGB, segmentation and depth images are first normalised and stacked, then
        re-encoded into five channels: grayscale, segmentation, a `segmentation == 1.0`
        channel, a second segmentation-derived channel, and depth.

        Args:
            tesse_data (DataResponse): TESSE DataResponse object containing
                RGB, segmentation, and depth images (unpacked in that order).

        Returns:
            np.ndarray: Flat vector of length 120 * 160 * 5 + 3 holding the five
                flattened channels followed by the relative pose vector.
        """
        eo, seg, depth = tesse_data.images
        seg = seg[..., 0].copy()  # keep only the first segmentation channel

        # See WALL_CLS comment
        seg[seg > (self.N_CLASSES - 1)] = self.WALL_CLS
        observation = np.concatenate(
            (
                eo / 255.0,
                seg[..., np.newaxis] / (self.N_CLASSES - 1),
                depth[..., np.newaxis],
            ),
            axis=-1,
        ).reshape(-1)
        pose = self.get_pose().reshape((3))

        img_shape = (-1, 120, 160, 5)
        imgs = observation.reshape(img_shape)

        # Collapse RGB to a single luminance channel with the usual 0.2989/0.5870/0.1140 weights.
        rgb = imgs[..., :3]
        gray = np.dot(rgb, np.array([0.2989, 0.5870, 0.1140]).reshape(-1, 1))

        segmentation = np.expand_dims(imgs[..., 3], axis=3)
        # The array handed to masked_values is the boolean `segmentation == 1.0`.
        # NOTE(review): np.concatenate is documented as not preserving input masks, so this
        # channel presumably ends up as that boolean data and the next one as a second copy
        # of `segmentation` — confirm this is the intended encoding.
        masked_fruit = np.ma.masked_values(segmentation == 1.0, segmentation)
        masked_furniture = np.ma.masked_inside(segmentation, 0.5, 1.0)

        depth = np.expand_dims(imgs[..., 4], axis=3)

        imgs = np.concatenate((gray, segmentation, masked_fruit, masked_furniture, depth), axis=-1)
        # Flattening to exactly H * W * C elements only succeeds for a single frame.
        observation = imgs.reshape((img_shape[1] * img_shape[2] * img_shape[3]))
        return np.concatenate((observation, pose))

    def compute_reward(
            self, observation: DataResponse, action: int
    ) -> Tuple[float, Dict[str, Union[int, bool]]]:
        """ Compute reward.

        With R = `target_found_reward`, the reward consists of:
            - A time penalty of 0.01 * R on every step.
            - A penalty of 0.01 * R when more than 7000 pixels are within 1.0 m.
            - A penalty of 0.02 * R when `action` == 3 and fewer than 90 pixels of the
                decoded segmentation channel equal 1.0.
            - When `action` == 3 and targets remain: if targets are found,
                R * n_found + 0.02 * R * (total targets found so far)
                + 0.02 * R * (distance given by the first two pose components);
                otherwise a penalty of 0.02 * R.
            - A penalty of 0.02 * R on collision.

        Which targets count as found is decided by `get_found_targets`, which is not
        defined in this class.

        Args:
            observation (DataResponse): TESSE DataResponse object containing images
                and metadata.
            action (int): Action taken by agent.

        Returns:
            Tuple[float, dict[str, [bool, int]]
                Reward
                Dictionary with the following keys
                    - env_changed: True if agent changed the environment.
                    - collision: True if there was a collision
                    - n_found_targets: Number of targets found during step.
        """
        targets = self.env.request(ObjectsRequest())

        # If not in ground truth mode, metadata will only provide position estimates
        # In that case, get ground truth metadata from the controller
        agent_metadata = (
            observation.metadata
            if self.ground_truth_mode
            else self.continuous_controller.get_broadcast_metadata()
        )
        reward_info = {"env_changed": False, "collision": False, "n_found_targets": 0}

        # compute agent's distance from targets
        agent_position = self._get_agent_position(agent_metadata)
        target_ids, target_position = self._get_target_id_and_positions(
            targets.metadata
        )

        reward = -0.01 * self.target_found_reward  # small time penalty

        # Re-encode the raw response and split it back into channels.
        _, segmentation, depth, _ = self.extract_img(self.form_agent_observation(observation))

        # penalty for being too close to objects
        far_clip_plane = 50
        depth *= far_clip_plane  # convert depth to meters
        # The array handed to masked_values is the boolean `depth <= 1.0`.
        # NOTE(review): np.count_nonzero presumably ignores the mask and counts those
        # True entries (pixels within 1.0 m) — confirm.
        masked_depth = np.ma.masked_values(depth <= 1.0, depth)
        if np.count_nonzero(masked_depth) > 7000:
            reward -= self.target_found_reward * 0.01

        # penalty for a collect action (3) when few pixels have segmentation == 1.0
        # (presumably the target class — confirm against N_CLASSES)
        masked_fruit = np.ma.masked_values(segmentation == 1.0, segmentation)
        size_masked_fruit = np.count_nonzero(masked_fruit)
        if action == 3 and size_masked_fruit < 90:
            reward -= self.target_found_reward * 0.02

        # check for found targets
        if target_position.shape[0] > 0 and action == 3:
            found_targets = self.get_found_targets(
                agent_position, target_position, target_ids, agent_metadata
            )

            # if targets are found, update reward and related episode info
            if len(found_targets):
                self.n_found_targets += len(found_targets)
                relative_pose = self.get_pose()
                # bonus proportional to the distance given by the first two pose components
                fruit_position_bonus = math.sqrt(
                    relative_pose[0] ** 2 + relative_pose[1] ** 2) * self.target_found_reward * 0.02
                reward += self.target_found_reward * len(found_targets) + \
                          self.n_found_targets * self.target_found_reward * 0.02 + fruit_position_bonus

                self.env.request(RemoveObjectsRequest(ids=found_targets))
                reward_info["env_changed"] = True
                reward_info["n_found_targets"] += len(found_targets)

                # if all targets have been found, restart the episode
                if self.n_found_targets == self.n_targets:
                    self.done = True
            else:
                reward -= self.target_found_reward * 0.02

        self.steps += 1
        if self.steps > self.episode_length:
            self.done = True

        # collision information isn't provided by the controller metadata
        if self._collision(observation.metadata):
            reward_info["collision"] = True
            reward -= self.target_found_reward * 0.02

            if self.restart_on_collision:
                self.done = True

        return reward, reward_info

    def extract_img(
            self, observation: np.ndarray, img_shape: Tuple[int, int, int, int] = (-1, 120, 160, 5)
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
        """ Split one flat observation vector into image channels and pose.

        Args:
            observation (np.ndarray): Flat observation for a single frame: flattened
                image channels followed by a 3-element pose. A leading batch axis is added here.
            img_shape (Tuple[int, int, int, int]): Shape (N, H, W, C) the image part is
                reshaped to. Default value is (-1, 120, 160, 5).

        Returns:
            Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: In order
                - Channels 0-2, shape (N, H, W, 3). For vectors produced by
                    `form_agent_observation` these are not RGB.
                - Channel 3, shape (N, H, W).
                - Channel 4 (depth), shape (N, H, W).
                - Pose array of shape (1, 3): the last three elements of the vector.
        """
        observation = np.expand_dims(observation, axis=0)
        imgs = observation[:, :-3].reshape(img_shape)
        rgb = imgs[..., :3]
        segmentation = imgs[..., 3]
        depth = imgs[..., 4]

        pose = observation[:, -3:]

        return rgb, segmentation, depth, pose

    def getTriangle(self, x, y, tetha):
        """ Build a triangle from (x, y) and two points `success_dist` away.

        The two far points lie at headings `tetha` +/- `CAMERA_HFOV` / 2 (degrees),
        using sin for the x offset and cos for the y offset.

        NOTE(review): `Polygon` is not imported in the visible notebook
        (presumably shapely.geometry.Polygon) — confirm before using this method.
        """
        alpha = self.CAMERA_HFOV
        radius = self.success_dist
        x1 = x + (math.sin(math.radians(tetha + (alpha / 2))) * radius)
        y1 = y + (math.cos(math.radians(tetha + (alpha / 2))) * radius)
        x2 = x + (math.sin(math.radians(tetha - (alpha / 2))) * radius)
        y2 = y + (math.cos(math.radians(tetha - (alpha / 2))) * radius)
        return Polygon([(x, y), (x1, y1), (x2, y2)])

    def getSquare(self):
        """ Return the area of the union of triangles built from `self.positions`.

        NOTE(review): neither `self.positions` nor `unary_union` is defined in the
        visible notebook (presumably shapely.ops.unary_union) — confirm before use.
        """
        polygons = []
        for ar in self.positions:
            poly = self.getTriangle(ar.item(0), ar.item(1), ar.item(2))
            polygons.append(poly)

        result = unary_union(polygons)
        return result.area


In [3]:
# update simulator cameras on init
def set_resolution(tesse_gym):
    """ Set every simulator camera to 120x160 pixels (height x width).

    NOTE(review): no cell in the visible notebook passes this function to the
    environment — confirm how it is meant to be invoked.

    Args:
        tesse_gym: Environment object forwarded to `set_all_camera_params`.
    """
    camera_size = {"height_in_pixels": 120, "width_in_pixels": 160}
    set_all_camera_params(tesse_gym, **camera_size)




# Configuration

#### Set sim path

In [4]:
# Path to the GoSeek simulator executable, relative to this notebook.
filename = Path("../..", "goseek-challenge", "simulator", "goseek-v0.1.4.x86_64")
assert filename.exists(), "Must set a valid path!"

#### Set environment parameters


__Note__ To minimize training time during initial use, we've set `total_timesteps` and `n_environments` to 1e5 and 2 respectively. Setting `total_timesteps` to 3e6 and `n_environments` to 4 should produce an agent that approximates our baseline.

In [5]:
# --- training budget ---
n_environments = 2           # parallel simulator instances to train over
total_timesteps = 100_000    # training timesteps passed to model.learn

# --- task setup ---
# One scene per environment rank; the environment factory indexes this list by rank.
scene_id = [1, 2, 3, 4, 5, 5]
n_targets = 30               # targets spawned in each scene
target_found_reward = 3      # reward per found target
episode_length = 400         # step limit used by the environment and the rollout loop



def make_unity_env(filename, num_env):
    """ Create a vectorized set of GoSeek environments, one subprocess each.

    Args:
        filename: Path to the simulator executable (converted with `str`).
        num_env (int): Number of environments to create. Environment `rank`
            uses `get_network_config(worker_id=rank)` and the scene
            `scene_id[rank % len(scene_id)]`, so the scene list is reused
            cyclically when there are more environments than scenes.

    Returns:
        SubprocVecEnv: Vectorized environment wrapping `num_env` instances of
            `GoSeekUpdatedResolution`.

    NOTE(review): `set_resolution` is not passed to the environment here —
    confirm how the simulator cameras are switched to 120x160.
    """

    def make_env(rank):
        # `rank` is bound per factory call, so each thunk keeps its own value.
        def _thunk():
            env = GoSeekUpdatedResolution(
                str(filename),
                network_config=get_network_config(worker_id=rank),
                n_targets=n_targets,
                episode_length=episode_length,
                # Wrap around instead of raising IndexError inside the worker process.
                scene_id=scene_id[rank % len(scene_id)],
                target_found_reward=target_found_reward,
            )

            return env

        return _thunk

    return SubprocVecEnv([make_env(i) for i in range(num_env)])

#### Launch environments.

In [6]:
# Build the vectorized training environment: `n_environments` simulator-backed
# environments wrapped in a SubprocVecEnv by `make_unity_env`.
env = make_unity_env(filename, n_environments)

# Define the Model 

The following network assumes an observation consisting of RGB, segmentation, and depth images along with the agent's relative pose from start. Images are processed using the Stable Baselines default CNN. The resulting feature vector is concatenated with the pose vector and given to an LSTM.

In [7]:
import tensorflow as tf
from stable_baselines.common.policies import nature_cnn

#### Define network to consume images and pose

In [8]:
def decode_tensor_observations(observation, img_shape=(-1, 120, 160, 5)):
    """ Split a batch of flat observations into an image tensor and a pose tensor.

    Args:
        observation (tf.Tensor): Batch of flat observations, one row per sample.
            Each row is the flattened image channels followed by 3 pose values.
        img_shape (Tuple[int, int, int, int]): Target (N, H, W, C) shape for the
            image part. Default value is (-1, 120, 160, 5).

    Returns:
        Tuple[tf.Tensor, tf.Tensor]:
            - Image tensor reshaped to `img_shape`.
            - Pose tensor holding the last 3 values of every row.
    """
    pose_len = 3
    image_stack = tf.reshape(observation[:, :-pose_len], img_shape)
    relative_pose = observation[:, -pose_len:]
    return image_stack, relative_pose

In [9]:
def image_and_pose_network(observation, **kwargs):
    """ Feature extractor combining CNN image features with the pose vector.

    The image part of the observation goes through the Stable Baselines
    `nature_cnn`; the pose values are appended to the resulting feature vector.
    The LSTM that consumes these features is configured on the policy.

    Args:
        observation (tf.Tensor): Batch of flat observations containing image
            and pose data.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        tf.Tensor: Image features concatenated with the pose along the last axis.
    """
    image_stack, relative_pose = decode_tensor_observations(observation)
    return tf.concat((nature_cnn(image_stack), relative_pose), axis=-1)

In [10]:
from stable_baselines.common.tf_layers import conv, linear, conv_to_fc, lstm
def image_and_pose_network2(observation, **kwargs):
    """ Wider variant of `image_and_pose_network` with a hand-built CNN.

    Three ReLU conv layers (64, 128, 128 filters) followed by a 1024-unit ReLU
    dense layer process the images; the pose is appended to the result.

    Args:
        observation (tf.Tensor): Batch of flat observations containing image
            and pose data.
        **kwargs: Forwarded to every `conv` layer.

    Returns:
        tf.Tensor: Image features concatenated with the pose along the last axis.
    """
    # (scope, n_filters, filter_size, stride) for each conv layer, in order
    conv_specs = (("c1", 64, 8, 4), ("c2", 128, 4, 2), ("c3", 128, 3, 1))

    features, relative_pose = decode_tensor_observations(observation)
    for scope, n_filters, filter_size, stride in conv_specs:
        features = tf.nn.relu(
            conv(features, scope, n_filters=n_filters, filter_size=filter_size,
                 stride=stride, init_scale=np.sqrt(2), **kwargs)
        )

    flat_features = conv_to_fc(features)
    image_features = tf.nn.relu(linear(flat_features, 'fc1', n_hidden=1024, init_scale=np.sqrt(2)))
    return tf.concat((image_features, relative_pose), axis=-1)

#### Register custom network

Outputs of the network defined above will be fed into an LSTM defined below in PPO2.

In [11]:
# Extra policy arguments handed to PPO2 below; selects `image_and_pose_network`
# as the policy's `cnn_extractor`.
policy_kwargs = {'cnn_extractor': image_and_pose_network}

In [12]:
class CustomCnnLnLstmPolicy(LstmPolicy):
    """ Layer-normalized LSTM policy with CNN feature extraction.

    Same constructor as `LstmPolicy`, but `n_lstm` defaults to 1024 and
    `layer_norm=True` / `feature_extraction="cnn"` are always set.
    """

    def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm=1024, reuse=False, **_kwargs):
        super().__init__(
            sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm, reuse,
            layer_norm=True,
            feature_extraction="cnn",
            **_kwargs
        )

In [13]:
# PPO2 hyperparameters, grouped so they can be tuned in one place.
ppo_hyperparameters = dict(
    n_steps=128,
    nminibatches=2,
    cliprange=0.2,
    gamma=0.999,
    noptepochs=3,
    learning_rate=0.0003,
)

# Build the agent with the custom LSTM policy and the image+pose feature extractor.
model = PPO2(
    CustomCnnLnLstmPolicy,
    env,
    verbose=1,
    tensorboard_log="./tensorboard/",
    policy_kwargs=policy_kwargs,
    **ppo_hyperparameters,
)







Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where




# Train the Model

#### Define logging directory and callback function to save checkpoints

In [14]:
# Create the logging directory (and any missing parents); a no-op if it already exists.
# NOTE(review): `log_dir` is not referenced by the later cells shown here — the
# checkpoint callbacks write to './results/' directly.
log_dir = Path("results/goseek-ppo")
log_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Optionally resume from a saved checkpoint; this replaces the freshly built
# `model` and attaches the current `env` to it.
model = PPO2.load(
    "results/ppo2-newhyper4600k.pkl",
    env,
    verbose=1,
    tensorboard_log="./tensorboard/",
)

In [None]:
from stable_baselines.common.callbacks import CallbackList, CheckpointCallback
# Checkpoint callback with save_freq=3000, writing files prefixed 'ppo2-5-NR5' to ./results/.
# NOTE(review): the training cell that follows rebinds `checkpoint_callback` before
# calling `model.learn`, so as written this instance is not the one used for training.
checkpoint_callback = CheckpointCallback(save_freq=3000, save_path='./results/',
                                         name_prefix='ppo2-5-NR5')


In [15]:
from stable_baselines.common.callbacks import CheckpointCallback
# Checkpoint callback with save_freq=1000, writing files with the prefix below to ./results/.
checkpoint_callback = CheckpointCallback(save_freq=1000, save_path='./results/',
                                         name_prefix='ppo2-stalin-bigmem-newhyper8')
# Train for `total_timesteps` steps. reset_num_timesteps=False is passed so the
# timestep counter is presumably continued rather than restarted (e.g. when
# resuming from a loaded model) — confirm against the stable-baselines docs.
model.learn(total_timesteps=total_timesteps,  callback=checkpoint_callback, reset_num_timesteps=False)




EOFError: 

In [None]:
# Persist the trained weights.
model.save("results/ppo2-newhyper4700k.pkl")

# Visualize Results

__Note__: Stable-Baselines requires that policy input dimensions be consistent across training and testing. Thus, the number of environments used for training must be a multiple of the number of environments used for visualization. The observation batch is then duplicated to fill the training batch size during inference.

In [None]:
%matplotlib notebook
import matplotlib.pyplot as plt

#### Load model

In [None]:
MODEL_WEIGHTS_PATH = "./results/ppo2-newhyper.pkl"
# This only checks that the path string is non-empty; it does not check the file exists.
assert MODEL_WEIGHTS_PATH, f"Must give a model weights path!"

# Loading is commented out, so the `model` already in memory is used below
# and MODEL_WEIGHTS_PATH is not read.
#model = PPO2.load(str(MODEL_WEIGHTS_PATH))
# First dimension of the policy's initial LSTM state; presumably the number of
# environments the model was built for — confirm.
n_train_envs = model.act_model.initial_state.shape[0]

#### Visualize all observed images

In [None]:
# Reset all environments and decode the first batch of observations.
obs = env.reset()
# NOTE(review): `decode_observations` is not defined or imported in the visible
# notebook (presumably tesse_gym.tasks.goseek.decode_observations) — confirm it is
# in scope and is given the 120x160 image shape. Observations built by
# `GoSeekUpdatedResolution.form_agent_observation` do not store RGB in channels 0-2.
rgb, segmentation, depth, pose = decode_observations(obs)
lstm_state = None  # no recurrent state before the first prediction

# The rollout loop tiles `obs` (n_train_envs // n_vis_envs) times, so the training
# environment count must be divisible by the visualization environment count.
assert (
    n_train_envs % obs.shape[0] == 0
), "The number of training environments must be a multiple of the number of visualization environments"

In [None]:
import sys
import numpy
# Print the complete first segmentation image. The print threshold is raised only
# inside this block so later cells keep NumPy's default truncated output.
with np.printoptions(threshold=sys.maxsize):
    print(segmentation[0])

# Show the three decoded image groups of the first environment side by side.
fig, ax = plt.subplots(1, 3)
ax[0].imshow(rgb[0])
ax[0].set_title("rgb")
ax[1].imshow(segmentation[0])
ax[1].set_title("segmentation")
ax[2].imshow(depth[0])
ax[2].set_title("depth")

#### Run an episode and plot the first person agent view

In [None]:
# Roll out `episode_length` steps and redraw each environment's view after every step.
# NOTE(review): `done` is recorded but never used to stop the loop.
done = False
fig, ax = plt.subplots(1, obs.shape[0])
ax = [ax] if obs.shape[0] == 1 else ax  # always index `ax` as a sequence

for step in range(episode_length):
    # Tile the observations so the batch matches the size the policy was built for.
    actions, lstm_state = model.predict(
        np.concatenate((n_train_envs // obs.shape[0]) * [obs]),
        state=lstm_state,
        deterministic=False,
    )

    # Keep only the actions for the environments that actually exist.
    actions = actions[: obs.shape[0]]
    obs, reward, done, _ = env.step(actions)

    # NOTE(review): `decode_observations` is not defined in the visible notebook —
    # confirm it is in scope and given the 120x160 image shape.
    rgb, segmentation, depth, pose = decode_observations(obs)

    for env_idx in range(obs.shape[0]):
        # Clear every axis before drawing so images do not pile up on any of them.
        ax[env_idx].cla()
        ax[env_idx].imshow(rgb[env_idx])
    fig.canvas.draw()

# Reset the environments and the recurrent state so another rollout starts cleanly.
obs = env.reset()
# NOTE(review): `decode_observations` is not defined in the visible notebook — confirm it is in scope.
rgb, segmentation, depth, pose = decode_observations(obs)
lstm_state = None