In [3]:
import os
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt
from sarp.utils import load_expert_data_hospital, separate_train_test, combine_nets

In [4]:
# Ask TensorFlow which GPUs it can see. When at least one is present, the
# first GPU is restricted to a single virtual device with a 5120 MB cap.
gpus = tf.config.experimental.list_physical_devices("GPU")
if gpus:
    try:
        _gpu_memory_cap = tf.config.experimental.VirtualDeviceConfiguration(
            memory_limit=5120
        )
        tf.config.experimental.set_virtual_device_configuration(
            gpus[0], [_gpu_memory_cap]
        )
    except RuntimeError as err:
        # Report the problem and carry on rather than aborting the notebook.
        print(err)

# Directory containing this notebook; the data and model paths used later are
# built relative to it.
currn_dir = os.path.dirname(
    os.path.abspath("2_sarp_repair.ipynb")
)

This notebook provides an example of policy repair using SARP for robot navigation in a hospital. It assumes that a pre-trained policy and a predictive model are already available. To pre-train a policy for this example, run [0_pretrain_policy.py](0_pretrain_policy.py); to train a predictive model, run [1_pretrain_predictive_model.py](1_pretrain_predictive_model.py). The models are described below:
- policy - input: the system state, which includes the robot's goal, the distance and heading toward the goal, and range sensor readings - output: linear and angular velocities.
- predictive model - input: states and actions - output: collision [0, 1] or no collision [1, 0].

#### Load dataset
First, we load the expert demonstrations for repair.

In [5]:
# Load the expert demonstrations used for repair.
# The path is assembled with os.path.join instead of "/"-concatenation of a
# placeholder-less f-string.
data_dir = os.path.join(currn_dir, "data", "expert_data")

# The number of samples is taken to be the number of entries in data_dir.
# NOTE(review): every directory entry is counted, so a stray file would
# inflate the count -- confirm the folder holds only sample files.
num_samples = len(os.listdir(data_dir))

# The third value returned by the loader is not needed in this notebook.
# NOTE: `property` shadows the Python built-in of the same name; the name is
# kept unchanged because other cells may rely on it.
state, action, _, property = load_expert_data_hospital(data_dir, num_samples)


def _as_float32_tensors(arrays):
    """Convert every element of ``arrays`` into a float32 TensorFlow tensor."""
    return [tf.convert_to_tensor(arr, dtype=tf.float32) for arr in arrays]


state = _as_float32_tensors(state)
action = _as_float32_tensors(action)
property = _as_float32_tensors(property)

# Hold out 20% of the data for testing.
train_data, test_data = separate_train_test([state, action, property], test_ratio=0.2)

state_train, action_train, property_train = train_data
state_test, action_test, property_test = test_data

loading sample 1, goal: [10. 10.]
loading sample 2, goal: [10. 10.]
loading sample 3, goal: [10. 10.]
loading sample 4, goal: [10. 10.]
loading sample 5, goal: [10. 10.]
loading sample 6, goal: [10. 10.]
loading sample 7, goal: [10. 10.]
loading sample 8, goal: [10. 10.]
loading sample 9, goal: [10. 10.]
loading sample 10, goal: [10. 10.]
loading sample 11, goal: [-10.   5.]
loading sample 12, goal: [-10.   5.]
loading sample 13, goal: [-10.   5.]
loading sample 14, goal: [-10.   5.]
loading sample 15, goal: [-10.   5.]
loading sample 16, goal: [-10.   5.]
loading sample 17, goal: [-10.   5.]
loading sample 18, goal: [-10.   5.]
loading sample 19, goal: [-10.   5.]
loading sample 20, goal: [-10.   5.]
loading sample 21, goal: [10.  5.]
loading sample 22, goal: [10.  5.]
loading sample 23, goal: [10.  5.]
loading sample 24, goal: [10.  5.]
loading sample 25, goal: [10.  5.]
loading sample 26, goal: [10.  5.]
loading sample 27, goal: [10.  5.]
loading sample 28, goal: [10.  5.]
loading s

2023-07-04 15:17:23.614112: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-07-04 15:17:24.224483: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 5120 MB memory:  -> device: 0, name: Quadro RTX 8000, pci bus id: 0000:d5:00.0, compute capability: 7.5


#### Load and merge models
Here, we load the policy and predictive models, then chain them in series (the policy's output feeds the predictive model) so that the combined model can be used in repair.

In [8]:
# Load the pre-trained policy and predictive models.
# Paths are assembled with os.path.join instead of "/"-concatenation of
# placeholder-less f-strings.
model_policy_orig = keras.models.load_model(
    os.path.join(currn_dir, "trained_models", "policy", "model")
)
model_predictive = keras.models.load_model(
    os.path.join(currn_dir, "trained_models", "predictive_model", "model")
)

# Combine the two networks into a single model.
model_combined = combine_nets(model_policy_orig, model_predictive)

# Only layers whose name has "policy" before the first underscore remain
# trainable; every other layer is frozen.
for layer in model_combined.layers:
    layer.trainable = layer.name.split("_")[0] == "policy"

# Run a single sample through the combined model. It yields two outputs,
# both of which are discarded here.
_, _ = model_combined.predict(state[0][0:1])

Model: "repair_model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 policy_layer_1 (Dense)      multiple                  3840      
                                                                 
 policy_layer_2 (Dense)      multiple                  65792     
                                                                 
 policy_layer_3 (Dense)      multiple                  514       
                                                                 
 Predictive_layer_1 (Dense)  multiple                  4352      
                                                                 
 Predictive_layer_2 (Dense)  multiple                  65792     
                                                                 
 Predictive_layer_3 (Dense)  multiple                  2570      
                                                                 
 Predictive_layer_4 (Dense)  multiple               

#### Define the optimization parameters

We first define the loss function, the Lagrangian penalty terms, and the quadratic penalty terms. We assume two constraints:
1. Constraint on linear velocity: $v\leq0.9$
2. Constraint on the output collision property: $\psi = [1,0]$

In [12]:
# Optimization settings: step size passed to the optimizer.
learning_rate = 5e-5

In [11]:
def original_loss(y_true, y_pred):
    """Return the mean of the element-wise squared difference of the inputs."""
    squared_error = tf.square(y_true - y_pred)
    return tf.reduce_mean(squared_error)

def col_penalty(y):
    """Quadratic penalty: sum over rows of the squared column-1 entries of ``y``."""
    second_column = y[:, 1]
    return tf.reduce_sum(tf.square(second_column))

def col_lagrangian(y):
    """Linear (Lagrangian) term: sum over rows of the column-1 entries of ``y``."""
    second_column = y[:, 1]
    return tf.reduce_sum(second_column)

def vel_penalty(y):
    """Quadratic penalty on how far column 0 of ``y`` exceeds 0.9.

    Entries at or below 0.9 contribute nothing; the squared excess of the
    remaining entries is summed over rows.
    """
    excess = tf.nn.relu(y[:, 0] - 0.9)
    return tf.reduce_sum(tf.square(excess))

def vel_lagrangian(y):
    """Linear (Lagrangian) term: summed excess of column 0 of ``y`` over 0.9.

    Entries at or below 0.9 contribute nothing.
    """
    excess = tf.nn.relu(y[:, 0] - 0.9)
    return tf.reduce_sum(excess)

def augmented_loss(s, a, params):
    """Augmented loss evaluated on the combined model.

    ``model_combined(s)`` yields the predicted action and the predicted
    property. The result is the action loss (weighted by 100) plus, for each
    of the two constraints, a linear term and a quadratic term.

    Args:
        s: input passed to ``model_combined``.
        a: target action compared against the predicted action.
        params: indexable with four entries -- ``params[0]`` scales the
            property linear term, ``params[1] / 2`` the property quadratic
            term, ``params[2]`` the velocity linear term and
            ``params[3] / 2`` the velocity quadratic term.

    Returns:
        The scalar augmented loss.
    """
    a_pred, p_pred = model_combined(s)

    imitation_term = 100 * original_loss(a, a_pred)
    col_linear_term = params[0] * col_lagrangian(p_pred)
    col_quadratic_term = params[1] / 2 * col_penalty(p_pred)
    vel_linear_term = params[2] * vel_lagrangian(a_pred)
    vel_quadratic_term = params[3] / 2 * vel_penalty(a_pred)

    # Linear terms are subtracted, quadratic terms are added.
    return (
        imitation_term
        - col_linear_term
        + col_quadratic_term
        - vel_linear_term
        + vel_quadratic_term
    )

Now we define the optimizer and the policy update step.

In [None]:
# Adam optimizer using the learning rate configured earlier in the notebook.
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)


@tf.function
def train_step(s, a, params):
    """Perform one optimizer update of the combined model.

    Evaluates ``augmented_loss(s, a, params)`` under a gradient tape,
    differentiates it with respect to the trainable variables of
    ``model_combined`` and applies the gradients with ``optimizer``.

    Returns:
        The loss value computed before the update was applied.
    """
    weights = model_combined.trainable_variables
    with tf.GradientTape() as tape:
        loss_value = augmented_loss(s, a, params)
    gradients = tape.gradient(loss_value, weights)
    optimizer.apply_gradients(zip(gradients, weights))
    return loss_value