In [1]:
import os
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt
from sarp.utils import load_expert_data_hospital, separate_train_test, combine_nets, mini_batch

In [2]:
# Restrict TensorFlow to a fixed slice of memory on the first visible GPU by
# creating one virtual device on it. Nothing is configured when no GPU is found.
gpus = tf.config.experimental.list_physical_devices("GPU")
if gpus:
    try:
        # memory_limit=5120 caps the virtual device (TensorFlow documents this
        # value as megabytes, i.e. about 5 GB).
        tf.config.experimental.set_virtual_device_configuration(
            gpus[0],
            [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=5120)],
        )
    except RuntimeError as e:
        # Best effort: report the problem and continue with the default
        # device configuration (presumably raised when the GPU has already
        # been initialized -- see the TensorFlow GPU guide).
        print(e)

# abspath() resolves the notebook name against the current working directory,
# so this is the absolute path of the directory the kernel was started in.
# All data/model paths below are built relative to it.
currn_dir = os.path.dirname(os.path.abspath("2_sarp_repair.ipynb"))

This notebook provides an example of policy repair using SARP for robot navigation in a hospital. It assumes that a pre-trained policy and a predictive model are already available. To pre-train a policy for this example, run [0_pretrain_policy.py](0_pretrain_policy.py). To train a predictive model, run [1_pretrain_predictive_model.py](1_pretrain_predictive_model.py). The models are described below:
- policy - input: the system state, which includes the robot's goal, the distance and heading toward the goal, and range sensor readings - output: linear and angular velocities.
- predictive model - input: states and actions - output: collision [0, 1] or no collision [1, 0].

#### Load dataset
First, we load the expert demonstrations for repair.

In [3]:
# load the expert data
data_dir = currn_dir + f"/data/expert_data"
# The number of entries in the data directory is passed to the loader as the
# number of samples to load.
num_samples = len(os.listdir(data_dir))

# The loader returns four values; the third one is not used in this notebook.
# NOTE(review): `property` shadows the Python builtin of the same name for the
# rest of the notebook session.
state, action, _, property = load_expert_data_hospital(data_dir, num_samples)
# Convert every list element to a float32 tensor (each list keeps its length).
state = [tf.convert_to_tensor(s, dtype=tf.float32) for s in state]
action = [tf.convert_to_tensor(a, dtype=tf.float32) for a in action]
property = [tf.convert_to_tensor(p, dtype=tf.float32) for p in property]
# Split the three lists into train and test parts
# (presumably 20% is held out for testing -- confirm in sarp.utils).
train_data, test_data = separate_train_test([state, action, property], test_ratio=0.2)

# Each split is unpacked in the same [state, action, property] order it was passed in.
state_train, action_train, property_train = train_data
state_test, action_test, property_test = test_data

loading sample 1, goal: [10. 10.]
loading sample 2, goal: [10. 10.]
loading sample 3, goal: [10. 10.]
loading sample 4, goal: [10. 10.]
loading sample 5, goal: [10. 10.]
loading sample 6, goal: [10. 10.]
loading sample 7, goal: [10. 10.]
loading sample 8, goal: [10. 10.]
loading sample 9, goal: [10. 10.]
loading sample 10, goal: [10. 10.]
loading sample 11, goal: [-10.   5.]
loading sample 12, goal: [-10.   5.]
loading sample 13, goal: [-10.   5.]
loading sample 14, goal: [-10.   5.]
loading sample 15, goal: [-10.   5.]
loading sample 16, goal: [-10.   5.]
loading sample 17, goal: [-10.   5.]
loading sample 18, goal: [-10.   5.]
loading sample 19, goal: [-10.   5.]
loading sample 20, goal: [-10.   5.]
loading sample 21, goal: [10.  5.]
loading sample 22, goal: [10.  5.]
loading sample 23, goal: [10.  5.]
loading sample 24, goal: [10.  5.]
loading sample 25, goal: [10.  5.]
loading sample 26, goal: [10.  5.]
loading sample 27, goal: [10.  5.]
loading sample 28, goal: [10.  5.]
loading s

2023-07-04 21:16:59.290433: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


#### Load and merge models
Here, we load the policy and predictive models, then we merge them in a series fashion to be used in repair.

In [4]:
# Restore the pre-trained policy and predictive networks from disk.
model_policy_orig = keras.models.load_model(currn_dir + "/trained_models/policy/model")
model_predictive = keras.models.load_model(currn_dir + "/trained_models/predictive_model/model")

# Chain the two networks into the single model used for repair.
model_combined = combine_nets(model_policy_orig, model_predictive)

# Only layers whose name starts with the "policy" prefix remain trainable;
# every other layer (the predictive part) is frozen.
for layer in model_combined.layers:
    layer.trainable = layer.name.split("_")[0] == "policy"

# One forward pass on a single row of the first state tensor; both outputs are
# discarded (presumably this just builds the combined model -- confirm).
_, _ = model_combined.predict(state[0][0:1])

Model: "repair_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 policy_layer_1 (Dense)      multiple                  3840      
                                                                 
 policy_layer_2 (Dense)      multiple                  65792     
                                                                 
 policy_layer_3 (Dense)      multiple                  514       
                                                                 
 Predictive_layer_1 (Dense)  multiple                  4352      
                                                                 
 Predictive_layer_2 (Dense)  multiple                  65792     
                                                                 
 Predictive_layer_3 (Dense)  multiple                  2570      
                                                                 
 Predictive_layer_4 (Dense)  multiple                 

#### Define the optimization parameters

We first define the loss function, the Lagrangian penalty terms, and the quadratic penalty terms. We assume two constraints:
1. Constraint on linear velocity: $v\leq0.9$
2. Constraint on the output collision property: $\psi = [1,0]$

In [5]:
# Optimizer and training-loop settings.
learning_rate = 5e-5
batch_size = 32
epochs = 100

# Augmented-Lagrangian coefficients, one set per constraint, all stored as
# float32 scalars:
#   lambda_* : multiplier on the linear violation term
#   mu_*     : coefficient of the quadratic penalty term
#   eta_*    : step size used when the multiplier is updated
#   beta_*   : factor by which mu is multiplied at each update
params = {
    key: tf.constant(value, dtype="float32")
    for key, value in [
        ("lambda_collision", 0.0),
        ("mu_collision", 5.0),
        ("eta_collision", 0.01),
        ("beta_collision", 10),
        ("lambda_velocity", 0.0),
        ("mu_velocity", 10.0),
        ("eta_velocity", 0.0001),
        ("beta_velocity", 5.0),
    ]
}

# Stack the training lists along axis 0 and cut them into mini-batches.
batches = mini_batch(
    tf.concat(state_train, axis=0),
    tf.concat(action_train, axis=0),
    tf.concat(property_train, axis=0),
    batch_size,
)


In [6]:
def original_loss(y_true, y_pred):
    """Return the mean of the squared element-wise difference ``y_true - y_pred``."""
    residual = y_true - y_pred
    return tf.reduce_mean(tf.square(residual))

def col_penalty(y):
    """Quadratic collision penalty: sum of the squared entries of column 1 of ``y``."""
    collision_column = y[:, 1]
    return tf.reduce_sum(tf.square(collision_column))

def col_lagrangian(y):
    """Linear collision term: sum of the entries of column 1 of ``y``."""
    collision_column = y[:, 1]
    return tf.reduce_sum(collision_column)

def vel_penalty(y, v_max=0.9):
    """Quadratic penalty for exceeding the velocity bound.

    Args:
        y: 2-D batch of outputs; column 0 is compared against the bound.
        v_max: upper bound on column 0. Defaults to 0.9, the value that was
            previously hard-coded, so existing callers are unaffected.

    Returns:
        Sum over the batch of ``relu(y[:, 0] - v_max) ** 2``; entries at or
        below the bound contribute zero.
    """
    excess = tf.nn.relu(y[:, 0] - v_max)
    return tf.reduce_sum(tf.square(excess))

def vel_lagrangian(y, v_max=0.9):
    """Linear violation term for the velocity bound.

    Args:
        y: 2-D batch of outputs; column 0 is compared against the bound.
        v_max: upper bound on column 0. Defaults to 0.9, the value that was
            previously hard-coded, so existing callers are unaffected.

    Returns:
        Sum over the batch of ``relu(y[:, 0] - v_max)``; entries at or below
        the bound contribute zero.
    """
    excess = tf.nn.relu(y[:, 0] - v_max)
    return tf.reduce_sum(excess)

def augmented_loss(
    s, a, params
):
    """Evaluate the augmented training objective on one batch.

    The combined model is run on ``s``; its first output is compared with the
    reference actions ``a`` and both outputs feed the constraint terms.

    Args:
        s: batch of states passed to ``model_combined``.
        a: reference actions compared against the first model output.
        params: mapping providing ``lambda_collision``, ``mu_collision``,
            ``lambda_velocity`` and ``mu_velocity``.

    Returns:
        A 4-tuple ``(loss_value, imitation, col_violation, vel_violation)``:
        the full objective, the unweighted ``original_loss``, and the linear
        collision and velocity violation terms.

    NOTE(review): the multiplier terms enter with a minus sign here, while the
    training loop increases each lambda by ``eta * violation``. If the
    violations are non-negative, a growing lambda makes the linear terms
    reward violation -- confirm this sign convention is intended.
    """
    a_pred, p_pred = model_combined(s)

    # Each of these is needed both inside the objective and in the returned
    # tuple; evaluate them once instead of twice.
    imitation = original_loss(a, a_pred)
    col_violation = col_lagrangian(p_pred)
    vel_violation = vel_lagrangian(a_pred)

    loss_value = (
        100 * imitation
        - params["lambda_collision"] * col_violation
        + params["mu_collision"] / 2 * col_penalty(p_pred)
        - params["lambda_velocity"] * vel_violation
        + params["mu_velocity"] / 2 * vel_penalty(a_pred)
    )
    return loss_value, imitation, col_violation, vel_violation

Now we define the optimizer and the policy update step.

In [7]:
class LearningRateScheduler():
    """Exponentially decay an optimizer's learning rate once per batch.

    After ``k`` calls to :meth:`on_batch_end` the learning rate is
    ``lr * decay ** k``.

    The optimizer to update is either passed directly via ``optimizer`` or
    taken from ``model.optimizer`` after a model has been attached with
    :meth:`set_model` (or by assigning ``self.model``). Previously
    ``self.model`` was never initialized, so the scheduler could not be used
    without patching the attribute in from outside.

    Args:
        lr: initial learning rate.
        decay: multiplicative factor applied once per batch.
        patience: stored for callers; not used by this class.
        optimizer: optional object exposing a writable ``lr`` attribute.
    """
    def __init__(self, lr, decay, patience, optimizer=None):
        self.lr = lr
        self.patience = patience
        self.decay = decay
        self.iterations = 0
        self.optimizer = optimizer
        self.model = None

    def set_model(self, model):
        """Attach a model whose ``optimizer`` is updated when no optimizer was given."""
        self.model = model

    def on_batch_end(self, batch, logs=None):
        """Advance the schedule by one step and write the new learning rate.

        Args:
            batch: batch index (unused; kept for the callback-style signature).
            logs: optional metrics mapping (unused).

        Returns:
            The learning rate that was written to the optimizer.

        Raises:
            AttributeError: if neither an optimizer nor a model is attached.
        """
        # Resolve the target first so a mis-configured scheduler fails without
        # advancing the iteration counter.
        target = self.optimizer
        if target is None and self.model is not None:
            target = self.model.optimizer
        if target is None:
            raise AttributeError(
                "LearningRateScheduler has no optimizer: pass `optimizer=` or call set_model()."
            )
        self.iterations += 1
        new_lr = self.lr * self.decay ** self.iterations
        target.lr = new_lr
        return new_lr

# Adam optimizer used by every call to train_step.
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

@tf.function
def train_step(
    s, a, params
):
    """Run one gradient update of the combined model on a single batch.

    Args:
        s: batch of states.
        a: reference actions for the batch.
        params: coefficient mapping forwarded to ``augmented_loss``.

    Returns:
        The augmented loss evaluated before the update is applied.
    """
    trainable = model_combined.trainable_variables
    with tf.GradientTape() as tape:
        # Only the first element (the full objective) is differentiated.
        loss_value = augmented_loss(s, a, params)[0]
    gradients = tape.gradient(loss_value, trainable)
    optimizer.apply_gradients(zip(gradients, trainable))
    return loss_value

#### Repair the policy 

Here we repair the policy in a training loop, periodically updating the Lagrangian multipliers and penalty coefficients.

In [8]:
class MetricCollector():
    """Accumulate per-call histories of three metrics on the train and test sets.

    Attributes (each a dict with ``'train'`` and ``'test'`` lists):
        loss: unweighted imitation loss.
        col: linear collision violation term.
        vel: linear velocity violation term.
    """
    def __init__(self):
        self.loss, self.col, self.vel = (
            {'train': [], 'test': []} for _ in range(3)
        )

    def update_state(self, s_train, a_train, s_test, a_test, params):
        """Evaluate ``augmented_loss`` on both splits and append the results.

        The first element of each result (the full objective) is discarded;
        the remaining three are converted with ``.numpy()`` and appended to
        ``loss``, ``col`` and ``vel`` respectively.
        """
        # Evaluate both splits before touching the histories, so nothing is
        # appended if either evaluation fails.
        evaluated = {}
        for split, states, actions in (
            ('train', s_train, a_train),
            ('test', s_test, a_test),
        ):
            _, mse, collision, velocity = augmented_loss(states, actions, params)
            evaluated[split] = (mse, collision, velocity)

        for position, history in enumerate((self.loss, self.col, self.vel)):
            for split in ('train', 'test'):
                history[split].append(evaluated[split][position].numpy())
        

In [9]:
metric_collector = MetricCollector()

# The full train/test tensors do not change between epochs, so build them once
# here instead of re-concatenating all four lists at the end of every epoch.
state_train_all = tf.concat(state_train, 0)
action_train_all = tf.concat(action_train, 0)
state_test_all = tf.concat(state_test, 0)
action_test_all = tf.concat(action_test, 0)

# Fail early with a clear message: with no batches the epoch loss would stay a
# plain int and the average below could not be computed.
num_batches = len(batches)
if num_batches == 0:
    raise ValueError("`batches` is empty; there is nothing to train on.")

for epoch in range(epochs):
    # Sum of the augmented loss over all mini-batches of this epoch.
    epoch_loss = 0
    for batch in batches:
        # batch[0] holds the states and batch[1] the reference actions.
        batch_loss = train_step(batch[0], batch[1], params)
        epoch_loss += batch_loss
    print(f"Epoch {epoch+1} loss: {epoch_loss.numpy()/num_batches}")

    # update stats on the complete train and test sets
    metric_collector.update_state(
        state_train_all,
        action_train_all,
        state_test_all,
        action_test_all,
        params,
    )

    # Every 10th epoch: move each multiplier by eta * (latest training
    # violation) and scale each quadratic penalty coefficient by beta.
    # NOTE(review): augmented_loss subtracts lambda * violation while lambda is
    # increased here; confirm this sign convention is intended.
    if (epoch + 1) % 10 == 0:
        params["lambda_collision"] = params["lambda_collision"] + params["eta_collision"] * metric_collector.col['train'][-1]
        params["mu_collision"] = params["mu_collision"] * params["beta_collision"]
        params["lambda_velocity"] = params["lambda_velocity"] + params["eta_velocity"] * metric_collector.vel['train'][-1]
        params["mu_velocity"] = params["mu_velocity"] * params["beta_velocity"]

Epoch 1 loss: 2.802249834065982
Epoch 2 loss: 2.5747073712787256
Epoch 3 loss: 2.5286790428055363
Epoch 4 loss: 2.496774997551793
Epoch 5 loss: 2.470918023154596
Epoch 6 loss: 2.448998411385794
Epoch 7 loss: 2.4297337439066853
Epoch 8 loss: 2.4129846089397633
Epoch 9 loss: 2.398401116926358
Epoch 10 loss: 2.3850675705083564
Epoch 11 loss: 6.083901407990947
Epoch 12 loss: 5.956256664563022
Epoch 13 loss: 5.898814251827995
Epoch 14 loss: 5.844591911124652
Epoch 15 loss: 5.810770613901463
Epoch 16 loss: 5.784766577080432
Epoch 17 loss: 5.7625008160689415
Epoch 18 loss: 5.742411918958914
Epoch 19 loss: 5.722550161037605
Epoch 20 loss: 5.7005829452472145
Epoch 21 loss: 39.26186564240947
Epoch 22 loss: 38.441670112291085
Epoch 23 loss: 37.802410667653206
Epoch 24 loss: 37.31985822162256
Epoch 25 loss: 37.06699653986769
Epoch 26 loss: 36.8896484375
Epoch 27 loss: 36.699866164693596
Epoch 28 loss: 36.561719294045965
Epoch 29 loss: 36.338236094185234
Epoch 30 loss: 36.109013209435936
Epoch 31 l