In [1]:
import os
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt
import copy
from sarp.utils import load_expert_data_hospital, separate_train_test, combine_nets, mini_batch

In [2]:
# Cap TensorFlow's memory on the first GPU (if one is present) by creating a
# single virtual device with a 5120 MB limit.
gpus = tf.config.experimental.list_physical_devices("GPU")
if gpus:
    try:
        tf.config.experimental.set_virtual_device_configuration(
            gpus[0],
            [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=5120)],
        )
    except RuntimeError as e:
        # Best effort: report the problem and carry on with TensorFlow's defaults.
        # NOTE(review): presumably raised when the GPU was already initialized — confirm.
        print(e)

# Root directory for the data and model paths used below. Because the notebook
# name is relative, this resolves to the current working directory.
current_dir = os.path.dirname(os.path.abspath("2_sarp_repair.ipynb"))

This notebook provides an example of policy repair using SARP for robot navigation in a hospital. It assumes that a pre-trained policy and a predictive model are already available. To pre-train a policy for this example, run [0_pretrain_policy.py](0_pretrain_policy.py); to train a predictive model, run [1_pretrain_predictive_model.py](1_pretrain_predictive_model.py). The models are described below:
- policy - input: the system state that includes the robot's goal, distance and heading toward the goal, and range sensor readings - output: linear and angular velocities.
- predictive model - input: states and actions - output: collision [0, 1] or no collision [1, 0].

#### Load dataset
First, we load the expert demonstrations for repair.

In [3]:
# Load the expert demonstrations used for repair.
data_dir = current_dir + "/data/expert_data"
# The number of entries in the data directory is used as the sample count.
num_samples = len(os.listdir(data_dir))

# The fourth return value is bound to `collision_property` rather than
# `property`, so the Python builtin `property` is not shadowed for the rest of
# the kernel session.
# NOTE(review): the meaning of `col_remove=True` is defined by the loader — confirm.
state, action, _, collision_property = load_expert_data_hospital(
    data_dir, num_samples, col_remove=True
)

# Convert every per-sample array to a float32 tensor.
state = [tf.convert_to_tensor(s, dtype=tf.float32) for s in state]
action = [tf.convert_to_tensor(a, dtype=tf.float32) for a in action]
collision_property = [
    tf.convert_to_tensor(p, dtype=tf.float32) for p in collision_property
]

# Hold out 20% of the data for testing.
train_data, test_data = separate_train_test(
    [state, action, collision_property], test_ratio=0.2
)

state_train, action_train, property_train = train_data
state_test, action_test, property_test = test_data

loading sample 1, goal: [10. 10.]
loading sample 2, goal: [10. 10.]
loading sample 3, goal: [10. 10.]
loading sample 4, goal: [10. 10.]
loading sample 5, goal: [10. 10.]
loading sample 6, goal: [10. 10.]
loading sample 7, goal: [10. 10.]
loading sample 8, goal: [10. 10.]
loading sample 9, goal: [10. 10.]
loading sample 10, goal: [10. 10.]
loading sample 11, goal: [-10.   5.]
loading sample 12, goal: [-10.   5.]
loading sample 13, goal: [-10.   5.]
loading sample 14, goal: [-10.   5.]
loading sample 15, goal: [-10.   5.]
loading sample 16, goal: [-10.   5.]
loading sample 17, goal: [-10.   5.]
loading sample 18, goal: [-10.   5.]
loading sample 19, goal: [-10.   5.]
loading sample 20, goal: [-10.   5.]
loading sample 21, goal: [10.  5.]
loading sample 22, goal: [10.  5.]
loading sample 23, goal: [10.  5.]
loading sample 24, goal: [10.  5.]
loading sample 25, goal: [10.  5.]
loading sample 26, goal: [10.  5.]
loading sample 27, goal: [10.  5.]
loading sample 28, goal: [10.  5.]
loading s

2023-07-08 11:49:58.756900: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-07-08 11:49:59.342580: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 5120 MB memory:  -> device: 0, name: Quadro RTX 8000, pci bus id: 0000:d5:00.0, compute capability: 7.5


#### Load and merge models
Here, we load the policy and predictive models, then merge them in series so that the combined network can be used for repair.

In [4]:
# Restore the pre-trained policy and predictive networks from disk.
model_policy_orig = keras.models.load_model(
    current_dir + "/trained_models/policy/model"
)
model_predictive = keras.models.load_model(
    current_dir + "/trained_models/predictive_model/model"
)

# Chain the two networks. All state columns (indices 0 .. n-1) are listed in
# `state_indices_passed`.
# NOTE(review): presumably these are the state features forwarded to the
# predictive model alongside the policy output — confirm in `combine_nets`.
num_state_features = state_train[0].shape[1]
model_combined = combine_nets(
    model_policy_orig,
    model_predictive,
    state_indices_passed=list(range(num_state_features)),
)

# Only layers whose name starts with "policy_" stay trainable; every other
# layer of the combined model is frozen.
for layer in model_combined.layers:
    layer.trainable = layer.name.split("_")[0] == "policy"

# Run a single state through the combined model once.
# NOTE(review): presumably this forward pass is what builds the model — confirm.
_, _ = model_combined.predict(state[0][0:1])

Model: "repair_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 policy_layer_1 (Dense)      multiple                  3840      
                                                                 
 policy_layer_2 (Dense)      multiple                  65792     
                                                                 
 policy_layer_3 (Dense)      multiple                  514       
                                                                 
 Predictive_layer_1 (Dense)  multiple                  4352      
                                                                 
 Predictive_layer_2 (Dense)  multiple                  65792     
                                                                 
 Predictive_layer_3 (Dense)  multiple                  2570      
                                                                 
 Predictive_layer_4 (Dense)  multiple                 

#### Define the optimization parameters

We first define the loss function, the Lagrangian penalty terms, and the quadratic penalty terms. We assume two constraints:
1. Constraint on linear velocity: $v\leq0.9$         $\Longrightarrow g_{vel} = ReLU(v-0.9)$
2. Constraint on the output collision property: $\psi = [1,0]$     $\Longrightarrow g_{col} = \psi[1]$

The augmented loss is formulated as 

\begin{align} 
\mathcal{L}^a =  \mathcal{L}_{original} -\lambda_{col} g_{col} + \frac{\mu_{col}}{2}g^2_{col}-\lambda_{vel} g_{vel} + \frac{\mu_{vel}}{2}g^2_{vel}\nonumber
\end{align}

In [5]:
# --- training hyper-parameters ---
batch_size = 32
epochs = 100
learning_rate = 0.001

# --- augmented-Lagrangian coefficients, one set per constraint ---
#   lambda_* : Lagrange multiplier (starts at zero)
#   mu_*     : quadratic-penalty coefficient
#   eta_*    : step size applied to the multiplier update in the repair loop
#   beta_*   : growth factor applied to mu_* in the repair loop
params = {
    name: tf.constant(value, dtype="float32")
    for name, value in (
        ("lambda_collision", 0.0),
        ("mu_collision", 10.0),
        ("eta_collision", 0.001),
        ("beta_collision", 5.0),
        ("lambda_velocity", 0.0),
        ("mu_velocity", 5.0),
        ("eta_velocity", 0.001),
        ("beta_velocity", 5.0),
    )
}

# Split the concatenated training set into mini-batches.
batches = mini_batch(
    tf.concat(state_train, 0),
    tf.concat(action_train, 0),
    tf.concat(property_train, 0),
    batch_size,
)


In [6]:
def original_loss(y_true, y_pred):
    """Return the mean squared error between `y_true` and `y_pred`."""
    squared_error = tf.square(y_true - y_pred)
    return tf.reduce_mean(squared_error)

def col_penalty(y):
    """Quadratic collision penalty: sum over the batch of `y[:, 1]` squared."""
    collision_score = y[:, 1]
    return tf.reduce_sum(tf.square(collision_score))

def col_lagrangian(y):
    """Collision constraint value: sum over the batch of `y[:, 1]`."""
    collision_score = y[:, 1]
    return tf.reduce_sum(collision_score)

def vel_penalty(y):
    """Quadratic velocity penalty: sum over the batch of `ReLU(y[:, 0] - 0.9)` squared."""
    violation = tf.nn.relu(y[:, 0] - 0.9)
    return tf.reduce_sum(tf.square(violation))

def vel_lagrangian(y):
    """Velocity constraint value: sum over the batch of `ReLU(y[:, 0] - 0.9)`."""
    violation = tf.nn.relu(y[:, 0] - 0.9)
    return tf.reduce_sum(violation)

def augmented_loss(
    s, a, params
):
    """Evaluate the augmented-Lagrangian loss of the combined model.

    Args:
        s: batch of states fed to `model_combined`.
        a: reference actions compared against the predicted actions.
        params: dict holding the `lambda_*` and `mu_*` coefficients for the
            collision and velocity constraints.

    Returns:
        A 4-tuple `(augmented loss, imitation MSE, collision constraint value,
        velocity constraint value)`.
    """
    a_pred, p_pred = model_combined(s)

    # Each term is evaluated once and reused for the total and the returned metrics.
    imitation = original_loss(a, a_pred)
    collision = col_lagrangian(p_pred)
    velocity = vel_lagrangian(a_pred)

    # The imitation term is weighted by 100. The signs follow the formula in
    # the markdown above: minus lambda * g, plus mu / 2 * g^2.
    total = (
        100 * imitation
        - params["lambda_collision"] * collision
        + params["mu_collision"] / 2 * col_penalty(p_pred)
        - params["lambda_velocity"] * velocity
        + params["mu_velocity"] / 2 * vel_penalty(a_pred)
    )
    return total, imitation, collision, velocity

Now we define the optimizer and the policy update step.

In [7]:
class LearningRateScheduler():
    """Reduce the optimizer's learning rate when a monitored loss plateaus.

    A call to `on_batch_end` counts as an improvement when the loss is lower
    than the best loss seen so far by more than `loss_tol`. After more than
    `patience` consecutive calls without improvement, the learning rate is
    multiplied by `decay` and floored at `lr_min`.

    Args:
        optimizer: object whose `learning_rate` exposes `.numpy()` and `.assign()`.
        lr_min: lower bound for the learning rate.
        decay: multiplicative factor applied on each reduction.
        patience: number of non-improving calls tolerated before reducing.
        loss_tol: minimum decrease that counts as an improvement.
    """

    def __init__(self, optimizer, lr_min=5e-5, decay=0.1, patience=10, loss_tol=0.0001):
        self.lr_min = lr_min
        self.patience = patience
        self.decay = decay
        self.optimizer = optimizer
        self.loss_tol = loss_tol
        self.counter = 0
        # Last loss passed to `on_batch_end` (kept for inspection).
        self.loss_prev = 10000
        # Best (lowest) loss seen so far, used as the plateau reference.
        self.loss_best = float("inf")

    def on_batch_end(self, loss):
        """Record `loss` and reduce the learning rate if it has plateaued."""
        if self.loss_best - loss > self.loss_tol:
            # Real progress: remember it and restart the patience window, so
            # only consecutive stalls can trigger a reduction.
            self.loss_best = loss
            self.counter = 0
        else:
            self.counter += 1
            if self.counter > self.patience:
                self.counter = 0
                current_lr = float(self.optimizer.learning_rate.numpy())
                # Clamp instead of skipping, so the rate can actually reach lr_min.
                new_lr = max(current_lr * self.decay, self.lr_min)
                if new_lr < current_lr:
                    self.optimizer.learning_rate.assign(new_lr)

        self.loss_prev = loss


# Adam optimizer for the policy update, plus the plateau-based learning-rate
# scheduler that adjusts its learning rate.
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
lr_scheduler = LearningRateScheduler(
    optimizer,
    lr_min=5e-5,
    decay=0.1,
    patience=10,
    loss_tol=0.0001,
)

@tf.function
def train_step(
    s, a, params
):
    """Apply one optimizer update to the trainable part of `model_combined`.

    Args:
        s: batch of states.
        a: batch of reference actions.
        params: augmented-Lagrangian coefficients passed to `augmented_loss`.

    Returns:
        The augmented loss evaluated on this batch before the update.
    """
    trainable = model_combined.trainable_variables
    with tf.GradientTape() as tape:
        # Only the first element (the augmented loss) is differentiated.
        loss_value = augmented_loss(s, a, params)[0]
    gradients = tape.gradient(loss_value, trainable)
    optimizer.apply_gradients(zip(gradients, trainable))
    return loss_value

#### Repair the policy 

Here we repair the policy in a loop and adjust the Lagrangian multipliers and penalty coefficients accordingly. 

In [8]:
class MetricCollector():
    """Track per-epoch train/test metrics and the best policy weights so far."""

    def __init__(self):
        # One history list per split for each tracked quantity.
        self.loss = {'train': [], 'test': []}
        self.col = {'train': [], 'test': []}
        self.vel = {'train': [], 'test': []}
        self.best_weights = []

    def update_state(self, s_train, a_train, s_test, a_test, params):
        """Evaluate `augmented_loss` on both splits and append the metrics.

        The imitation loss, collision constraint value and velocity constraint
        value (elements 1-3 of the `augmented_loss` result) are stored as
        numpy values. The augmented loss itself is not recorded.
        """
        results = {
            'train': augmented_loss(s_train, a_train, params),
            'test': augmented_loss(s_test, a_test, params),
        }
        for history, index in ((self.loss, 1), (self.col, 2), (self.vel, 3)):
            for split in ('train', 'test'):
                history[split].append(results[split][index].numpy())

    def save_best_model(self, model):
        """Snapshot the policy weights when the latest test collision value is the lowest so far.

        NOTE(review): assumes the policy's weight tensors come first in
        `model.get_weights()` and that `model.policy_arch` has one more entry
        than the policy has layers — confirm against `combine_nets`.
        """
        latest_col = self.col['test'][-1]
        if latest_col == min(self.col['test']):
            num_policy_tensors = (len(model.policy_arch) - 1) * 2
            self.best_weights = model.get_weights()[:num_policy_tensors]

    def plot(self):
        """Plot the train/test histories of loss, collision and velocity in three stacked panels."""
        _, axes = plt.subplots(3, 1, figsize=(10, 10))
        panels = (
            (self.loss, 'loss'),
            (self.col, 'collision'),
            (self.vel, 'velocity'),
        )
        for axis, (history, ylabel) in zip(axes, panels):
            axis.plot(history['train'], label='train')
            axis.plot(history['test'], label='test')
            axis.set_ylabel(ylabel)
            axis.legend()
        plt.show()

class Verbose():
    """Print a one-line progress summary from the latest collected metrics."""

    def __init__(self, metric_collector, optimizer, epochs):
        self.metric_collector = metric_collector
        self.optimizer = optimizer
        self.epochs = epochs
        self.best_model = None

    def on_batch_end(self, epoch, model):
        """Print epoch, learning rate and the most recent train/test metrics.

        `model` is accepted for interface symmetry but is not used.
        """
        stats = self.metric_collector
        lr = self.optimizer.learning_rate.numpy()
        fields = [
            f"e: {epoch}/{self.epochs}",
            f"lr: {lr:.6f}",
            f"loss: {stats.loss['train'][-1]:.4f}",
            f"col: {stats.col['train'][-1]:.4f}",
            f"vel: {stats.vel['train'][-1]:.4f}",
            f"loss_val: {stats.loss['test'][-1]:.4f}",
            f"col_val: {stats.col['test'][-1]:.4f}",
            f"vel_val: {stats.vel['test'][-1]:.4f}",
        ]
        print(", ".join(fields))


In [9]:
metric_collector = MetricCollector()
verbose = Verbose(metric_collector, optimizer, epochs)

# The full train/test tensors do not change between epochs, so they are
# concatenated once here rather than on every iteration.
state_train_all = tf.concat(state_train, 0)
action_train_all = tf.concat(action_train, 0)
state_test_all = tf.concat(state_test, 0)
action_test_all = tf.concat(action_test, 0)

for epoch in range(epochs):
    # One pass over the mini-batches; batch[0] are states, batch[1] are actions.
    for batch in batches:
        train_step(batch[0], batch[1], params)

    # Update stats on the full train and test sets.
    metric_collector.update_state(
        state_train_all,
        action_train_all,
        state_test_all,
        action_test_all,
        params,
    )

    # Keep the policy weights with the lowest test collision value so far.
    metric_collector.save_best_model(model_combined)

    # Every 10 epochs: step each multiplier by eta * (latest train constraint
    # value) and scale each penalty coefficient by beta.
    if (epoch + 1) % 10 == 0:
        for constraint, history in (
            ("collision", metric_collector.col),
            ("velocity", metric_collector.vel),
        ):
            params[f"lambda_{constraint}"] = (
                params[f"lambda_{constraint}"]
                + params[f"eta_{constraint}"] * history["train"][-1]
            )
            params[f"mu_{constraint}"] = (
                params[f"mu_{constraint}"] * params[f"beta_{constraint}"]
            )

    # Print stats.
    verbose.on_batch_end(epoch + 1, model_combined)

    # Update the learning rate based on the test collision value.
    lr_scheduler.on_batch_end(metric_collector.col['test'][-1])

e: 1/100, lr: 0.001000, loss: 0.0224, col: 22.6735, vel: 373.4587, loss_val: 0.0298, col_val: 9.1333, vel_val: 78.9730
e: 2/100, lr: 0.001000, loss: 0.0232, col: 21.4083, vel: 325.6610, loss_val: 0.0295, col_val: 9.1373, vel_val: 72.6909
e: 3/100, lr: 0.001000, loss: 0.0232, col: 19.9839, vel: 282.3079, loss_val: 0.0315, col_val: 9.1917, vel_val: 65.3221
e: 4/100, lr: 0.001000, loss: 0.0240, col: 19.4262, vel: 323.0574, loss_val: 0.0312, col_val: 9.0611, vel_val: 72.4234
e: 5/100, lr: 0.001000, loss: 0.0225, col: 19.0552, vel: 303.4448, loss_val: 0.0329, col_val: 9.3303, vel_val: 70.3033
e: 6/100, lr: 0.001000, loss: 0.0226, col: 20.1797, vel: 323.0415, loss_val: 0.0304, col_val: 9.2217, vel_val: 76.3078
e: 7/100, lr: 0.001000, loss: 0.0219, col: 18.6406, vel: 323.0372, loss_val: 0.0314, col_val: 9.0287, vel_val: 71.0497
e: 8/100, lr: 0.001000, loss: 0.0232, col: 19.3568, vel: 330.9801, loss_val: 0.0333, col_val: 9.1561, vel_val: 71.4689
e: 9/100, lr: 0.001000, loss: 0.0242, col: 19.64

#### Save model

In [10]:
# Target directory for the repaired policy; `exist_ok=True` replaces the
# separate existence check.
save_dir = f"{current_dir}/trained_models/repaired_policy"
os.makedirs(save_dir, exist_ok=True)

# Fail with a clear message instead of an IndexError below when no snapshot
# was ever stored.
if not metric_collector.best_weights:
    raise RuntimeError(
        "No repaired weights available: run the repair loop before saving."
    )

# Copy the best weights back into the original policy, layer by layer.
# Each layer that owns weights consumes two consecutive tensors from
# `best_weights`.
# NOTE(review): assumes these are (kernel, bias) pairs in layer order — confirm.
weighted_layers = [
    layer for layer in model_policy_orig.layers if len(layer.get_weights()) > 0
]
for layer_idx, layer in enumerate(weighted_layers):
    layer.set_weights(
        [
            metric_collector.best_weights[2 * layer_idx],
            metric_collector.best_weights[2 * layer_idx + 1],
        ]
    )

keras.models.save_model(
    model_policy_orig,
    f"{save_dir}/model",
    overwrite=True,
    include_optimizer=False,
    save_format=None,
    signatures=None,
    options=None,
    save_traces=False,
)

INFO:tensorflow:Assets written to: /home/local/ASUAD/kmajd1/SARP/examples/1_hospital_simulation/trained_models/repaired_policy/model/assets
