In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import notebook_setup

from copy import deepcopy
import pickle
import warnings
warnings.filterwarnings("error", category=UserWarning)
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # disable tensorflow warning messages
os.makedirs(os.path.expanduser('~/Data/tensorboard/'), exist_ok=True)

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import gym
import pandas as pd
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV, ParameterGrid
from sklearn.metrics import make_scorer, r2_score
import matplotlib.pyplot as plt
# from stable_baselines.common.policies import MlpPolicy
# from stable_baselines.common.vec_env import DummyVecEnv
# from stable_baselines.common.evaluation import evaluate_policy
# from stable_baselines import PPO2
from tqdm.auto import tqdm, trange
from pytorchbridge import TorchEstimator
from pystatespace import Trapezoid
from joblib import Parallel, delayed

from utils import cache_function, cache_to_episodic_rewards, cache_to_episodes, copy_tensor, copy_mlp_regressor, sanitize_filename
from systems.tanks import TanksFactory, TanksPhysicalEnv, TanksDataEnv, plot_tanks
from ppo import PPO, Memory, ActorCriticBinary
from meta import learn_env_model, meta_update, kl_div, prune_library, plot_adaption

# Use the first CUDA device when torch can see one, otherwise fall back to CPU.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Seed constant (the cells visible below set their own lowercase `seed`).
SEED = 0
# Worker budget for the joblib calls below: half of the logical cores, but never
# fewer than one. os.cpu_count() may return None, and `1 // 2 == 0` on a
# single-core host; either would break Parallel(n_jobs=min(n, NCPU)).
NCPU = max(1, (os.cpu_count() or 1) // 2)

## System

In [None]:
# Size of the system and the time step handed to the environments/integrator.
n_tanks, n_engines = 6, 2
tstep = 1e0
# Seed numpy's global generator before any random data is drawn.
seed = 0
np.random.seed(seed)
# Reference parameter set; degrade() reads these values back via **nominal.
nominal_config = dict(
    heights=np.ones(n_tanks),
    cross_section=np.ones(n_tanks),
    valves_min=np.zeros(n_tanks),
    valves_max=np.ones(n_tanks),
    resistances=np.ones(n_tanks) * 1e2,
    pumps=np.ones(n_tanks) * 0.1,
    engines=np.ones(n_engines) * 0.05,
)
# Global tanks object; degrade() modifies whichever tanks object it is given in place.
tanks = TanksFactory(n=n_tanks, e=n_engines, **nominal_config)

### Diagnosis

In [None]:
# Sanity check: drive a fresh physical environment with random actions for at
# most 200 steps, printing each transition and finally the accumulated reward.
tanks_ = TanksFactory(n=n_tanks, e=n_engines, **nominal_config)
env_ = TanksPhysicalEnv(tanks_, tstep)
env_.reset()
d, i, R = False, 0, 0
for i in range(1, 201):
    s, r, d, _ = env_.step(env_.action_space.sample())
    R += r
    print(i, s, '{:.2f}'.format(sum(s)), r)
    if d:
        # The environment signalled termination; stop early.
        break
print(R)

In [None]:
# Plot the physical environment created in the previous cell (plot_tanks comes from systems.tanks).
plot_tanks(env_)

### System model

* TODO: Check if training data tuples are actually causal

In [None]:
def rand_u(length: int, dim: int):
    """Random binary actions, each held for a random number of time steps.

    Args:
        length: Number of time steps (rows) in the returned sequence.
        dim: Number of action components (columns).

    Returns:
        A float array of shape (length, dim) containing only 0s and 1s. Rows are
        filled in runs: one random 0/1 vector is drawn with np.random.choice and
        repeated for a random number of consecutive steps.

    Notes:
        Draws come from numpy's global generator (np.random).
        The loop stops once the write position reaches `length - 1`, so when the
        runs end exactly there the final row keeps its initial value of zero.
    """
    u = np.zeros((length, dim))
    # np.random.randint requires low < high. The plain bound int(length / 2) is
    # <= 1 for length < 4 and raised ValueError; clamp it so short sequences
    # work. For length >= 4 the bound, and hence the random stream, is unchanged.
    max_hold = max(2, int(length / 2))
    i = 0
    while i < length - 1:
        subseq = min(length - i, np.random.randint(1, max_hold))
        u_ = np.random.choice(2, (1, dim))
        u[i:i+subseq] = u_
        i += subseq
    return u

In [None]:
# Generate training data
def generate_training_data(tanks, episodes=50):
    """Simulate random-action episodes and return one-step prediction pairs.

    Args:
        tanks: System object providing `dxdt`, `y`, `heights`, `cross_section`,
            `pumps` and `engines`.
        episodes: Number of episodes to simulate.

    Returns:
        (Xtrain, Ytrain). Each Xtrain row is a state followed by the action
        taken in it (2 * n_tanks columns); the matching Ytrain row is the next
        state (n_tanks columns). An episode of length n contributes n - 1 rows.

    Relies on the notebook globals `n_tanks` and `tstep`, and on `rand_u`.
    """
    system = Trapezoid(dims=n_tanks, outputs=n_tanks,
                       dx=tanks.dxdt, out=tanks.y, tstep=tstep)
    # Episode duration: sum(heights * cross_section) divided by the smaller of
    # the summed pump and summed engine values.
    capacity = sum(tanks.heights * tanks.cross_section)
    limiting_rate = min(sum(tanks.pumps), sum(tanks.engines))
    episode_duration = capacity / limiting_rate
    episode_length = int(episode_duration / system.tstep)

    # n samples give n - 1 (state[i], action[i], state[i+1]) tuples.
    arr_len = episode_length - 1
    Xtrain = np.empty((episodes * arr_len, 2 * n_tanks))
    Ytrain = np.empty((episodes * arr_len, n_tanks))
    for e in range(episodes):
        rows = slice(e * arr_len, (e + 1) * arr_len)
        u = rand_u(episode_length, n_tanks)
        t = np.linspace(0, episode_duration, num=episode_length, endpoint=False)
        # Every episode starts from tanks.heights.
        x, _ = system.predict(t, tanks.heights, u)
        Xtrain[rows, :n_tanks] = x[:-1]
        Xtrain[rows, n_tanks:] = u[:-1]
        Ytrain[rows] = x[1:]
    return Xtrain, Ytrain

# Build the training set from the global `tanks` object and report the array shapes.
Xtrain, Ytrain = generate_training_data(tanks, episodes=50)
print(Xtrain.shape, Ytrain.shape)

In [None]:
# Grid-search MLPRegressor hyperparameters on (Xtrain, Ytrain), scoring each
# candidate by R^2 averaged uniformly over the output columns.
# NOTE(review): n_jobs is hard-coded to 12 although the setup cell defines NCPU.
# NOTE(review): the name `grid` is rebound to a ParameterGrid in the
# "Grid-search" section further down, so `grid.best_estimator_` is only
# available until that cell has run.
grid = GridSearchCV(MLPRegressor(), scoring=make_scorer(r2_score, multioutput='uniform_average'),
                   param_grid={
                       'hidden_layer_sizes': ((64, 64), (128, 128)),
                       'activation': ('relu', 'logistic'),
                       'learning_rate_init': (1e-2, 1e-3),
                       'warm_start': (True,)
                   },
                   n_jobs=12, verbose=1)
grid.fit(Xtrain, Ytrain)
# Display the five best-ranked parameter combinations.
pd.DataFrame(grid.cv_results_).sort_values(by='rank_test_score', ascending=True, axis=0).head()

In [None]:
# Use the best estimator found by the grid search as the data-driven model.
est = grid.best_estimator_
# NOTE(review): random_state is set on an estimator that has already been fit;
# presumably it only takes effect if `est` (or a copy) is fit again — confirm.
est.set_params(random_state=seed)
# Plot performance
env_data = TanksDataEnv(tanks, est, tstep)
plot_tanks(env_data, plot='closed')
plt.suptitle('Modeled fuel tank levels over time');

In [None]:
# Single-hidden-layer PyTorch model for the same regression task as the sklearn
# estimator: input is [state, action], output is the next state.
# Layer widths follow n_tanks rather than the literals 12 and 6, matching the
# column counts generate_training_data uses for Xtrain (2 * n_tanks) and
# Ytrain (n_tanks). With n_tanks == 6 the network is unchanged.
# NOTE(review): the final Sigmoid bounds predictions to (0, 1); this assumes the
# target states stay in that range — confirm.
module = nn.Sequential(
    nn.Linear(2 * n_tanks, 64),
    nn.Tanh(),
    nn.Linear(64, n_tanks),
    nn.Sigmoid()
)
o = optim.Adam(module.parameters(), lr=50e-5)
l = nn.MSELoss()
e = TorchEstimator(module, o, l, epochs=100, batch_size=32, early_stopping=True, verbose=True, max_tol_iter=10, tol=1e-5)
e.fit(Xtrain, Ytrain);
est = e  # replace scikit-learn's estimator with the PyTorch one

In [None]:
# Wrap the PyTorch estimator `e` in a data-driven environment and plot it.
torche = TanksDataEnv(tanks, e, tstep)
plot_tanks(torche)

In [None]:
# Same plot for the physical environment (presumably as a reference for the data-driven plots).
plot_tanks(TanksPhysicalEnv(tanks, tstep))

### Degradation

In [None]:
def degrade(tanks: "TanksFactory", time: float, tfactor: np.ndarray,
            efactor: np.ndarray, **nominal):
    """Set the parameters of `tanks`, in place, to their degraded values at `time`.

    For every tank i:
        pumps[i]       = nominal_pumps[i] * (1 - time / tfactor[i])
        resistances[i] = nominal_res[i] + nominal_res[i] * time / tfactor[i]
    For every engine j:
        engines[j]     = nominal_eng[j] + nominal_eng[j] * time / efactor[j]

    A factor of np.inf therefore leaves the component at its nominal value,
    and time == 0 resets everything to nominal.

    Args:
        tanks: System whose `pumps`, `resistances` and `engines` are overwritten.
        time: Degradation time, in the same units as the factors.
        tfactor: Per-tank factors, or a single scalar applied to all tanks.
        efactor: Per-engine factors, or a single scalar applied to all engines.
        **nominal: Must contain 'pumps', 'resistances' and 'engines' reference
            values (e.g. the notebook's nominal_config).

    Returns:
        None; `tanks` is modified in place.
    """
    # Take sizes from the object being degraded instead of notebook globals, so
    # the function agrees with random_degrade and works for any system size.
    n_tanks = len(tanks.pumps)
    n_engines = len(tanks.engines)
    if not isinstance(tfactor, (list, tuple, np.ndarray)):
        # If a single degradation factor given, assume it is
        # identical for all tanks.
        tfactor = np.ones(n_tanks) * tfactor
    if not isinstance(efactor, (list, tuple, np.ndarray)):
        # If a single degradation factor given, assume it is
        # identical for all engines.
        efactor = np.ones(n_engines) * efactor
    for i in range(n_tanks):
        tanks.pumps[i] = nominal['pumps'][i] * (1 - time / tfactor[i])
        tanks.resistances[i] = nominal['resistances'][i] + \
                               nominal['resistances'][i] * time / tfactor[i]
    for i in range(n_engines):
        tanks.engines[i] = nominal['engines'][i] + \
                           nominal['engines'][i] * time / efactor[i]


def random_degrade(tanks: TanksFactory=None, tfactors=(10, 20), efactors=(10, 20),
                   atmost_tanks=1, atmost_engines=1, random=np.random):
    """Draw random degradation factors and, if `tanks` is given, apply them.

    Args:
        tanks: System to degrade in place via degrade(). If None, factors are
            only sampled and returned.
        tfactors: (low, high) bounds passed to `random.randint` for tank factors.
        efactors: (low, high) bounds passed to `random.randint` for engine factors.
        atmost_tanks: Maximum number of tanks affected; 0 disables tank faults.
        atmost_engines: Maximum number of engines affected; 0 disables engine faults.
        random: Object exposing `randint` and `choice` (np.random by default).

    Returns:
        (tfactor, efactor): float arrays with np.inf for unaffected components.

    Fixes relative to the previous version:
      * `tanks=None` no longer crashes on `len(tanks.heights)`; sizes then come
        from the global nominal_config.
      * The engine count is taken from `tanks` instead of the global n_engines.
      * The degradation time was `min(t if t != inf else 0 ...)`, which is 0
        whenever any tank is unaffected, so degrade() just reset the system to
        nominal. It is now the smallest finite tank factor (0 only when no
        tank is affected), as in the commented-out degrade() call elsewhere in
        this notebook.
    """
    if tanks is not None:
        n_tanks = len(tanks.heights)
        n_engines = len(tanks.engines)
    else:
        n_tanks = len(nominal_config['heights'])
        n_engines = len(nominal_config['engines'])

    tfactor = np.ones(n_tanks) * np.inf
    if atmost_tanks > 0:
        # Clamp so that choice(..., replace=False) can never be asked for more
        # tanks than exist.
        tanks_affected = random.randint(1, min(atmost_tanks, n_tanks) + 1)
        idx_affected = random.choice(n_tanks, size=tanks_affected, replace=False)
        tfactor[idx_affected] = random.randint(*tfactors, size=tanks_affected)

    efactor = np.ones(n_engines) * np.inf
    if atmost_engines > 0:
        engines_affected = random.randint(1, min(atmost_engines, n_engines) + 1)
        idx_affected = random.choice(n_engines, size=engines_affected, replace=False)
        efactor[idx_affected] = random.randint(*efactors, size=engines_affected)

    if tanks is not None:
        finite_tfactors = tfactor[np.isfinite(tfactor)]
        # With time equal to the smallest finite factor, the most degraded
        # tank has time / tfactor == 1.
        time = finite_tfactors.min() if finite_tfactors.size else 0
        degrade(tanks, time, tfactor, efactor, **nominal_config)
    return tfactor, efactor

In [None]:
# Approach - 1

# Training/Testing Loop:
#  Train agent for N steps,
#  Train metanetwork
#  Get gradients of reward w.r.t hyperparameters
#  Change hyperparameters


## Complementary MAML

In [None]:
# Fresh nominal system and physical environment for the meta-learning experiments.
tanks = TanksFactory(n=n_tanks, e=n_engines, **nominal_config)
env = TanksPhysicalEnv(tanks, tstep=tstep)
timesteps = 50000  # handed to agent.learn() when training the nominal agent

# Keyword arguments forwarded to PPO(...) and to the meta-learning helpers.
ppo_params = {
    'state_dim': env.observation_space.shape[0],
    'action_dim': 6,
    'epochs': 5,                 # update policy for K epochs
    'lr': 0.02,                  # learning rate
    'n_latent_var': 64,          # number of variables in hidden layer
    'betas': (0.9, 0.999),
    'gamma': 0.99,               # discount factor
    'eps_clip': 0.2,             # clip parameter for PPO
    'update_interval': 2000,     # update policy every n timesteps
}
library_size = 3
data_model = False

In [None]:
# Nominal agent trained without fault
agent = PPO(env, **ppo_params)
# `r` is whatever agent.learn returns; it is plotted below as the reward curve.
r = agent.learn(timesteps)
library = [copy_tensor(agent.policy.state_dict())]   # initialize library with policy
plt.plot(r)
plt.title('Rewards on system under nominal conditions');

In [None]:
# Populate library of policies
library_rewards = []
# Keep only the nominal policy so that re-running this cell does not keep
# growing the library.
del library[1:]

# One fault scenario per row: a finite factor marks the degraded tank,
# np.inf leaves a tank untouched (see degrade()).
tfactors = [
    [10, np.inf, np.inf, np.inf, np.inf, np.inf],
    [np.inf, np.inf, np.inf, np.inf, 10, np.inf],
    [np.inf, np.inf, 10, np.inf, np.inf, np.inf]
]
# One scalar engine factor per scenario; degrade() broadcasts a scalar to all engines.
efactors = [np.inf, np.inf, np.inf]

for tfactor, efactor in tqdm(zip(tfactors, efactors), total=len(tfactors), leave=False):
    tanks_ = TanksFactory(n = n_tanks, e = n_engines, **nominal_config)
    # introduce some fault and learn data-driven model
    degrade(tanks_, time=5, tfactor=tfactor, efactor=efactor, **nominal_config)
    if data_model:
        est_ = deepcopy(est)  # copy estimator hyperparameters etc.
        x, y = generate_training_data(tanks_, episodes=50)  # random actions!
        est_.fit(x, y)
        # Train agent on data-driven model
        env_ = TanksDataEnv(tanks_, est_, tstep)
    else:
        # Train agent directly on the degraded physical system
        env_ = TanksPhysicalEnv(tanks_, tstep)
    agent_ = PPO(env_, **ppo_params)
    rewards = agent_.learn(30000)
    library.append(copy_tensor(agent_.policy.state_dict()))
    library_rewards.append(rewards)

# Plot library rewards
plt.figure(figsize=(10, 8))
for i, rewards in enumerate(library_rewards):
    plt.plot(rewards, label='Policy#{}'.format(i))
plt.title('Episodic rewards on model with faults')
plt.legend()
plt.grid(True, 'both')

# Save the figure and the raw reward curves together with the fault factors.
pth = './bin/library/'
os.makedirs(pth, exist_ok=True)
plt.savefig(pth+'library_rewards.png')
# NOTE(review): the file name is spelled 'lib_rwards'; left as is because other
# code may read it under that name.
with open(pth + 'lib_rwards.pickle', 'wb') as f:
    pickle.dump(dict(library_rewards=library_rewards, tfactors=tfactors, efactors=efactors), f)

### Experiment functions

In [None]:
def adapt(agent, est, memory, library, data_model=True,
          n_inner=1, n_outer=1, alpha_inner=0.01, alpha_outer=0.1,
          **ppo_params):
    """Adapt `agent`'s policy with one call to meta_update and return the agent.

    Args:
        agent: Agent whose `policy` parameters are replaced in place.
        est: Estimator passed to learn_env_model when `data_model` is True.
        memory: Buffered experience, passed to learn_env_model and meta_update.
        library: Library of policy parameters passed to meta_update.
        data_model: If True, meta_update runs on a TanksDataEnv built from a
            model learned from `memory`; otherwise on `agent.env`.
        n_inner, n_outer, alpha_inner, alpha_outer: Forwarded to meta_update.
        **ppo_params: Forwarded to meta_update.

    Returns:
        The same `agent`, with the updated parameters loaded.
    """
    if not data_model:
        env_ = agent.env
    else:
        env_ = TanksDataEnv(agent.env.tanks, learn_env_model(memory, est), tstep)
    new_params = meta_update(agent.policy.state_dict(), env_, library, memory,
                             n_inner, n_outer, alpha_inner, alpha_outer,
                             data_model, **ppo_params)
    agent.policy.load_state_dict(new_params)
    return agent

def adapt_benchmark(agent, est, memory, library, data_model=True,
          n_inner=1, n_outer=1, alpha_inner=0.01, alpha_outer=0.1,
          **ppo_params):
    """Baseline counterpart of adapt(): plain agent.learn() instead of meta_update.

    The agent first learns on its own environment for one update interval. If
    `data_model` is True it then learns on a TanksDataEnv built from a model
    of `memory` for update_interval * len(library) * n_inner * n_outer steps,
    the interaction count the meta-update step is described as using. Without
    a data model no further update is made. alpha_inner and alpha_outer are
    accepted for signature parity with adapt() but are not used.

    Returns:
        The same `agent`, with its original environment restored.
    """
    if data_model:
        env_ = TanksDataEnv(agent.env.tanks, learn_env_model(memory, est), tstep)
    else:
        env_ = agent.env
    interval = ppo_params['update_interval']
    agent.learn(interval, interval)

    # Temporarily swap in the (possibly data-driven) environment.
    env_backup, agent.env = agent.env, env_
    if data_model:
        agent.learn(timesteps=interval * (len(library) * n_inner * n_outer),
                    update_interval=interval)
    # else: the buffered memory could be reused for the same number of epochs:
    # agent.update(policy=agent.policy, memory=memory,
    #              epochs=len(library) * n_inner * n_outer, optimizer=agent.optimizer)
    agent.env = env_backup
    return agent

In [None]:
def trial(env_, est, starting_policy, library=[], data_model=True, post_steps=10000,
          n_inner=1, n_outer=1, alpha_inner=0.01, alpha_outer=0.1,
          benchmark=True):
    """Run one adaptation experiment on `env_`, starting from `starting_policy`.

    Steps: build a fresh PPO agent (plus a benchmark agent if `benchmark`),
    buffer one update interval of experience, adapt with adapt() (and
    adapt_benchmark() for the benchmark agent), then keep learning for
    `post_steps` steps.

    Uses the notebook global `ppo_params`.

    Returns:
        (rewards, params, memory): `rewards` is a list with the learn() result
        of the meta-adapted agent followed, if `benchmark`, by that of the
        benchmark agent; `params` is the adapted agent's policy state dict;
        `memory` is the experience buffered before adaptation.
    """
    def fresh_agent():
        # New agent on env_ initialised from a copy of the starting policy.
        a = PPO(env_, **ppo_params)
        a.policy.load_state_dict(copy_tensor(starting_policy))
        return a

    agent_ = fresh_agent()
    agent_benchmark = fresh_agent() if benchmark else None
    # Private copies of the library entries.
    library_ = list(map(copy_tensor, library))

    # Fault occurs, buffer experience with environment
    memory_ = Memory()
    agent_.experience(memory_, ppo_params['update_interval'], env_, agent_.policy)

    # Use meta-learning to adapt to fault
    adapt(agent_, est, memory_, library_, data_model,
          n_inner, n_outer, alpha_inner, alpha_outer, **ppo_params)
    if benchmark:
        adapt_benchmark(agent_benchmark, est, memory_, library_, data_model,
                        n_inner, n_outer, alpha_inner, alpha_outer, **ppo_params)

    # Continue learning
    agents = [agent_, agent_benchmark] if benchmark else [agent_]
    rewards = [a.learn(post_steps)
               for a in tqdm(agents, desc='Post-fault training', leave=False)]
    return rewards, agent_.policy.state_dict(), memory_

# TODO: Why do trials return different numbers of episodes, and why do the mean and std arrays have different lengths?
def ntrials(n=NCPU, verbose=10, *trial_args, **trial_kwargs):
    """Run trial() `n` times through joblib and aggregate the reward curves.

    Args:
        n: Number of trials; at most NCPU jobs are used.
        verbose: Verbosity passed to joblib.Parallel.
        *trial_args, **trial_kwargs: Forwarded unchanged to trial(). Because
            `n` and `verbose` come first, positional trial arguments must
            follow both of them.

    Returns:
        ((mean, mean_benchmark), (std, std_benchmark)): statistics across
        trials, computed after truncating each curve to the shortest trial.
        The benchmark entries are None when trial() returned no benchmark.
    """
    run_all = Parallel(n_jobs=min(n, NCPU), verbose=verbose)
    res = run_all(delayed(trial)(*trial_args, **trial_kwargs) for _ in range(n))

    def summarize(which):
        # Curves can differ in length between trials; clip to the shortest
        # one before stacking them.
        curves = [r[0][which] for r in res]
        shortest = min([len(c) for c in curves])
        stacked = np.asarray([c[:shortest] for c in curves])
        return np.mean(stacked, axis=0), np.std(stacked, axis=0)

    # trial() returns two reward lists when it also ran a benchmark agent.
    benchmark = len(res[0][0]) == 2
    mean, std = summarize(0)
    if benchmark:
        mean_benchmark, std_benchmark = summarize(1)
    else:
        mean_benchmark, std_benchmark = None, None
    return (mean, mean_benchmark), (std, std_benchmark)

### Grid-search

In [None]:
# Hyperparameter grid for trial(); each combination is evaluated with ntrials.
# NOTE(review): this rebinds `grid`, which earlier held the GridSearchCV object.
grid = ParameterGrid(dict(
    alpha_inner = [1e-3, 1e-2],
    alpha_outer = [1e-2, 1e-1],
    n_inner = [4],
    n_outer = [2],
    data_model = [False, True],
    post_steps = [30000],
    library = [[], library]   # empty library vs. the populated one
))
# NOTE(review): the directory is created but nothing in this cell writes to it.
os.makedirs(('./bin/hyperparameters'), exist_ok=True)

# One randomly degraded system is shared by every grid point.
tanks_ = TanksFactory(n = n_tanks, e = n_engines, **nominal_config)
env_ = TanksPhysicalEnv(tanks_, tstep)
factors = random_degrade(tanks_, atmost_engines=1)
print('Tank Factors:', factors[0])
print('Engine Factors:', factors[1])

# Per grid point: mean/std reward curves for the adapted agent (hyp_r, hyp_std)
# and for the benchmark agent (hyp_rb, hyp_stdb).
hyp_r, hyp_std, hyp_rb, hyp_stdb = [], [], [], []
ngrid = 0
for trial_params in tqdm(grid, desc='Hyperparameters', leave=False):
    # ntrials(n=4, verbose=11, ...): remaining positionals go to trial().
    (r, r_b), (std, std_b) = ntrials(4, 11, env_, est, agent.policy.state_dict(), **trial_params)
    hyp_r.append(r)
    hyp_rb.append(r_b)
    hyp_std.append(std)
    hyp_stdb.append(std_b)
    ngrid += 1   

In [None]:
# One subplot per grid point, laid out in two columns.
ncol = 2
nrow = ngrid // ncol + (ngrid % ncol != 0)   # ceil(ngrid / ncol)
plt.figure(figsize=(12, 3 * nrow))
for i, (grid_params, (r, r_b, std, std_b)) in enumerate(zip(grid, zip(hyp_r, hyp_rb, hyp_std, hyp_stdb))):
    plt.subplot(nrow, ncol, i + 1)
    plot_adaption(r, r_b, std, std_b)
    # Print the parameters behind subplot i; the library is shown by its length only.
    print([(k, len(v) if k=='library' else v) for k, v in grid_params.items() if k not in ['post_steps']])
    plt.title(i)
plt.tight_layout()

### Populated vs. empty policy library

In [None]:
# tfactor = np.asarray([np.inf, np.inf, np.inf, np.inf, np.inf, 15.])
# efactor = np.asarray([np.inf, np.inf])

# factors = (tfactor, efactor)

In [None]:
# Settings forwarded to trial() for the populated-vs-empty comparison.
# NOTE(review): post_steps is 300 here while other cells use 30000 — confirm intended.
trial_params = dict(
    alpha_inner = 1e-3,
    alpha_outer = 1e-2,
    n_inner = 4,
    n_outer = 2,
    data_model = True,
    post_steps = 300
)
pth = './bin/populated_vs_empty_library/'
os.makedirs(pth, exist_ok=True)

# Randomly degraded system; env_ and factors are reused by the empty-library cell.
tanks_ = TanksFactory(n = n_tanks, e = n_engines, **nominal_config)
env_ = TanksPhysicalEnv(tanks_, tstep)
factors = random_degrade(tanks_, atmost_engines=1)
# degrade(tanks_, min(factors[0][factors[0]!=np.inf]), *factors, **nominal_config)
print('Tank Factors:', factors[0])
print('Engine Factors:', factors[1])

# Populated library: `library` is passed positionally to trial() via ntrials.
(r, r_b), (std, std_b) = ntrials(4, 20, env_, est, agent.policy.state_dict(), library, **trial_params)

plt.figure(figsize=(10,8))
plot_adaption(r, r_b, std, std_b);

# Encode the indices of the affected tanks/engines and the settings in the file name.
faulty_tanks = str(list((factors[0] != np.inf).nonzero()[0]))
faulty_engines = str(list((factors[1] != np.inf).nonzero()[0]))
fname = pth + 'libsize_3_f' + sanitize_filename(faulty_tanks + faulty_engines + str(trial_params))
plt.savefig(fname + '.png')
with open(fname+'.pickle', 'wb') as f:
    pickle.dump(dict(trial_params=trial_params, results=(r, r_b, std, std_b), factors=factors), f)

In [None]:
# Empty library
# Reuses env_, trial_params, pth, factors, faulty_tanks and faulty_engines from
# the populated-library cell, so that cell must have been run first.
(r, r_b), (std, std_b) = ntrials(4, 20, env_, est, agent.policy.state_dict(), [], **trial_params)

plt.figure(figsize=(10,8))
plot_adaption(r, r_b, std, std_b);

fname = pth + 'libsize_0_f' + sanitize_filename(faulty_tanks + faulty_engines + str(trial_params))
plt.savefig(fname + '.png')
with open(fname+'.pickle', 'wb') as f:
    pickle.dump(dict(trial_params=trial_params, results=(r, r_b, std, std_b), factors=factors), f);

### Sequential abrupt faults

In [None]:
# Multi-fault trial
# Arguments consumed by multi_trial() itself.
multi_trial_params = dict(
    starting_policy = agent.policy.state_dict(),
    n_faults = 10,
    lib_size = 3,
    library=[]
)
# Arguments that multi_trial() forwards to trial() for every fault.
trial_params = dict(
    alpha_inner = 1e-3,
    alpha_outer = 1e-3,
    n_inner = 4,
    n_outer = 1,
    data_model = False,
    post_steps = 30000
)

def multi_trial(starting_policy, n_faults=5, lib_size=3, library=[], **trial_params):
    """Apply a sequence of random faults to one system, adapting after each.

    After every fault the adapted policy is appended to the library, the
    library is pruned to `lib_size` with prune_library(), and the adapted
    policy becomes the starting point for the next fault.

    Uses the notebook globals `est`, `ppo_params`, `nominal_config`,
    `n_tanks`, `n_engines` and `tstep`.

    Args:
        starting_policy: Policy state dict loaded into the agent before the first fault.
        n_faults: Number of consecutive faults.
        lib_size: Library size passed to prune_library().
        library: Initial policy library. It is copied, never modified.
        **trial_params: Forwarded to trial().

    Returns:
        (rewards, rewards_benchmark, fault_times):
          * rewards: adapted-agent rewards concatenated over all faults.
          * rewards_benchmark: the same for the benchmark agent, or None when
            trial() returned no benchmark rewards (benchmark=False).
          * fault_times: index into `rewards` at which each fault starts.
    """
    # Work on a private copy. Appending to the argument itself mutated the
    # caller's list and, worse, the shared default `[]` across calls.
    library = list(library)
    tanks_ = TanksFactory(n = n_tanks, e = n_engines, **nominal_config)
    env_ = TanksPhysicalEnv(tanks_, tstep)
    agent_ = PPO(env_, **ppo_params)
    agent_.policy.load_state_dict(copy_tensor(starting_policy))
    rewards = []
    rewards_benchmark = []
    has_benchmark = True
    fault_times = [0]
    for _ in trange(n_faults, desc='Faults', leave=False):
        # Degrades tanks_ (and thereby env_) in place.
        random_degrade(tanks_)
        rew, params, memory = trial(env_, est, agent_.policy.state_dict(), library, **trial_params)
        library.append(params)
        library, _ = prune_library(library, lib_size, memory, **ppo_params)
        agent_.policy.load_state_dict(params)
        rewards.extend(rew[0])
        # trial() returns a single reward list when benchmark=False; indexing
        # rew[1] unconditionally raised IndexError in that case.
        if len(rew) > 1:
            rewards_benchmark.extend(rew[1])
        else:
            has_benchmark = False
        fault_times.append(fault_times[-1] + len(rew[0]))
    # nmulti_trials() treats None as "no benchmark".
    return rewards, (rewards_benchmark if has_benchmark else None), fault_times[:-1]

def nmulti_trials(n=NCPU, verbose=20, **params):
    """Run multi_trial() `n` times through joblib and aggregate the results.

    Args:
        n: Number of repetitions; at most NCPU jobs are used.
        verbose: Verbosity passed to joblib.Parallel.
        **params: Forwarded unchanged to multi_trial().

    Returns:
        ((mean, mean_benchmark), (std, std_benchmark), fault_times): reward
        statistics across repetitions and the mean fault onset indices. Each
        series is truncated to the shortest repetition first. The benchmark
        entries are None when the first repetition returned no benchmark.
    """
    run_all = Parallel(n_jobs=min(n, NCPU), verbose=verbose)
    res = run_all(delayed(multi_trial)(**params) for _ in range(n))

    def clip_and_stack(idx):
        # Element `idx` of every result, clipped to the shortest repetition
        # (the number of episodes per trial can differ).
        shortest = min([len(r[idx]) for r in res])
        return np.asarray([r[idx][:shortest] for r in res])

    benchmark = res[0][1] is not None

    # Rewards of the adapted agent
    rewards = clip_and_stack(0)
    mean, std = np.mean(rewards, axis=0), np.std(rewards, axis=0)

    # Benchmarks
    if benchmark:
        rewards_benchmark = clip_and_stack(1)
        mean_benchmark = np.mean(rewards_benchmark, axis=0)
        std_benchmark = np.std(rewards_benchmark, axis=0)
    else:
        mean_benchmark, std_benchmark = None, None

    # Faults: average the onset index of each fault over the repetitions
    fault_times = np.mean(clip_and_stack(2), axis=0)
    return (mean, mean_benchmark), (std, std_benchmark), fault_times

# Four repetitions of the multi-fault experiment; `f` holds the mean fault onset indices.
(r, r_b), (std, std_b), f = nmulti_trials(n=4, verbose=20, **multi_trial_params, **trial_params)

In [None]:
# Plot and save the multi-fault results from the previous cell.
pth = './bin/multi_fault/'
os.makedirs(pth, exist_ok=True)

plt.figure(figsize=(15, 6))
plot_adaption(r, r_b, std, std_b, f)

# The file name encodes the scalar settings; 'library' and 'starting_policy'
# are left out of the name.
fname = pth + sanitize_filename(str({k:v for k, v in multi_trial_params.items() \
                                     if k not in ('library', 'starting_policy')}) + str(trial_params))
plt.savefig(fname + '.png')
with open(fname+'.pickle', 'wb') as fi:
    pickle.dump(dict(trial_params=trial_params, multi_trial_params=multi_trial_params,
                     results=(r, r_b, std, std_b, f)), fi);