In [None]:
import sys, os
# Candidate locations of the two sibling source checkouts. The relative
# paths are resolved against the current working directory at the time this
# cell runs.
pyStateSpace_path = os.path.abspath('../pyStateSpace')
pyTorchBridge_path = os.path.abspath('../pyTorchBridge')
# Prefer an importable (e.g. installed) pystatespace. If the import fails,
# make the local checkout discoverable by appending it to sys.path once.
# Note that the import itself is not retried in this cell.
try:
    import pystatespace
except ImportError:
    if pyStateSpace_path not in sys.path:
        sys.path.append(pyStateSpace_path)
# Same fallback for pytorchbridge; the actual import used by the notebook
# happens in the imports cell below.
try:
    import pytorchbridge
except ImportError:
    if pyTorchBridge_path not in sys.path:
        sys.path.append(pyTorchBridge_path)

In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from copy import deepcopy
from multiprocessing import Pool
import warnings
warnings.filterwarnings("error", category=UserWarning)
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # disable tensorflow warning messages
os.makedirs(os.path.expanduser('~/Data/tensorboard/'), exist_ok=True)

import tensorflow as tf
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import gym
import pandas as pd
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, make_scorer
import matplotlib.pyplot as plt
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2
from tqdm.auto import tqdm, trange
from pytorchbridge import TorchEstimator

from utils import cache_function, cache_to_episodic_rewards, cache_to_episodes

# Three font sizes shared by every figure in this notebook.
SMALL_SIZE, MEDIUM_SIZE, BIGGER_SIZE = 15, 17, 19

# Global matplotlib defaults, grouped by rc section.
plt.rc('font', size=SMALL_SIZE)                              # default text size
plt.rc('axes', titlesize=SMALL_SIZE, labelsize=MEDIUM_SIZE)  # axes title; x/y labels
plt.rc('xtick', labelsize=SMALL_SIZE)                        # x tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)                        # y tick labels
plt.rc('legend', fontsize=SMALL_SIZE)                        # legend entries
plt.rc('figure', titlesize=BIGGER_SIZE)                      # figure-level title
plt.rc('lines', linewidth=2.5)                               # default line width

# Single seed for the whole experiment. It is handed to PPO2 explicitly when
# the agent is built; NumPy and PyTorch are seeded here as well so that the
# meta-network's random weight initialisation is repeatable after
# "Restart Kernel -> Run All".
seed = 0
np.random.seed(seed)
torch.manual_seed(seed)

In [None]:
class MetaNet(nn.Module):
    """Sequence model mapping a hyperparameter history to per-step outputs.

    A single-layer, batch-first LSTM (2 input features, 4 hidden units) is
    followed by a small fully connected head (4 -> 4 -> 1 with a ReLU in
    between) that is applied to every time step of the LSTM output.
    """

    def __init__(self):
        super().__init__()
        self.lstm = nn.LSTM(input_size=2, hidden_size=4, num_layers=1,
                            batch_first=True)
        head_layers = [nn.Linear(4, 4), nn.ReLU(), nn.Linear(4, 1)]
        self.fc = nn.Sequential(*head_layers)

    def forward(self, x):
        """Return the head's output for every step; the LSTM state is discarded."""
        sequence_features, _ = self.lstm(x)
        return self.fc(sequence_features)

In [None]:
def train_meta_network(estimator, hyperparameters, rewards):
    """Fit the meta-estimator on one hyperparameter/reward history.

    The network is trained to predict the rewards observed during training,
    so that it models reward as a function of hyperparameters and time.

    Both sequences are packed as a single batch entry before fitting:
    hyperparameters become an array of shape (1, len(hyperparameters), -1)
    and rewards an array of shape (1, -1, 1).
    """
    n_steps = len(hyperparameters)
    features = np.reshape(np.asarray(hyperparameters), (1, n_steps, -1))
    targets = np.reshape(np.asarray(rewards), (1, -1, 1))
    estimator.fit(features, targets)

def get_relative_gradients(network, hyperparameters):
    """Gradient of the latest predicted reward change w.r.t. the latest hyperparameters.

    The hyperparameter history is packed as one batch entry of shape
    (1, len(hyperparameters), -1) and pushed through ``network``. The scalar
    ``rewards[-1] - rewards[-2]`` (change in predicted reward over the last
    step) is then differentiated with respect to the input.

    Parameters
    ----------
    network : callable
        Differentiable torch module/function applied to the packed tensor.
    hyperparameters : sequence
        History of hyperparameter vectors, oldest first. At least two
        entries are needed to form a reward difference.

    Returns
    -------
    numpy.ndarray
        Gradient of the reward difference with respect to the most recent
        hyperparameter vector (same length as that vector).

    Raises
    ------
    ValueError
        If fewer than two hyperparameter vectors are supplied.
    """
    n_steps = len(hyperparameters)
    if n_steps < 2:
        raise ValueError(
            'need at least 2 hyperparameter vectors to form a reward '
            'difference, got {}'.format(n_steps))
    hyp = np.asarray(hyperparameters).reshape(1, n_steps, -1)
    hyp = torch.tensor(hyp, requires_grad=True)
    rewards = torch.squeeze(network(hyp))
    relative = rewards[-1] - rewards[-2]
    # Differentiate w.r.t. the input only. The previous code returned
    # hyp[0, -1] (the hyperparameter values themselves) instead of their
    # gradient. autograd.grad also leaves the network parameters' .grad
    # buffers untouched, unlike relative.backward().
    (grad,) = torch.autograd.grad(relative, hyp)
    return grad[0, -1].detach().cpu().numpy()
    
def deduce_hyperparameters_from_gradients(grad, hyperparameters, step_size=0.1):
    """Take one gradient-ascent step from the most recent hyperparameters.

    Parameters
    ----------
    grad : array-like
        Gradient with respect to the latest hyperparameter vector.
    hyperparameters : sequence
        Hyperparameter history; only the last entry is used.
    step_size : float, optional
        Multiplier applied to ``grad`` (default 0.1, the value that used to
        be hard-coded).

    Returns
    -------
    numpy.ndarray
        ``hyperparameters[-1] + step_size * grad``.
    """
    # Coerce both operands so tuple/list inputs behave like arrays.
    curr_hyp = np.asarray(hyperparameters[-1])
    new_hyp = curr_hyp + step_size * np.asarray(grad)
    return new_hyp

def set_hyperparameters(agent, hyp):
    """Write the two hyperparameters onto ``agent`` after applying lower bounds.

    ``hyp[0]`` becomes ``agent.learning_rate`` (floored at 1e-5) and
    ``hyp[1]`` becomes ``agent.cliprange`` (floored at 0). No upper bound is
    applied to either value.
    """
    learning_rate, cliprange = hyp[0], hyp[1]
    agent.learning_rate = np.clip(learning_rate, 1e-5, None)
    agent.cliprange = np.clip(cliprange, 0, None)

In [None]:
def trial(factor, periods, period_length, agent, env,
          hyp_init=None, metaest=None,
          return_caches=False, degrade_fn=None,
          rtype='episodic'):
    """Run ``periods`` rounds of online RL, optionally adapting hyperparameters.

    Each period: optionally degrade the environment, let ``agent`` learn for
    ``period_length`` timesteps, record one reward figure for the period and,
    when a meta-estimator is supplied, refit it and take a gradient step on
    the hyperparameters.

    Parameters
    ----------
    factor : number
        Forwarded unchanged to ``degrade_fn``.
    periods : int
        Number of learn/adapt rounds.
    period_length : int
        ``total_timesteps`` passed to ``agent.learn`` each period.
    agent
        Object providing ``set_env`` and ``learn(total_timesteps, callback)``.
    env
        Environment handed to ``agent.set_env``. When ``degrade_fn`` is
        given, ``env.envs[0]`` is what gets degraded.
    hyp_init : array-like, optional
        Initial hyperparameter vector. Required when ``metaest`` is given.
        It seeds the first two entries of the hyperparameter history.
    metaest : optional
        Meta-estimator; must provide ``fit`` and expose the torch network as
        ``.module``. ``None`` disables meta-learning.
    return_caches : bool
        Return the accumulated state/action/reward/done lists instead of the
        per-period rewards.
    degrade_fn : callable, optional
        Called as ``degrade_fn(env.envs[0], factor, period)`` before each
        period.
    rtype : {'episodic', 'temporal'}
        'episodic' records the mean of ``cache_to_episodic_rewards`` for the
        period; 'temporal' records the sum of the period's rewards.

    Returns
    -------
    One of, depending on the flags:
    ``cache`` | ``(cache, hyp)`` | ``rewards`` | ``(rewards, hyp)``,
    where ``cache`` is a dict of lists, ``rewards`` a NumPy array with one
    entry per period, and ``hyp`` the list of hyperparameter vectors.

    Raises
    ------
    ValueError
        If ``rtype`` is not recognised, or ``metaest`` is given without
        ``hyp_init``.
    """
    # Fail before any (slow) RL training instead of after the first period:
    # an unknown rtype would otherwise silently record no rewards, and a
    # missing hyp_init would only surface inside the meta-estimator fit.
    if rtype not in ('episodic', 'temporal'):
        raise ValueError(
            "rtype must be 'episodic' or 'temporal', got {!r}".format(rtype))
    if metaest is not None and hyp_init is None:
        raise ValueError('hyp_init is required when metaest is provided')

    if return_caches:
        cache = {'state': [], 'action': [], 'reward': [], 'done': []}
    rewards_rl = []

    # Initial hyperparameters:
    if metaest is not None:
        # For first two iterations, initial hyperparameters
        # are used.
        hyp = [np.asarray(hyp_init), np.asarray(hyp_init)]

    for period in trange(periods):
        if degrade_fn is not None:
            degrade_fn(env.envs[0], factor, period)

        # Online-learning + control
        agent.set_env(env)
        x_cache, u_cache, d_cache, r_cache = [], [], [], []
        # The callback is expected to fill the four lists during learning
        # (see utils.cache_function).
        agent.learn(total_timesteps=period_length,
                    callback=cache_function(x_cache, u_cache, d_cache, r_cache))

        # Accumulate rewards per period and other stats
        if rtype == 'episodic':
            rewards_rl.append(np.mean(cache_to_episodic_rewards(r_cache, d_cache)))
        elif rtype == 'temporal':
            rewards_rl.append(np.sum(r_cache))
        if return_caches:
            cache['state'].extend(x_cache)
            cache['action'].extend(u_cache)
            cache['reward'].extend(r_cache)
            cache['done'].extend(d_cache)

        # Meta-learning
        # At least 2 reward accumulations are needed before a gradient step.
        if metaest is not None:
            if period == 0:
                # Only one reward so far: fit on the first hyperparameter entry.
                train_meta_network(metaest, hyp[:1], rewards_rl)
            if period >= 1:
                print('Period', period)
                # hyp and rewards_rl have equal length here (period + 1).
                train_meta_network(metaest, hyp, rewards_rl)
                grad = get_relative_gradients(metaest.module, hyp)
                hyp.append(deduce_hyperparameters_from_gradients(grad, hyp))
                set_hyperparameters(agent, hyp[-1])

    if return_caches:
        if metaest is not None:
            return (cache, hyp)
        return cache
    if metaest is not None:
        return np.asarray(rewards_rl), hyp
    return np.asarray(rewards_rl)

In [None]:
# Approach - 1

# Training/Testing Loop:
#  Train agent for N steps,
#  Train metanetwork
#  Get gradients of reward w.r.t hyperparameters
#  Change hyperparameters


In [None]:
# Settings consumed when the PPO2 agent is built and the trial is run.
update_interval = 20           # passed to PPO2 as n_steps
period_length = 500            # timesteps the agent learns per trial period
minibatch = 10                 # passed to PPO2 as nminibatches
policy_learning_rate = 1e-3    # PPO2 learning_rate
# Two shared 128-unit layers, then separate 32-32 value (vf) and policy (pi) heads.
policy_arch = [128, 128, {'vf': [32, 32], 'pi': [32, 32]}]

In [None]:
def degrade_cartpole(env, factor, time):
    """Degrade a CartPole environment as a linear function of ``time``.

    Sets the cart mass to ``1.0 + time*factor*0.1`` and the force magnitude
    to ``10.0 - time*factor*1``.

    Parameters
    ----------
    env
        The environment, or a gym wrapper around it.
    factor : number
        Degradation rate multiplier.
    time : number
        Elapsed periods. Note that ``time*factor >= 10`` drives
        ``force_mag`` to zero or below; no clamping is applied.
    """
    # gym.make() returns a wrapper. Assigning attributes on a wrapper creates
    # them on the wrapper object only, so the physics would never see the
    # change. Write to the underlying environment instead.
    target = getattr(env, 'unwrapped', env)
    target.masscart = 1.0 + time*factor*0.1
    target.force_mag = 10.0 - time*factor*1
    # CartPole caches masspole + masscart as total_mass at construction and
    # its dynamics read that cached value, so it must be refreshed too.
    masspole = getattr(target, 'masspole', None)
    if masspole is not None:
        target.total_mass = masspole + target.masscart

# Single-environment vectorised CartPole task. Alternative task kept for reference:
# env_ = DummyVecEnv([lambda: gym.make('BipedalWalker-v2')])
env_ = DummyVecEnv([lambda: gym.make('CartPole-v1')])

# PPO2 agent configured from the constants defined in the settings cell.
agent = PPO2(
    MlpPolicy,
    env_,
    learning_rate=policy_learning_rate,
    n_steps=update_interval,
    nminibatches=minibatch,
    policy_kwargs={'net_arch': policy_arch},
    seed=seed,
    tensorboard_log=os.path.expanduser('~/Data/tensorboard/'),
    verbose=0,
)

In [None]:
# Meta-estimator: the LSTM meta-network wrapped with an Adam optimiser and an
# MSE loss.
metanet = MetaNet().double()  # Numpy uses float64 by default
metaparams = metanet.parameters()
metaloss = nn.MSELoss()
metaoptim = optim.Adam(metaparams, lr=1e-1)
metaest = TorchEstimator(
    metanet, metaoptim, metaloss,
    epochs=1000, tol=0.001, max_tol_iter=4, batch_size=1, verbose=True,
)

# Run the adaptive trial. degrade_fn is left at its default (None), so the
# environment is not altered between periods. With return_caches=True and a
# meta-estimator, trial returns the step caches and the hyperparameter history.
cache, hyp = trial(
    factor=1,
    periods=5,
    period_length=period_length,
    agent=agent,
    env=env_,
    hyp_init=(1e-3, 0.2),
    metaest=metaest,
    return_caches=True,
)

In [None]:
# Run the fitted meta-estimator over the full hyperparameter history, packed
# as a single batch entry with two features per step: shape (1, n_steps, 2).
metaest.predict(np.asarray(hyp).reshape(1, -1, 2))

In [None]:
# Collapse the flat reward/done caches from the trial and plot the result in
# order. Presumably this is one value per episode; see
# utils.cache_to_episodic_rewards to confirm.
er = cache_to_episodic_rewards(cache['reward'], cache['done'])
plt.plot(er)

In [None]:
# First component of every stored hyperparameter vector. set_hyperparameters
# maps index 0 to the agent's learning rate and index 1 to its clip range.
# The stored values are the raw (unclipped) ones.
plt.plot(np.asarray([h[0] for h in hyp]))
# plt.plot(np.asarray([h[1] for h in hyp]))

In [None]:
import time

# Roll out the trained agent for one episode on the underlying environment,
# rendering each step when a display is available.
e = env_.envs[0]
# degrade_cartpole(e, 1, 10)
s = e.reset()
try:
    while True:
        # agent.predict returns a tuple; element 0 is the action.
        a = agent.predict(s)
        print(a[0])
        s, r, d, _ = e.step(a[0])
        if d:
            break
        try:
            e.render()
            time.sleep(0.02)
        except KeyboardInterrupt:
            # Interrupting the kernel stops playback quietly.
            break
        except Exception as exc:
            # Rendering is best-effort (e.g. no display); report and stop
            # instead of swallowing every error with a bare except.
            print('Rendering stopped:', exc)
            break
finally:
    # Close the environment on every exit path, not only when the episode
    # finishes normally.
    e.close()

In [None]:
# Approach - 2

# Training Loop:
#  Train agent to convergence,
#  Train metanetwork
#  Change environment i.e. task

# Testing:
#  Apply agent for N steps,
#  Get gradients of reward w.r.t hyperparameters
#  Change hyperparameters