In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib notebook

import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import torch
from torch.distributions import Bernoulli, MultivariateNormal, Categorical
import gym

import notebook_setup
from tqdm.auto import tqdm, trange
from systems import CartPoleEnv
from systems import CartPoleContinuousEnv
from ppo import ActorCriticDiscrete, ActorCriticMultiBinary, ActorCriticBox, PPO, DEVICE, Memory, returns

# Policies

## Discrete

In [None]:
# Hyperparameters for the discrete-action run; unpacked into PPO(...) as keyword arguments.
ppo_params = {
    'state_dim': 4,
    'action_dim': 2,
    'n_latent_var': 32,
    'lr': 0.02,
    'epochs': 5,
    'update_interval': 500,
}

In [None]:
# Build a PPO agent with the discrete actor-critic on a fresh CartPoleEnv, call
# agent.learn(30000), and scatter the returned rewards against their index.
# NOTE(review): whether 30000 counts timesteps or episodes is decided in ppo.py — confirm.
agent = PPO(CartPoleEnv(), ActorCriticDiscrete, **ppo_params)
rewards = agent.learn(30000)
plt.scatter(np.arange(len(rewards)), rewards)

In [None]:
# Instantiate the discrete actor-critic on its own; as the cell's last
# expression, the resulting object is shown as the cell output.
ActorCriticDiscrete(state_dim=4, action_dim=2, n_latent_var=32)

## Continuous

In [None]:
# Continuous-action Gym environment; state and action sizes are read from its spaces.
env = gym.make('LunarLanderContinuous-v2')

ppo_params = {
    'state_dim': env.observation_space.shape[0],
    'action_dim': env.action_space.shape[0],
    'n_latent_var': 64,
    'lr': 0.0003,
    'epochs': 75,
    'update_interval': 3000,
}

In [None]:
# Train PPO with the Box (continuous-action) actor-critic on the current `env`
# and scatter the rewards returned by agent.learn against their index.
agent = PPO(env, ActorCriticBox, **ppo_params)
rewards = agent.learn(3000)
plt.scatter(np.arange(len(rewards)), rewards)

In [None]:
# Continuous-action CartPole; state and action sizes are read from the env's spaces.
env = CartPoleContinuousEnv()

ppo_params = {
    'state_dim': env.observation_space.shape[0],
    'action_dim': env.action_space.shape[0],
    'n_latent_var': 32,
    'lr': 0.02,
    'epochs': 25,
    'update_interval': 500,
}

In [None]:
# Train PPO with the Box actor-critic on the current `env` and scatter the
# rewards returned by agent.learn against their index.
agent = PPO(env, ActorCriticBox, **ppo_params)
rewards = agent.learn(10000)
plt.scatter(np.arange(len(rewards)), rewards)

## Discretized Continuous

In [None]:
class ActorCriticBoxDiscrete(ActorCriticBox):
    """ActorCriticBox variant whose `predict` hands back a binary integer action.

    The parent's action is clipped to [0, 1] and rounded, so the returned
    action is always the int 0 or 1.
    """

    def predict(self, state):
        """Return `(action, logprob)` with the parent's action collapsed to 0 or 1.

        The log-probability is passed through from the parent unchanged.
        NOTE(review): it presumably refers to the parent's original action, not
        the rounded one returned here — confirm this is what PPO expects.
        """
        raw_action, logprob = super().predict(state)
        # .item() requires a single-element action.
        bounded = np.clip(raw_action.item(), 0, 1)
        return int(np.round(bounded)), logprob

    def evaluate(self, state, action):
        """Delegate to the parent's `evaluate` and return its three results as a tuple."""
        logprobs, value, entropy = super().evaluate(state, action)
        return logprobs, value, entropy

# Train the discretizing Box policy on CartPoleEnv and scatter the rewards
# returned by agent.learn against their index.
env = CartPoleEnv()

# action_dim=1: ActorCriticBoxDiscrete.predict calls action.item(), which only
# works on a single-element action.
ppo_params = dict(
    state_dim=4,
    action_dim=1,
    n_latent_var=64,
    lr=0.002,
    epochs=50,
    update_interval=500
)

agent = PPO(env, ActorCriticBoxDiscrete, **ppo_params)
rewards = agent.learn(10000)
plt.scatter(np.arange(len(rewards)), rewards)

# Quadcopter

In [None]:
from systems.quadcopter import (Quadcopter, QuadcopterSupervisorEnv, Controller, plot_quadcopter,
                                QUADPARAMS, CONTROLLER_PARAMS)

In [None]:
# Controller settings used by this notebook. Assigning here replaces the
# CONTROLLER_PARAMS name imported from systems.quadcopter for all later cells.
CONTROLLER_PARAMS = {
    "Motor_limits": [4000, 9000],
    "Tilt_limits": [-10, 10],  # degrees
    "Yaw_Control_Limits": [-900, 900],
    "Z_XY_offset": 500,
    "Linear_To_Angular_Scaler": [1, 1, 0],
    "Yaw_Rate_Scaler": 0.18,
    "Linear_PID": {
        "P": [300, 300, 0],
        "I": [0.04, 0.04, 0],
        "D": [450, 450, 0],
    },
    "Angular_PID": {
        "P": [22000, 22000, 1500],
        "I": [0, 0, 1.2],
        "D": [12000, 12000, 0],
    },
}

In [None]:
# Supervisor environment wrapping a Controller built with the notebook's
# CONTROLLER_PARAMS, then reset with an explicit start position, target and
# zeroed rates/orientation. The reset call's return value is the cell output.
env = QuadcopterSupervisorEnv(Controller(Quadcopter(), params=CONTROLLER_PARAMS), deterministic_reset=True)
env.reset(position=(0,0,0), target=(0,0,5), linear_rate=(0,0,0), orientation=(0,0,0), angular_rate=(0,0,0))

In [None]:
# Roll the environment forward for up to T-1 steps with a constant action of 0.
# and all motor faults set to 0, recording the quadcopter position after every
# step and accumulating the reward.
T = 5000
label_ = 'No fault'  # legend label picked up by the plotting cell
pos_ = np.zeros((T, 3))  # one row of three position components per step
env.reset()
env.ctrl.quadcopter.set_motor_faults([0, 0, 0, 0])
pos_[0] = env.start
R = 0.  # running sum of step rewards
rewards = []
for t in trange(1, T, leave=False):
    _, r, done, _ = env.step(0.)
    rewards.append(r)
    R += r
    pos_[t] = env.ctrl.quadcopter.state[:3]  # first three state entries are stored as the position
    if done:
        pos_ = pos_[:t+1]  # drop the still-zero rows past the last recorded step
        break
print('Reward:', R)
# plt.plot(rewards)

In [None]:
# Two stacked panels: the 3-D flight path takes the top two grid rows and the
# per-coordinate traces take the bottom row.
fig = plt.figure(figsize=(8, 8), constrained_layout=True)
gs = fig.add_gridspec(3, 1)

# 3-D path, subsampled to every 10th recorded position.
ax = fig.add_subplot(gs[0:2, 0], projection='3d')
ax.plot(*pos_[::10].T, 'r.-', label=label_)
for point, caption in ((env.start, "start"), (env.end, "end")):
    ax.text(*point, caption)
# Size the box by the span of each axis so all three share the same scale.
ax_lims = np.asarray([ax.get_xlim3d(), ax.get_ylim3d(), ax.get_zlim3d()])
ax.set_box_aspect(np.ptp(ax_lims, axis=1))
ax.legend()

# Coordinate traces over all recorded steps, one line style per coordinate.
ax = fig.add_subplot(gs[2:, 0])
for column, (fmt, axis_name) in enumerate((('r:', 'x'), ('r-', 'y'), ('r--', 'z'))):
    ax.plot(pos_[:, column], fmt, label=axis_name)
ax.legend()

plt.show()

In [None]:
# PPO settings for the quadcopter supervisor; sizes are read from the current env's spaces.
ppo_params = {
    'state_dim': env.observation_space.shape[0],
    'action_dim': env.action_space.shape[0],
    'n_latent_var': 64,
    'activation': torch.nn.Tanh,  # final layer activation
    'lr': 0.02,
    'epochs': 25,
    'update_interval': 2000,
}

# Parallel accumulators: one entry per trained agent is appended to each list.
REWARDS, AGENTS, LABELS = [], [], []

In [None]:
# Learn supervision - no fault
# Trains a Box actor-critic PPO agent on a seeded, non-deterministic-reset env
# and records the agent, its learn() output and a label in the shared lists.
# NOTE(review): Controller() is built without params=CONTROLLER_PARAMS here,
# unlike the rollout cells — confirm the Controller defaults are intended.
# NOTE(review): re-running this cell appends a second entry to each list.
env = QuadcopterSupervisorEnv(Controller(Quadcopter()), seed=0, deterministic_reset=False)
env.reset(position=(0,0,5), target=(3,3,5), linear_rate=(0,0,0), orientation=(0,0,0), angular_rate=(0,0,0))
agent = PPO(env, ActorCriticBox, **ppo_params)
# Re-initialise every policy parameter from N(0, 0.01) before training.
for t in agent.policy.parameters():
    torch.nn.init.normal_(t, 0., 0.01)
AGENTS.append(agent)
REWARDS.append(agent.learn(25000))
LABELS.append('No fault')

In [None]:
# Learn supervision
# Same training recipe as the no-fault cell, but with motor faults
# [0, 0, 0, 0.20] applied to the quadcopter; results are stored under 'M4'.
# NOTE(review): faults are set BEFORE env.reset() here, while the other cells
# in this notebook reset first and set faults afterwards — confirm that reset
# does not clear the fault configuration.
env = QuadcopterSupervisorEnv(Controller(Quadcopter()), seed=0, deterministic_reset=False)
env.ctrl.quadcopter.set_motor_faults([0, 0, 0, 0.20])
env.reset(position=(0,0,5), target=(3,3,5), linear_rate=(0,0,0), orientation=(0,0,0), angular_rate=(0,0,0))
agent = PPO(env, ActorCriticBox, **ppo_params)
# Re-initialise every policy parameter from N(0, 0.01) before training.
for t in agent.policy.parameters():
    torch.nn.init.normal_(t, 0., 0.01)
AGENTS.append(agent)
REWARDS.append(agent.learn(25000))
LABELS.append('M4')

In [None]:
# One scatter series per recorded training run, labelled for the legend.
# Note: this rebinds the notebook-level `rewards` name to the last run's rewards.
for (rewards, label) in zip(REWARDS, LABELS):
    plt.scatter(np.arange(len(rewards)), rewards, label=label)
plt.legend()

In [None]:
# Fresh deterministic-reset env with all motor faults set to 0, reset with an
# explicit start position and target, then passed to plot_quadcopter together
# with every agent collected so far. The helper's return value is kept in `positions`.
env = QuadcopterSupervisorEnv(Controller(Quadcopter()), deterministic_reset=True)
env.reset(position=(0,0,25), target=(10,10,10), linear_rate=(0,0,0), orientation=(0,0,0), angular_rate=(0,0,0))
env.ctrl.quadcopter.set_motor_faults([0, 0, 0, 0])
positions = plot_quadcopter(env, *AGENTS, labels=LABELS)

In [None]:
# Sanity check: give a fresh agent near-zero weights, print every parameter,
# then print the actions it produces along a short rollout (at most 20 steps).
agent = PPO(env, ActorCriticBox, **ppo_params)
for param in agent.policy.parameters():
    torch.nn.init.normal_(param, 0., 1e-9)
    print(param)
s = env.reset()
for t in range(20):
    a, _ = agent.predict(s)
    # Feed the new observation back into the policy. The return value of
    # env.step used to be discarded, so every action was predicted from the
    # initial state.
    s, _, done, _ = env.step(a)
    print(a)
    if done:
        # Stop at the end of the episode instead of stepping a finished env.
        break

## Clustering

In [None]:
# Deterministic-reset supervisor env using the notebook's CONTROLLER_PARAMS,
# reset with an explicit start position and target.
env = QuadcopterSupervisorEnv(Controller(Quadcopter(), params=CONTROLLER_PARAMS), deterministic_reset=True)
env.reset(position=(0,0,0), target=(0,0,5), linear_rate=(0,0,0), orientation=(0,0,0), angular_rate=(0,0,0))

class PIDAgent(PPO):
    """PPO subclass whose `predict` returns the environment controller's own output.

    `PPO.__init__` is not called, so only the attributes assigned in
    `__init__` below are set on the instance.
    """

    def __init__(self, env, **kwargs):
        """Keep the environment and its controller; extra keyword arguments are ignored."""
        self.gamma = 0.99
        self.ctrl = env.ctrl
        self.env = env

    def predict(self, state):
        """Return `(control, 1.0)`, where control comes from `ctrl.get_control()`.

        `state` is not consulted.
        NOTE(review): 1.0 presumably fills the log-probability slot of
        PPO.predict's return value — confirm how callers use it.
        """
        control = self.ctrl.get_control()
        return control, 1.0

# NOTE(review): `pida` is not referenced again in the visible cells — confirm it is still needed.
pida = PIDAgent(env)

In [None]:
# For each fault configuration, collect experience with a PID agent and turn
# the rollout into normalized returns plus state/action tensors.
# NOTE(review): `a`, `m`, `ret`, `states` and `actions` are overwritten on every
# iteration, so only the last fault's data survives the loop — confirm intent.
# Each entry has four values (presumably one per motor — verify against
# Quadcopter.set_motor_faults).
faults = [
    [0, 0, 0, 0.20],
    [0, 0, 0.1, 0.20],
    [0, 0.2, 0, 0.20],
    [0.1, 0, 0, 0.20],
    [0.1, 0, 0.2, 0.20],
]
env.reset()
for fault in faults:
    env.ctrl.quadcopter.set_motor_faults(fault)
    # ppo_params is accepted by PIDAgent.__init__ via **kwargs but not used there.
    a = PIDAgent(env, **ppo_params)
    m = Memory()
    a.experience(m, ppo_params['update_interval'], env, a)

    # Returns computed from the rollout, then standardized to zero mean and
    # unit spread; the 1e-5 keeps the division finite when std is 0.
    ret = torch.tensor(returns(m.rewards, m.is_terminals, a.gamma)).float().to(DEVICE)
    ret = (ret - ret.mean()) / (ret.std() + 1e-5)
    states = torch.tensor(m.states).float().to(DEVICE).detach()
    actions = torch.tensor(m.actions).float().to(DEVICE).detach()

In [None]:
# NOTE(review): `prune_library`, `library` and `rank_policies` are not defined
# or imported in any visible cell; on a fresh kernel this cell raises NameError
# unless they come from elsewhere — confirm and add the import/definition.
# `m` is the Memory left over from the last iteration of the fault loop.
_, distances, _ = prune_library(library, len(library), m, **ppo_params)
# Turn distances into affinities with an exponential kernel scaled by their std.
affinities = np.exp(-distances / distances.std())
print(distances)

_, expected_returns = rank_policies(m, library, **ppo_params)
print(expected_returns)

In [None]:
from sklearn.cluster import DBSCAN, SpectralClustering, AffinityPropagation
from pprint import pprint
import pandas as pd

# normalized metrics
# Both matrices are divided by their own maximum, so the largest entry is 1.
distancen = distances / distances.max()
affinityn = affinities / affinities.max()

# (estimator, precomputed matrix it is fitted on, result-column name).
# DBSCAN is given the distance matrix; the other two are given affinities.
# NOTE(review): `SEED` is not defined in any visible cell — confirm where it
# comes from, otherwise this list raises NameError on a fresh kernel.
clusterers = [
    (DBSCAN(eps=0.5, min_samples=2, metric='precomputed'), distancen, 'DBSCAN'),
    (SpectralClustering(n_clusters=2, affinity='precomputed'), affinityn, 'Spectral'),
    (AffinityPropagation(affinity='precomputed', random_state=SEED), affinityn, 'Affinity')
]

# One column of cluster labels per clusterer, next to the expected returns.
res = dict(expected_returns=expected_returns)
for clusterer, data, name in clusterers:
    labels = clusterer.fit_predict(data)
    res[name] = labels
res = pd.DataFrame(res)
res

## Stable baselines

In [None]:
from stable_baselines3 import PPO as PPO2
from stable_baselines3.ppo import MlpPolicy
from stable_baselines3.common.vec_env import SubprocVecEnv

In [None]:
def env_fn(seed=0):
    """Build one quadcopter supervisor environment with `deterministic_reset=False`.

    Args:
        seed: Seed forwarded to `QuadcopterSupervisorEnv`. Defaults to 0 so the
            function can still be called without arguments.

    Returns:
        The newly constructed environment.
    """
    env = QuadcopterSupervisorEnv(Controller(Quadcopter()), seed=seed, deterministic_reset=False)
    # Optional fault injection, currently disabled:
    # env.ctrl.quadcopter.set_motor_faults([0, 0, 0, 0.25])
    return env


# Bind each seed as a default argument so every worker gets its own value. A
# bare `lambda: ...seed=i...` inside the comprehension looks `i` up only when
# the lambda is used, after the comprehension has finished, so all four
# workers would be seeded with 3.
envs = SubprocVecEnv([lambda seed=seed: env_fn(seed) for seed in range(4)])

In [None]:
# Train the stable-baselines3 PPO implementation on the vectorized envs.
a = PPO2(MlpPolicy, envs, verbose=1)
try:
    a.learn(total_timesteps=50000)
finally:
    # Always shut the worker subprocesses down, even if training is
    # interrupted or raises, so they are not left running.
    envs.close()

In [None]:
# Register the stable-baselines3 agent alongside the custom PPO agents.
# An empty reward list is appended so REWARDS stays aligned with AGENTS/LABELS.
AGENTS.append(a)
REWARDS.append([])
LABELS.append('Baselines')