<a href="https://colab.research.google.com/github/julia-lina-tan/rl-policy-fusion/blob/main/rl_policy_fusion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RL Policy Fusion

This notebook trains PPO agents from Stable Baselines3 in the "Cart Pole" Gym environment.

Set up git environment so we can commit local files.

In [None]:
import getpass

# Configure git
!git init
!git config — global user.email 'julialtan5838@gmail.com'
!git config — global user.name 'julia-lina-tan'
!git add -A

username='julia-lina-tan'
password=getpass.getpass('password: ')
!git remote add origin https://${username}:${password}@github.com/${username}/rl-policy-fusion.git

Clear working directory of all existing folders.

In [1]:
import os
import shutil

# Remove all non-system folders.
# These hidden folders are kept: they hold tool configuration and the git repository.
PROTECTED_DIRS = {'.config', '.ipynb_checkpoints', '.git'}

for entry in os.listdir():
    if entry in PROTECTED_DIRS:
        continue
    target = os.path.join('../content', entry)
    # shutil.rmtree raises on plain files and on symlinks, so only real
    # directories are removed; anything else is left untouched.
    if os.path.isdir(target) and not os.path.islink(target):
        shutil.rmtree(target)

# Install Stable Baselines and dependencies

In [None]:
pip install stable-baselines3[extra]

In [None]:
!apt install swig cmake
!pip install stable-baselines3[extra] box2d box2d-kengz

Additional installations/imports for rendering Gym environment.

In [None]:
# Install Xvfb (the display backend referred to in the comment below) and X11 utilities.
!apt-get install -y xvfb x11-utils
# Install pinned versions of gym (with Box2D extras), pyvirtualdisplay and PyOpenGL.
!pip install gym[box2d]==0.17.* pyvirtualdisplay==0.2.* PyOpenGL==3.1.* PyOpenGL-accelerate==3.1.* 
# ImageMagick is the writer used later in this notebook when saving episode GIFs.
!apt-get install imagemagick

import pyvirtualdisplay
# Create and start a non-visible 1400x900 display.
_display = pyvirtualdisplay.Display(visible=False,  # use False with Xvfb
                                    size=(1400, 900))
_ = _display.start()

# Import RL policy and agent

In [3]:
import gym
import numpy as np
import matplotlib.pyplot as plt

from stable_baselines3 import PPO

# Import Gym environment and instantiate agent

Cart Pole environment: [https://gym.openai.com/envs/CartPole-v1/](https://gym.openai.com/envs/CartPole-v1/)

"*A pole is attached by an un-actuated joint to a cart, which moves along a frictionless track. The system is controlled by applying a force of +1 or -1 to the cart. The pendulum starts upright, and the goal is to prevent it from falling over.* 

*A reward of +1 is provided for every timestep that the pole remains upright. The episode ends when the pole is more than 15 degrees from vertical, or the cart moves more than 2.4 units from the center.*"


In [None]:
import importlib
import sys

# Get cartpole class (NOTE: we need to import from source code to modify env variables)
!git clone https://github.com/openai/gym.git

MODULE_PATH = '../content/gym/gym/envs/classic_control/cartpole.py' 
MODULE_NAME = 'cartpole'

spec = importlib.util.spec_from_file_location(MODULE_NAME, MODULE_PATH)
module = importlib.util.module_from_spec(spec)
sys.modules[spec.name] = module 
spec.loader.exec_module(module)

from cartpole import CartPoleEnv
from stable_baselines3.common.monitor import Monitor

Make mass and pole length configurable.

In [5]:
def config_env(env, masscart=1.0, masspole=0.1, length=0.5):
    """
    Set the cart mass, pole mass and pole length of an environment.

    The values are written to ``env.unwrapped`` when that attribute exists, and
    to ``env`` itself otherwise. This matters when ``env`` is a wrapper: plain
    attribute assignment on a wrapper object only creates attributes on the
    wrapper, so the wrapped environment would keep its original values.

    The derived attributes ``total_mass`` and ``polemass_length`` are recomputed
    from the new values so they stay consistent.

    :param env: the environment (wrapped or not) to configure
    :param masscart: (float) value assigned to ``masscart``
    :param masspole: (float) value assigned to ``masspole``
    :param length: (float) value assigned to ``length``
    """
    # Resolve the innermost environment; fall back to `env` for plain objects
    base_env = getattr(env, 'unwrapped', env)

    base_env.masscart = masscart
    base_env.masspole = masspole
    base_env.total_mass = base_env.masscart + base_env.masspole
    base_env.length = length
    base_env.polemass_length = base_env.masspole * base_env.length

We choose the `MlpPolicy` because the input of Cart Pole is a feature vector, not an image.

This MLP has 2 hidden layers of 64 units each.

In [6]:
os.makedirs('../content/agent1', exist_ok=True)

# Configure the raw environment BEFORE wrapping it. Attribute assignment on the
# Monitor wrapper would only create attributes on the wrapper object, leaving
# the wrapped CartPoleEnv with its default values.
base_env = CartPoleEnv()
config_env(base_env, length=5.0)
env = Monitor(base_env)

# First agent: MLP policy, fixed seed for reproducibility
model = PPO('MlpPolicy', env, verbose=1, seed=1)

Using cpu device
Wrapping the env in a DummyVecEnv.


We load a [helper function](https://stable-baselines3.readthedocs.io/en/master/common/evaluation.html) to evaluate the agent, and define a plotting function to help visualise the rewards.

In [7]:
from stable_baselines3.common.evaluation import evaluate_policy

def plot_rewards(mean_reward, title=None):
    """
    Draw a line chart with one marked point per episode.

    :param mean_reward: (sequence) reward values in episode order; the x-axis
        numbers them starting from 1
    :param title: (str) title shown above the chart
    """
    # One tick (and one x position) per episode, numbered from 1
    episode_numbers = list(range(1, len(mean_reward) + 1))

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.set_title(title)
    ax.set_xlabel('Episodes')
    ax.set_ylabel('Rewards at episode')
    ax.set_xticks(episode_numbers)
    ax.plot(episode_numbers, mean_reward, marker='o')
    plt.show()

We evaluate the untrained random agent.

In [None]:
# Separate env for evaluation (created with the default Cart Pole settings)
eval_env = Monitor(CartPoleEnv())

# Random agent, before training
# NOTE(review): this call evaluates on `env`; `eval_env` is not used in this cell.
ep_rewards, ep_steps = evaluate_policy(
    model,
    env,
    n_eval_episodes=10,
    deterministic=True,
    return_episode_rewards=True,
)

# Report the mean and standard deviation, then plot the reward of each episode
print(f'mean reward={(sum(ep_rewards)/len(ep_rewards)):.2f} +/- {np.std(ep_rewards):.2f}')
plot_rewards(ep_rewards, title='Rewards over evaluation episodes')

## Train the agent and save it

In [None]:
# Train the agent
# total_timesteps is a step count, so pass an int (1e4 is a float literal);
# this matches how the second agent is trained in this notebook.
model.learn(total_timesteps=int(1e4))

# Save the agent
model.save('../content/agent1/model')

# Can also delete and load the model afterwards
# del model  
# model = PPO.load('../content/agent1/model')

## Evaluate the trained agent



On training environment:

In [None]:
# Evaluate the current model on `env` for 10 deterministic episodes
ep_rewards, ep_steps = evaluate_policy(
    model,
    env,
    n_eval_episodes=10,
    deterministic=True,
    return_episode_rewards=True,
)

# Report the mean and standard deviation, then plot the reward of each episode
print(f'mean reward={(sum(ep_rewards)/len(ep_rewards)):.2f} +/- {np.std(ep_rewards):.2f}')
plot_rewards(ep_rewards, title='Rewards over evaluation episodes')

On evaluation environment:

In [None]:
# Evaluate the current model on `eval_env` for 10 deterministic episodes
ep_rewards, ep_steps = evaluate_policy(
    model,
    eval_env,
    n_eval_episodes=10,
    deterministic=True,
    return_episode_rewards=True,
)

# Report the mean and standard deviation, then plot the reward of each episode
print(f'mean reward={(sum(ep_rewards)/len(ep_rewards)):.2f} +/- {np.std(ep_rewards):.2f}')
plot_rewards(ep_rewards, title='Rewards over evaluation episodes')

## Render Environment

Demonstrate policy by rendering in environment over a number of evaluation episodes.

In [12]:
import cv2
from google.colab.patches import cv2_imshow
from matplotlib import animation

def save_frames_as_gif(frames, path='../content', filename='gym_animation.gif'):
    """
    Save a sequence of image frames as an animated GIF.

    :param frames: (list) image arrays; the first frame's shape sets the figure size
    :param path: (str) directory to write into, with or without a trailing slash
    :param filename: (str) name of the GIF file created inside ``path``
    """
    import os

    #Mess with this to change frame size
    plt.figure(figsize=(frames[0].shape[1] / 72.0, frames[0].shape[0] / 72.0), dpi=72)

    patch = plt.imshow(frames[0])
    plt.axis('off')

    def animate(i):
        # Swap the displayed image for frame i
        patch.set_data(frames[i])

    anim = animation.FuncAnimation(plt.gcf(), animate, frames = len(frames), interval=50)
    # Join with a separator: plain concatenation turns the default
    # path '../content' into '../contentgym_animation.gif'.
    anim.save(os.path.join(path, filename), writer='imagemagick', fps=60)

def render_agent_in_env(agent, env, n_eval_episodes=5, path='../content', filename='gym_animation', max_steps=500):
    """
    Run an agent in an environment and save one GIF per episode.

    Each frame is labelled with its episode number. The GIF for episode ``i``
    is written as ``<filename>-ep<i>.gif`` via ``save_frames_as_gif``.

    :param agent: the agent whose ``predict`` method chooses the actions
    :param env: the environment to step and render
    :param n_eval_episodes: (int) number of episodes to render
    :param path: (str) directory passed on to ``save_frames_as_gif``
    :param filename: (str) file name prefix, without extension
    :param max_steps: (int) upper bound on the steps rendered per episode
    """
    for i in range(n_eval_episodes):
        frames = []
        obs = env.reset()
        for t in range(max_steps):

            #Render to frames buffer
            frame = np.array(env.render('rgb_array'))
            cv2.putText(frame, text=f'Episode {i+1}', org=(50,50), fontFace=cv2.FONT_HERSHEY_DUPLEX, fontScale=0.8, color=(0,0,0))
            frames.append(frame)

            # Use the agent that was passed in, not the notebook-global `model`
            action, _states = agent.predict(obs)
            obs, rewards, done, info = env.step(action)
            if done:
                break
        save_frames_as_gif(frames, path=path, filename=f'{filename}-ep{i+1}.gif')

In [None]:
# Render 3 episodes in `env` and save them as '../content/agent1/trained-agent-ep<N>.gif'
render_agent_in_env(model, env, n_eval_episodes=3, path='../content/agent1/', filename='trained-agent')

# Get Policy Network

In [13]:
# Get the parameters of the current model; the 'policy' entry is read further down
model_params = model.get_parameters()

def get_policy_net(model_params, net='action'):
    """
    Look up the weight entry of the action net or the value net.

    :param model_params: (dict) the model parameters; its ``policy`` entry is read
    :param net: (str) the net type to return; either ``action`` or ``value``
    :return: the value stored under ``<net>_net.weight`` in the ``policy`` entry,
        or ``None`` when that key is absent
    :raises ValueError: if ``net`` is neither ``action`` nor ``value``
    """
    if net not in ('action', 'value'):
        raise ValueError('Must be either action net or value net')

    policy_params = model_params.get('policy')
    weight_key = net + '_net.weight'
    return policy_params.get(weight_key)

# Action-net weights of the first agent
action_net = get_policy_net(model_params, net='action')

# Collection of the policy nets of the individual agents, starting with the first
all_policies = [action_net]

# Introduce second agent

We use a different random seed when creating another PPO model. We also alter the environment variables in some way.

In [14]:
os.makedirs('../content/agent2', exist_ok=True)

# Configure the raw environment BEFORE wrapping it. Attribute assignment on the
# Monitor wrapper would only create attributes on the wrapper object, leaving
# the wrapped CartPoleEnv with its default values.
base_env = CartPoleEnv()
config_env(base_env, masscart=10.0, masspole=5.0)
env = Monitor(base_env)

# Second agent: same policy type, different seed
model = PPO('MlpPolicy', env, verbose=1, seed=2)

Using cpu device
Wrapping the env in a DummyVecEnv.


## Evaluate the second agent

We evaluate the untrained agent.

In [None]:
# Random agent, before training
ep_rewards, ep_steps = evaluate_policy(
    model,
    env,
    n_eval_episodes=10,
    deterministic=True,
    return_episode_rewards=True,
)

# Report the mean and standard deviation, then plot the reward of each episode
print(f'mean reward={(sum(ep_rewards)/len(ep_rewards)):.2f} +/- {np.std(ep_rewards):.2f}')
plot_rewards(ep_rewards, title='Rewards over evaluation episodes')

We train the agent and re-evaluate.

In [None]:
# Train the second agent for ten thousand timesteps
model.learn(total_timesteps=10_000)

# Persist the trained agent in its own folder
model.save('../content/agent2/model')

On training environment:

In [None]:
# Evaluate the current model on `env` for 10 deterministic episodes
ep_rewards, ep_steps = evaluate_policy(
    model,
    env,
    n_eval_episodes=10,
    deterministic=True,
    return_episode_rewards=True,
)

# Report the mean and standard deviation, then plot the reward of each episode
print(f'mean reward={(sum(ep_rewards)/len(ep_rewards)):.2f} +/- {np.std(ep_rewards):.2f}')
plot_rewards(ep_rewards, title='Rewards over evaluation episodes')

On evaluation environment:

In [None]:
# Evaluate the current model on `eval_env` for 10 deterministic episodes
ep_rewards, ep_steps = evaluate_policy(
    model,
    eval_env,
    n_eval_episodes=10,
    deterministic=True,
    return_episode_rewards=True,
)

# Report the mean and standard deviation, then plot the reward of each episode
print(f'mean reward={(sum(ep_rewards)/len(ep_rewards)):.2f} +/- {np.std(ep_rewards):.2f}')
plot_rewards(ep_rewards, title='Rewards over evaluation episodes')

We can also render the agent in the evaluation environment.

In [None]:
# Render 3 episodes in `eval_env` and save them as '../content/agent2/trained-agent-ep<N>.gif'
render_agent_in_env(model, eval_env, n_eval_episodes=3, path='../content/agent2/', filename='trained-agent')

## Get policy net of second agent

In [20]:
# Read the action-net weights of the current (second) model and add them to the list.
# NOTE(review): re-running this cell appends the same entry again.
model_params = model.get_parameters()
action_net = get_policy_net(model_params, net='action')
all_policies.append(action_net)

# Align policy nets using OT

Compute the optimal transport (OT) maps between the policy nets, then use each map to align the matching neurons with each other.

In [None]:
# Install optimal transport requirements
!pip install pot
import ot

In [22]:
import tensorflow as tf

def _policy_to_numpy(policy):
    """Convert one policy weight matrix to a float64 NumPy array."""
    # Tensor objects exposing detach()/cpu() are converted explicitly so that
    # tensors tracked for gradients or stored on another device also work.
    if hasattr(policy, 'detach'):
        policy = policy.detach().cpu().numpy()
    return np.asarray(policy, dtype=np.float64)


def group_policy_layers(all_policies):
    """
    Group the rows of several policy weight matrices by row index.

    Each policy is a 2-D matrix of shape (k, n). Row ``i`` of every policy is
    placed side by side, so the result can be iterated row index by row index
    (the notebook refers to these row groups as "layers").

    :param all_policies: (list) m weight matrices of identical shape (k, n);
        tensors or array-likes
    :return: (np.ndarray) float64 array of shape (k, m, n) where ``out[i, j]``
        is row ``i`` of policy ``j``
    :raises ValueError: if the list is empty, a policy is not 2-D, or the
        policies do not share one shape
    """
    if len(all_policies) == 0:
        raise ValueError('all_policies must contain at least one policy')

    matrices = [_policy_to_numpy(policy) for policy in all_policies]

    expected_shape = matrices[0].shape
    if len(expected_shape) != 2:
        raise ValueError('each policy must be a 2-D weight matrix')
    for matrix in matrices[1:]:
        if matrix.shape != expected_shape:
            raise ValueError('all policies must have the same shape')

    # Stacking along axis 1 yields (k, m, n): first index selects the row,
    # second the policy, third the position within the row.
    return np.stack(matrices, axis=1)
  

# Group the collected policy nets row by row for the alignment step
policy_layers = group_policy_layers(all_policies)

Compute all layer-wise transport maps and align neurons in each policy.

In [None]:
def align_neurons(policy_layers):
    """
    Align every policy to the first policy, one row group ("layer") at a time.

    Within each layer the values of a policy are embedded as 2-D points
    ``(value, 0)``. An EMD transport map is fitted from that policy (source)
    to the first policy (target), and the source values are then rearranged
    onto the target positions.

    The input array is not modified; a new array is returned.

    :param policy_layers: (np.ndarray) array of shape (k, m, n): k layers,
        m policies, n values per policy
    :return: (np.ndarray) array of the same shape in which policy 0 is
        unchanged and every other policy has been aligned to policy 0
    """
    # Work on a copy so the caller's array keeps the unaligned values
    aligned = np.array(policy_layers, dtype=float, copy=True)

    # Compute layer-wise transport maps
    for i, layer in enumerate(aligned):
        target = layer[0]
        n = target.shape[0]
        Xt = np.stack((target, np.zeros(n)), axis=-1)

        # Align every remaining policy (source) to the first policy (target)
        for j in range(1, layer.shape[0]):
            Xs = np.stack((layer[j], np.zeros(n)), axis=-1)

            # Compute optimal transport map using EMD, since matching whole neurons
            ot_emd = ot.da.EMDTransport()
            ot_emd.fit(Xs=Xs, Xt=Xt)

            # Get source policy aligned to neuron positions of target policy
            transp_Xt_emd = ot_emd.inverse_transform(Xt=Xt)

            # Keep the value coordinate; the second coordinate is the zero padding
            aligned[i, j, :] = transp_Xt_emd[:, 0]

    return aligned

# Align the grouped policies layer by layer
aligned_layers = align_neurons(policy_layers)

## Fuse policy nets 
Fuse policies using averaging within each layer.

In [None]:
def fuse_policies_by_layers(aligned_layers):
    """
    Fuse aligned policies by averaging them position by position.

    All m policies are averaged; for two policies this is the average of the
    first and the second.

    :param aligned_layers: (np.ndarray) array of shape (k, m, n): k layers,
        m policies, n values per policy
    :return: (np.ndarray) array of shape (k, n) holding the per-position mean
        across policies
    :raises ValueError: if the input is not 3-D or contains no policies
    """
    aligned = np.asarray(aligned_layers, dtype=float)
    if aligned.ndim != 3:
        raise ValueError('aligned_layers must have shape (layers, policies, neurons)')
    if aligned.shape[1] == 0:
        raise ValueError('aligned_layers must contain at least one policy')

    # Average across the policy axis; the layer and neuron axes are kept
    return aligned.mean(axis=1)


# Average the aligned policies into a single fused policy
fused_policy = fuse_policies_by_layers(aligned_layers)

## Create new agent with fused policy

# Push changed files to git

In [None]:
commit_msg = input('Enter a commit message: ')
!git commit -m ${commit_msg}
!git push -u origin main