# Optimization - CEM

<img src="https://raw.githubusercontent.com/jeremiedecock/polytechnique-inf581-2021/master/logo.jpg" style="float: left; width: 15%" />

[INF581-2021](https://moodle.polytechnique.fr/course/view.php?id=9352) Lab session #8

2019-2021 Jérémie Decock

[![Open in Google Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/jeremiedecock/polytechnique-inf581-2021/blob/master/lab8_optim_cem_answers.ipynb)

[![My Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/jeremiedecock/polytechnique-inf581-2021/master?filepath=lab8_optim_cem_answers.ipynb)

[![NbViewer](https://raw.githubusercontent.com/jupyter/design/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/jeremiedecock/polytechnique-inf581-2021/blob/master/lab8_optim_cem_answers.ipynb)

[![Local](https://img.shields.io/badge/Local-Save%20As...-blue)](https://github.com/jeremiedecock/polytechnique-inf581-2021/raw/master/lab8_optim_cem_answers.ipynb)

**Notice**: this notebook requires the following libraries: OpenAI *Gym*, NumPy, Pandas, Seaborn and imageio.

You can install them with the following command (the next cells do this for you if you use the Google Colab environment):

```
pip install gym[box2d] numpy pandas seaborn imageio
```

Cf. https://github.com/openai/gym#installing-everything

In [None]:
%matplotlib inline

import subprocess

try:
    from inf581 import *
except ModuleNotFoundError:
    process = subprocess.Popen("pip install inf581".split(), stdout=subprocess.PIPE)

    for line in process.stdout:
        print(line.decode().strip())

    from inf581 import *

import matplotlib.pyplot as plt

import gym
import math
import numpy as np
import pandas as pd
import seaborn as sns
import itertools
import json

from IPython.display import Image   # To display GIF images in the notebook

In [None]:
#from inf581 import lab8

In [None]:
# Use Seaborn's "talk" plotting context for the figures drawn in this notebook.
sns.set_context("talk")

## Exercise 2: Implement a policy for environments having a continuous action space

In [None]:
###############################################################################
# Parametric Stochastic Policy ################################################
###############################################################################

# Activation functions ########################################################

def identity(x):
    """Linear (pass-through) activation: return ``x`` unchanged."""
    return x

def tanh(x):
    """Hyperbolic tangent activation, applied element-wise through NumPy."""
    squashed = np.tanh(x)
    return squashed

def relu(x):
    """Rectified linear unit: element-wise maximum of ``x`` and zero.

    ``x`` must expose a ``shape`` attribute (e.g. a NumPy array); the zero
    floor is built as a float array of that same shape.
    """
    floor = np.zeros(x.shape)
    return np.maximum(x, floor)

# Dense Multi-Layer Neural Network ############################################

class NeuralNetworkPolicy:
    """Policy computed by a dense feed-forward neural network.

    The network structure (per-layer weight shapes and activation functions)
    is fixed at construction; the weights themselves are supplied on every
    call as a flat parameter vector.
    """

    def __init__(self, activation_functions, shape_list):
        # One shape per layer, used to rebuild the weight arrays from `theta`.
        self.shape_list = shape_list
        # One activation function per layer, applied in order.
        self.activation_functions = activation_functions

    def __call__(self, state, theta):
        """Return the network output for `state` using the flat weights `theta`."""
        layers = unflatten_weights(theta, self.shape_list)
        network_output = feed_forward(
            inputs=state,
            weights=layers,
            activation_functions=self.activation_functions,
        )
        return network_output


def feed_forward(inputs, weights, activation_functions, verbose=False):
    """Propagate `inputs` through a dense multi-layer network.

    Parameters
    ----------
    inputs : array-like with a ``copy()`` method
        Input vector of the first layer (copied, never modified).
    weights : sequence of 2-D arrays
        One array per layer. Row 0 holds the layer's biases; the remaining
        rows hold the weight matrix (one row per input unit).
    activation_functions : sequence of callables
        One activation function per layer, applied to the layer's
        pre-activation values.
    verbose : bool
        If true, print the intermediate values of every layer.

    Returns
    -------
    The output of the last layer. If there is no layer at all (empty
    `weights` or `activation_functions`), the copy of `inputs` is returned
    instead of failing on an unbound local variable.
    """
    x = inputs.copy()

    # zip() stops at the shortest sequence, so extra weights or activation
    # functions without a counterpart are ignored.
    for layer_weights, layer_activation_fn in zip(weights, activation_functions):
        bias = layer_weights[0]
        kernel = layer_weights[1:]

        y = np.dot(x, kernel) + bias
        layer_output = layer_activation_fn(y)

        if verbose:
            print("x", x)
            print("bias", bias)
            print("W", kernel)
            print("y", y)
            print("z", layer_output)

        # The output of this layer is the input of the next one.
        x = layer_output

    return x


def weights_shape(weights):
    """Return the list of the `shape` of every array in `weights`, in order."""
    shapes = []
    for layer in weights:
        shapes.append(layer.shape)
    return shapes


def flatten_weights(weights):
    """Concatenate all weight arrays into a single flat Python list.

    Each array is flattened in row-major order and the results are chained
    in the order of `weights` (a 1-D parameter vector is more convenient for
    optimization algorithms).
    """
    flat_list = []
    for layer in weights:
        flat_list.extend(layer.flatten().tolist())
    return flat_list


def unflatten_weights(flat_list, shape_list):
    """The reverse function of `flatten_weights`.

    Parameters
    ----------
    flat_list : sliceable sequence of numbers
        Flat parameter vector (list or 1-D array).
    shape_list : sequence of shape tuples
        Target shape of each array, in order. Shapes of any rank are
        supported (not only 2-D ones).

    Returns
    -------
    A list with one NumPy array per entry of `shape_list`, filled with
    consecutive slices of `flat_list`.

    Raises
    ------
    ValueError
        Raised by ``reshape`` when `flat_list` holds too few values for a
        shape. Surplus trailing values are ignored.
    """
    nested_list = []
    start_index = 0

    for shape in shape_list:
        # Number of scalars needed for this shape, whatever its rank.
        length = int(np.prod(shape))
        chunk = flat_list[start_index:start_index + length]
        nested_list.append(np.array(chunk).reshape(shape))
        start_index += length

    return nested_list

In [None]:
###############################################################################
# Objective function ##########################################################
###############################################################################

class ObjectiveFunction:
    """Score policy parameters by running the policy in an environment.

    The score of a parameter vector is the total reward per episode,
    averaged over `num_episodes` episodes. When `minimization_solver` is
    true the score is negated so that it can be handed to an optimizer that
    minimizes its objective.

    The environment is expected to follow the interface used here:
    ``reset()`` returns the initial state and ``step(action)`` returns a
    ``(state, reward, done, info)`` tuple.
    """

    def __init__(self, env, policy, ndim, num_episodes=1, max_time_steps=float('inf'), minimization_solver=False):
        self.ndim = ndim                                  # Dimension of the parameter space
        self.env = env
        self.policy = policy                              # Callable: policy(state, policy_params) -> action
        self.num_episodes = num_episodes                  # Default number of episodes per evaluation
        self.max_time_steps = max_time_steps              # Default cap on the steps of one episode
        self.minimization_solver = minimization_solver    # Negate the score if true

        self.num_evals = 0      # Number of calls to `eval` so far
        self.hist = []          # Not filled by this class
        self.hist_policy = []   # Not filled by this class

    def eval(self, policy_params, num_episodes=None, max_time_steps=None, render=False):
        """Evaluate a policy.

        Parameters
        ----------
        policy_params
            Parameters forwarded to the policy on every step.
        num_episodes : int, optional
            Number of episodes to average over (default: the value given to
            the constructor).
        max_time_steps : int or float, optional
            Maximum number of steps per episode (default: the value given to
            the constructor). ``float('inf')`` means "run until the
            environment reports `done`".
        render : bool
            If true, render every step through ``env.render_wrapper`` and
            print the total reward of each episode.

        Returns
        -------
        The average total reward per episode, negated when
        `minimization_solver` is true.
        """
        self.num_evals += 1

        if num_episodes is None:
            num_episodes = self.num_episodes

        if max_time_steps is None:
            max_time_steps = self.max_time_steps

        average_total_rewards = 0

        for i_episode in range(num_episodes):

            total_rewards = 0.
            state = self.env.reset()

            # A counter loop (rather than range()) so that the default
            # float('inf') limit works: range() only accepts integers.
            t = 0
            while t < max_time_steps:
                if render:
                    self.env.render_wrapper.render()

                action = self.policy(state, policy_params)
                state, reward, done, _ = self.env.step(action)
                total_rewards += reward

                if done:
                    break

                t += 1

            average_total_rewards += float(total_rewards) / num_episodes

            if render:
                print("Test Episode {0}: Total Reward = {1}".format(i_episode, total_rewards))

        # Optimizers do minimization by default: flip the sign so that
        # maximizing the reward becomes minimizing the returned value.
        if self.minimization_solver:
            average_total_rewards *= -1.

        return average_total_rewards

    def __call__(self, policy_params):
        """Shortcut for ``eval(policy_params)`` with the default settings."""
        return self.eval(policy_params)

## Exercise 3: solve the LunarLander problem (continuous version) with CEM

**Task 3:** Train the agent

In [None]:
%%time

VERBOSE = False
LOG_SCORES = True
LOG_POLICIES = False
LOG_RECORD_INTERVAL = 1000   # TODO

GYM_ENVIRONMENT = "LunarLanderContinuous-v2"

###############################################################################
# Main ########################################################################
###############################################################################

env = gym.make(GYM_ENVIRONMENT)
# RenderWrapper is not defined in this notebook; presumably it comes from the
# `from inf581 import *` star import -- confirm.
RenderWrapper.register(env, force_gif=True)

observation_space_dim = env.observation_space.shape[0]

# Init parameters to random
# NOTE(review): theta_init has only `observation_space_dim` entries and is not
# used by the cells visible here; kept for backward compatibility.
theta_init = np.random.randn(observation_space_dim)

# Make a neural network with 1 hidden layer of 16 units
# (each weight array has one extra leading row that holds the layer's biases)
h_size = 16     # Number of neurons on the hidden layer
weights = (np.zeros([env.observation_space.shape[0] + 1, h_size]),
           np.zeros([h_size + 1, env.action_space.shape[0]]))

# Set the neural network activation functions (one function per layer)
activation_functions = (relu, tanh)

flat_weights_list = flatten_weights(weights)
num_params = len(flat_weights_list)
print("Number of parameters (neural network weights) to optimize:", num_params)
w_shape = weights_shape(weights)
print("Number of parameters per layer:", w_shape)

# Initial mean and variance arrays, one entry per network parameter. They are
# sized from `num_params` instead of a hard-coded 178 so they stay consistent
# when `h_size` or the environment changes.
init_mean_array = np.zeros(num_params)
init_var_array = np.ones(num_params) * 1000.

nn_policy = NeuralNetworkPolicy(activation_functions, w_shape)

## Bonus exercise 2: test the CMA-ES algorithm

PyCMA: Python implementation of CMA-ES (from Nikolaus Hansen).

Source code:

- http://cma.gforge.inria.fr/cmaes_sourcecode_page.html#python
- https://github.com/CMA-ES/pycma
- https://pypi.org/project/cma/

Official documentation:

- http://cma.gforge.inria.fr/apidocs-pycma/

In [None]:
import cma

In [None]:
# Wrap the environment and the neural-network policy into a callable objective.
# Each evaluation runs 1 episode capped at 500 time steps; with
# minimization_solver=True the returned value is the negated average reward,
# as needed by a solver that minimizes its objective.
objective_function = ObjectiveFunction(env=env,
                                       policy=nn_policy,
                                       ndim=num_params,  # number of dimensions of the parameter (weights) space
                                       num_episodes=1,
                                       max_time_steps=500,
                                       minimization_solver=True)

In [None]:
# Run pycma's CMA-ES on the objective, starting from a random point with
# num_params coordinates drawn by np.random.random, an initial step size
# sigma0 of 10 and (per the 'maxfevals' option) a budget of 1500 objective
# evaluations. The first returned value is kept as the policy parameters;
# the second is presumably the evolution-strategy object -- see pycma's docs.
x_optimal, es = cma.fmin2(objective_function, x0=np.random.random(num_params), sigma0=10., options={'maxfevals': 1500})
theta = x_optimal

In [None]:
# Display the parameter vector returned by the optimizer.
theta

In [None]:
# Use the "talk" plotting context and a wide 20x10 default figure size for the plots below.
sns.set_context("talk")
plt.rcParams['figure.figsize'] = 20,10

# The trailing semicolon keeps the notebook from echoing the call's return value.
cma.plot();  # shortcut for es.logger.plot()

In [None]:
# Test final policy: run 3 rendered episodes with the optimized parameters.
# The total reward of each episode is printed because render=True.
objective_function.eval(theta, num_episodes=3, render=True)

objective_function.env.close()

# presumably assembles the frames captured while rendering into a GIF named
# "ex_pycma_ll" -- confirm against the render wrapper's implementation.
objective_function.env.render_wrapper.make_gif("ex_pycma_ll")