## Neural networks

In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from mushroom_rl.core import Core


class CriticNetwork(nn.Module):
    def __init__(self, input_shape, output_shape, n_features, **kwargs):
        super().__init__()

        n_input = input_shape[-1]
        n_output = output_shape[0]

        self._h1 = nn.Linear(n_input, n_features)
        self._h2 = nn.Linear(n_features, n_features)
        self._h3 = nn.Linear(n_features, n_output)

        nn.init.xavier_uniform_(self._h1.weight,
                                gain=nn.init.calculate_gain('relu'))
        nn.init.xavier_uniform_(self._h2.weight,
                                gain=nn.init.calculate_gain('relu'))
        nn.init.xavier_uniform_(self._h3.weight,
                                gain=nn.init.calculate_gain('linear'))

    def forward(self, state, action):
        state_action = torch.cat((state.float(), action.float()), dim=1)
        features1 = F.relu(self._h1(state_action))
        features2 = F.relu(self._h2(features1))
        q = self._h3(features2)

        return torch.squeeze(q)


class ActorNetwork(nn.Module):
    def __init__(self, input_shape, output_shape, n_features, **kwargs):
        super(ActorNetwork, self).__init__()

        n_input = input_shape[-1]
        n_output = output_shape[0]

        self._h1 = nn.Linear(n_input, n_features)
        self._h2 = nn.Linear(n_features, n_features)
        self._h3 = nn.Linear(n_features, n_output)

        nn.init.xavier_uniform_(self._h1.weight,
                                gain=nn.init.calculate_gain('relu'))
        nn.init.xavier_uniform_(self._h2.weight,
                                gain=nn.init.calculate_gain('relu'))
        nn.init.xavier_uniform_(self._h3.weight,
                                gain=nn.init.calculate_gain('linear'))

    def forward(self, state):
        features1 = F.relu(self._h1(torch.squeeze(state, 1).float()))
        features2 = F.relu(self._h2(features1))
        a = self._h3(features2)

        return a

## DDPG Agent

In [11]:
import numpy as np

from mushroom_rl.algorithms.actor_critic.deep_actor_critic import DeepAC
from mushroom_rl.policy import Policy
from mushroom_rl.approximators import Regressor
from mushroom_rl.approximators.parametric import TorchApproximator
from mushroom_rl.utils.replay_memory import ReplayMemory
from mushroom_rl.utils.parameters import Parameter, to_parameter

from copy import deepcopy


class DDPG(DeepAC):
    """
    Deep Deterministic Policy Gradient algorithm.
    "Continuous Control with Deep Reinforcement Learning".
    Lillicrap T. P. et al.. 2016.
    """
    def __init__(self, mdp_info, policy_class, policy_params,
                 actor_params, actor_optimizer, critic_params, batch_size,
                 initial_replay_size, max_replay_size, tau, policy_delay=1,
                 critic_fit_params=None, actor_predict_params=None, critic_predict_params=None):
        """
        Constructor.
        Args:
            policy_class (Policy): class of the policy;
            policy_params (dict): parameters of the policy to build;
            actor_params (dict): parameters of the actor approximator to
                build;
            actor_optimizer (dict): parameters to specify the actor optimizer
                algorithm;
            critic_params (dict): parameters of the critic approximator to
                build;
            batch_size ([int, Parameter]): the number of samples in a batch;
            initial_replay_size (int): the number of samples to collect before
                starting the learning;
            max_replay_size (int): the maximum number of samples in the replay
                memory;
            tau ((float, Parameter)): value of coefficient for soft updates;
            policy_delay ([int, Parameter], 1): the number of updates of the critic after
                which an actor update is implemented;
            critic_fit_params (dict, None): parameters of the fitting algorithm
                of the critic approximator;
            actor_predict_params (dict, None): parameters for the prediction with the
                actor approximator;
            critic_predict_params (dict, None): parameters for the prediction with the
                critic approximator.
        """
        self._critic_fit_params = dict() if critic_fit_params is None else critic_fit_params
        self._actor_predict_params = dict() if actor_predict_params is None else actor_predict_params
        self._critic_predict_params = dict() if critic_predict_params is None else critic_predict_params

        self._batch_size = to_parameter(batch_size)
        self._tau = to_parameter(tau)
        self._policy_delay = to_parameter(policy_delay)
        self._fit_count = 0

        self._replay_memory = ReplayMemory(initial_replay_size, max_replay_size)

        target_critic_params = deepcopy(critic_params)
        self._critic_approximator = Regressor(TorchApproximator,
                                              **critic_params)
        self._target_critic_approximator = Regressor(TorchApproximator,
                                                     **target_critic_params)

        target_actor_params = deepcopy(actor_params)
        self._actor_approximator = Regressor(TorchApproximator,
                                             **actor_params)
        self._target_actor_approximator = Regressor(TorchApproximator,
                                                    **target_actor_params)

        self._init_target(self._critic_approximator,
                          self._target_critic_approximator)
        self._init_target(self._actor_approximator,
                          self._target_actor_approximator)

        policy = policy_class(self._actor_approximator, **policy_params)

        policy_parameters = self._actor_approximator.model.network.parameters()

        self._add_save_attr(
            _critic_fit_params='pickle',
            _critic_predict_params='pickle',
            _actor_predict_params='pickle',
            _batch_size='mushroom',
            _tau='mushroom',
            _policy_delay='mushroom',
            _fit_count='primitive',
            _replay_memory='mushroom',
            _critic_approximator='mushroom',
            _target_critic_approximator='mushroom',
            _target_actor_approximator='mushroom'
        )

        super().__init__(mdp_info, policy, actor_optimizer, policy_parameters)

    def fit(self, dataset):
        self._replay_memory.add(dataset)
        if self._replay_memory.initialized:
            state, action, reward, next_state, absorbing, _ =\
                self._replay_memory.get(self._batch_size())

            q_next = self._next_q(next_state, absorbing)
            q = reward + self.mdp_info.gamma * q_next

            self._critic_approximator.fit(state, action, q,
                                          **self._critic_fit_params)

            
            #################################################
            ##### This will change for our actor gradient step
            if self._fit_count % self._policy_delay() == 0:
                loss = self._loss(state)
                self._optimize_actor_parameters(loss)
            
            #################################################

            self._update_target(self._critic_approximator,
                                self._target_critic_approximator)
            self._update_target(self._actor_approximator,
                                self._target_actor_approximator)

            self._fit_count += 1

    def _loss(self, state):
        action = self._actor_approximator(state, output_tensor=True, **self._actor_predict_params)
        q = self._critic_approximator(state, action, output_tensor=True, **self._critic_predict_params)

        return -q.mean()

    def _next_q(self, next_state, absorbing):
        """
        Args:
            next_state (np.ndarray): the states where next action has to be
                evaluated;
            absorbing (np.ndarray): the absorbing flag for the states in
                ``next_state``.
        Returns:
            Action-values returned by the critic for ``next_state`` and the
            action returned by the actor.
        """
        a = self._target_actor_approximator.predict(next_state, **self._actor_predict_params)

        q = self._target_critic_approximator.predict(next_state, a, **self._critic_predict_params)
        q *= 1 - absorbing

        return q

    def _post_load(self):
        self._actor_approximator = self.policy._approximator
        self._update_optimizer_parameters(self._actor_approximator.model.network.parameters())

## Core (trainer):

In [12]:
from tqdm import tqdm


class Core(object):
    """
    Implements the functions to run a generic algorithm.
    """
    def __init__(self, agent, mdp, callbacks_fit=None, callback_step=None,
                 preprocessors=None):
        """
        Constructor.
        Args:
            agent (Agent): the agent moving according to a policy;
            mdp (Environment): the environment in which the agent moves;
            callbacks_fit (list): list of callbacks to execute at the end of
                each fit;
            callback_step (Callback): callback to execute after each step;
            preprocessors (list): list of state preprocessors to be
                applied to state variables before feeding them to the
                agent.
        """
        self.agent = agent
        self.mdp = mdp
        self.callbacks_fit = callbacks_fit if callbacks_fit is not None else list()
        self.callback_step = callback_step if callback_step is not None else lambda x: None
        self._preprocessors = preprocessors if preprocessors is not None else list()

        self._state = None

        self._total_episodes_counter = 0
        self._total_steps_counter = 0
        self._current_episodes_counter = 0
        self._current_steps_counter = 0
        self._episode_steps = None
        self._n_episodes = None
        self._n_steps_per_fit = None
        self._n_episodes_per_fit = None

    def learn(self, n_steps=None, n_episodes=None, n_steps_per_fit=None,
              n_episodes_per_fit=None, render=False, quiet=False):
        """
        This function moves the agent in the environment and fits the policy
        using the collected samples. The agent can be moved for a given number
        of steps or a given number of episodes and, independently from this
        choice, the policy can be fitted after a given number of steps or a
        given number of episodes. By default, the environment is reset.
        Args:
            n_steps (int, None): number of steps to move the agent;
            n_episodes (int, None): number of episodes to move the agent;
            n_steps_per_fit (int, None): number of steps between each fit of the
                policy;
            n_episodes_per_fit (int, None): number of episodes between each fit
                of the policy;
            render (bool, False): whether to render the environment or not;
            quiet (bool, False): whether to show the progress bar or not.
        """
        assert (n_episodes_per_fit is not None and n_steps_per_fit is None)\
            or (n_episodes_per_fit is None and n_steps_per_fit is not None)

        self._n_steps_per_fit = n_steps_per_fit
        self._n_episodes_per_fit = n_episodes_per_fit

        if n_steps_per_fit is not None:
            fit_condition =\
                lambda: self._current_steps_counter >= self._n_steps_per_fit
        else:
            fit_condition = lambda: self._current_episodes_counter\
                                     >= self._n_episodes_per_fit

        self._run(n_steps, n_episodes, fit_condition, render, quiet)

    def evaluate(self, initial_states=None, n_steps=None, n_episodes=None,
                 render=False, quiet=False):
        """
        This function moves the agent in the environment using its policy.
        The agent is moved for a provided number of steps, episodes, or from
        a set of initial states for the whole episode. By default, the
        environment is reset.
        Args:
            initial_states (np.ndarray, None): the starting states of each
                episode;
            n_steps (int, None): number of steps to move the agent;
            n_episodes (int, None): number of episodes to move the agent;
            render (bool, False): whether to render the environment or not;
            quiet (bool, False): whether to show the progress bar or not.
        """
        fit_condition = lambda: False

        return self._run(n_steps, n_episodes, fit_condition, render, quiet,
                         initial_states)

    def _run(self, n_steps, n_episodes, fit_condition, render, quiet,
             initial_states=None):
        assert n_episodes is not None and n_steps is None and initial_states is None\
            or n_episodes is None and n_steps is not None and initial_states is None\
            or n_episodes is None and n_steps is None and initial_states is not None

        self._n_episodes = len(
            initial_states) if initial_states is not None else n_episodes

        if n_steps is not None:
            move_condition =\
                lambda: self._total_steps_counter < n_steps

            steps_progress_bar = tqdm(total=n_steps,
                                      dynamic_ncols=True, disable=quiet,
                                      leave=False)
            episodes_progress_bar = tqdm(disable=True)
        else:
            move_condition =\
                lambda: self._total_episodes_counter < self._n_episodes

            steps_progress_bar = tqdm(disable=True)
            episodes_progress_bar = tqdm(total=self._n_episodes,
                                         dynamic_ncols=True, disable=quiet,
                                         leave=False)

        return self._run_impl(move_condition, fit_condition, steps_progress_bar,
                              episodes_progress_bar, render, initial_states)

    def _run_impl(self, move_condition, fit_condition, steps_progress_bar,
                  episodes_progress_bar, render, initial_states):
        self._total_episodes_counter = 0
        self._total_steps_counter = 0
        self._current_episodes_counter = 0
        self._current_steps_counter = 0

        dataset = list()
        last = True
        while move_condition():
            if last:
                self.reset(initial_states)

            sample = self._step(render)

            self.callback_step([sample])

            self._total_steps_counter += 1
            self._current_steps_counter += 1
            steps_progress_bar.update(1)

            if sample[-1]:
                self._total_episodes_counter += 1
                self._current_episodes_counter += 1
                episodes_progress_bar.update(1)

            dataset.append(sample)
            if fit_condition():
                self.agent.fit(dataset)
                self._current_episodes_counter = 0
                self._current_steps_counter = 0

                for c in self.callbacks_fit:
                    c(dataset)

                dataset = list()

            last = sample[-1]

        self.agent.stop()
        self.mdp.stop()

        steps_progress_bar.close()
        episodes_progress_bar.close()

        return dataset

    def _step(self, render):
        """
        Single step.
        Args:
            render (bool): whether to render or not.
        Returns:
            A tuple containing the previous state, the action sampled by the
            agent, the reward obtained, the reached state, the absorbing flag
            of the reached state and the last step flag.
        """
        action = self.agent.draw_action(self._state)
        next_state, reward, absorbing, _ = self.mdp.step(action)

        self._episode_steps += 1

        last = not(
            self._episode_steps < self.mdp.info.horizon and not absorbing)

        state = self._state
        next_state = self._preprocess(next_state.copy())
        self._state = next_state

        return state, action, reward, next_state, absorbing, last

    def reset(self, initial_states=None):
        """
        Reset the state of the agent.
        """
        if initial_states is None\
            or self._total_episodes_counter == self._n_episodes:
            initial_state = None
        else:
            initial_state = initial_states[self._total_episodes_counter]

        self._state = self._preprocess(self.mdp.reset(initial_state).copy())
        self.agent.episode_start()
        self.agent.next_action = None
        self._episode_steps = 0

    def _preprocess(self, state):
        """
        Method to apply state preprocessors.
        Args:
            state (np.ndarray): the state to be preprocessed.
        Returns:
             The preprocessed state.
        """
        for p in self._preprocessors:
            state = p(state)

        return state

In [13]:
from mushroom_rl.environments.dm_control_env import DMControl
from mushroom_rl.policy import DeterministicPolicy

# MDP
horizon = 500
gamma = 0.99
gamma_eval = 1.
mdp = DMControl('walker', 'stand', horizon, gamma)

# Policy 

policy_class = DeterministicPolicy
policy_params = dict()

# Settings
initial_replay_size = 500
max_replay_size = 5000
batch_size = 200
n_features = 80
tau = .001

In [14]:
# Approximator
actor_input_shape = mdp.info.observation_space.shape
actor_params = dict(network=ActorNetwork,
                    n_features=n_features,
                    input_shape=actor_input_shape,
                    output_shape=mdp.info.action_space.shape)

actor_optimizer = {'class': optim.Adam,
                   'params': {'lr': 1e-5}}

critic_input_shape = (actor_input_shape[0] + mdp.info.action_space.shape[0],)
critic_params = dict(network=CriticNetwork,
                     optimizer={'class': optim.Adam,
                                'params': {'lr': 1e-3}},
                     loss=F.mse_loss,
                     n_features=n_features,
                     input_shape=critic_input_shape,
                     output_shape=(1,))

In [15]:
from mushroom_rl.approximators import Regressor
from mushroom_rl.approximators.parametric import TorchApproximator
from mushroom_rl.utils.replay_memory import ReplayMemory

replay_memory = ReplayMemory(initial_replay_size, max_replay_size)
critic_approximator = Regressor(TorchApproximator,
                                              **critic_params)

In [16]:
import numpy as np
import torch

#action = np.zeros((6,))
action = torch.zeros((6,))
mdp.reset()
next_state, reward, absorbing, _ = mdp.step(action)

#[state, action, reward, next_state, absorbing, last] : DATAAAA



    
next_state

array([-7.76146949e-01,  6.30552070e-01,  3.51807054e-01,  9.36072538e-01,
       -9.60701493e-01,  2.77583577e-01, -7.47872774e-01,  6.63842085e-01,
        1.37830244e-01,  9.90455867e-01, -6.27902866e-01,  7.78291714e-01,
       -9.33372077e-01,  3.58910248e-01,  1.29662781e+00, -2.45250000e-01,
       -8.78463968e-17,  5.81756865e-16,  7.63833441e-16, -7.77156117e-17,
       -7.28306304e-16,  5.86197757e-16,  8.88178420e-18,  9.94759830e-16])

In [17]:
# actor_params = 0

# for n_episodes :

#     
#     data = run_noisy_params 
    
#     fit critic on data
    
#     compute actor gradients
    
#     actor_param = params + grads
    

In [18]:
### Pseudo code for ES :


params = {
    "n_samples":500,
    "n_workers":n_workers,
    "noise_distribution":PytorchDistrib,
    "n_eval_episodes":
}


# in _step : one step would correspond to an Gradnum full step + Critic fit ==> ONE GRADIENT STEP


SyntaxError: expression expected after dictionary key and ':' (101406557.py, line 8)

In [None]:
from mushroom_rl.environments.gym_env import Gym
from mushroom_rl.utils.spaces import Discrete,Box


env = Gym("Swimmer-v4")
env.info.action_space
action_space = env.info.action_space

if type(action_space) == Discrete:
    ###output one action env will discretize
    n_actions = 1 
elif type(action_space) == Box:
    n_actions = action_space.shape
else : raise ValueError(f'Unkown action space{action_space}')

In [4]:
%cd /home/q123/Desktop/explo/

from src.optimizers.es_pytorch import ESOptimizer
from src.helpers import setup_experiment
from src.config import get_configs
import torch

env_name = "Swimmer-v4"
kernel_name = "rbf"

env_config,likelihood_config,kernel_config,optimizer_config,trainer_config = get_configs(env_name,kernel_name)
_,env = setup_experiment(env_config,kernel_config,likelihood_config,additional_layers=[20,20,20])

optimizer = ESOptimizer(env,torch.zeros(env.mlp.len_params),sigma=1e-2,
                params_per_step=50,episodes_per_param=1,n_workers=8)


for i in range(100):
    
    optimizer.step()
    
    if i % 3 == 0:
        avg_reward,_ = env.run_many(optimizer.policy_params,5)
        print(f'avg_rewarad {avg_reward} ')
        print(f'policy_params : {optimizer.policy_params}')


/home/q123/Desktop/explo
Using ard_num_dims = 1062
avg_rewarad tensor([-0.0636]) 
policy_params : tensor([-0.0010, -0.0010, -0.0010,  ..., -0.0010, -0.0010,  0.0010])
avg_rewarad tensor([-0.0285]) 
policy_params : tensor([-0.0040, -0.0040,  0.0004,  ..., -0.0038, -0.0039,  0.0040])
avg_rewarad tensor([-0.0372]) 
policy_params : tensor([-0.0069, -0.0069,  0.0020,  ..., -0.0068, -0.0066,  0.0068])
avg_rewarad tensor([-0.0360]) 
policy_params : tensor([-0.0099, -0.0099,  0.0044,  ..., -0.0097, -0.0094,  0.0095])
avg_rewarad tensor([-0.0323]) 
policy_params : tensor([-0.0128, -0.0128,  0.0067,  ..., -0.0127, -0.0122,  0.0124])
avg_rewarad tensor([-0.0357]) 
policy_params : tensor([-0.0157, -0.0157,  0.0091,  ..., -0.0155, -0.0151,  0.0154])
avg_rewarad tensor([-0.0437]) 
policy_params : tensor([-0.0185, -0.0186,  0.0111,  ..., -0.0183, -0.0181,  0.0184])
avg_rewarad tensor([-0.0335]) 
policy_params : tensor([-0.0212, -0.0215,  0.0137,  ..., -0.0212, -0.0209,  0.0215])
avg_rewarad tensor([-

Process ForkPoolWorker-411:
Process ForkPoolWorker-412:
Process ForkPoolWorker-413:
Process ForkPoolWorker-414:
Traceback (most recent call last):
Process ForkPoolWorker-409:
Process ForkPoolWorker-415:
Traceback (most recent call last):
Process ForkPoolWorker-410:
  File "/home/q123/miniconda3/envs/boptim/lib/python3.10/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/q123/miniconda3/envs/boptim/lib/python3.10/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/home/q123/miniconda3/envs/boptim/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/q123/miniconda3/envs/boptim/lib/python3.10/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
Process ForkPoolWorker-416:
Traceback (most recent call last):
  File "/home/q123/miniconda3/envs/boptim/

Process ForkPoolWorker-5:
Process ForkPoolWorker-8:
Process ForkPoolWorker-3:
Process ForkPoolWorker-7:
Process ForkPoolWorker-1:
Process ForkPoolWorker-6:
Process ForkPoolWorker-4:
Process ForkPoolWorker-2:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/q123/miniconda3/envs/boptim/lib/python3.10/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/home/q123/miniconda3/envs/boptim/lib/python3.10/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/home/q123/miniconda3/envs/boptim/lib/python3.10/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/home/q123/miniconda3/envs/boptim/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/q123/miniconda3/envs/boptim/lib/python3.10/multiprocessing/process.py", line 10

KeyboardInterrupt: 

  File "/home/q123/miniconda3/envs/boptim/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/q123/Desktop/explo/src/environments/objective.py", line 82, in run
    action = self.mlp(params,states[t].unsqueeze(0)).squeeze()
  File "/home/q123/miniconda3/envs/boptim/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/q123/Desktop/explo/src/environments/objective.py", line 113, in run_many
    reward,states = self.run(params)
  File "/home/q123/Desktop/explo/src/environments/objective.py", line 89, in run
    state, reward_tmp, done, _ = self.env.step(action.detach().numpy())
  File "/home/q123/miniconda3/envs/boptim/lib/python3.10/site-packages/torch/autograd/grad_mode.py", line 131, in __exit__
    torch.set_grad_enabled(self.prev)
  File "/home/q123/Desktop/explo/src/environments/gym_env.py", line 98, in step
    