## My Trainer

In [11]:
%cd /home/q123/Desktop/explo/

from mushroom_rl.environments.dm_control_env import DMControl
from mushroom_rl.policy import DeterministicPolicy
from src.ddpg import DDPG
from src.helpers import setup_experiment
from src.config import get_configs
import torch
from src.approximators.actor import ActorNetwork
from src.approximators.critic import CriticNetwork

import torch.nn.functional as F

# MDP: episode horizon and discount factors (training / evaluation).
horizon = 500
gamma = 0.99
gamma_eval = 1.
#mdp = DMControl('walker', 'stand', horizon, gamma)


# Settings: replay-memory sizes, mini-batch size, hidden width of the
# actor/critic networks and the soft-update coefficient passed to DDPG.
initial_replay_size = 500
max_replay_size = 5000
batch_size = 200
n_features = 80
tau = .001


from os import path
import logging
# `logging.config` is a submodule: it is NOT loaded by `import logging` alone,
# so it has to be imported explicitly or `logging.config.fileConfig` raises
# AttributeError on a fresh kernel (unless some other library imported it).
import logging.config

# NOTE(review): absolute, machine-specific path -- consider making it relative
# to a configurable project root so the notebook runs elsewhere.
log_file_path = path.join("/home/q123/Desktop/explo", "logging.conf")
logging.config.fileConfig(log_file_path)

log_file_path

/home/q123/Desktop/explo


'/home/q123/Desktop/explo/logging.conf'

In [12]:
#import multiprocessing as mp
import torch.multiprocessing as pmp
from copy import deepcopy
import torch


class ESQOptimizer(object):
    """Actor optimizer that follows critic gradients taken at noisy actor parameters.

    For every seed in ``range(params_per_step)`` Gaussian noise scaled by
    ``sigma`` is added to the actor network, the critic is evaluated on the
    resulting actions and the gradient of the summed Q-values with respect to
    the actor parameters is collected.  The per-seed gradients can be summed
    and applied to the actor with Adam via :meth:`step`.

    Parameters
    ----------
    critic : callable
        Called as ``critic(states, actions, output_tensor=True)``.
    actor : callable
        Called as ``actor(states, output_tensor=True)``; must expose
        ``actor.model.network`` with ``parameters()``, ``n_params`` and
        ``add_noise(eps)``.
    sigma : float
        Scale of the Gaussian parameter noise.
    params_per_step : int
        Number of noisy evaluations (seeds ``0 .. params_per_step - 1``).
    n_workers : int, optional
        Upper bound on worker processes; defaults to the CPU count.
    """

    def __init__(self,critic,actor,
                        sigma,params_per_step,n_workers=None):

        # `mp` was never imported in this notebook (only torch.multiprocessing
        # as `pmp`), which re-exports the standard multiprocessing API.
        if n_workers is None:
            n_workers = pmp.cpu_count()

        # Keyword arguments forwarded verbatim to the run_parallel_* methods.
        self.args = dict(critic=critic,
                         actor=actor,
                         sigma=sigma,
                         params_per_step=params_per_step,
                         n_workers=n_workers)

        self.actor = actor
        self.optimizer = torch.optim.Adam(actor.model.network.parameters())

    def run_noisy_advantage(self,states,critic,actor,
                            sigma,seed,grad_buffer=None):
        """Return d(sum Q)/d(actor params) evaluated at one noisy copy of the actor.

        Parameters
        ----------
        states : tensor-like batch passed to both actor and critic.
        critic, actor, sigma : see the class docstring.
        seed : int
            Seed for ``torch.manual_seed`` so the noise is reproducible.
        grad_buffer : list, optional
            If given (even when empty), the gradient tuple is appended to it.

        Returns
        -------
        tuple of torch.Tensor
            One gradient per actor-network parameter.
        """
        torch.manual_seed(seed)

        # Perturb a private copy so the caller's actor (possibly living in
        # shared memory) keeps its parameters and noise cannot pile up across
        # seeds.  NOTE(review): assumes `add_noise` perturbs the network in
        # place -- confirm against the network implementation.
        actor = deepcopy(actor)
        network = actor.model.network

        eps = torch.randn(network.n_params) * sigma
        network.add_noise(eps)

        noisy_actions = actor(states,output_tensor=True)
        noisy_q = critic(states,noisy_actions,output_tensor=True) ##  add absorbing flag
        noisy_q = torch.sum(noisy_q)
        noisy_grad = torch.autograd.grad(noisy_q,network.parameters()) ## hotfix

        # `if grad_buffer:` skipped the append for an (initially) empty list.
        if grad_buffer is not None:
            grad_buffer.append(noisy_grad)

        return noisy_grad

    def _run_seed_pool(self,start_method,states,critic,actor,
                       sigma,params_per_step,n_workers):
        """Evaluate every seed in a worker pool created with ``start_method``."""
        tasks = [(states,critic,actor,sigma,seed) for seed in range(params_per_step)]
        if not tasks:
            return []

        ctx = pmp.get_context(start_method)

        # Never start more workers than there are tasks.
        with ctx.Pool(min(n_workers,len(tasks))) as pool:
            # starmap blocks until every result has been received, so the
            # workers are still alive while their tensors are handed over.
            return pool.starmap(self.run_noisy_advantage,tasks) ## list of gradient tuples

    def run_parallel_advantage_p(self,states,critic,actor,
                               sigma,params_per_step,n_workers):
        """Evaluate all seeds in 'spawn'-ed workers with shared-memory networks.

        The previous implementation appended to a plain Python list inside the
        child processes (invisible to the parent) and iterated over
        ``n_workers`` instead of ``params_per_step``; results are now returned
        through the pool.  The list of gradient tuples is also stored in
        ``self.grad_buffer``.

        Note: with the 'spawn' start method the workers must be able to import
        this class, which is not the case for a class defined only inside a
        notebook cell.
        """
        self.grad_buffer = []

        critic.model.network.share_memory()
        actor.model.network.share_memory()

        self.grad_buffer = self._run_seed_pool('spawn',states,critic,actor,
                                               sigma,params_per_step,n_workers)
        return self.grad_buffer

    def run_parallel_advantage(self,states,critic,actor,
                               sigma,params_per_step,n_workers):
        """Evaluate all seeds in a 'fork'-ed worker pool; returns a list of gradient tuples.

        The 'fork' start method is not available on every platform.
        """
        return self._run_seed_pool('fork',states,critic,actor,
                                   sigma,params_per_step,n_workers)

    def compute_grads(self,states,aggregate=False):
        """Collect the per-seed gradients for ``states``.

        Returns the list of per-seed gradient tuples, or -- when ``aggregate``
        is true -- a single tuple holding, for each actor parameter, the sum
        over seeds.

        Raises
        ------
        ValueError
            If aggregation is requested but no gradients were produced.
        """
        grads = self.run_parallel_advantage_p(**self.args,states=states)
        #grads = self.run_parallel_advantage(**self.args,states=states)

        if not aggregate:
            return grads

        if not grads:
            raise ValueError("no gradients to aggregate (params_per_step must be >= 1)")

        # zip(*grads) regroups the per-seed tuples into one group per parameter.
        return tuple(torch.stack(per_param).sum(dim=0) for per_param in zip(*grads))

    def step(self,states):
        """Do one Adam update of the actor using the summed noisy critic gradients."""
        self.optimizer.zero_grad()
        actor_grad = self.compute_grads(states,aggregate=True)

        # The optimizer reads each parameter's `.grad`; assigning a `grad`
        # attribute on the module itself has no effect.  The sign is flipped
        # because the optimizer minimizes while Q should be maximized.
        for param,grad in zip(self.actor.model.network.parameters(),actor_grad):
            param.grad = -grad.detach()

        self.optimizer.step()
      

In [13]:
def critic_step(agent,transitions):
    """Run one critic update of ``agent`` on a freshly sampled replay batch.

    ``transitions`` are first stored in the agent's replay memory; a batch of
    ``agent._batch_size()`` samples is then drawn, the bootstrapped target
    ``reward + gamma * q_next`` is built from ``agent._next_q`` and the critic
    approximator is fitted on it.  Finally the target critic is updated through
    ``agent._update_target``.  Returns ``None``.
    """
    memory = agent._replay_memory
    memory.add(transitions)

    # The sixth element of the sampled batch is not used here.
    batch = memory.get(agent._batch_size())
    state, action, reward, next_state, absorbing, _ = batch

    # Bootstrapped regression target for the critic.
    bootstrap = agent._next_q(next_state, absorbing)
    q_target = reward + agent.mdp_info.gamma * bootstrap

    critic = agent._critic_approximator
    critic.fit(state, action, q_target, **agent._critic_fit_params)

    agent._update_target(critic, agent._target_critic_approximator)

In [14]:
%cd /home/q123/Desktop/explo/src

from src.optimizers.esq_pytorch import ESQOptimizer


env_name = "Swimmer-v4"
kernel_name = "rbf"


if __name__ == '__main__':

    # --- Environment -------------------------------------------------------
    # get_configs returns five config objects; only the env, kernel and
    # likelihood configs are forwarded to setup_experiment.
    (env_config,
     likelihood_config,
     kernel_config,
     optimizer_config,
     trainer_config) = get_configs(env_name,kernel_name)
    _,env = setup_experiment(env_config,
                             kernel_config,
                             likelihood_config,
                             additional_layers=[])

    # --- DDPG agent --------------------------------------------------------
    mdp = env.env

    policy_class = DeterministicPolicy
    policy_params = {}

    # Actor maps observations to actions.
    actor_input_shape = mdp.info.observation_space.shape
    actor_params = {
        'network': ActorNetwork,
        'n_features': n_features,
        'input_shape': actor_input_shape,
        'output_shape': mdp.info.action_space.shape,
    }

    actor_optimizer = {
        'class': torch.optim.Adam,
        'params': dict(lr=1e-5),
    }

    # Critic input is sized as observation dim + action dim.
    critic_input_shape = (actor_input_shape[0] + mdp.info.action_space.shape[0],)
    critic_params = {
        'network': CriticNetwork,
        'optimizer': {
            'class': torch.optim.Adam,
            'params': dict(lr=1e-3),
        },
        'loss': F.mse_loss,
        'n_features': n_features,
        'input_shape': critic_input_shape,
        'output_shape': (1,),
    }

    agent = DDPG(
        mdp.info, policy_class, policy_params,
        actor_params, actor_optimizer,
        critic_params,
        batch_size, initial_replay_size, max_replay_size,
        tau,
    )

    # --- ESQ optimizer -----------------------------------------------------
    # Receives the raw torch networks held by the agent's approximators.
    esq_optimizer = ESQOptimizer(
        critic=agent._critic_approximator.model.network,
        actor=agent._actor_approximator.model.network,
        sigma=1,
        params_per_step=1,
        n_workers=1,
    )

    # --- Single smoke-test iteration: collect, fit critic, compute grads ---
    for i in range(1):
        _,states,transitions = env.run_many(agent._actor_approximator,1)
        print("done collecting states")

        critic_step(agent,transitions)
        print("done training critic")

        grads = esq_optimizer.compute_grads(states)
        print("dooooone")
        
    #     #print(optimizer.actor.parameters())

/home/q123/Desktop/explo/src
Using ard_num_dims = 18
done collecting states
done training critic
running parallel advantage


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  np.int)


done running parallel advantage
dooooone


In [15]:
# from mushroom_rl.approximators import Regressor
# from mushroom_rl.approximators.parametric import TorchApproximator
# from mushroom_rl.utils.replay_memory import ReplayMemory

# replay_memory = ReplayMemory(initial_replay_size, max_replay_size)
# critic_approximator = Regressor(TorchApproximator,
#                                               **critic_params)


In [16]:
# optimizer = ESOptimizer(env,torch.zeros(env.mlp.len_params),sigma=1e-2,
#                 params_per_step=50,episodes_per_param=1,n_workers=8)
# # for i in range(1):
    
# #     optimizer.step()
    
# #     if i % 3 == 0:
# #         avg_reward,_,transitions = env.run_many(optimizer.policy_params,5)
# #         # print(f'avg_rewarad {avg_reward} ')
# #         # print(f'policy_params : {optimizer.policy_params}')
