In [6]:
import torch.nn as nn

class Policy(nn.Module):
  def __init__(self,input_shape):
    super().__init__()
    print("input_shape",input_shape)
    self.model = nn.Sequential(
        nn.Linear(input_shape[0],64),
        nn.ReLU(),
        nn.Linear(64,32),
        nn.ReLU(),
        nn.Linear(32,input_shape[0]),
    )
  def forward(self,x):
    return self.model(x)    

In [54]:
from pymoo.core.algorithm import Algorithm
from pymoo.operators.sampling.rnd import FloatRandomSampling
from torch.distributions import Categorical,Normal
from pymoo.core.initialization import Initialization
from pymoo.core.population import Population
from pymoo.core.repair import NoRepair
from torch import optim
import torch
import numpy as np
from gym import spaces

class MonteCarloGradientPolicyAlgorithm(Algorithm):
    def __init__(self,
                 gamma=0.99,
                 alpha=0.01,
                 num_rounds=100,
                 sample_size=500,
                 sampling=FloatRandomSampling(),
                 repair=NoRepair(),
                 **kwargs):
        
        """
        Parameters
        ----------
        env : 
            The environment to be used in the algorithm.
        policy : {Policy}
            The policy to be used in the algorithm.
        gamma : float, optional
            The discount factor used in the algorithm. The default is 0.99.
        alpha : float, optional
            The learning rate used in the algorithm. The default is 0.01.
        num_episodes : int, optional
            The number of episodes to be run in the algorithm. The default is 100.
        sample_size : int, optional
            The number of samples to be generated from the problems and used in the acquisition function. 
            The default is 10.
        sampling : {Sampling}, optional
            The sampling method used to generate the initial samples. The default is FloatRandomSampling().
        """
         
        super().__init__(**kwargs)

        self.gamma = gamma
        self.alpha = alpha
        self.num_rounds = num_rounds
        self.sample_size = sample_size
        self.sampling = sampling
        self.repair = repair
        
        self.initialization = Initialization(sampling)
        self.is_constraint_model = False
        self.optimizer = None
        self.data_set_X = None
        self.model = None
        self.states = []
        self.actions = []
        self.probs = []
        self.rewards = []
        self.steps_taken = []

    def _setup(self, problem, **kwargs):
        self.is_constraint_model = False
        self.model = Policy(np.array([self.problem.n_var]))
        self.optimizer = optim.Adam(self.model.parameters(), lr = 1e-2)
        
    def _initialize_infill(self):
        return self.initialization.do(self.problem, 1, algorithm=self)

    def _initialize_advance(self, infills=None, **kwargs):
        self.data_set_X = self.pop.get("X")
        #super()._initialize_advance(infills=infills, **kwargs)

    def _infill(self):
        state = self.sampling.do(self.problem, 1, algorithm=self).get("X")[0]
        steps = 0
        ep_rewards = 0
        batch_rewards = []
        log_probs = []
        
        while self.opt.get("F")[0][0] < self.problem.evaluate(np.array(state))[0] or steps < 50:
            #print("self.opt.get(F)",self.opt.get("F"))
           # print("self.problem.evaluate(np.array(state))[0]",self.problem.evaluate(np.array(state))[0])
            a, log_p = self.action(self.model, torch.Tensor(state).unsqueeze(0))
            log_probs.append(log_p)
            new_state, reward = self.step(a,state)
            batch_rewards.append(reward)
            ep_rewards += reward
            steps +=1 
            #print("state:", state)
            state = new_state
      
        
        self.rewards.append(ep_rewards)
        self.steps_taken.append(steps)
        print("Episode: {} --- Rewards: {} --- Steps: {}".format(self.n_iter, ep_rewards, steps))
        self.update_policy(self.n_iter, self.optimizer, batch_rewards, log_probs)

        
        return self.pop
        
    def _advance(self, infills=None, **kwargs):
        return super()._advance(infills=infills, **kwargs)
    
    
    def _finalize(self):
        return super()._finalize()
    
    def action(self, model, s):
        # simple pytorch aproach for action-selection and log-prob calc 
        action_parameters = model(s)
        #print("action_parameters:", action_parameters)
        #m = Categorical(prob)
        #print("action_parameters[:, :1]:", action_parameters[:, :1])
        #print("action_parameters[:, 1:]:", action_parameters[:, 1:])
        #mu, sigma = action_parameters[:, :1], torch.exp(action_parameters[:, 1:])
        mu, c = action_parameters[:, :1], 1
        #print("mu:", mu)
        
        m = Normal(action_parameters, torch.Tensor([[1,1]]))
        #print("m:", m)
        a = m.sample()
        #print("a:", a.tolist())
        # log p(a∣π(s))
        log_p = m.log_prob(a)
        #print("log_p:", log_p)
        #print(a.item(), log_p)
        return a.tolist(), log_p
    
    def step(self, action: np.ndarray, state):
        current_X = state
        X_new = current_X[0] + action[0]
        xl, xu = self.problem.bounds()
        reward = 0
        #print("current_X:", current_X)
        #print("action:", action)
        #print("X_new:", X_new)
        if np.any(X_new < xl) or np.any(X_new > xu):
            reward = -(max(max(xl - X_new), max(X_new - xu)))
            X_new = self.sampling.do(self.problem, 1, algorithm=self).get("X")[0]
        else:
            evlaution_of_new_points = self.problem.evaluate(np.array(X_new))
            Y_new = evlaution_of_new_points[0]    
            if Y_new < self.opt.get("F"):
                reward = self.opt.get("F")[0][0] - Y_new
                self.data_set_X = np.vstack((self.data_set_X, X_new))
                off = Population.new(X=self.data_set_X)
                self.pop = off
                self.repair(self.problem, off)

        
        
        #if self.is_constraint_model or self.problem.n_eq_constr > 0:
        #    Y_new = evlaution_of_new_points[0]
        #    Constraint_new = evlaution_of_new_points[1]
        #else:
        
        state = np.array(X_new)
        #print("rewards",reward)
        return state, reward
    
    def update_policy(self, ep, optimizer,batch_rewards,log_probs):
        R = 0
        gamma = 0.99
        policy_loss = []
        rewards = []
        #calc discounted Rewards
        for r in batch_rewards[::-1]: # reverses the list of rewards 
            R = r + gamma * R
            rewards.insert(0, R) # inserts the current reward to first position
        
        #print("rewards",rewards)
        rewards = torch.tensor(rewards)
        # standardization to get data of zero mean and varianz 1, stabilizes learning 
        #-- attention scaling rewards looses information of special events with higher rewards - addapting on different environments  
        rewards = (rewards - rewards.mean()) / (rewards.std() + ep)
        for log_prob, reward in zip(log_probs, rewards):
            policy_loss.append(-log_prob * reward) #baseline+
        
        optimizer.zero_grad()
        policy_loss = torch.cat(policy_loss).sum()
        policy_loss.backward()
        optimizer.step()
        """G = 0
        for t in reversed(range(len(states))):
            G = self.gamma * G + rewards[t]
            state = states[t]
            action = actions[t]
            grad_log_prob = self.policy.grad_log_prob(state, action)
            self.policy.theta += self.alpha * G * grad_log_prob"""

    """def run_episode(self):
        states = []
        actions = []
        rewards = []
        state = self.env.reset()
        done = False
        while not done:
            action = self.policy.sample_action(state)
            next_state, reward, done, _ = self.env.step(action)
            states.append(state)
            actions.append(action)
            rewards.append(reward)
            state = next_state
        return states, actions, rewards"""
    
    

In [55]:
from pymoo.problems import get_problem
from pymoo.optimize import minimize
from pymoo.indicators.gd import GD
from pymoo.indicators.igd import IGD
import matplotlib.pyplot as plt
import numpy as np
from pymoo.algorithms.soo.nonconvex.pso import PSO
from pymoo.algorithms.soo.nonconvex.ga import GA
from pymoo.termination import get_termination
from random import randint
from pymoo.constraints.as_obj import ConstraintsAsObjective
from pymoo.termination.ftol import SingleObjectiveSpaceTermination
from pymoo.termination.robust import RobustTermination
from pymoo.termination.default import DefaultMultiObjectiveTermination
import random

torch.manual_seed (0)

problem = get_problem("ackley", n_var=2, a=20, b=1/5, c=2 * np.pi)
problem1 = get_problem("Griewank", n_var=2)
algorithm3 = MonteCarloGradientPolicyAlgorithm()
res = minimize( problem1,
                algorithm3,
                save_history=True,
                termination=('n_iter', 5),
                seed = 1,
                return_least_infeasible=True,
                verbose=True)

pf = problem.pareto_front()
print("PF",pf[0])
ind = GD(pf)
print("GD", ind(res.F))
ind2 = IGD(pf)
print("IGD", ind2(res.F))


n_evals = np.array([e.evaluator.n_eval for e in res.history])
opt = np.array([e.opt[0].F for e in res.history])
print(opt)
plt.title("Convergence")
plt.plot(n_evals, opt, "--")
plt.plot(n_evals, np.repeat(pf[0],len(n_evals)), 'k-', lw=1,dashes=[2, 2])
#plt.yscale("log")
plt.show()

input_shape [2]
Episode: 2 --- Rewards: 1.6523791353335362 --- Steps: 12
Episode: 3 --- Rewards: 1.9120848794820198 --- Steps: 5
Episode: 4 --- Rewards: 6.586166769446875 --- Steps: 6
Episode: 5 --- Rewards: 0 --- Steps: 0


RuntimeError: torch.cat(): expected a non-empty list of Tensors

In [None]:
torch.exp(torch.tensor([102.5045]))

In [None]:
np.exp(102.5045)