## Synthetic cases

In this part, we show the implementation of the Quadratic- and Cosine-type rewards: 

- Quadratic reward: 
$$
\mathbb{E}[r^{i}_{t,k}] = \mathbf{x}_{t,k}^\top \mathbf{A}_i^\top \mathbf{A}_i \mathbf{x}_{t,k}, \forall i\in[m],
$$

- Cosine reward: 
$$
\mathbb{E}[r^{i}_{t,k}] = \cos\left(3\,\mathbf{x}_{t,k}^\top \boldsymbol{\theta}^*_i\right), \forall i\in[m],
$$

In [1]:
import numpy as np
import torch

In [2]:
from environments import moContextMABSimulator

# Quadratic
class quadEnv(moContextMABSimulator): 
    def __init__(self, num_arm = None, num_dim = None, num_obj = None, arm_context = None, obj_preference = 'scalarization', sclarization_type='weighted_sum', opt_type = 'max', vary_context = False, noise_var = 0.1):
        super().__init__(num_arm, num_dim, num_obj, arm_context, obj_preference, sclarization_type, opt_type, vary_context, noise_var)

    def reset(self, num_arm: int = None, num_dim: int = None, num_obj: int = None, noise_var: float = None, seed: int = None, verbose: bool = False) -> None:
        self.quad_A = [np.random.normal(size=(self.d,self.d)) for _ in range(self.m)]
        return super().reset(num_arm, num_dim, num_obj, noise_var, seed, verbose)
    
    def _sample_context(self):
        self.A = np.random.rand(self.K, self.d)
    
    def _eval_expected_reward(self, arm):
        return np.hstack([arm @ self.quad_A[i].T @ self.quad_A[i] @ arm.T for i in range(self.m)])

# Cosine
class cosiEnv(moContextMABSimulator): 
    def __init__(self, num_arm = None, num_dim = None, num_obj = None, arm_context = None, obj_preference = 'scalarization', sclarization_type='weighted_sum', opt_type = 'max', vary_context = False, noise_var = 0.1):
        super().__init__(num_arm, num_dim, num_obj, arm_context, obj_preference, sclarization_type, opt_type, vary_context, noise_var)
        
    def reset(self, num_arm: int = None, num_dim: int = None, num_obj: int = None, noise_var: float = None, seed: int = None, verbose: bool = False) -> None:
        self.cos_theta = [np.random.uniform(size=(self.d, )) for _ in range(self.m)]
        return super().reset(num_arm, num_dim, num_obj, noise_var, seed, verbose)
    
    def _sample_context(self):
        self.A = np.random.rand(self.K, self.d)
    
    def _eval_expected_reward(self, arm):
        return np.hstack([np.cos(arm @ self.cos_theta[i]) for i in range(self.m)])

In [3]:
# **Parameters**

# number of objectives
m = 2
# number of dimension of the context
d = 10
# number of arms
K = 10
# number of rounds
T = 3000

- Define the agent and the environment

In [4]:
from agents import MONeural

# learning agents
mon_ucb = MONeural(
    num_arm=K, 
    num_dim=d, 
    num_obj=m,
    opt_type='min',
    style='ucb', # method type: "ucb" or "ts"
    lamda=1., 
    delta=.05, 
    hidden_size=256, 
    hidden_layer=1, 
    rho=0.05, # hyperparameter for exploration
    lr=1e-2, 
)

# simulation environment
env = quadEnv(
    num_arm=K, 
    num_dim=d, 
    num_obj=m,
    vary_context=True,
    opt_type='min',
    noise_var=0.1, # variance of the Gaussian noise
)

- Define the distribution of the preference vector (uniform distributed correlates to Pareto optimality)

In [5]:
def runif_in_simplex(n):
  ''' Return uniformly random vector in the n-simplex '''
  k = np.random.exponential(scale=1.0, size=n)
  return k / sum(k)

In [6]:
# reset the agent and environment
mon_ucb.reset()
env.reset()

tot_reg = 0
for t in range(T): 
    # sample preference vector 
    weight_vector = runif_in_simplex(m)
    # observe the context
    X = env.observe_context()
    # select an arm by the agent
    a_t = mon_ucb.take_action(context=X, weight_vector=weight_vector)
    # obtian the reward (and the regret (for evaluating the performance))
    r_t = env.get_reward(arm=a_t)
    reg_t = env.get_regret(arm=a_t, weight_vector=weight_vector).item()
    # update the agent
    mon_ucb.update(info=(a_t, r_t, X[a_t]))
    # print the cumulative regret
    if t%100 == 0: print(f"Round: {t:d}, instantaneous regret: {reg_t:.4f}, cumulative regret: {tot_reg:.4f}.")
    tot_reg += reg_t

Round: 0, instantaneous regret: 1.3743, cumulative regret: 0.0000.
Round: 100, instantaneous regret: 0.0000, cumulative regret: 411.8753.
Round: 200, instantaneous regret: 0.0000, cumulative regret: 443.4100.
Round: 300, instantaneous regret: 0.0000, cumulative regret: 447.6687.
Round: 400, instantaneous regret: 0.0000, cumulative regret: 458.7001.
Round: 500, instantaneous regret: 0.0000, cumulative regret: 464.3926.
Round: 600, instantaneous regret: 0.0000, cumulative regret: 465.0309.
Round: 700, instantaneous regret: 0.0000, cumulative regret: 465.2262.
Round: 800, instantaneous regret: 0.0000, cumulative regret: 466.5677.
Round: 900, instantaneous regret: 0.0000, cumulative regret: 473.3409.
Round: 1000, instantaneous regret: 0.0000, cumulative regret: 473.7176.
Round: 1100, instantaneous regret: 0.0000, cumulative regret: 474.6320.
Round: 1200, instantaneous regret: 0.0000, cumulative regret: 476.4915.
Round: 1300, instantaneous regret: 0.0000, cumulative regret: 476.9105.
Round:

# MOO Cases

In [7]:
from utils import get_problem

p_name = "DTLZ2"
p = get_problem(p_name)

d = p.n_dim
m = p.n_obj

class mooEnv(moContextMABSimulator): 
    def _sample_context(self):
        self.A = np.random.rand(self.K, self.d)
    
    def _eval_expected_reward(self, arm):
        return p.evaluate(torch.tensor(arm)).to('cpu').numpy()

In [8]:
# number of arms
K = 20
# number of rounds
T = 1000

In [9]:
mon_ts = MONeural(
    opt_type='min',
    style='ts',
    lamda=1.,
    delta=.05, 
    num_arm=K, 
    num_dim=d, 
    num_obj=m,
    hidden_size=256, 
    hidden_layer=2,
    rho=0.01, 
)

env = mooEnv(
    num_arm=K, 
    num_dim=d, 
    num_obj=m, 
    vary_context=1, 
    opt_type='min',
    noise_var=0.1,
)

In [10]:
alg = mon_ts
# reset the agent and environment
alg.reset()
env.reset()

tot_reg = 0
for t in range(T): 
    # sample preference vector 
    weight_vector = runif_in_simplex(m)
    # observe the context
    X = env.observe_context()
    # select an arm by the agent
    a_t = alg.take_action(context=X, weight_vector=weight_vector)
    # obtian the reward (and the regret (for evaluating the performance))
    r_t = env.get_reward(arm=a_t)
    reg_t = env.get_regret(arm=a_t, weight_vector=weight_vector).item()
    # update the agent
    alg.update(info=(a_t, r_t, X[a_t]))
    # print the cumulative regret
    if t%100 == 0: print(f"Round: {t:d}, instantaneous regret: {reg_t:.4f}, cumulative regret: {tot_reg:.4f}.")
    tot_reg += reg_t

Round: 0, instantaneous regret: 0.1523, cumulative regret: 0.0000.
Round: 100, instantaneous regret: 0.0000, cumulative regret: 18.3629.
Round: 200, instantaneous regret: 0.0321, cumulative regret: 22.2620.
Round: 300, instantaneous regret: 0.0000, cumulative regret: 23.3036.
Round: 400, instantaneous regret: 0.0000, cumulative regret: 24.2018.
Round: 500, instantaneous regret: 0.0000, cumulative regret: 24.7917.
Round: 600, instantaneous regret: 0.0000, cumulative regret: 25.2387.
Round: 700, instantaneous regret: 0.0181, cumulative regret: 25.6296.
Round: 800, instantaneous regret: 0.0000, cumulative regret: 26.0412.
Round: 900, instantaneous regret: 0.0000, cumulative regret: 26.5372.
