In [None]:
# Imports: python modules
import sys, os, re, time
import timeit
from configparser import ConfigParser
# Science
import numpy as np
import scipy.stats as stats

In [None]:
# Imports: our own modules
# Add path
sys.path.append('../../bandits')
# Aux functions
from aux_functions import *
# Bandit modules
from bandits import *
from bandit_plotting import *
from bandit_environments import *
from bandit_reward_models import * 

# GP models
from gp_models import *

# Helper: bandit execution loop, reused by the experiments below

In [None]:
def execute_bandit(bandit, environment, n_bandit_interactions, verbose=True):
    """Run a bandit against an environment for a fixed number of interactions.

    At every interaction the next arm is requested with
    ``bandit.next_action()``, played with ``environment.play(arm)``, and the
    resulting reward is handed back through
    ``bandit.update_history(played_arm=..., observed_reward=...)``.

    Args:
        bandit: object exposing ``next_action()`` and
            ``update_history(played_arm, observed_reward)``.
        environment: object exposing ``play(arm)``.
        n_bandit_interactions: number of interactions to run.
        verbose: when True (default, matching the previous behavior) print one
            progress line per interaction; pass False to keep long
            multi-realization runs from flooding the notebook output.

    Returns:
        The same ``bandit`` object that was passed in, after
        ``update_history`` has been called once per interaction.
    """
    for t in np.arange(n_bandit_interactions):
        if verbose:
            print('Bandit interaction t={}'.format(t))

        # Decide next arm
        a_t=bandit.next_action()

        # Play selected arm and observe reward
        y_t=environment.play(a_t)

        # Update history
        bandit.update_history(
                played_arm=a_t,
                observed_reward=y_t
                )

    return bandit

# Bandit

## Arm space

In [None]:
# Number of grid intervals per arm dimension (the grid itself has n_arm_points+1 points)
n_arm_points=100
a_points=torch.linspace(0,1,n_arm_points+1) # Equally spaced in [0,1], both endpoints included
# Dimensionality
d_arms=1
# Per-dimension coordinate grids; indexing='ij' is stated explicitly because
# calling torch.meshgrid without it is deprecated and emits a warning
per_arm_meshgrid=torch.meshgrid([a_points]*d_arms, indexing='ij')
# One arm per row: shape ((n_arm_points+1)**d_arms, d_arms)
a=torch.stack(per_arm_meshgrid, dim=-1).reshape(-1, d_arms)

## True bandit reward model

In [None]:
# True environment the bandit interacts with: slope 1, intercept 0, noise variance 0.1
true_bandit_environment=ContinuousLinearGaussianBanditModel(slope=1, intercept=0, noise_var=0.1)

In [None]:
# Optimal arm as reported by the true environment for the candidate arms in `a`
optimal_arm = true_bandit_environment.optimal_arm(a)

## Our Bandit

### GP based bandit

In [None]:
# GP configuration
gp_config_filename='gp_config.ini'
gp_config_path='../gp_configs/{}'.format(gp_config_filename)
# Based on config parser
gp_config = ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so fail here instead of later with a confusing
# missing-section error
if not gp_config.read(gp_config_path):
    raise FileNotFoundError(
        'GP config file not found or unreadable: {}'.format(gp_config_path)
    )


In [None]:
# GP model definition (without training input/output yet) from config
# NOTE(review): the three config strings below are passed to eval(), i.e. they
# are executed as Python code; only load config files from a trusted source.
# NOTE(review): 'llh_function' is eval'ed a second time when the bandit reward
# model is built, presumably yielding a separate likelihood object — confirm
# that two instances are intended.
gp_model=ExactGPModel(
                a=None,
                y=None,
                mean_function=eval(gp_config.get('gp_model', 'mean_function')),
                kernel_function=eval(gp_config.get('gp_model', 'kernel_function')),
                likelihood=eval(gp_config.get('gp_model', 'llh_function')),
            )

In [None]:
# GP training options, read from the 'training' section of the config
# (each entry falls back to the given default when absent)
gp_training_options=dict(
    # Loss: the config string (or the fallback string) is evaluated as Python code
    loss=eval(
        gp_config.get('training', 'loss', fallback='gpytorch.mlls.ExactMarginalLogLikelihood()')
    ),
    n_train_max_iters=gp_config.getint('training', 'n_train_max_iters', fallback=100),
    loss_epsilon=gp_config.getfloat('training', 'loss_epsilon', fallback=0.01),
    # Raw config string when the option is set; the torch.optim.Adam class otherwise
    optimizer=gp_config.get('training', 'optimizer', fallback=torch.optim.Adam),
    # Entries of the 'optimization_params' section, passed through cast_dict_values with float
    optimizer_params=cast_dict_values(gp_config._sections['optimization_params'], float),
)

In [None]:
# Bandit reward model: GP model + likelihood (evaluated from the 'llh_function'
# config entry) + GP training options
bandit_reward_model=GPRewardModel(
    gp_model=gp_model,
    likelihood_model=eval(gp_config.get('gp_model', 'llh_function')),
    gp_training=gp_training_options,
)

### Bandit algorithm: Thompson sampling

In [None]:
# Algorithm specification handed to the bandit: Thompson sampling
ts_algorithm=dict(name='ThompsonSampling')

In [None]:
# Bandit over the arm grid `a`, with the GP reward model and the Thompson sampling spec
ts_bandit=ContinuousArmBandit(arm_space=a, reward_model=bandit_reward_model, algorithm=ts_algorithm)

# One bandit execution

In [None]:
# Number of interactions in a single bandit execution
n_bandit_interactions=250

In [None]:
# Optimal reward can be pre-computed:
# the optimal arm is repeated along a (1, n_bandit_interactions) tensor and
# passed to the true environment's mean()
optimal_expected_rewards = true_bandit_environment.mean(
                            optimal_arm*torch.ones((1,n_bandit_interactions))
                        )

In [None]:
# Restart the bandit for n_bandit_interactions, then run one full execution
# against the true environment
ts_bandit.restart(n_bandit_interactions)
ts_bandit = execute_bandit(ts_bandit, true_bandit_environment, n_bandit_interactions)

In [None]:
# Element-wise comparison of the played arms against the optimal arm
optimal_arm_played=(optimal_arm==ts_bandit.played_arms)
print(optimal_arm_played)
# Number of matches divided by the number of interactions
print(torch.sum(optimal_arm_played)/n_bandit_interactions)

# Multiple bandit realizations: Thompson sampling

In [None]:
# Interactions per realization
n_bandit_interactions=250
# Optimal reward can be pre-computed
optimal_expected_rewards = true_bandit_environment.mean(
    optimal_arm*torch.ones((1,n_bandit_interactions))
)
# Number of realizations
R=100
# Storage for every realization: played arms (R x interactions x d_arms)
# and observed rewards (R x interactions)
played_arms=np.zeros((R,n_bandit_interactions,d_arms))
observed_rewards=np.zeros((R,n_bandit_interactions))

# Run the Thompson sampling bandit R times, restarting it before each run
for r in range(R):
    print('************ r={}/{} **************'.format(r,R))
    ts_bandit.restart(n_bandit_interactions)
    this_bandit = execute_bandit(ts_bandit, true_bandit_environment, n_bandit_interactions)
    # Store this realization's played arms and observed rewards
    played_arms[r]=this_bandit.played_arms
    observed_rewards[r]=this_bandit.observed_rewards
    print('**********************************')

In [None]:
# Plotting (no filename given)
for plot_function in (plot_rewards, plot_cumrewards, plot_cumregret):
    plot_function(observed_rewards, optimal_expected_rewards, n_bandit_interactions)

# Plotting (saved)
fig_dir='./figs_GPBandit_linear_Gaussian'
os.makedirs(fig_dir, exist_ok=True)
# Rewards and cumulative regret again, this time with a target PDF filename
plot_rewards(
    observed_rewards, optimal_expected_rewards, n_bandit_interactions,
    plot_filename='{}/rewards_ts_R{}.pdf'.format(fig_dir, R),
)
plot_cumregret(
    observed_rewards, optimal_expected_rewards, n_bandit_interactions,
    plot_filename='{}/cum_regret_ts_R{}.pdf'.format(fig_dir, R),
)

### Bandit algorithm: UCB

In [None]:
# Time-varying beta function used by UCB.
# Beta functions follow the guidelines here:
#   https://arxiv.org/abs/0912.3995
#   https://papers.nips.cc/paper/2011/file/f3f1b7fc5a8779a9e618e1f23a7b7860-Paper.pdf
def beta_function(t,D=1,delta=0.01):
    """Return 2*log(D*((t+1)*pi)**2/(6*delta)) as a torch tensor.

    Args:
        t: interaction index; it enters the formula as t+1.
        D: multiplicative constant inside the logarithm (default 1).
        delta: constant entering the logarithm's denominator as 6*delta
            (default 0.01).
    """
    shifted_time_pi=torch.tensor((t+1)*np.pi)
    log_argument=D*torch.pow(shifted_time_pi,2)/(6*delta)
    return 2*torch.log(log_argument)

In [None]:
# UCB 
ucb_algorithm={
    'name':'UCB',
    'beta':beta_function
}

In [None]:
# Instantiate bandit class
ucb_bandit=ContinuousArmBandit(
            arm_space=a,
            reward_model=bandit_reward_model,
            algorithm=ucb_algorithm
            )

# Multiple bandit realizations: UCB

In [None]:
# Interactions per realization
n_bandit_interactions=250
# Optimal reward can be pre-computed
optimal_expected_rewards = true_bandit_environment.mean(
    optimal_arm*torch.ones((1,n_bandit_interactions))
)
# Number of realizations
R=100
# Storage for every realization: played arms (R x interactions x d_arms)
# and observed rewards (R x interactions)
played_arms=np.zeros((R,n_bandit_interactions,d_arms))
observed_rewards=np.zeros((R,n_bandit_interactions))

# Run the UCB bandit R times, restarting it before each run
for r in range(R):
    print('************ r={}/{} **************'.format(r,R))
    ucb_bandit.restart(n_bandit_interactions)
    this_bandit = execute_bandit(ucb_bandit, true_bandit_environment, n_bandit_interactions)
    # Store this realization's played arms and observed rewards
    played_arms[r]=this_bandit.played_arms
    observed_rewards[r]=this_bandit.observed_rewards
    print('**********************************')

In [None]:
# Plotting (no filename given)
for plot_function in (plot_rewards, plot_cumrewards, plot_cumregret):
    plot_function(observed_rewards, optimal_expected_rewards, n_bandit_interactions)

# Plotting (saved)
fig_dir='./figs_GPBandit_linear_Gaussian'
os.makedirs(fig_dir, exist_ok=True)
# Rewards and cumulative regret again, this time with a target PDF filename
plot_rewards(
    observed_rewards, optimal_expected_rewards, n_bandit_interactions,
    plot_filename='{}/rewards_ucb_R{}.pdf'.format(fig_dir, R),
)
plot_cumregret(
    observed_rewards, optimal_expected_rewards, n_bandit_interactions,
    plot_filename='{}/cum_regret_ucb_R{}.pdf'.format(fig_dir, R),
)