In [None]:
# Imports: python modules
import sys, os, re, time
import timeit
from configparser import ConfigParser
# Science
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt

In [None]:
# Imports: our own modules
# Add path
sys.path.append('../../bandits')
# Aux functions
from aux_functions import *
# Bandit modules
from bandits import *
from bandit_plotting import *
from bandit_environments import *
from bandit_reward_models import * 

# GP models
from gp_models import *

# Useful bandit execution for later use

In [None]:
def execute_contextual_bandit(my_bandit, environment, n_bandit_interactions, context, verbose=True):
    """Run a contextual bandit against an environment for a fixed number of interactions.

    At every interaction t the bandit is asked for an arm given context[t],
    the environment is played with that arm and context, and the resulting
    (context, arm, reward) triplet is handed to the bandit's update_history.

    Args:
        my_bandit: object exposing next_action(context) and
            update_history(observed_context=, played_arm=, observed_reward=).
        environment: object exposing play(arm, context).
        n_bandit_interactions: number of interactions to run.
        context: indexable by t in [0, n_bandit_interactions); one context per interaction.
        verbose: when True (the default, matching the original behavior) a
            progress line is printed per interaction; pass False to keep
            long, multi-realization runs from flooding the cell output.

    Returns:
        The very same my_bandit object, after update_history has been
        called once per interaction.
    """
    for t in range(n_bandit_interactions):
        if verbose:
            print('Bandit interaction t={}'.format(t))

        # The same context is used to decide, to play and to update
        context_t=context[t]

        # Decide next arm
        a_t=my_bandit.next_action(context_t)

        # Play selected arm and observe reward
        y_t=environment.play(a_t,context_t)

        # Update history
        my_bandit.update_history(
                observed_context=context_t,
                played_arm=a_t,
                observed_reward=y_t
                )

    return my_bandit

# Bandit

## Arm space

In [None]:
# Grid resolution: linspace over [0,1] returns n_arm_points+1 equally spaced values
n_arm_points = 100
a_points = torch.linspace(0, 1, n_arm_points + 1)
# Arm dimensionality
d_arms = 2
# One coordinate grid per arm dimension, built from the same 1-D points
per_arm_meshgrid = torch.meshgrid([a_points for _ in range(d_arms)])
# Stack the coordinate grids on a trailing axis and flatten: one arm per row, d_arms columns
a = torch.stack(per_arm_meshgrid, dim=-1).reshape(-1, d_arms)
# Context space
# For now, d_context==d_arms
d_context = d_arms

## True bandit reward model

In [None]:
# Environment the bandits interact with: slope 1, intercept 0, noise variance 0.1
true_environment_params=dict(slope=1, intercept=0, noise_var=0.1)
true_bandit_environment=ContinuousContextualLinearGaussianBanditModel(**true_environment_params)

## Our Bandit

### GP based bandit

In [None]:
# GP configuration
gp_config_filename='contextual_gp_config.ini'
gp_config_path='../gp_configs/{}'.format(gp_config_filename)
# Based on config parser
gp_config = ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns only the
# names it managed to parse. Fail loudly here: otherwise every later
# get(..., fallback=...) would quietly run with defaults instead of the config.
gp_config_loaded=gp_config.read(gp_config_path)
if not gp_config_loaded:
    raise FileNotFoundError(
        'Could not read GP config file: {}'.format(gp_config_path)
    )
# Show which config files were loaded (same cell output as before)
gp_config_loaded


In [None]:
# GP model definition (without training input/output yet) from config
# Mean and kernel functions come from their own config sections
gp_mean_functions=load_gp_functions_to_dict(gp_config, 'mean_functions')
gp_kernel_functions=load_gp_functions_to_dict(gp_config, 'kernel_functions')
# Composition mode read from config, defaulting to 'add' when the option is absent
gp_action_context_composition=gp_config.get(
    'gp_model','action_context_composition',fallback='add'
)
# NOTE(review): eval() executes whatever expression the config file holds;
# only run this notebook with config files you trust
gp_likelihood=eval(gp_config.get('gp_model', 'llh_function'))
gp_model=ExactContextualGPModel(
    gp_input=None,
    y=None,
    d_context=d_context,
    mean_functions=gp_mean_functions,
    kernel_functions=gp_kernel_functions,
    action_context_composition=gp_action_context_composition,
    likelihood=gp_likelihood,
)

In [None]:
# GP training options, from config
# NOTE(review): eval() executes the configured expression, so the config file must be trusted.
# NOTE(review): the fallback expression calls ExactMarginalLogLikelihood with no
# arguments -- confirm that it can actually be constructed that way.
training_loss=eval(
    gp_config.get('training', 'loss', fallback='gpytorch.mlls.ExactMarginalLogLikelihood()')
)
training_max_iters=gp_config.getint('training', 'n_train_max_iters', fallback=100)
training_loss_epsilon=gp_config.getfloat('training', 'loss_epsilon', fallback=0.01)
# NOTE(review): a value found in the config is returned as a string, whereas the
# fallback is the torch.optim.Adam class itself -- verify the consumer handles both
training_optimizer=gp_config.get('training', 'optimizer', fallback=torch.optim.Adam)
# Raw key/value pairs of the [optimization_params] section, every value cast to float
training_optimizer_params=cast_dict_values(gp_config._sections['optimization_params'], float)

gp_training_options={
    'loss':training_loss,
    'n_train_max_iters':training_max_iters,
    'loss_epsilon':training_loss_epsilon,
    'optimizer':training_optimizer,
    'optimizer_params':training_optimizer_params,
}

In [None]:
# Bandit reward model, from config
# The likelihood expression is evaluated again here, so this is a separate
# object from the one handed to gp_model.
# NOTE(review): eval() on config content -- trusted config files only
reward_likelihood_model=eval(gp_config.get('gp_model', 'llh_function'))
bandit_reward_model=GPContextualRewardModel(
    gp_model=gp_model,
    likelihood_model=reward_likelihood_model,
    gp_training=gp_training_options,
)

### Bandit algorithm

In [None]:
# Algorithm specification for Thompson sampling: only a name is required
ts_algorithm = dict(name='ThompsonSampling')

In [None]:
# Instantiate bandit class: Thompson sampling over the arm grid `a`
ts_bandit_settings=dict(
    d_context=d_context,
    arm_space=a,
    reward_model=bandit_reward_model,
    algorithm=ts_algorithm,
)
ts_bandit=ContinuousArmContextualBandit(**ts_bandit_settings)

# One bandit execution

In [None]:
# Length of one bandit run, in interactions
n_bandit_interactions = 250

In [None]:
# Context: the same all-ones vector at every interaction (nonsensical, carries no information)
context_shape = (n_bandit_interactions, d_context)
context = torch.ones(context_shape)
#context=torch.arange(n_bandit_interactions)*torch.ones((1,d_context)) # Increasing context

In [None]:
# Optimal arm: the context gets an extra middle axis so that an arm is picked per bandit interaction
context_per_interaction = context[:, None, :]
optimal_arm = true_bandit_environment.optimal_arm(a, context_per_interaction)
# The expected reward of the optimal arm does not depend on the bandit, so pre-compute it
optimal_expected_rewards = true_bandit_environment.mean(optimal_arm, context)

In [None]:
# Execute bandit: restart it for this run length, then play every interaction
ts_bandit.restart(n_bandit_interactions)
ts_bandit = execute_contextual_bandit(
    my_bandit=ts_bandit,
    environment=true_bandit_environment,
    n_bandit_interactions=n_bandit_interactions,
    context=context,
)

In [None]:
# Sanity check: shape of the played-arms record held by the bandit after the run
ts_bandit.played_arms.shape

In [None]:
# Elementwise comparison: one boolean per arm coordinate
# (assumes both tensors are (n_bandit_interactions, d_arms) -- TODO confirm)
arm_coordinate_matches=(optimal_arm==ts_bandit.played_arms)
print(arm_coordinate_matches)
# An interaction only counts as optimal when every coordinate of the played arm
# matches. Summing the elementwise mask would count coordinates instead of arms,
# letting the reported ratio exceed 1 whenever d_arms>1.
optimal_arm_played=arm_coordinate_matches.all(-1)
print(torch.sum(optimal_arm_played)/n_bandit_interactions)

# Multiple bandit realizations

In [None]:
# Length of each bandit run
n_bandit_interactions = 250

# Context: the same all-ones vector at every interaction (nonsensical, carries no information)
context = torch.ones((n_bandit_interactions, d_context))
#context=torch.arange(n_bandit_interactions)*torch.ones((1,d_context)) # Increasing context

# Optimal arm: extra middle context axis so that an arm is picked per bandit interaction
optimal_arm = true_bandit_environment.optimal_arm(a, context[:, None, :])
# Optimal expected reward, pre-computed once and given a leading singleton axis
optimal_expected_rewards = true_bandit_environment.mean(optimal_arm, context)[None, :]

# Number of realizations
R = 10
# Storage for the arms played and rewards observed in every realization
played_arms = np.zeros((R, n_bandit_interactions, d_arms))
observed_rewards = np.zeros((R, n_bandit_interactions))

# Run realizations of the Thompson sampling bandit
realization_footer = '**********************************'
for r in np.arange(R):
    print('************ r={}/{} **************'.format(r, R))
    # Every realization starts from a restarted bandit
    ts_bandit.restart(n_bandit_interactions)
    # Execute bandit realization
    this_bandit = execute_contextual_bandit(
        ts_bandit, true_bandit_environment, n_bandit_interactions, context
    )
    # Keep played arms and observed rewards
    played_arms[r] = this_bandit.played_arms
    observed_rewards[r] = this_bandit.observed_rewards
    print(realization_footer)

In [None]:
# Plotting (shown inline): the three plots share the same positional arguments
plot_args = (observed_rewards, optimal_expected_rewards, n_bandit_interactions)
plot_rewards(*plot_args)
plot_cumrewards(*plot_args)
plot_cumregret(*plot_args)

# Plotting (saved): rewards and cumulative regret are also written as PDFs
fig_dir = './figs_ContextualGPBandit_linear_Gaussian'
os.makedirs(fig_dir, exist_ok=True)
rewards_filename = '{}/rewards_ts_R{}.pdf'.format(fig_dir, R)
cum_regret_filename = '{}/cum_regret_ts_R{}.pdf'.format(fig_dir, R)
plot_rewards(*plot_args, plot_filename=rewards_filename)
plot_cumregret(*plot_args, plot_filename=cum_regret_filename)


### Bandit algorithm: UCB

In [None]:
# Time-varying beta function to use
# Update beta functions as per guidelines here:
#   https://arxiv.org/abs/0912.3995
#   https://papers.nips.cc/paper/2011/file/f3f1b7fc5a8779a9e618e1f23a7b7860-Paper.pdf
def beta_function(t,D=1,delta=0.01):
    """Time-varying UCB coefficient: 2*log(D*((t+1)*pi)^2/(6*delta)).

    Args:
        t: interaction index; it is shifted by one inside the formula.
        D: multiplicative constant inside the logarithm (default 1).
        delta: constant entering the denominator as 6*delta (default 0.01).

    Returns:
        A torch tensor holding the coefficient.
    """
    scaled_time=torch.tensor((t+1)*np.pi)
    log_argument=D*torch.pow(scaled_time,2)/(6*delta)
    return 2*torch.log(log_argument)

In [None]:
# UCB algorithm specification: its name plus the time-varying beta callable
ucb_algorithm = dict(name='UCB', beta=beta_function)

In [None]:
# Instantiate bandit class: UCB over the arm grid `a`
# NOTE(review): this is the same bandit_reward_model instance that ts_bandit
# was built with -- confirm that restart() fully resets it between bandits
ucb_bandit_settings=dict(
    d_context=d_context,
    arm_space=a,
    reward_model=bandit_reward_model,
    algorithm=ucb_algorithm,
)
ucb_bandit=ContinuousArmContextualBandit(**ucb_bandit_settings)

In [None]:
# Length of each bandit run
n_bandit_interactions = 250

# Context: the same all-ones vector at every interaction (nonsensical, carries no information)
context = torch.ones((n_bandit_interactions, d_context))
#context=torch.arange(n_bandit_interactions)*torch.ones((1,d_context)) # Increasing context

# Optimal arm: extra middle context axis so that an arm is picked per bandit interaction
optimal_arm = true_bandit_environment.optimal_arm(a, context[:, None, :])
# Optimal expected reward, pre-computed once and given a leading singleton axis
optimal_expected_rewards = true_bandit_environment.mean(optimal_arm, context)[None, :]

# Number of realizations
R = 10
# Storage for the arms played and rewards observed in every realization
played_arms = np.zeros((R, n_bandit_interactions, d_arms))
observed_rewards = np.zeros((R, n_bandit_interactions))

# Run realizations of the UCB bandit
realization_footer = '**********************************'
for r in np.arange(R):
    print('************ r={}/{} **************'.format(r, R))
    # Every realization starts from a restarted bandit
    ucb_bandit.restart(n_bandit_interactions)
    # Execute bandit realization
    this_bandit = execute_contextual_bandit(
        ucb_bandit, true_bandit_environment, n_bandit_interactions, context
    )
    # Keep played arms and observed rewards
    played_arms[r] = this_bandit.played_arms
    observed_rewards[r] = this_bandit.observed_rewards
    print(realization_footer)

In [None]:
# Plotting (shown inline): the three plots share the same positional arguments
plot_args = (observed_rewards, optimal_expected_rewards, n_bandit_interactions)
plot_rewards(*plot_args)
plot_cumrewards(*plot_args)
plot_cumregret(*plot_args)

# Plotting (saved): rewards and cumulative regret are also written as PDFs
fig_dir = './figs_ContextualGPBandit_linear_Gaussian'
os.makedirs(fig_dir, exist_ok=True)
rewards_filename = '{}/rewards_ucb_R{}.pdf'.format(fig_dir, R)
cum_regret_filename = '{}/cum_regret_ucb_R{}.pdf'.format(fig_dir, R)
plot_rewards(*plot_args, plot_filename=rewards_filename)
plot_cumregret(*plot_args, plot_filename=cum_regret_filename)