<a href="https://colab.research.google.com/github/jmhuer/utaustin_optimization/blob/main/homework13/LinUCB_assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LinUCB

In this exercise, we will start looking at linear bandits with a finite number of arms.

The setup is captured by the following Python class:

In [None]:
import numpy as np
from matplotlib import pylab as plt
from tqdm import tqdm

# Each arm is a 0/1 feature vector with exactly two of the four features
# switched on, which gives (4 choose 2) = 6 arms.
available_arms = np.array([
  (1, 1, 0, 0),
  (1, 0, 1, 0),
  (1, 0, 0, 1),
  (0, 1, 1, 0),
  (0, 1, 0, 1),
  (0, 0, 1, 1)])


class context_arm(object):
  """Linear bandit environment over a fixed, finite set of arm feature vectors.

  The mean reward of an arm is the inner product of ``theta`` with the arm's
  feature vector. Depending on ``gaussian_bandit`` the observed reward is
  either that mean plus Gaussian noise, or a 0/1 draw whose success
  probability is that mean.
  """

  def __init__(self, available_arms=available_arms, gaussian_bandit=True):
    """Store the arm set and the reward model.

    Args:
      available_arms: array-like of arm feature vectors, one row per arm.
      gaussian_bandit: True for Gaussian rewards, False for Bernoulli rewards.
    """
    # Parameter vector that defines every arm's mean reward.
    self.theta = np.array((0.1, 0.2, 0.2, 0.3))
    # np.array copies its input, so the module-level default is never aliased.
    self.available_arms = np.array(available_arms)
    self.gaussian_bandit = gaussian_bandit

  def pull_arm(self, arm_idx):
    """Draw one random reward for the arm at row ``arm_idx``.

    Returns:
      Gaussian mode: the arm's mean reward plus ``np.random.randn() / 2``.
      Bernoulli mode: 1 with probability equal to the mean reward, else 0.
    """
    mean_reward = self.theta.dot(self.available_arms[arm_idx])
    if self.gaussian_bandit:
      return mean_reward + np.random.randn() / 2
    return 1 if np.random.random() < mean_reward else 0

  def genie_reward(self, arms=None):
    """Return the largest mean reward among ``arms``.

    Args:
      arms: optional sequence or array of arm feature vectors. When omitted
        (or empty) the environment's own ``available_arms`` are used.

    Returns:
      The maximum of ``theta . arm`` over the candidate arms. The value is the
      same for the Gaussian and the Bernoulli reward model.
    """
    # `not arms` is ambiguous for multi-element NumPy arrays and raises
    # ValueError, so test for "nothing supplied" explicitly instead.
    if arms is None or len(arms) == 0:
      arms = self.available_arms
    return np.max([self.theta.dot(arm) for arm in arms])


## Goal of this exercise
1. Implement the basic LinUCB algorithm for both Gaussian and Bernoulli rewards.
2. Plot the regret versus the horizon ($n$).
3. Compare LinUCB with the original UCB (Gaussian rewards only) and with KL-UCB (Bernoulli rewards only).



In [None]:


# Default value of the `num_arms` parameter of LinUCB; also the arm count
# handed to Gaussian_Arm inside regret_vs_horizon.
# NOTE(review): context_arm exposes six arms, so 2 looks like a leftover from
# a two-armed experiment -- confirm before relying on it.
NUM_ARMS = 2

class Empirical_mean:
    """Running per-arm reward statistics.

    Three NumPy arrays, each indexed by arm, are kept in sync:
      rewards -- sum of the rewards recorded for the arm
      count   -- number of rewards recorded for the arm
      mean    -- rewards / count once the arm has a reward, 0 before that

    The ``mean`` array is the public way to read all means at once. Because
    that instance attribute hides any method of the same name, the per-arm
    accessor is called ``get_mean``.
    """

    def __init__(self, num_arms):
        """Create zeroed statistics for ``num_arms`` arms."""
        self.rewards = np.zeros(num_arms)
        self.count = np.zeros(num_arms)
        self.mean = np.zeros(num_arms)

    def add_new_reward(self, reward, indx):
        """Record ``reward`` for arm ``indx`` and refresh that arm's mean."""
        self.rewards[indx] += reward
        self.count[indx] += 1
        # count was just incremented, so the division is never by zero.
        self.mean[indx] = self.rewards[indx] / self.count[indx]

    def get_mean(self, indx):
        """Return the current empirical mean of arm ``indx``."""
        return self.mean[indx]

    def reset(self):
        """Forget every recorded reward, keeping the number of arms."""
        # Allocate fresh zeros instead of multiplying by 0: `x * 0` yields NaN
        # for infinite or NaN sums and -0.0 for negative ones.
        self.rewards = np.zeros_like(self.rewards)
        self.count = np.zeros_like(self.count)
        self.mean = np.zeros_like(self.mean)



def LinUCB(arm, N, num_arms=NUM_ARMS):
  """Play a linear bandit for N rounds with LinUCB and return the total reward.

  A single ridge-regression estimate of the reward parameter is shared by all
  arms. Each round the arm with the largest upper confidence bound

      x . theta_hat + alpha * sqrt(x^T A^{-1} x)

  is pulled, where A = I + sum of x x^T over the pulled arms and
  theta_hat = A^{-1} b with b = sum of reward * x.

  Args:
    arm: environment exposing ``available_arms`` (one feature vector per row)
      and ``pull_arm(index)`` returning a scalar reward.
    N: horizon, i.e. the number of pulls. A non-positive N plays no round.
    num_arms: accepted so the signature matches the other algorithms in this
      notebook; the arm set is read from ``arm.available_arms`` instead.

  Returns:
    The sum of the N observed rewards (0.0 when N <= 0).
  """
  if N <= 0:
    # Nothing to play; this also avoids the division by zero in delta.
    return 0.0

  features = np.asarray(arm.available_arms, dtype=float)
  dim = features.shape[1]

  delta = 1 / N**2
  alpha = 1 + np.sqrt(np.log(2 / delta) / 2)

  # Keep A^{-1} directly: A starts as the identity and only ever receives
  # rank-one updates, so its inverse can be maintained without re-inverting.
  A_inv = np.eye(dim)
  b = np.zeros(dim)
  total_reward = 0.0

  for _ in range(N):
    theta_hat = A_inv @ b
    # x^T A^{-1} x for every arm at once; clip tiny negative round-off.
    quad_forms = np.sum((features @ A_inv) * features, axis=1)
    widths = np.sqrt(np.maximum(quad_forms, 0.0))
    chosen = int(np.argmax(features @ theta_hat + alpha * widths))

    reward = arm.pull_arm(chosen)
    x = features[chosen]

    # Sherman-Morrison update of (A + x x^T)^{-1}.
    A_inv_x = A_inv @ x
    A_inv -= np.outer(A_inv_x, A_inv_x) / (1.0 + x @ A_inv_x)
    b += reward * x
    total_reward += reward

  return total_reward



In [None]:

def regret_vs_horizon(Ns:list, REPEAT:int, algorithm: type(lambda x: None)):
  """Average, for every horizon in Ns, the regret of `algorithm` over REPEAT runs.

  Args:
    Ns: horizons to evaluate; each one is printed before its runs start.
    REPEAT: number of independent runs averaged per horizon.
    algorithm: callable invoked as algorithm(arm, horizon, arm.num_arms) whose
      return value is subtracted from the arm's genie_reward().

  Returns:
    A list with one averaged regret value per entry of Ns, in the same order.

  NOTE(review): Gaussian_Arm is not defined in the visible part of this file;
  it presumably comes from an earlier notebook -- confirm it is in scope.
  """
  mu = [0.1, 0.0]
  my_arm = Gaussian_Arm(NUM_ARMS, mu=mu)
  regret = []
  for NUM_RUNs in Ns:
    print(NUM_RUNs)
    accumulated = 0
    for _ in range(REPEAT):
      rewards = algorithm(my_arm, NUM_RUNs, my_arm.num_arms)
      accumulated += my_arm.genie_reward() - rewards
      # The same arm object is reused, so wipe its history between runs.
      my_arm.clear_reward_hist()
    accumulated /= REPEAT
    regret.append(accumulated)
  return regret


import plotly.graph_objects as graph
def plot(all_history:list, title:str, log = False):
    """Draw one plotly line per history entry and display the figure.

    Args:
        all_history: list of dicts, each providing the keys "x", "y" and
            "legend" (x values, y values and the trace name).
        title: text used as the figure title.
        log: when truthy, the x axis is switched to a logarithmic scale.

    Returns:
        None; the figure is shown via ``fig.show()``.
    """
    figure_title = graph.layout.Title(text=title)
    fig = graph.Figure(layout=graph.Layout(title=figure_title))
    for series in all_history:
        trace = graph.Scatter(x=series["x"],
                              y=series["y"],
                              name=series["legend"])
        fig.add_trace(trace)
    if log:
        fig.update_xaxes(type="log")
    fig.show()



In [None]:

# Horizons double at every step: Ninit * 2, Ninit * 4, ..., Ninit * 2**10.
Ninit = 200
Ns = [Ninit * 2**exponent for exponent in range(1, 11)]

# NOTE(review): `UCB` is not defined in the visible part of this file --
# confirm it is in scope before running this cell.
UCB_regret = regret_vs_horizon(Ns, REPEAT=200, algorithm=UCB)

plot_UCB_regret = dict(legend="mean_UBC_regret", x=Ns, y=UCB_regret)

# Same curve twice: first on a linear x axis, then on a logarithmic one.
plot([plot_UCB_regret], title="regret VS horizon - linear", log=False)
plot([plot_UCB_regret], title="regret VS horizon - Log", log=True)
