<a href="https://colab.research.google.com/github/jmhuer/utaustin_optimization/blob/main/homework12/LinUCB_assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LinUCB

In this exercise, we will start looking at linear bandit with finite arms.

The set-up is captured by the following python class:

In [95]:
import numpy as np
from matplotlib import pylab as plt
from tqdm import tqdm

# (4 choose 2)
available_arms = np.array([
  (1, 1, 0, 0),
  (1, 0, 1, 0),
  (1, 0, 0, 1),
  (0, 1, 1, 0),
  (0, 1, 0, 1),
  (0, 0, 1, 1)])

class Context_arm(object):
  def __init__(self, available_arms=available_arms,gaussian_bandit=True):
    self.available_arms = np.array(available_arms)
    self.gaussian_bandit = gaussian_bandit
    self.num_arms = len(self.available_arms)
    self.theta = np.array((0.1, 0.2, 0.2, 0.3)) #this is what we want to learn
    self.num_features = len(self.available_arms[0])

    # keep track of the rewards for the user
    self.rewards_history = []
    # keep track of how many times the arms have been pulled
    self.total_pull = 0 


  def pull_arm(self, arm_idx):
    arm = self.available_arms[arm_idx]
    if self.gaussian_bandit:
      reward = self.theta.dot(arm) + np.random.randn() / 2 
    else:  # Bernoulli bandit
      reward = 1 if np.random.random()< self.theta.dot(arm) else 0
    
    self.total_pull+=1
    self.rewards_history.append(reward)
    return reward
  
  def genie_reward(self, arms=None):
    if not arms:
      arms = self.available_arms
    if self.gaussian_bandit:
      reward = np.max([self.theta.dot(arm) for arm in arms]) 
    else:  # Bernoulli bandit
      reward = np.max([self.theta.dot(arm) for arm in arms])
    return reward * self.total_pull

  def my_rewards(self):
    return sum(self.rewards_history)

  def clear_reward_hist(self):
    self.rewards_history = []
    self.total_pull = 0


## Goal of this exercise
1. Basic LinUCB algorithm implementation, for both Gaussian Reward and Bernoulli rewards
2. Plot the regret VS horizon ($n$).
3. Compare LinUCB with original UCB (for Gaussian rewards only) and KL-UCB (for Bernoulli rewards only)



In [96]:


# NUM_ARMS = 2


def LinUCB(arm, N, num_arms=NUM_ARMS): 
  pull_val = 1
  ##pre alg calculations
  delta = 1/N**2 #desiree confidence
  alpha = 1 + np.sqrt(np.log(2/delta) / 2)
  bonus = lambda x, A, a, t: 2 * np.sqrt(x.T @ (np.linalg.inv(A[a]) @ x)) 

  #I will initialize A, b before main loop for simplicity, yahoo paper does it when new actions show up
  A = [np.identity(arm.num_features) for i in range(arm.num_arms)]
  b = [np.zeros(arm.num_features) for i in range(arm.num_arms)]

  # MAIN LOOOP
  for t in range(N):
      thetas = [np.linalg.inv(Aa) @ ba for (Aa, ba) in zip(A, b)] #disjoint implementation meaning thetas are not shared accross arms 
      confidence_intervals = np.array([(theta.T @ xt) + bonus(xt, A, a, t) for (xt, a, theta) in zip(arm.available_arms, range(arm.num_arms), thetas)])
      
      #pull top 2
      UCBbest_arms = (-confidence_intervals).argsort()[:pull_val] # top 2
      for best_arm in UCBbest_arms:
        rt = arm.pull_arm(best_arm)
        A = [Aa + np.outer(xt, xt.T) for (xt, Aa) in zip(arm.available_arms, A)]
        b = [ba + (rt * xt) for (xt, ba) in zip(arm.available_arms, b)]

  # DONE
  print(thetas)
  return arm.my_rewards()



In [97]:

def regret_vs_horizon(Ns:list, REPEAT:int, algorithm: type(lambda x: None)):
  regret = []
  # mu = [0.1, 0.0]
  my_arm = Context_arm()
  for NUM_RUNs in Ns:
    print(NUM_RUNs)
    cur_regret = 0
    for repeat in range(REPEAT):
        rewards = algorithm(my_arm, NUM_RUNs, my_arm.num_arms) ## everyrun NUM_RUNs += 10000
        cur_regret += my_arm.genie_reward() - rewards
        my_arm.clear_reward_hist()
    cur_regret /= REPEAT
    regret.append(cur_regret)
    #
  return regret


import plotly.graph_objects as graph
def plot(all_history:list, title:str, log = False):
    """
    input:
        all_history: list of dicts to plot
    ret:
        None: show plotly fig
    """
    fig = graph.Figure(layout = graph.Layout(title=graph.layout.Title(text=title))) 
    for i in range(len(all_history)):
        fig.add_trace(graph.Scatter(x = all_history[i]["x"], 
                                    y = all_history[i]["y"],
                                    name = all_history[i]["legend"])) 
    if log: fig.update_xaxes(type="log")
    fig.show()



In [98]:

Ninit = 100
Ns  = [Ninit * (2**i) for i in range(1, 5)]
UCB_regret = regret_vs_horizon(Ns, REPEAT=100, algorithm=LinUCB)


plot_UCB_regret = {"legend": "mean_UBC_regret", 
                   "x": Ns , 
                   "y": UCB_regret}

plot([plot_UCB_regret], title="regret VS horizon - linear" , log = False)
plot([plot_UCB_regret], title="regret VS horizon - Log" , log = True)


200
[array([0.14621782, 0.14621782, 0.        , 0.        ]), array([0.14621782, 0.        , 0.14621782, 0.        ]), array([0.14621782, 0.        , 0.        , 0.14621782]), array([0.        , 0.14621782, 0.14621782, 0.        ]), array([0.        , 0.14621782, 0.        , 0.14621782]), array([0.        , 0.        , 0.14621782, 0.14621782])]
[array([0.14489877, 0.14489877, 0.        , 0.        ]), array([0.14489877, 0.        , 0.14489877, 0.        ]), array([0.14489877, 0.        , 0.        , 0.14489877]), array([0.        , 0.14489877, 0.14489877, 0.        ]), array([0.        , 0.14489877, 0.        , 0.14489877]), array([0.        , 0.        , 0.14489877, 0.14489877])]
[array([0.12839086, 0.12839086, 0.        , 0.        ]), array([0.12839086, 0.        , 0.12839086, 0.        ]), array([0.12839086, 0.        , 0.        , 0.12839086]), array([0.        , 0.12839086, 0.12839086, 0.        ]), array([0.        , 0.12839086, 0.        , 0.12839086]), array([0.        , 0.   