<a href="https://colab.research.google.com/github/jmhuer/utaustin_optimization/blob/main/homework9/Elin_assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Elimination

In this exercise, we will explore the multi-armed bandit problem using the Elimination algorithm.

## Setup

As before, consider the unstructured bandit problem. Suppose we have $k$ arms, where arm $i$ yields a random reward $p_i = \mu_i + \epsilon$ and the noise $\epsilon$ is drawn i.i.d. from a standard Gaussian. (Note that the analysis only requires $\epsilon$ to be sub-Gaussian.)

The following code captures this setup.

In [105]:
import numpy as np
import matplotlib.pyplot as plt
import pdb

class Gaussian_Arm:
  '''
  A multi-armed bandit whose arms pay Gaussian rewards.

  A single pull of arm i returns mu[i] plus standard-normal noise. The object
  records every reward it hands out and the total number of pulls, so that the
  regret of a strategy can be computed afterwards.
  '''

  def __init__(self, num_arms, mu=None):
    '''
    num_arms: (int). the number of arms, at least two.
    mu: (None or list-type). the mean of the reward of each arm.
        if set to None, a random vector (uniform on [0, 1)) will be generated.

    Raises ValueError when num_arms is not an int >= 2, or when mu is given
    but is not a 1-D sequence of length num_arms.
    '''
    # Test the type before comparing, so a non-numeric num_arms is reported
    # as a ValueError instead of failing inside `<=`.
    if not isinstance(num_arms, int) or num_arms <= 1:
      raise ValueError('number of arms has to be an int that is at least two')

    self.num_arms = num_arms
    # Compare against None explicitly: the truth value of a numpy array is
    # ambiguous, and an empty list must not silently fall back to random means.
    if mu is not None:
      self.mu = np.asarray(mu, dtype=float)
      if self.mu.shape != (num_arms,):
        raise ValueError('The length of mu does not match the number of arms')
    else:
      self.mu = np.random.rand(num_arms)
    # spread between the best and the worst mean reward
    self.delta = self.mu.max() - self.mu.min()

    # keep track of the rewards for the user
    self.rewards_history = []
    # keep track of how many times the arms have been pulled
    self.total_pull = 0

  def pull_arm(self, arm_id=-1, pull_time=1):
    '''
    Pull arm `arm_id` `pull_time` times in a row.

    The summed reward of those pulls is appended to the reward history as a
    single entry and returned.

    Raises ValueError when arm_id is outside 0..num_arms-1 or when pull_time
    is not an int >= 1.
    '''
    if not 0 <= arm_id < self.num_arms:
      raise ValueError(
          'please specify arm id in the range of 0-%d' % (self.num_arms - 1))
    if not isinstance(pull_time, int) or pull_time < 1:
      raise ValueError('pull_time has to be an int that is at least one')
    self.total_pull += pull_time
    # Generate reward: pull_time i.i.d. draws, summed inside numpy
    noise = np.random.randn(pull_time).sum()
    reward = self.mu[arm_id] * pull_time + noise
    self.rewards_history.append(reward)
    return reward

  def genie_reward(self):
    '''
    the best expected reward after pulling self.total_pull times
    '''
    return self.total_pull * self.mu.max()

  def my_rewards(self):
    '''
    the sum of all rewards handed out since the last clear_reward_hist()
    '''
    return sum(self.rewards_history)

  def clear_reward_hist(self):
    '''
    forget all recorded rewards and reset the pull counter
    '''
    self.rewards_history = []
    self.total_pull = 0



## Goal of this exercise
1. Basic Elimination algorithm implementation
2. Plot the regret VS horizon ($n$).
3. Implement the doubling trick
4. Plot the regret of the doubling trick VS horizon.

Please refer to the previous assignment for tips.


In [106]:


# Number of bandit arms; used as the default `num_arms` of the algorithms below
# and to build the arm in the regret experiment.
NUM_ARMS = 2

class Empirical_mean:
    '''
    Running per-arm reward statistics.

    Attributes (all numpy arrays of length num_arms):
        rewards: sum of the rewards recorded for each arm
        count:   number of rewards recorded for each arm
        mean:    rewards / count for arms with at least one reward, else 0
    '''

    def __init__(self, num_arms):
        self.rewards = np.zeros(num_arms)
        self.count = np.zeros(num_arms)
        self.mean = np.zeros(num_arms)

    def add_new_reward(self, reward, indx):
        '''Record one reward for arm `indx` and refresh that arm's mean.'''
        self.rewards[indx] += reward
        self.count[indx] += 1
        self.mean[indx] = self.rewards[indx] / self.count[indx]

    # A method called `mean` would be hidden by the `mean` array that
    # __init__ stores on the instance, so the accessor has its own name.
    def get_mean(self, indx):
        '''Return the current empirical mean of arm `indx`.'''
        return self.mean[indx]

    def reset(self):
        '''Forget all recorded rewards (fresh zero arrays for every statistic).'''
        # zeros_like instead of `* 0`: multiplying inf/NaN by 0 gives NaN.
        self.rewards = np.zeros_like(self.rewards)
        self.count = np.zeros_like(self.count)
        self.mean = np.zeros_like(self.mean)


def elimination(arm, N, num_arms=NUM_ARMS):
  '''
  Phased elimination for a multi-armed bandit.

  The gap guess `delta` starts at 1 and is halved after every phase. In a
  phase each surviving arm is pulled m = floor(2*log(N*delta^2) / delta^2)
  times in round-robin order (fewer if the remaining budget does not allow
  it), using statistics from that phase only. An arm is then dropped when
  its mean + C is below the largest (mean - C) among the surviving arms,
  with C = sqrt(log(N*delta^2) / (2*m)). Exploration stops when one arm is
  left, when m <= 1, or when the budget is exhausted; the remaining pulls all
  go to the surviving arm with the best mean from the last completed phase.

  arm: Arm object. Must provide pull_arm(arm_id, pull_time=...) and
       my_rewards().
  N: the ultimate time horizon, i.e. the total number of pulls made by
     this call
  num_arms: int. The total number of arms

  returns: arm.my_rewards() after the N pulls
  '''
  N = int(N)
  alive = list(range(num_arms))  # indices of the arms still in play
  if N <= 0 or not alive:
    return arm.my_rewards()

  delta = 1.0
  mean_vals = Empirical_mean(num_arms)
  pulls_used = 0  # pulls spent over ALL phases; the statistics are per phase
  while len(alive) > 1:
    signal = N * delta**2
    if signal <= 1:
      # log(signal) <= 0, so the phase length would not be positive
      break
    m = int(2 * np.log(signal) / delta**2)
    if m <= 1:
      break
    # never let a phase spend more than what is left of the horizon
    m = min(m, (N - pulls_used) // len(alive))
    if m < 1:
      break

    # PHASE 1: round robin on fresh statistics. The reset happens only once
    # the phase is certain to run, so the previous phase's means stay
    # available when the loop stops early.
    mean_vals.reset()
    for _ in range(m):
      for k in alive:
        mean_vals.add_new_reward(arm.pull_arm(k), k)
    pulls_used += m * len(alive)

    # PHASE 2: decide who stays
    C = np.sqrt(np.log(signal) / (2 * m))
    best_lower = max(mean_vals.mean[b] - C for b in alive)
    alive = [b for b in alive if mean_vals.mean[b] + C >= best_lower]
    delta *= 0.5

  # Commit to the best-looking survivor for the rest of the horizon.
  best = max(alive, key=lambda b: mean_vals.mean[b])
  pulls_left = N - pulls_used
  if pulls_left > 0:
    arm.pull_arm(best, pull_time=pulls_left)
  return arm.my_rewards()



def doubling_elimination(arm, N, num_arms=NUM_ARMS):
  '''
  Elimination combined with the doubling trick.

  Runs `elimination` in consecutive epochs whose horizons double from one
  epoch to the next (2^9, 2^10, ...). The last epoch is shortened so that the
  requested horizons add up to N. Each epoch starts elimination from scratch
  and is given the epoch length as its horizon; no knowledge of the true gap
  between the arms is used.

  arm: Arm object. Must provide total_pull and my_rewards() in addition to
       what `elimination` needs.
  N: the total number of pulls
  num_arms: int. The total number of arms

  returns: arm.my_rewards() after all epochs
  '''
  N = int(N)
  total_run = 0
  r = 8
  while total_run < N:
    r += 1
    # epoch horizon: doubles every epoch, truncated to what is left of N
    N_r = min(2 ** r, N - total_run)
    pulls_before = arm.total_pull
    elimination(arm, N=N_r, num_arms=num_arms)
    # Advance by the pulls that were really made (at least one, so the loop
    # always terminates).
    total_run += max(1, arm.total_pull - pulls_before)
  return arm.my_rewards()



In [107]:


def regret_vs_horizon(Ns:list, REPEAT:int, algorithm: type(lambda x: None), mu=(0.1, 0.0)):
  '''
  Estimate the average regret of `algorithm` for several horizons.

  Ns: list of horizons to evaluate
  REPEAT: number of independent runs averaged per horizon (must be >= 1)
  algorithm: function called as algorithm(arm, horizon, num_arms) that
             returns the total reward it collected
  mu: mean reward of each arm; the number of arms is len(mu)

  returns: list with one average regret per entry of Ns, where the regret of
           a run is arm.genie_reward() minus the reward the algorithm returned

  Raises ValueError when REPEAT is smaller than one.
  '''
  if REPEAT < 1:
    raise ValueError('REPEAT has to be at least one')
  means = list(mu)
  # The arm count follows the means, so the two can never disagree.
  my_arm = Gaussian_Arm(len(means), mu=means)
  regret = []
  for NUM_RUNs in Ns:
    print(NUM_RUNs)  # progress indicator: the horizon being evaluated
    cur_regret = 0
    for _ in range(REPEAT):
      rewards = algorithm(my_arm, NUM_RUNs, my_arm.num_arms)
      cur_regret += my_arm.genie_reward() - rewards
      # start the next run with an empty reward history and pull counter
      my_arm.clear_reward_hist()
    regret.append(cur_regret / REPEAT)
  return regret



import plotly.graph_objects as graph
def plot(all_history:list, title:str, log = False):
    """
    Draw every history as one line trace on a single plotly figure.

    input:
        all_history: list of dicts, each with the keys "x", "y" and "legend"
        title: text shown as the figure title
        log: when truthy, the x axis is switched to a log scale
    ret:
        None: show plotly fig
    """
    figure_layout = graph.Layout(title=graph.layout.Title(text=title))
    fig = graph.Figure(layout=figure_layout)
    for series in all_history:
        trace = graph.Scatter(x=series["x"],
                              y=series["y"],
                              name=series["legend"])
        fig.add_trace(trace)
    if log:
        fig.update_xaxes(type="log")
    fig.show()




In [None]:

# Horizons 400, 800, ..., 102400: the base horizon doubled one to nine times.
Ninit = 200
Ns = [Ninit * 2 ** power for power in range(1, 10)]
# Average regret of plain elimination at every horizon, 300 runs each.
UCB_regret = regret_vs_horizon(Ns, algorithm=elimination, REPEAT=300)


400
800
1600
3200


In [None]:

# Same horizons as for plain elimination: 400, 800, ..., 102400.
Ninit = 200
Ns = [Ninit * 2 ** power for power in range(1, 10)]
# Average regret of elimination with the doubling trick, 200 runs each.
UCB_doubling_regret = regret_vs_horizon(
    Ns, algorithm=doubling_elimination, REPEAT=200)


In [None]:

# One plot series per algorithm. The variable names are kept as they were;
# the legends name the algorithm that actually produced each curve, and the
# two series no longer share the same label.
mean_ETC_regret = {"legend": "mean_elimination_regret",
                   "x": Ns,
                   "y": UCB_regret}

mean_ETC_doubling_regret = {"legend": "mean_doubling_elimination_regret",
                            "x": Ns,
                            "y": UCB_doubling_regret}

# Each curve is shown twice: with a linear and with a logarithmic x axis.
plot([mean_ETC_regret], title="regret VS horizon - linear" , log = False)
plot([mean_ETC_regret], title="regret VS horizon - Log" , log = True)

plot([mean_ETC_doubling_regret], title="doubling_regret VS horizon - linear" , log = False)
plot([mean_ETC_doubling_regret], title="doubling_regret VS horizon - Log" , log = True)

