<a href="https://colab.research.google.com/github/jmhuer/utaustin_optimization/blob/main/homework9/Elin_assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Elimination

In this exercise, we will explore the multi-armed bandit problem using the Elimination algorithm.

## Setup

As before, consider the unstructured bandit problem. Suppose we have $k$ arms, each with a random reward $p_i = u_i + \epsilon$, where $u_i$ is the mean reward of arm $i$ and $\epsilon$ is drawn i.i.d. from a standard Gaussian. (Note that we only require $\epsilon$ to be sub-Gaussian for the analysis to go through.)

The following code captures this setup.

In [168]:
import numpy as np
import matplotlib.pyplot as plt
import pdb

class Gaussian_Arm:
  '''
  A multi-armed bandit whose arm i pays mu[i] plus standard Gaussian noise
  on every pull.

  Besides generating rewards, the object keeps bookkeeping for the player:
  every reward returned by pull_arm is appended to rewards_history and the
  number of pulls made so far is accumulated in total_pull.
  '''

  def __init__(self, num_arms, mu=None):
    '''
    num_arms: (int). the number of arms, at least two.
    mu: (None or list-type). the mean of the reward of each arm.
        if set to None, a random vector will be generated.

    Raises ValueError if num_arms is not an int >= 2 or if mu does not
    hold exactly one mean per arm.
    '''
    # Test the type first so that a non-numeric num_arms is reported here
    # instead of failing inside the comparison.
    if not isinstance(num_arms, int) or num_arms <= 1:
      raise ValueError('number of arms has to be an int that is at least two')

    self.num_arms = num_arms
    # Compare against None instead of relying on truthiness: the truth value
    # of a multi-element numpy array is ambiguous and raises.
    if mu is not None:
      self.mu = np.asarray(mu)
      if self.mu.shape != (num_arms,):
        raise ValueError('The length of mu does not match the number of arms')
    else:
      self.mu = np.random.rand(num_arms)
    # gap between the best and the worst mean
    self.delta = self.mu.max() - self.mu.min()

    # keep track of the rewards for the user
    self.rewards_history = []
    # keep track of how many times the arms have been pulled
    self.total_pull = 0

  def pull_arm(self, arm_id=-1, pull_time=1):
    '''
    Pull arm `arm_id` `pull_time` times in a row.

    arm_id: (int). index of the arm, in the range 0 .. num_arms-1.
    pull_time: (int). number of consecutive pulls, at least one.

    Returns the sum of the pull_time rewards and records it as a single
    entry of rewards_history. If arm_id is out of range a hint is printed,
    nothing is recorded and None is returned.

    Raises ValueError if pull_time is not an int >= 1.
    '''
    if arm_id < 0 or arm_id >= self.num_arms:
      print('please specify arm id in the range of 0-%d' % (self.num_arms - 1))
      return None
    if not isinstance(pull_time, (int, np.integer)) or pull_time < 1:
      raise ValueError('pull_time has to be an int that is at least one')
    self.total_pull += pull_time
    # Generate reward: pull_time copies of the mean plus pull_time
    # independent standard normal draws, summed inside numpy.
    reward = self.mu[arm_id] * pull_time + np.random.randn(pull_time).sum()
    self.rewards_history.append(reward)
    return reward

  def genie_reward(self):
    '''
    the best expected reward after pulling self.total_pull times
    '''
    return self.total_pull * self.mu.max()

  def my_rewards(self):
    '''
    the total reward collected since the history was last cleared
    '''
    return sum(self.rewards_history)

  def clear_reward_hist(self):
    '''
    forget all recorded rewards and reset the pull counter
    '''
    self.rewards_history = []
    self.total_pull = 0



## Goal of this exercise
1. Basic Elimination algorithm implementation
2. Plot the regret VS horizon ($n$).
3. Implement the doubling trick
4. Plot the regret of the doubling trick VS horizon.

Please refer to the previous assignment for tips.


In [169]:


NUM_ARMS = 2


def ETC(arm, N, delta, m=None, num_arms=NUM_ARMS):
  '''
  Explore-Then-Commit: pull every arm m times, then commit to the arm with
  the largest accumulated exploration reward for the rest of the horizon.

  arm: Arm object. Must provide pull_arm(arm_id, pull_time=...).
  N: the ultimate time horizon
  delta: separation
  m: int. The exploration time for EACH arm. If None it is set to
     max(1, int(4/delta^2 * log(N * delta^2 / 4))).
  num_arms: int. The total number of arms

  Returns the total reward collected over the whole horizon.

  m is capped at N // num_arms so that exploration never uses more than N
  pulls. If nothing is left after exploration the commit phase is skipped;
  if N < num_arms no exploration happens and arm 0 is pulled N times.

  Raises ValueError if an explicitly given m is not a positive int.
  '''
  if m is None:
    gap_sq = delta ** 2
    # The logarithm is only positive when N * delta^2 > 4; otherwise (and
    # for delta == 0, where the formula would divide by zero) fall back to
    # a single exploration round.
    if gap_sq > 0 and N * gap_sq > 4:
      m = max(1, int(4 / gap_sq * np.log(N * gap_sq / 4)))
    else:
      m = 1
  elif not isinstance(m, (int, np.integer)) or m <= 0:
    raise ValueError('m has to be a positive int')

  horizon = int(N)
  # Never explore for longer than the horizon allows.
  m = min(int(m), horizon // num_arms)

  reward_est = np.zeros(num_arms)
  for _ in range(m):
    for j in range(num_arms):
      reward_est[j] += arm.pull_arm(j) ##add rewards per arm
  best_arm = int(np.argmax(reward_est))
  rewards = reward_est.sum() #sum reward from exploration phase

  remaining = horizon - m * num_arms
  # pull_arm needs at least one pull, so skip the commit phase when the
  # exploration phase already consumed the whole horizon.
  if remaining > 0:
    rewards += arm.pull_arm(best_arm, pull_time=remaining)
  return rewards


def doubling_ETC(arm, N, delta, num_arms=NUM_ARMS):
  '''
  Run ETC with the doubling trick: the horizon N is covered by consecutive
  segments of length 2^9, 2^10, 2^11, ... and ETC is restarted from scratch
  on every segment, each time tuned to that segment's length. The last
  segment is shortened so that the segments add up to exactly N.

  arm: Arm object.
  N: the ultimate time horizon
  delta: separation
  num_arms: int. The total number of arms

  Returns the total reward collected over all segments of this call.
  '''
  total_run = 0
  total_reward = 0
  r = 8
  #
  while total_run < N:
    r += 1
    # The segment length always doubles, independent of the number of arms;
    # the final segment only covers what is left of the horizon.
    N_r = min(2 ** r, N - total_run)
    # ETC picks its own exploration length from N_r; it has to be told the
    # number of arms explicitly, otherwise it would use its own default.
    total_reward += ETC(arm, N_r, delta, num_arms=num_arms)
    total_run += N_r
  return total_reward


In [170]:

def regret_vs_horizon(Ns:list):
  '''
  Measure the regret of ETC and of doubling ETC for every horizon in Ns.

  A single two-armed Gaussian bandit with means [0.1, 0.0] is shared by all
  runs; its reward history is cleared after each run so every measurement
  starts from zero pulls. Regret is the genie reward for the pulls made
  minus the reward actually collected.

  Ns: list of horizons.

  Returns two lists aligned with Ns: the ETC regrets and the doubling-ETC
  regrets.
  '''
  repeats = 1
  bandit = Gaussian_Arm(NUM_ARMS, mu=[0.1, 0.0])
  plain_curve, doubling_curve = [], []

  for horizon in Ns:
    plain_total = 0
    doubling_total = 0
    for _ in range(repeats):
      # plain ETC on the full horizon
      collected = ETC(bandit, horizon, bandit.delta)
      plain_total += bandit.genie_reward() - collected
      bandit.clear_reward_hist()

      # ETC restarted on growing segments
      collected = doubling_ETC(bandit, horizon, bandit.delta)
      doubling_total += bandit.genie_reward() - collected
      bandit.clear_reward_hist()

    # average over the repetitions
    plain_curve.append(plain_total / repeats)
    doubling_curve.append(doubling_total / repeats)

  return plain_curve, doubling_curve


import plotly.graph_objects as graph
def plot(all_history:list, xname:str, yname:str ,title:str, log = False):
    """
    Draw one line per entry of all_history on a single plotly figure.

    input:
        all_history: list of dicts to plot; each dict holds its x values
                     under key xname, its y values under key yname and its
                     trace name under key "legend"
        xname: dict key of the x values
        yname: dict key of the y values
        title: text shown as the figure title
        log: if true, the x axis is switched to a log scale
    ret:
        None: show plotly fig
    """
    heading = graph.layout.Title(text=title)
    fig = graph.Figure(layout=graph.Layout(title=heading))
    for series in all_history:
        trace = graph.Scatter(x=series[xname],
                              y=series[yname],
                              name=series["legend"])
        fig.add_trace(trace)
    if log:
        fig.update_xaxes(type="log")
    fig.show()



In [171]:


# Horizons to evaluate: 10000, 20000, ..., 190000
Ninit = 10000
Ns  = [Ninit * i for i in range(1, 20)]
# number of independent simulations to average over
avg = 5
# One row per simulation, one column per horizon in Ns. The rows are only
# averaged later, when the curves are prepared for plotting.
mean_ETC_regret = np.zeros((avg , len(Ns)))
mean_ETC_doubling_regret = np.zeros((avg , len(Ns)))
for i in range(avg):
  ETC_regret,  ETC_doubling_regret = regret_vs_horizon(Ns)
  mean_ETC_regret[i] = ETC_regret
  mean_ETC_doubling_regret[i] = ETC_doubling_regret
  # Report progress only once the simulation has actually finished.
  print("simulation {} completed".format(i))



simulation 0 completed
simulation 1 completed
simulation 2 completed
simulation 3 completed
simulation 4 completed


In [172]:

# Collapse the per-simulation rows into one mean curve per algorithm and wrap
# each curve in the dict layout expected by plot().
# NOTE: both names are rebound from the raw (simulation x horizon) arrays to
# the plot dicts, so the simulation cell has to be re-run before this cell
# can be executed a second time.
mean_ETC_regret = {"legend": "mean_ETC_regret",
                   "x": Ns ,
                   "y": list(np.mean(mean_ETC_regret, axis=0))}

# Give the doubling curve its own legend so the two traces can be told apart
# on the combined figures.
mean_ETC_doubling_regret =  {"legend": "mean_ETC_doubling_regret",
                             "x": Ns ,
                             "y": list(np.mean(mean_ETC_doubling_regret, axis=0))}

all_history = [mean_ETC_regret, mean_ETC_doubling_regret ]


# linear x axis: each curve on its own, then both together
plot([mean_ETC_regret], xname="x", yname="y" , title="regret VS horizon - linear" , log = False)
plot([mean_ETC_doubling_regret], xname="x", yname="y" , title="doubling_regret VS horizon - linear" , log = False)
plot(all_history, xname="x", yname="y" , title="regret and doubling_regret VS horizon - linear" , log = False)

# logarithmic x axis: same three figures
plot([mean_ETC_regret], xname="x", yname="y" , title="regret VS horizon - Log" , log = True)
plot([mean_ETC_doubling_regret], xname="x", yname="y" , title="doubling_regret VS horizon - Log" , log = True)
plot(all_history, xname="x", yname="y" , title="regret and doubling_regret VS horizon - Log" , log = True)
