<a href="https://colab.research.google.com/github/jmhuer/utaustin_optimization/blob/main/homework9/Elin_assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Elimination

In this exercise, we will explore the multi-armed bandit problem using the Elimination algorithm.

## Setup

As before, consider the unstructured bandit problem. Suppose we have $k$ arms, each with random reward $p_i = u_i + \epsilon$, where $\epsilon$ is drawn i.i.d. from a standard Gaussian. (Note that the analysis only requires $\epsilon$ to be sub-Gaussian.)

The following code captures the setup.

In [110]:
import numpy as np
import matplotlib.pyplot as plt
import pdb

class Gaussian_Arm:
  '''
  A multi-armed bandit whose arms return Gaussian rewards.

  A single pull of arm i yields mu[i] plus standard-normal noise. The object
  records every reward it hands out and the total number of pulls, so the
  caller can compare its earnings with the best expected reward.
  '''

  def __init__(self, num_arms, mu=None):
    '''
    num_arms: (int). the number of arms, must be at least two
    mu: (None or list-type). the mean of the reward of each arm.
        if set to None (or empty), a random vector will be generated.

    On invalid input a message is printed and construction stops early,
    leaving the object without its attributes.
    '''
    # Check the type first so a non-int never reaches the `<=` comparison.
    if not isinstance(num_arms, int) or num_arms <= 1:
      print('number of arms has an int that is at least two')
      return

    self.num_arms = num_arms
    # Explicit None/emptiness test: the truth value of a NumPy array with more
    # than one element is ambiguous and raises ValueError.
    if mu is not None and len(mu) > 0:
      self.mu = np.asarray(mu)
      if len(self.mu) != num_arms:
        print('The lenth of mu does not match the number of arms')
        return
    else:
      # uniform random means in [0, 1)
      self.mu = np.random.rand(num_arms)
    # gap between the largest and the smallest mean
    self.delta = max(self.mu) - min(self.mu)

    # keep track of the rewards for the user
    self.rewards_history = []
    # keep track of how many times the arms have been pulled
    self.total_pull = 0

  def pull_arm(self, arm_id=-1, pull_time=1):
    '''
    arm_id: (int). index of the arm to pull, in the range 0..num_arms-1
    pull_time: (int). how many times to pull the arm, at least one

    Returns the SUM of the pull_time rewards and appends that sum as a single
    entry to rewards_history. If arm_id is out of range a message is printed
    and None is returned without recording anything.
    '''
    if arm_id < 0 or arm_id >= self.num_arms:
      # the largest valid id is num_arms - 1
      print('please specify arm id in the range of 0-%d' % (self.num_arms - 1))
      return
    assert (isinstance(pull_time, int) and pull_time >= 1)
    self.total_pull += pull_time
    # Generate reward: pull_time copies of the mean plus pull_time noise draws.
    # ndarray.sum() adds the draws in native code instead of a Python loop.
    reward = self.mu[arm_id] * pull_time + np.random.randn(pull_time).sum()
    self.rewards_history.append(reward)
    return reward

  def genie_reward(self):
    '''
    the best expected reward after pulling self.total_pull times
    '''
    best_mu = max(self.mu)
    return self.total_pull * best_mu

  def my_rewards(self):
    '''
    the total reward collected since the history was last cleared
    '''
    return sum(self.rewards_history)

  def clear_reward_hist(self):
    '''
    forget all recorded rewards and reset the pull counter
    '''
    self.rewards_history = []
    self.total_pull = 0



## Goal of this exercise
1. Basic Elimination algorithm implementation
2. Plot the regret VS horizon ($n$).
3. Implement the doubling trick
4. Plot the regret of the doubling trick VS horizon.

Please refer to the previous assignment for tips.


In [111]:


NUM_ARMS = 2

def ETC(arm, N, delta, m=None, num_arms=NUM_ARMS):
  '''
  Explore-then-commit: pull every arm m times, then commit the rest of the
  horizon to the arm with the largest total reward during exploration.

  arm: Arm object. Must provide pull_arm(arm_id, pull_time=...) returning the
       summed reward of those pulls.
  N: the ultimate time horizon
  delta: separation
  m: int. The exploration time for EACH arm. If None it is set to
     max(1, floor(4/delta^2 * log(N * delta^2 / 4))).
  num_arms: int. The total number of arms

  Returns the total reward collected over exploration and exploitation.
  If N is smaller than num_arms, one round of exploration is still played,
  so more than N pulls are made in that case.
  '''
  if m is None:
    scaled = N * delta**2 / 4
    # The log is only positive when scaled > 1; otherwise the formula clamps
    # to 1 anyway. Branching here also avoids log(<=0) and division by a zero
    # delta.
    m = int(4 / delta**2 * np.log(scaled)) if scaled > 1 else 1
    m = max(1, m)
  assert(m > 0 and isinstance(m, int))
  # Never explore beyond the horizon (but always play at least one round).
  m = min(m, max(1, int(N) // num_arms))

  reward_est = np.zeros(num_arms)
  for _ in range(m):
    for j in range(num_arms):
      reward_est[j] += arm.pull_arm(j)  # accumulate the reward per arm
  best_arm = np.argmax(reward_est)
  rewards = sum(reward_est)  # reward earned during the exploration phase

  # Commit the remaining budget to the empirically best arm. When exploration
  # used up the whole horizon there is nothing left to pull.
  remaining = int(N - m * num_arms)
  if remaining > 0:
    rewards += arm.pull_arm(best_arm, pull_time=remaining)
  return rewards


def doubling_ETC(arm, N, delta, num_arms=NUM_ARMS):
  '''
  Run ETC repeatedly on epochs of growing length until N pulls are used.

  arm: Arm object. Must also provide my_rewards().
  N: the ultimate time horizon
  delta: separation
  num_arms: int. The total number of arms

  Epoch lengths are num_arms**9, num_arms**10, ...; the last epoch is
  truncated so the total is exactly N. Each epoch picks its own exploration
  length inside ETC.

  Returns arm.my_rewards(), i.e. everything in the arm's reward history,
  so the caller should clear the history beforehand.
  '''
  # NOTE(review): the epoch length grows by a factor of num_arms, which is a
  # doubling only when num_arms == 2 -- confirm whether 2 ** r was intended.
  total_run = 0
  r = 8
  while total_run < N:
    r += 1
    # truncate the final epoch to whatever is left of the horizon
    N_r = min(num_arms ** r, N - total_run)
    # forward num_arms so ETC does not silently fall back to its default
    ETC(arm, N_r, delta, num_arms=num_arms)
    total_run += N_r
  return arm.my_rewards()


In [112]:

def regret_vs_horizon(Ns:list, repeat=1, mu=None):
  '''
  Measure the regret of ETC and of doubling ETC for each horizon in Ns.

  Ns: list of int. The horizons to evaluate.
  repeat: int. How many independent runs are averaged per horizon.
  mu: (None or list-type). The arm means; defaults to [0.1, 0.0].
      Must contain NUM_ARMS entries.

  Returns (ETC_regret, ETC_doubling_regret): two lists with one averaged
  regret per entry of Ns, where regret = genie reward - collected reward.
  '''
  if repeat < 1:
    raise ValueError('repeat must be at least 1')
  if mu is None:
    mu = [0.1, 0.0]

  ETC_regret = []
  ETC_doubling_regret = []
  my_arm = Gaussian_Arm(NUM_ARMS, mu=mu)
  for horizon in Ns:
    ETC_cur_regret = 0
    ETC_doubling_cur_regret = 0
    for _ in range(repeat):
      # plain ETC over the whole horizon
      ETC_rewards = ETC(my_arm, horizon, my_arm.delta)
      ETC_cur_regret += my_arm.genie_reward() - ETC_rewards
      # reset so the next algorithm starts from an empty history
      my_arm.clear_reward_hist()
      # ETC restarted on epochs of growing length
      ETC_doubling_rewards = doubling_ETC(my_arm, horizon, my_arm.delta)
      ETC_doubling_cur_regret += my_arm.genie_reward() - ETC_doubling_rewards
      my_arm.clear_reward_hist()
    ETC_regret.append(ETC_cur_regret / repeat)
    ETC_doubling_regret.append(ETC_doubling_cur_regret / repeat)
  return ETC_regret, ETC_doubling_regret





In [114]:


# Horizons to evaluate: 10000, 20000, ..., 190000.
Ninit = 10000
Ns  = [Ninit * i for i in range(1, 20)]
# Number of independent simulations; each one fills a row of the arrays below.
avg = 5
# One row per simulation, one column per horizon.
# NOTE(review): despite their names these arrays hold the per-simulation
# results; the average over simulations would be `.mean(axis=0)`.
mean_ETC_regret, mean_ETC_doubling_regret = np.zeros((avg , len(Ns))), np.zeros((avg , len(Ns)))
for i in range(avg):
  ETC_regret,  ETC_doubling_regret = regret_vs_horizon(Ns)
  mean_ETC_regret[i] = ETC_regret
  mean_ETC_doubling_regret[i] = ETC_doubling_regret
  # report only after the simulation has actually finished
  print("simulation {} completed".format(i))


# '''
# just in case you don't want to spend 2 - 4 CPU hours on it:
# ETC_regret = [131.82144761, 164.05276726, 178.14454248, 186.96920924,
#        200.71992072, 211.24218225, 209.97037274, 225.29555665,
#        226.31542287, 227.84898093, 233.52895855, 231.61707574,
#        232.73323221, 230.47352683, 245.42566151, 246.2055098 ,
#        244.76404079, 249.82165257, 247.08038319]
# ETC_doubling_regret = [ 296.28143055,  441.84768859,  491.78999568,  625.13853049,
#         655.64274781,  675.67153638,  789.59309794,  829.8073726 ,
#         855.97071018,  864.50042971,  872.11629734,  884.49669962,
#         878.11998828, 1012.95650198, 1048.23083199, 1062.88694778,
#        1071.05337232, 1078.68582528, 1099.49708922]
# '''

simulation 0 completed
simulation 1 completed
simulation 2 completed
simulation 3 completed
simulation 4 completed


In [None]:

def plot_history(all_history:dict, x:str, y:str , title:str , log = False):
  '''
  Draw one line per entry of all_history in a single figure and show it.

  all_history: dict. Maps a trace name to a mapping that holds the x and y
               data under the keys given by `x` and `y`.
  x: str. Key of the x values inside each entry.
  y: str. Key of the y values inside each entry.
  title: str. Figure title.
  log: bool. If true, the x axis is logarithmic.

  NOTE(review): `graph` is not defined in the visible part of this file;
  presumably it is plotly.graph_objects -- confirm it is imported before use.
  '''
  fig = graph.Figure(layout = graph.Layout(title=graph.layout.Title(text=title)))
  for trace_name, series in all_history.items():
    fig.add_trace(graph.Scatter(x = series[x], y = series[y], name = trace_name))
  # the axis type belongs to the figure, so set it once rather than per trace
  if log:
    fig.update_xaxes(type="log")
  fig.show()
plot_history(all_history, "x", "y" , "regret VS horizon - linear" , log = False)


# %matplotlib inline

# import matplotlib
# import matplotlib.pyplot as plt

# plt.plot(runs, ETC_regret, label='ETC')
# plt.title('regret VS horizon - linear')
# plt.legend()
# plt.show()
# plt.plot(runs, ETC_doubling_regret, label='ETC doubling')
# plt.title('regret VS horizon - linear')
# plt.legend()
# plt.show()




In [None]:
runs = Ns

%matplotlib inline

import matplotlib
import matplotlib.pyplot as plt

plt.semilogx(runs, ETC_regret, label='ETC')
plt.title('regret VS horizon - semilogx')
plt.legend()
plt.show()
plt.semilogx(runs, ETC_doubling_regret, label='ETC doubling')
plt.title('regret VS horizon - semilogx')
plt.legend()
plt.show()