<a href="https://colab.research.google.com/github/jmhuer/utaustin_optimization/blob/main/homework11/Exp3_assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Exponential Weight Algorithm for Explore and Exploit (EXP3)

In this exercise, we will be studying the exponential weight algorithm for explore and exploit (EXP3).

While this algorithm is designed for the adversarial bandit setting, we will test it in the Bernoulli bandit setting for ease of implementation.

As an example of a Bernoulli bandit, the following code creates a reward-history matrix $rewards$ such that $rewards[t, a]$ is the reward you get if you query arm $a$ at time $t$.

In [None]:
import numpy as np


# Size of the simulated history: number of arms and number of rounds.
numAction = 10
numRound = 100000
# Arm a (0-based) pays out with probability 1 / (a + 1); that row of success
# probabilities is repeated once per round.
expected_rewards = np.repeat(
    (1 / np.arange(1, numAction + 1))[np.newaxis, :], numRound, axis=0
)
# One Bernoulli draw per (round, arm): 1.0 where the uniform sample falls
# below the arm's success probability, 0.0 otherwise.
rewards = (np.random.rand(numRound, numAction) < expected_rewards).astype(float)

In [None]:
import numpy as np
import pdb
import matplotlib.pyplot as plt

class Gaussian_Arm:
  '''
  Multi-armed bandit whose arms return Gaussian rewards.

  A pull of arm i yields mu[i] plus standard-normal noise. The object also
  records every reward it hands out and the total number of pulls, so the
  collected reward can be compared with the best expected reward.
  '''

  def __init__(self, num_arms, mu=None):
    '''
    num_arms: (int). the number of arms; must be at least two.
    mu: (None or list-type). the mean of the reward of each arm.
        if set to None (or left empty), a random vector drawn uniformly
        from [0, 1) will be generated.

    Raises ValueError when num_arms is not an int >= 2, or when mu does not
    hold exactly one mean per arm.
    '''
    # Check the type first so a non-numeric num_arms is reported as a
    # ValueError rather than failing inside the comparison.
    if not isinstance(num_arms, int) or num_arms <= 1:
      raise ValueError('number of arms has an int that is at least two')

    self.num_arms = num_arms
    # mu may be a numpy array, whose truth value is ambiguous, so it is
    # compared against None / emptiness explicitly instead of `if mu:`.
    given_mu = None if mu is None else np.asarray(mu)
    if given_mu is not None and given_mu.size > 0:
      if given_mu.shape != (num_arms,):
        raise ValueError('The lenth of mu does not match the number of arms')
      self.mu = given_mu
    else:
      self.mu = np.random.rand(num_arms)
    # gap between the best and the worst arm mean
    self.delta = self.mu.max() - self.mu.min()

    # keep track of the rewards for the user
    self.rewards_history = []
    # keep track of how many times the arms have been pulled
    self.total_pull = 0

  def pull_arm(self, arm_id=-1, pull_time=1):
    '''
    Pull arm `arm_id` `pull_time` times and return the summed reward.

    The summed reward is also appended to rewards_history as one entry, and
    total_pull grows by pull_time.

    Raises ValueError when arm_id is outside 0 .. num_arms - 1.
    '''
    if arm_id < 0 or arm_id >= self.num_arms:
      raise ValueError(
          'please specify arm id in the range of 0-%d' % (self.num_arms - 1))
    assert (isinstance(pull_time, int) and pull_time >= 1)
    self.total_pull += pull_time
    # Generate reward: pull_time copies of the mean plus one standard-normal
    # noise draw per pull.
    reward = self.mu[arm_id] * pull_time + np.random.randn(pull_time).sum()
    self.rewards_history.append(reward)
    return reward

  def genie_reward(self):
    '''
    the best expected reward after pulling self.total_pull times
    '''
    return self.total_pull * self.mu.max()

  def my_rewards(self):
    '''
    the total reward collected since the last clear_reward_hist()
    '''
    return sum(self.rewards_history)

  def clear_reward_hist(self):
    '''
    forget the collected rewards and reset the pull counter
    '''
    self.rewards_history = []
    self.total_pull = 0



## Goal of these exercises

Implement the following:

1. Basic EXP3 algorithm implementation under the Bernoulli bandit setting.
2. Plot the expected regret of EXP3 versus horizon (number of rounds).

Optional:

1. Plot the expected regret of EXP3 versus the number of arms.
2. Implement an adversarial bandit, and test EXP3 algorithm on it. 

## Tips:

1. To check that your regret is correct, run your EXP3 algorithm repeatedly with horizons equal to $[50^2, 60^2, 70^2, 80^2, 90^2, 100^2]$. Plot the regret (on the y-axis) versus $[50, 60, 70, 80, 90, 100]$, i.e. the square roots of the horizons (on the x-axis). The figure should look like a straight line.
2. Check out numpy.random.choice for drawing from a discrete distribution

In [None]:

  
# Number of arms used by the experiments below: the default `num_arms` of
# exp3 and the size of the bandit built in regret_vs_horizon.
NUM_ARMS = 2

class Empirical_mean:
    """Running per-arm reward totals, pull counts and empirical means.

    The three arrays `rewards`, `count` and `mean` are indexed by arm.
    `mean` is the array attribute; a method with that same name would be
    hidden by it on every instance, so the per-arm lookup is `get_mean`.
    """

    def __init__(self,num_arms):
        # per-arm sum of the rewards seen so far
        self.rewards = np.zeros(num_arms)
        # per-arm number of rewards seen so far
        self.count = np.zeros(num_arms)
        # per-arm average reward (0 until the arm has been observed)
        self.mean = np.zeros(num_arms)

    def add_new_reward(self, reward, indx):
        """Record `reward` for arm `indx` and refresh that arm's mean."""
        self.rewards[indx] += reward
        self.count[indx] += 1
        self.mean[indx] = self.rewards[indx] / self.count[indx]

    def get_mean(self, indx):
        """Return the current empirical mean of arm `indx`."""
        return self.mean[indx]

    def reset(self):
        """Zero all statistics, keeping the number of arms.

        Fresh zero arrays are allocated instead of multiplying by 0, because
        0 * inf and 0 * nan are nan and would survive the reset.
        """
        shape = self.rewards.shape
        self.rewards = np.zeros(shape)
        self.count = np.zeros(shape)
        self.mean = np.zeros(shape)



def exp3(arm, N, num_arms=NUM_ARMS, n_rate=0.01):
  """
  Run the EXP3 (exponential weights for exploration and exploitation)
  algorithm for N rounds and return the total reward collected.

  arm: bandit object; this function calls arm.pull_arm(arm_id) to obtain a
       reward and arm.my_rewards() for the accumulated total.
  N: (int). number of rounds to play.
  num_arms: (int). number of arms to choose between.
  n_rate: (float). learning rate multiplying the reward estimates inside
          the exponential weights.

  Each round draws an arm from the softmax of n_rate * estimate, pulls it,
  and updates only that arm's estimate with an importance-weighted term.
  """
  # Importance-weighted estimate of each arm's cumulative reward, up to a
  # constant shared by all arms (the softmax ignores such a shift).
  reward_est = np.zeros(num_arms)
  for _ in range(N):
    # Subtract the max before exponentiating so large estimates cannot
    # overflow; the normalised probabilities are unchanged.
    scaled = n_rate * reward_est
    weights = np.exp(scaled - scaled.max())
    prob = weights / weights.sum()
    chosen = np.random.choice(num_arms, p=prob)
    reward = arm.pull_arm(chosen)
    # Loss-style update S_i += 1 - 1{A=i} (1 - X) / P_i, with the "+1" that
    # is added to every arm dropped. Dividing by prob[chosen] keeps the
    # estimate unbiased even though only one arm is observed per round.
    reward_est[chosen] -= (1.0 - reward) / prob[chosen]
  return arm.my_rewards()



In [None]:

def regret_vs_horizon(Ns:list, REPEAT:int, algorithm: type(lambda x: None)):
  """
  Estimate the average regret of `algorithm` for every horizon in `Ns`.

  Ns: list of horizons (number of rounds per run).
  REPEAT: number of independent runs averaged for each horizon.
  algorithm: callable invoked as algorithm(arm, horizon, num_arms) that
             returns the total reward it collected.

  Returns a list with one averaged regret per horizon, in the order of Ns.
  Each horizon is printed as it starts.
  """
  # Two-armed Gaussian bandit with a 0.1 gap between the arm means.
  bandit = Gaussian_Arm(NUM_ARMS, mu=[0.1, 0.0])
  averaged = []
  for horizon in Ns:
    print(horizon)
    total = 0
    for _ in range(REPEAT):
      earned = algorithm(bandit, horizon, bandit.num_arms)
      # regret of this run: best expected reward minus what was collected
      total += bandit.genie_reward() - earned
      # wipe the bandit's bookkeeping so the next run starts from scratch
      bandit.clear_reward_hist()
    averaged.append(total / REPEAT)
  return averaged


import plotly.graph_objects as graph
def plot(all_history:list, title:str, log = False):
    """
    Draw every series of `all_history` as one trace on a single plotly figure.

    input:
        all_history: list of dicts to plot; each dict supplies the keys
                     "x", "y" and "legend"
        title: text used as the figure title
        log: when truthy, the x axis is switched to a logarithmic scale
    ret:
        None: show plotly fig
    """
    figure_title = graph.layout.Title(text=title)
    fig = graph.Figure(layout=graph.Layout(title=figure_title))
    for series in all_history:
        trace = graph.Scatter(x=series["x"],
                              y=series["y"],
                              name=series["legend"])
        fig.add_trace(trace)
    if log:
        fig.update_xaxes(type="log")
    fig.show()



In [None]:

# Horizons to sweep: Ninit doubled 1 to 10 times (400 ... 204800 rounds).
Ninit = 200
Ns  = [Ninit * (2**i) for i in range(1, 11)]
# This notebook studies EXP3 and defines no `UCB` routine, so the sweep runs
# `exp3`; passing the undefined name `UCB` raised a NameError.
exp3_regret = regret_vs_horizon(Ns, REPEAT=200, algorithm=exp3)


plot_exp3_regret = {"legend": "mean_EXP3_regret",
                    "x": Ns,
                    "y": exp3_regret}

# Same data twice: linear x axis, then logarithmic x axis.
plot([plot_exp3_regret], title="regret VS horizon - linear", log=False)
plot([plot_exp3_regret], title="regret VS horizon - Log", log=True)
