<a href="https://colab.research.google.com/github/jmhuer/utaustin_optimization/blob/main/homework8/e_greedy_assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



```
# This is formatted as code
```

# Epsilon Greedy

In this exercise, we will explore the multi-armed bandit problem using the epsilon-greedy algorithm.

## Setup

As before, consider the unstructured bandit problem. Suppose we have $k$ arms, each with random reward $p_i = u_i + \epsilon$, where $\epsilon$ is drawn i.i.d. from a standard Gaussian. (Note that we only require $\epsilon$ to be sub-Gaussian for the analysis to go through.)

The following code captures this setup.

In [None]:
import numpy as np
import pdb
import numpy as np
import pdb
import matplotlib.pyplot as plt
import plotly.graph_objects as graph

class Gaussian_Arm:
  '''
  A multi-armed bandit whose arm i pays mu[i] plus standard Gaussian noise.

  Besides generating rewards, the object keeps a log of every reward it has
  handed out (rewards_history) and how many pulls were made (total_pull), so
  the caller can compare the collected reward with the best expected reward.
  '''

  def __init__(self, num_arms, mu=None):
    '''
    num_arms: (int). the number of arms; must be at least two.
    mu: (None or list-type). the mean of the reward of each arm.
        if set to None (or empty), a random vector drawn uniformly from
        [0, 1) will be generated.

    Raises ValueError when num_arms is not an integer >= 2, or when mu does
    not contain exactly num_arms entries. (Previously these cases only
    printed a message and left a half-initialised, unusable object behind.)
    '''
    # Check the type before comparing: `num_arms <= 1` on a non-number would
    # raise an unrelated TypeError before the type check was ever reached.
    if not isinstance(num_arms, (int, np.integer)) or num_arms < 2:
      raise ValueError('number of arms has to be an int that is at least two')
    self.num_arms = int(num_arms)

    # `if mu:` is ambiguous for numpy arrays (it raises ValueError), so test
    # for None / emptiness explicitly instead of relying on truthiness.
    if mu is None or len(mu) == 0:
      self.mu = np.random.rand(self.num_arms)
    else:
      self.mu = np.asarray(mu)
      if len(self.mu) != self.num_arms:
        raise ValueError('The length of mu does not match the number of arms')

    # gap between the best and the worst arm mean
    self.delta = np.max(self.mu) - np.min(self.mu)

    # keep track of the rewards for the user
    self.rewards_history = []
    # keep track of how many times the arms have been pulled
    self.total_pull = 0

  def pull_arm(self, arm_id=-1):
    '''
    Pull arm `arm_id` and return its noisy reward (mean + N(0, 1) sample).

    The reward is appended to rewards_history and total_pull is incremented.
    If arm_id is outside 0..num_arms-1 a message is printed, nothing is
    recorded, and None is returned.
    '''
    if arm_id < 0 or arm_id >= self.num_arms:
      # valid ids are zero-based, so the largest one is num_arms - 1
      print('please specify arm id in the range of 0-%d' % (self.num_arms - 1))
      return
    self.total_pull += 1
    # Generate reward
    reward = self.mu[arm_id] + np.random.randn()
    self.rewards_history.append(reward)
    return reward

  def genie_reward(self):
    '''
    the best expected reward after pulling self.total_pull times,
    i.e. total_pull times the largest arm mean.
    '''
    best_mu = max(self.mu)
    return self.total_pull * best_mu

  def my_rewards(self):
    '''
    the sum of all rewards collected since the last reset.
    '''
    return sum(self.rewards_history)

  def clear_reward_hist(self):
    '''
    forget all collected rewards and reset the pull counter to zero.
    '''
    self.rewards_history = []
    self.total_pull = 0



## Goal

Implement the following:

1. Basic epsilon greedy algorithm implementation
2. Plot the expected regret of the epsilon-greedy algorithm vs. the horizon ($n$).

## Tips:

1. The regret is expected to be logarithmic against the horizon. To check if the relation is logarithmic, one can use the semilogx function in matplotlib.pyplot
2. When the regret is not logarithmic, please check against the analysis, and obtain insights there for debugging.
3. Similar to ETC, to see a smooth curve one has to repeat the simulation about 10k times. One can either use a parallel implementation, or first run a smaller number of simulations for debugging.


# 1. Basic epsilon greedy algorithm implementation

In [60]:
def plot_history(all_history:dict, x:str, y:str , title:str , log = False):
  '''
  Draw one line per entry of all_history in a single plotly figure and show it.

  all_history: dict mapping a curve name to a dict of series.
  x, y: keys selecting the x and y series inside each entry.
  title: text used as the figure title.
  log: when truthy, the x axis is switched to a logarithmic scale.
  '''
  figure_layout = graph.Layout(title=graph.layout.Title(text=title))
  fig = graph.Figure(layout=figure_layout)

  # one trace per named curve, labelled with its dictionary key
  for curve_name, series in all_history.items():
    trace = graph.Scatter(x=series[x], y=series[y], name=curve_name)
    fig.add_trace(trace)

  if log:
    fig.update_xaxes(type="log")
  fig.show()

def ealgorithm(arm:"Gaussian_Arm", horizon:int, epsilon= lambda t: 0.1):
  '''
  Run the epsilon-greedy algorithm on `arm` for `horizon` steps.

  arm: the bandit to play. Its reward history is cleared first, so the
       regret reported here belongs to this run only.
  horizon: (int). number of epsilon-greedy decisions to make. Every arm is
           additionally pulled once up front to initialise its estimate.
  epsilon: callable mapping the step index t (0-based) to the exploration
           probability for that step. Values <= 0 never explore, values >= 1
           always explore.

  Returns a dict with
    "step":        the step indices 0..horizon-1,
    "regret":      after each step, genie reward minus collected reward
                   (floats; the warm-up pulls are included, and the collected
                   reward is the realised noisy reward, not its expectation),
    "expected_mu": the running mean reward estimate of each arm,
    "arm_chosen":  the arm index (int) pulled at each step,
    "N":           how many samples back each estimate, warm-up pull included.
  '''
  # Without this reset, pulls and rewards from earlier runs on the same arm
  # would leak into this run's regret.
  arm.clear_reward_hist()

  num_arms = arm.num_arms
  # here we init each arm: one pull per arm gives every estimate a real sample
  warmup_rewards = [float(arm.pull_arm(k)) for k in range(num_arms)]

  history = {"step":[],
             "regret":[],
             "expected_mu":np.array(warmup_rewards),
             "arm_chosen":[],
             # each arm already has one sample; starting at zero would throw
             # the warm-up sample away on the first update
             "N":np.ones(num_arms)}
  estimates = history["expected_mu"]
  counts = history["N"]

  # Running total of the collected reward; re-summing the arm's full history
  # on every step would make the loop quadratic in the horizon.
  collected = sum(warmup_rewards)

  for i in range(horizon):
    # explore with probability epsilon(i), otherwise exploit the best estimate
    if np.random.rand() < epsilon(i):
      arm_decision = int(np.random.randint(num_arms))
    else:
      arm_decision = int(np.argmax(estimates))

    # collect reward
    reward = float(arm.pull_arm(arm_decision))
    collected += reward

    # incremental update of the chosen arm's sample mean
    counts[arm_decision] += 1
    estimates[arm_decision] += (reward - estimates[arm_decision]) / counts[arm_decision]

    ## store history
    history['step'].append(i)
    history['regret'].append(float(arm.genie_reward() - collected))
    history['arm_chosen'].append(arm_decision)

  print("experimental mu: {} \t real mu: {}".format(history['expected_mu'], arm.mu))
  return history

# 2. Plot the expected regret of the epsilon-greedy algorithm vs. horizon ( 𝑛 )

In [66]:
# Experiment: average the epsilon-greedy regret curve over several
# independent runs on a two-armed bandit and plot it against the horizon.
total_run = 8166            # horizon: number of epsilon-greedy steps per run
num_repeats = 15            # number of independent runs to average over
exploration_constant = 7    # C in epsilon_t = min(1, C*k / (t * delta^2))

arm = Gaussian_Arm(2, mu =  [1.2, 1.4] )

# average regret for n runs
# Steps are reported 1-based (entry j is the regret after j decisions); a
# 0 on the x axis could not be drawn on the logarithmic plot.
mean_history = {"step": list(range(1, total_run + 1)), "regret": np.zeros(total_run)}
for i in range(num_repeats):
  # The same arm object is reused, so forget the previous run's pulls and
  # rewards; otherwise each run's regret would include all earlier runs.
  arm.clear_reward_hist()
  history = ealgorithm(arm, horizon= total_run, epsilon = lambda t: min(1, exploration_constant*arm.num_arms/((t+1)*arm.delta**2)))
  # running mean over runs: after this line it averages runs 0..i
  mean_history["regret"] = (mean_history["regret"]*i + np.asarray(history["regret"])) /( (i + 1))


# combine all plots into one dic for ploting multiple lines in one plot
# (the curve is produced by epsilon-greedy, so label it as such)
all_history = {"e-greedy": mean_history}


#plot
plot_history(all_history, x="step" , y="regret", title="e-greedy v log horizon", log=True)
plot_history(all_history, x="step" , y="regret", title="e-greedy v horizon", log=False)

experimental mu: [1.19118666 1.40983834] 	 real mu: [1.2 1.4]
experimental mu: [1.29587783 1.4211537 ] 	 real mu: [1.2 1.4]
experimental mu: [1.20662105 1.39300974] 	 real mu: [1.2 1.4]
experimental mu: [1.14851249 1.40334498] 	 real mu: [1.2 1.4]
experimental mu: [1.22114622 1.39429212] 	 real mu: [1.2 1.4]
experimental mu: [1.20896241 1.38372954] 	 real mu: [1.2 1.4]
experimental mu: [1.17923845 1.42123219] 	 real mu: [1.2 1.4]
experimental mu: [1.19471039 1.39257934] 	 real mu: [1.2 1.4]
experimental mu: [1.16783283 1.40760367] 	 real mu: [1.2 1.4]
experimental mu: [1.25599756 1.38524556] 	 real mu: [1.2 1.4]
experimental mu: [1.1904481  1.39691045] 	 real mu: [1.2 1.4]
experimental mu: [1.22065838 1.42598592] 	 real mu: [1.2 1.4]
experimental mu: [1.20066731 1.38574043] 	 real mu: [1.2 1.4]
experimental mu: [1.23300937 1.40017255] 	 real mu: [1.2 1.4]
experimental mu: [1.21295506 1.40531305] 	 real mu: [1.2 1.4]
