<a href="https://colab.research.google.com/github/jmhuer/utaustin_optimization/blob/main/homework13/Thompson_Sampling_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Thompson Sampling

In this exercise, we will be playing with the multi-armed bandit problem using the Thompson Sampling algorithm, and comparing it against the UCB algorithm.

Assume we are under the Bernoulli reward setting.

In [50]:
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

# Success probability of each arm of the Bernoulli bandit built from this array
# further down; the first arm (0.30) has the highest probability.
mu = np.array([0.30, 0.25, 0.20, 0.15, 0.10])


class Bernoulli_Bandit():
  """Multi-armed bandit whose arms pay a reward of 1 or 0.

  Attributes:
    num_arms: number of arms, exactly as given to the constructor.
    mu: array holding the success probability of every arm.
    total_pull: pulls made since construction or the last clear.
    rewards_history: list of the 0/1 rewards returned so far.
  """

  def __init__(self, num_arms, mu=None):
    """Build the bandit.

    Args:
      num_arms: number of arms.
      mu: optional sequence of per-arm success probabilities. When it is
        None, one probability per arm is drawn uniformly at random.
    """
    self.num_arms = num_arms
    self.mu = np.random.uniform(0, 1, num_arms) if mu is None else np.array(mu)
    # bookkeeping exposed to the user of the bandit
    self.rewards_history = []
    self.total_pull = 0

  def pull_arm(self, arm):
    """Pull `arm` once, record the outcome and return it (1 or 0)."""
    # a single uniform draw decides success against the arm's probability
    outcome = int(np.random.random() < self.mu[arm])
    self.rewards_history.append(outcome)
    self.total_pull += 1
    return outcome

  def my_rewards(self):
    """Return the sum of all rewards recorded in the history."""
    return sum(self.rewards_history)

  def genie_reward(self):
    """Return the largest success probability times the number of pulls."""
    return self.total_pull * self.mu.max()

  def clear_reward_hist(self):
    """Forget every recorded reward and reset the pull counter."""
    self.total_pull = 0
    self.rewards_history = []



## Goal of this exercise
1. Basic Thompson Sampling algorithm implementation
2. Plot the regret VS horizon ($n$).
3. Compare to the UCB algorithm.

Please refer to the previous assignment for tips.


In [51]:
# utils 

class Running_mean:
    """Per-arm running average of the rewards observed so far.

    Attributes:
        rewards: array with the summed reward of each arm.
        count: array with the number of rewards recorded for each arm.
        mean: array with the current average reward of each arm; it stays
            0 for an arm until a reward is recorded for it.
    """

    def __init__(self, num_arms):
        """Create zeroed statistics for `num_arms` arms."""
        self.rewards = np.zeros(num_arms)
        self.count = np.zeros(num_arms)
        self.mean = np.zeros(num_arms)

    def add_new_reward(self, reward, indx):
        """Record `reward` for arm `indx` and refresh that arm's mean."""
        self.rewards[indx] += reward
        self.count[indx] += 1
        # count[indx] was just incremented, so it is never zero here.
        self.mean[indx] = self.rewards[indx] / self.count[indx]

    def get_mean(self, indx):
        """Return the current average reward of arm `indx`.

        This accessor is deliberately not called `mean`: the instance
        attribute `mean` (the array assigned in __init__) hides any method of
        that name, so such a method could never be called on an instance.
        """
        return self.mean[indx]

    def reset(self):
        """Set every statistic back to zero, keeping the number of arms."""
        self.rewards = np.zeros_like(self.rewards)
        self.count = np.zeros_like(self.count)
        self.mean = np.zeros_like(self.mean)


def regret_vs_horizon(arm, Ns:list, REPEAT:int, algorithm: type(lambda x: None)):
  """Estimate the average regret of `algorithm` for every horizon in `Ns`.

  For each horizon the algorithm is run `REPEAT` times on `arm`; the regret
  of one run is `arm.genie_reward()` minus the reward the algorithm returns,
  and the per-horizon result is the mean over the runs.

  Args:
    arm: bandit object providing genie_reward() and clear_reward_hist(); it
      is handed to `algorithm` unchanged.
    Ns: horizons (number of pulls per run) to evaluate.
    REPEAT: number of runs averaged per horizon; must be at least 1.
    algorithm: callable taking (arm, horizon) and returning the reward
      collected during that run.

  Returns:
    List with one averaged regret per entry of `Ns`, in the same order.

  Raises:
    ValueError: if REPEAT is smaller than 1.
  """
  if REPEAT < 1:
    raise ValueError("REPEAT must be a positive integer")

  regret = []
  for horizon in Ns:
    print(horizon)
    total_regret = 0.0
    for _ in range(REPEAT):
      # Drop any history already on the arm so pulls made before this run
      # cannot leak into its reward or its genie reward.
      arm.clear_reward_hist()
      rewards = algorithm(arm, horizon)
      total_regret += arm.genie_reward() - rewards
      # Leave the arm clean after the run as well.
      arm.clear_reward_hist()
    regret.append(total_regret / REPEAT)
  return regret


import plotly.graph_objects as graph
def plot(all_history:list, title:str, log = False):
    """
    Draw every curve of all_history in a single plotly figure and show it.

    input:
        all_history: list of dicts; each dict supplies "x", "y" and "legend"
        title: text used as the figure title
        log: when truthy, the x axis is switched to a logarithmic scale
    ret:
        None: show plotly fig
    """
    figure_layout = graph.Layout(title=graph.layout.Title(text=title))
    fig = graph.Figure(layout=figure_layout)
    # one scatter trace per curve, labelled with its legend entry
    for curve in all_history:
        trace = graph.Scatter(x=curve["x"], y=curve["y"], name=curve["legend"])
        fig.add_trace(trace)
    if log:
        fig.update_xaxes(type="log")
    fig.show()

In [52]:
def UCB(arm, N):
  """Play `arm` for `N` rounds with the UCB rule and return the reward.

  Every round pulls the arm with the largest index
  `mean + sqrt(2 * log(1 / delta) / t)`, where `mean` is the arm's running
  average reward, `t` the number of times it has been pulled and
  `delta = 1 / N**2`. An arm that has not been pulled yet gets an infinite
  bonus, so it is selected before any arm with a finite index.

  Args:
    arm: bandit object providing num_arms, pull_arm(index) and my_rewards().
    N: number of rounds to play.

  Returns:
    Whatever `arm.my_rewards()` reports once the rounds are done.
  """
  if N <= 0:
    # Nothing to play. Returning early also avoids the division by zero
    # that 1 / N**2 would hit for N == 0.
    return arm.my_rewards()

  delta = 1/N**2
  # The numerator of the bonus does not change between rounds.
  log_term = 2*np.log(1/delta)

  def bonus(pulls):
    return np.sqrt(log_term/pulls) if pulls > 0 else float('inf')

  mean_vals = Running_mean(arm.num_arms)
  for _ in range(N):
    indices = [u + bonus(t) for (u, t) in zip(mean_vals.mean, mean_vals.count)]
    best_arm = np.argmax(indices)
    mean_vals.add_new_reward(arm.pull_arm(best_arm), best_arm)
  return arm.my_rewards()

In [53]:
from numpy.random import beta


def Thompson(arm, N):
  """Play `arm` for `N` rounds with Thompson Sampling and return the reward.

  Each arm keeps a success counter and a failure counter, both starting at
  1. Every round draws one Beta(success, failure) sample per arm, pulls the
  arm with the largest sample, and increments that arm's success counter
  when the reward equals 1 and its failure counter otherwise.

  Args:
    arm: bandit object providing num_arms, pull_arm(index) and my_rewards().
    N: number of rounds to play.

  Returns:
    Whatever `arm.my_rewards()` reports once the rounds are done.
  """
  wins = [1] * arm.num_arms
  losses = [1] * arm.num_arms

  for _ in range(N):
    # one posterior draw per arm, taken in arm order
    draws = [beta(w, l) for w, l in zip(wins, losses)]
    chosen = np.argmax(draws)

    if arm.pull_arm(chosen) == 1:
      wins[chosen] += 1
    else:
      losses[chosen] += 1

  return arm.my_rewards()

In [54]:
# Seed NumPy's global generator so that re-running the notebook reproduces
# the same regret curves; the bandit and both algorithms draw from it.
SEED = 0
np.random.seed(SEED)

# Horizons to evaluate: Ninit doubled one to five times (200 ... 3200).
Ninit = 100
Ns  = [Ninit * (2**i) for i in range(1, 6)]

# Number of independent runs averaged for every horizon.
REPEAT = 100

bernoulli_arm  = Bernoulli_Bandit(num_arms=mu.shape[0] , mu=mu)

Thompson_regret = regret_vs_horizon(bernoulli_arm, Ns, REPEAT=REPEAT, algorithm=Thompson)
UCB_regret = regret_vs_horizon(bernoulli_arm, Ns, REPEAT=REPEAT, algorithm=UCB)

plot_Thompson_regret = {"legend": "mean_Thompson_regret", 
                        "x": Ns , 
                        "y": Thompson_regret}

plot_UCB_regret = {"legend": "mean_UCB_regret", 
                   "x": Ns , 
                   "y": UCB_regret}


plot([plot_UCB_regret, plot_Thompson_regret], title="regret VS horizon - linear", log = False)


200
400
800
1600
3200
200
400
800
1600
3200
