Multi-arm Bandits
------

![](http://i1.wp.com/banditalgs.com/wp-content/uploads/2016/09/cropped-bandit-algorithm-full.png?fit=512%2C512)

In [1]:
reset -fs

In [2]:
import math
import random

import numpy as np
import scipy.stats as stats

In [3]:
class Bandits:

    def __init__(self, p_array):
        self.p_array = p_array            # Array of probabilities to initialize bandits
        self.optimal = np.argmax(p_array) # Index of the optimal bandit

    def pull(self, index):
        "Sample from a given bandit"
        return np.random.random() < self.p_array[index] # Boolean indicating whether the bandit returned a reward or not

    def __len__(self):
        return len(self.p_array)

In [4]:
class BanditStrategy:
    " Implements a online, learning strategy to solve the Multi-Armed Bandit problem."

    def __init__(self, bandits, choice_function):
        self.bandits = bandits                 # Instance of Bandits class
        n_bandits = len(self.bandits)
        self.wins = np.zeros(n_bandits)
        self.trials = np.zeros(n_bandits)
        self.N = 0                             # The cumulative number of samples
        self.choices = []                      # The historical choices 
        self.score = []                        # The historical score 
        self.choice_function = choice_function # Has self argument to gives access to all the instance variables
                                               # Returns and int between 0 and n-1

    def sample_bandits(self, n=1):
        "Simulate n rounds of running the bandit machine."
        score = np.zeros(n)
        choices = np.zeros(n)

        # Seed the random number generators so you get the same results every time.
        np.random.seed(42)
        random.seed(42)

        for k in range(n):
            # Sample from the bandits's priors, and select the largest sample
            choice = self.choice_function(self)

            # Sample the chosen bandit
            result = self.bandits.pull(choice)

            # Update priors and score
            self.wins[choice] += result
            self.trials[choice] += 1
            score[k] = result
            self.N += 1
            choices[k] = choice

        self.score = np.r_[self.score, score]
        self.choices = np.r_[self.choices, choices]

    def regret(self, probabilities):
        """Take an array of the true probabilities for each machine and 
        an array of the indices of the machine played at each round.
        Return an array giving the total regret after each round."""
        p_opt = np.max(probabilities)
        return np.cumsum(p_opt - probabilities[self.choices.astype(int)]) # Total Regret

    def optimum_choices(self, probabilities):
        """Take an array of the true probabilities for each machine and 
        an array of the indices of the machine played at each round.
        Return an array giving the total regret after each round.
        """
        p_opt = np.argmax(probabilities)
        return np.cumsum(np.select([self.choices==p_opt], [1]))

    def max_mean(self):
        "Pick the bandit with the current best observed proportion of winning."
        if len(self.trials.nonzero()[0]) < len(self.bandits): # Make sure to play each bandit at least once
            return np.random.randint(0, len(self.bandits))
        return np.argmax(self.wins / (self.trials + 1)) # Return the index of the winning bandit.

In [5]:
def random_choice(self): # XXX: This is silly technical debt. This method depends on some j-random classs!
    "Pick a bandit uniformly at random."
    return np.random.randint(0, len(self.wins)) # Return the index of the winning bandit.

In [7]:
bandits = Bandits([0.05, 0.03, 0.06])
bs = BanditStrategy(bandits, random_choice)
bs.sample_bandits(10) # 1000 is a good value

In [8]:
def print_results(bs_instance):
    print("Number of trials: ", bs_instance.trials)
    print("Number of wins: ", bs_instance.wins)
    print("Conversion rates: ", bs_instance.wins / bs_instance.trials)
    print(f"A total of {bs_instance.wins.sum()} wins of {bs_instance.trials.sum()} trials.")

In [9]:
print_results(bs)

Number of trials:  [ 3.  3.  4.]
Number of wins:  [ 0.  0.  1.]
Conversion rates:  [ 0.    0.    0.25]
A total of 1.0 wins of 10.0 trials.


-----
Implement the four multi-arm bandit strategies: 
        
1. epsilon-greedy
2. softmax
3. ucb1 
4. bayesian bandits

In [22]:
def epsilon_greedy(self, epsilon=0.1):
    '''
    Pick a bandit uniformly at random epsilon percent of the time.
    Otherwise pick the bandit with the best observed proportion of winning.
    Return the index of the winning bandit.
    '''
    pass

In [23]:
def softmax(self, tau=0.01):
    '''
    Pick an bandit according to the Boltzman Distribution.
    Return the index of the winning bandit.
    '''
    pass

In [24]:
def ucb1(self):
    '''
    Pick the bandit according to the UCB1 strategy.
    Return the index of the winning bandit.
    '''
    pass

In [25]:
def bayesian_bandit(self):
    '''
    Randomly sample from a beta distribution for each bandit and pick the one
    with the largest value.
    Return the index of the winning bandit.
    '''
    pass

See how many wins you have of the 1000 trials using each of the six strategies (two already implemented) with the starting bandits given above.

In [26]:
bandits = Bandits([0.05, 0.03, 0.06])
# bs = BanditStrategy(bandits, epsilon_greedy)
# bs = BanditStrategy(bandits, softmax)
# bs = BanditStrategy(bandits, ucb1)
# bs = BanditStrategy(bandits, bayesian_bandit)
bs.sample_bandits(1000)
print_results(bs)

Number of trials:  [ 373.  341.  296.]
Number of wins:  [ 18.   5.  16.]
Conversion rates:  [ 0.04825737  0.01466276  0.05405405]
A total of 39.0 wins of 1010.0 trials.


What is your summary of the different strategies?

Try running it again  and see how different each algorithm does with respect to


In [27]:
probs ={'atlantic_city': [0.1, 0.1, 0.1, 0.1, 0.9],
        'las_vegas': [0.1, 0.1, 0.1, 0.1, 0.12],
        'reno': [0.1, 0.2, 0.3, 0.4, 0.5]}

bandits = Bandits(probs['atlantic_city'])

What is your summary of the different strategies?

In [28]:
def regret(probabilities, choices):
        '''
        INPUT: array of floats (0 to 1), array of ints
        OUTPUT: array of floats

        Take an array of the true probabilities for each machine and an
        array of the indices of the machine played at each round.
        Return an array giving the total regret after each round.
        '''
        p_opt = np.max(probabilities)
        return np.cumsum(p_opt - probabilities[choices])

Plot the total regret over time of each algorithm. 

Use the Bandits with these hidden probabilities:

In [29]:
bandits = Bandits([0.05, 0.03, 0.06])

Now plot the percentage of time the optimal bandit was chosen over time.

<br>
<br> 
<br>

----