In [18]:
import math
import random

class Arms():
    """A collection of Bernoulli arms, each with a fixed success probability.

    Attributes:
        mus: the success probabilities, one per arm, exactly as passed in.
        n_arms: number of arms (``len(mus)``).
        best: the largest success probability among the arms.
    """

    def __init__(self, mus):
        """Store the arm probabilities and check that each lies in [0, 1].

        Raises:
            AssertionError: if any probability is outside [0, 1].
        """
        self.mus = mus
        self.n_arms = len(mus)
        self.best = max(mus)
        probabilities_valid = all(0 <= mu <= 1 for mu in mus)
        assert probabilities_valid

    def __str__(self):
        """Show the arms as the string form of their probabilities."""
        return str(self.mus)

    def pull(self, idx):
        """Draw one Bernoulli reward (0 or 1) from the arm at index ``idx``."""
        # A uniform draw in [0, 1) falls below mu with probability mu.
        success = random.random() < self.mus[idx]
        return int(success)


class Policy():
    """Base class for arm-selection strategies; subclasses override ``pick``."""

    def __init__(self):
        """The base policy keeps no state."""

    def pick(self, n_arms, history, to_pick=[]):
        """Choose the index of the next arm to pull.

        Args:
            n_arms: number of available arms.
            history: one ``[total_reward, n_pulls]`` pair per arm.
            to_pick: list holding picks that have already been decided
                for upcoming rounds.

        Returns:
            None in this base class; it makes no choice itself.
        """
        # The default list is never modified here.
        return None


def experiment(arms, policy, T):
    """Simulate ``T`` rounds of ``policy`` playing against ``arms``.

    Args:
        arms: object exposing ``best``, ``n_arms``, ``mus`` and ``pull(idx)``.
        policy: object exposing ``pick(n_arms, history)``.
        T: number of rounds to play.

    Returns:
        A ``(history, total_regret)`` tuple. ``history[i]`` is
        ``[total_reward, n_pulls]`` for arm ``i``. The regret adds, for each
        round, the gap between the best arm's mean and the picked arm's mean;
        it is computed from the means in ``arms.mus``, not from the sampled
        rewards.
    """
    optimal_mean = arms.best
    arm_count = arms.n_arms
    history = [[0, 0] for _ in range(arm_count)]
    regret = 0

    for _ in range(T):
        choice = policy.pick(arm_count, history)
        reward = arms.pull(choice)
        stats = history[choice]
        stats[0] += reward  # cumulative reward of this arm
        stats[1] += 1       # number of times this arm was pulled
        regret += optimal_mean - arms.mus[choice]

    return history, regret


def argmax(s):
    """Return the index of the first occurrence of the largest element of ``s``."""
    largest = max(s)
    return s.index(largest)


class RandomPick(Policy):
    """Policy that ignores the history and picks an arm uniformly at random."""

    def pick(self, n_arms, history):
        """Return a uniformly random arm index in ``range(n_arms)``."""
        candidates = range(n_arms)
        return random.choice(candidates)


class EpsGreedy(Policy):
    """Epsilon-greedy policy: explore with probability ``eps``, else exploit."""

    def __init__(self, eps):
        """Remember the exploration probability ``eps``."""
        self.eps = eps

    def pick(self, n_arms, history):
        """Choose the next arm.

        With probability ``eps`` a uniformly random arm is returned.
        Otherwise the first arm that has never been pulled is returned, and
        if every arm has been pulled, the arm with the highest observed mean
        reward (first one on ties).
        """
        exploring = random.random() < self.eps
        if exploring:
            return random.choice(range(n_arms))

        observed_means = []
        for arm, (reward, pulls) in enumerate(history):
            if pulls == 0:
                # Never tried: pull it before trusting any empirical mean.
                return arm
            observed_means.append(reward / pulls)
        return argmax(observed_means)

class UCB(Policy):
    """Upper-confidence-bound policy re-evaluated on every round."""

    def pick(self, n_arms, history):
        """Choose the next arm.

        Any arm that has never been pulled is returned first (lowest index).
        Otherwise each arm is scored as ``mean + sqrt(log(t) / pulls)``,
        where ``t`` is the total number of pulls so far, and the first arm
        with the highest score is returned.
        """
        for arm, (_, pulls) in enumerate(history):
            if pulls == 0:
                return arm

        total_pulls = sum(pulls for _, pulls in history)
        scores = []
        for reward, pulls in history:
            mean = reward / pulls
            bonus = math.sqrt(math.log(total_pulls) / pulls)
            scores.append(mean + bonus)
        return argmax(scores)

In [19]:
class BatchRandomPick(Policy):
    """Random policy that repeats each random choice ``batch_size`` times.

    The queue of pending picks is kept on the instance. It used to live in
    the mutable default value of ``to_pick``, which Python creates once and
    shares between every instance and every experiment, so leftover picks
    leaked from one run into the next.
    """

    def __init__(self, batch_size):
        """Remember the batch size and start with an empty pick queue.

        ``batch_size`` must be at least 1; with 0 nothing is queued and
        ``pick`` raises IndexError when popping from the empty queue.
        """
        self.batch_size = batch_size
        # Picks already decided for upcoming rounds (this instance only).
        self._pending = []

    def pick(self, n_arms, history, to_pick=None):
        """Return the next arm index.

        Args:
            n_arms: number of available arms.
            history: unused by this policy.
            to_pick: optional caller-owned list of queued picks; it is
                consumed and refilled in place. When omitted, the queue
                owned by this instance is used.
        """
        if to_pick is None:
            to_pick = self._pending
        if not to_pick:
            # Draw one random arm and commit to it for a whole batch.
            to_pick.extend([random.choice(range(n_arms))] * self.batch_size)
        return to_pick.pop()

In [20]:
class BatchUCB(Policy):
    """UCB policy that commits to its chosen arm for ``batch_size`` pulls.

    The queue of pending picks is kept on the instance. It used to live in
    the mutable default value of ``to_pick``, which Python creates once and
    shares between every instance and every experiment, so leftover picks
    leaked from one run into the next.
    """

    def __init__(self, batch_size):
        """Remember the batch size and start with an empty pick queue.

        ``batch_size`` must be at least 1; with 0 nothing is queued and
        ``pick`` raises IndexError when popping from the empty queue.
        """
        self.batch_size = batch_size
        # Picks already decided for upcoming rounds (this instance only).
        self._pending = []

    def pick(self, n_arms, history, to_pick=None):
        """Return the next arm index.

        Queued picks are served first. With an empty queue, any arm that
        has never been pulled is returned (lowest index, not batched).
        Otherwise each arm is scored as ``mean + sqrt(log(t) / pulls)``
        and the best arm is queued ``batch_size`` times.

        Args:
            n_arms: number of available arms (unused; arms come from
                ``history``).
            history: one ``[total_reward, n_pulls]`` pair per arm.
            to_pick: optional caller-owned list of queued picks; it is
                consumed and refilled in place. When omitted, the queue
                owned by this instance is used.
        """
        if to_pick is None:
            to_pick = self._pending
        if to_pick:
            return to_pick.pop()
        for i, [_, n] in enumerate(history):
            if n == 0:
                return i
        t = sum(n for _, n in history)
        ucb = [r / n + math.sqrt(math.log(t) / n) for r, n in history]
        to_pick.extend([argmax(ucb)] * self.batch_size)
        return to_pick.pop()

In [85]:
# Two Bernoulli arms with close success probabilities (0.2 vs 0.25).
a = Arms([0.2, 0.25])
# Number of rounds played in each simulated experiment below.
T = 100000

In [86]:
# Baseline: pick an arm uniformly at random on every round.
experiment(a, RandomPick(), T)

([[9984, 49928], [12594, 50072]], 2496.400000000316)

In [87]:
# UCB that repeats each chosen arm for 100 consecutive pulls.
experiment(a, BatchUCB(100), T)

([[436, 2301], [24434, 97699]], 115.0499999999956)

In [88]:
# Random arm, with each random choice repeated for 100 consecutive pulls.
experiment(a, BatchRandomPick(100), T)

([[10404, 51200], [12083, 48800]], 2560.0000000005475)

In [89]:
# Epsilon-greedy: random arm with probability 0.1, otherwise the best observed mean.
experiment(a, EpsGreedy(0.1), T)

([[1332, 6728], [23286, 93272]], 336.4000000000422)

In [119]:
# UCB with the scores recomputed on every round (no batching).
experiment(a, UCB(), T)

([[434, 2275], [24552, 97725]], 113.74999999999568)

In [130]:
# A/B test: cycle through both arms until a two-sided z-test at significance 0.01 picks a winner.
# NOTE: ABTesting is defined in cell In [68] further down; that cell must be run before this one.
experiment(a, ABTesting(sig=0.01), T)

([[182, 923], [24703, 99077]], 46.149999999999515)

In [40]:
# Critical z value of a two-sided test at significance 0.05.
# NOTE: `norm` is imported in cell In [39] below; that cell must be run before this one.
norm.isf(0.05 / 2)

1.9599639845400545

In [39]:
# http://stackoverflow.com/questions/15204070/

from scipy.stats import norm, zscore
def sample_power_probtest(p1, p2, power=0.9, sig=0.05):
    """Sample size needed to tell two proportions apart with a two-sided z-test.

    Args:
        p1, p2: the two success probabilities to distinguish.
        power: desired power of the test.
        sig: significance level; it is halved because the test is two-sided.

    Returns:
        The required sample size, rounded to the nearest int (presumably
        per group -- confirm against the formula referenced above).
    """
    # Normal critical values for the significance level and for the power.
    z_alpha = norm.isf([sig / 2])
    z_beta = -norm.isf([power])

    effect = p1 - p2
    # Variance term built from the average of the two proportions.
    pooled = (p1 + p2) / 2
    variance = 2 * pooled * (1 - pooled)

    required = variance * ((z_beta + z_alpha) ** 2) / (effect ** 2)
    return int(round(required[0]))

In [66]:
def z(history):
    """Two-proportion z statistic for a two-arm history.

    Args:
        history: ``[[r1, n1], [r2, n2]]`` -- total reward and number of
            pulls for each of exactly two arms.

    Returns:
        ``(p1 - p2) / sqrt(p1*(1-p1)/n1 + p2*(1-p2)/n2)`` with
        ``p = r / n``; positive when the first arm has the higher rate.

    Raises:
        ZeroDivisionError: if an arm has zero pulls or the combined
            variance is zero.
    """
    (reward_a, pulls_a), (reward_b, pulls_b) = history
    rate_a = reward_a / pulls_a
    rate_b = reward_b / pulls_b
    var_a = rate_a * (1 - rate_a) / pulls_a
    var_b = rate_b * (1 - rate_b) / pulls_b
    return (rate_a - rate_b) / math.sqrt(var_a + var_b)

In [67]:
# Example: z statistic for two arms with 50000 pulls each.
z([[10021, 50000], [14960, 50000]])

-36.315627518946926

In [None]:
# Sample size from the power calculation for rates 0.2 vs 0.3 (power 0.9, significance 0.05).
sample_power_probtest(0.2, 0.3, power=0.9, sig=0.05)

In [68]:
class ABTesting(Policy):
    """A/B-testing policy for exactly two arms.

    Both arms are pulled in turn, and after each full cycle a two-sided
    two-proportion z-test is run. As soon as the test is significant, the
    winning arm is remembered and returned on every later round.

    The queue of pending picks is kept on the instance. It used to live in
    the mutable default value of ``to_pick``, which Python creates once and
    shares between every instance and every experiment, so leftover picks
    leaked from one run into the next.
    """

    def __init__(self, power=0.8, sig=0.05):
        """Set up the test.

        Args:
            power: stored on the instance; not used by the code in this class.
            sig: significance level of the two-sided test.
        """
        self.power = power
        self.sig = sig
        self.best = None  # index of the winning arm once the test is significant
        self.z_need = norm.isf(sig / 2) # 2-tail test
        # Picks already decided for upcoming rounds (this instance only).
        self._pending = []

    def best_arm(self, history):
        """Run the z-test on ``history`` and update ``self.best`` if significant.

        Args:
            history: ``[[r1, n1], [r2, n2]]``; exactly two arms are supported
                (any other length raises ValueError when unpacking).

        Returns:
            ``self.best``: 0 or 1 once a winner is known, otherwise None.

        Raises:
            ZeroDivisionError: if an arm has zero pulls or the combined
                variance is zero.
        """
        [r1, n1], [r2, n2] = history
        p1 = r1 / n1
        p2 = r2 / n2
        z = (p1 - p2) / math.sqrt(p1 * (1 - p1) / n1 + p2 * (1 - p2) / n2)
        if z > self.z_need:
            self.best = 0
        if z < -self.z_need:
            self.best = 1
        return self.best

    def pick(self, n_arms, history, to_pick=None):
        """Return the next arm index.

        Args:
            n_arms: number of available arms.
            history: one ``[total_reward, n_pulls]`` pair per arm.
            to_pick: optional caller-owned list of queued picks; it is
                consumed and refilled in place. When omitted, the queue
                owned by this instance is used.
        """
        if to_pick is None:
            to_pick = self._pending

        # Pull every arm at least once before doing anything else.
        for i, [_, n] in enumerate(history):
            if n == 0:
                return i

        # if we have the best choice, pick it
        if self.best is not None:
            return self.best

        if to_pick:
            return to_pick.pop()

        try:
            # best_arm() records the winner on self.best when significant.
            if self.best_arm(history) is not None:
                return self.best
        except ZeroDivisionError:
            # The combined variance is zero, so the z statistic is
            # undefined; keep exploring.
            pass

        # No winner yet: queue one more pull of every arm.
        to_pick.extend(range(n_arms))
        return to_pick.pop()