In [None]:
import numpy as np
from functools import partial
from abc import ABCMeta, abstractmethod

In [None]:
# NOTE: a ``__metaclass__`` class attribute is silently ignored by Python 3,
# which leaves ``@abstractmethod`` unenforced.  Inheriting from a base that is
# itself created by ABCMeta applies the metaclass on both Python 2 and 3.
class Distribution(ABCMeta("_AbstractDistributionBase", (object,), {})):
    """Abstract interface for a distribution that can be sampled.

    Neither this class nor any subclass that leaves one of the abstract
    methods un-overridden can be instantiated (``TypeError`` is raised).
    """

    @abstractmethod
    def getSample(self):
        """Return a single sample (int) drawn from the distribution."""

    @abstractmethod
    def getMean(self):
        """Return the mean (float) of the distribution."""

In [None]:
class Binomial(Distribution):
    """Binomial distribution with ``n`` trials and success probability ``p``."""

    def __init__(self, n, p):
        # Bind the parameters once so that every draw is a zero-argument call.
        self._binom = partial(np.random.binomial, n=n, p=p)
        # Expected value of Binomial(n, p).
        self._mean = n * p

    def getMean(self):
        """Return the mean ``n * p`` computed at construction time."""
        return self._mean

    def getSample(self):
        """Return one draw from ``np.random.binomial`` for the bound n and p."""
        draw = self._binom()
        return draw

In [None]:
class _Arm(object):
    """A single bandit arm whose payout is a sample from its distribution."""

    def __init__(self, dist):
        # ``dist`` only needs to provide ``getSample()``.
        self._dist = dist

    def payout(self):
        """Return one payout, i.e. one sample drawn from the arm's distribution."""
        return self._dist.getSample()


# Element-wise constructor: maps an array-like of distributions to a NumPy
# object array of ``_Arm`` instances.  ``otypes`` is given explicitly so that
# NumPy does not have to call ``_Arm`` an extra time on the first element to
# infer the output type, and so that size-0 input is accepted.
_vArm = np.vectorize(_Arm, otypes=[object])

In [None]:
class Bandit(object):
    """A multi-armed bandit with one arm per supplied distribution.

    Attributes:
        arms: NumPy object array holding one arm per distribution.
        K: number of arms.
    """

    def __init__(self, dists):
        """Build the arms from ``dists``.

        Args:
            dists: non-empty iterable of ``Distribution`` instances.

        Raises:
            TypeError: if any element is not a ``Distribution``.
            ValueError: if ``dists`` is empty.
        """
        # Materialise once: ``dists`` is traversed several times below, and a
        # one-shot iterable (e.g. a generator) would be exhausted by the
        # validation pass.
        dists = list(dists)
        if not dists:
            raise ValueError("Must pass in at least one Distribution")
        for dist in dists:
            if not isinstance(dist, Distribution):
                raise TypeError("Must pass in Distribution Iterable")

        self.arms = _vArm(dists)
        self.K = self.arms.size
        # Mean of the best arm; used as the benchmark for any strategy.
        self._optimalMean = max(dist.getMean() for dist in dists)

    def getOptimalStrategyPayout(self, T):
        """Return the expected total payout of playing the best arm ``T`` times."""
        return T * self._optimalMean

    def getRandomArm(self):
        """Return ``(index, arm)`` for an arm chosen uniformly at random."""
        # ``randint``'s upper bound is exclusive, so index is in [0, K).
        index = np.random.randint(0, self.K)
        return index, self.arms[index]
        

In [None]:
# Three single-trial arms with success probabilities 0.5, 0.3 and 0.6.
bandit = Bandit([
    Binomial(n=1, p=.5),
    Binomial(n=1, p=.3),
    Binomial(n=1, p=.6),
])

In [None]:
# Expected total payout of always playing the highest-mean arm for 10 steps.
bandit.getOptimalStrategyPayout(T=10)

In [None]:
# NOTE: a ``__metaclass__`` class attribute is silently ignored by Python 3,
# which leaves ``@abstractmethod`` unenforced.  Inheriting from a base that is
# itself created by ABCMeta applies the metaclass on both Python 2 and 3.
class Strategy(ABCMeta("_AbstractStrategyBase", (object,), {})):
    """Abstract interface for an algorithm that plays a bandit.

    Neither this class nor any subclass that does not override ``run`` can be
    instantiated (``TypeError`` is raised).
    """

    @abstractmethod
    def run(self, T, bandit):
        """Play ``bandit`` and return the payouts at each step of the algorithm."""



In [None]:
class EpsilonGreedyAlgorithm(Strategy):
    """Epsilon-greedy strategy.

    At every step a uniform random number decides between pulling a random
    arm (when it exceeds ``1 - epsilon``) and pulling the arm whose running
    sample mean is currently the largest.
    """

    def __init__(self, epsilon):
        self._epsilon = epsilon

    def run(self, T, bandit):
        """Play ``bandit`` for ``T`` steps and return the payout of each step.

        Returns a float array of length ``T``.
        """
        numArms = bandit.K
        rewards = np.zeros(T)
        totals = np.zeros(numArms)      # summed payout per arm
        pulls = np.zeros(numArms)       # number of plays per arm
        estimates = np.zeros(numArms)   # running sample mean per arm
        exploitCutoff = 1 - self._epsilon

        for step in range(T):
            explore = np.random.random() > exploitCutoff
            if explore:
                chosen, arm = bandit.getRandomArm()
            else:
                # Ties (including the all-zero start) resolve to the lowest index.
                chosen = np.argmax(estimates)
                arm = bandit.arms[chosen]

            reward = arm.payout()
            rewards[step] = reward
            totals[chosen] += reward
            pulls[chosen] += 1
            # pulls[chosen] is at least 1 here, so the division is safe.
            estimates[chosen] = totals[chosen] / pulls[chosen]

        return rewards

In [None]:
# Epsilon-greedy strategy with epsilon = 0.2.
epg = EpsilonGreedyAlgorithm(epsilon=.2)

In [None]:
# Play the bandit for 1000 steps; the result is the per-step payout array.
epg.run(T=1000, bandit=bandit)