In [1]:
import numpy as np
import numpy.random as rand
import pandas as pd

%matplotlib inline
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.tools import FigureFactory as FF




# Simple Bandit algorithm

This is a simple implementation of the k-armed bandit problem from Sutton and Barto's book *Reinforcement Learning: An Introduction*.  

In [2]:
# Experiment configuration for the k-armed bandit testbed.
nb_iteration = 1000   # number of independent runs the rewards are averaged over
nb_steps = 1000       # number of plays performed in each run
k = 10                # number of bandits (arms) available at every play
eps = 0.1             # NOTE(review): presumably the exploration rate; not referenced by the visible cells — confirm

Here we have 10 bandits to choose from. Each bandit's mean reward is drawn from a Normal(0, 1) distribution, and its rewards have a variance of 1.  
Below is an example of the bandits' reward distributions.

In [82]:
# Draw one example testbed: a mean per bandit, unit spread for all of them.
bandit_mean = rand.normal(0, 1, size=k)
bandit_var = np.ones(k)

# One box trace per bandit, each built from 100 sampled rewards.
# Traces are labelled 1..k; the samples are drawn in bandit order.
data = [
    go.Box(
        y=rand.normal(bandit_mean[arm], bandit_var[arm], size=100),
        boxmean=True,
        name=str(arm + 1),
    )
    for arm in range(k)
]

# Axis styling is kept separate so the layout call stays short.
reward_axis = dict(
    title='Reward',
    titlefont=dict(family='Courier New, monospace', size=18, color='#7f7f7f'),
)
layout = go.Layout(
    title='Reward distribution for the k bandits',
    yaxis=reward_axis,
    showlegend=False,
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='bandit_reward_distribution')

In [34]:
class Bandit:
    """Agent playing a k-armed bandit with Gaussian rewards.

    The agent keeps a sample-average estimate ``q`` of each arm's reward and
    a pull counter ``n``. On every play it explores uniformly at random with
    probability ``epsilon``; otherwise it exploits, either greedily on ``q``
    or, when ``ucb`` is true, on ``q`` plus the confidence bonus
    ``2 * sqrt(ln(total pulls) / n)``.

    Parameters
    ----------
    t_max : accepted for interface compatibility; not used by the class.
    k : number of arms.
    epsilon : probability of picking a uniformly random arm on a play.
    ucb : if true, exploit with the upper-confidence-bound rule.
    bandit_mean : per-arm reward location, indexable by arm (default: empty).
    bandit_var : per-arm reward spread, indexable by arm (default: empty).
        Note that it is passed as the ``scale`` argument of ``rand.normal``,
        i.e. it is used as a standard deviation.
    """

    def __init__(self, t_max, k, epsilon, ucb, bandit_mean=None, bandit_var=None):
        # None sentinels avoid sharing a single mutable default list
        # between every instance created without explicit arguments.
        self.bandit_mean = [] if bandit_mean is None else bandit_mean
        self.bandit_var = [] if bandit_var is None else bandit_var
        self.actions = np.arange(k)
        self.q = np.zeros(k)   # running reward estimate per arm
        self.n = np.zeros(k)   # number of times each arm was played
        self.e = epsilon
        self.k = k
        self.ucb = ucb

    def reset(self):
        """Forget everything learned: zero the estimates and the pull counts."""
        # Size from the instance, not from a module-level ``k``.
        self.q = np.zeros(self.k)
        self.n = np.zeros(self.k)

    def play(self):
        """Choose an arm, sample its reward, update the estimate, return the reward."""
        if rand.rand() < self.e:
            # explore: choose a bandit uniformly at random
            a = rand.choice(self.actions)
        elif self.ucb:
            untried = self.n == 0
            if untried.any():
                # An arm that was never played has an unbounded confidence
                # bonus, so play the first such arm. Selecting it explicitly
                # avoids evaluating log(0) and x/0.
                a = np.argmax(untried)
            else:
                a = np.argmax(self.q + 2*np.sqrt(np.log(np.sum(self.n))/self.n))
        else:
            # exploit: select the bandit with the highest estimate
            a = np.argmax(self.q)

        # generate the reward from the chosen bandit
        r = rand.normal(self.bandit_mean[a], self.bandit_var[a])
        self.n[a] += 1
        # incremental sample-average update of the chosen bandit's estimate
        self.q[a] += (r - self.q[a])/self.n[a]

        return r

In [35]:
# One agent per strategy:
#   optimist  - epsilon-greedy (0.1) starting from optimistic estimates
#   curious   - epsilon-greedy (0.1)
#   greedy    - purely greedy
#   scientist - greedy on the upper-confidence-bound score
strategies = {'optimist':Bandit(nb_steps,k,0.1,False), 'curious':Bandit(nb_steps,k,0.1,False),
            'greedy':Bandit(nb_steps,k,0,False), 'scientist':Bandit(nb_steps,k,0,True)}

# Per-step reward accumulated over all runs, one array per strategy.
rewards = {key: np.zeros(nb_steps) for key in strategies}

for i in range(nb_iteration):
    for strategie in strategies.values():    # create new bandits distribution
        strategie.bandit_mean = rand.normal(0,1, size=(k))
        strategie.bandit_var = np.ones(k)
        strategie.reset()

    # The optimistic offset has to be applied AFTER reset(): reset() zeroes
    # the estimates, so adding it beforehand would be discarded and the
    # optimist would behave exactly like the 'curious' strategy.
    strategies['optimist'].q += 5

    for t in range(nb_steps):   # play nb_steps times and accumulate the reward obtained at each step
        for key, strategie in strategies.items():
            rewards[key][t] += strategie.play()

# Turn the accumulated sums into the mean reward per step over all runs.
for key in rewards:
    rewards[key] /= nb_iteration

In [37]:
# One line trace per strategy: mean reward obtained at each step.
data = [go.Scatter(y=mean_reward, name=strategy) for strategy, mean_reward in rewards.items()]

py.iplot(data, filename='reward_by_strategies')