In [1]:
import numpy as np
import pandas as pd

In [26]:
class ThompsonSampler():
    """Beta-Bernoulli Thompson sampling agent.

    Each arm keeps a Beta(alpha, beta) posterior, initialised to Beta(1, 1).
    On every trial the agent draws one sample per arm, plays the arm with the
    largest sample, and then updates that arm's posterior with the observed
    reward.

    Parameters
    ----------
    arms : sequence
        Arm labels. A label is what ``choose_arm`` returns and what is logged
        in ``ad_i``; labels are looked up by position, so they do not need to
        be ``0..n-1`` (they must be numeric to fit in the ``ad_i`` log).
    n_trials : int
        Capacity of the per-trial logs, i.e. the maximum number of
        ``update()`` calls.

    Usage protocol (as driven by the caller): call ``choose_arm()``, assign
    the observed reward to ``self.reward``, then call ``update()``.
    """

    def __init__(self, arms, n_trials):
        self.arms = arms
        self.n_trials = n_trials

        # Per-trial logs; slot ``i`` is written by the i-th call to update().
        self.ad_i = np.zeros(n_trials)         # label of the arm played
        self.r_i = np.zeros(n_trials)          # reward observed
        self.thetas = np.zeros(n_trials)       # sampled theta of the arm played
        # NOTE(review): nothing in this class writes regret_i, so it stays all
        # zeros unless the caller fills it in.
        self.regret_i = np.zeros(n_trials)
        self.thetaregret = np.zeros(n_trials)

        n_arms = len(arms)
        self.alphas = np.ones(n_arms)          # Beta posterior: 1 + successes
        self.betas = np.ones(n_arms)           # Beta posterior: 1 + failures
        self.theta = np.zeros(n_arms)          # latest posterior draw, one per arm
        self.reward = 0                        # set by the caller before update()
        self.total_reward = 0
        self.k = 0                             # label of the last chosen arm
        self._chosen_idx = 0                   # position of that arm in ``arms``
        self.i = 0                             # number of trials recorded so far

    def choose_arm(self):
        """Sample every arm's posterior and return the label of the best draw."""
        self.theta = np.random.beta(self.alphas, self.betas)
        # Keep the position separately from the label: the posterior arrays
        # are positional, while ``self.k`` is whatever label the caller gave.
        self._chosen_idx = int(np.argmax(self.theta))
        self.k = self.arms[self._chosen_idx]
        return self.k

    def update(self):
        """Fold ``self.reward`` into the chosen arm's posterior and log the trial.

        The update adds ``reward`` to alpha and ``1 - reward`` to beta, so the
        reward is expected to be 0 or 1.

        Raises
        ------
        IndexError
            If called more times than the log capacity (``n_trials``). The
            check happens before any state is modified.
        """
        if self.i >= len(self.thetas):
            raise IndexError(
                "update() called more than n_trials=%s times" % self.n_trials
            )

        idx = self._chosen_idx
        self.alphas[idx] += self.reward
        self.betas[idx] += 1 - self.reward

        self.thetas[self.i] = self.theta[idx]
        # Gap between the largest theta logged so far (unfilled slots are 0)
        # and the theta of the arm just played.
        # NOTE(review): this compares against the history ``self.thetas``, not
        # the current draw ``self.theta`` - confirm this is the intended metric.
        self.thetaregret[self.i] = np.max(self.thetas) - self.theta[idx]

        self.ad_i[self.i] = self.k
        self.r_i[self.i] = self.reward
        self.i += 1

    def collect_data(self):
        """Return the per-trial log as an integer DataFrame.

        Columns are ``ad``, ``reward`` and ``regret``, one row per log slot
        (``n_trials`` rows, including slots not yet written). Everything is
        cast to ``int``, so any fractional value would be truncated.
        """
        return pd.DataFrame({'ad':self.ad_i,
                             'reward':self.r_i,
                             'regret':self.regret_i}, dtype=int)

In [27]:
class Simulator:
    """Bernoulli bandit environment that drives a ThompsonSampler agent.

    Parameters
    ----------
    arms : sequence
        Arm labels, handed to the agent. Labels must be hashable.
    p : sequence of float
        Success probability of each arm, position-aligned with ``arms``
        (``p[j]`` belongs to ``arms[j]``).
    n_trials : int
        Number of rounds played by ``run()``.
    variance : bool, default False
        If true, add Gaussian noise (mean 0, sd 0.04) to every probability
        and clip the result to ``[0, 0.2]``.
    """

    def __init__(self, arms, p, n_trials, variance=False):
        if variance:
            # list + ndarray broadcasts element-wise, so ``p`` may be a list.
            self.p = np.clip(p + np.random.normal(0, 0.04, size=len(arms)), 0, .2)
        else:
            self.p = p
        self.n_trials = n_trials
        # The agent reports its choice as an arm *label*; ``p`` is positional.
        # Resolve label -> position once so labels need not equal 0..n-1.
        self._arm_index = {arm: pos for pos, arm in enumerate(arms)}
        self.agent = ThompsonSampler(arms=arms, n_trials=n_trials)

    def run(self):
        """Play ``n_trials`` rounds and return ``(total_reward, agent)``.

        Each round asks the agent for an arm, draws a 0/1 reward with that
        arm's success probability, stores it on ``agent.reward`` and calls
        ``agent.update()``.
        """
        total_reward = 0
        for _ in range(self.n_trials):
            x_chosen = self.agent.choose_arm()

            # Environment draws the Bernoulli reward for the chosen arm.
            reward = np.random.binomial(1, p=self.p[self._arm_index[x_chosen]])
            # Agent is told the reward, then updates its posterior.
            self.agent.reward = reward
            self.agent.update()
            total_reward += reward

        return total_reward, self.agent

In [28]:
# Ten arms identified by consecutive integer ids 0..9.
arms = list(range(10))
# Payout probability of each arm, position-aligned with `arms`.
payouts = [
    0.023, 0.03, 0.029, 0.001, 0.05,
    0.06, 0.0234, 0.035, 0.01, 0.11,
]
# One label per arm: "V", the arm id, then its payout, concatenated as-is.
labels = [f"V{arm_id}{payout}" for arm_id, payout in zip(arms, payouts)]

In [29]:
# Seed NumPy's global RNG so this stochastic run gives the same result on
# every Restart & Run All (both the agent and the simulator draw from it).
np.random.seed(42)
sim = Simulator(arms=arms, p=payouts, n_trials=10000)
rewards, agent = sim.run()
rewards

1005

In [30]:
# Raw per-trial logs kept by the agent: rewards first, then the arm played.
(agent.r_i, agent.ad_i)

(array([0., 0., 0., ..., 1., 0., 0.]), array([7., 2., 1., ..., 9., 9., 9.]))

In [34]:
# Gather the agent's per-trial log into a DataFrame and preview the first
# rows only, rather than rendering every trial.
data = agent.collect_data()
data.head(10)

Unnamed: 0,ad,reward,regret
0,7,0,0
1,2,0,0
2,1,0,0
3,9,0,0
4,8,0,0
5,5,0,0
6,1,0,0
7,7,0,0
8,6,0,0
9,0,0,0
