In [None]:
import math
import os
import pickle
import sys

import matplotlib.pyplot as plt
import numpy as np
import pymc3 as pm
import seaborn as sns
from IPython.core.pylabtools import figsize

# Make the parent directory importable so the local `bandits` package resolves.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from bandits.agent import Agent, BetaAgent, GradientAgent
from bandits.bandit import BernoulliBandit
from bandits.bandit import GaussianBandit
from bandits.bandit import TruncatedGaussianBandit
from bandits.environment import Environment
from bandits.policy import (EpsilonGreedyPolicy, GreedyPolicy, UCBPolicy,
                            ExploreFirstPolicy,
                            SoftmaxPolicy)

In [None]:
# --- Experiment configuration (first run) ---------------------------------
n_arms = 10
num_bandit_rounds = 20000
k_p = 1000
np.random.seed(3)

# One success probability per arm: a truncated-normal draw on [0, 1] centred
# on evenly spaced means running from 0.9 down to 0.1.
p_expected_values = np.linspace(start=0.9, stop=0.1, num=n_arms)
p_array = [
    pm.TruncatedNormal.dist(lower=0, upper=1, mu=arm_mean, sigma=0.4).random()
    for arm_mean in p_expected_values
]

p_best = np.max(p_array)
p_worst = np.min(p_array)

bandit = BernoulliBandit(k=n_arms, p_array=p_array)

# rho_p and tau_p share the value 1 / n_arms; together with the spread between
# the best and worst arm they fix the phase length L_p.
rho_p = 1/n_arms
tau_p = 1/n_arms
L_p = int(2 + (p_best - p_worst)/(rho_p*tau_p))

# Echo the configuration: arm probabilities, k_p, L_p and the total number of
# rounds the main stage will run (L_p rounds per phase).
for summary_value in (p_array, k_p, L_p, L_p*num_bandit_rounds):
    print(summary_value)

In [None]:
class BICAgent:
    """Keeps per-arm success/failure counters and the current best arm.

    Each arm ``i`` has two counters, ``alpha[i]`` and ``beta[i]``, both
    starting at 1. ``update`` adds ``reward`` to ``alpha[i]`` and
    ``1 - reward`` to ``beta[i]`` (the Beta-Bernoulli update when rewards
    are 0/1), recomputes ``alpha / (alpha + beta)`` for every arm and stores
    the index of the largest estimate in ``a_star``.

    Parameters
    ----------
    k_p : stored as ``self.k_p``; not used by the class itself.
    L_p : stored as ``self.L_p``; not used by the class itself.
    K : number of arms; sizes every per-arm array.
    """

    def __init__(self, k_p, L_p, K):
        self.k_p = k_p
        self.L_p = L_p
        self.K = K
        # Kept for compatibility with code that may read ``agent.model``;
        # nothing in this class uses it.
        self.model = pm.Model()
        self.reset()

    def reset(self):
        """Restore the freshly constructed state.

        All counters go back to 1, the value estimates to 0 and ``a_star``
        to arm 0. Writes the same ``alpha`` / ``beta`` attributes that
        ``update`` reads, so a reset actually takes effect.
        """
        self._value_estimates = np.zeros(self.K)
        self.a_star = 0
        self.alpha = np.ones(self.K)
        self.beta = np.ones(self.K)

    def update(self, i, reward):
        """Fold one observed ``reward`` for arm ``i`` into the estimates.

        Parameters
        ----------
        i : index of the arm that was pulled.
        reward : observed reward; ``reward`` is added to ``alpha[i]`` and
            ``1 - reward`` to ``beta[i]``.
        """
        self.alpha[i] += reward
        self.beta[i] += 1 - reward
        self._value_estimates = self.alpha / (self.alpha + self.beta)
        self.a_star = np.argmax(self._value_estimates)
        
  
    
   
        
    

In [None]:
# Sampling stage for the BIC TS run.
# `sampling_num_agents` counts every pull made here and `regret` receives one
# cumulative-regret entry per pull, so the two stay the same length.
sampling_num_agents = 0
regret = []

r_1 = np.repeat(-1, k_p)

agent = BICAgent(k_p, L_p, n_arms)

# Pull arm 0 k_p times.
for i in range(k_p):
    sampling_num_agents += 1
    reward_tuple = bandit.pull(0)
    r_1[i] = reward_tuple[0]

    # These pulls are counted in sampling_num_agents, so their regret is
    # recorded too; otherwise the regret curve lags the round counter by k_p.
    previous_regret = regret[-1] if regret else 0
    regret.append(previous_regret + p_best - reward_tuple[-1])

    agent.update(0, r_1[i])

# For each remaining arm i, run k_p*L_p rounds. k_p of them, chosen uniformly
# without replacement, pull arm i; the rest pull the arm that was a_star when
# this arm's block started.
for i in range(1, n_arms):
    a_star = agent.a_star

    explore_agents = set(
        np.random.choice(
            np.arange(start=i, stop=i + k_p*L_p, dtype=int),
            size=k_p,
            replace=False,
        )
    )

    for p in range(i, int(i + k_p*L_p)):
        sampling_num_agents += 1

        choice = i if p in explore_agents else a_star
        reward_tuple = bandit.pull(choice)

        # The last element of the pull result is the arm's mean reward; regret
        # is measured against the best arm's mean.
        previous_regret = regret[-1] if regret else 0
        regret.append(previous_regret + p_best - reward_tuple[-1])

        agent.update(choice, reward_tuple[0])
            
            
    


In [None]:
# Number of rounds counted during the sampling stage.
print(sampling_num_agents)

In [None]:
# Main stage of the BIC TS run: num_bandit_rounds phases of L_p rounds each.
# In every phase one uniformly chosen round plays the bandit agent's pick and
# all other rounds play the a_star read at the start of the phase.
conf_radius_parameter = L_p*k_p  # not read anywhere in this cell
bandit_agent = BetaAgent(bandit, GreedyPolicy(), ic=True)
sim_num_agents = 0

for phase in range(num_bandit_rounds):
    if not phase % 1000:
        print("Starting phase ", phase)

    a_star = agent.a_star
    bandit_choice = bandit_agent.choose()
    explore_agent = np.random.randint(0, L_p)

    # One phase = L_p consecutive rounds.
    for agent_round in range(L_p):
        sim_num_agents += 1

        choice = bandit_choice if agent_round == explore_agent else a_star
        reward_tuple = bandit.pull(choice)

        # Extend the cumulative regret curve (starts from 0 if it is empty).
        regret_so_far = regret[-1] if len(regret) > 0 else 0
        regret.append(regret_so_far + p_best - reward_tuple[-1])

        # Both learners see every pull, not only the exploration round.
        agent.update(choice, reward_tuple[0])
        bandit_agent.observe(reward=reward_tuple[0], action_attempt=choice)



In [None]:
# import pickle 
# file = open('regret1.pkl', 'wb') 
# pickle.dump(regret, file)

In [None]:
# Replace `regret` with the curve stored in regret1.pkl.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# this notebook wrote itself.
with open("regret1.pkl", 'rb') as file:  # closes the handle even on error
    regret = pickle.load(file)

In [None]:
# Last entry of the cumulative regret curve, i.e. the total regret of the run.
print(regret[-1])

In [None]:
# Plain TS baseline, run for as many rounds as the sampling and main stages
# counted together.
n_trials = sim_num_agents + sampling_num_agents

TSagent = BetaAgent(bandit, GreedyPolicy(), ts=True)
regret_ts = []

for t in range(n_trials):
    action = TSagent.choose()
    reward, is_optimal, mean_reward = bandit.pull(action)
    TSagent.observe(reward)

    # Progress message every 10000 rounds.
    if not t % 10000:
        print('we are at time: ', t)

    # Cumulative regret against the best arm's mean (starts from 0).
    regret_so_far = regret_ts[-1] if len(regret_ts) != 0 else 0
    regret_ts.append(regret_so_far + p_best - mean_reward)
        
  

In [None]:
# import pickle 
# f = open('regretTS.pkl', 'wb') 
# pickle.dump(regret_ts, f)

In [None]:
# Load the stored TS regret curve from regretTS.pkl into `regret_new`.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# this notebook wrote itself.
with open('regretTS.pkl', 'rb') as f:  # closes the handle even on error
    regret_new = pickle.load(f)


In [None]:
# Last entry of the regret curve loaded into `regret_new`.
print(regret_new[-1])

In [None]:
# Number of rounds the baseline loop was run for.
print(n_trials)

In [None]:
# Length of the loaded baseline regret curve; compare with n_trials.
print(len(regret_new))

In [None]:
# Length of the BIC regret curve.
print(len(regret))

In [None]:
# Additive term of the bound: c rounds, each costing at most the gap between
# the best and the worst arm.
c = k_p + L_p*k_p
cExpGap = c*(p_best-p_worst)
constant = 1


def TSRegretBound(x):
    """Return constant * sqrt(n_arms * x * log(x)) for a horizon of x rounds."""
    return constant*np.sqrt(n_arms*x*np.log(x))


# The baseline bound is evaluated at n_trials / L_p, scaled by L_p, then the
# additive term is added.
theoreticalBound = L_p*TSRegretBound(n_trials/L_p) + cExpGap

In [None]:

# Cumulative regret of BIC TS vs. plain TS, with the end of the sampling stage
# and the theoretical bound marked.
sns.set_style('white')
sns.set_context('talk')
ax = plt.subplot(111)

ax.set_ylabel("Cumulative Regret")
ax.set_xlabel('Time Step')

ax.plot(regret, label="BIC TS")
ax.plot(regret_ts, label="TS")

# Height of the marker = regret at the end of the sampling stage. The index is
# clamped so a curve shorter than the sampling stage cannot raise IndexError.
sampling_end_index = min(sampling_num_agents, len(regret) - 1)
ax.vlines(x=sampling_num_agents, ymin=0, ymax=regret[sampling_end_index],
          color='g', label="End of Sampling Stage")

ax.axhline(y=theoreticalBound, color='r', linestyle='-',
           label="Theorem 7 Theoretical Bound")

# Shrink the axes to 80% width so the legend fits to the right of the plot.
box = ax.get_position()
ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

sns.despine()
plt.show()

# New Run

In [None]:
# --- Experiment configuration (second run) --------------------------------
# Same seed and settings as the first run, so the same arm probabilities are
# drawn again.
n_arms = 10
num_bandit_rounds = 20000
k_p = 1000
np.random.seed(3)

# Arm means are spaced evenly from 0.9 down to 0.1; each arm's probability is
# one truncated-normal draw on [0, 1] around its mean.
p_expected_values = np.linspace(start=0.9, stop=0.1, num=n_arms)
p_array = [
    pm.TruncatedNormal.dist(lower=0, upper=1, mu=arm_mean, sigma=0.4).random()
    for arm_mean in p_expected_values
]

p_best = np.max(p_array)
p_worst = np.min(p_array)

bandit = BernoulliBandit(k=n_arms, p_array=p_array)

# Phase length derived from the best/worst gap and rho_p * tau_p.
rho_p = 1/n_arms
tau_p = 1/n_arms
L_p = int(2 + (p_best - p_worst)/(rho_p*tau_p))

# Report arm probabilities, k_p, L_p and the main-stage round count.
for summary_value in (p_array, k_p, L_p, L_p*num_bandit_rounds):
    print(summary_value)

In [None]:
# Sampling stage for the BIC UCB1 run.
# `sampling_num_agents` counts every pull made here and `regretUCBIC` receives
# one cumulative-regret entry per pull, so the two stay the same length.
sampling_num_agents = 0
regretUCBIC = []

r_1 = np.repeat(-1, k_p)

agent = BICAgent(k_p, L_p, n_arms)

# Pull arm 0 k_p times.
for i in range(k_p):
    sampling_num_agents += 1
    reward_tuple = bandit.pull(0)
    r_1[i] = reward_tuple[0]

    # These pulls are counted in sampling_num_agents, so their regret is
    # recorded too; otherwise the regret curve lags the round counter by k_p.
    previous_regret = regretUCBIC[-1] if regretUCBIC else 0
    regretUCBIC.append(previous_regret + p_best - reward_tuple[-1])

    agent.update(0, r_1[i])

# For each remaining arm i, run k_p*L_p rounds. k_p of them, chosen uniformly
# without replacement, pull arm i; the rest pull the arm that was a_star when
# this arm's block started.
for i in range(1, n_arms):
    a_star = agent.a_star

    explore_agents = set(
        np.random.choice(
            np.arange(start=i, stop=i + k_p*L_p, dtype=int),
            size=k_p,
            replace=False,
        )
    )

    for p in range(i, int(i + k_p*L_p)):
        sampling_num_agents += 1

        choice = i if p in explore_agents else a_star
        reward_tuple = bandit.pull(choice)

        # The last element of the pull result is the arm's mean reward; regret
        # is measured against the best arm's mean.
        previous_regret = regretUCBIC[-1] if regretUCBIC else 0
        regretUCBIC.append(previous_regret + p_best - reward_tuple[-1])

        agent.update(choice, reward_tuple[0])
            
            

In [None]:
# Main stage of the BIC UCB1 run: num_bandit_rounds phases of L_p rounds each.
# In every phase one uniformly chosen round plays the UCB agent's pick and all
# other rounds play the a_star read at the start of the phase.
conf_radius_parameter = L_p*k_p  # not read anywhere in this cell
bandit_agent = Agent(bandit, UCBPolicy(2, num_bandit_rounds))
sim_num_agents = 0

for phase in range(num_bandit_rounds):
    if not phase % 1000:
        print("Starting phase ", phase)

    a_star = agent.a_star
    bandit_choice = bandit_agent.choose()
    explore_agent = np.random.randint(0, L_p)

    # One phase = L_p consecutive rounds.
    for agent_round in range(L_p):
        sim_num_agents += 1

        choice = bandit_choice if agent_round == explore_agent else a_star
        reward_tuple = bandit.pull(choice)

        # Extend the cumulative regret curve (starts from 0 if it is empty).
        regret_so_far = regretUCBIC[-1] if len(regretUCBIC) > 0 else 0
        regretUCBIC.append(regret_so_far + p_best - reward_tuple[-1])

        # Both learners see every pull, not only the exploration round.
        agent.update(choice, reward_tuple[0])
        bandit_agent.observe(reward=reward_tuple[0], action_attempt=choice)


In [None]:
# Plain UCB1 baseline, run for as many rounds as the BIC UCB1 regret curve has
# entries.
n_trials = len(regretUCBIC)

UCBagent = Agent(bandit, UCBPolicy(2, num_bandit_rounds))
regret_ucb = []

for t in range(n_trials):
    action = UCBagent.choose()
    reward, is_optimal, mean_reward = bandit.pull(action)
    UCBagent.observe(reward)

    # Progress message every 10000 rounds.
    if not t % 10000:
        print('we are at time: ', t)

    # Cumulative regret against the best arm's mean (starts from 0).
    regret_so_far = regret_ucb[-1] if len(regret_ucb) != 0 else 0
    regret_ucb.append(regret_so_far + p_best - mean_reward)
        
  

In [None]:
# Sum of inverse gaps 1 / (p_best - p) over every arm whose probability
# differs from the best one.
sumGaps = 0
for arm_probability in p_array:
    if arm_probability == p_best:
        continue
    sumGaps += 1/(p_best - arm_probability)

constant = 1


def UCBRegretBound(x):
    """Return constant * log(x) * sumGaps for a horizon of x rounds."""
    return constant*np.log(x)*sumGaps


# Additive term: c rounds, each costing at most the best/worst gap.
c = k_p + L_p*k_p
cExpGap = c*(p_best-p_worst)

# The baseline bound is evaluated at len(regretUCBIC) / L_p, scaled by L_p,
# then the additive term is added.
theoreticalBound = L_p*UCBRegretBound((len(regretUCBIC))/L_p) + cExpGap
print(theoreticalBound)

In [None]:
# Cumulative regret of BIC UCB1 vs. plain UCB1, with the end of the sampling
# stage and the theoretical bound marked.
sns.set_style('white')
sns.set_context('talk')
ax = plt.subplot(111)

ax.set_ylabel("Cumulative Regret")
ax.set_xlabel('Time Step')

ax.plot(regretUCBIC, label="BIC UCB1")
ax.plot(regret_ucb, label="UCB1")

# Height of the marker = regret at the end of the sampling stage. The index is
# clamped so a curve shorter than the sampling stage cannot raise IndexError.
sampling_end_index = min(sampling_num_agents, len(regretUCBIC) - 1)
ax.vlines(x=sampling_num_agents, ymin=0, ymax=regretUCBIC[sampling_end_index],
          color='g', label="End of Sampling Stage")

ax.axhline(y=theoreticalBound, color='r', linestyle='-',
           label="Theorem 7 Theoretical Bound")

# Shrink the axes to 80% width so the legend fits to the right of the plot.
box = ax.get_position()
ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

sns.despine()
plt.show()