In [1]:
import os
import sys
# Put the parent of the current working directory on the import path so the
# local `bandits` package can be imported from this notebook.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)


from bandits.environment import Environment
from bandits.bandit import GaussianBandit
from bandits.bandit import BernoulliBandit
from bandits.bandit import TruncatedGaussianBandit
import math
from bandits.agent import Agent, GradientAgent
from bandits.policy import (EpsilonGreedyPolicy, GreedyPolicy, UCBPolicy,
                            ExploreFirstPolicy,
                            SoftmaxPolicy)
import math
import matplotlib.pyplot as plt
from bandits.agent import Agent, BetaAgent
import seaborn as sns
from IPython.core.pylabtools import figsize
import numpy as np

In [None]:
# Configuration for the explore-first demonstration.
# Seed NumPy's global RNG before the bandit is constructed.
np.random.seed(30)

# Problem size and simulation length.
n_arms = 10
n_trials = 20000
n_experiments = 1

# Parameters forwarded to the bandit constructor.
# (presumably sigma is the reward noise scale and sigma_means the spread of the
# arm means -- confirm against TruncatedGaussianBandit)
sigma = 1
sigma_means = 0.3
bandit = TruncatedGaussianBandit(n_arms, sigma=sigma, sigma_means=sigma_means)
# Alternative environment: bandit = BernoulliBandit(k=n_arms)


## Explore-first example

In [None]:
# Exploration budget passed to ExploreFirstPolicy (presumably pulls per arm --
# confirm against the policy implementation).
# Explore-first theory sets N = (T/K)^(2/3) * (log T)^(1/3). The previous
# expression used log(T^(1/3)), which equals log(T)/3 rather than the cube root
# of log(T).
N = math.ceil((n_trials / n_arms) ** (2 / 3) * math.log(n_trials) ** (1 / 3))
print(N)

## High variance

In [None]:

# A single agent following the explore-first policy with budget N,
# wrapped in an Environment together with the current bandit.
agents = [Agent(bandit, ExploreFirstPolicy(N, n_arms))]
env = Environment(bandit, agents)

In [None]:
# Run the simulation; env.run returns three values, unpacked here as
# (scores, optimal, regret).
scores, optimal, regret = env.run(n_trials, n_experiments)


In [None]:
# Plot reward and cumulative regret for the run above.
env.plot_reward(scores, label="Reward")
# Open a fresh figure before the regret plot
# (presumably so the two plots do not share axes -- confirm against Environment).
plt.figure()
env.plot_regret(regret, label="Cumulative Regret")


## Low variance

In [None]:
# Low-variance rerun of the explore-first experiment with sigma = 0.01.
sigma = 0.01
# NOTE(review): sigma_means is not passed here, whereas the earlier bandit was
# built with sigma_means=sigma_means, so the constructor default applies --
# confirm this is intended.
bandit = TruncatedGaussianBandit(n_arms, sigma=sigma)
# Alternative environment: bandit = BernoulliBandit(k=n_arms)

# Explore-first budget: N = (T/K)^(2/3) * (log T)^(1/3). The previous
# expression used log(T^(1/3)) = log(T)/3 instead of the cube root of log(T).
N = math.ceil((n_trials / n_arms) ** (2 / 3) * math.log(n_trials) ** (1 / 3))

agents = [Agent(bandit, ExploreFirstPolicy(N, n_arms))]
env = Environment(bandit, agents)

scores, optimal, regret = env.run(n_trials, n_experiments)

In [None]:
# Plot reward and cumulative regret for the low-variance run.
env.plot_reward(scores, label="Reward")
# Open a fresh figure before the regret plot
# (presumably so the two plots do not share axes -- confirm against Environment).
plt.figure()
env.plot_regret(regret, label="Cumulative Regret")


# ε-greedy

In [None]:
# Setup for the Bernoulli-bandit comparison: 100 arms, 1000 steps, 100 experiments.
n_arms = 100
n_trials = 1000
n_experiments = 100

# One success probability per arm, drawn from Normal(0.5, 0.1).
# A normal draw is unbounded, so clip to [0, 1] to guarantee a valid
# Bernoulli parameter (draws already inside the interval are unchanged).
p_array = [
    float(np.clip(np.random.normal(0.5, 0.1), 0.0, 1.0))
    for _ in range(n_arms)
]

bandit = BernoulliBandit(k=n_arms, p_array=p_array)

# Explore-first budget: N = (T/K)^(2/3) * (log T)^(1/3). The previous
# expression used log(T^(1/3)) = log(T)/3 instead of the cube root of log(T).
N = math.ceil((n_trials / n_arms) ** (2 / 3) * math.log(n_trials) ** (1 / 3))
# Budget multiplied by the number of arms
# (presumably the total number of exploration pulls -- confirm against ExploreFirstPolicy).
print(N * n_arms)

In [None]:
# Show the per-arm success probabilities handed to BernoulliBandit.
print(p_array)

In [None]:
# Experiment 1: explore-first against epsilon-greedy with epsilon = 0.1 and 0.01,
# using the n_trials currently in scope.
agents = [Agent(bandit, ExploreFirstPolicy(N, n_arms))]
agents += [Agent(bandit, EpsilonGreedyPolicy(epsilon=eps)) for eps in (0.1, 0.01)]

env = Environment(bandit, agents)
score1, optimal1, regret1 = env.run(n_trials, n_experiments)

# Divide each result by the number of experiments.
# NOTE(review): if Environment.run already averages over experiments this
# normalises twice -- confirm against the implementation.
scores1 = score1 / n_experiments
optimal1 = optimal1 / n_experiments
regret1 = regret1 / n_experiments

In [None]:
# Experiment 2: same agent line-up with a horizon of 10,000 steps.
n_trials = 10000

# Explore-first budget: N = (T/K)^(2/3) * (log T)^(1/3). The previous
# expression used log(T^(1/3)) = log(T)/3 instead of the cube root of log(T).
N = math.ceil((n_trials / n_arms) ** (2 / 3) * math.log(n_trials) ** (1 / 3))

agents = [
    Agent(bandit, ExploreFirstPolicy(N, n_arms)),
    Agent(bandit, EpsilonGreedyPolicy(epsilon=0.1)),
    Agent(bandit, EpsilonGreedyPolicy(epsilon=0.01)),
]

env = Environment(bandit, agents)
score2, optimal2, regret2 = env.run(n_trials, n_experiments)
# Divide each result by the number of experiments.
# NOTE(review): confirm Environment.run does not already average over experiments.
scores2, optimal2, regret2 = (
    score2 / n_experiments,
    optimal2 / n_experiments,
    regret2 / n_experiments,
)


In [None]:
# Experiment 3: same agent line-up with a horizon of 100,000 steps.
n_trials = 100000

# Explore-first budget: N = (T/K)^(2/3) * (log T)^(1/3). The previous
# expression used log(T^(1/3)) = log(T)/3 instead of the cube root of log(T).
N = math.ceil((n_trials / n_arms) ** (2 / 3) * math.log(n_trials) ** (1 / 3))

agents = [
    Agent(bandit, ExploreFirstPolicy(N, n_arms)),
    Agent(bandit, EpsilonGreedyPolicy(epsilon=0.1)),
    Agent(bandit, EpsilonGreedyPolicy(epsilon=0.01)),
]

env = Environment(bandit, agents)
score3, optimal3, regret3 = env.run(n_trials, n_experiments)
# Divide each result by the number of experiments.
# NOTE(review): confirm Environment.run does not already average over experiments.
scores3, optimal3, regret3 = (
    score3 / n_experiments,
    optimal3 / n_experiments,
    regret3 / n_experiments,
)


In [None]:
# Side-by-side regret curves for the three horizons (regret1, regret2, regret3).
figsize(11.0, 10)
sns.set_style('white')
sns.set_context('talk')

# One panel per experiment in a 1x3 grid. The legend entries come from the
# `agents` list currently in scope, so the labels assume every run used the
# same agent line-up in the same order.
for panel, regret_curve in enumerate((regret1, regret2, regret3), start=1):
    plt.subplot(1, 3, panel)
    plt.ylabel("Average Regret")
    plt.xlabel('Time Step')
    plt.plot(regret_curve)
    plt.legend(agents, loc=2)

sns.despine()
plt.show()


In [None]:
# Single-experiment comparison of epsilon-greedy (epsilon = 0.1) and UCB over
# 100,000 steps.
# NOTE: this rebinds score3 / optimal3 / regret3, replacing the values produced
# by the earlier three-agent run.
n_trials = 100000
n_experiments = 1

epsilon_greedy_agent = Agent(bandit, EpsilonGreedyPolicy(epsilon=0.1))
ucb_agent = Agent(bandit, UCBPolicy(2, n_trials))
agents = [epsilon_greedy_agent, ucb_agent]

env = Environment(bandit, agents)
score3, optimal3, regret3 = env.run(n_trials, n_experiments)

# Divide each result by the number of experiments (1 here).
scores3 = score3 / n_experiments
optimal3 = optimal3 / n_experiments
regret3 = regret3 / n_experiments

In [None]:
# Regret curves for the run stored in regret3, labelled with the `agents`
# list currently in scope.
figsize(11.0, 10)
sns.set_style('white')
sns.set_context('talk')

plt.xlabel('Time Step')
plt.ylabel("Average Regret")
plt.plot(regret3)
plt.legend(agents, loc=2)


In [None]:
# Single-experiment comparison of BetaAgent (greedy policy) and UCB over
# 1,000,000 steps.
n_trials = 1000000
n_experiments = 1

beta_agent = BetaAgent(bandit, GreedyPolicy())
ucb_agent = Agent(bandit, UCBPolicy(2, n_trials))
agents = [beta_agent, ucb_agent]

env = Environment(bandit, agents)
score4, optimal4, regret4 = env.run(n_trials, n_experiments)

# Divide each result by the number of experiments (1 here).
scores4 = score4 / n_experiments
optimal4 = optimal4 / n_experiments
regret4 = regret4 / n_experiments


    

In [None]:
# Single-experiment comparison of four agents over 1,000,000 steps:
# BetaAgent (greedy), explore-first, epsilon-greedy (0.1) and UCB.
n_trials = 1000000
n_experiments = 1

# Explore-first budget: N = (T/K)^(2/3) * (log T)^(1/3). The previous
# expression used log(T^(1/3)) = log(T)/3 instead of the cube root of log(T).
N = math.ceil((n_trials / n_arms) ** (2 / 3) * math.log(n_trials) ** (1 / 3))

agents = [
    BetaAgent(bandit, GreedyPolicy()),
    Agent(bandit, ExploreFirstPolicy(N, n_arms)),
    Agent(bandit, EpsilonGreedyPolicy(epsilon=0.1)),
    Agent(bandit, UCBPolicy(2, n_trials)),
]

env = Environment(bandit, agents)
score5, optimal5, regret5 = env.run(n_trials, n_experiments)
# Divide each result by the number of experiments (1 here).
scores5, optimal5, regret5 = (
    score5 / n_experiments,
    optimal5 / n_experiments,
    regret5 / n_experiments,
)



In [None]:
# Two-panel figure: regret4 (two-agent run) on the left, regret5 (four-agent
# run) on the right.
figsize(11.0, 10)
sns.set_style('white')
sns.set_context('talk')

# Left panel. regret4 was produced by the two-agent run (BetaAgent, UCB), so
# the legend must use that line-up. Previously it used whatever `agents` was
# bound to -- the four-agent list from the regret5 run -- which labelled the
# UCB curve as the explore-first agent.
# (assumes plt.plot draws one curve per agent in list order -- confirm the
# layout of the regret array returned by env.run)
agents_run4 = [
    BetaAgent(bandit, GreedyPolicy()),
    Agent(bandit, UCBPolicy(2, n_trials)),
]
plt.subplot(1, 2, 1)
plt.ylabel("Average Regret")
plt.xlabel('Time Step')
plt.plot(regret4)
plt.legend(agents_run4, loc=2)

# Right panel. Rebuild the four-agent line-up used for regret5; these objects
# only supply the legend labels.
agents = [
    BetaAgent(bandit, GreedyPolicy()),
    Agent(bandit, ExploreFirstPolicy(N, n_arms)),
    Agent(bandit, EpsilonGreedyPolicy(epsilon=0.1)),
    Agent(bandit, UCBPolicy(2, n_trials)),
]
plt.subplot(1, 2, 2)
plt.ylabel("Average Regret")
plt.xlabel('Time Step')
plt.plot(regret5)
plt.legend(agents, loc=2)
