
Revised UCB2; improved Python Hedge

commit ae71e416341332e6a5fa7acb9f74c58524261cdd (1 parent: 94cf898), @johnmyleswhite committed Nov 17, 2012
@@ -3,8 +3,10 @@
This repo contains code in several languages that implements several standard algorithms for solving the Multi-Armed Bandit Problem, including:
* epsilon-Greedy
-* Softmax
+* Softmax (Boltzmann)
* UCB1
+* UCB2
+* Hedge
* Exp3
It also contains code that provides a testing framework for bandit algorithms based around simple Monte Carlo simulations.
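For orientation, the framework's test_algorithm function (imported from testing_framework.tests in core.py, per the import hunk at the end of this commit) is the entry point the demo script below relies on. The following is a rough sketch of how such a Monte Carlo test loop works, assuming each arm exposes a draw() method and each algorithm exposes initialize(), select_arm(), and update(); treat it as an illustration rather than the repo's exact code:

# Illustrative sketch of a Monte Carlo bandit test loop; the real
# implementation lives in testing_framework/tests.py and may differ.
def test_algorithm(algo, arms, num_sims, horizon):
  n = num_sims * horizon
  sim_nums = [0.0 for i in range(n)]
  times = [0.0 for i in range(n)]
  chosen_arms = [0.0 for i in range(n)]
  rewards = [0.0 for i in range(n)]
  cumulative_rewards = [0.0 for i in range(n)]

  for sim in range(num_sims):
    # Reset the algorithm's state at the start of each simulation.
    algo.initialize(len(arms))
    for t in range(horizon):
      index = sim * horizon + t
      sim_nums[index] = sim + 1
      times[index] = t + 1
      chosen_arm = algo.select_arm()
      chosen_arms[index] = chosen_arm
      reward = arms[chosen_arm].draw()
      rewards[index] = reward
      if t == 0:
        cumulative_rewards[index] = reward
      else:
        cumulative_rewards[index] = cumulative_rewards[index - 1] + reward
      algo.update(chosen_arm, reward)

  return [sim_nums, times, chosen_arms, rewards, cumulative_rewards]

Each of the five returned lists has num_sims * horizon entries, which is why the demo script below writes results[j][i] column by column.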
@@ -1,2 +1,7 @@
-* Implement a faster multinomial draw code on Tim Hopper's suggestion
-
+* Implement a faster multinomial draw function (Tim Hopper's suggestion)
+* Revise hedge implementation to prevent numeric overflow (see the sketch after this list)
+* Implement all algorithms in Matlab
+* Make sure that all algorithms are implemented equivalently in all languages
+* Implement an Expert type/class
+* Implement Exp4
+* Implement Thompson sampling under specific distributional assumptions?
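On the overflow item above: Hedge weights arms by exp(eta * cumulative_reward), and those exponentials grow without bound over long horizons. A standard remedy, shown here on a hypothetical helper (not code from this commit), is to subtract the running maximum before exponentiating, which leaves the probabilities unchanged but keeps every intermediate term at most exp(0) = 1:

import math

# Hypothetical helper illustrating the max-subtraction (log-sum-exp) trick.
def stable_hedge_probs(eta, values):
  m = max(values)
  # exp(eta * (v - m)) <= 1 for every v, so no term can overflow;
  # dividing by the sum restores the original distribution.
  weights = [math.exp(eta * (v - m)) for v in values]
  total = sum(weights)
  return [w / total for w in weights]

Because exp(eta * v) / Z = exp(eta * (v - m)) / (Z / exp(eta * m)), the stabilized form is algebraically identical to the naive one.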
@@ -0,0 +1,22 @@
+execfile("core.py")
+
+import random
+
+random.seed(1)
+means = [0.1, 0.1, 0.1, 0.1, 0.9]
+n_arms = len(means)
+random.shuffle(means)
+arms = map(lambda mu: BernoulliArm(mu), means)
+print("Best arm is " + str(ind_max(means)))
+
+f = open("algorithms/hedge/hedge_results.tsv", "w")
+
+for eta in [0.1, 0.2, 0.3, 0.4, 0.5]:
+  algo = Hedge(eta, [], [])
+  algo.initialize(n_arms)
+  results = test_algorithm(algo, arms, 5000, 250)
+  for i in range(len(results[0])):
+    f.write(str(eta) + "\t")
+    f.write("\t".join([str(results[j][i]) for j in range(len(results))]) + "\n")
+
+f.close()
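The Hedge class exercised above is not part of this diff. A minimal reconstruction consistent with the constructor call Hedge(eta, [], []) and the initialize/select_arm/update protocol used by the testing framework might look like the following; it is a sketch, not necessarily the repo's exact implementation, and the naive exponentiation in select_arm is precisely what the overflow TODO targets. categorical_draw mirrors the inverse-CDF sampler used by the repo's Softmax and Exp3 code:

import math
import random

def categorical_draw(probs):
  # Inverse-CDF sampling from a discrete distribution.
  z = random.random()
  cum_prob = 0.0
  for i in range(len(probs)):
    cum_prob += probs[i]
    if cum_prob > z:
      return i
  return len(probs) - 1

class Hedge(object):
  def __init__(self, eta, counts, values):
    self.eta = eta
    self.counts = counts  # pulls per arm
    self.values = values  # cumulative reward per arm

  def initialize(self, n_arms):
    self.counts = [0 for col in range(n_arms)]
    self.values = [0.0 for col in range(n_arms)]

  def select_arm(self):
    # Naive exponential weighting; vulnerable to overflow (see TODO).
    z = sum([math.exp(v * self.eta) for v in self.values])
    probs = [math.exp(v * self.eta) / z for v in self.values]
    return categorical_draw(probs)

  def update(self, chosen_arm, reward):
    self.counts[chosen_arm] = self.counts[chosen_arm] + 1
    self.values[chosen_arm] = self.values[chosen_arm] + reward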
@@ -1,77 +1,73 @@
import math
-
def ind_max(x):
-    m = max(x)
-    return x.index(m)
-
+  m = max(x)
+  return x.index(m)
class UCB2(object):
-    def __init__(self, alpha, counts, values):
-        """
-        UCB2 algorithm. Implementation of the slides at:
-        http://lane.compbio.cmu.edu/courses/slides_ucb.pdf
-        """
-        self.alpha = alpha
-        self.counts = counts
-        self.values = values
-        self.__current_arm = 0
-        self.__next_update = 0
-        return
-
-    def initialize(self, n_arms):
-        self.counts = [0 for col in range(n_arms)]
-        self.values = [0.0 for col in range(n_arms)]
-        self.r = [0 for col in range(n_arms)]
-        self.__current_arm = 0
-        self.__next_update = 0
-
-    def __bonus(self, n, r):
-        tau = self.__tau(r)
-        bonus = math.sqrt((1. + self.alpha) * math.log(math.e * float(n) / tau) / (2 * tau))
-        return bonus
-
-    def __tau(self, r):
-        return int(math.ceil((1 + self.alpha) ** r))
-
-    def __set_arm(self, arm):
-        """
-        When choosing a new arm, make sure we play that arm for
-        tau(r+1) - tau(r) episodes.
-        """
-        self.__current_arm = arm
-        self.__next_update += max(1, self.__tau(self.r[arm] + 1) - self.__tau(self.r[arm]))
-        self.r[arm] += 1
-
-    def select_arm(self):
-        n_arms = len(self.counts)
-
-        # play each arm once
-        for arm in range(n_arms):
-            if self.counts[arm] == 0:
-                self.__set_arm(arm)
-                return arm
-
-        # make sure we aren't still playing the previous arm.
-        if self.__next_update > sum(self.counts):
-            return self.__current_arm
-
-        ucb_values = [0.0 for arm in range(n_arms)]
-        total_counts = sum(self.counts)
-        for arm in xrange(n_arms):
-            bonus = self.__bonus(total_counts, self.r[arm])
-            ucb_values[arm] = self.values[arm] + bonus
-
-        chosen_arm = ind_max(ucb_values)
-        self.__set_arm(chosen_arm)
-        return chosen_arm
-
-    def update(self, chosen_arm, reward):
-        n = self.counts[chosen_arm]
-        self.counts[chosen_arm] = n + 1
-
-        value = self.values[chosen_arm]
-        if n == 0:
-            self.values[chosen_arm] = reward
-        else:
-            self.values[chosen_arm] = ((n - 1) / float(n)) * value + (1 / float(n)) * reward
+  def __init__(self, alpha, counts, values):
+    """
+    UCB2 algorithm. Implementation of the slides at:
+    http://lane.compbio.cmu.edu/courses/slides_ucb.pdf
+    """
+    self.alpha = alpha
+    self.counts = counts
+    self.values = values
+    self.__current_arm = 0
+    self.__next_update = 0
+    return
+
+  def initialize(self, n_arms):
+    self.counts = [0 for col in range(n_arms)]
+    self.values = [0.0 for col in range(n_arms)]
+    self.r = [0 for col in range(n_arms)]
+    self.__current_arm = 0
+    self.__next_update = 0
+
+  def __bonus(self, n, r):
+    tau = self.__tau(r)
+    bonus = math.sqrt((1. + self.alpha) * math.log(math.e * float(n) / tau) / (2 * tau))
+    return bonus
+
+  def __tau(self, r):
+    return int(math.ceil((1 + self.alpha) ** r))
+
+  def __set_arm(self, arm):
+    """
+    When choosing a new arm, make sure we play that arm for
+    tau(r+1) - tau(r) episodes.
+    """
+    self.__current_arm = arm
+    self.__next_update += max(1, self.__tau(self.r[arm] + 1) - self.__tau(self.r[arm]))
+    self.r[arm] += 1
+
+  def select_arm(self):
+    n_arms = len(self.counts)
+
+    # play each arm once
+    for arm in range(n_arms):
+      if self.counts[arm] == 0:
+        self.__set_arm(arm)
+        return arm
+
+    # make sure we aren't still playing the previous arm.
+    if self.__next_update > sum(self.counts):
+      return self.__current_arm
+
+    ucb_values = [0.0 for arm in range(n_arms)]
+    total_counts = sum(self.counts)
+    for arm in xrange(n_arms):
+      bonus = self.__bonus(total_counts, self.r[arm])
+      ucb_values[arm] = self.values[arm] + bonus
+
+    chosen_arm = ind_max(ucb_values)
+    self.__set_arm(chosen_arm)
+    return chosen_arm
+
+  def update(self, chosen_arm, reward):
+    self.counts[chosen_arm] = self.counts[chosen_arm] + 1
+    n = self.counts[chosen_arm]
+
+    value = self.values[chosen_arm]
+    new_value = ((n - 1) / float(n)) * value + (1 / float(n)) * reward
+    self.values[chosen_arm] = new_value
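Two notes on the revised code. First, the epoch schedule: with alpha = 0.5, __tau(r) = ceil(1.5 ** r) yields epoch boundaries 1, 2, 3, 4, 6, 8, ..., so each newly chosen arm is replayed for tau(r+1) - tau(r) consecutive plays before the bonuses are recomputed. Second, the rewritten update() increments the count before computing the running mean, fixing the old version's off-by-one averaging (the old code weighted by the pre-increment count, so the second pull of an arm discarded the first observation entirely). A hypothetical driver mirroring the Hedge demo above, where the output path and alpha grid are illustrative rather than part of this commit, might be:

# Hypothetical UCB2 driver; mirrors the Hedge demo in this commit.
execfile("core.py")

import random

random.seed(1)
means = [0.1, 0.1, 0.1, 0.1, 0.9]
n_arms = len(means)
random.shuffle(means)
arms = map(lambda mu: BernoulliArm(mu), means)

f = open("algorithms/ucb/ucb2_results.tsv", "w")  # illustrative path

for alpha in [0.1, 0.3, 0.5, 0.7, 0.9]:  # illustrative grid
  algo = UCB2(alpha, [], [])
  algo.initialize(n_arms)
  results = test_algorithm(algo, arms, 5000, 250)
  for i in range(len(results[0])):
    f.write(str(alpha) + "\t")
    f.write("\t".join([str(results[j][i]) for j in range(len(results))]) + "\n")

f.close()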
@@ -17,7 +17,9 @@ def ind_max(x):
from algorithms.softmax.standard import *
from algorithms.softmax.annealing import *
from algorithms.ucb.ucb1 import *
+from algorithms.ucb.ucb2 import *
from algorithms.exp3.exp3 import *
+from algorithms.hedge.hedge import *
# # Testing framework
from testing_framework.tests import *
