## Simple Multi-armed Bandit

In [1]:
# import libraries
import tensorflow as tf
import tensorflow.contrib.slim as slim
import numpy as np

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
# Bandit arm thresholds: pullBandit draws a standard-normal sample and pays +1
# when the sample exceeds the arm's value, so a lower value means a better arm.
# With the values below, the 4th arm (-2) returns a positive reward most often.
bandit_arms = [0.2, 0, -0.2, -2]
num_arms = len(bandit_arms)

def pullBandit(bandit):
    """Pull one arm and return its reward.

    A single standard-normal sample is drawn; the reward is 1 when the sample
    is greater than ``bandit`` (the arm's threshold) and -1 otherwise.
    """
    sample = np.random.randn(1)
    return 1 if sample > bandit else -1
    
tf.reset_default_graph()

# network and feedforward: one trainable logit per arm, softmax turns them into a policy
weights = tf.Variable(tf.ones([num_arms]))
output = tf.nn.softmax(weights)  # probability of choosing each arm

# training: the chosen action and the reward it produced are fed back into the graph
reward_holder = tf.placeholder(shape = [1], dtype = tf.float32)
action_holder = tf.placeholder(shape = [1], dtype = tf.int32)

responsible_output = tf.slice(output, action_holder, [1])  # probability of selected arm
# policy-gradient loss: -log(pi(action)) * advantage; here the raw reward stands in for the advantage
loss = -(tf.log(responsible_output) * reward_holder)
optimizer = tf.train.AdamOptimizer(learning_rate = 1e-3)
update = optimizer.minimize(loss)

# set the number of episodes
total_episodes = 1000

# running reward per arm
total_reward = np.zeros(num_arms)

init = tf.global_variables_initializer()

# launching tensorflow graph
with tf.Session() as sess:
    sess.run(init)
    for i in range(total_episodes):
        actions = sess.run(output)
        # Sample the arm *index* from the policy. Sampling a probability value and
        # looking it up with argmax(actions == value) always resolves ties to the
        # first arm, and all arms are tied while the weights are still equal.
        action = np.random.choice(num_arms, p = actions)

        reward = pullBandit(bandit_arms[action])

        sess.run(update, feed_dict = {reward_holder : [reward], action_holder : [action]})

        total_reward[action] += reward
        if i % 50 == 0:
            print("Running reward for the " + str(num_arms) + " arms of the bandit : " + str(total_reward))

    # Read the weights once training is done: fetching them in the same run() call
    # as the update op does not guarantee post-update values, and this also keeps
    # `ww` defined when total_episodes is 0.
    ww = sess.run(weights)

best_arm = np.argmax(ww)
print("\nThe agent thinks arm " + str(best_arm + 1) + " is the most promising....")
# the arm with the lowest threshold is the one that pays out most often
if best_arm == np.argmin(bandit_arms):
    print("...and it was right!")
else:
    print("...and it was wrong!")



Running reward for the 4 arms of the bandit : [1. 0. 0. 0.]
Running reward for the 4 arms of the bandit : [-3. -3.  3. 12.]
Running reward for the 4 arms of the bandit : [-3. -4.  2. 24.]
Running reward for the 4 arms of the bandit : [-8. -4.  7. 36.]
Running reward for the 4 arms of the bandit : [-11.   0.  10.  52.]
Running reward for the 4 arms of the bandit : [-10.   1.   7.  67.]
Running reward for the 4 arms of the bandit : [-15.  -1.  12.  75.]
Running reward for the 4 arms of the bandit : [-20.   2.  17.  84.]
Running reward for the 4 arms of the bandit : [-18.   3.  26. 100.]
Running reward for the 4 arms of the bandit : [-20.   7.  32. 110.]
Running reward for the 4 arms of the bandit : [-22.   8.  36. 127.]
Running reward for the 4 arms of the bandit : [-24.  13.  45. 143.]
Running reward for the 4 arms of the bandit : [-25.  14.  51. 163.]
Running reward for the 4 arms of the bandit : [-26.  11.  55. 179.]
Running reward for the 4 arms of the bandit : [-23.   9.  62. 193.]


## Contextual Bandit

In [4]:
import tensorflow as tf
import tensorflow.contrib.slim as slim
import numpy as np

In [9]:
# context bandit 3 bandits composed with 4 arms
class contextual_bandit():
    """Three four-armed bandits; the index of the current bandit is the state."""

    def __init__(self):
        # Threshold table: one row per bandit, one column per arm.
        # The lowest value in a row is that bandit's best arm, so the
        # 4th, 2nd and 1st arms are optimal for the three bandits respectively.
        self.bandits = np.array([
            [0.2, 0, -0.0, -5],
            [0.1, -5, 1, 0.25],
            [-5, 5, 5, 5],
        ])
        self.num_bandits, self.num_actions = self.bandits.shape
        self.state = 0

    def getBandit(self):
        """Pick a bandit uniformly at random, store it as the state and return it."""
        self.state = np.random.randint(0, self.bandits.shape[0])
        return self.state

    def pullArm(self, action):
        """Return 1 if a standard-normal draw exceeds the arm's threshold, else -1."""
        threshold = self.bandits[self.state, action]
        draw = np.random.randn(1)
        return 1 if draw > threshold else -1
        

# policy based agent
class agent():
    """Policy-based agent: a single bias-free layer mapping a one-hot state to action values.

    lr     -- learning rate for gradient descent
    s_size -- number of states (depth of the one-hot encoding)
    a_size -- number of actions (width of the output layer)
    """

    def __init__(self, lr, s_size, a_size):
        # Forward pass: state index -> one-hot -> fully connected layer (sigmoid, no bias,
        # weights initialised to ones) -> flat vector with one value per action.
        self.state_in = tf.placeholder(dtype = tf.int32, shape = [1])
        one_hot_state = slim.one_hot_encoding(self.state_in, s_size)
        action_values = slim.fully_connected(
            one_hot_state,
            a_size,
            activation_fn = tf.nn.sigmoid,
            weights_initializer = tf.ones_initializer(),
            biases_initializer = None,
        )
        self.output = tf.reshape(action_values, [-1])
        self.chosen_action = tf.argmax(self.output, 0)

        # Training ops: the reward and the action that was taken are fed in; the
        # output value of that action is sliced out and pushed up or down by
        # minimising -log(value) * reward.
        self.reward_holder = tf.placeholder(dtype = tf.float32, shape = [1])
        self.action_holder = tf.placeholder(dtype = tf.int32, shape = [1])
        self.responsible_weight = tf.slice(self.output, self.action_holder, [1])
        self.loss = -(tf.log(self.responsible_weight) * self.reward_holder)
        sgd = tf.train.GradientDescentOptimizer(learning_rate = lr)
        self.update = sgd.minimize(self.loss)
        
# training agent
tf.reset_default_graph()

# load bandit
cBandit = contextual_bandit()
# load agent
myAgent = agent(lr = 0.001, s_size = cBandit.num_bandits, a_size = cBandit.num_actions)

# the agent's only trainable variable: the weight matrix of its fully connected layer
weights = tf.trainable_variables()[0]
total_episodes = 10000
# running reward, one row per bandit and one column per action
total_reward = np.zeros([cBandit.num_bandits, cBandit.num_actions])
# probability of taking a random (exploratory) action instead of the greedy one
e = 0.1

init = tf.global_variables_initializer()

# launching tensorflow graph
with tf.Session() as sess:
    sess.run(init)
    for i in range(total_episodes):
        s = cBandit.getBandit()
        # epsilon-greedy: explore with probability e, otherwise take the agent's best action
        if np.random.rand(1) < e:
            action = np.random.randint(cBandit.num_actions)
        else:
            action = sess.run(myAgent.chosen_action, feed_dict = {myAgent.state_in:[s]})

        reward = cBandit.pullArm(action)

        feed_dict = {myAgent.reward_holder : [reward], myAgent.action_holder : [action], myAgent.state_in : [s]}
        sess.run(myAgent.update, feed_dict = feed_dict)

        total_reward[s, action] += reward
        if i % 500 == 0:
            print("Mean reward for each of the " + str(cBandit.num_bandits) +
                  " bandits : " + str(np.mean(total_reward, axis = 1)))

    # Read the weights once training is done: fetching them in the same run() call
    # as the update op does not guarantee post-update values, and this also keeps
    # `ww` defined when total_episodes is 0.
    ww = sess.run(weights)

for a in range(cBandit.num_bandits):
    print("The agent thinks action " + str(np.argmax(ww[a]) + 1) +
          " for bandit " + str(a+1) + " is the most promising...")
    # the arm with the lowest threshold is the one that pays out most often
    if np.argmax(ww[a]) == np.argmin(cBandit.bandits[a]):
        print("...and it was right!")
    else:
        print("...and it was wrong!")

Mean reward for each of the 3bandits : [-0.25  0.    0.  ]
Mean reward for each of the 3bandits : [33.   40.   34.25]
Mean reward for each of the 3bandits : [73.   77.75 70.  ]
Mean reward for each of the 3bandits : [109.5  117.   108.25]
Mean reward for each of the 3bandits : [148.   159.   142.25]
Mean reward for each of the 3bandits : [186.5  197.5  175.75]
Mean reward for each of the 3bandits : [230.   236.   206.75]
Mean reward for each of the 3bandits : [266.75 275.5  241.5 ]
Mean reward for each of the 3bandits : [309.25 309.75 279.25]
Mean reward for each of the 3bandits : [345.   349.75 315.5 ]
Mean reward for each of the 3bandits : [385.5  387.   349.25]
Mean reward for each of the 3bandits : [424.   425.25 381.  ]
Mean reward for each of the 3bandits : [463.75 463.75 413.75]
Mean reward for each of the 3bandits : [503.5  503.25 445.5 ]
Mean reward for each of the 3bandits : [543.   540.5  481.75]
Mean reward for each of the 3bandits : [580.25 574.5  522.  ]
Mean reward for e