In [1]:
import numpy as np
import tensorflow as tf
import tensorflow.contrib.slim as slim  #high level library

## Bandit
Four armed bandit. Lower the bandit number, higher the chance of positive reward

In [2]:
bandit_arms = [0.2, 0, -0.2, -2] #arm number 4 i.e index 3 will give the most reward
num_arms = len(bandit_arms)

def pullBandit(bandit):
    result = np.random.randn(1)
    if result > bandit:
        return 1
    else:
        return -1

In [7]:
tf.reset_default_graph()

weights = tf.Variable(tf.ones([num_arms]))
output = tf.nn.softmax(weights)

reward_holder = tf.placeholder(shape=[1], dtype=tf.float32)
action_holder = tf.placeholder(shape=[1], dtype=tf.int32)

responsible_output = tf.slice(output, action_holder, [1])
loss = -(tf.log(responsible_output)*reward_holder)
optimizer = tf.train.AdamOptimizer(learning_rate=1e-3)
update = optimizer.minimize(loss)



## Training the agent

In [10]:
total_episodes = 1000
total_reward = np.zeros(num_arms)

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    i = 0
    while i < total_episodes:
        #Choose actions according to Boltzmann distribution
        actions = sess.run(output)   #action is the bandit arm to pull
        a = np.random.choice(actions, p=actions)
        action = np.argmax(actions)
        
        reward = pullBandit(bandit_arms[action])
        _, resp, ww = sess.run([update, responsible_output, weights], feed_dict={reward_holder:[reward], action_holder: [action]})
        
        total_reward[action] += reward
        
        if i % 50 == 0:
            print("Running reward for the " + str(num_arms) + " arms of the bandit: " + str(total_reward))
        i += 1
        
print("\nThe agent thinks arm " + str(np.argmax(ww)+1) + " is the most promising....")
if np.argmax(ww) == np.argmax(-np.array(bandit_arms)):
    print("...and it was RIGHT!")
else:
    print("...and it was WRONG!")



Running reward for the 4 arms of the bandit: [-1.  0.  0.  0.]
Running reward for the 4 arms of the bandit: [-1. -1. -1. 48.]
Running reward for the 4 arms of the bandit: [-1. -1. -1. 98.]
Running reward for the 4 arms of the bandit: [ -1.  -1.  -1. 148.]
Running reward for the 4 arms of the bandit: [ -1.  -1.  -1. 196.]
Running reward for the 4 arms of the bandit: [ -1.  -1.  -1. 244.]
Running reward for the 4 arms of the bandit: [ -1.  -1.  -1. 294.]
Running reward for the 4 arms of the bandit: [ -1.  -1.  -1. 340.]
Running reward for the 4 arms of the bandit: [ -1.  -1.  -1. 388.]
Running reward for the 4 arms of the bandit: [ -1.  -1.  -1. 436.]
Running reward for the 4 arms of the bandit: [ -1.  -1.  -1. 480.]
Running reward for the 4 arms of the bandit: [ -1.  -1.  -1. 530.]
Running reward for the 4 arms of the bandit: [ -1.  -1.  -1. 580.]
Running reward for the 4 arms of the bandit: [ -1.  -1.  -1. 630.]
Running reward for the 4 arms of the bandit: [ -1.  -1.  -1. 676.]
Running

# Contextual bandits