In [3]:
import numpy as np
import tensorflow as tf
import tensorflow.contrib.slim as slim  #high level library

## Bandit
Four armed bandit. Lower the bandit number, higher the chance of positive reward

In [2]:
bandit_arms = [0.2, 0, -0.2, -2] #arm number 4 i.e index 3 will give the most reward
num_arms = len(bandit_arms)

def pullBandit(bandit):
    result = np.random.randn(1)
    if result > bandit:    #lower arm number, higher chance of reward
        return 1
    else:
        return -1

In [7]:
tf.reset_default_graph()

weights = tf.Variable(tf.ones([num_arms]))
output = tf.nn.softmax(weights)

reward_holder = tf.placeholder(shape=[1], dtype=tf.float32)
action_holder = tf.placeholder(shape=[1], dtype=tf.int32)

responsible_output = tf.slice(output, action_holder, [1])
loss = -(tf.log(responsible_output)*reward_holder)
optimizer = tf.train.AdamOptimizer(learning_rate=1e-3)
update = optimizer.minimize(loss)



## Training the agent

In [10]:
total_episodes = 1000
total_reward = np.zeros(num_arms)

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    i = 0
    while i < total_episodes:
        #Choose actions according to Boltzmann distribution
        actions = sess.run(output)   #action is the bandit arm to pull
        a = np.random.choice(actions, p=actions)
        action = np.argmax(actions)
        
        reward = pullBandit(bandit_arms[action])
        _, resp, ww = sess.run([update, responsible_output, weights], feed_dict={reward_holder:[reward], action_holder: [action]})
        
        total_reward[action] += reward
        
        if i % 50 == 0:
            print("Running reward for the " + str(num_arms) + " arms of the bandit: " + str(total_reward))
        i += 1
        
print("\nThe agent thinks arm " + str(np.argmax(ww)+1) + " is the most promising....")
if np.argmax(ww) == np.argmax(-np.array(bandit_arms)):
    print("...and it was RIGHT!")
else:
    print("...and it was WRONG!")



Running reward for the 4 arms of the bandit: [-1.  0.  0.  0.]
Running reward for the 4 arms of the bandit: [-1. -1. -1. 48.]
Running reward for the 4 arms of the bandit: [-1. -1. -1. 98.]
Running reward for the 4 arms of the bandit: [ -1.  -1.  -1. 148.]
Running reward for the 4 arms of the bandit: [ -1.  -1.  -1. 196.]
Running reward for the 4 arms of the bandit: [ -1.  -1.  -1. 244.]
Running reward for the 4 arms of the bandit: [ -1.  -1.  -1. 294.]
Running reward for the 4 arms of the bandit: [ -1.  -1.  -1. 340.]
Running reward for the 4 arms of the bandit: [ -1.  -1.  -1. 388.]
Running reward for the 4 arms of the bandit: [ -1.  -1.  -1. 436.]
Running reward for the 4 arms of the bandit: [ -1.  -1.  -1. 480.]
Running reward for the 4 arms of the bandit: [ -1.  -1.  -1. 530.]
Running reward for the 4 arms of the bandit: [ -1.  -1.  -1. 580.]
Running reward for the 4 arms of the bandit: [ -1.  -1.  -1. 630.]
Running reward for the 4 arms of the bandit: [ -1.  -1.  -1. 676.]
Running

# Contextual bandits
 Here we introduce the concept of a state. There will be 3 four-armed bandits with the same set of actions. The different bandits indicate different states.

In [4]:
class contextual_bandit():
    def __init__(self):
        self.state = 0
        #List out our bandits. Arms 4, 2, 1 are the most optimal
        self.bandits = np.array([[0.2, 0, -0.0, -5], [0.1, -5, 1, 0.25], [-5, 5, 5, 5]])
        self.num_bandits = self.bandits.shape[0]
        self.num_actions = self.bandits.shape[1]
        
    def getBandit(self):
        self.state = np.random.randint(0, len(self.bandits))   #returns a random bandit aka random state
        return self.state
    
    def pullArm(self, action):
        bandit = self.bandits[self.state, action]
        result = np.random.randn(1)
        if result > bandit:
            return 1
        else:
            return -1

### Agent 
State as input, action as output

In [6]:
class agent():
    def __init__(self, lr, s_size, a_size):
        #These lines establish the feedforward part of the network
        
        self.state_in = tf.placeholder(shape=[1], dtype=tf.int32)
        state_in_OH = slim.one_hot_encoding(self.state_in, s_size)
        output = slim.fully_connected(state_in_OH, a_size, biases_initializer=None, \
                                     activation_fn=tf.nn.sigmoid, weights_initializer=tf.ones_initializer())
        self.output = tf.reshape(output, [-1])
        self.chosen_action = tf.argmax(self.output, 0)
        
        #training
        self.reward_holder = tf.placeholder(shape=[1], dtype=tf.float32)
        self.action_holder = tf.placeholder(shape=[1], dtype=tf.int32)
        self.responsible_weight = tf.slice(self.output, self.action_holder, [1])
        self.loss = -(tf.log(self.responsible_weight)*self.reward_holder)
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=lr)
        self.update = optimizer.minimize(self.loss)
        


### Training the agent

In [13]:
tf.reset_default_graph()

cBandit = contextual_bandit() #Load the bandits
myAgent = agent(lr=0.001, s_size=cBandit.num_bandits, a_size=cBandit.num_actions)
weights = tf.trainable_variables()[0]

total_episodes= 10000
total_reward = np.zeros([cBandit.num_bandits, cBandit.num_actions])
e= 0.1
init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    i = 0
    while i < total_episodes:

        s = cBandit.getBandit() #get a state

        if np.random.rand(1) < e:
            action = np.random.randint(cBandit.num_actions)
        else:
            action = sess.run(myAgent.chosen_action, feed_dict={myAgent.state_in:[s]})
        reward = cBandit.pullArm(action)

        feed_dict = {myAgent.reward_holder:[reward], myAgent.action_holder:[action], myAgent.state_in:[s]}
        _, ww = sess.run([myAgent.update, weights], feed_dict=feed_dict)

        total_reward[s, action] += reward

        if i % 500 == 0:
            print("Mean reward for each of the " + str(cBandit.num_bandits) + " bandits: " + str(np.mean(total_reward,axis=1)))
        i+=1

    for a in range(cBandit.num_bandits):
        print("The agent thinks action " + str(np.argmax(ww[a])+1) + " for bandit " + str(a+1) + " is the most promising....")
        if np.argmax(ww[a]) == np.argmin(cBandit.bandits[a]):
            print("...and it was RIGHT!")
        else:
            print("...and it was WRONG!")



Mean reward for each of the 3 bandits: [0.   0.   0.25]
Mean reward for each of the 3 bandits: [35.5  38.   32.25]
Mean reward for each of the 3 bandits: [72.25 79.25 66.25]
Mean reward for each of the 3 bandits: [107.25 117.5  106.  ]
Mean reward for each of the 3 bandits: [143.75 151.75 146.25]
Mean reward for each of the 3 bandits: [182.5  191.25 179.5 ]
Mean reward for each of the 3 bandits: [222.5  226.75 213.  ]
Mean reward for each of the 3 bandits: [259.75 263.25 248.75]
Mean reward for each of the 3 bandits: [294.75 302.   282.5 ]
Mean reward for each of the 3 bandits: [334.   336.75 317.  ]
Mean reward for each of the 3 bandits: [373.   371.5  354.75]
Mean reward for each of the 3 bandits: [410.25 408.   391.  ]
Mean reward for each of the 3 bandits: [444.   445.75 429.  ]
Mean reward for each of the 3 bandits: [481.25 486.   465.  ]
Mean reward for each of the 3 bandits: [520.75 526.5  497.5 ]
Mean reward for each of the 3 bandits: [557.5  559.25 534.5 ]
Mean reward for each