In [1]:
import numpy as np
import tensorflow as tf
from matplotlib import pyplot as plt
import numpy.random as rnd

import datetime,csv

In [15]:
class contextual_bandit():
    def __init__(self):
        self.state = 0
        self.bandits = np.array([[0.2,0,-0.2,-5],
                                 [0.1,-5,1,0.25],
                                 [-5,5,5,5]])
        self.num_bandits = self.bandits.shape[0]
        self.num_actions = self.bandits.shape[1]
        
    def get_bandit(self):
        # Select random bandit
        self.state = rnd.randint(0,len(self.bandits))
        return self.state
    
    def pull_bandit(self, action):
        res = self.bandits[self.state,action]
        randnum = rnd.randn(1)
        if res>randnum:
            return 1
        else:
            return -1

In [50]:
class agent():
    def __init__(self,lr,in_size,out_size):
        self.state_in = tf.placeholder(shape=[1],dtype=tf.int32)
        
        state_in_oh = tf.contrib.slim.one_hot_encoding(self.state_in,out_size)
        #output = tf.layers.dense(state_in_oh,units=out_size,activation=tf.nn.sigmoid)
        output = tf.contrib.slim.fully_connected(state_in_oh,out_size,biases_initializer=None,activation_fn=tf.nn.sigmoid,weights_initializer=tf.ones_initializer())
        
        self.output = tf.reshape(output,[-1])
        self.chosen_action = tf.argmax(self.output,axis=0)
        
        self.reward_holder = tf.placeholder(shape=[1],dtype=tf.float32)
        self.action_holder = tf.placeholder(shape=[1],dtype=tf.int32)
        
        self.responsible_weight = tf.slice(self.output,self.action_holder,[1])
        self.loss = -tf.log(self.responsible_weight)*self.reward_holder
        self.optimize = tf.train.GradientDescentOptimizer(lr).minimize(self.loss)

In [70]:
tf.reset_default_graph()

episodes = 10000
e = 0.1

with tf.Session() as sess:
    
    cBandit = contextual_bandit()
    mAgent = agent(lr=0.01,in_size=cBandit.num_bandits,out_size=cBandit.num_actions) 
    
    weights = tf.trainable_variables()[0]
    total_rewards = np.zeros([cBandit.num_bandits,cBandit.num_actions])
    
    sess.run(tf.global_variables_initializer())
    i = 0
    while i<episodes:
        
        bandit_selected = cBandit.get_bandit()
        
        if rnd.rand(1)<e:
            action = rnd.randint(0,cBandit.num_actions,1)[0]
        else:
            action = sess.run(mAgent.chosen_action,feed_dict={
                mAgent.state_in:[bandit_selected]
            })
        reward = cBandit.pull_bandit(bandit_selected)
        
        _,ww = sess.run([mAgent.optimize,weights],feed_dict={
            mAgent.action_holder:[action],
            mAgent.reward_holder:[reward],
            mAgent.state_in:[bandit_selected]
        })
        
        total_rewards[bandit_selected,action] += reward
        if i%500==0:
            print('Mean reward for each of the ' + str(cBandit.num_bandits) + ' bandits: ' + str(np.mean(total_rewards,axis=1)))
        i += 1
        
for b in range(cBandit.num_bandits):
    print("The agent thinks action " + str(np.argmax(ww[b])+1) + " for bandit " + str(b+1) + " is the most promising....")
    if np.argmax(ww[b])==np.argmin(cBandit.bandits[b]):
        print('...and it was right!')
    else:
        print('...and it was wrong.')

Mean reward for each of the 3 bandits: [ 0.   -0.25  0.  ]
Mean reward for each of the 3 bandits: [  5.75 -42.75  37.25]
Mean reward for each of the 3 bandits: [ 16.   -85.5   80.75]
Mean reward for each of the 3 bandits: [  25.25 -126.    125.  ]
Mean reward for each of the 3 bandits: [  32.5  -168.    169.25]
Mean reward for each of the 3 bandits: [  35.25 -210.5   210.5 ]
Mean reward for each of the 3 bandits: [  35.   -255.25  252.  ]
Mean reward for each of the 3 bandits: [  42.   -299.75  288.5 ]
Mean reward for each of the 3 bandits: [  46.75 -341.    330.  ]
Mean reward for each of the 3 bandits: [  51.25 -380.    372.  ]
Mean reward for each of the 3 bandits: [  59.5  -422.25  413.  ]
Mean reward for each of the 3 bandits: [  68.75 -462.25  453.75]
Mean reward for each of the 3 bandits: [  77.   -502.75  492.  ]
Mean reward for each of the 3 bandits: [  84.25 -543.25  533.75]
Mean reward for each of the 3 bandits: [  83.75 -583.25  573.75]
Mean reward for each of the 3 bandits