In [23]:
import numpy as np
import random


# k-armed bandit

Expected reward given that action a is selected: $q_*(a) = E[R_t|A_t=a]$. 
The estimated value of action a at timestep t: $Q_t(a)$.
We want $Q_t(a)$ to be as close to $q_*(a)$ as possible.

When the reward for an action follows a probability distribution, the true value of that action is its mean reward.

$q_*(a) = E[R_t|A_t=a] = \sum_r{p(r|a)\, r}$ for discrete rewards.
For continuous rewards the sum becomes an integral: $q_*(a) = \int p(r|a)\, r \, dr$.


Our estimate $Q_t = \frac{\sum_{i=1}^{t-1}R_i}{t-1}$

Here $Q_t$ is based only on the rewards received up to time $t-1$, because the reward for time $t$ is not known until after the action at time $t$ has been chosen.

## Action-value method

$Q_t(a) = \frac{\text{sum of rewards when a taken prior to t}}{\text{number of times a taken prior to t }} $


### Greedy estimate of action

$A_t = \arg\max_a Q_t(a)$

### Incremental update
$\text{New Estimate}\leftarrow\text{Old Estimate} + \text{Step Size [Target - Old Estimate]}$ 
 

In [38]:
# Stationary problem: i.e. the reward distributions do not change over time

def bandit(action):
    """
    Draw one reward from the arm selected by ``action``.

    The bandit has three stationary arms:

    * 0 -- normal distribution with mean 0 and standard deviation 3
    * 1 -- normal distribution with mean 1 and standard deviation 2
    * 2 -- a single Bernoulli trial with success probability 0.5

    Only the selected arm is sampled, so each call consumes exactly one
    draw from NumPy's global random state instead of three.

    Parameters
    ----------
    action : int
        Index of the arm to pull. Normal sequence indexing rules apply.

    Returns
    -------
    float or int
        The sampled reward (a float for arms 0 and 1, 0 or 1 for arm 2).

    Raises
    ------
    IndexError
        If ``action`` does not index one of the three arms.
    """
    # Samplers are wrapped in lambdas so that nothing is drawn until the
    # chosen arm is actually called.
    arms = (
        lambda: np.random.normal(0, 3),
        lambda: np.random.normal(1, 2),
        lambda: np.random.binomial(1, 0.5),
    )
    return arms[action]()


# Simple bandit algorithm (Sutton & Barto): epsilon-greedy action selection
# with sample-average value estimates.

# Q[a] holds the running estimate of the expected reward of arm a.
Q = np.zeros(3)

# N[a] counts how often arm a has been pulled so far.
N = np.zeros(3)

# Probability of exploring (uniformly random arm) instead of exploiting.
epsilon = 0.1
for i in range(10000):
    if random.random() > epsilon:
        # Exploit: pick the arm with the highest current estimate.
        A = np.argmax(Q)
    else:
        # Explore: pick one of the three arms uniformly at random.
        A = random.randint(0,2)
    R = bandit(A)

    # Log every 100th step only, to keep the cell output short.
    if not i % 100:
        print("Action taken: ", A, "Reward: ", R)

    # One more pull of arm A ...
    N[A] += 1

    # ... then move its estimate toward the new reward with step size
    # 1/N[A], which keeps Q[A] equal to the sample average of its rewards.
    Q[A] = Q[A] + (R - Q[A]) / N[A]

print(Q)

Action taken:  0 Reward:  1.219302863871272
Action taken:  0 Reward:  -3.4662487364323376
Action taken:  1 Reward:  -0.5173949640217113
Action taken:  1 Reward:  3.2843830914315104
Action taken:  1 Reward:  2.7991695889607344
Action taken:  1 Reward:  2.057964710092238
Action taken:  1 Reward:  -0.6945817891563733
Action taken:  1 Reward:  0.7228260095526183
Action taken:  1 Reward:  -1.534522317128514
Action taken:  1 Reward:  1.3072859745825902
Action taken:  1 Reward:  -0.6050778500792697
Action taken:  1 Reward:  1.8016258605481874
Action taken:  1 Reward:  2.5667613402513325
Action taken:  1 Reward:  0.7660940051634154
Action taken:  1 Reward:  -0.8661311194921166
Action taken:  1 Reward:  0.4345054505457312
Action taken:  1 Reward:  1.6780679978541009
Action taken:  1 Reward:  -1.372085066568621
Action taken:  1 Reward:  1.9841120510319377
Action taken:  1 Reward:  5.030377641921631
Action taken:  1 Reward:  2.81981477428809
Action taken:  1 Reward:  -0.7061061828426729
Action ta

## Non-stationary problem

If the reward distributions change over time, we can use a fixed (constant) step size instead of $1/n$. This weights recent rewards more heavily than older ones.

## Optimistic Initial Values

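Setting the initial estimates optimistically high encourages exploration at the start. However, this can cause problems on non-stationary problems, because the extra exploration only happens at the beginning. Often we also don't know in advance what counts as an optimistic value.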

In [58]:
def epsilon_bandit(Q=None, e=0.1, n=10000):
    """
    Run an epsilon-greedy agent with sample-average updates on ``bandit``.

    At every step the agent pulls the arm with the highest current estimate
    with probability ``1 - e`` and a uniformly random arm otherwise, then
    moves that arm's estimate toward the observed reward with step size
    ``1 / N[A]``. Every 100th step is printed.

    Parameters
    ----------
    Q : array-like of float, optional
        Initial value estimates, one per arm (pass optimistic values to
        encourage early exploration). Defaults to three zeros. The input is
        copied, so the caller's array is never modified and the default is
        not shared between calls.
    e : float
        Exploration probability (epsilon).
    n : int
        Number of steps to run. The default of 10000 matches the number of
        steps this function previously ran regardless of ``n``.

    Returns
    -------
    numpy.ndarray
        The value estimates after ``n`` steps.
    """
    # Private float copy: avoids mutating the caller's array / a shared
    # default, and guarantees float arithmetic for the incremental update.
    Q = np.zeros(3) if Q is None else np.array(Q, dtype=float)
    d = len(Q)

    # Number of times each arm has been pulled.
    N = np.zeros(d)

    for i in range(n):
        A = np.argmax(Q) if random.random() > e else np.random.choice(d)
        R = bandit(A)

        if i % 100 == 0:
            print("Action taken: ", A, "Reward: ", R)

        # Increment denominator
        N[A] += 1

        # Sample-average update: Q(A) <- Q(A) + 1/N(A) * [R - Q(A)].
        # On the first pull of an arm N[A] is 1, so the initial estimate is
        # replaced outright by the first observed reward.
        Q[A] += (R - Q[A]) / N[A]

    return Q

q = epsilon_bandit(Q=np.array([2.,2.,2.]))
q

Action taken:  0 Reward:  -1.90661875904948
Action taken:  2 Reward:  1
Action taken:  1 Reward:  2.0233167441441884
Action taken:  1 Reward:  2.051044473220472
Action taken:  1 Reward:  2.4008229062289335
Action taken:  1 Reward:  -2.966628415982666
Action taken:  1 Reward:  1.6921276872579605
Action taken:  1 Reward:  -1.7393040385207494
Action taken:  1 Reward:  3.6389524595195444
Action taken:  0 Reward:  -4.456834828382192
Action taken:  1 Reward:  0.7390826604431331
Action taken:  1 Reward:  -0.8772018830684871
Action taken:  1 Reward:  2.8448751278749342
Action taken:  1 Reward:  1.233988647681002
Action taken:  1 Reward:  3.9130542487885793
Action taken:  1 Reward:  -3.5142072238911215
Action taken:  1 Reward:  1.7099071849030385
Action taken:  1 Reward:  0.8486734441944267
Action taken:  1 Reward:  2.604347360146787
Action taken:  1 Reward:  -0.44959265286405015
Action taken:  1 Reward:  -0.07331565787503624
Action taken:  1 Reward:  0.039906544562063107
Action taken:  1 Rewar

array([-0.13302256,  0.98904705,  0.48723404])

## UCB Upper Confidence Bound

Estimate a confidence interval for each action's value from our samples, then pick the action with the highest upper confidence bound.

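$A_t = \arg\max_a \left[ Q_t(a) + c\sqrt{\frac{\ln t}{N_t(a)}} \right]$

(The implementation below uses $N_t(a) + 1$ in the denominator to avoid dividing by zero for actions that have not been tried yet.)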

In [79]:
def ucb(Q=None, c=2, n=10000):
    """
    Run upper-confidence-bound action selection on ``bandit``.

    At step ``t`` (counted from 1) the agent pulls the arm maximising
    ``Q[a] + c * sqrt(ln(t) / (N[a] + 1))`` and then moves that arm's
    estimate toward the observed reward with step size ``1 / N[A]``.
    Every 100th step is printed.

    Parameters
    ----------
    Q : array-like of float, optional
        Initial value estimates, one per arm. Defaults to three zeros. The
        input is copied, so the caller's array is never modified and the
        default is not shared between calls.
    c : float
        Exploration weight applied to the confidence term.
    n : int
        Number of steps to run (t = 1 .. n).

    Returns
    -------
    numpy.ndarray
        The value estimates after ``n`` steps.
    """
    # Private float copy: avoids mutating the caller's array / a shared
    # default, and guarantees float arithmetic for the incremental update.
    Q = np.zeros(3) if Q is None else np.array(Q, dtype=float)
    d = len(Q)

    # Number of times each arm has been pulled.
    N = np.zeros(d)

    # t starts at 1 so that log(t) is defined; at t == 1 the bonus is zero.
    for t in range(1, n + 1):
        # Confidence bonus for all d arms at once. N + 1 (rather than N)
        # keeps the denominator non-zero for arms that were never pulled.
        bonus = c * np.sqrt(np.log(t) / (N + 1))
        A = np.argmax(Q + bonus)
        R = bandit(A)

        if t % 100 == 0:
            print("Action taken: ", A, "Reward: ", R)

        # Increment denominator
        N[A] += 1

        # Sample-average update: Q(A) <- Q(A) + 1/N(A) * [R - Q(A)].
        Q[A] += (R - Q[A]) / N[A]

    return Q

q = ucb()
q

Action taken:  1 Reward:  2.926303218235967
Action taken:  1 Reward:  -0.6782447838138157
Action taken:  1 Reward:  0.4006409129036955
Action taken:  1 Reward:  2.204931908553979
Action taken:  1 Reward:  0.7932834958605895
Action taken:  1 Reward:  0.9738712151018813
Action taken:  1 Reward:  1.969108856151872
Action taken:  1 Reward:  1.2122579993527216
Action taken:  1 Reward:  5.038366810074165
Action taken:  1 Reward:  -0.9071635456073599
Action taken:  1 Reward:  2.891078106598414
Action taken:  1 Reward:  -1.5544866365924364
Action taken:  1 Reward:  1.7421427717961158
Action taken:  1 Reward:  0.9033339659020148
Action taken:  1 Reward:  3.452605637345256
Action taken:  1 Reward:  0.04736580165337634
Action taken:  2 Reward:  1
Action taken:  1 Reward:  -0.7639604681841909
Action taken:  1 Reward:  0.5851406076081012
Action taken:  1 Reward:  4.678176925026728
Action taken:  1 Reward:  1.3421557894835343
Action taken:  1 Reward:  -0.5358740714164438
Action taken:  1 Reward:  2.

array([-4.13603749,  1.0173002 ,  0.48148148])