## Exercise 2.5

The first case uses sample averages, computed incrementally.

In [118]:
import numpy as np

def sample_average(k: int, epsilon: float, n_steps: int, constant_step: float = None, seed: int = None):
    """Run one epsilon-greedy agent on a nonstationary k-armed bandit.

    All arms start with the same true value (a single draw from [0, 3)), and
    after every step each arm's true value takes an independent Gaussian
    random-walk step (mean 0, standard deviation 0.1). The reward for an
    action is that arm's current true value; no sampling noise is added.

    Args:
        k: Number of arms.
        epsilon: Probability of picking a uniformly random arm instead of the
            greedy one.
        n_steps: Number of time steps to simulate.
        constant_step: If given, used as the fixed step size ``alpha`` for the
            value update. If ``None``, the incremental sample-average rule
            ``alpha = 1 / n[action]`` is used.
        seed: Optional seed for a private random generator, making the run
            reproducible. If ``None``, NumPy's global random stream is used,
            so ``np.random.seed`` still governs the run.

    Returns:
        A list of ``n_steps`` ints; entry ``t`` is 1 if the action taken at
        step ``t`` had the highest true value at that step, else 0.
    """
    # Without a seed, keep drawing from the global stream (same draw order as
    # before) so that callers relying on np.random.seed() are unaffected.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # action-value estimates and per-action selection counts, initialized at 0
    q_a = np.zeros(k, dtype=float)
    n = np.zeros(k, dtype=float)

    # reward distribution initialization: every arm shares one starting value
    q_star = np.ones(k, dtype=float) * rng.random_sample() * 3
    q_delta_mean = 0
    q_delta_stdev = 0.1

    optimal_selections = []

    for _ in range(n_steps):
        # epsilon-greedy selection (the exploration draw happens first)
        if rng.random_sample() > epsilon:
            action = np.argmax(q_a)
        else:
            action = rng.randint(0, k)
        reward = q_star[action]

        # Score the choice against the true values it was made under, i.e.
        # BEFORE the random-walk shift below. Any arm whose value equals the
        # maximum counts as optimal, so ties (e.g. the very first step, where
        # all arms are equal) are not mis-scored.
        was_optimal = int(q_star[action] >= q_star.max())
        optimal_selections.append(was_optimal)

        if constant_step is not None:
            alpha = constant_step
        else:
            n[action] += 1
            alpha = 1 / n[action]
        # update value estimate
        q_a[action] += alpha * (reward - q_a[action])

        # shift levers: nonstationary random walk of the true values
        q_star = q_star + rng.normal(q_delta_mean, q_delta_stdev, k)
    return optimal_selections

In [120]:
import matplotlib.pyplot as plt

n_runs = 500  # total number of independent experiments to average over

k = 10  # 10-armed bandit
epsilon = 0.1  # exploration probability for e-greedy action selection
n_steps = 10000  # time steps per experiment

# Seed NumPy's global stream so Restart & Run All reproduces the same figure.
np.random.seed(0)

# Despite the names, these accumulate the 0/1 "optimal action chosen" flags
# returned by sample_average; after dividing by n_runs each entry is the
# fraction of runs that picked the optimal arm at that step.
incremental_exp_rewards = np.zeros(n_steps)
constant_exp_rewards = np.zeros(n_steps)
for _ in range(n_runs):
    incremental_exp_rewards += sample_average(k, epsilon, n_steps)
    constant_exp_rewards += sample_average(k, epsilon, n_steps, constant_step=0.1)

incremental_exp_rewards /= n_runs
constant_exp_rewards /= n_runs

steps = np.arange(0, n_steps)

fig = plt.figure(figsize=(8, 6))
# The stored arrays are fractions in [0, 1]; scale by 100 only for display so
# the curves agree with the percent axis label.
plt.plot(steps, 100 * incremental_exp_rewards, label='sample average (alpha = 1/n)')
plt.plot(steps, 100 * constant_exp_rewards, color='r', label='constant step size (alpha = 0.1)')
plt.xlabel('step')
plt.ylabel('optimal action (%)')
plt.title('Nonstationary 10-armed bandit, e-greedy (epsilon = 0.1)')
plt.legend()
plt.show()