In [None]:
import numpy as np

In [None]:
def single_bandit(n_arm, q_true, q_est, epsilon, steps, alpha, stationary=True):
    """Simulate one run of an epsilon-greedy agent on an ``n_arm``-armed bandit.

    Each step the agent explores (uniformly random arm) with probability
    ``epsilon`` and otherwise exploits ``argmax(q_est)``. The reward is the
    true value of the chosen arm plus unit-variance Gaussian noise.

    Parameters
    ----------
    n_arm : int
        Number of arms.
    q_true : numpy.ndarray
        True action values, one per arm. When ``stationary`` is False this
        array is modified IN PLACE: every step adds an independent Gaussian
        increment (std 0.01) to each entry.
    q_est : numpy.ndarray
        Initial action-value estimates, one per arm. Modified IN PLACE as the
        agent learns.
    epsilon : float
        Exploration probability.
    steps : int
        Number of time steps to simulate.
    alpha : float or None
        Constant step size for the estimate update. Any falsy value (``None``
        or ``0``) selects the sample-average update ``1 / N(a)`` instead.
    stationary : bool, default True
        If False, ``q_true`` performs a random walk as described above.

    Returns
    -------
    rewards : list
        Reward received at each step.
    opt_acts : list of int
        1 if the action taken at that step was the arm with the highest
        *current* true value, else 0.
    """
    act_counts = np.zeros(n_arm)
    rewards = []
    opt_acts = []
    opt_act = np.argmax(q_true)
    for _ in range(steps):
        if not stationary:
            q_true += np.random.randn(n_arm) * 0.01
            # The best arm can change after every drift step, so it has to be
            # re-evaluated here; a value cached before the loop goes stale.
            opt_act = np.argmax(q_true)
        if np.random.rand() < epsilon:
            a = np.random.choice(n_arm)
        else:
            a = np.argmax(q_est)
        act_counts[a] += 1
        reward = q_true[a] + np.random.randn()
        rewards.append(reward)
        opt_acts.append(int(a == opt_act))
        if alpha:
            # Constant step size: exponential recency-weighted average.
            q_est[a] += alpha * (reward - q_est[a])
        else:
            # Sample average: step size 1/N(a) for the chosen arm.
            q_est[a] += 1 / act_counts[a] * (reward - q_est[a])
    return rewards, opt_acts

In [None]:
import matplotlib.pyplot as plt

# Experiment: compare epsilon-greedy agents (sample-average updates, alpha=None)
# on a stationary 10-armed bandit. Each curve is the mean over 2000 independent
# runs of 1000 steps; every run draws fresh true values from N(0, 1).
n_arm = 10
rewards = {}
opt_acts = {}
for epsilon in [0.0, 0.01, 0.1]:
    res = [
        single_bandit(n_arm, np.random.randn(n_arm), np.zeros(n_arm), epsilon, 1000, alpha=None, stationary=True)
        for i in range(2000)
    ]
    # zip(*res) regroups the per-run (rewards, opt_acts) pairs into an array of
    # shape (2, runs, steps); averaging over axis 1 yields one curve per metric.
    rs_, os_ = np.array(list(zip(*res))).mean(axis=1)
    rewards[epsilon] = rs_
    opt_acts[epsilon] = os_  # stored as a fraction in [0, 1]


plt.figure(figsize=(10, 12))

plt.subplot(2, 1, 1)
for k, v in rewards.items():
    plt.plot(v, label=f'epsilon = {k}')
plt.xlabel('steps')
plt.ylabel('average reward')
plt.legend()

plt.subplot(2, 1, 2)
for k, v in opt_acts.items():
    # The axis is labelled in percent, so convert the stored fraction.
    plt.plot(100 * v, label=f'epsilon = {k}')
plt.xlabel('steps')
plt.ylabel('% optimal action')
plt.legend()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Experiment: sample-average (alpha=None) vs constant step size (alpha=0.1) on a
# non-stationary 10-armed bandit whose true values start at zero, with
# epsilon = 0.1. Curves are means over n_runs independent runs.
n_arm = 10
n_runs = 2000
n_steps = 100000
rewards = {}
opt_acts = {}
for alpha in [None, 0.1]:
    # Keep only running per-step sums. Holding every run in memory would need a
    # (2, n_runs, n_steps) float64 array (~3.2 GB) on top of the Python lists.
    reward_sum = np.zeros(n_steps)
    opt_sum = np.zeros(n_steps)
    for run_idx in range(n_runs):
        run_rewards, run_opts = single_bandit(n_arm, np.zeros(n_arm), np.zeros(n_arm), 0.1, n_steps, alpha, False)
        reward_sum += run_rewards
        opt_sum += run_opts
    rewards[alpha] = reward_sum / n_runs
    opt_acts[alpha] = opt_sum / n_runs  # stored as a fraction in [0, 1]

fig, (ax_reward, ax_opt) = plt.subplots(2, 1, figsize=(10, 12))

for k, v in rewards.items():
    ax_reward.plot(v, label=f'alpha = {k}')
ax_reward.set_xlabel('steps')
ax_reward.set_ylabel('average reward')
ax_reward.legend()

for k, v in opt_acts.items():
    # The axis is labelled in percent, so convert the stored fraction.
    ax_opt.plot(100 * v, label=f'alpha = {k}')
ax_opt.set_xlabel('steps')
ax_opt.set_ylabel('% optimal action')
ax_opt.legend()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Experiment: optimistic initial estimates with a purely greedy agent
# (Q_1 = 5, eps = 0) vs zero-initialised estimates with eps = 0.1, both using a
# constant step size of 0.1 on a stationary 10-armed bandit.
# Each curve is the mean over 2000 runs of 1000 steps.
n_arm = 10
configs = [(5, 0.0), (0, 0.1)]  # (initial estimate, epsilon); single source of truth
opt_acts = {}
for q_init, eps in configs:
    res = [
        single_bandit(n_arm, np.random.randn(n_arm), np.zeros(n_arm) + q_init, eps, 1000, 0.1, True)
        for i in range(2000)
    ]
    rs_, os_ = np.array(list(zip(*res))).mean(axis=1)
    # Key by the full configuration so two settings sharing the same epsilon
    # cannot overwrite each other, and the label needs no second lookup list.
    opt_acts[(q_init, eps)] = os_  # stored as a fraction in [0, 1]

plt.figure(figsize=(10, 6))

for (q_init, eps), curve in opt_acts.items():
    # The axis is labelled in percent, so convert the stored fraction.
    plt.plot(100 * curve, label=f'eps = {eps}, $Q_1$ = {q_init}')
plt.xlabel('steps')
plt.ylabel('% optimal action')
plt.legend()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Experiment: average reward of sample-average (alpha=None) vs constant step
# size (alpha=0.1) agents on a non-stationary 10-armed bandit, epsilon = 0.1,
# averaged over 2000 runs of 10000 steps.
n_arm = 10
rewards = {}
opt_acts = {}
for alpha in (None, 0.1):
    res = [
        single_bandit(
            n_arm=n_arm,
            q_true=np.zeros(n_arm),
            q_est=np.zeros(n_arm),
            epsilon=0.1,
            steps=10000,
            alpha=alpha,
            stationary=False,
        )
        for _ in range(2000)
    ]
    # Shape (2, runs, steps): index 0 holds rewards, index 1 the optimal-action flags.
    stacked = np.array(list(zip(*res)))
    rs_, os_ = stacked.mean(axis=1)
    rewards[alpha], opt_acts[alpha] = rs_, os_

fig, ax = plt.subplots(figsize=(10, 6))

for step_size, curve in rewards.items():
    ax.plot(curve, label=f'alpha = {step_size}')
ax.set_xlabel('steps')
ax.set_ylabel('average reward')
ax.legend()
plt.show()