In [None]:
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
from tqdm import tqdm

In [None]:
class Bandit:
    """One arm of a multi-armed bandit with normally distributed rewards.

    The arm keeps a running value estimate ``q`` that is updated every time
    the arm is pulled via :meth:`choose`.
    """

    def __init__(self, mean, q=0):
        """Create an arm.

        Args:
            mean: Mean of the arm's reward distribution.
            q: Initial value estimate for this arm (defaults to 0).
        """
        self.mean = mean
        # Standard deviation of the reward distribution; shared by
        # plot_distribution() and choose() so both describe the same arm.
        self.std = 1
        self.q = q
        # Number of times this arm has been pulled.
        self.n = 0
        # Optional override for the update divisor used in choose(); while it
        # is None (or any falsy value) the pull count ``n`` is used instead.
        self.alpha = None

    def plot_distribution(self):
        """Plot the reward density over mean +/- 3 standard deviations."""
        x = np.linspace(self.mean - 3*self.std, self.mean + 3*self.std, 100)
        distribution = stats.norm.pdf(x, self.mean, self.std)
        plt.plot(x, distribution)

    def choose(self):
        """Pull the arm once, update the value estimate and return the reward.

        Returns:
            The reward sampled from ``Normal(self.mean, self.std)``.
        """
        self.n += 1
        # With alpha unset this is the incremental sample-average update
        # q <- q + (1/n) * (reward - q).
        # NOTE(review): alpha is used as a divisor (1/alpha), not as a
        # conventional step size -- confirm this is the intended meaning.
        alpha = self.alpha if self.alpha else self.n
        # Sample with self.std (previously a hard-coded 1) so the sampled
        # rewards always match the distribution drawn by plot_distribution().
        reward = np.random.normal(self.mean, self.std)
        self.q = self.q + (1/alpha)*(reward - self.q)
        return reward

In [None]:
def argmax(a):
    """Return the index of the largest entry of ``a``, breaking ties at random.

    When several entries share the maximum value, one of their indices is
    drawn uniformly with ``np.random.choice``.
    """
    best_value = np.max(a)
    # First axis indices of every entry equal to the maximum.
    best_indices = np.where(a == best_value)[0]
    return np.random.choice(best_indices)

In [None]:
def select_action(q, e=None, type='greedy'):
    """Pick an arm index from the value estimates ``q``.

    Args:
        q: Sequence of current value estimates, one per arm.
        e: Exploration probability; required when ``type`` is ``'e-greedy'``.
        type: Selection strategy, either ``'greedy'`` or ``'e-greedy'``.

    Returns:
        The index of the selected arm.

    Raises:
        ValueError: If ``type`` is ``'e-greedy'`` and ``e`` is not supplied,
            or if ``type`` is not a recognised strategy.
    """
    if type == 'greedy':
        return argmax(q)
    if type == 'e-greedy':
        if e is None:
            raise ValueError('Supply e when choosing e-greedy')
        # Explore: pick any arm uniformly at random.
        if np.random.uniform() <= e:
            return np.random.choice(len(q))
        # Exploit: pick the arm with the best current estimate.
        return argmax(q)
    # Previously an unknown strategy fell through and returned None, which
    # only surfaced later as an opaque indexing error at the call site.
    raise ValueError('Unknown action selection type: {!r}'.format(type))

# Greedy Algorithm

In [None]:
# Experiment configuration shared by the experiment cells in this notebook.
repeat = 20   # number of independent bandit problems to average over
n_iter = 5    # number of pulls (steps) per problem
# Number of arms per bandit problem. This name was used without being defined
# anywhere in the notebook, so a fresh kernel failed with NameError.
# NOTE(review): 10 is an assumed value -- confirm the intended arm count.
N_ARMS = 10

# Greedy strategy: always pull the arm with the highest current estimate.
total_rewards_greedy = np.zeros(n_iter)
for _ in tqdm(range(repeat)):
    # Draw a fresh set of true arm means for every run.
    distributions = np.random.normal(0, 1, size=N_ARMS)
    bandits = [Bandit(arm_mean) for arm_mean in distributions]
    rewards = np.zeros(n_iter)
    for step in range(n_iter):
        q_values = [arm.q for arm in bandits]
        action = select_action(q_values, type='greedy')
        rewards[step] = bandits[action].choose()
    total_rewards_greedy += rewards
# Turn the per-step sums into per-step averages over all runs.
total_rewards_greedy = total_rewards_greedy/repeat

# ε-Greedy (ε = 0.1)

In [None]:
# e-greedy with e=0.1: per-step reward averaged over `repeat` runs.
# Uses repeat, n_iter and N_ARMS from the notebook namespace.
total_rewards_10 = np.zeros(n_iter)
for _ in tqdm(range(repeat)):
    # New random arm means for each run.
    arm_means = np.random.normal(0, 1, size=N_ARMS)
    bandits = [Bandit(mu) for mu in arm_means]
    for step in range(n_iter):
        estimates = [arm.q for arm in bandits]
        chosen = select_action(estimates, e=0.1, type='e-greedy')
        # Accumulate this step's reward straight into the running sum.
        total_rewards_10[step] += bandits[chosen].choose()
# Sum over runs -> mean over runs.
total_rewards_10 = total_rewards_10 / repeat

# ε-Greedy (ε = 0.01)

In [None]:
# e-greedy with e=0.01: per-step reward averaged over `repeat` runs.
# Uses repeat, n_iter and N_ARMS from the notebook namespace.
total_rewards_1 = np.zeros(n_iter)
for _ in tqdm(range(repeat)):
    # New random arm means for each run.
    arm_means = np.random.normal(0, 1, size=N_ARMS)
    bandits = [Bandit(mu) for mu in arm_means]
    for step in range(n_iter):
        estimates = [arm.q for arm in bandits]
        chosen = select_action(estimates, e=0.01, type='e-greedy')
        # Accumulate this step's reward straight into the running sum.
        total_rewards_1[step] += bandits[chosen].choose()
# Sum over runs -> mean over runs.
total_rewards_1 = total_rewards_1 / repeat

# ε-Greedy (ε = 0.5)

In [None]:
# e-greedy with e=0.5: per-step reward averaged over `repeat` runs.
# Uses repeat, n_iter and N_ARMS from the notebook namespace.
total_rewards_50 = np.zeros(n_iter)
for _ in tqdm(range(repeat)):
    # New random arm means for each run.
    arm_means = np.random.normal(0, 1, size=N_ARMS)
    bandits = [Bandit(mu) for mu in arm_means]
    for step in range(n_iter):
        estimates = [arm.q for arm in bandits]
        chosen = select_action(estimates, e=0.5, type='e-greedy')
        # Accumulate this step's reward straight into the running sum.
        total_rewards_50[step] += bandits[chosen].choose()
# Sum over runs -> mean over runs.
total_rewards_50 = total_rewards_50 / repeat

# See all the results

In [None]:
# Overlay the per-step average reward of every strategy on one figure.
curves = [
    (total_rewards_greedy, 'greedy'),
    (total_rewards_10, '0.1-greedy'),
    (total_rewards_1, '0.01-greedy'),
    (total_rewards_50, '0.5-greedy'),
]
for series, label in curves:
    plt.plot(series, label=label)
# Label the axes so the figure can be read on its own.
plt.xlabel('Step')
plt.ylabel('Average reward')
plt.title('Average reward per step by strategy')
plt.legend()
# Render explicitly so the cell does not echo the Legend object's repr.
plt.show()