# 10-armed Testbed

In [1]:
import numpy as np
from tqdm import trange
import matplotlib
import matplotlib.pyplot as plt

# NOTE: the `src` package must be importable from the notebook's working directory
# for the import below to succeed. A machine-specific absolute path used to be
# appended to sys.path here; avoid hardcoding local paths in the notebook.
from src.bandit import Bandit

# Select the non-interactive Agg backend: every figure in this notebook is written
# to disk with plt.savefig rather than displayed.
matplotlib.use('Agg')




SyntaxError: duplicate argument 'use_gradient' in function definition (bandit.py, line 9)

In [16]:
def simulate(runs, times, bandits):
    # region Summary
    """
    Run every bandit for ``runs`` independent runs of ``times`` steps each and average the outcome over the runs.

    Before each run the bandit is re-initialised via ``bandit.initialize()``. At every step the bandit picks an
    action with ``bandit.act()``, the reward returned by ``bandit.step(action)`` is recorded, and the step is
    marked as optimal when the chosen action equals ``bandit.optimal_action``.
    :param runs: Number of independent runs per bandit
    :param times: Number of time steps in each run
    :param bandits: Sequence of bandit objects exposing ``initialize``, ``act``, ``step`` and ``optimal_action``
    :return: Tuple ``(optimal_action_rate, average_reward)``; both are arrays of shape ``(len(bandits), times)``
             obtained by averaging over the runs axis
    """
    # endregion Summary

    # region Body

    # Both result arrays share the layout (bandit, run, time step)
    result_shape = (len(bandits), runs, times)
    rewards = np.zeros(result_shape)
    optimal_action_counts = np.zeros(result_shape)

    for bandit_index, bandit in enumerate(bandits):
        # trange shows one progress bar per bandit
        for run_index in trange(runs):
            # Start every run from a freshly initialised bandit
            bandit.initialize()

            for step_index in range(times):
                chosen_action = bandit.act()

                # Record the reward obtained for the chosen action
                rewards[bandit_index, run_index, step_index] = bandit.step(chosen_action)

                # 1 marks a step on which the optimal action was chosen, 0 otherwise
                optimal_action_counts[bandit_index, run_index, step_index] = (
                    1.0 if chosen_action == bandit.optimal_action else 0.0
                )

    # Average over the runs axis to get per-step curves for each bandit
    optimal_action_rate = optimal_action_counts.mean(axis=1)
    average_reward = rewards.mean(axis=1)
    return optimal_action_rate, average_reward

    # endregion Body

## 1. Reward Distribution

In [16]:
# Illustrative reward distribution: a (200, 10) block of standard-normal samples,
# with a separately drawn standard-normal offset added to each of the 10 columns
plt.violinplot(
    dataset=np.add(np.random.randn(200, 10), np.random.randn(10)),
)
plt.title("Figure 2.1")
plt.xlabel("Action")
plt.ylabel("Reward distribution")

# Write the figure to disk, then close it
plt.savefig("../generated_images/figure_2_1.png")
plt.close()

## 2. Greedy Action Selection VS ε-greedy Action Selection

In [5]:


# Exploration rates to compare: purely greedy (0) against two ε-greedy settings (0.1 and 0.01)
epsilons = [0, 0.1, 0.01]

# Build one bandit per exploration rate.
# NOTE(review): this experiment is described as using the sample-average method, yet every
# bandit is constructed with use_gradient=True — confirm against the Bandit constructor.
bandits = [
    Bandit(epsilon=exploration_rate, use_gradient=True)
    for exploration_rate in epsilons
]

In [None]:
# Define number of independent runs
runs = 2000

# Define number of time steps per run (named `times` to match the other sections
# of this notebook and the `times` parameter of simulate())
times = 1000

# Simulate the per-step optimal-action rate and average reward of every bandit
optimal_action_counts, rewards = simulate(runs, times, bandits)


In [17]:
# Plotting: open a new figure with figsize=(10, 20); the following cells draw the
# two stacked subplots of Figure 2.2 into it
plt.figure(figsize=(10, 20))

<Figure size 1000x2000 with 0 Axes>

In [6]:
# Top panel of Figure 2.2: average reward per step, one curve per epsilon
plt.subplot(2, 1, 1)

# The loop variable is deliberately NOT called `rewards`: reusing that name would
# overwrite the array returned by simulate() and break this cell when it is re-run
for epsilon, epsilon_rewards in zip(epsilons, rewards):
    # Raw string so "\e" is not parsed as an (invalid) escape sequence
    plt.plot(epsilon_rewards, label=r"$\epsilon = %.02f$" % epsilon)
plt.title("Figure 2.2")
plt.xlabel("Steps")
plt.ylabel("Average reward")
plt.legend()

invalid escape sequence '\e'
invalid escape sequence '\e'
invalid escape sequence '\e'
invalid escape sequence '\e'


NameError: name 'rewards' is not defined

In [None]:
# Bottom panel of Figure 2.2: optimal-action rate per step, one curve per epsilon
plt.subplot(2, 1, 2)
for epsilon, counts in zip(epsilons, optimal_action_counts):
    # Raw string so "\e" is not parsed as an (invalid) escape sequence
    plt.plot(counts, label=r"$\epsilon = %.02f$" % epsilon)
plt.xlabel("Steps")
plt.ylabel("% Optimal action")
plt.legend()

In [None]:
# Write the current figure (the two Figure 2.2 panels) to disk, then close it
plt.savefig("../generated_images/figure_2_2.png")
plt.close()

## 3. Optimistic Initial Values VS Realistic Initial Values

In [8]:
# Two bandits sharing 𝛼 = 0.1 that differ in exploration rate and initial estimates
bandits = [
    # Optimistic and greedy: ε = 0, 𝑄_1(𝑎) = 5
    Bandit(epsilon=0, initial_action_value_estimates=5, step_size=0.1),
    # Realistic and ε-greedy: ε = 0.1, 𝑄_1(𝑎) = 0
    Bandit(epsilon=0.1, initial_action_value_estimates=0, step_size=0.1),
]



Bandic class constructor print
Bandic class test print
Bandic class constructor print
Bandic class test print


In [11]:
# Experiment size: independent runs, and time steps within each run
runs, times = 2000, 1000

# Only the optimal-action rate is needed for Figure 2.3; the average rewards are discarded
optimal_action_counts, _ = simulate(runs=runs, times=times, bandits=bandits)

100%|██████████| 2000/2000 [00:37<00:00, 52.88it/s]
100%|██████████| 2000/2000 [00:36<00:00, 54.98it/s]


In [13]:
# Plotting: optimal-action rate of the optimistic-greedy bandit (index 0) against
# the realistic ε-greedy bandit (index 1).
# Raw strings so "\e" is not parsed as an (invalid) escape sequence.
plt.plot(optimal_action_counts[0], label=r"$\epsilon = 0, Q1 = 5$")
plt.plot(optimal_action_counts[1], label=r"$\epsilon = 0.1, Q1 = 0$")
plt.title("Figure 2.3")
plt.xlabel("Steps")
plt.ylabel("% Optimal action")
plt.legend()

# Write the figure to disk, then close it
plt.savefig("../generated_images/figure_2_3.png")
plt.close()


  plt.plot(optimal_action_counts[0], label="$\epsilon = 0, Q1 = 5$")
  plt.plot(optimal_action_counts[1], label="$\epsilon = 0.1, Q1 = 0$")


## 4. Upper-Confidence-Bound (UCB) Action Selection

In [3]:
# Two bandits for the UCB comparison:
#   - UCB action selection: ε = 0, 𝑐 = 2
#   - ε-greedy baseline:    ε = 0.1
# NOTE(review): this experiment is described as using the sample-average method, yet both
# bandits are constructed with use_gradient=True — confirm against the Bandit constructor.
bandits = [
    Bandit(epsilon=0, confidence_level=2, use_gradient=True),
    Bandit(epsilon=0.1, use_gradient=True),
]


Bandic class constructor print
Bandic class test print
Bandic class constructor print
Bandic class test print


In [4]:
# Experiment size: independent runs, and time steps within each run
runs, times = 2000, 1000

# Only the average rewards are needed for Figure 2.4; the optimal-action rate is discarded
_, average_rewards = simulate(runs=runs, times=times, bandits=bandits)

100%|██████████| 2000/2000 [00:47<00:00, 42.17it/s]
100%|██████████| 2000/2000 [00:38<00:00, 52.05it/s]


In [2]:
# Plotting: average reward of the UCB bandit (index 0) against the ε-greedy bandit (index 1)
matplotlib.use('Agg')

plt.plot(average_rewards[0], label="UCB $c = 2$")
# Raw string so "\e" is not parsed as an (invalid) escape sequence
plt.plot(average_rewards[1], label=r"epsilon-greedy $\epsilon = 0.1$")

plt.title("Figure 2.4")
plt.xlabel("Steps")
plt.ylabel("Average reward")
plt.legend()

# Write the figure to disk, then close it
plt.savefig("../generated_images/figure_2_4.png")
plt.close()

  plt.plot(average_rewards[1], label="epsilon-greedy $\epsilon = 0.1$")


NameError: name 'matplotlib' is not defined

## 5. Gradient Bandit Algorithms (GBA)

In [15]:
# Four gradient bandits (GBA), all with a true expected reward of 4, covering every
# combination of step size and baseline usage in this order:
#   1. 𝛼 = 0.1, average reward used as baseline
#   2. 𝛼 = 0.1, no baseline
#   3. 𝛼 = 0.4, average reward used as baseline
#   4. 𝛼 = 0.4, no baseline
bandits = [
    Bandit(use_gradient=True, step_size=alpha, use_gradient_baseline=with_baseline, true_expected_reward=4)
    for alpha in (0.1, 0.4)
    for with_baseline in (True, False)
]


Bandic class constructor print
Bandic class test print
Bandic class constructor print
Bandic class test print
Bandic class constructor print
Bandic class test print
Bandic class constructor print
Bandic class test print


In [18]:
# Experiment size: independent runs, and time steps within each run
runs, times = 2000, 1000

# Only the optimal-action rate is needed for Figure 2.5; the average rewards are discarded
optimal_action_counts, _ = simulate(runs=runs, times=times, bandits=bandits)

  4%|▎         | 71/2000 [00:02<01:04, 29.82it/s]


KeyboardInterrupt: 

In [13]:
# Legend entries for the gradient-bandit curves: for each step size 𝛼 (0.1, then 0.4)
# the "with baseline" entry comes before the "without baseline" one
labels = [
    r"$\alpha = %s$ %s baseline" % (step_size_text, baseline_mode)
    for step_size_text in ("0.1", "0.4")
    for baseline_mode in ("with", "without")
]

In [14]:
# Plotting: one optimal-action curve per gradient bandit, paired with its legend label.
# Fail fast if the kernel holds stale results (e.g. the simulation cell was interrupted
# or skipped), instead of plotting curves that belong to a different experiment.
assert len(optimal_action_counts) == len(labels) == len(bandits), (
    "optimal_action_counts, labels and bandits must each have one entry per bandit"
)
for counts, label in zip(optimal_action_counts, labels):
    plt.plot(counts, label=label)

plt.title("Figure 2.5")
plt.xlabel("Steps")
plt.ylabel("% Optimal action")
plt.legend()

# NOTE(review): the other figures are saved as figure_2_<n>.png; this file name uses a
# dot ("figure_2.5.png"). Left unchanged in case something references it — confirm.
plt.savefig("../generated_images/figure_2.5.png")
plt.close()