In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Experiment configuration for the epsilon-greedy bandit study.
num_bandits = 10     # number of arms in every bandit problem
num_steps = 1000     # action selections per episode
num_episodes = 1000  # independent bandit problems simulated per epsilon

# Seed NumPy's global generator (the one used by the np.random.* calls below)
# so that a fresh "Restart & Run All" reproduces the same numbers.
random_seed = 42
np.random.seed(random_seed)

def choose_action(q_values, epsilon):
    """Select an arm with the epsilon-greedy rule.

    Parameters
    ----------
    q_values : sequence of float
        Current value estimate for each arm.
    epsilon : float
        Probability of exploring, i.e. of picking an arm uniformly at random.

    Returns
    -------
    int
        Index of the chosen arm.
    """
    if np.random.random() < epsilon:
        # Explore. The number of arms is taken from q_values itself instead of
        # a module-level global, so the function works for any estimate size.
        return np.random.choice(len(q_values))
    # Exploit. np.argmax returns the first index when several estimates tie.
    return np.argmax(q_values)

def run_bandit(epsilon):
    """Run the epsilon-greedy experiment for a single value of epsilon.

    Simulates ``num_episodes`` independent bandit problems of ``num_steps``
    action selections each. Every episode draws fresh true arm values from a
    standard normal distribution, and rewards are drawn from a unit-variance
    normal centred on the chosen arm's true value. Value estimates are
    incremental sample averages.

    Parameters
    ----------
    epsilon : float
        Exploration probability forwarded to ``choose_action``.

    Returns
    -------
    tuple of three numpy arrays
        * shape ``(num_bandits,)``: reward collected from each arm divided by
          the total number of action selections (``num_episodes * num_steps``).
          The entries sum to the overall average reward per step.
        * shape ``(num_steps,)``: percentage of episodes in which the optimal
          arm was chosen at each time step.
        * shape ``(num_steps,)``: reward at each time step averaged over
          episodes.
    """
    total_rewards = np.zeros(num_bandits)
    optimal_actions = np.zeros(num_steps)
    rewards_per_step = np.zeros(num_steps)  # Reward summed over episodes, per time step
    for _ in range(num_episodes):
        # A new bandit problem for every episode.
        q_true_values = np.random.normal(loc=0, scale=1, size=num_bandits)
        q_values = np.zeros(num_bandits)
        action_counts = np.zeros(num_bandits)
        optimal_action = np.argmax(q_true_values)
        for step in range(num_steps):
            action = choose_action(q_values, epsilon)
            reward = np.random.normal(loc=q_true_values[action], scale=1)
            total_rewards[action] += reward
            action_counts[action] += 1
            # Incremental sample-average update of the chosen arm's estimate.
            q_values[action] += (reward - q_values[action]) / action_counts[action]
            if action == optimal_action:
                optimal_actions[step] += 1
            rewards_per_step[step] += reward

    reward_share_per_arm = total_rewards / (num_episodes * num_steps)
    # optimal_actions[step] counts episodes, so it is normalised by the number
    # of episodes only. Dividing by num_steps as well would cap every entry at
    # 100 / num_steps percent.
    percent_optimal_per_step = optimal_actions / num_episodes * 100
    mean_reward_per_step = rewards_per_step / num_episodes
    return reward_share_per_arm, percent_optimal_per_step, mean_reward_per_step

In [3]:
# Epsilon values whose per-step reward curves are kept for comparison with
# the best epsilon.
comparison_epsilons = np.array([0.01, 0.1])

epsilons = np.logspace(-4, 0, num=30)[1:]
# Insert 0 at the beginning of the array
epsilons = np.insert(epsilons, 0, 0)
# The log-spaced grid contains neither 0.01 nor 0.1, so add them explicitly;
# otherwise no comparison curve would ever be stored. np.unique returns the
# values sorted, so the grid stays in ascending order.
epsilons = np.unique(np.concatenate((epsilons, comparison_epsilons)))

average_rewards = []
percentage_optimal_actions = []
reward_per_step_best_epsilon = None
best_avg_reward = float('-inf')
best_epsilon = None
reward_per_step_other_epsilons = []

for epsilon in epsilons:
    print("Current epsilon:", epsilon)
    avg_reward, opt_actions, rewards_per_step = run_bandit(epsilon)
    average_rewards.append(avg_reward)
    percentage_optimal_actions.append(opt_actions)
    # Check if current epsilon gives the best average reward
    mean_reward = np.mean(rewards_per_step)
    if mean_reward > best_avg_reward:
        best_avg_reward = mean_reward
        best_epsilon = epsilon
        reward_per_step_best_epsilon = rewards_per_step
    # Store rewards per time step for the comparison epsilons. The grid is
    # ascending, so the curves are appended in the order 0.01, 0.1. A
    # tolerance-based match avoids relying on exact float equality.
    if np.isclose(epsilon, comparison_epsilons).any():
        reward_per_step_other_epsilons.append(rewards_per_step)

Current epsilon: 0.0
Current epsilon: 0.00013738237958832623
Current epsilon: 0.00018873918221350977
Current epsilon: 0.0002592943797404667
Current epsilon: 0.0003562247890262444
Current epsilon: 0.0004893900918477494
Current epsilon: 0.0006723357536499335
Current epsilon: 0.0009236708571873865
Current epsilon: 0.0012689610031679222


KeyboardInterrupt: 

In [None]:
fig = plt.figure(figsize=(12, 8))

# Plot average reward vs epsilon.
# Each entry of average_rewards is a per-arm array whose elements add up to
# the average reward per step, so the arms are summed; taking the mean would
# under-report the reward by a factor of num_bandits.
ax_reward = fig.add_subplot(2, 2, 1)
ax_reward.plot(epsilons, [np.sum(reward) for reward in average_rewards])
ax_reward.set_xlabel('Epsilon')
ax_reward.set_ylabel('Average Reward')
ax_reward.set_title('Average Reward vs Epsilon')

# Plot percentage of optimal actions taken vs epsilon
# (peak value of each per-step optimal-action curve).
ax_optimal = fig.add_subplot(2, 2, 2)
ax_optimal.plot(epsilons, [np.max(actions) for actions in percentage_optimal_actions])
ax_optimal.set_xlabel('Epsilon')
ax_optimal.set_ylabel('Percentage of Optimal Actions Taken')
ax_optimal.set_title('Percentage of Optimal Actions Taken vs Epsilon')

# Plot average reward per time step for the best epsilon
ax_steps = fig.add_subplot(2, 1, 2)
ax_steps.plot(range(num_steps), reward_per_step_best_epsilon, label=f'Best Epsilon ({best_epsilon})')
# zip stops at the shorter sequence, so a comparison curve that was never
# recorded is skipped instead of raising IndexError.
for epsilon, reward_curve in zip([0.01, 0.1], reward_per_step_other_epsilons):
    ax_steps.plot(range(num_steps), reward_curve, label=f'Epsilon={epsilon}')
ax_steps.set_xlabel('Time Step')
ax_steps.set_ylabel('Average Reward')
ax_steps.set_title('Average Reward per Time Step')
ax_steps.legend()
fig.tight_layout()
plt.show()