just some thoughts i had in the exercise:
* rewards are stochastic
* possible approach: continuously sample arms and then progressively eliminate actions with lowest expected return
* maybe apply epsilon-greedy action selection (see decision support)

In [2]:
import numpy as np
import gym
import gym_bandits
import matplotlib.pyplot as plt

In [3]:
from fastcore.basics import patch

In [4]:
class BanditGame:
    def run_n_episodes(self, strategy, n_episodes: int, verbose: bool, strategy_args = []):
        # Setup run
        # np.random.seed(42)  # make runs deterministic for numpy random number generator
        self.env = gym.make("BanditTenArmedGaussian-v0")
        # self.env.seed(34)  # make each run the same
        self.observation = self.env.reset()
        self.rewards = []
        self.average_rewards = np.zeros(self.env.action_space.n)
        self.nr_steps_per_action = np.zeros(self.env.action_space.n)
        self.n_episodes = n_episodes

        if verbose:
            print("observation space:", self.env.observation_space.n, "dimensional")
            print("action space:", self.env.action_space.n, "dimensional")

        for self.i_episode in range(self.n_episodes):

            if verbose:
                print("episode Number is", self.i_episode)

            action = strategy(*strategy_args)

            if verbose:
                print("action is", action)

            self.observation, self.reward, self.done, self.info = self.env.step(action)
            self.rewards.append(self.reward)
            self.average_rewards[action] += self.reward / self.n_episodes

            if verbose:
                print("observation space is: ", self.observation)
                print("reward variable is: ", self.reward)
                print("done flag is: ", self.done)
                print("info variable is: ", self.info)
        if verbose:
            print("sum of rewards: " + str(np.sum(self.rewards)))
        return np.sum(self.rewards)
    
    def run_n_sims(self, strategy, n_sims, n_episodes: int, verbose: bool, strategy_args = []):
        self.reward_sims = np.zeros(n_sims)

        # make strategies comparable
        # np.random.seed(42)
        # self.env.seed(34)  

        for i_sims in range(n_sims): 
            reward_sim = self.run_n_episodes(strategy, n_episodes, verbose, strategy_args)
            self.reward_sims[i_sims] = reward_sim
        print(f"Average reward per sim is {self.reward_sims.mean()} with sd {self.reward_sims.std()}")


In [5]:
@patch
def random_uniform(self: BanditGame):
    return self.i_episode % self.env.action_space.n

In [6]:
bg = BanditGame()
bg.run_n_episodes(bg.random_uniform, 1000, False)

  deprecation(
  deprecation(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.deprecation(
  if not isinstance(done, (bool, np.bool8)):


169.35060480902462

In [7]:
bg.run_n_sims(bg.random_uniform, 1000, 1000, False)

Average reward per sim is -16.87930658481435 with sd 317.9285680140441


In [8]:
@patch
def explore_then_exploit(self: BanditGame, exploration_factor: float):
    assert 0 <= exploration_factor and exploration_factor <= 1
    n_exploration_eps = self.n_episodes * exploration_factor
    if self.i_episode <= n_exploration_eps:
        return self.i_episode % self.env.action_space.n
    else:
        return self.average_rewards.argmax()

In [9]:
bg.run_n_episodes(bg.explore_then_exploit, 1000, False, [0.1])

1241.7556901403777

In [10]:
bg.run_n_sims(bg.explore_then_exploit, 1000, 1000, False, [rate])

Average reward per sim is 1012.0732466514124 with sd 583.3513174223067


In [13]:
for i in range(0, 15): # hyperparameter optimization
    rate = i / 100
    print('rate:', rate)
    bg.run_n_sims(bg.explore_then_exploit, 1000, 1000, False, [rate])

rate: 0.0
Average reward per sim is 1037.462754173668 with sd 604.975693308411
rate: 0.01
Average reward per sim is 1225.074402403252 with sd 627.9652149188141
rate: 0.02
Average reward per sim is 1322.9014369669349 with sd 660.1153122602763
rate: 0.03
Average reward per sim is 1344.6597638918759 with sd 630.6822931766918
rate: 0.04
Average reward per sim is 1380.5978193716016 with sd 630.4201131197293
rate: 0.05
Average reward per sim is 1363.3525800380448 with sd 607.5578162951064
rate: 0.06
Average reward per sim is 1323.1003628029805 with sd 591.5058546184163
rate: 0.07
Average reward per sim is 1318.9621005161903 with sd 580.4700180824374
rate: 0.08


KeyboardInterrupt: 

Questions:
- how many rounds exploring is optimal?
- after a little exploration we should focus more onto the more promising ones