In [1]:
import numpy as np
import gym
from gym import spaces
from gym.utils import seeding


class SlotMachine:
    """
    A single slot machine whose payout is normally distributed.

    Arguments:
        mean - (float) Mean of the payout distribution
        std_dev - (float) Standard deviation of the payout distribution
    """

    def __init__(self, mean, std_dev):
        self.mean, self.std_dev = mean, std_dev

    def pull(self):
        """
        Draw one payout from the machine.

        Returns:
          reward - (float) A sample from a normal distribution with this
            machine's mean and std_dev, taken from the global np.random state.
        """
        payout = np.random.normal(loc=self.mean, scale=self.std_dev)
        return payout


class SlotMachines(gym.Env):
    """
    Slot machine reinforcement learning environment for OpenAI Gym.

    The environment has a single state (observation 0). Each action pulls one
    machine, and every step ends the episode.

    Arguments:
        n_machines - (int) Number of slot machines to create
        mean_range - (tuple) (low, high) range of values for mean initialization
        std_range - (tuple) (low, high) range of values for std initialization
    """

    def __init__(self, n_machines=10, mean_range=(-10, 10), std_range=(5, 10)):
        # Initialize N slot machines with random means and std_devs.
        # These draws use the global np.random state, so call
        # np.random.seed(...) *before* constructing the environment if the
        # machine parameters must be reproducible.
        means = np.random.uniform(mean_range[0], mean_range[1], n_machines)
        std_devs = np.random.uniform(std_range[0], std_range[1], n_machines)
        self.machines = [SlotMachine(m, s) for (m, s) in zip(means, std_devs)]

        # Required by OpenAI Gym
        self.action_space = spaces.Discrete(n_machines)
        self.observation_space = spaces.Discrete(1)

    def seed(self, seed=None):
        """
        Seed the environment's random number generator

        The generator built from `seed` is kept on `self.np_random` instead of
        being discarded. Note that the rewards returned by step() come from
        SlotMachine.pull(), which samples from the global np.random state;
        this method does not reseed that state.

        Arguments:
          seed - (int) The random number generator seed.

        Returns:
          seeds - (list) A one-element list holding the seed that was used.
        """
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def step(self, action):
        """
        Perform an action within the slot machine environment

        Arguments:
          action - (int) An action to perform

        Returns:
          observation - (int) The new environment state. This is always 0 for
            SlotMachines.
          reward - (float) The reward gained by taking an action.
          done - (bool) Whether the environment has been completed and requires
            resetting. This is always True for SlotMachines.
          info - (dict) A dictionary of additional return values used for
            debugging purposes.
        """
        # Reject anything that is not a valid machine index.
        assert self.action_space.contains(action)
        return 0, self.machines[action].pull(), True, {}

    def reset(self):
        """
        Resets the environment. For SlotMachines, this always returns 0.
        """
        return 0

    def render(self, mode='human', close=False):
        """
        Render the environment display. For SlotMachines, this is a no-op.
        """
        pass


In [2]:

class MultiArmedBandit:
    """
    MultiArmedBandit reinforcement learning agent.

    Arguments:
      epsilon - (float) The probability of randomly exploring the action space
        rather than exploiting the best action.
    """

    def __init__(self, epsilon=0.2):
        self.epsilon = epsilon

    def fit(self, env, steps=1000):
        """
        Trains the MultiArmedBandit on an OpenAI Gym environment.

        Implements the simple bandit algorithm from page 32 of Sutton and
        Barto's book Reinforcement Learning
        (http://incompleteideas.net/book/RLbook2018.pdf): action values start
        at zero, actions are chosen with an epsilon-greedy policy, and each
        value is updated with step size 1 / N, where N is the number of times
        that action has been performed (i.e. a running sample average).

        Notes:
          - Only np.random is used for random number generation, so results
            are reproducible after np.random.seed(...).
          - Ties in the greedy choice resolve to the lowest action index
            (np.argmax behaviour).
          - The environment is reset before training and again whenever
            env.step() reports done.
          - The agent ignores the observed state; the learned action values
            are copied once per state to form Q(s, a).
          - env.step() is expected to return the 4-tuple
            (observation, reward, done, info).

        Arguments:
          env - (Env) An OpenAI Gym environment with discrete actions and
            observations (env.action_space.n and env.observation_space.n must
            exist).
          steps - (int) The number of actions to perform within the environment
            during training.

        Returns:
          state_action_values - (np.array) The values assigned by the algorithm
            to each state-action pair as a 2D numpy array. The dimensionality
            of the numpy array is S x A, where S is the number of
            states in the environment and A is the number of possible actions.
          rewards - (np.array) A 1D sequence of averaged rewards of length 100.
            Let s = floor(steps / 100); rewards[0] contains the average reward
            over the first s steps, rewards[1] the average over the next s
            steps, etc. Steps beyond 100 * s are not included in any average.
            If steps < 100 (s == 0) every entry is left at 0.0.
        """
        n_actions = env.action_space.n
        n_states = env.observation_space.n
        steps = int(steps)

        action_values = np.zeros(n_actions)   # Q(a)
        action_counts = np.zeros(n_actions)   # N(a)

        n_bins = 100
        bin_size = steps // n_bins
        rewards = np.zeros(n_bins)

        env.reset()
        for step in range(steps):
            # Epsilon-greedy action selection.
            if np.random.random() < self.epsilon:
                action = np.random.randint(n_actions)
            else:
                action = int(np.argmax(action_values))

            _, reward, done, _ = env.step(action)

            # Incremental sample-average update: Q <- Q + (R - Q) / N.
            action_counts[action] += 1
            action_values[action] += (
                (reward - action_values[action]) / action_counts[action])

            # Accumulate the per-bin average; leftover steps are skipped.
            if bin_size > 0 and step < n_bins * bin_size:
                rewards[step // bin_size] += reward / bin_size

            if done:
                env.reset()

        # Same action values for every state: shape (S, A).
        state_action_values = np.tile(action_values, (n_states, 1))
        return state_action_values, rewards

In [15]:
# Build an environment with the default 10 machines and seed it.
# NOTE: seed() does not reseed the global np.random state that
# SlotMachine.pull() (and the machine initialisation in __init__) draw from,
# so this call does not make the rewards shown below reproducible; use
# np.random.seed(...) before constructing the environment for that.
sample_space = SlotMachines()
sample_space.seed(42)

[42]

In [36]:
# The action space: 10 discrete actions, one per slot machine (n_machines defaults to 10).
sample_space.action_space

Discrete(10)

In [38]:
# The observation space, i.e. the number of distinct states; SlotMachines has a single state.
sample_space.observation_space

Discrete(1)

In [35]:
# Draws a reward the same way as pull() in the next cell, but step() also returns the
# observation (always 0), the done flag (always True) and an empty info dict.
sample_space.step(0)

(0, -11.31862659734954, True, {})

In [31]:
# Get a reward from the given space i.e. from a specific machine
sample_space.machines[0].pull()

-2.4266823552975225

In [26]:
# Get a random possible action
sample_space.action_space.sample()

8

In [33]:
# This combines the two above
# First we take a random action value, 
# Then we sample from the corresponding slot machine
sample_space.machines[sample_space.action_space.sample()].pull()

-10.91543829240107

### From the tests

In [4]:
# From __init__.py in src
# Registers the SlotMachines environment with gym under the id
# 'SlotMachines-v0'. The entry point 'src:SlotMachines' names a `src` package,
# not the class defined earlier in this notebook.
# NOTE(review): confirm `src` is importable from the notebook's working
# directory before calling gym.make with this id.
# NOTE(review): re-running this cell registers the same id again; depending on
# the installed gym version this may raise or warn — TODO confirm.
from gym.envs.registration import register

register(
    id='{}-{}'.format('SlotMachines', 'v0'),
    entry_point='src:{}'.format('SlotMachines'),
    max_episode_steps=1,
    nondeterministic=True)

# register(
#     id='FrozonLakeNoSlippery-v0',
#     entry_point='gym.envs.toy_text:FrozenLakeEnv',
#     kwargs={'map_name': '4x4', 'is_slippery': False})

In [6]:
# env = gym.make('SlotMachines-v0', n_machines=10, mean_range=(-10, 10), std_range=(5, 10))

In [9]:
state_action_rewards = np.zeros((1,10))

In [10]:
np.argmax(state_action_rewards)

0

In [12]:
np.random.random()

0.4224418374352119

In [61]:
np.random.randint(0,10)

1

In [30]:
np.argmax([1,2,3,4,5,6,7], axis=0)

6

In [31]:
n_actions=10
N = np.zeros((1, n_actions)) # how many times has each action been selected

In [34]:
# Expression only: displays the count for action 3 plus one without changing N.
# Use `N[:, 3] += 1` to actually record that the action was taken.
N[:, 3] +1

array([1.])

In [44]:
# Synthetic reward history for trying out the binning logic: 1000 integers
# drawn from 0..9 (np.random.randint excludes the upper bound). Unseeded, so
# the values differ between runs.
rewards = np.random.randint(0,10,1000)
# Bin width for 100 averaged intervals. np.floor returns a float (10.0), so it
# must be converted with int() before being used as a slice bound.
s = np.floor(1000 / 100)

In [57]:
# Mean reward of each of the 100 consecutive windows of width s.
avg_rewards = np.array([
    rewards[int(k * s):int((k + 1) * s)].mean()
    for k in range(100)
])

In [59]:
avg_rewards.shape

(100,)

In [48]:
rewards[0:9]

array([7, 5, 3, 4, 4, 8, 6, 8, 6])

In [62]:
A = np.random.randint(0, 10)

In [63]:
A

0

In [64]:
state_action_values = np.zeros((1, 10))

In [70]:
A = np.argmax(state_action_values, axis=1)

In [72]:
A.flatten()[0]

0

In [73]:
state_action_values = np.array([
        [0.0, 0.7, 0.3, 0.0],
        [0.0, 1.0, 0.0, 0.0],
        [0.0, 0.0, 1.0, 0.0],
        [0.0, 1.0, 0.0, 0.0],
        [0.0, 0.51, 0.49, 0.0],
        [0.0, 0.0, 0.0, 0.0],
        [0.5, 0.0, 0.5, 0.0],
        [0.0, 0.0, 0.0, 0.0],
        [0.0, 0.2, 0.8, 0.0],
        [0.0, 0.2, 0.8, 0.0],
        [0.0, 0.6, 0.4, 0.0],
        [0.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0],
        [1.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 1.0, 0.0],
        [0.0, 0.0, 0.0, 0.0]
    ])

In [74]:
# NOTE: `[0:]` is a row slice that keeps every row, so argmax runs over the
# flattened 16 x 4 array. The result 5 is the flat index of the first 1.0
# (row 1, column 1), not the best action for state 0. Index a single row with
# `state_action_values[0, :]` to get a per-state greedy action.
np.argmax(state_action_values[0:])

5

In [76]:
state_action_values[0,:]

array([0. , 0.7, 0.3, 0. ])

In [79]:
step = 1000

In [80]:
not step % 1000

True