# Introduction

This file contains solutions to the exercises in the notebook `agents-and-environments.ipynb`.

Run the following cell to load the libraries used in the notebook:

In [None]:
# Numpy: efficient multidimensional arrays
import numpy as np

# matplotlib: interactive plots
import matplotlib.pyplot as plt

Let's now define the `MDPEnvironment` class:

In [None]:
class MDPEnvironment:
    """Simulated finite Markov decision process (MDP).

    The environment is described by three arrays:

    * ``tprobs[a, s, s2]`` -- probability of moving from state ``s`` to
      state ``s2`` when action ``a`` is taken.
    * ``rewards[a, s]`` -- reward obtained when action ``a`` is taken in
      state ``s``.
    * ``init_distr[s]`` -- probability that an episode starts in state ``s``.

    The environment starts uninitialized (status ``0``, no current state).
    Call :meth:`reset` to draw an initial state (status becomes ``1``) and
    then :meth:`step` to simulate transitions.
    """

    def __init__(self, tprobs, rewards, init_distr, seed=0):
        """Store the MDP description and create the random generator.

        Args:
            tprobs: Array-like of shape (num_actions, num_states, num_states).
            rewards: Array-like of shape (num_actions, num_states).
            init_distr: Array-like of shape (num_states,).
            seed: Seed passed to ``np.random.default_rng``.

        Raises:
            ValueError: If the arrays do not have the expected number of
                dimensions or their shapes are inconsistent.
        """
        self._tprobs = np.array(tprobs, dtype=np.float64)
        self._rewards = np.array(rewards, dtype=np.float64)
        self._init_distr = np.array(init_distr, dtype=np.float64)

        # Check array dimensions
        if self._tprobs.ndim != 3:
            raise ValueError('tprobs must be a 3-dimensional array')
        if self._rewards.ndim != 2:
            raise ValueError('rewards must be a 2-dimensional array')
        if self._init_distr.ndim != 1:
            raise ValueError('init_distr must be a 1-dimensional array')
        if self._tprobs.shape[0] != self._rewards.shape[0]:
            raise ValueError('axis 0 of arrays tprobs and rewards must have the same length')
        if self._tprobs.shape[1] != self._tprobs.shape[2]:
            raise ValueError('axes 1 and 2 of array tprobs must have the same length')
        if self._tprobs.shape[1] != self._rewards.shape[1]:
            raise ValueError('axis 1 of arrays tprobs and rewards must have the same length')
        if self._tprobs.shape[1] != self._init_distr.shape[0]:
            raise ValueError('axis 1 of array tprobs must have the same length as init_distr')

        self._num_actions = self._tprobs.shape[0]
        self._num_states = self._tprobs.shape[1]

        # Define random number generator used to simulate process
        self._rng = np.random.default_rng(seed=seed)

        # Start in an uninitialized state
        self._current_state = None
        self._current_reward = None
        self._current_status = 0

    def set_seed(self, seed):
        """Replace the random generator with a new one seeded by ``seed``."""
        self._rng = np.random.default_rng(seed=seed)

    def transition_probs(self):
        """Return a copy of the transition probability array."""
        return self._tprobs.copy()

    def rewards(self):
        """Return a copy of the rewards array."""
        return self._rewards.copy()

    def status(self):
        """Return the status flag: ``0`` before the first reset, ``1`` after."""
        return self._current_status

    def current_state(self):
        """Return the current state index, or ``None`` before the first reset."""
        return self._current_state

    def current_reward(self):
        """Return the most recent reward, or ``None`` before the first reset."""
        return self._current_reward

    def reset(self):
        """Start a new episode by sampling a state from the initial distribution.

        Sets the status to ``1`` and the current reward to ``0.0``.
        """
        self._current_status = 1
        # Use the distribution stored on the instance; a bare ``init_distr``
        # would silently depend on a global variable of that name.
        self._current_state = self._rng.choice(self._num_states, p=self._init_distr)
        self._current_reward = 0.0

    def step(self, action):
        """Apply ``action`` in the current state and sample the next state.

        The current reward is set to ``rewards[action, state]`` for the state
        the environment was in *before* the transition.

        Args:
            action: Integer action index in ``[0, num_actions - 1]``.

        Raises:
            ValueError: If ``action`` is out of range.
            RuntimeError: If called before :meth:`reset`.
        """
        if action < 0 or action >= self._num_actions:
            raise ValueError(
                f'action must be an integer between 0 and {self._num_actions - 1}'
            )
        # Without a current state the indexing below would produce arrays
        # instead of scalars and fail with an unrelated NumPy error.
        if self._current_state is None:
            raise RuntimeError('reset() must be called before step()')
        self._current_reward = self._rewards[action, self._current_state]
        self._current_state = self._rng.choice(
            self._num_states, p=self._tprobs[action, self._current_state]
        )

# Exercise 1

## Exercise 1

An interesting question is whether the average cumulative reward per episode depends on the initial condition. To investigate if this is the case, let's estimate the average reward for each different initial state.

Create a Tiny Robot environment with the same configuration as above, but with a deterministic initial state. For example, to start the environment always in state $0$, use the initial distribution $[1, 0, 0, 0]$. Then, simulate the MDP for 200 steps and print the final average cumulative reward.

Repeat the simulation for each of the other three possible initial states and see if there is a difference.

In [None]:
# Define the environment: tprobs[action][state][next_state] is the
# probability of moving from `state` to `next_state` under `action`.
tprobs = [
    [
        [2/3, 1/3, 0.0, 0.0],
        [0.0, 2/3, 1/3, 0.0],
        [0.0, 0.0, 2/3, 1/3],
        [1/3, 0.0, 0.0, 2/3],
    ],
    [
        [0.0, 1/2, 0.0, 1/2],
        [1/2, 0.0, 1/2, 0.0],
        [0.0, 1/2, 0.0, 1/2],
        [1/2, 0.0, 1/2, 0.0],
    ],
]

# Define the rewards: rewards[action][state]
rewards = [
    [40, 30, 20, 10],
    [10, 20, 30, 40],
]

# Define a deterministic initial state distribution: always start in state 0.
# The entries must sum to 1; move the 1.0 to another position to start the
# environment in a different state.
init_distr = [1.0, 0.0, 0.0, 0.0]

# Create the Tiny Robot Environment
tr_env = MDPEnvironment(tprobs, rewards, init_distr, seed=77)