# Q-learning
## Environment
Almost the same as before.

<div>
<img src="img/grid_example.png" width="250"/>
</div>

We do not inteact with human agent anymore.

Changes to the GridWorld environment:
- The possible actions "up", "down", "right", "left" were changed to integers, to make handling simpler.
- Observations changed from visualizing the whole environment to just the updated position of the agent.
- <code>render()</code> function for visualization only.

In [2]:
import numpy as np

legend = {
    'empty': 0,
    'agent': 4,
    'blocking': 8
}

class GridWorld:
    def __init__(self, world_shape, agent_init_pos, blocking_states, terminal_states, reward_states):
        self.world_shape = world_shape
        self.agent_init_pos = agent_init_pos
        self.blocking_states = blocking_states
        self.terminal_states = terminal_states
        self.reward_states = reward_states
        
        # the action representations are now integers, to make indexing and sampling for TD learning simpler
        self.possible_actions = {
            0: np.array([-1, 0]), # up
            1: np.array([1, 0]),  # down
            2: np.array([0, 1]),  # right
            3: np.array([0, -1])  # left
        }
        
        # set initial agent position
        self.agent_current_pos = self.agent_init_pos
        # list of collected rewards, to not collect rewards twice
        self.collected_rewards = []
        
    def reset(self):
        # reset agent position
        self.agent_current_pos = self.agent_init_pos
        # reset list of collected rewards
        self.collected_rewards = []
            
        # render initial observation
        observation = np.copy(self.agent_current_pos)
        return observation
    
    def move_agent(self, action):
        # move agent
        new_agent_pos = np.array(self.agent_current_pos) + self.possible_actions[action]

        # check if new position is blocked
        if tuple(new_agent_pos) in self.blocking_states:
            return self.agent_current_pos

        # check if new position is out of bounds
        if (new_agent_pos < 0).any() or (new_agent_pos >= self.world_shape).any():
            return self.agent_current_pos

        return tuple(new_agent_pos)
        
    def step(self, action):
        # execute action
        self.agent_current_pos = self.move_agent(action)
        
        reward = 0.0
        done = False
        
        # check if there is any reward
        if tuple(self.agent_current_pos) in self.reward_states.keys() and tuple(self.agent_current_pos) not in self.collected_rewards:
            reward += self.reward_states[tuple(self.agent_current_pos)]
            self.collected_rewards.append(tuple(self.agent_current_pos))
        
        # check if there is any reward and whether the game ended
        if tuple(self.agent_current_pos) in self.terminal_states:
            done = True
            
        # render observation
        observation = np.copy(self.agent_current_pos)
        return observation, reward, done
    
    def render(self, show_render=True):
        # initialize empty states
        states = np.ones(self.world_shape) * legend['empty']

        # add agent
        states[tuple(self.agent_current_pos)] = legend['agent']

        # add blocking states
        for blocking_state in self.blocking_states:
            states[blocking_state] = legend['blocking']

        # add rewards
        for state, reward in self.reward_states.items():
            if state not in self.collected_rewards:
                states[state] = reward
        if show_render:
            print(states)
        return states
        

## Agent
### Step 1 (TODO): Tabular Q-learning
We have a discrete state space (bounded integer coordinates) and a discrete action space (up, down, right, left). In such cases, we can represent the approximated **q-value function** $Q$ by a **table** $q$ with dimensions $[\text{number of actions}]\times[\text{number of states}]$. Since we have a 2D state space in this environment, we will represent the q-value function with a 3D **array**. In our case, the dimensions are $[\text{number of actions}]\times[\text{number of rows}]\times[\text{number of columns}] = 4\times3\times4$.

To retrieve an action value given a state $s_t =(r_t, c_t)$ – with $r_t$ and $c_t$ the corresponding row and column of the state – and action $a_t$, we just retrieve it from the array at the indices $[a_t, r_t, c_t]$
$$
Q(s_t, a_t) = q[a_t, r_t, c_t]
$$
The policy derived from the approximated q-value function is then
$$
\pi(s_t) = argmax_a(Q(s_t, a)) = argmax_a(q[a, r_t, c_t])
$$
and the value for the given state
$$
v(s_t) = max_a(Q(s_t, a)) = max_a(q[a, r_t, c_t])
$$
During TD-learning, we update the array with the new values based on the received reward $R$.
$$
Q'(s_t, a_t) = R(s_t, a_t) + \gamma v(s_{t+1})
$$
$$
Q(s_t, a_t) \leftarrow (1-\alpha) Q(s_t, a_t)+ \alpha Q'(s_t, a_t) 
$$
As we can see, we need the current state $s_t$ (at timestep $t$), the action taken $a_t$, the reward received $R(s_t, a_t)$ and the next state $s_{t+1}$ (at timestep $t+1$) resulting from $a_t$ to update the q-value approximation. $\alpha$ is the agent's learining rate and $\gamma$ is the discount factor from Bellman's equation.

While learning we enable exploration, that yields a random action.

In [3]:
class QAgent():
    def __init__(self, n_actions, world_shape, learning_rate=0.2, discount_factor=0.9):
        # TODO set class variables: n_actions, learning_rate and discount_factor
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.n_actions = n_actions
        
        # TODO initialize q-table with zeros of shape [n_actions, n_rows, n_columns]
        self.q_table = np.zeros([n_actions, *world_shape])

    def act(self, observation, explore=True):
        if explore:
            # TODO sample random action during training e.g. a random integer in [0, n_actions - 1]
            action = np.random.randint(self.n_actions - 1)
        else:
            # TODO get the q-values from the q-table at the given position (observation is the same as the agent's position)
            # hint: use numpy array slicing, with the ":" operator
            q_values = self.q_table[:, observation[0], observation[1]]
            # TODO find the action with the maximal q-value
            action = np.argmax(q_values)
        return action

    def learn(self, obs_0, action_0, reward_0, obs_1):
        # TODO retrieve the current q-value approximation for obs_0 and action_0 from the q-table
        q_value = self.q_table[action_0, obs_0[0], obs_0[1]]
        
        # TODO compute the value of the next state obs_1
        next_value = np.max(self.q_table[:, obs_1[0], obs_1[1]])
        
        # TODO compute new estimation of q-value for obs_0 and action_0
        new_q_value = reward_0 + self.discount_factor * next_value
        
        # TODO adjust old estimation using the learning rate
        adapted_q_value = (1 - self.learning_rate) * q_value + self.learning_rate * new_q_value 
        # TODO set the new q-value approximation in the q-table
        self.q_table[action_0, obs_0[0], obs_0[1]] = adapted_q_value

### Step 2 (TODO): Training
The training method is similar to the interaction between the human player and the environment from the previous excersize. We specify a number of training steps, for wich the agent colletcts experience and updates its q-table.

In [4]:
def train(agent, env, steps):
    # TODO reset environment and get initial observation
    obs = env.reset()
    for i in range(steps):
        # TODO get action from agent, given the observation
        action = agent.act(obs)
        # TODO execute action, get reward, new observation and termination flag
        new_obs, reward, done = env.step(action)
        # TODO learn from the gathered experience
        agent.learn(obs, action, reward, new_obs)
        # TODO set obs to new observation obs_1
        obs = new_obs
        # TODO reset environment if game terminated
        if done:
            obs = env.reset()

To run the training, we first need an environment. Here we define the environment's properties.

In [5]:
# dimensions of the GridWorld
world_shape = (3, 4)

# initial position of the agent
agent_init_pos = (2, 0)

# list of blocking state positions
blocking_states = [(1, 1)]

# list of terminal state positions
terminal_states = [(0,3), (1,3)]

# dictionary of rewards with key: position and value: reward
reward_states = {
    (0,3): 1,
    (1,3): -1
}

Now, we initialze the environment and the agent.

In [6]:
# TODO initialize environment
env = GridWorld(world_shape, agent_init_pos, blocking_states, terminal_states, reward_states)
# TODO render initial setup
env.render()

# TODO initialize agent; use len(env.possible_actions) to get the number of actions from the environment
agent = QAgent(len(env.possible_actions), world_shape)

[[ 0.  0.  0.  1.]
 [ 0.  8.  0. -1.]
 [ 4.  0.  0.  0.]]


We train the agent and print the learned values ans the policy (argmax).

In [8]:
# TODO train agent for 10000 steps
train(agent, env, 20000)

# show values
print()
print('values')
print(agent.q_table.max(axis=0))

# show actions with highest q-value
print()
print('argmax q')
print(agent.q_table.argmax(axis=0))

# Reminder: integer-action
# 0: up
# 1: down
# 2: right
# 3: left


values
[[0.81   0.9    1.     0.    ]
 [0.729  0.     0.9    0.    ]
 [0.6561 0.729  0.81   0.    ]]

argmax q
[[2 2 2 0]
 [0 0 0 0]
 [0 2 0 1]]


### Step 3 (TODO): Test
We implement a method to test the agent. This is similar to the training loop, except the agent does not collect experience and does not learn from it. While testing, we only want to play a single game, and compute the cumulative reward for that game collected by the agent.

In [9]:
def test(agent, env, max_steps=30):
    # TODO reset environment and get initial observation
    obs = env.reset()
    
    # TODO record path of agent in a list of positions; the fist position is the initial observation
    path = [obs]
    
    # initialize the cumulated reward as 0
    cumulated_reward = 0.0
    
    # we want to execute only one episode --> until the game is done or until a maximum nuber of steps is reached
    done = False
    n_steps = 0
    while not done and n_steps < max_steps:
        # TODO get action from agent, don't forget to set the explore flag to False
        action = agent.act(obs, explore=False)
        # TODO execute action, get reward and new observation
        new_obs, reward, done = env.step(action)
        # TODO record path of agent
        path.append(new_obs)
        # TODO increment cumulated reward by received reward
        cumulated_reward += reward
        # TODO set obs to new observation obs_1
        obs = new_obs
        n_steps += 1
    return cumulated_reward, path

Test the agent and print the cumlative reward:

In [15]:
cumulated_reward, path = test(agent, env)
print('cumulated reward', cumulated_reward)

cumulated reward 0.0


We also want to show the path, the agent has taken. For this we implement a render method:

In [16]:
def render_path(env, path):
    states = env.render(show_render=False)
    for position in path:
        states[tuple(position)] = 3
    print(states)

The rendered path:

In [17]:
render_path(env, path)

[[ 3.  0.  0.  8.  0.  0.  0.  1.]
 [ 3.  0.  0.  8.  0.  0.  0. -1.]
 [ 3.  0.  0.  8.  0.  0.  0.  0.]
 [ 3.  0.  0.  8.  0.  0.  0.  0.]
 [ 3.  0.  0.  0.  0.  0.  0.  0.]
 [ 3.  0.  0.  8.  0.  0.  0.  0.]
 [ 3.  0.  0.  8.  0.  0.  0.  0.]
 [ 3.  0.  0.  8.  0.  0.  0.  0.]]


### Step 4: Let's scale it up

Now we want to test whether the agent performs in a larger GridWorld.

In [18]:
# dimensions of the GridWorld
world_shape = (8, 8)

# initial position of the agent
agent_init_pos = (7, 0)

# list of blocking state positions
blocking_states = [(0, 3),
                   (1, 3),
                   (2, 3),
                   (3, 3),
                   (5, 3),
                   (6, 3),
                   (7, 3)]

# list of terminal state positions
terminal_states = [(0,7), (1,7)]

# dictionary of rewards with key: position and value: reward
reward_states = {
    (0,7): 1,
    (1,7): -1
}

Like before, we initialze the environment and the agent

In [19]:
env = GridWorld(world_shape, agent_init_pos, blocking_states, terminal_states, reward_states)
obs = env.reset()
env.render()

agent = QAgent(len(env.possible_actions), world_shape)

[[ 0.  0.  0.  8.  0.  0.  0.  1.]
 [ 0.  0.  0.  8.  0.  0.  0. -1.]
 [ 0.  0.  0.  8.  0.  0.  0.  0.]
 [ 0.  0.  0.  8.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  8.  0.  0.  0.  0.]
 [ 0.  0.  0.  8.  0.  0.  0.  0.]
 [ 4.  0.  0.  8.  0.  0.  0.  0.]]


We train the agent and print the learned policy:

In [20]:
train(agent, env, 15000)

print('argmax q')
print(agent.q_table.argmax(axis=0))

# Reminder: integer-action
# 0: up
# 1: down
# 2: right
# 3: left

argmax q
[[0 2 1 0 2 2 2 0]
 [0 0 1 0 1 0 0 0]
 [2 2 1 0 2 2 0 1]
 [2 2 1 0 2 2 0 0]
 [2 2 2 2 2 0 0 0]
 [2 2 0 0 2 0 0 0]
 [2 2 0 0 2 0 0 0]
 [2 2 0 0 0 0 0 0]]


We test the agent and show the path it has taken:

In [21]:
cumulated_reward, path = test(agent, env)
print('cumulated reward', cumulated_reward)
render_path(env, path)

cumulated reward 1.0
[[ 0.  0.  0.  8.  0.  0.  3.  3.]
 [ 0.  0.  0.  8.  0.  0.  3. -1.]
 [ 0.  0.  0.  8.  0.  0.  3.  0.]
 [ 0.  0.  0.  8.  0.  3.  3.  0.]
 [ 0.  0.  3.  3.  3.  3.  0.  0.]
 [ 0.  0.  3.  8.  0.  0.  0.  0.]
 [ 0.  0.  3.  8.  0.  0.  0.  0.]
 [ 3.  3.  3.  8.  0.  0.  0.  0.]]


### Step 5 (TODO): Epsilon-greedy policy
In our Agent implementation, we sample random actions during training to ensure explocation. An alternative would be the **epsilon-greedy** policy, that samples random actions only with epsilon probability, and acts according to the already partially learned policy otherwise. 

In [22]:
# We inherit from the QAgent class
class EGreedyQAgent(QAgent):
    # We override the constructor of QAgent. *args is just an unpacked list of arguments 
    def __init__(self, epsilon, n_actions, world_shape, learning_rate=0.2, discount_factor=0.9):
        # We call the constructor of the superclass QAgent, with the unpacked list of arguments
        super(EGreedyQAgent, self).__init__(n_actions, world_shape, learning_rate=0.2, discount_factor=0.9)
        # TODO set new class variable epsilon
        self.epsilon = epsilon

    # We only override the act method of the superclass to implement the epsilon-greedy policy
    def act(self, observation, explore=True):
        # TODO get the q-values from the q-table at the given position (observation is the same as the agent's position)
        # hint: you have access to the class variables of the superclass, e.g. self.q_table
        q_values = self.q_table[:, observation[0], observation[1]]
        if explore:
            # We sample a random action with epsilon probability and act according to the learned policy otherwise
            if np.random.uniform(0, 1) < self.epsilon:
                # TODO explore the action space sample random action during training e.g. a random integer in [0, n_actions - 1]
                # hint: you have access to the class variables of the superclass, e.g. self.n_actions
                action = np.random.randint(self.n_actions - 1)
            else:
                # TODO get the q-values from the q-table at the given position (observation is the same as the agent's position)
                # hint: use numpy array slicing, with the ":" operator
                q_values = self.q_table[:, observation[0], observation[1]]
                # TODO find the action with the maximal q-value
                action = np.argmax(q_values)
        else:
            # TODO find the action with the maximal q-value
            q_values = self.q_table[:, observation[0], observation[1]]
            action = np.argmax(q_values)
        return action

We now want to test the new agent. We reset the environment and initialiye the agent.

In [23]:
obs = env.reset()
agent = EGreedyQAgent(0.9, len(env.possible_actions), world_shape)

Like before, we train the agent and print the learned policy.

In [24]:
train(agent, env, 5000)
        
print('argmax q')
print(agent.q_table.argmax(axis=0))

# Reminder: integer-action
# 0: up
# 1: down
# 2: right
# 3: left

argmax q
[[0 0 1 0 2 2 2 0]
 [0 0 1 0 2 2 0 0]
 [0 0 1 0 0 0 0 1]
 [0 1 1 0 2 2 0 0]
 [0 1 2 2 2 0 0 0]
 [2 2 0 0 0 0 0 0]
 [2 2 0 0 0 0 0 0]
 [2 2 0 0 0 0 0 0]]


After training, the agent is tested:

In [25]:
cumulated_reward, path = test(agent, env)
print('cumulated reward', cumulated_reward)
render_path(env, path)

cumulated reward 1.0
[[ 0.  0.  0.  8.  0.  0.  3.  3.]
 [ 0.  0.  0.  8.  0.  0.  3. -1.]
 [ 0.  0.  0.  8.  0.  0.  3.  0.]
 [ 0.  0.  0.  8.  0.  3.  3.  0.]
 [ 0.  0.  3.  3.  3.  3.  0.  0.]
 [ 0.  0.  3.  8.  0.  0.  0.  0.]
 [ 0.  0.  3.  8.  0.  0.  0.  0.]
 [ 3.  3.  3.  8.  0.  0.  0.  0.]]


### Step 6: Comparison: random vs. epsilon-greedy
Now we want tu compare the random agend and the epsilon-greedy agent. For this, we train multiple agents for the same amount of steps, and test them.

In [26]:
n_runs = 50
n_steps = 5000

# Initialize environment
env = GridWorld(world_shape, agent_init_pos, blocking_states, terminal_states, reward_states)

cumulated_rewards_random = []
cumulated_rewards_epsilon_greedy = []

for _ in range(n_runs):
    # Initialize, train and test QAgent
    random_agent = QAgent(len(env.possible_actions), world_shape)
    train(random_agent, env, n_steps)
    cumulated_reward, path = test(random_agent, env)
    # Store the cumulative reward
    cumulated_rewards_random.append(cumulated_reward)
    
    # Initialize, train and test EGreedyQAgent
    epsilon_greedy_agent = EGreedyQAgent(0.99, len(env.possible_actions), world_shape)
    train(epsilon_greedy_agent, env, n_steps)
    cumulated_reward, path = test(epsilon_greedy_agent, env)
    # Store the cumulative reward
    cumulated_rewards_epsilon_greedy.append(cumulated_reward)

# Print results
print(sum(cumulated_rewards_random) / n_runs)
print(sum(cumulated_rewards_epsilon_greedy) / n_runs)

0.46
0.5


In this case, the epsilon-greedy policy does not necessarily outperform the random policy. Why? (Hint: initial q-values are zeros)

What happens if we change the initial q-values to random values instead of zeros?

To implement this, we simply inherit from our agents and overwrite the initial q-table.

In [20]:
class QAgentRandomInit(QAgent):
    def __init__(self, n_actions, world_shape, learning_rate=0.2, discount_factor=0.9):
        # We call the constructor of the superclass QAgent, with the unpacked list of arguments
        super(QAgentRandomInit, self).__init__(n_actions, world_shape, learning_rate=0.2, discount_factor=0.9)
        # We overwrite the q_table with random values
        self.q_table = np.random.rand(n_actions, *world_shape)
        
class EGreedyQAgentRandomInit(EGreedyQAgent):
    def __init__(self, epsilon, n_actions, world_shape, learning_rate=0.2, discount_factor=0.9):
        # We call the constructor of the superclass EGreedyQAgent, with the unpacked list of arguments
        super(EGreedyQAgentRandomInit, self).__init__(epsilon, n_actions, world_shape, learning_rate=0.2, discount_factor=0.9)
        # We overwrite the q_table with random values
        self.q_table = np.random.rand(n_actions, *world_shape)

Now we compare both models the same way as before.

In [23]:
n_runs = 50
n_steps = 25000

# Initialize environment
env = GridWorld(world_shape, agent_init_pos, blocking_states, terminal_states, reward_states)

cumulated_rewards_random = []
cumulated_rewards_epsilon_greedy = []

for _ in range(n_runs):
    # Initialize, train and test QAgentRandomInit
    random_agent = QAgentRandomInit(len(env.possible_actions), world_shape)
    train(random_agent, env, n_steps)
    cumulated_reward, path = test(random_agent, env)
    # Store the cumulative reward
    cumulated_rewards_random.append(cumulated_reward)
    
    # Initialize, train and test EGreedyQAgentRandomInit
    epsilon_greedy_agent = EGreedyQAgentRandomInit(0.99, len(env.possible_actions), world_shape)
    train(epsilon_greedy_agent, env, n_steps)
    cumulated_reward, path = test(epsilon_greedy_agent, env)
    # Store the cumulative reward
    cumulated_rewards_epsilon_greedy.append(cumulated_reward)
    
print(sum(cumulated_rewards_random) / n_runs)
print(sum(cumulated_rewards_epsilon_greedy) / n_runs)

0.0
0.0
