# Assignment 2

In [None]:
import numpy as np
import matplotlib.pyplot as plt

from gridworld import StochasticGridworld, DeterministicGridworld, make_random_gridworld
from agents import RandomAgent, Agent

## Task 1

In [None]:
# Baseline for Task 1: act with RandomAgent on the stochastic gridworld.
task = StochasticGridworld()
random_agent = RandomAgent(task.observation_space, task.action_space)

# The environment is reset only once, before the first run; every later run
# starts from the observation the previous run ended on.
obs = task.reset()

# rewards[run, step] holds the single-step reward of that step in that run.
n_runs, n_steps = 100, 100
rewards = np.zeros((n_runs, n_steps))
for run in range(n_runs):
    for step in range(n_steps):
        action = random_agent(obs)
        obs, reward, _done, _info = task.step(action)
        rewards[run, step] = reward

# Return of a run = sum of its single-step rewards (undiscounted).
returns_per_run = rewards.sum(axis=1)
print("Average return: {}".format(returns_per_run.mean()))
print("Standard deviation: {}".format(returns_per_run.std()))

## Task 2A

In [None]:
class QLearningAgent(Agent):
    """Skeleton of a tabular Q-learning agent.

    `observation_space` and `action_space` are used directly as array
    dimensions, so they are expected to be integer counts of states/actions.

    Attributes:
        states: the `observation_space` argument.
        actions: the `action_space` argument.
        policy: int array of shape (states,), one action index per state.
        q_values: float array of shape (states, actions).
        gamma: discount factor.
    """

    def __init__(self, observation_space, action_space, gamma=0.9):
        self.actions = action_space
        self.states = observation_space

        # policy estimate for each state, update in self.q_learning
        # (builtin int: the np.int alias was removed in NumPy 1.24)
        self.policy = np.zeros((self.states,), dtype=int)

        # Q-value estimate in each state, update in self.q_learning.
        # Kept as floats: rewards are floats, and an integer table would
        # truncate (or refuse to cast) fractional Q-value updates.
        self.q_values = np.zeros((self.states, self.actions), dtype=float)

        self.gamma = gamma

    def q_learning(self, states, actions, rewards):
        """
        states: np.array of size (N+1) with state at each time (integer)
        The last state is only used to calculate the terminal Q-value;
        it is not associated with an update

        actions: np.array of size (N) with action at each time (integer)
        rewards: np.array of size (N) with single step rewards (float)

        returns nothing, should modify self.policy, self.q_values in place

        Note: collect_data returns float-typed arrays (np.zeros default), so
        states/actions must be cast to int before being used as indices.
        """
        # TODO: Insert code here
        raise NotImplementedError("TODO: implement QLearningAgent.q_learning")

    def collect_data(self, task):
        """Step `task` with self.__call__ and return (states, actions, rewards).

        The environment is reset once and then stepped for 100 runs of 100
        steps. The same buffers are reused by every run, so the returned
        arrays only contain the transitions of the last run.

        returns:
            states: float array of size (101), the last entry is the
                observation after the final step
            actions: float array of size (100)
            rewards: float array of size (100)
        """
        # Do not modify
        obs = task.reset()

        rewards = np.zeros((100,))
        states = np.zeros((101,))
        actions = np.zeros((100))

        for run in range(100):
            for step in range(100):
                states[step] = obs
                act = self.__call__(obs)
                obs, rew, done, info = task.step(act)
                rewards[step] = rew
                actions[step] = act
            states[-1] = obs
        return states, actions, rewards

# Do not modify
agent = QLearningAgent(task.observation_space, task.action_space)

# Training: 50 rounds of "collect a batch, then update on that batch".
for iteration in range(50):
    batch = agent.collect_data(task)
    agent.q_learning(*batch)

# Final Benchmarking
# The environment is reset only once; runs follow each other without a reset.
obs = task.reset()

n_runs, n_steps = 100, 100
rewards = np.zeros((n_runs, n_steps))
states = np.zeros((n_runs, n_steps + 1))
actions = np.zeros((n_runs, n_steps))

for run in range(n_runs):
    for step in range(n_steps):
        states[run, step] = obs
        action = agent(obs)
        obs, reward, _done, _info = task.step(action)
        actions[run, step] = action
        rewards[run, step] = reward
    # Last column: the observation reached after the final step of the run.
    states[run, n_steps] = obs

# Return of a run = sum of its single-step rewards (undiscounted).
returns_per_run = rewards.sum(axis=1)
print("Average return: {}".format(returns_per_run.mean()))
print("Standard deviation: {}".format(returns_per_run.std()))

## Task 2B

In [None]:
class EpsilonQLearningAgent(QLearningAgent):
    """Q-learning agent skeleton that collects data epsilon-greedily.

    Differs from QLearningAgent in that collect_data picks actions with
    epsilon_greedy_policy instead of self.__call__.

    Attributes:
        states: the `observation_space` argument.
        actions: the `action_space` argument.
        policy: int array of shape (states,), one action index per state.
        q_values: float array of shape (states, actions).
        gamma: discount factor.
        epsilon: exploration parameter of the epsilon-greedy policy.
    """

    def __init__(self, observation_space, action_space, gamma=0.9, epsilon=1.0):
        self.actions = action_space
        self.states = observation_space

        # policy estimate for each state
        # (builtin int: the np.int alias was removed in NumPy 1.24)
        self.policy = np.zeros((self.states,), dtype=int)

        # Q-value estimate in each state.
        # Kept as floats: rewards are floats, and an integer table would
        # truncate (or refuse to cast) fractional Q-value updates.
        self.q_values = np.zeros((self.states, self.actions), dtype=float)

        self.gamma = gamma
        self.epsilon = epsilon

    def epsilon_greedy_policy(self, obs):
        """
        obs: integer representing state

        returns integer representing action for current state,
        according to epsilon-greedy policy (see handout)

        epsilon is stored in self.epsilon

        Hint:
        act = random_agent(obs) #obtains a random action for obs
        act = self.__call__(obs) #obtains action according to self.policy
        """
        # TODO: Insert code here
        raise NotImplementedError(
            "TODO: implement EpsilonQLearningAgent.epsilon_greedy_policy"
        )

    def collect_data(self, task):
        """Step `task` epsilon-greedily and return (states, actions, rewards).

        The environment is reset once and then stepped for 100 runs of 100
        steps, choosing each action with self.epsilon_greedy_policy. The same
        buffers are reused by every run, so the returned arrays only contain
        the transitions of the last run.

        returns:
            states: float array of size (101), the last entry is the
                observation after the final step
            actions: float array of size (100)
            rewards: float array of size (100)
        """
        # Do not modify
        obs = task.reset()

        rewards = np.zeros((100,))
        states = np.zeros((101,))
        actions = np.zeros((100))

        for run in range(100):
            for step in range(100):
                states[step] = obs
                act = self.epsilon_greedy_policy(obs)
                obs, rew, done, info = task.step(act)
                rewards[step] = rew
                actions[step] = act
            states[-1] = obs
        return states, actions, rewards

In [None]:
# Substitute different values for epsilon below, comment on final performance
EPSILON = 1.0

# Do not modify
agent = EpsilonQLearningAgent(
    task.observation_space, task.action_space, epsilon=EPSILON
)

# Training: 50 rounds of "collect a batch, then update on that batch".
for iteration in range(50):
    batch = agent.collect_data(task)
    agent.q_learning(*batch)

# Final Benchmarking
# Actions come from agent(obs) here, not from epsilon_greedy_policy.
# The environment is reset only once; runs follow each other without a reset.
obs = task.reset()

n_runs, n_steps = 100, 100
rewards = np.zeros((n_runs, n_steps))
states = np.zeros((n_runs, n_steps + 1))
actions = np.zeros((n_runs, n_steps))

for run in range(n_runs):
    for step in range(n_steps):
        states[run, step] = obs
        action = agent(obs)
        obs, reward, _done, _info = task.step(action)
        actions[run, step] = action
        rewards[run, step] = reward
    # Last column: the observation reached after the final step of the run.
    states[run, n_steps] = obs

# Return of a run = sum of its single-step rewards (undiscounted).
returns_per_run = rewards.sum(axis=1)
print("Epsilon: {}".format(EPSILON))
print("Average return: {}".format(returns_per_run.mean()))
print("Standard deviation: {}".format(returns_per_run.std()))

How does the model perform for different values of epsilon?

## Task 3A

You can reuse code from the previous tasks here where applicable.

In [None]:
class TDAgent(Agent):
    """Skeleton of a tabular TD(0) agent.

    `observation_space` is used directly as an array dimension, so it is
    expected to be an integer count of states.

    Attributes:
        states: the `observation_space` argument.
        actions: the `action_space` argument.
        policy: int array of shape (states,), one action index per state.
        values: float array of shape (states,), one value estimate per state.
        gamma: discount factor.
    """

    def __init__(self, observation_space, action_space, gamma=0.9):
        self.actions = action_space
        self.states = observation_space

        # builtin int: the np.int alias was removed in NumPy 1.24
        self.policy = np.zeros((self.states,), dtype=int)
        # Kept as floats: rewards are floats, and an integer table would
        # truncate (or refuse to cast) fractional value updates.
        self.values = np.zeros((self.states,), dtype=float)

        self.gamma = gamma

    def td_learning(self, states, actions, rewards):
        """
        states: np.array of size (N+1) with state at each time (integer)
        The last state is only used to calculate the terminal value;
        it is not associated with an update

        actions: np.array of size (N) with action at each time (integer)
        rewards: np.array of size (N) with single step rewards (float)

        returns nothing, should modify self.policy, self.values in place
        uses the TD(0) algorithm described in the lectures

        Note: collect_data returns float-typed arrays (np.zeros default), so
        states/actions must be cast to int before being used as indices.
        """
        # TODO: Insert code here
        raise NotImplementedError("TODO: implement TDAgent.td_learning")

    def collect_data(self, task):
        """Step `task` with self(obs) and return (states, actions, rewards).

        The environment is reset once and then stepped for 100 runs of 100
        steps. The same buffers are reused by every run, so the returned
        arrays only contain the transitions of the last run.

        returns:
            states: float array of size (101), the last entry is the
                observation after the final step
            actions: float array of size (100)
            rewards: float array of size (100)
        """
        # Do not modify
        obs = task.reset()

        rewards = np.zeros((100,))
        states = np.zeros((101,))
        actions = np.zeros((100))

        for run in range(100):
            for step in range(100):
                states[step] = obs
                act = self(obs)
                obs, rew, done, info = task.step(act)
                rewards[step] = rew
                actions[step] = act
            states[-1] = obs
        return states, actions, rewards
        
# Do not modify
agent = TDAgent(task.observation_space, task.action_space)

# Training: 50 rounds of "collect a batch, then update on that batch".
for iteration in range(50):
    batch = agent.collect_data(task)
    agent.td_learning(*batch)

# Final Benchmarking
# The environment is reset only once; runs follow each other without a reset.
obs = task.reset()

n_runs, n_steps = 100, 100
rewards = np.zeros((n_runs, n_steps))
states = np.zeros((n_runs, n_steps + 1))
actions = np.zeros((n_runs, n_steps))

for run in range(n_runs):
    for step in range(n_steps):
        states[run, step] = obs
        action = agent(obs)
        obs, reward, _done, _info = task.step(action)
        actions[run, step] = action
        rewards[run, step] = reward
    # Last column: the observation reached after the final step of the run.
    states[run, n_steps] = obs

# Return of a run = sum of its single-step rewards (undiscounted).
returns_per_run = rewards.sum(axis=1)
print("Average return: {}".format(returns_per_run.mean()))
print("Standard deviation: {}".format(returns_per_run.std()))

Do the results match your expectations? Write a short discussion.

In [None]:
class TD_nstep_Agent(TDAgent):
    """Skeleton of a tabular n-step TD agent.

    Reuses collect_data from TDAgent and overrides td_learning.

    Attributes:
        states: the `observation_space` argument.
        actions: the `action_space` argument.
        policy: int array of shape (states,), one action index per state.
        values: float array of shape (states,), one value estimate per state.
        gamma: discount factor.
        nstep: the `nstep` argument; 0 should correspond to TDAgent.
    """

    def __init__(self, observation_space, action_space, gamma=0.9, nstep=0):
        self.actions = action_space
        self.states = observation_space

        # builtin int: the np.int alias was removed in NumPy 1.24
        self.policy = np.zeros((self.states,), dtype=int)
        # Kept as floats: rewards are floats, and an integer table would
        # truncate (or refuse to cast) fractional value updates.
        self.values = np.zeros((self.states,), dtype=float)

        self.gamma = gamma

        # nstep parameter read by td_learning (see its docstring)
        self.nstep = nstep

    def td_learning(self, states, actions, rewards):
        """
        states: np.array of size (N+1) with state at each time (integer)
        The last state is only used to calculate the terminal value;
        it is not associated with an update

        actions: np.array of size (N) with action at each time (integer)
        rewards: np.array of size (N) with single step rewards (float)

        returns nothing, should modify self.policy, self.values in place
        uses the TD nstep algorithm described in the handout

        The nstep can be found in self.nstep, self.nstep = 0 should correspond
        to TDAgent

        Hint: if there are only k < nstep future observations available for
        a given time, then simply use k instead of nstep for that time.

        Note: collect_data returns float-typed arrays (np.zeros default), so
        states/actions must be cast to int before being used as indices.
        """
        # TODO: Insert code here
        raise NotImplementedError("TODO: implement TD_nstep_Agent.td_learning")

In [None]:
# Set nstep here:
NSTEP = 0

# Do not modify
agent = TD_nstep_Agent(task.observation_space, task.action_space, nstep=NSTEP)

# Training: 50 rounds of "collect a batch, then update on that batch".
for iteration in range(50):
    batch = agent.collect_data(task)
    agent.td_learning(*batch)

# Final Benchmarking
# The environment is reset only once; runs follow each other without a reset.
obs = task.reset()

n_runs, n_steps = 100, 100
rewards = np.zeros((n_runs, n_steps))
states = np.zeros((n_runs, n_steps + 1))
actions = np.zeros((n_runs, n_steps))

for run in range(n_runs):
    for step in range(n_steps):
        states[run, step] = obs
        action = agent(obs)
        obs, reward, _done, _info = task.step(action)
        actions[run, step] = action
        rewards[run, step] = reward
    # Last column: the observation reached after the final step of the run.
    states[run, n_steps] = obs

# Return of a run = sum of its single-step rewards (undiscounted).
returns_per_run = rewards.sum(axis=1)
print("Nsteps: {}".format(NSTEP))
print("Average return: {}".format(returns_per_run.mean()))
print("Standard deviation: {}".format(returns_per_run.std()))