In [1]:
from environment import GameEnv
import numpy as np

In [2]:
def monte_carlo_sampler(env, policy, n_samples,seed=123):
    """Implements a Monte Carlo Sampler which
    uses the provided policy in the environment to sample trajectories
    
    Args:
        env (gym.Env): gym environment
        policy (function): function that takes a state as input and returns an action
        n_samples (int): number of trajectories to sample

    Returns:
        trajectories (list): trajectories represented by state sequences
        rewards (list): respective rewards of the trajectories
    
    """
    
    rewards = []
    trajectories = []
    
    for i in range(n_samples):
        state,_ = env.reset(seed=seed)
        episode_trajectories = [0,]
        episode_rewards = []
        done = False
        while not done:
            if callable(policy):
                action = policy(state)
            else:
                action = policy.policy(state)
            state, reward, done = env.step(action)
            episode_rewards.append(reward)
            episode_trajectories.append(state)
        rewards.append(episode_rewards)
        trajectories.append(episode_trajectories)


    return trajectories, rewards

In [3]:
# Actions ={ Jump: 0, Walk:1 }

# π1: {always jump}
def policy_one(state):
    return 0


# π2: {first walk, then always jump}
class PolicyTWo:
    def __init__(self,counter=0):
        self.counter = counter
    def policy(self,state):
        self.counter += 1
        if self.counter == 1:
            return 1
        else:
            return 0


In [4]:
# Define environments inputs as given in PoC_exercise_09.pdf
transitions = np.array([[[0.2, 0.8, 0. , 0. , 0. ],
                        [0. , 0. , 0.5, 0.5, 0. ],
                        [0. , 0. , 0.6, 0.4, 0. ],
                        [0. , 0. , 0.2, 0.2, 0.6],
                        [0. , 0. , 0. , 0. , 1. ]],

                        [[0. , 0. , 1. , 0. , 0. ],
                        [0. , 0. , 0.8, 0.2, 0. ],
                        [0. , 0. , 0.9, 0.1, 0. ],
                        [0. , 0. , 0.4, 0.4, 0.2],
                        [0. , 0. , 0. , 0. , 1. ]]])




rewards = np.array([[-2,  0],
                    [-1, -1],
                    [-2,  0],
                    [-1, -1],
                    [ 0,  0]])

In [5]:
# Create environment
env = GameEnv(transitions, rewards) 

In [6]:
# Sample for always jump policy
monte_carlo_sampler(env, policy=policy_one, n_samples=3,seed=123)



([[0, 1, 2, 3, 4], [0, 1, 3, 4], [0, 1, 2, 2, 3, 2, 3, 4]],
 [[-2, -1, -2, -1], [-2, -1, -1], [-2, -1, -2, -2, -1, -2, -1]])

In [7]:
# Sample for first walk then always jump policy
policy_two= PolicyTWo(counter=0)
monte_carlo_sampler(env, policy=policy_two, n_samples=3, seed=123)


([[0, 2, 2, 2, 3, 4], [0, 1, 2, 2, 2, 2, 2, 2, 3, 4], [0, 1, 3, 4]],
 [[0, -2, -2, -2, -1], [-2, -1, -2, -2, -2, -2, -2, -2, -1], [-2, -1, -1]])