In [1]:
%matplotlib inline

import gym
import matplotlib
import numpy as np
import sys

from collections import defaultdict
import itertools

if "../" not in sys.path:
  sys.path.append("../") 
#from lib.envs.blackjack import BlackjackEnv
from lib.envs.gridworld import GridworldEnv
from lib import plotting

matplotlib.style.use('ggplot')

In [2]:
# Gridworld environment instance (default constructor arguments) used by the cells below.
env = GridworldEnv()

In [3]:
def td_prediction(policy, env, num_episodes, discount_factor=1.0, alpha=0.01, max_steps=100):
    """
    TD(0) prediction algorithm. Estimates the value function
    for a given policy from sampled one-step transitions.

    After every environment step the estimate of the state that was just
    left is moved towards the bootstrapped one-step target:

        V[s] <- V[s] + alpha * (reward + discount_factor * V[s'] - V[s])

    Args:
        policy: A function that maps an observation to the action to take.
            Its return value is passed directly to env.step().
        env: OpenAI gym environment. Only reset() (returning the initial
            state) and step(action) (returning a
            (next_state, reward, done, info) tuple) are used.
        num_episodes: Number of episodes to sample.
        discount_factor: Gamma discount factor.
        alpha: Step size (learning rate) of the TD update.
        max_steps: Maximum number of steps sampled per episode before the
            episode is cut off. Defaults to 100, the limit that used to be
            hard-coded.

    Returns:
        A defaultdict(float) that maps from state -> estimated value.
        States are used as dictionary keys, so they must be hashable;
        every state that was visited or bootstrapped from has an entry.
    """

    # The value function estimate. Unseen states default to 0.0, so a
    # state that is only ever reached (never left) keeps a value of 0.0.
    V = defaultdict(float)

    for i_episode in range(1, num_episodes + 1):
        # Print out which episode we're on, useful for debugging.
        if i_episode % 1000 == 0:
            print("\rEpisode {}/{}.".format(i_episode, num_episodes), end="")
            sys.stdout.flush()

        # Sample one episode, updating V online after every transition.
        state = env.reset()
        for _step in range(max_steps):
            action = policy(state)
            next_state, reward, done, _info = env.step(action)
            # TD(0) update. The right-hand side reads V[state] before
            # V[next_state], which fixes the key insertion order of V.
            V[state] = V[state] + alpha * (reward + discount_factor * V[next_state] - V[state])
            if done:
                break
            state = next_state

    return V

In [4]:

def sample_policy(observation, num_actions=4):
    """
    A uniformly random policy: ignores the observation and picks one of
    `num_actions` discrete actions with equal probability.

    Args:
        observation: Current state. Unused; accepted so the function has
            the policy(state) call signature.
        num_actions: Number of discrete actions to choose from. Defaults
            to 4, the value that used to be hard-coded.

    Returns:
        An integer action index in the range [0, num_actions).
    """
    return np.random.choice(num_actions)

In [5]:
# Estimate the state values under sample_policy from 500,000 sampled episodes,
# using td_prediction's default discount_factor and alpha.
V_500k = td_prediction(sample_policy, env, num_episodes=500000)

Episode 500000/500000.

In [10]:
# Display the estimated state -> value table.
V_500k

defaultdict(float,
            {0: 0.0,
             1: -13.827436006509453,
             2: -20.669669130185888,
             3: -22.522497786313973,
             4: -14.6719056841876,
             5: -18.697013101095727,
             6: -20.522689122969805,
             7: -20.47938289098938,
             8: -19.79737068329557,
             9: -20.39485115798147,
             10: -17.989486688605904,
             11: -13.549718713744124,
             12: -22.25570302338372,
             13: -20.56085148957566,
             14: -14.741013528329358,
             15: 0.0})

In [103]:
# Should be (expected values, one per state 0..15, to compare with the estimate above):
np.array([0, -14, -20, -22, -14, -18, -20, -20, -20, -20, -18, -14, -22, -20, -14, 0])

array([  0, -14, -20, -22, -14, -18, -20, -20, -20, -20, -18, -14, -22,
       -20, -14,   0])