In [1]:
%matplotlib inline

import gym
import matplotlib
import numpy as np
import sys
from IPython.core.debugger import Tracer

from collections import defaultdict

if "../" not in sys.path:
    sys.path.append("../") 
from lib.envs.blackjack import BlackjackEnv
from lib import plotting

matplotlib.style.use('ggplot')

In [2]:
env = BlackjackEnv()

In [84]:
def mc_prediction(policy, env, num_episodes, discount_factor=1.0):
    """
    Monte Carlo prediction algorithm. Calculates the value function
    for a given policy using sampling.
    
    Args:
        policy: A function that maps an observation to action probabilities.
        env: OpenAI gym environment.
        num_episodes: Nubmer of episodes to sample.
        discount_factor: Lambda discount factor.
    
    Returns:
        A dictionary that maps from state -> value.
        The state is a tuple and the value is a float.
    """

    # Keeps track of sum and count of returns for each state
    # to calculate an average. We could use an array to save all
    # returns (like in the book) but that's memory inefficient.
    returns_sum = defaultdict(float)
    returns_count = defaultdict(float)
    
    # The final value function
    V = defaultdict(float)
    
    for i_episode in range(num_episodes):
        
        # Run an episode
        episode = []
        state = env.reset()
        for t in range(100):
            action = policy(state)
            n_state, reward, done, info = env.step(action)
            episode.append((state, action, reward))
            if done:
                break
            state = n_state
    
        states = set(i[0] for i in episode)
        
        for state in states:
            
            first_visit_index = next(i for i, x in enumerate(episode) if x[0]==state)
            
    
    return V    

In [85]:
def sample_policy(observation):
    """
    A policy that sticks if the player score is > 20 and hits otherwise.
    """
    score, dealer_score, usable_ace = observation
#     return np.array([1.0, 0.0]) if score >= 20 else np.array([0.0, 1.0])
    return 0 if score >= 20 else 1

In [86]:
V_10k = mc_prediction(sample_policy, env, num_episodes=10000)
plotting.plot_value_function(V_10k, title="10,000 Steps")

V_500k = mc_prediction(sample_policy, env, num_episodes=500000)
plotting.plot_value_function(V_500k, title="500,000 Steps")

0
0
0
1
0
0
0
0
1
0
2
3
0
1
0
1
0
0
0
0
0
0
0
1
2
0
0
1
0
0
0
0
1
0
0
1
0
1
0
2
0
1
0
1
0
0
0
0
1
0
2
0
1
3
0
0
1
0
0
1
2
0
2
1
0
0
0
0
2
1
2
0
1
0
0
1
0
0
1
0
0
0
0
0
1
0
1
0
0
0
0
0
0
0
0
0
3
4
1
0
2
0
2
1
0
0
1
1
0
1
0
2
1
0
3
0
0
0
0
2
1
0
2
1
0
1
0
0
1
0
0
0
1
0
2
0
0
2
1
0
1
0
2
0
1
1
2
0
0
0
0
0
1
0
1
0
0
1
2
0
1
0
0
0
0
1
1
0
2
0
0
0
0
0
0
2
1
0
1
1
2
0
0
0
2
1
0
1
1
0
0
1
0
1
0
2
0
1
0
0
0
1
3
2
0
1
0
1
0
0
0
1
0
1
0
0
0
0
0
0
0
1
2
0
1
0
2
0
0
0
0
1
0
1
0
1
0
0
2
0
1
0
0
0
1
0
0
1
2
1
0
0
1
0
0
1
0
1
3
0
2
0
2
1
0
0
0
0
1
1
0
0
0
0
1
2
1
0
0
0
1
0
1
0
0
1
2
0
0
0
1
0
2
0
0
0
0
0
0
1
2
1
0
0
0
1
0
1
0
1
0
0
1
0
0
0
0
0
0
1
0
0
0
0
0
1
0
0
0
1
0
1
0
1
0
0
0
1
0
0
1
0
1
1
2
0
0
2
1
2
0
1
0
0
0
1
1
0
0
0
1
0
0
0
1
0
0
1
0
0
0
1
0
0
0
1
0
0
0
0
0
0
0
1
0
0
0
1
0
0
0
0
1
0
1
0
3
2
0
0
0
0
0
0
0
0
1
0
0
0
0
0
1
0
0
2
3
1
0
0
2
1
1
0
0
0
1
0
1
0
0
1
0
0
0
2
0
1
0
0
0
0
0
0
1
1
0
0
0
1
0
1
0
1
0
1
0
1
2
0
0
0
0
1
0
0
1
0
3
2
1
0
0
0
1
0
1
0
1
0
3
1
0
2
0
1
0
0
0
1
1
0
0
1
1
0
1
0
2
0


0
0
0
0
1
1
0
2
0
1
0
0
0
0
0
1
2
0
1
0
0
1
0
0
2
1
0
0
0
0
0
0
0
1
0
1
2
0
0
0
0
0
2
1
0
1
0
1
0
1
0
0
0
0
1
2
0
0
0
1
0
0
1
0
1
2
0
1
0
0
0
0
2
0
3
1
1
2
0
0
0
0
1
0
0
1
0
0
0
0
0
0
0
1
0
0
0
1
0
0
0
0
0
0
1
0
0
0
1
0
0
1
3
2
0
1
0
0
0
0
0
1
0
0
0
0
1
0
1
0
0
1
1
0
0
1
0
1
0
0
0
0
0
0
1
0
0
1
0
0
1
3
0
2
1
0
0
0
0
2
1
0
0
1
0
1
0
0
1
0
1
0
2
2
0
1
1
0
1
0
0
0
2
1
0
0
0
1
0
1
0
0
1
2
3
0
0
1
0
0
0
1
1
0
0
0
0
0
0
0
0
0
1
0
1
0
1
0
1
0
0
1
0
1
1
0
0
1
0
0
2
1
0
0
0
1
0
1
0
0
1
2
0
0
0
0
0
1
0
0
1
0
2
0
0
0
0
0
0
0
0
0
1
0
0
0
0
1
0
0
0
1
1
0
0
0
0
2
1
0
0
0
1
0
1
0
0
0
0
0
1
1
0
0
0
0
1
0
0
0
1
0
1
0
2
2
1
0
0
1
0
0
0
0
1
1
0
0
2
0
1
1
0
0
1
2
0
1
0
0
0
0
2
1
0
0
1
0
0
0
0
0
1
0
0
1
0
0
0
0
0
0
0
1
0
0
0
0
0
0
1
2
0
0
1
2
0
2
1
0
0
1
3
2
0
1
0
1
0
0
1
0
0
0
0
1
0
0
0
0
0
0
1
0
0
0
1
2
3
0
0
0
0
2
1
0
0
0
0
0
0
0
0
0
0
0
0
1
0
2
0
0
1
0
2
0
0
1
0
1
0
0
0
0
1
0
2
1
0
0
1
0
0
0
1
0
2
0
1
0
1
0
0
1
0
0
2
1
0
0
0
0
2
1
1
0
0
3
2
1
0
4
0
0
0
0
0
0
1
0
2
0
1
0
0
0
0
1
0
0
1
0
0
0
1
0
0
1
0
1


0
1
0
0
0
1
1
0
0
2
1
0
0
0
1
0
1
0
0
0
0
0
0
1
2
0
0
0
0
0
0
1
0
0
0
1
0
0
0
1
2
0
0
0
1
0
0
1
0
0
1
0
0
0
1
0
1
0
0
1
0
0
0
1
0
0
0
0
1
0
0
0
1
0
1
0
0
0
1
0
0
0
0
1
0
1
2
0
0
0
1
0
0
0
0
2
1
0
0
1
0
1
0
2
0
0
1
0
0
1
0
0
0
0
1
0
1
3
2
0
0
0
1
2
0
0
0
0
0
1
2
1
0
1
0
0
1
1
0
0
1
0
0
0
1
1
0
2
3
0
1
0
0
0
1
2
0
0
1
0
0
0
0
0
0
0
0
1
1
0
0
0
0
0
0
1
0
0
0
0
1
2
3
0
0
0
1
0
0
0
0
1
0
2
0
0
0
0
0
0
1
0
1
0
2
0
1
0
1
0
2
1
0
0
0
1
0
2
1
0
0
0
0
0
1
0
0
0
0
1
0
0
0
0
0
1
0
0
0
0
0
1
0
0
0
0
0
0
0
0
1
2
0
0
0
0
0
1
2
0
0
0
0
1
0
0
2
1
0
0
0
0
0
0
0
0
1
0
0
0
0
1
3
2
0
1
4
0
1
0
0
0
2
1
3
0
0
1
0
0
0
0
0
0
1
0
0
0
0
0
0
1
0
0
1
0
0
0
1
2
0
1
4
3
2
0
1
0
3
1
0
2
1
0
0
0
1
0
0
0
0
0
0
1
0
1
0
0
0
2
1
0
1
0
1
0
0
0
1
0
0
0
1
2
0
0
0
0
0
0
0
1
0
1
0
0
0
1
2
0
0
0
0
1
0
0
1
0
0
0
0
0
1
0
0
0
1
1
0
1
0
0
1
0
2
1
0
0
0
1
0
1
0
2
3
1
0
0
1
2
0
3
0
0
1
0
0
0
1
0
1
1
0
2
1
0
0
0
0
0
0
0
1
0
0
0
1
0
1
0
1
2
0
0
1
0
0
0
0
1
0
0
0
0
0
2
1
0
1
0
0
0
1
0
1
0
0
1
0
0
0
1
0
1
2
0
0
0
0
1
2
0
2
0
3
1
0
0
0
0


0
0
0
0
0
0
1
0
0
0
1
0
0
1
1
0
0
0
0
0
0
0
2
1
0
0
0
1
0
0
1
0
0
0
1
0
0
0
1
0
0
2
3
0
1
1
0
0
0
0
1
0
0
2
1
0
0
0
0
0
0
0
0
0
0
1
0
1
2
0
2
0
1
0
1
2
0
3
0
1
2
3
0
1
1
0
0
0
0
1
2
0
0
0
0
0
2
0
3
1
0
1
1
0
0
1
0
0
2
1
0
0
1
0
0
1
0
0
1
2
1
0
0
0
1
0
0
1
0
0
0
1
2
0
2
1
0
0
0
0
0
1
1
0
1
0
0
0
0
0
0
0
1
0
0
0
0
0
2
3
0
1
2
0
1
0
0
0
1
0
0
0
0
0
1
0
0
1
0
0
1
0
0
2
3
1


ValueError: min() arg is an empty sequence