In [None]:
%matplotlib inline

import gym
import matplotlib
import numpy as np
import sys

from collections import defaultdict

# Add the parent directory to the import path (only once, so re-running this
# cell does not append duplicates) so the local `lib` package imported
# below can be resolved.
if "../" not in sys.path:
  sys.path.append("../") 
from lib.envs.blackjack import BlackjackEnv
from lib import plotting

# Use matplotlib's 'ggplot' style for the figures in this notebook.
matplotlib.style.use('ggplot')

In [None]:
# Instantiate the Blackjack environment (imported from lib.envs.blackjack);
# this single instance is passed to mc_prediction in the cells below.
env = BlackjackEnv()

In [None]:
def mc_prediction(policy, env, num_episodes, discount_factor=1.0, max_episode_length=50):
    """
    First-visit Monte Carlo prediction algorithm. Estimates the value
    function of a given policy by averaging sampled returns.
    
    Args:
        policy: A function that maps an observation to the action to take.
            Its return value is passed directly to env.step().
        env: Environment exposing reset() -> state and
            step(action) -> (next_state, reward, done, info).
            States are used as dictionary keys, so they must be hashable.
        num_episodes: Number of episodes to sample.
        discount_factor: Gamma discount factor.
        max_episode_length: Maximum number of steps sampled per episode.
            An episode that has not terminated after this many steps is
            truncated and its partial return is used.
    
    Returns:
        A dictionary that maps from state -> value.
        The state is whatever the environment returns as an observation
        (a tuple for Blackjack) and the value is a float. Only states that
        were visited in at least one sampled episode are present.
    """

    # Keeps track of sum and count of returns for each state
    # to calculate an average. We could use an array to save all
    # returns (like in the book) but that's memory inefficient.
    returns_sum = defaultdict(float)
    returns_count = defaultdict(float)
    
    # The final value function
    V = defaultdict(float)
    
    for episode_index in range(num_episodes):
        # Print out which episode we're on, useful for debugging.
        # Report the 1-based episode number so the message reads
        # "Episode 1000/N" rather than "Episode 999/N".
        episode_number = episode_index + 1
        if episode_number % 1000 == 0:
            print("\rEpisode {}/{}.".format(episode_number, num_episodes), end="")
            sys.stdout.flush()
        
        # Generate an episode as a list of (state, reward) pairs, where the
        # reward is the one received after acting in that state.
        episode = []
        state = env.reset()
        for _ in range(max_episode_length):
            action = policy(state)
            next_state, reward, done, _ = env.step(action)
            episode.append((state, reward))
            if done:
                break
            state = next_state
        
        # Walk the episode backwards, accumulating the discounted return
        # G_t = r_t + gamma * G_{t+1}. Because later time steps are processed
        # first, the last write for a state comes from its earliest
        # occurrence, i.e. its first-visit return.
        first_visit_return = {}
        G = 0.0
        for visited_state, reward in reversed(episode):
            G = reward + discount_factor * G
            first_visit_return[visited_state] = G
        
        # Fold this episode's first-visit returns into the running averages.
        for visited_state, state_return in first_visit_return.items():
            returns_sum[visited_state] += state_return
            returns_count[visited_state] += 1.0
            V[visited_state] = returns_sum[visited_state] / returns_count[visited_state]
        
    return V

In [None]:
def sample_policy(observation):
    """
    A policy that sticks (action 0) if the player score is 20 or more
    and hits (action 1) otherwise.

    Args:
        observation: A 3-element sequence unpacked as
            (player score, dealer score, usable ace). Only the player
            score influences the decision.

    Returns:
        0 when the player score is >= 20, else 1.
    """
    player_total, _dealer_total, _has_usable_ace = observation
    if player_total >= 20:
        return 0
    return 1

In [None]:
# Estimate the value function of sample_policy from 10,000 sampled episodes
# and plot it.
# NOTE(review): the plot titles say "Steps", but the counts passed to
# mc_prediction are numbers of episodes — consider renaming the titles.
V_10k = mc_prediction(sample_policy, env, num_episodes=10000)
plotting.plot_value_function(V_10k, title="10,000 Steps")

# Repeat with 500,000 episodes to compare against the 10,000-episode estimate.
V_500k = mc_prediction(sample_policy, env, num_episodes=500000)
plotting.plot_value_function(V_500k, title="500,000 Steps")