In [1]:
# to generate Fig 5.1 in Reinforcement Learning by Andrew Barto and Richard S. Sutton
# Ch.5 Monte-Carlo prediction, algorithm: First-visit MC prediction
# gym api is used
import gym
import numpy as np
import matplotlib.pyplot as pl
# importing Black-Jack game API
env = gym.make('Blackjack-v0')

An observation in `Blackjack` consists of 
 1. player's sum 
 2. dealer's face-up card
 3. if player is natural or not

In [2]:
def max_dict(d):
    """ Return the maximum value and its key """
    max_v = float('-inf')
    for key, val in d.items():
        if val > max_v:
            max_v = val
            max_key = key
    return max_key, max_v

In [3]:
def get_state_as_string(state):
    """ Convert the given observation into string of form
    [][][][][][]
    """
    string_state = ''.join(str(int(e)).zfill(2) for e in state)
    return string_state

In [4]:
def get_all_states_as_string():
    """ Function to generate all possible states and assign a 
    string to each states 
    ex: (20, 8, False) --> 200800
    """
    states = []
    for i in range(1, 33):
        for j in range(1,12):
            for k in [False, True]:
                states.append(str(int(i)).zfill(2) + str(int(j)).zfill(2) + 
                              str(int(k)).zfill(2))
    return states

In [5]:
# initialize state value
def initialize_Q():
    """ Initialize the state-action value dictionary.
    """
    Q = {}
    
    all_states = get_all_states_as_string()
    for state in all_states:
        Q[state] = {}
        for action in range(env.action_space.n):
            Q[state][action] = 0
    return Q

In [6]:
def policy(observation, Q):
    """ Generate an action based on previous
    state. Stick if sum is 20 or 21
    """
    state = get_state_as_string(observation)
    action = max_dict(Q[state])
    return action

Implementing first visit Monte-Carlo for estimating state value $v_{\pi}(s)$ according to  policy $\pi$.
Since in BlackJack game, every state is visited only once, there is no need to check if it is a first visit.

In [7]:
# Monte-Carlo simulation 
def MC_stateValue():    
    Q = initialize_Q()
    for _ in range(1000000):
        # policy: stick just when the player sum is 20 or 21
        done = False
        # Exploring start:
        observation = env.reset()
        # generating new episode
        while not done:
            action, _ = policy(observation, Q)
            observation, reward, done, _ = env.step(action)
            state = get_state_as_string(observation)
            Q[state][action] += reward
    return Q


In [8]:
Q = MC_stateValue()

In [9]:
# derive the optimal policy from Q
# not natural
opt_policy = np.zeros((12, 12), dtype=int)
natural = False
for i in range(11, 22):  # player's sum
    for j in range(1, 12):  # dealer's card
        state = str(int(i)).zfill(2) + str(int(j)).zfill(2) + \
                str(int(natural)).zfill(2)
        opt_policy[i-11,j-1] = max_dict(Q[state])[0]

In [10]:
opt_policy

array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
       [1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0],
       [1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [11]:
# derive the optimal policy from Q
# natural
opt_policy = np.zeros((12, 12), dtype=int)
natural = True
for i in range(11, 22):  # player's sum
    for j in range(1, 12):  # dealer's card
        state = str(int(i)).zfill(2) + str(int(j)).zfill(2) + \
                str(int(natural)).zfill(2)
        opt_policy[i-11,j-1] = max_dict(Q[state])[0]
opt_policy

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
       [1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0],
       [1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [12]:
from mpl_toolkits.mplot3d import Axes3D