In [1]:
# Generates Fig. 5.1 of "Reinforcement Learning: An Introduction" by Richard S. Sutton and Andrew G. Barto.
# Ch. 5, Monte Carlo prediction; algorithm: first-visit MC prediction.
# The OpenAI Gym API is used for the blackjack environment.
import gym
import numpy as np
import matplotlib.pyplot as pl

In [2]:
# Module-level blackjack environment; MC_stateValue() reads this global
# to reset and step through episodes.
env = gym.make('Blackjack-v0')

In [3]:
def get_state_as_string(state):
    """Encode an observation as a string of zero-padded two-digit fields.

    The observation is expected to be a tuple of
    1. the player's current card total,
    2. the dealer's face-up card,
    3. a boolean flag (Gym's Blackjack environment documents this as
       whether the player holds a usable ace).

    Each element is converted to an int and padded to at least two digits,
    e.g. (20, 8, False) --> '200800'.
    """
    fields = []
    for component in state:
        # int() also maps the boolean flag to 0 / 1 before padding.
        fields.append(format(int(component), "02d"))
    return "".join(fields)

In [4]:
def get_all_states_as_string():
    """Enumerate the string key of every state the tables should hold.

    The key is three zero-padded two-digit fields: the first runs over
    0..31, the second over 0..10, and the third is the boolean flag
    written as 00 or 01.
    ex: (20, 8, False) --> 200800

    Returns a new list of 32 * 11 * 2 keys, ordered with the first field
    varying slowest and the flag varying fastest.
    """
    return [
        "{:02d}{:02d}{:02d}".format(total, card, flag)
        for total in range(32)
        for card in range(11)
        for flag in (0, 1)  # False, then True
    ]

In [5]:
# Build the zeroed per-state tables
def initialize_Q():
    """Create the zero-initialised per-state tables.

    Returns a pair of independent dictionaries, both keyed by every string
    produced by get_all_states_as_string():
    - the first holds the state value (accumulator), starting at 0;
    - the second holds the number of occurrences of the state, starting at 0.
    """
    state_keys = get_all_states_as_string()
    value_table = dict.fromkeys(state_keys, 0)
    occurrence_table = dict.fromkeys(state_keys, 0)
    return value_table, occurrence_table

In [6]:
# Monte-Carlo simulation
def MC_stateValue(n_episodes=100000, stick_threshold=20):
    """Accumulate first-visit Monte-Carlo returns for a fixed blackjack policy.

    Policy: stick (action 0) when the player sum is at least
    ``stick_threshold`` (20 by default, i.e. stick only on 20 or 21),
    otherwise hit (action 1).

    Every state the player actually acts from is credited with the
    undiscounted return that follows its first visit in the episode.
    Crediting the post-step observation with that step's immediate reward
    would skip the starting state and give intermediate states a return of 0.

    Parameters
    ----------
    n_episodes : int
        Number of episodes to simulate.
    stick_threshold : int
        Smallest player sum on which the policy sticks.

    Returns
    -------
    (Q, Cnt) : tuple of dict
        ``Q[state]`` is the sum of first-visit returns observed for the
        state and ``Cnt[state]`` the number of episodes in which it was
        visited; ``Q[state] / Cnt[state]`` is the value estimate. Keys are
        the strings produced by get_state_as_string().

    Relies on the module-level ``env`` and the legacy Gym API
    (``reset()`` returns the observation, ``step()`` returns a 4-tuple).
    """
    Q, Cnt = initialize_Q()
    for _ in range(n_episodes):
        observation = env.reset()
        done = False
        trajectory = []  # (state acted from, reward received for that action)
        while not done:
            # Record the state *before* acting: that is the state whose
            # value the following rewards contribute to.
            state = get_state_as_string(observation)
            action = 0 if observation[0] >= stick_threshold else 1
            observation, reward, done, _info = env.step(action)
            trajectory.append((state, reward))

        # Walk the episode backwards accumulating the return (gamma = 1).
        # Later overwrites come from earlier time steps, so each state ends
        # up mapped to the return following its FIRST visit.
        episode_return = 0
        first_visit_return = {}
        for state, reward in reversed(trajectory):
            episode_return += reward
            first_visit_return[state] = episode_return

        for state, state_return in first_visit_return.items():
            Q[state] += state_return
            Cnt[state] += 1
    return Q, Cnt


In [7]:
# Run the simulation once: Q holds the per-state accumulated totals and Cnt the
# per-state counts; they are turned into averages in a later cell.
Q, Cnt = MC_stateValue()

In [8]:
# Turn the accumulated totals into per-state averages.
# States that were never recorded keep the value 0 (avoids dividing by zero).
QQ, _ = initialize_Q()
for state in QQ:
    QQ[state] = 0 if Cnt[state] == 0 else Q[state] / Cnt[state]
QQ

{'000000': 0,
 '000001': 0,
 '000100': 0,
 '000101': 0,
 '000200': 0,
 '000201': 0,
 '000300': 0,
 '000301': 0,
 '000400': 0,
 '000401': 0,
 '000500': 0,
 '000501': 0,
 '000600': 0,
 '000601': 0,
 '000700': 0,
 '000701': 0,
 '000800': 0,
 '000801': 0,
 '000900': 0,
 '000901': 0,
 '001000': 0,
 '001001': 0,
 '010000': 0,
 '010001': 0,
 '010100': 0,
 '010101': 0,
 '010200': 0,
 '010201': 0,
 '010300': 0,
 '010301': 0,
 '010400': 0,
 '010401': 0,
 '010500': 0,
 '010501': 0,
 '010600': 0,
 '010601': 0,
 '010700': 0,
 '010701': 0,
 '010800': 0,
 '010801': 0,
 '010900': 0,
 '010901': 0,
 '011000': 0,
 '011001': 0,
 '020000': 0,
 '020001': 0,
 '020100': 0,
 '020101': 0,
 '020200': 0,
 '020201': 0,
 '020300': 0,
 '020301': 0,
 '020400': 0,
 '020401': 0,
 '020500': 0,
 '020501': 0,
 '020600': 0,
 '020601': 0,
 '020700': 0,
 '020701': 0,
 '020800': 0,
 '020801': 0,
 '020900': 0,
 '020901': 0,
 '021000': 0,
 '021001': 0,
 '030000': 0,
 '030001': 0,
 '030100': 0,
 '030101': 0,
 '030200': 0,
 '0302