In [1]:
import gym
import numpy as np
import matplotlib.pyplot as pl

In [2]:
# Shared Blackjack environment used by every cell below.
env = gym.make("Blackjack-v0")
# Size of the tabular state space:
# 32 player totals x 11 dealer cards x 2 values of the boolean flag.
# NOTE(review): MAXSTATES is not referenced by any visible cell - confirm it is still needed.
MAXSTATES = 32*11*2
# Learning parameters
GAMMA = 0.9   # discount factor applied to the bootstrapped next-state value
ALPHA = 0.01  # step size of the Q-value update

In [3]:
def max_dict(d):
    """Return the key holding the largest value of ``d`` together with that value.

    Args:
        d: Non-empty dictionary whose values can be compared with each other.

    Returns:
        Tuple ``(max_key, max_value)``. When several keys share the largest
        value, the first one in iteration order is returned.

    Raises:
        ValueError: If ``d`` is empty.
    """
    if not d:
        # An empty dictionary has no maximum; fail with an explicit message
        # rather than with an unbound local variable.
        raise ValueError("max_dict() requires a non-empty dictionary")
    # max() keeps the first maximal element it encounters, so ties resolve to
    # the earliest key, and a dictionary of all -inf values still yields a key.
    max_key = max(d, key=d.get)
    return max_key, d[max_key]

In [4]:
# Sanity check for 'max_dict': 'b' holds the largest value (43),
# so the expected result is ('b', 43).
d = {'a':1, 'b':43, 'c':8, 'd':23}
max_dict(d)

('b', 43)

In [5]:
def get_state_as_string(state):
    """Encode an observation tuple as a fixed-width string key.

    Each component of ``state`` is converted to an integer (booleans become
    0 or 1), left-padded with zeros to at least two characters, and the
    pieces are concatenated in order, e.g. ``(20, 8, False) -> '200800'``.

    The observation is expected to hold, in order:
    1. the player's current total,
    2. the dealer's face-up card,
    3. a boolean flag (Gym documents it as "usable ace" - confirm against
       the installed environment).
    """
    pieces = []
    for component in state:
        digits = str(int(component))
        pieces.append(digits.zfill(2))
    return "".join(pieces)
        

In [8]:
# Sanity check for 'get_state_as_string': reset the environment, show the
# raw observation tuple, then display its zero-padded string encoding.
observation = env.reset()
print(observation)
get_state_as_string(observation)

(20, 8, False)


'200800'

In [17]:
# Sanity check for 'get_all_states_as_string': show a small slice of the
# generated state keys.
# NOTE(review): 'get_all_states_as_string' is not defined in any visible cell
# and the execution counts jump from In [8] to In [17] - confirm the defining
# cell still exists, otherwise this fails on Restart & Run All.
states = get_all_states_as_string()
states[200:205]

['090101', '090100', '090201', '090200', '090301']

In [18]:
def initialize_Q():
    """Build a zero-initialised state-action value table.

    Returns:
        Nested dict ``Q[state][action] -> 0`` with one outer entry per key
        returned by ``get_all_states_as_string()`` and one inner entry per
        action index in ``range(env.action_space.n)``.
    """
    state_keys = get_all_states_as_string()
    return {
        key: {action: 0 for action in range(env.action_space.n)}
        for key in state_keys
    }

In [21]:
# Sanity check for 'initialize_Q': every state key should map to a dict
# holding one zero-initialised entry per action.
Q = initialize_Q()
print(Q['030100'], Q['300100'])

{0: 0, 1: 0} {0: 0, 1: 0}


In [118]:
# Scratch check that a Discrete space exposes its size through '.n'.
# NOTE(review): execution count In [118] is out of sequence and 'dd' is not
# used by any visible cell - this looks like leftover exploration; consider
# removing it or moving the import to the first cell.
from gym import spaces
dd = spaces.Discrete(10)
dd.n

10

In [22]:
def play_one_game(Q, eps=0.5):
    """Play one Blackjack episode and update ``Q`` in place with Q-learning.

    Args:
        Q: Nested dict ``Q[state][action] -> value`` keyed by the strings
            produced by ``get_state_as_string``. Modified in place.
        eps: Probability of taking a randomly sampled action instead of the
            greedy one (epsilon-greedy exploration).

    Returns:
        Sum of the rewards collected during the episode.
    """
    observation = env.reset()
    done = False
    state = get_state_as_string(observation)
    total_reward = 0

    while not done:
        # Epsilon-greedy behaviour policy.
        if np.random.uniform() < eps:
            act = env.action_space.sample()
        else:
            act = max_dict(Q[state])[0]

        observation, reward, done, _ = env.step(act)
        total_reward += reward
        state_new = get_state_as_string(observation)

        if done:
            # A terminal transition has no future return, so the target is
            # the reward alone. Bootstrapping here would be wrong: the final
            # observation may encode to a key that a live state also uses,
            # which would leak that state's value into the target.
            target = reward
        else:
            target = reward + GAMMA*max_dict(Q[state_new])[1]
        Q[state][act] += ALPHA*(target - Q[state][act])

        # The next action is re-selected at the top of the loop, so only the
        # state needs to be carried forward.
        state = state_new
    return total_reward

In [26]:
def play_many_games(N=1000, Q=None):
    """Run ``N`` training episodes with a decaying exploration rate.

    Args:
        N: Number of episodes to play.
        Q: Optional state-action value table to train in place. When omitted
            a fresh table is created with ``initialize_Q()``; pass one in to
            keep the learned values after the call returns.

    Returns:
        List with the total reward of each episode, in play order.
    """
    if Q is None:
        Q = initialize_Q()
    reward = []
    for n in range(N):
        # Exploration decays as 1/sqrt(episode number), starting at 1.0.
        eps = 1.0/np.sqrt(n+1)

        episode_reward = play_one_game(Q, eps)

        if n % 100 == 0:
            # Three decimals: with zero decimals every value below 0.5
            # would be printed as 0.
            print(n, '%.3f' % eps, episode_reward)
        reward.append(episode_reward)
    return reward

In [27]:
# Train for the default number of episodes (N=1000).
# NOTE(review): the bare call echoes the entire per-episode reward list as the
# cell output; assign the result or display a summary instead.
play_many_games()

0 1 1.0
100 0 -1
200 0 -1.0
300 0 1.0
400 0 1.0
500 0 1.0
600 0 -1
700 0 -1.0
800 0 -1
900 0 0.0


[1.0,
 -1.0,
 -1.0,
 -1.0,
 1.0,
 -1.0,
 -1.0,
 -1.0,
 0.0,
 -1.0,
 1.0,
 -1.0,
 -1.0,
 -1.0,
 -1,
 -1.0,
 1.0,
 -1.0,
 -1.0,
 1.0,
 1.0,
 1.0,
 -1.0,
 1.0,
 -1.0,
 -1.0,
 1.0,
 -1,
 -1.0,
 0.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 1.0,
 -1.0,
 1.0,
 0.0,
 -1,
 -1,
 1.0,
 -1.0,
 1.0,
 -1.0,
 1.0,
 -1.0,
 1.0,
 -1.0,
 -1.0,
 1.0,
 -1,
 -1.0,
 1.0,
 -1.0,
 -1.0,
 0.0,
 -1.0,
 -1,
 1.0,
 -1,
 -1.0,
 -1.0,
 1.0,
 1.0,
 -1,
 1.0,
 0.0,
 -1,
 -1.0,
 -1.0,
 1.0,
 1.0,
 1.0,
 -1.0,
 1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 1.0,
 1.0,
 -1.0,
 -1.0,
 -1.0,
 -1,
 -1,
 -1.0,
 -1.0,
 1.0,
 1.0,
 -1.0,
 -1.0,
 1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 1.0,
 1.0,
 1.0,
 -1,
 -1,
 -1.0,
 -1.0,
 1.0,
 1.0,
 1.0,
 -1.0,
 -1.0,
 -1.0,
 1.0,
 -1.0,
 1.0,
 1.0,
 1.0,
 -1.0,
 -1.0,
 -1,
 -1.0,
 -1.0,
 -1.0,
 1.0,
 -1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 1.0,
 -1.0,
 1.0,
 -1.0,
 -1.0,
 -1.0,
 1.0,
 -1.0,
 -1,
 -1.0,
 1.0,
 1.0,
 -1,
 -1,
 1.0,
 -1.0,
 0.0,
 1.0,
 -1,
 -1.0,
 1.0,
 -1.0,
 -1.0,
 -1.0,
 1.0,
 -1,
 0.0,
 0.0,
 1.0,
 0.