In [1]:
from blackjack import BlackJack
import numpy as np
import random

from scipy.special import softmax

random.seed(1234)
np.random.seed(1234)

In [2]:
game = BlackJack()
epochs = 1_000_000
alpha = .1

actions = ["hit", "stand", "double"]

Q = {state : {a : 0 for a in actions} for state in game.states} # Q values
C = {state : {a : 0 for a in actions} for state in game.states}

pi = {state : random.choice(actions) for state in game.states} # policy
mu = {state : {a : p for (a,p) in zip(actions, softmax([random.random() for _ in actions]))} for state in game.states}

rewards = [0] * epochs

New BlackJack game created


In [3]:
def choose_action(mu, state, actions):
    return np.random.choice(actions, p = list(mu[state].values()))

In [4]:
for i in range(epochs):
    
    if i % (epochs / 10) == 0:
        print(i)
        
    
    #game.start()
    game.start_from_state(random.choice(game.states))
    
    visited_states = []
    performed_actions = []
    
    while game.player_hand_sum < 21:
        
        state = game.get_current_state()
        
        action = choose_action(mu, state, actions)
        
        visited_states.append(state)
        performed_actions.append(action)
        
        if action == "hit":
            game.hit()
        elif action == "stand":
            game.stand()
            break
        else:
            game.double()
            break
    
    G = game.get_reward()
    W = 1
    
    rewards[i] = G
    
    for s, a in zip(visited_states[::-1], performed_actions[::-1]):
        C[s][a] += W
        Q[s][a] += (W / C[s][a]) * (G - Q[s][a])
        pi[s] = max(Q[s], key = Q[s].get)
        W /= mu[s][a]

0
100000
200000
300000
400000
500000
600000
700000
800000
900000


In [5]:
rewards = game.test_policy(pi, 1_000_000)

Winrate: 0.4270217096397896
Drawrate: 0.06543118372695574
Loserate: 0.5075471066332546
Ws: 471856; Ls: 560836; Draws: 72301
Profit: -103767
Wins after doubling: 39469; Losses after doubling: 54347


In [6]:
rewards_freq = np.unique(rewards, return_counts = True)[1]
np.save("MCCoff", rewards_freq)