In [1]:
from blackjack import BlackJack
import numpy as np
import random
import matplotlib.pyplot as plt
import pandas as pd

from scipy.special import softmax

pd.set_option('display.max_rows', 180)

In [2]:
def choose_action(state, pi, epsilon, actions):
    
    if random.random() < epsilon:
        return np.random.choice(actions)
        
    return pi[state]

In [3]:
game = BlackJack()
epochs = 1_000_000
alpha = .1

actions = ["hit", "stand", "double"]

Q = {state : {a : 0 for a in actions} for state in game.states}
pi = {state : np.random.choice(actions) for state in game.states}
N = {state : 0 for state in game.states}
rewards = [0] * epochs

for i in range(epochs):
    
    if i % (epochs / 10) == 0:
        print(i)
    
    game.start()
    
    visited_states = []
    performed_actions = []
    
    while game.player_hand_sum < 21: # while the player hasn't busted
        
        state = game.get_current_state()
        
        action = choose_action(state, pi, 0.05, actions)
        
        visited_states.append(state)
        performed_actions.append(action)
        
        if action == "hit":
            game.hit()
        elif action == "stand":
            game.stand() # if we stand, the round ends
            break
        else:
            game.double()
            break
    
    reward = game.get_reward()
    rewards[i] = reward
    
    for s, a in zip(visited_states, performed_actions):
        
        N[s] += 1 # count occurences of states
        
        # update the Q-value
        #Q[s][a] += (reward - Q[s][a]) / N[s]
        Q[s][a] += alpha * (reward - Q[s][a])
        
    for s in visited_states:
        
        # update the policy
        pi[s] = max(Q[s], key = Q[s].get)
    

Object created
0
100000
200000
300000
400000
500000
600000
700000
800000
900000


In [4]:
def get_stats(game):
    wins = game.wins + game.blackjacks
    all_outcomes = wins + game.losses + game.draws
    print("Winrate: " + str(wins / all_outcomes))
    print("Drawrate: " + str(game.draws / all_outcomes))
    print("Loserate: " + str(game.losses / all_outcomes))
    print("BlackJack chance: " + str(game.blackjacks / all_outcomes))
    print("Ws: " + str(game.wins + game.blackjacks) + "; Ls: " + str(game.losses) + 
          "; Draws: " + str(game.draws) + "; BlackJacks: " + str(game.blackjacks))

In [5]:
get_stats(game)

Winrate: 0.42095737864733923
Drawrate: 0.07614036323685797
Loserate: 0.5029022581158028
BlackJack chance: 0.047971751417808066
Ws: 442169; Ls: 528243; Draws: 79977; BlackJacks: 50389


In [6]:
sum(rewards)

-149506

In [7]:
sums = [a for (a,_,_) in pi]
ace = [b for (_,b,_) in pi]
card = [c for (_,_,c) in pi]
ps = np.array(list((map(lambda x: list(softmax(x)), [list(qs.values()) for qs in list(Q.values())]))))

df = pd.DataFrame({"sum" : sums, "ace" : ace, "card" : card, 
                   "action" : pi.values(), "n" : N.values()})

In [8]:
df.sort_values(["sum", "action", "ace", "card"])

Unnamed: 0,sum,ace,card,action,n
144,12,False,1,hit,8564
34,12,False,2,hit,7526
118,12,False,6,hit,8308
8,12,False,7,hit,8286
146,12,False,10,hit,33624
79,12,True,1,hit,795
95,12,True,3,hit,897
124,12,True,7,hit,1098
139,12,True,9,hit,1129
160,12,False,3,stand,8167


In [9]:
game_test = BlackJack()
test_episodes = 1_000_000
rewards = [0]*test_episodes

for i in range(test_episodes):
    
    game_test.start()
    
    while game_test.player_hand_sum < 21: # while the player hasn't busted
        
        state = game_test.get_current_state()
        
        action = pi[state]
        
        if action == "hit":
            game_test.hit()
        elif action == "stand":
            game_test.stand() # if we stand, the round ends
            break
        elif action == "double":
            game_test.double()
            break
    rewards[i] = game_test.get_reward()

Object created


In [10]:
get_stats(game_test)

Winrate: 0.42810590398630255
Drawrate: 0.07579031368766076
Loserate: 0.4961037823260367
BlackJack chance: 0.047728785539952065
Ws: 449563; Ls: 520969; Draws: 79589; BlackJacks: 50121


In [12]:
len(list(filter(lambda x: x == 2, rewards))), len(list(filter(lambda x: x == -2, rewards)))

(7698, 12047)

In [13]:
sum(rewards)

-125876