In [17]:
from blackjack import BlackJack
import numpy as np
import random
import matplotlib.pyplot as plt
import pandas as pd

from scipy.special import softmax

pd.set_option('display.max_rows', 180)

In [2]:
def choose_action(state, pi, epsilon):
    
    if random.random() < epsilon:
        return "hit" if random.random() > .5 else "stand"
    
    return pi[state]

In [3]:
game = BlackJack()
epochs = 1_000_000
alpha = .1

Q = {state : {"hit" : 0, "stand" : 0} for state in game.states}
pi = {state : "hit" if random.random() > .5 else "stand" for state in game.states}
N = {state : 0 for state in game.states}

for _ in range(epochs):
    
    game.start()
    
    visited_states = []
    performed_actions = []
    
    while game.player_hand_sum < 21: # while the player hasn't busted
        
        state = game.get_current_state()
        
        action = choose_action(state, pi, 0.05)
        
        visited_states.append(state)
        performed_actions.append(action)
        
        if action == "hit":
            game.hit()
        else:
            game.stand() # if we stand, the round ends
            break
    
    reward = game.get_reward()
    
    for s, a in zip(visited_states, performed_actions):
        
        N[s] += 1 # count occurences of states
        
        # update the Q-value
        #Q[s][a] += (reward - Q[s][a]) / N[s]
        Q[s][a] += alpha * (reward - Q[s][a])
        
    for s in visited_states:
        
        # update the policy
        pi[s] = max(Q[s], key = Q[s].get)
    

Object created


In [7]:
def get_stats(game):
    wins = game.wins + game.blackjacks
    all_outcomes = wins + game.losses + game.draws
    print("Winrate: " + str(wins / all_outcomes))
    print("Drawrate: " + str(game.draws / all_outcomes))
    print("Loserate: " + str(game.losses / all_outcomes))
    print("BlackJack chance: " + str(game.blackjacks / all_outcomes))
    print("Ws: " + str(game.wins + game.blackjacks) + "; Ls: " + str(game.losses) + 
          "; Draws: " + str(game.draws) + "; BlackJacks: " + str(game.blackjacks))

In [8]:
get_stats(game)

Winrate: 0.42382297309775185
Drawrate: 0.07882479602706122
Loserate: 0.49735223087518693
BlackJack chance: 0.048078689623200985
Ws: 445229; Ls: 522472; Draws: 82806; BlackJacks: 50507


In [11]:
game_test = BlackJack()
test_episodes = 1_000_000
rewards = [0]*test_episodes

for i in range(test_episodes):
    
    game_test.start()
    
    while game_test.player_hand_sum < 21: # while the player hasn't busted
        
        state = game_test.get_current_state()
        
        pi[state]
        
        if action == "hit":
            game_test.hit()
        else:
            game_test.stand() # if we stand, the round ends
            break
    rewards[i] = game_test.get_reward()

Object created


In [16]:
sum(rewards)

-150678

In [12]:
get_stats(game_test)

Winrate: 0.42204269601529876
Drawrate: 0.06059723400184476
Loserate: 0.5173600699828564
BlackJack chance: 0.04811131005584731
Ws: 443374; Ls: 543509; Draws: 63660; BlackJacks: 50543


In [13]:
sums = [a for (a,_,_) in pi]
ace = [b for (_,b,_) in pi]
card = [c for (_,_,c) in pi]
ps = np.array(list((map(lambda x: list(softmax(x)), [list(qs.values()) for qs in list(Q.values())]))))

df = pd.DataFrame({"sum" : sums, "ace" : ace, "card" : card, 
                   "action" : pi.values(), "n" : N.values(), "p_hit" : ps[:, 0], "p_stand" : ps[:, 1]})

In [15]:
df.sort_values(["sum", "action", "ace", "card"])

Unnamed: 0,sum,ace,card,action,n,p_hit,p_stand
144,12,False,1,hit,8563,0.58073,0.41927
160,12,False,3,hit,8162,0.530266,0.469734
104,12,False,4,hit,8130,0.582108,0.417892
176,12,False,5,hit,8232,0.585954,0.414046
8,12,False,7,hit,8164,0.680249,0.319751
132,12,False,8,hit,8289,0.602685,0.397315
20,12,False,9,hit,8369,0.584205,0.415795
146,12,False,10,hit,33214,0.525016,0.474984
79,12,True,1,hit,845,0.563106,0.436894
154,12,True,2,hit,1011,0.581731,0.418269


In [34]:
(game.wins + game.blackjacks) / (game.wins + game.blackjacks + game.losses + game.draws)

0.4279138770541692

In [35]:
game.losses / (game.wins + game.blackjacks + game.losses + game.draws)

0.48748478393183203

In [36]:
game.draws / (game.wins + game.blackjacks + game.losses + game.draws)

0.08460133901399879