In [2]:
from blackjack import BlackJack
import numpy as np
import random
import matplotlib.pyplot as plt
import pandas as pd

from scipy.special import softmax

pd.set_option('display.max_rows', 180)

In [3]:
def choose_action(state, pi, epsilon):
    
    if random.random() < epsilon:
        return "hit" if random.random() > .5 else "stand"
    
    return pi[state]

In [4]:
game = BlackJack()
epochs = 1_000_000
alpha = .1

Q = {state : {"hit" : 0, "stand" : 0} for state in game.states}
pi = {state : "hit" if random.random() > .5 else "stand" for state in game.states}
N = {state : 0 for state in game.states}

for _ in range(epochs):
    
    game.start()
    
    visited_states = []
    performed_actions = []
    
    while game.player_hand_sum < 21: # while the player hasn't busted
        
        state = game.get_current_state()
        
        action = choose_action(state, pi, 0.05)
        
        visited_states.append(state)
        performed_actions.append(action)
        
        if action == "hit":
            game.hit()
        else:
            game.stand() # if we stand, the round ends
            break
    
    reward = game.get_reward()
    
    for s, a in zip(visited_states, performed_actions):
        
        N[s] += 1 # count occurences of states
        
        # update the Q-value
        #Q[s][a] += (reward - Q[s][a]) / N[s]
        Q[s][a] += alpha * (reward - Q[s][a])
        
    for s in visited_states:
        
        # update the policy
        pi[s] = max(Q[s], key = Q[s].get)
    

Object created


In [5]:
game.wins, game.losses, game.draws, game.blackjacks

(394189, 522960, 82851, 50575)

In [9]:
game.wins + game.losses + game.draws

1000000

In [10]:
sums = [a for (a,_,_) in pi]
ace = [b for (_,b,_) in pi]
card = [c for (_,_,c) in pi]
ps = np.array(list((map(lambda x: list(softmax(x)), [list(qs.values()) for qs in list(Q.values())]))))

df = pd.DataFrame({"sum" : sums, "ace" : ace, "card" : card, 
                   "action" : pi.values(), "n" : N.values(), "p_hit" : ps[:, 0], "p_stand" : ps[:, 1]})

In [11]:
df

Unnamed: 0,sum,ace,card,action,n,p_hit,p_stand
0,16,False,1,hit,9068,0.536322,0.463678
1,13,True,2,stand,1385,0.442587,0.557413
2,18,True,6,stand,1457,0.406759,0.593241
3,20,True,7,stand,2087,0.369725,0.630275
4,19,False,9,stand,7101,0.267845,0.732155
5,15,True,5,stand,1495,0.28082,0.71918
6,17,True,7,stand,2355,0.339328,0.660672
7,17,False,9,stand,8395,0.402859,0.597141
8,12,False,7,hit,8279,0.619072,0.380928
9,18,False,7,stand,8678,0.35044,0.64956


In [32]:
game = BlackJack()
test_episodes = 100_000
rewards = [0]*test_episodes

for i in range(test_episodes):
    
    game.start()
    
    while game.player_hand_sum < 21: # while the player hasn't busted
        
        state = game.get_current_state()
        
        row = df[(df["sum"] == state[0]) & (df["ace"] == state[1]) & (df["card"] == state[2])]
        
        #action = np.random.choice(["hit", "stand"], p = row.to_numpy()[0][-2:].astype("float64"))
        action = row["action"].values[0]
        
        if action == "hit":
            game.hit()
        else:
            game.stand() # if we stand, the round ends
            break
    rewards[i] = game.get_reward()

Object created


In [34]:
(game.wins + game.blackjacks) / (game.wins + game.blackjacks + game.losses + game.draws)

0.4279138770541692

In [35]:
game.losses / (game.wins + game.blackjacks + game.losses + game.draws)

0.48748478393183203

In [36]:
game.draws / (game.wins + game.blackjacks + game.losses + game.draws)

0.08460133901399879