In [18]:
%matplotlib notebook

In [19]:
from blackjack import BlackJack
import numpy as np
import random
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import pandas as pd

from scipy.special import softmax

random.seed(1234)
np.random.seed(1234)

In [20]:
game = BlackJack()
game_test = BlackJack()
epochs = 500_000
test_epochs = 100_000
gamma = .9

actions = ["hit", "stand", "double"]

Q = {state : {a : 0 for a in actions} for state in game.states} # Q values
C = {state : {a : 0 for a in actions} for state in game.states}

pi = {state : random.choice(actions) for state in game.states} # policy
mu = {state : {a : p for (a,p) in zip(actions, softmax([random.random() for _ in actions]))} for state in game.states}

rewards = [0] * (epochs + 1)
stats = []

New BlackJack game created
New BlackJack game created


In [21]:
def choose_action(mu, state, actions):
    return np.random.choice(actions, p = list(mu[state].values()))

In [22]:
for i in range(epochs+1):
    
    if i % (epochs / 100) == 0:
        #print(i)
        stats.append(np.unique(game_test.test_policy(pi, test_epochs), return_counts = True)[1])
        
    
    #game.start()
    game.start_from_state(random.choice(game.states))
    
    visited_states = []
    performed_actions = []
    
    while game.player_hand_sum < 21:
        
        state = game.get_current_state()
        
        action = choose_action(mu, state, actions)
        
        visited_states.append(state)
        performed_actions.append(action)
        
        if action == "hit":
            game.hit()
        elif action == "stand":
            game.stand()
            break
        else:
            game.double()
            break
    
    G = game.get_reward()
    W = 1
    
    rewards[i] = G
    
    first = True
    for s, a in zip(visited_states[::-1], performed_actions[::-1]):
        
        if not first:
            G *= gamma
        first = False
            
        C[s][a] += W
        Q[s][a] += (W / C[s][a]) * (G - Q[s][a])
        pi[s] = max(Q[s], key = Q[s].get)
        W /= mu[s][a]

In [23]:
np.save("data/LearningRate/MonteCarloControlOffPolicyLR", stats)

In [24]:
rewards = game.test_policy(pi, 1_000_000, verbose = 1)

Winrate: 0.4295862101425401
Drawrate: 0.0749222492152511
Loserate: 0.4954915406422088
Ws: 474612; Ls: 547425; Draws: 82775
Profit: -84117
Wins after doubling: 41027; Losses after doubling: 52081


In [35]:
rewards_freq = np.unique(rewards, return_counts = True)[1]
np.save("data/FinalEvaluation/MonteCarloControlOffPolicyFinal", rewards_freq)

In [26]:
df = pd.DataFrame(list(pi.keys()), columns = ["player_hand_sum", "has_usable_ace", "dealer_visible_card"])
df["V"] = [Q[state][pi[state]] for state in pi]
df["action"] = pi.values()
df.head()

Unnamed: 0,player_hand_sum,has_usable_ace,dealer_visible_card,V,action
0,16,False,1,-0.655221,stand
1,13,True,2,-0.02988,double
2,18,True,6,0.391892,double
3,20,True,7,0.781348,stand
4,19,False,9,0.278045,stand


In [27]:
with_ace = df[df["has_usable_ace"]].drop("has_usable_ace", axis = "columns")
no_ace = df[~df["has_usable_ace"]].drop("has_usable_ace", axis = "columns")

with_ace['color'] = with_ace['action'].map({"hit" : "red", "stand" : "blue", "double" : "green"})
no_ace['color'] = no_ace['action'].map({"hit" : "red", "stand" : "blue", "double" : "green"})

In [28]:
ax = plt.axes(projection='3d')

ax.plot_trisurf(with_ace["player_hand_sum"], with_ace["dealer_visible_card"], with_ace["V"], cmap=plt.cm.Spectral)

ax.set_title("State-value function for states with a usable ace")
ax.set_xlabel("value of player's hand")
ax.set_ylabel("dealer's visible card")
ax.set_xticks(np.arange(12,21))
ax.set_yticks(np.arange(1,11))
ax.set_zlabel("V")
plt.show()

<IPython.core.display.Javascript object>

In [32]:
ax = plt.axes(projection='3d')

ax.plot_trisurf(no_ace["player_hand_sum"], no_ace["dealer_visible_card"], no_ace["V"], cmap=plt.cm.Spectral)

ax.set_title("State-value function for states without a usable ace")
ax.set_xlabel("value of player's hand")
ax.set_ylabel("dealer's visible card")
ax.set_xticks(np.arange(12,21))
ax.set_yticks(np.arange(1,11))
ax.set_zlabel("V")
plt.show()

<IPython.core.display.Javascript object>

In [33]:
plt.scatter(with_ace["player_hand_sum"], with_ace["dealer_visible_card"], 
            c = with_ace["color"], marker="s", s = 720)
plt.yticks(np.arange(1,11))
plt.xlabel("Value of player's hand")
plt.ylabel("Dealer's visible card")
plt.title("Learned policy for states with a usable ace")

legend_elements = [Line2D([0], [0], marker='s', color='w', label='hit', markerfacecolor='red', markersize=15),
                   Line2D([0], [0], marker='s', color='w', label='stand', markerfacecolor='blue', markersize=15),
                   Line2D([0], [0], marker='s', color='w', label='double', markerfacecolor='green', markersize=15)]

plt.legend(handles=legend_elements, title = "action", bbox_to_anchor=(1, 1))
plt.axis("scaled")
plt.tight_layout()
plt.xticks(np.arange(12,21))
plt.show()

<IPython.core.display.Javascript object>

In [34]:
plt.scatter(no_ace["player_hand_sum"], no_ace["dealer_visible_card"], 
            c = no_ace["color"], marker="s", s = 720)
plt.yticks(np.arange(1,11))
plt.xlabel("Value of player's hand")
plt.ylabel("Dealer's visible card")
plt.title("Learned policy for states without a usable ace")

legend_elements = [Line2D([0], [0], marker='s', color='w', label='hit', markerfacecolor='red', markersize=15),
                   Line2D([0], [0], marker='s', color='w', label='stand', markerfacecolor='blue', markersize=15),
                   Line2D([0], [0], marker='s', color='w', label='double', markerfacecolor='green', markersize=15)]

plt.legend(handles=legend_elements, title = "action", bbox_to_anchor=(1, 1))
plt.axis("scaled")
plt.tight_layout()
plt.xticks(np.arange(12,21))
plt.savefig("pic.pdf")
plt.show()

<IPython.core.display.Javascript object>