# Monte Carlo Control without exploring starts

In [1]:
from blackjack import BlackJack
import numpy as np
import random
import matplotlib.pyplot as plt
import pandas as pd

pd.set_option('display.max_rows', 180)

In [2]:
# pi is defined to be a e-greedy policy.
# probability of exploiting: 1 - e; probability of exploring: e
def choose_action(pi, state, epsilon, actions):
    if random.random() < epsilon:
        return random.choice(actions)
    return pi[state]

In [3]:
# This is pretty much identical to MCC with ES. The difference is how we choose the actions.
game = BlackJack()
epochs = 1_000_000
alpha = .1
epsilon = .2

actions = ["hit", "stand", "double"]

Q = {state : {a : 0 for a in actions} for state in game.states}
pi = {state : random.choice(actions) for state in game.states}
N = {state : 0 for state in game.states}
rewards = [0] * epochs

for i in range(epochs):
    
    if i % (epochs / 10) == 0:
        print(i)
    
    #game.start()
    game.start_from_state(random.choice(game.states))
    
    first = True
    
    visited_states = []
    performed_actions = []
    
    while game.player_hand_sum < 21:
        
        state = game.get_current_state()
        
        action = choose_action(pi, state, epsilon, actions)
        
        if first:
            action = np.random.choice(actions)
            first = False
        
        visited_states.append(state)
        performed_actions.append(action)
        
        if action == "hit":
            game.hit()
        elif action == "stand":
            game.stand()
            break
        else:
            game.double()
            break
    
    reward = game.get_reward()
    rewards[i] = reward
    
    for s, a in zip(visited_states, performed_actions):
        
        N[s] += 1 # count occurences of states
        
        # update the Q-value
        Q[s][a] += (reward - Q[s][a]) / N[s]
        #Q[s][a] += alpha * (reward - Q[s][a])
        
    for s in visited_states:
        
        # update the policy
        pi[s] = max(Q[s], key = Q[s].get)
    

Object created
0
100000
200000
300000
400000
500000
600000
700000
800000
900000


In [4]:
game.test_policy(pi, 1_000_000)

Winrate: 0.4295282943744254
Drawrate: 0.0803124796397779
Loserate: 0.4901592259857967
Ws: 474670; Ls: 541673; Draws: 88753
Profit: -69128
Wins after doubling: 23141; Losses after doubling: 25097
