In [2]:
import gym
import numpy as np
import random

In [22]:
def run_mc(env, num_episodes):
    '''
    observation_space[0] is the 18 possible player values. (3 through 20)
    observation_space[1] is the 10 possible dealer upcards. (2 through 11)

    Combining these together yields all possible states.

    Multiplying this with hit/stand yields all possible state/action pairs.

    This is the Q map.
    '''
    Q = np.zeros([env.observation_space[0].n * env.observation_space[1].n, env.action_space.n], dtype=np.float16)


    # This map contains the probability distributions for each action (hit or stand) given a state.
    # The state (combo of player hand value and dealer upcard value) index in this array yields a 2-element array
    # The 0th index of this 2-element array refers to the probability of "hit", and the 1st index is the probability of "stand"
    prob = np.zeros([env.observation_space[0].n * env.observation_space[1].n, env.action_space.n], dtype=np.float16) + 0.5

    # The learning rate. Very small to avoid making quick, large changes in our policy.
    alpha = 0.001

    epsilon = 1
    
    # The rate by which epsilon will decay over time.
    # Since the probability we take the option with the highest Q-value is 1-epsilon + probability,
    # this decay will make sure we are the taking the better option more often in the longrun.
    # This allows the algorithm to explore in the early stages, and exploit in the later stages.
    decay = 0.9999
    
    # The lowest value that epsilon can go to.
    # Although the decay seems slow, it actually grows exponentially, and this is magnified when
    # running thousands of episodes.
    epsilon_min = 0.9

    # may have to be tweaked later.
    gamma = 0.8
    
    for _ in range(num_episodes):
        episode = play_game(env, Q, prob)
        
        epsilon = max(epsilon * decay, epsilon_min)
        
        Q = update_Q(env, episode, Q, alpha, gamma)
        prob = update_prob(env, episode, Q, prob, epsilon)
        
    return Q, prob

In [30]:
def play_game(env, Q, prob):
    # Can contain numerous state->action->reward tuples because a round of 
    # Blackjack is not always resolved in one turn.
    # However, there will be no state that has a player hand value that exceeds 20, since only initial
    # states BEFORE actions are made are used when storing state->action->reward tuples.
    episode = []
    
    state = env.reset()
    done = False
    
    while done == False:
        if state[0] == 19: #Player was dealt Blackjack, player_value already subtracted by 2 to get state[0]
            # don't do any episode analysis for this episode. This is a useless episode.
            next_state, reward, done, info = env.step(1) # doesn't matter what action is taken.
        else:
            # Get the index in Q that corresponds to the current state
            Q_state_index = get_Q_state_index(state)
            
            # Use the index to get the possible actions, and use np.argmax()
            # to get the index of the action that has the highest current Q
            # value. Index 0 is hit, index 1 is stand.
            best_action = np.argmax(Q[Q_state_index])
            
            # Go to the prob table to retrieve the probability of this action.
            # This uses the same Q_state_index used for finding the state index
            # of the Q-array.
            prob_of_best_action = get_prob_of_best_action(env, state, Q, prob)

            action_to_take = None

            if random.uniform(0,1) < prob_of_best_action: # Take the best action
                action_to_take = best_action
            else: # Take the other action
                action_to_take = 1 if best_action == 0 else 0
            
            # The agent does the action, and we get the next state, the rewards,
            # and whether the game is now done.
            next_state, reward, done, info = env.step(action_to_take)
            
            # We now have a state->action->reward sequence we can log
            # in `episode`
            episode.append((state, action_to_take, reward))
            
            # update the state for the next decision made by the agent.
            state = next_state
        
    return episode

In [6]:
def update_Q(env, episode, Q, alpha, gamma):
    '''
    THIS IS WHERE THE ALGORITHM HINGES ON BEING FIRST VISIT OR EVERY VISIT.
    I AM GOING TO USE FIRST-VISIT, AND HERE'S WHY.
    
    If you want first-visit, you need to use the cumulative reward of the entire
    episode when updating a Q-value for ALL of the state/action pairs in the
    episode, even the first state/action pair. In this algorithm, an episode
    is a round of Blackjack. Although the bulk of the reward may come from the
    2nd or 3rd decision, deciding to hit on the 1st decision is what enabled
    the future situations to even occur, so it is important to include the
    entire cumulative reward. We can reduce the impact of the rewards of the
    future decisions by lowering gamma, which will lower the G value for our
    early state/action pair in which we hit and did not get any immediate rewards.
    This will make our agent consider future rewards, and not just look at 
    each state in isolation despite having hit previously.
     
    If you want Every-Visit MC, do not use the cumulative rewards when updating Q-values,
    and just use the immediate reward in this episode for each state/action pair.
    '''
    step = 0
    for state, action, reward in episode:
        # calculate the cumulative reward of taking this action in this state.
        # Start from the immediate rewards, and use all the rewards from the
        # subsequent states. Do not use rewards from previous states.
        total_reward = 0
        gamma_exp = 0
        for curr_step in range(step, len(episode)):
            curr_reward = episode[curr_step][2]
            total_reward += (gamma ** gamma_exp) * curr_reward
            gamma_exp += 1
        
        # Update the Q-value
        Q_state_index = get_Q_state_index(state)
        curr_Q_value = Q[Q_state_index][action]
        Q[Q_state_index][action] = curr_Q_value + alpha * (total_reward - curr_Q_value)
        
        # update step to start further down the episode next time.
        step += 1
        
        
    return Q

In [7]:
def update_prob(env, episode, Q, prob, epsilon):
    for state, action, reward in episode:
        # Update the probabilities of the actions that can be taken given the current
        # state. The goal is that the new update in Q has changed what the best action
        # is, and epsilon will be used to create a small increase in the probability
        # that the new, better action is chosen.
        prob = update_prob_of_best_action(env, state, Q, prob, epsilon)
        
    return prob

In [8]:
# Given a state, derive the corresponding index in the Q-array.
# The state is a player hand value + dealer upcard pair,
# so a "hashing" formula must be used to allocate the
# indices of the Q-array properly.
def get_Q_state_index(state):
    # the player value is already subtracted by 1 in the env when it returns the state.
    # subtract by 1 again to fit with the array indexing that starts at 0
    initial_player_value = state[0] - 1
    # the upcard value is already subtracted by 1 in the env when it returns the state.
    # dealer_upcard will be subtracted by 1 to fit with the array indexing that starts at 0
    dealer_upcard = state[1] - 1

    return (env.observation_space[1].n * (initial_player_value)) + (dealer_upcard)

In [9]:
def get_prob_of_best_action(env, state, Q, prob):
    # Use the mapping function to figure out which index of Q corresponds to 
    # the player hand value + dealer upcard value that defines each state.
    Q_state_index = get_Q_state_index(state)
    
    # Use this index in the Q 2-D array to get a 2-element array that yield
    # the current Q-values for hitting (index 0) and standing (index 1) in this state.
    # Use the np.argmax() function to find the index of the action that yields the
    # rewards i.e. the best action we are looking for.
    best_action = np.argmax(Q[Q_state_index])
    
    # Retrieve the probability of the best action using the 
    # state/action pair as indices for the `prob` array,
    # which stores the probability of taking an action (hit or stand)
    # for a given state/action pair.
    return prob[Q_state_index][best_action]
    
def update_prob_of_best_action(env, state, Q, prob, epsilon):

    Q_state_index = get_Q_state_index(state)
    
    best_action = np.argmax(Q[Q_state_index])
    
    # Slightly alter the probability of this best action being taken by using epsilon
    # Epsilon starts at 1.0, and slowly decays over time.
    # Therefore, as per the equation below, the AI agent will use the probability listed 
    # for the best action in the `prob` array during the beginning of the algorithm.
    # As time goes on, the likelihood that the best action is taken is increased from
    # what is listed in the `prob` array.
    # This allows for exploration of other moves in the beginning of the algorithm,
    # but exploitation later for a greater reward.
    #prob[Q_state_index][best_action] = prob[Q_state_index][best_action] + ((1 - epsilon) * (1 - prob[Q_state_index][best_action]))
    prob[Q_state_index][best_action] = min(1, prob[Q_state_index][best_action] + 1 - epsilon)
    
    other_action = 1 if best_action == 0 else 0
    prob[Q_state_index][other_action] = 1 - prob[Q_state_index][best_action]
    
    return prob

In [19]:
import time
env = gym.make('Blackjack-v0')

start_time = time.time()
new_Q, new_prob = run_mc(env, 100000)
end_time = time.time()

print("Total Time for Learning: " + str(end_time - start_time))

Total Time for Learning: 10.195332288742065


In [21]:
new_Q
#new_prob

array([[ 0.0000e+00,  0.0000e+00],
       [ 0.0000e+00,  0.0000e+00],
       [ 0.0000e+00,  0.0000e+00],
       [ 0.0000e+00,  0.0000e+00],
       [ 0.0000e+00,  0.0000e+00],
       [ 0.0000e+00,  0.0000e+00],
       [ 0.0000e+00,  0.0000e+00],
       [ 0.0000e+00,  0.0000e+00],
       [ 0.0000e+00,  0.0000e+00],
       [ 0.0000e+00,  0.0000e+00],
       [ 0.0000e+00,  0.0000e+00],
       [ 0.0000e+00,  0.0000e+00],
       [ 0.0000e+00,  0.0000e+00],
       [ 0.0000e+00,  0.0000e+00],
       [ 0.0000e+00,  0.0000e+00],
       [ 0.0000e+00,  0.0000e+00],
       [ 0.0000e+00,  0.0000e+00],
       [ 0.0000e+00,  0.0000e+00],
       [ 0.0000e+00,  0.0000e+00],
       [ 0.0000e+00,  0.0000e+00],
       [ 0.0000e+00,  0.0000e+00],
       [ 0.0000e+00,  0.0000e+00],
       [ 0.0000e+00,  0.0000e+00],
       [ 0.0000e+00,  0.0000e+00],
       [ 0.0000e+00,  0.0000e+00],
       [ 0.0000e+00,  0.0000e+00],
       [ 0.0000e+00,  0.0000e+00],
       [ 0.0000e+00,  0.0000e+00],
       [ 0.0000e+00,