In [249]:
import random
from copy import deepcopy
import gym
import torch
from collections import namedtuple, defaultdict

# Single seed reused for every RNG that is seeded explicitly in this notebook.
seed = 1412343214

# Monte Carlo Methods
This notebook uses OpenAI Gym's `Blackjack-v0` environment to simulate the classic card game Blackjack and trains an agent to play it effectively using Monte Carlo methods.

In [250]:
env = gym.make('Blackjack-v0')
env.seed(seed)
# BlackJackPolicy samples its actions with the stdlib `random` module, so that
# generator has to be seeded as well or runs are not reproducible.
random.seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x10ddc5b70>

In [251]:
# Lightweight record types shared by the cells below.
# State: one observation, with fields hand / dealer / usable_ace.
State = namedtuple('State', 'hand dealer usable_ace')
# StateAction: a (state, action) pair.
StateAction = namedtuple('StateAction', 's a')
# StateActionReward: one step of an episode -- state, action taken, reward received.
StateActionReward = namedtuple('StateActionReward', 's a r')
# Outcome: the four values returned by env.step(), in order.
Outcome = namedtuple('Outcome', 'state reward is_over empty')

class BlackJackPolicy:
    """Stochastic two-action policy stored as a per-state probability table.

    ``self.p`` maps a state to a list of ``(action, probability)`` pairs.
    A state that has not been looked up before starts with the actions
    ``True`` and ``False`` at probability 0.5 each.
    """

    def __init__(self):
        def uniform_entry():
            # A fresh list per state, so no two states share storage.
            return [(True, 0.5), (False, 0.5)]

        self.p = defaultdict(uniform_entry)

    def action(self, state, greedy=False):
        """Choose an action for ``state``.

        With ``greedy=True`` the action carrying the highest stored
        probability is returned (the earliest entry wins ties). Otherwise an
        action is sampled according to the stored probabilities.
        """
        table = self.p[state]
        if not greedy:
            candidates = [act for act, _prob in table]
            weights = [prob for _act, prob in table]
            return random.choices(candidates, weights)[0]  # weighted draw
        likeliest = max(table, key=lambda pair: pair[1])
        return likeliest[0]

def episode(policy):
    """Play one game with ``policy`` and record every step taken.

    Uses the notebook-level ``env``.

    Args:
        policy: object exposing ``action(state)``; it is sampled exactly once
            per step.

    Returns:
        A list of ``StateActionReward`` tuples in the order the actions were
        taken. The list is empty when the opening state already has
        ``hand >= 21`` or ``dealer >= 21``, because no action is taken then.
    """
    # reset() returns the opening observation, so there is no need to reach
    # into the environment's private _get_obs().
    state = State(*env.reset())
    trajectory = []
    while True:
        if state.hand >= 21 or state.dealer >= 21:
            break
        # Sample once and reuse the result: the action stored in the
        # trajectory must be the very action sent to the environment.
        action = policy.action(state)
        outcome = Outcome(*env.step(action))
        trajectory.append(StateActionReward(state, action, outcome.reward))
        if outcome.is_over:
            break
        state = State(*outcome.state)  # observation returned by step()
    return trajectory


        

# On Policy First-Visit Monte Carlo Control (for epsilon-soft policies)
Estimates the optimal policy by repeatedly generating episodes from the starting state and improving the policy from the observed returns. "On-policy" means the policy being evaluated and improved is the same one used to generate the episodes.

In [252]:
def train(epsilon=0.1, episodes=10000):
    """Run on-policy Monte Carlo control and return the learned policy.

    Args:
        epsilon: exploration rate. After each episode, every visited state
            gives its best-known action probability
            ``1 - epsilon + epsilon / 2`` and the other action ``epsilon / 2``.
        episodes: number of episodes to generate. Defaults to 10000, the
            value that used to be hard-coded.

    Returns:
        The ``BlackJackPolicy`` whose probability table was updated in place
        during training. It remains epsilon-soft.
    """
    policy = BlackJackPolicy()
    # Q[state][action] -> (sum of returns, number of returns)
    Q = defaultdict(lambda: defaultdict(lambda: (0, 0)))

    for _ in range(episodes):
        steps = episode(policy)  # generate an episode with the current policy

        # Policy evaluation: accumulate the return that follows each step.
        for t, step in enumerate(steps):
            ret = sum(later.r for later in steps[t:])  # undiscounted return from step t
            total, count = Q[step.s][step.a]
            Q[step.s][step.a] = (total + ret, count + 1)

        # Policy improvement on the states visited in this episode.
        for step in steps:
            s = step.s
            # Only actions that were actually tried have an entry in Q[s], so
            # every count in the division below is at least 1.
            a_star = max(Q[s], key=lambda a: Q[s][a][0] / Q[s][a][1])
            p_best = 1 - epsilon + (epsilon / 2)   # probability of the best action
            p_other = epsilon / 2                  # probability of the single other action
            policy.p[s] = [(a_star, p_best), (not a_star, p_other)]
    return policy

In [253]:
# Train with the default settings. The returned policy is still epsilon-soft;
# call policy.action(state, greedy=True) to get the best-known action.
optimal_policy = train()

In [None]:
def off_policy_prediction(o_policy):
    """Work-in-progress off-policy prediction routine.

    In the portion visible here it only generates 10000 episodes with a fresh
    behaviour policy; ``o_policy`` and ``Q`` are not used yet.
    """
    # BlackJackPolicy.__init__ takes no arguments, so passing greedy=False
    # raised TypeError. Greedy vs. sampled behaviour is selected per call via
    # policy.action(state, greedy=...), and episode() samples non-greedily.
    b_policy = BlackJackPolicy()
    Q = defaultdict(lambda: defaultdict(lambda: (0, 0)))  # state.action.(sum, count) of post rewards

    for i in range(10000):
        ep_state_actions = episode(b_policy)
    