In [60]:
import numpy as np
import gym
from gym import spaces
from gym.utils import seeding

Stuff to figure out:
- Does the blackjack class terminate an episode when a player either reaches 21 or goes bust? I don't think so. 
- Make it terminate when going bust/ reaching 21
- Incorporate gym.Env functionality for stepping the environment, resetting it, rendering, etc.
- Plot state-value function learned in order to determine if it matches with the function found by the book and by Denny Britz

In [19]:
class blackjack_game(gym.Env):
    def __init__(self, dealer_decision_point):
        self.dealer_decision_point = dealer_decision_point
        self.cards = ['Ace', 2, 3, 4, 5, 6, 7, 8, 9, 10, 'face']
        
    def drawtwocards(self):
        rand_cards = np.random.randint(0, len(self.cards), size=2)
        return rand_cards
    
    def drawonecard(self):
        rand_card = np.random.randint(0, len(self.cards))
        return rand_card
    
    def computepoints(self, rand_cards):
        points = 0
        for i in rand_cards:
            if type(self.cards[i]) is int:
                points += self.cards[i]
            elif self.cards[i] is 'face':
                points += 10
            elif self.cards[i] is 'Ace':
                if points <= 10:
                    points += 11
                elif points > 10:
                    points += 1
        return points
    
    def dealerplay(self, point_total):
        # 1 is hit, 0 is stick
        if point_total < self.dealer_decision_point:
            return 1
        elif point_total >= self.dealer_decision_point:
            return 0
        
    def agent_obs(self, agent_cards, dealer_cards):
        a_cards = [self.cards[i] for i in agent_cards]
        d_card = self.cards[dealer_cards[0]]
        a_points = self.computepoints(a_cards)
        d_points = self.computepoints([d_card])
        agent_ace_usable = int(a_points <= 10)
        return a_points, d_points, agent_ace_usable
    
    def check_win(self, agent_cards, dealer_cards, both_done_playing):
        agent_points = self.computepoints(agent_cards)
        dealer_points = self.computepoints(dealer_cards)
        agent_win, dealer_win = 0, 0
        if agent_points == 21: agent_win += 1
        if dealer_points == 21: dealer_win += 1
        
        if agent_points > 21: dealer_win += 1
        if dealer_points > 21: agent_win += 1
        
        if both_done_playing and agent_points > dealer_points:
            agent_win += 1
        if both_done_playing and dealer_points > agent_points:
            dealer_win += 1
         
        if agent_win == dealer_win: return 0
        if agent_win > dealer_win: return 1
        if agent_win < dealer_win: return -1
        
        
    def visualize(self, points, turn, action):
        if turn is 1: player = 'MC'
        elif turn is 0: player = 'dealer'
        
        print(player, 'has', points, ' points')
        print(player, 'chooses to', action)
        print()
        
    def play(self, start, policy):
        sequence = []
        a_cards, d_cards = self.drawtwocards(), self.drawtwocards()
        both_done_playing = False
        while self.check_win(a_cards, d_cards, both_done_playing) is None or 0:
            if start == True:
                start = False
                dealer_turn = False
                a_turn = True
            a_obs = self.agent_obs(a_cards, d_cards)
            a_action = policy(a_obs[0])
            if a_action is 1:
                np.append(a_cards, self.drawonecard())
            elif a_action is 0:
                a_turn = False
                dealer_turn = True
            if dealer_turn is True:
                d_action = self.dealerplay(17)
            if d_action is 0 and a_action is 0:
                both_done_playing = True
                self.check_win(a_cards, d_cards, both_done_playing)
            sequence.append(np.array([a_obs, a_action, self.check_win(a_cards, d_cards, both_done_playing)]))
        return sequence

In [20]:
game = blackjack_game(17)
a_cards, d_cards = game.drawtwocards(), game.drawtwocards()
game.check_win(a_cards, d_cards, True)

-1

In [56]:
class monte_carlo:
    def __init__(self, states, environment):
        self.states = states
        self.env = environment(17)
    
    def first_visit_MC_prediction(self, policy, epochs):
        V_ = np.random.rand(self.states)
        returns = []
        
        for ep in range(epochs):
            seq = self.env.play(True, policy=policy)
            G = 0
            for i in range(len(seq)):
                G = gamma * G + seq[i+1[2]]
                if seq[i] not in seq[:-1]:
                    returns[i] += G
                    V_[i] = np.average(returns[i])
        return V_

In [57]:
num_states = 200
policy = lambda points: int(points is 20 or points is 21)

In [58]:
mc = monte_carlo(200, blackjack_game)

In [59]:
mc.first_visit_MC_prediction(policy, 500000)

array([0.21829018, 0.53822454, 0.95823701, 0.4201937 , 0.43837277,
       0.47883206, 0.65857083, 0.36450569, 0.3038113 , 0.31058791,
       0.08616594, 0.73101671, 0.88518298, 0.59843405, 0.10680931,
       0.69712783, 0.56921367, 0.81873762, 0.60798064, 0.93017007,
       0.66243806, 0.18394337, 0.64461003, 0.53989732, 0.66604234,
       0.98666662, 0.44867214, 0.40930516, 0.26236748, 0.19302723,
       0.45727106, 0.70441434, 0.80785109, 0.07137477, 0.68135228,
       0.82866818, 0.34538189, 0.29795347, 0.10743991, 0.71042047,
       0.74751246, 0.5431503 , 0.66348282, 0.23736131, 0.34135237,
       0.35400368, 0.66692562, 0.68960921, 0.21470136, 0.37782128,
       0.87627054, 0.4018942 , 0.54513   , 0.31354471, 0.11991783,
       0.80328985, 0.3446438 , 0.1416369 , 0.43535068, 0.76671385,
       0.00105864, 0.26640283, 0.52681098, 0.62346269, 0.64545646,
       0.69078655, 0.78512216, 0.46554459, 0.6205387 , 0.60125064,
       0.03973286, 0.64565914, 0.87271168, 0.38412099, 0.39586