In [1]:
import numpy as np
import pickle
import random
import pandas as pd

In [19]:


class BlackJackSolution:

    def __init__(self, lr=0.1, exp_rate=0.3, num_decks=6, initMoney=2500):
        self.player_Q_Values = {}  # key: [(player_value, show_card, usable_ace)][action] = value
        # initialise Q values | (12-21) x (1-10) x (True, False) x (1, 0) 400 in total
        for i in range(12, 22):
            for j in range(1, 11):
                for k in [True, False]:
                    for bet in [10,50,100]:
                        self.player_Q_Values[(i, j, k, bet)] = {}
                        for a in [1, 0]:
                            if (i == 21) and (a == 0):
                                self.player_Q_Values[(i, j, k, bet)][a] = 1                                
                            else:
                                self.player_Q_Values[(i, j, k, bet)][a] = 0                                

        self.player_state_action = []
        self.state = (0, 0, False, 10)  # initial state
        self.actions = [1, 0]  # 1: HIT  0: STAND
        self.end = False
        self.lr = lr
        self.exp_rate = exp_rate
        self.num_decks = num_decks
        self.currentStack = []
        self.testStack = []
        self.initMoney = initMoney
        
        # Added attribute to keep track of how well the model is learning
        self.gameTrack = []
        self.statehist = []
        self.d_value = []
        
        # Dict with faces
        self.f_dict = {'A': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6,
                 '7': 7, '8': 8, '9': 9, '10': 10, 'J': 10, 'Q': 10,
                 'K': 10}

    def makeStack(self):
        
        # Create empty stack
        new_stack = []
        test_stack = []
        
        # Define new list with faces
        f_list = ['A','2','3','4','5','6','7','8','9','10','J','Q','K']
        
        # List of cards
        c_list = list(range(1, 11)) + [10, 10, 10]
        
        # Extend empty stack by 4*num_decks*(list of cards)
        for i in range(self.num_decks):
            for j in range(4):
                new_stack.extend(c_list)
                test_stack.extend(f_list)
        
        # Shuffle the stack
        random.shuffle(new_stack)
        random.shuffle(test_stack)
        
        # Set the new stack as the current stack
        self.currentStack = new_stack
        self.testStack = test_stack
        
    # give card
    #@staticmethod
    def giveCard(self):
        # 1 stands for ace
        # **** Update to use a fresh deck that's shuffled each game ****
        # **** For dealing: 6 decks per stack, when below 52 cards, make new stack ****
        # **** Need to convert dealer list to dict so we can keep track of face (to make splitting possible) ****
        # **** Need to add actions for doubling down and for splitting ****
        
        # Code for face-stack list
        if (len(self.testStack) < 52):
            
            self.makeStack()
            
        # Remove the first card from the stack and set it to card to deal
        cardToDeal = self.testStack.pop(0)
        
        return cardToDeal
        
        # If we have 51 or fewer cards in stack, return a new one
        #if (len(self.currentStack) < 52):
            
            # If we have too few cards, make a new stack
        #    self.makeStack()
        
        #cardToDeal = self.currentStack.pop(0)
        
        #return cardToDeal
        #c_list = list(range(1, 11)) + [10, 10, 10]
        #return np.random.choice(c_list)
    

    def dealerPolicy(self, current_value, usable_ace, is_end):
        if current_value > 21:
            if usable_ace:
                current_value -= 10
                usable_ace = False
            else:
                return current_value, usable_ace, True
        # HIT17
        if current_value >= 17:
            return current_value, usable_ace, True
        else:
            card = self.giveCard()
            card_value = self.f_dict[card]
            if card_value == 1:
                if current_value <= 10:
                    return current_value + 11, True, False
                return current_value + 1, usable_ace, False
            else:
                return current_value + card_value, usable_ace, False

    def chooseAction(self,curRound=0,trainRound=1000):
        # if current value <= 11, always hit
        current_value = self.state[0]
        if current_value <= 11:
            return 1

        if np.random.uniform(0, 1) <= (self.exp_rate*(1 - curRound/trainRound)):
            action = np.random.choice(self.actions)
        #             print("random action", action)
        else:
            # greedy action
            v = -999
            action = 0
            for a in self.player_Q_Values[self.state]:
                if self.player_Q_Values[self.state][a] > v:
                    action = a
                    v = self.player_Q_Values[self.state][a]
        #             print("greedy action", action)
        return action

    # one can only have 1 usable ace
    # return next state
    def playerNxtState(self, action):
        current_value = self.state[0]
        show_card = self.state[1]
        usable_ace = self.state[2]

        if action:
            # action hit
            card = self.giveCard()
            if self.f_dict[card] == 1:
                if current_value <= 10:
                    current_value += 11
                    usable_ace = True
                else:
                    current_value += 1
            else:
                current_value += self.f_dict[card]
        else:
            # action stand
            self.end = True
            return (current_value, show_card, usable_ace)

        if current_value > 21:
            if usable_ace:
                current_value -= 10
                usable_ace = False
            else:
                self.end = True
                return (current_value, show_card, usable_ace)

        return (current_value, show_card, usable_ace, random.choice([10,50,100]))

    def winner(self, player_value, dealer_value):
        # player 1 | draw 0 | dealer -1
        winner = 0
        if player_value > 21:
            winner = -1
        else:
            if dealer_value > 21:
                winner = 1
            else:
                if player_value < dealer_value:
                    winner = -1
                elif player_value > dealer_value:
                    winner = 1
                else:
                    # draw
                    winner = 0
        return winner

    def _giveCredit(self, player_value, dealer_value):
        reward = self.winner(player_value, dealer_value)
        # backpropagate reward
        for s in reversed(self.player_state_action):
            state, action = s[0], s[1]
            reward = self.player_Q_Values[state][action] + self.lr*(reward - self.player_Q_Values[state][action])
            self.player_Q_Values[state][action] = round(reward, 3)

    def reset(self):
        self.player_state_action = []
        self.state = (0, 0, False, 10)  # initial state
        self.end = False

    # Show is set to False if cards go to player, True if cards go to house
    def deal2cards(self, show=False):
        # return value after 2 cards and usable ace
        value, usable_ace = 0, False
        
        cards = [self.giveCard(), self.giveCard()]
        card_values = [self.f_dict[cards[0]],self.f_dict[cards[1]]]
        if 1 in card_values:
            value = sum(card_values) + 10
            usable_ace = True
        else:
            value = sum(card_values)
            usable_ace = False

        if show:
            return value, usable_ace, card_values[0]
        else:
            return value, usable_ace

    def play(self, rounds=1000):
        for i in range(rounds):
            #print('*********')
            #print(f'round {i}')
            #print('----------')
            
            # give 2 cards
            dealer_value, d_usable_ace, show_card = self.deal2cards(show=True)
            player_value, p_usable_ace = self.deal2cards(show=False)

            self.state = (player_value, show_card, p_usable_ace, random.choice([10,50,100]))
            #print(i, self.state)
            self.d_value.append(dealer_value)
            # judge winner after 2 cards
            if player_value == 21 or dealer_value == 21:
                
                whoWon = self.winner(player_value, dealer_value)
            
                if whoWon == 1:
                    self.gameTrack.append(1)
                    self.statehist.append(self.state)
                    #gameplayhist = gameplayhist.append(i, self.player_Q_Values)
                else:
                    self.gameTrack.append(0)
                    self.statehist.append(self.state)
                    #gameplayhist = gameplayhist.append(i, self.player_Q_Values)
                
                # game end
                next
            else:
                while True:
                    action = self.chooseAction(i,rounds)  # state -> action
                    if self.state[0] >= 12:
                        state_action_pair = [self.state, action]
                        self.player_state_action.append(state_action_pair)
                    # update next state
                    self.state = self.playerNxtState(action)
                    if self.end:
                        break

                        # dealer's turn
                is_end = False
                while not is_end:
                    dealer_value, d_usable_ace, is_end = self.dealerPolicy(dealer_value, d_usable_ace, is_end)

                # judge winner
                # give reward and update Q value
                player_value = self.state[0]
                
                whoWon = self.winner(player_value, dealer_value)
            
                if whoWon == 1:
                    self.gameTrack.append(1)
                    self.statehist.append(self.state)
                    #gameplayhist = gameplayhist.append(self.player_Q_Values)
                else:
                    self.gameTrack.append(0)
                    self.statehist.append(self.state)
                    #gameplayhist = gameplayhist.append(self.player_Q_Values)
                
                # Print values of player and dealer
                #print("player value {} | dealer value {}".format(player_value, dealer_value))
                self._giveCredit(player_value, dealer_value)
  
            self.reset()

    def savePolicy(self, file="policy"):
        fw = open(file, 'wb')
        pickle.dump(self.player_Q_Values, fw)
        fw.close()

    def loadPolicy(self, file="policy"):
        fr = open(file, 'rb')
        self.player_Q_Values = pickle.load(fr)
        fr.close()

    # trained robot play against dealer
    def playWithDealer(self, rounds=1000):
        self.reset()
        self.loadPolicy()
        self.exp_rate = 0

        result = np.zeros(3)  # player [win, draw, lose]
        for _ in range(rounds):
            # hit 2 cards each
            # give 2 cards
            dealer_value, d_usable_ace, show_card = self.deal2cards(show=True)
            player_value, p_usable_ace = self.deal2cards(show=False)

            self.state = (player_value, show_card, p_usable_ace, random.choice([10,50,100]))

            # judge winner after 2 cards
            if player_value == 21 or dealer_value == 21:
                if player_value == dealer_value:
                    result[1] += 1
                elif player_value > dealer_value:
                    result[0] += 1
                else:
                    result[2] += 1
            else:
                # player's turn
                while True:
                    action = self.chooseAction()
                    # update next state
                    self.state = self.playerNxtState(action)
                    if self.end:
                        break

                        # dealer's turn
                is_end = False
                while not is_end:
                    dealer_value, d_usable_ace, is_end = self.dealerPolicy(dealer_value, d_usable_ace, is_end)

                # judge
                player_value = self.state[0]
                self.d_value = self.d_value.append(dealer_value)
                # print("player value {} | dealer value {}".format(player_value, dealer_value))
                w = self.winner(player_value, dealer_value)
                if w == 1:
                    result[0] += 1
                elif w == 0:
                    result[1] += 1
                else:
                    result[2] += 1
            self.reset()
        return result

In [20]:
sol = BlackJackSolution(lr=0.3,exp_rate=0.5)

In [21]:
sol.play(100)

In [5]:
sol.player_Q_Values

{(12, 1, True, 10): {1: 0, 0: 0},
 (12, 1, True, 50): {1: 0, 0: 0},
 (12, 1, True, 100): {1: 0, 0: 0},
 (12, 1, False, 10): {1: 0, 0: 0},
 (12, 1, False, 50): {1: 0, 0: 0},
 (12, 1, False, 100): {1: -0.09, 0: 0},
 (12, 2, True, 10): {1: 0, 0: 0},
 (12, 2, True, 50): {1: 0, 0: 0},
 (12, 2, True, 100): {1: 0, 0: 0},
 (12, 2, False, 10): {1: 0, 0: 0},
 (12, 2, False, 50): {1: -0.09, 0: 0},
 (12, 2, False, 100): {1: 0.09, 0: 0},
 (12, 3, True, 10): {1: 0, 0: 0},
 (12, 3, True, 50): {1: 0, 0: 0},
 (12, 3, True, 100): {1: 0, 0: 0},
 (12, 3, False, 10): {1: 0, 0: 0},
 (12, 3, False, 50): {1: 0, 0: 0},
 (12, 3, False, 100): {1: 0, 0: 0},
 (12, 4, True, 10): {1: 0, 0: 0},
 (12, 4, True, 50): {1: 0, 0: 0},
 (12, 4, True, 100): {1: 0, 0: 0},
 (12, 4, False, 10): {1: 0, 0: 0},
 (12, 4, False, 50): {1: 0, 0: 0},
 (12, 4, False, 100): {1: 0, 0: 0},
 (12, 5, True, 10): {1: 0, 0: 0},
 (12, 5, True, 50): {1: 0, 0: 0},
 (12, 5, True, 100): {1: 0, 0: 0},
 (12, 5, False, 10): {1: 0, 0: 0},
 (12, 5, False,

In [23]:
sol.gameTrack

[1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [24]:
sol.statehist

[(21, 8, False),
 (12, 7, False),
 (16, 10, True),
 (20, 2, False),
 (20, 7, False),
 (21, 3, True, 10),
 (27, 1, False),
 (24, 5, False),
 (26, 1, False),
 (26, 10, False),
 (21, 10, True, 10),
 (21, 4, True, 50),
 (21, 5, False),
 (27, 8, False),
 (21, 9, True),
 (22, 10, False),
 (24, 1, False),
 (17, 4, False),
 (29, 8, False),
 (20, 6, False),
 (18, 4, False),
 (23, 5, False),
 (13, 3, False),
 (21, 7, True, 100),
 (21, 5, False),
 (27, 10, False),
 (28, 10, False),
 (9, 10, False, 10),
 (28, 10, False),
 (20, 10, False),
 (28, 7, False),
 (17, 2, False),
 (18, 7, False),
 (25, 10, False),
 (15, 8, False),
 (29, 2, False),
 (22, 7, False),
 (21, 10, True, 50),
 (25, 9, False),
 (30, 6, False),
 (25, 10, False),
 (19, 6, False),
 (29, 6, False),
 (25, 4, False),
 (18, 10, False),
 (21, 9, False),
 (21, 2, True, 50),
 (20, 10, False),
 (13, 8, True),
 (24, 6, False),
 (18, 1, False),
 (15, 8, False),
 (25, 7, False),
 (23, 5, False),
 (20, 4, False),
 (22, 10, False),
 (25, 10, Fals

In [22]:
sol.d_value

[11,
 17,
 20,
 12,
 17,
 9,
 18,
 15,
 20,
 20,
 21,
 6,
 15,
 15,
 19,
 20,
 16,
 14,
 17,
 12,
 14,
 8,
 11,
 17,
 15,
 16,
 16,
 21,
 18,
 17,
 18,
 10,
 11,
 20,
 18,
 7,
 15,
 20,
 18,
 15,
 15,
 13,
 14,
 14,
 14,
 13,
 10,
 20,
 18,
 16,
 20,
 12,
 17,
 15,
 13,
 20,
 20,
 15,
 15,
 7,
 21,
 16,
 18,
 5,
 16,
 17,
 17,
 12,
 17,
 12,
 19,
 13,
 21,
 13,
 12,
 16,
 11,
 5,
 16,
 15,
 15,
 13,
 17,
 14,
 10,
 14,
 20,
 13,
 5,
 15,
 17,
 6,
 16,
 18,
 15,
 12,
 14,
 13,
 15,
 10]

In [None]:
#sol.gameTrack
countWindow = 100
sumGames = np.add.reduceat(sol.gameTrack, np.arange(0, len(sol.gameTrack), countWindow))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
import math

In [None]:
coef = np.polyfit(np.arange(len(sumGames)),sumGames,1)
poly1d_fn = np.poly1d(coef)

In [None]:
test = [1,2,3,4]
best = test.pop(0)
print(best)

In [None]:
lr_l = [0.1, 0.3, 0.5]
exp_rate_l = [0.3,0.5,1]
combos = list(itertools.product(lr_l,exp_rate_l))
f, axes = plt.subplots(3, 3, figsize=(12, 12))

for index, combo in enumerate(combos):
    sol = BlackJackSolution(lr=combo[0],exp_rate=combo[1])
    print(f'Playing Set {index}...')
    sol.play(10000)
    
    countWindow = 100
    sumGames = np.add.reduceat(sol.gameTrack, np.arange(0, len(sol.gameTrack), countWindow))
    
    # Create regression 
    coef = np.polyfit(np.arange(len(sumGames)),sumGames,1)
    poly1d_fn = np.poly1d(coef)
    
    sns.lineplot(data=poly1d_fn(np.arange(len(sumGames))), ax=axes[math.floor(index/3),index % 3])
    sns.lineplot(data=sumGames,ax=axes[math.floor(index/3),index % 3])

In [None]:
f, axes = plt.subplots(2, 2, figsize=(12, 12))

sns.set_style("darkgrid")
sns.lineplot(data=poly1d_fn(np.arange(len(sumGames))), label=f"Wins per {countWindow}", ax=axes[0,0])
sns.lineplot(data=sumGames,label='WinPct',ax=axes[0,0])

#plt.plot(sumGames)
#plt.plot(np.arange(len(sumGames)),poly1d_fn(np.arange(len(sumGames))),'--k')

In [None]:
sol.player_Q_Values