In [None]:
import numpy as np
import pickle

In [None]:
class Qlearning:
    """Tabular value learner for a simplified game of blackjack.

    The agent keeps a table ``player_Q_Values[(player_val, up_card, ace)][action]``
    where ``player_val`` is the player's total (12-21), ``up_card`` is the
    dealer's visible card (1-10, ace counted as 1), ``ace`` tells whether the
    hand is soft (an ace is currently counted as 11), and ``action`` is
    ``1`` (hit) or ``0`` (stand).

    Cards are drawn with replacement from an infinite deck, so every draw is
    independent of the previous ones.
    """

    # Class-level default so tracing stays off on instances that were built
    # without running __init__ (e.g. via ``Qlearning.__new__``).
    verbose = False

    def __init__(self, learn_r=0.1, exp_r=0.3, verbose=False):
        """Create the agent and initialise its value table.

        Args:
            learn_r: Learning rate used when moving a stored value toward
                the observed reward.
            exp_r: Exploration rate, i.e. the probability of picking a
                random action instead of the greedy one.
            verbose: When true, print a per-decision trace of every hand.
                Off by default because a training run produces several
                lines per hand.
        """
        # (12-21) x (1-10) x (True, False) states, two actions each:
        # 200 states and 400 stored values in total.
        self.player_Q_Values = {}
        for player_sum in range(12, 22):
            for up_card in range(1, 11):
                for usable_ace in (True, False):
                    # Insertion order (hit first, then stand) matters: the
                    # greedy choice breaks ties in favour of the first key.
                    self.player_Q_Values[(player_sum, up_card, usable_ace)] = {
                        1: 0,
                        # Standing on 21 starts out as known-good.
                        0: 1 if player_sum == 21 else 0,
                    }

        self.state = (0, 0, False)  # default state
        self.actions = [1, 0]  # 1 -> hit, 0 -> stand
        self.player_state_action = []  # (state, action) pairs of the current hand
        self.end = False  # True once the player's turn is over
        self.learn_r = learn_r
        self.exp_r = exp_r
        self.verbose = verbose

    def _trace(self, *args):
        """Print a debug trace line, but only when ``verbose`` is enabled."""
        if self.verbose:
            print(*args)

    @staticmethod
    def giveCard():
        """Draw one card value uniformly at random.

        Returns:
            A value from 1 to 10. The value 10 is four times as likely as
            any other because it appears four times in the card list.
        """
        cards = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10]
        return np.random.choice(cards)

    def deal2cards(self, show=False):
        """Deal a two-card starting hand.

        Args:
            show: When true, also return the first card so it can serve as
                the dealer's visible card.

        Returns:
            ``(val, ace)`` or, when ``show`` is true, ``(val, ace, first_card)``.
            ``val`` counts one ace as 11 when the hand contains an ace, and
            ``ace`` reports whether that happened.
        """
        cards = [self.giveCard(), self.giveCard()]

        val = sum(cards)
        ace = 1 in cards
        if ace:
            # Count one ace as 11; two aces therefore give a soft 12.
            val += 10

        if show:
            return val, ace, cards[0]
        return val, ace

    def dealerLogic(self, val, ace):
        """Advance the dealer's hand by one step.

        The dealer stands on anything above 17 and on hard 17, and hits on
        soft 17 and everything below.

        Args:
            val: Dealer's current total.
            ace: Whether the total currently counts an ace as 11.

        Returns:
            ``(val, ace, done)`` where ``done`` is true when the dealer has
            busted or stands. A total that exceeds 21 after a hit is only
            resolved on the next call.
        """
        if val > 21:
            if not ace:
                return val, ace, True  # bust
            # Demote the soft ace from 11 to 1 and keep playing.
            val -= 10
            ace = False

        if val > 17 or (val == 17 and not ace):
            return val, ace, True

        card = self.giveCard()
        if card != 1:
            return val + card, ace, False
        if val + 11 > 21:
            # The new ace can only be counted as 1.
            return val + 1, ace, False
        # The new ace can be counted as 11.
        return val + 11, True, False

    def chooseAction(self):
        """Pick the next action for the current state.

        Totals of 11 or less always hit, since no single card can bust them.
        Otherwise a random action is taken with probability ``exp_r`` and
        the highest-valued action is taken the rest of the time (ties go to
        hit, the first key in the table).

        Returns:
            ``1`` to hit or ``0`` to stand.
        """
        self._trace("Here:", self.state)
        if self.state[0] <= 11:
            return 1

        # Exploration versus exploitation.
        if np.random.uniform(0, 1) <= self.exp_r:
            return np.random.choice(self.actions)

        action_values = self.player_Q_Values[self.state]
        # max() returns the first maximal key, matching a strict ">" scan.
        return max(action_values, key=action_values.get)

    def playerNxtState(self, action):
        """Apply ``action`` to the current state and update ``self.end``.

        Hitting draws a card (an ace counts as 11 only when the total is 10
        or less); standing ends the player's turn. A total above 21 either
        demotes a soft ace to 1 or, without one, ends the turn as a bust.

        Args:
            action: Truthy to hit, falsy to stand.
        """
        self._trace("Action:", action)
        current_val, up_card, ace = self.state

        if action:
            card = self.giveCard()
            if card != 1:
                current_val += card
            elif current_val <= 10:
                current_val += 11
                ace = True
            else:
                current_val += 1
        else:
            # Player stands
            self.end = True

        if current_val > 21:
            if ace:
                current_val -= 10
                ace = False
            else:
                self.end = True
        self.state = (current_val, up_card, ace)

    def reward(self, player_val, dealer_val, end=True):
        """Score the finished hand and update the visited state-action values.

        The outcome is +1 for a win, -1 for a loss or bust, and 0 for a
        draw (or when ``end`` is false). Walking the hand backwards, each
        stored value is moved a fraction ``learn_r`` of the way toward the
        current target; the updated value then becomes the target for the
        preceding step. Stored values are rounded to three decimals.

        Args:
            player_val: Player's final total.
            dealer_val: Dealer's final total.
            end: Whether the hand is actually over.
        """
        target = 0
        if end:
            if player_val > 21:
                target = -1
            elif dealer_val > 21 or player_val > dealer_val:
                target = 1
            elif player_val < dealer_val:
                target = -1

        for state, action in reversed(self.player_state_action):
            old_value = self.player_Q_Values[state][action]
            target = self.learn_r * (target - old_value) + old_value
            self.player_Q_Values[state][action] = round(target, 3)

    def reset(self):
        """Clear per-hand bookkeeping so a new hand can be played."""
        self.player_state_action = []
        self.state = (0, 0, False)  # initial state
        self.end = False

    def play(self, rounds=1000):
        """Train the agent by playing ``rounds`` hands against the dealer.

        Hands where either side is dealt 21 are skipped without learning.
        Only decisions taken at a total of 12 or more are recorded, since
        those are the only states present in the value table.

        Args:
            rounds: Number of hands to play.
        """
        for round_idx in range(rounds):
            if round_idx % 1000 == 0:  # progress report every 1000 hands
                print(f"Round: {round_idx}")

            # Deal
            dealer_val, d_ace, up_card = self.deal2cards(show=True)
            player_val, p_ace = self.deal2cards(show=False)

            self.state = (player_val, up_card, p_ace)
            self._trace("init", self.state)

            if player_val != 21 and dealer_val != 21:
                while not self.end:
                    action = self.chooseAction()
                    if self.state[0] >= 12:
                        self.player_state_action.append((self.state, action))
                    self.playerNxtState(action)

                # Dealer plays
                dealer_done = False
                while not dealer_done:
                    dealer_val, d_ace, dealer_done = self.dealerLogic(dealer_val, d_ace)

                # Give reward and update the value table
                player_val = self.state[0]
                self._trace("player value {} | dealer value {}".format(player_val, dealer_val))
                self.reward(player_val, dealer_val)

            self.reset()

    def saveStrategy(self, file="strategy"):
        """Pickle the value table to ``file``.

        Args:
            file: Destination path; it is overwritten if it already exists.
        """
        # The context manager closes the handle even if pickling fails.
        with open(file, 'wb') as fw:
            pickle.dump(self.player_Q_Values, fw)

In [None]:
# Train an agent with the default learning and exploration rates.
bot = Qlearning()
bot.play(rounds=100000)
print("Training Complete")

# Persist the learned value table to the default "strategy" file.
bot.saveStrategy(file="strategy")