In [1]:
from typing import List, Dict
import random
import numpy as np
import sys

Actions = ['B', 'C']  # bet/call vs check/fold

In [2]:
class InformationSet():
    def __init__(self):
        '''
        - self ---> self <__main__.InformationSet object at 0x0000028C82531370> ---> instance of class, current object created with this class
        - self.cumulative_regrets ---> [ 0.38 -1.88]
        - self.strategy_sum ---> [3.50 0.50]
        - self.num_actions ---> 2
        '''
        self.cumulative_regrets = np.zeros(shape=len(Actions))
        self.strategy_sum = np.zeros(shape=len(Actions))
        self.num_actions = len(Actions)

    def normalize(self, strategy: np.array) -> np.array:
        '''
        Normalize a strategy. If there are no positive regrets, use a uniform random strategy
        What we need for a strategy is a probability distribution, so the items in our array need to be non-negative and sum to 1
        - strategy before normalization ---> [1.50 0.50]
        - strategy after normalization ---> [0.75 0.25]
        - self.num_actions ---> 2
        '''
        if sum(strategy) > 0:
            strategy /= sum(strategy)
        else:
            strategy = np.array([1.0 / self.num_actions] * self.num_actions) # = array([0.50, 0.50, 0.50])
        return strategy

    def get_strategy(self, reach_probability: float) -> np.array:
        '''
        Return regret-matching strategy
        What we need for a strategy is a probability distribution, so the items in our array need to be non-negative and sum to 1
        - self.cumulative_regrets ---> [ 0.38 -1.88] ---> Sum of regret array for each action
        - strategy (before normalization) ---> [0.38 0.00]
        - strategy (after normalization) ---> [1.00 0.00]
        - reach_probability ---> 1.0. We assume counterfactual reach_probability= 1 always, as if player i’s strategy was modified to have intentionally played to information set Ii.
        - self.strategy_sum ---> [2.50 0.50] + 1.0*[1.00 0.00] = [3.50 0.50]. 
        '''
        strategy = np.maximum(0, self.cumulative_regrets) # This removes the -1.88 in the self.cumulative_regrets=[ 0.38 -1.88] array
        strategy = self.normalize(strategy)
        self.strategy_sum += reach_probability * strategy
        return strategy

    def get_average_strategy(self) -> np.array:
        '''
        Gets called only once for each of the 12 infoset_map.items() [K, J, Q, QB, QC, JB, JC, KB, KC, KCB, JCB, QCB]
        - self ---> self <__main__.InformationSet object at 0x0000028C8200AE20> ---> instance of class, current object created with this class
        - self.strategy_sum.copy() ---> [3.50 0.50]
        - self.normalize(self.strategy_sum.copy()) ---> [0.88 0.12]
        '''
        return self.normalize(self.strategy_sum.copy())

In [3]:
class KuhnPoker():
    '''
    Note that this class is just a namespace for two static methods, 
    so we could also move the functions to a module and get rid of the class altogether.
    '''
    @staticmethod
    def is_terminal(history: str) -> bool:
        '''
        - @staticmethod ---> notice functions with @staticmethod have no 'self' argument, because they can. Static methods do not require a class instance, they are not dependent on the state of the object, they do not access the object propoerties. 
        - history ---> CBB or B or CC
        - history in ['BC', 'BB', 'CC', 'CBB', 'CBC'] ---> True or False. If True, game state h we are in is terminal
        '''
        return history in ['BC', 'BB', 'CC', 'CBB', 'CBC']

    @staticmethod
    def get_payoff(history: str, cards: List[str]) -> int:
        '''
        - @staticmethod ---> notice functions with @staticmethod have no 'self' argument, because they can. Static methods do not require a class instance, they are not dependent on the state of the object, they do not access the object propoerties. 
        - history ---> BB
        - cards ---> ['K', 'Q']
        - history in ['BC', 'CBC'] ---> 
        - payoff ---> 2 or 1
        - active_player ---> 0 or 1
        - player_card ---> J, Q or K
        - opponent_card ---> J, Q or K
        '''
        """get payoff for 'active' player in terminal history"""
        if history in ['BC', 'CBC']: # if opponent folded (these are the two cases where opponent folded)
            return +1
        else:  # CC or BB or CBB, i.e. There's a showdown, they both see each other's cards
            payoff = 2 if 'B' in history else 1
            active_player = len(history) % 2
            player_card = cards[active_player]
            opponent_card = cards[(active_player + 1) % 2]
            if player_card == 'K' or opponent_card == 'J':
                return payoff
            else: # either player_card is Q and opponent_card is K, or player_card is J and opponent_card is Q
                return -payoff

In [71]:
class KuhnCFRTrainer():
    def __init__(self):
        '''
        - self.infoset_map ---> {}
        '''
        self.infoset_map: Dict[str, InformationSet] = {} # InformationSet is the class we created earlier. Dict[keys type, values type]

    def get_information_set(self, card_and_history: str) -> InformationSet:
        '''
        add if needed and return
        - card_and_history ---> K or JB or JC
        - self.infoset_map[card_and_history] ---> <__main__.InformationSet object at 0x0000021310C9BF40>
        - self.infoset_map:  
                            {'K': <__main__.InformationSet object at 0x0000021310C9BF40>, #with infoset properties cumulative_regrets, strategy_sum, num_actions
                            'JB': <__main__.InformationSet object at 0x000002131069D160>, 
                            'JC': <__main__.InformationSet object at 0x000002131069D9A0>}
        '''
        if card_and_history not in self.infoset_map:
            self.infoset_map[card_and_history] = InformationSet() # creates instance of InformationSet class (creates an object). With properties: cumulative_regrets, strategy_sum, num_actions
        print("self.infoset_map['Q'].__dict__.values()",self.infoset_map['Q'].__dict__.values())
        return self.infoset_map[card_and_history]

    def cfr(self, cards: List[str], history: str, reach_probabilities: np.array, active_player: int):
        '''
        """add if needed and return"""
        Arguments coming into function
        - self ---> <__main__.KuhnCFRTrainer object at 0x000001D9498F7490>
        - cards ---> ['J', 'K']
        - history ---> B ---> only calls and bets, no cards.
        - reach_probabilities ---> [0.50 1.00]
        - active_player ---> 1 (or 0)

        Variables created in function
        - mycard ---> K
        - info_set ---> <__main__.InformationSet object at 0x000001D948F97DF0>
        - strategy ---> [0.50 0.50]
        - opponent ---> 0
        - counterfactual_values ---> [0.00 0.00]
        
        - node_value ---> -1.5
        '''
        #print()
        #print('self: ', self)
        #print('cards: ', cards)
        #print('history: ', history)
        #print('reach_probabilities: ', reach_probabilities)
        #print('active_player: ', active_player)

        if KuhnPoker.is_terminal(history):
            return KuhnPoker.get_payoff(history, cards)

        my_card = cards[active_player]
        #print('my_card: ', my_card)
        info_set = self.get_information_set(my_card + history)
        #print('info_set: ', info_set)

        strategy = info_set.get_strategy(reach_probabilities[active_player])
        #print('strategy: ', strategy)
        opponent = (active_player + 1) % 2
        #print('opponent: ', opponent)
        counterfactual_values = np.zeros(len(Actions))
        #print('counterfactual_values: ', counterfactual_values)

        for ix, action in enumerate(Actions):
            '''
            - action_probability ---> 0.5
            - new_reach_probabilities ---> [0.50 1.00]
            - new_reach_probabilities[active_player] ---> 0.5
            - counterfactual_values[ix] ---> -2.0, updates the two counterfactual_values for that game state h one index at a time
            '''
            #print(ix, action)
            action_probability = strategy[ix]
            #print('action_probability: ', action_probability)

            # compute new reach probabilities after this action
            new_reach_probabilities = reach_probabilities.copy()
            #print('new_reach_probabilities: ', new_reach_probabilities)
            new_reach_probabilities[active_player] *= action_probability
            #print('new_reach_probabilities[active_player]: ', new_reach_probabilities[active_player])

            # recursively call cfr method, next player to act is the opponent
            counterfactual_values[ix] = -self.cfr(cards, history + action, new_reach_probabilities, opponent)
            #print('counterfactual_values[ix]: ', counterfactual_values[ix])
            #print()

        # Value of the current game state is just counterfactual values weighted by action probabilities
        node_value = counterfactual_values.dot(strategy)
        #print('node_value: ', node_value)
        #print()
        #print('reach_probabilities: ', reach_probabilities)
        #print('counterfactual_values : ', counterfactual_values )
        #print('info_set.cumulative_regrets: ', info_set.cumulative_regrets)
        for ix, action in enumerate(Actions):
            '''
            - ix ---> 0
            - action ---> B
            - reach_probabilities[opponent] ---> 0.5 from [0.50 1.00]
            - counterfactual_values[ix] ---> 2.00 from [2.00 -1.00]
            - node_value ---> 0.5
            - info_set.cumulative_regrets[ix] ---> 1.50 from [ 1.50 -4.50]
            '''
            info_set.cumulative_regrets[ix] += reach_probabilities[opponent] * (counterfactual_values[ix] - node_value)
        return node_value # counterfactual utility/happiness from being at this game node h

    def train(self, num_iterations: int) -> int:
        util = 0
        kuhn_cards = ['J', 'Q', 'K']
        for i in range(num_iterations):
            '''
            - cards ---> ['J', 'K']
            - history ---> '' always
            - reach_probabilities ---> [1.00 1.00] always
            - util ---> -4.625
            '''
            print()
            print('iteration: ', i)
            cards = random.sample(kuhn_cards, 2)
            history = ''  # where history is created
            reach_probabilities = np.ones(2)
            util += self.cfr(cards, history, reach_probabilities, 0)
        return util

In [72]:
if __name__ == "__main__":
    num_iterations = 100
    np.set_printoptions(precision=2, floatmode='fixed', suppress=True)

    cfr_trainer = KuhnCFRTrainer()
    util = cfr_trainer.train(num_iterations)

    print(f"\nRunning Kuhn Poker chance sampling CFR for {num_iterations} iterations")
    print(f"\nExpected average game value (for player 1): {(-1./18):.3f}")
    print(f"Computed average game value               : {(util / num_iterations):.3f}\n")

    print("We expect the bet frequency for a Jack to be between 0 and 1/3")
    print("The bet frequency of a King should be three times the one for a Jack\n")

    print(f"History  Bet  Pass")
    for name, info_set in sorted(cfr_trainer.infoset_map.items(), key=lambda s: len(s[0])):
        print(f"{name:3}:    {info_set.get_average_strategy()}")


iteration:  0


KeyError: 'Q'