In [1]:
from games.kuhn import KuhnPoker
from agents.counterfactualregret_t import CounterFactualRegret

In [2]:
# Create the KuhnPoker game instance that every later cell refers to as `g`.
g = KuhnPoker()

In [3]:
# One agent class per entry of g.agents, matched by position; both are CounterFactualRegret.
agent_classes = [CounterFactualRegret, CounterFactualRegret]
# Map each agent name to its own agent instance bound to the shared game.
my_agents = {
    agent: agent_classes[i](game=g, agent=agent)
    for i, agent in enumerate(g.agents)
}

In [4]:
# Play one game after a reset, rendering the game before every move.
g.reset()
while not g.done():
    g.render()
    print(f"Agent {g.agent_selection}")
    # The agent whose turn it is chooses the action.
    acting_agent = my_agents[g.agent_selection]
    action = acting_agent.action()
    # action_move is queried before step() so it reflects the pre-move game.
    print(f"Action {action} - move {g.action_move(action)}")
    g.step(action)

# Render the finished game and report each agent's reward.
g.render()
for agent in g.agents:
    print(f"Reward {agent} = {g.reward(agent)}")

agent_0 K 
agent_1 J 
Agent agent_1
Node not found! Playing randomly.
Action 0 - move p
agent_0 K p
agent_1 J p
Agent agent_0
Node not found! Playing randomly.
Action 1 - move b
agent_0 K pb
agent_1 J pb
Agent agent_1
Node not found! Playing randomly.
Action 0 - move p
agent_0 K pbp
agent_1 J pbp
Reward agent_0 = -1
Reward agent_1 = 1


In [5]:
# Train every agent in turn, then print the policy held by each of its nodes.
for agent in g.agents:
    print('Training agent ' + agent)
    learner = my_agents[agent]
    learner.train(10000)
    # Keyed by the node_dict key; the value is whatever node.policy() returns.
    policies = {key: node.policy() for key, node in learner.node_dict.items()}
    print(policies)

Training agent agent_0
{'2': array([9.99175179e-01, 8.24820806e-04]), '1p': array([9.99850813e-01, 1.49186931e-04]), '2pb': array([2.68474470e-04, 9.99731526e-01]), '1b': array([0.24609896, 0.75390104]), '0': array([9.99559665e-01, 4.40334654e-04]), '0pb': array([0.5, 0.5]), '1': array([9.99850052e-01, 1.49947518e-04]), '2p': array([9.99851478e-01, 1.48522204e-04]), '1pb': array([0.5, 0.5]), '2b': array([1.48522204e-04, 9.99851478e-01]), '0p': array([9.99847723e-01, 1.52276534e-04]), '0b': array([0.99040658, 0.00959342])}
Training agent agent_1
{'2': array([9.99847909e-01, 1.52091255e-04]), '0p': array([9.99847677e-01, 1.52322925e-04]), '2pb': array([0.5, 0.5]), '0b': array([9.99847677e-01, 1.52322925e-04]), '0': array([9.99851874e-01, 1.48126204e-04]), '2p': array([9.99850724e-01, 1.49276011e-04]), '0pb': array([9.99888897e-01, 1.11102881e-04]), '2b': array([0.5, 0.5]), '1': array([9.99850232e-01, 1.49767860e-04]), '1pb': array([0.5, 0.5]), '1p': array([9.99851610e-01, 1.48389969e-04]

In [6]:
# Inspect the cumulative regrets and policy of every node of one agent.
# The agent is chosen explicitly (the last entry of g.agents) instead of relying
# on a loop variable left bound by an earlier cell, so this cell does not depend
# on which cell happened to run last.
inspected_agent = list(g.agents)[-1]
for obs, node in my_agents[inspected_agent].node_dict.items():
    print(f"{obs} regrets: {node.cum_regrets}, policy: {node.policy()}")

2 regrets: [ 1.000e+00 -6.573e+03], policy: [9.99847909e-01 1.52091255e-04]
0p regrets: [ 1.250000e-01 -1.640375e+03], policy: [9.99847677e-01 1.52322925e-04]
2pb regrets: [0. 0.], policy: [0.5 0.5]
0b regrets: [ 0.75 -2.25], policy: [9.99847677e-01 1.52322925e-04]
0 regrets: [ 3.75000e-01 -8.50375e+02], policy: [9.99851874e-01 1.48126204e-04]
2p regrets: [ 1.0000e+00 -4.1785e+03], policy: [9.99850724e-01 1.49276011e-04]
0pb regrets: [ 0.75 -0.75], policy: [9.99888897e-01 1.11102881e-04]
2b regrets: [0. 0.], policy: [0.5 0.5]
1 regrets: [ 1.0000e+00 -4.1385e+03], policy: [9.99850232e-01 1.49767860e-04]
1pb regrets: [0. 0.], policy: [0.5 0.5]
1p regrets: [ 1.250000e-01 -4.239875e+03], policy: [9.99851610e-01 1.48389969e-04]
1b regrets: [ 0.75 -2.25], policy: [9.99851610e-01 1.48389969e-04]


In [7]:
# Play `niter` games with the agents and report each agent's average reward.
# NOTE: every turn of every game is printed, so the output grows with niter.
cum_rewards = {agent: 0. for agent in g.agents}
niter = 2000
for _ in range(niter):
    g.reset()
    turn = 0
    while not g.done():
        print('Turn: ', turn)
        print('\tPlayer: ', g.agent_selection)
        print('\tObservation: ', g.observe(g.agent_selection))
        a = my_agents[g.agent_selection].action()
        # Use the public accessor, as the single-game cell does, rather than the
        # private g._moves table.
        # NOTE(review): assumes g.action_move(a) yields the same label as g._moves[a] - confirm.
        print('\tAction: ', g.action_move(a))
        g.step(action=a)
        turn += 1
    print('Rewards: ', g.rewards)
    # Accumulate this game's reward for every agent.
    for agent in g.agents:
        cum_rewards[agent] += g.rewards[agent]
print('Average rewards:', {agent: total / niter for agent, total in cum_rewards.items()})


Turn:  0
	Player:  agent_1
	Observation:  1
	Action:  p
Turn:  1
	Player:  agent_0
	Observation:  2p
	Action:  p
Rewards:  {'agent_0': 1, 'agent_1': -1}
Turn:  0
	Player:  agent_1
	Observation:  2
	Action:  b
Turn:  1
	Player:  agent_0
	Observation:  0b
	Action:  p
Rewards:  {'agent_0': 1, 'agent_1': -1}
Turn:  0
	Player:  agent_1
	Observation:  2
	Action:  p
Turn:  1
	Player:  agent_0
	Observation:  0p
	Action:  p
Rewards:  {'agent_0': -1, 'agent_1': 1}
Turn:  0
	Player:  agent_1
	Observation:  2
	Action:  p
Turn:  1
	Player:  agent_0
	Observation:  1p
	Action:  p
Rewards:  {'agent_0': -1, 'agent_1': 1}
Turn:  0
	Player:  agent_1
	Observation:  1
	Action:  p
Turn:  1
	Player:  agent_0
	Observation:  0p
	Action:  p
Rewards:  {'agent_0': -1, 'agent_1': 1}
Turn:  0
	Player:  agent_1
	Observation:  1
	Action:  p
Turn:  1
	Player:  agent_0
	Observation:  2p
	Action:  p
Rewards:  {'agent_0': 1, 'agent_1': -1}
Turn:  0
	Player:  agent_1
	Observation:  1
	Action:  p
Turn:  1
	Player:  agent_0