In [46]:
from games.kuhn import KuhnPoker
from agents.counterfactualregret import CounterFactualRegret
from collections import OrderedDict

In [47]:
g = KuhnPoker()

In [48]:
agent_classes = [ CounterFactualRegret, CounterFactualRegret ]
my_agents = {}
g.reset()
for i, agent in enumerate(g.agents):
    my_agents[agent] = agent_classes[i](game=g, agent=agent)

In [49]:
for agent in g.agents:
    print('Training agent ' + agent)
    my_agents[agent].train(100000)
    print('Agent ' + agent + ' policies:')
    print(OrderedDict(map(lambda n: (n, my_agents[agent].node_dict[n].policy()), sorted(my_agents[agent].node_dict.keys()))))
    print('')

Training agent agent_0
Agent agent_0 policies:
OrderedDict([('0', array([0.66168462, 0.33831538])), ('0b', array([9.99970110e-01, 2.98900048e-05])), ('0p', array([0.6746311, 0.3253689])), ('0pb', array([9.99965989e-01, 3.40108267e-05])), ('1', array([9.99804843e-01, 1.95157120e-04])), ('1b', array([0.65904141, 0.34095859])), ('1p', array([9.99638107e-01, 3.61892699e-04])), ('1pb', array([0.3382338, 0.6617662])), ('2', array([0.00265483, 0.99734517])), ('2b', array([2.99508806e-05, 9.99970049e-01])), ('2p', array([8.98526417e-05, 9.99910147e-01])), ('2pb', array([0.00847208, 0.99152792]))])

Training agent agent_1
Agent agent_1 policies:
OrderedDict([('0', array([0.77244686, 0.22755314])), ('0b', array([9.99970071e-01, 2.99293667e-05])), ('0p', array([0.66384884, 0.33615116])), ('0pb', array([9.99970807e-01, 2.91927503e-05])), ('1', array([9.99649287e-01, 3.50712681e-04])), ('1b', array([0.66212234, 0.33787766])), ('1p', array([9.99729754e-01, 2.70245924e-04])), ('1pb', array([0.4425050

In [50]:
cum_rewards = dict(map(lambda agent: (agent, 0.), g.agents))
niter = 2000
for _ in range(niter):
    g.reset()
    turn = 0
    while not g.done():
        #print('Turn: ', turn)
        #print('\tPlayer: ', g.agent_selection)
        #print('\tObservation: ', g.observe(g.agent_selection))
        a = my_agents[g.agent_selection].action()
        #print('\tAction: ', g._moves[a])
        g.step(action=a)
        turn += 1
    #print('Rewards: ', g.rewards)
    for agent in g.agents:
        cum_rewards[agent] += g.rewards[agent]
print('Average rewards:', dict(map(lambda agent: (agent, cum_rewards[agent]/niter), g.agents)))


Average rewards: {'agent_0': -0.073, 'agent_1': 0.073}


In [51]:
print('Check learned policies against theoretical policies:')

Check learned policies against theoretical policies:


In [52]:
JX_b = my_agents[g.agents[0]].node_dict['0'].policy()[1]
print(f'Agent: 0 - Hand: J_ - History: [] - Probability of betting: {JX_b}')


Agent: 0 - Hand: J_ - History: [] - Probability of betting: 0.33831538274773443


In [53]:
QX_pb_b = my_agents[g.agents[0]].node_dict['1pb'].policy()[1]
print(f'Agent: 0 - Hand: Q_ - History: pb - Probability of betting: {QX_pb_b} - Theoretic value: {JX_b+1/3} -  Difference: {abs(QX_pb_b - (JX_b+1/3))}')


Agent: 0 - Hand: Q_ - History: pb - Probability of betting: 0.6617662023416314 - Theoretic value: 0.6716487160810678 -  Difference: 0.009882513739436383


In [54]:
KX_b = my_agents[g.agents[0]].node_dict['2'].policy()[1]
print(f'Agent: 0 - Hand: K_ - History: [] - Probability of betting: {KX_b} - Theoretic value: {3 * JX_b} -  Difference: {abs(KX_b - 3 * JX_b)}')


Agent: 0 - Hand: K_ - History: [] - Probability of betting: 0.9973451726175223 - Theoretic value: 1.0149461482432032 -  Difference: 0.01760097562568086


In [55]:
XJ_p_b = my_agents[g.agents[0]].node_dict['0p'].policy()[1]
print(f'Agent: 0 - Hand: _J - History: p - Probability of betting: {XJ_p_b} - Theoretic value: {1/3} -  Difference: {abs(XJ_p_b - 1/3)}')

Agent: 0 - Hand: _J - History: p - Probability of betting: 0.32536890447323547 - Theoretic value: 0.3333333333333333 -  Difference: 0.00796442886009785


In [56]:
XQ_b_b = my_agents[g.agents[0]].node_dict['1b'].policy()[1]
print(f'Agent: 0 - Hand: _Q - History: b - Probability of betting: {XQ_b_b} - Theoretic value: {1/3} -  Difference: {abs(XQ_b_b - 1/3)}')

Agent: 0 - Hand: _Q - History: b - Probability of betting: 0.3409585869261022 - Theoretic value: 0.3333333333333333 -  Difference: 0.007625253592768888
