In [6]:
from games.kuhn.kuhn import KuhnPoker
from agents.counterfactual_regret import CounterFactualRegret
from collections import OrderedDict

In [7]:
# Create the Kuhn poker game instance that the following cells share as `g`.
g = KuhnPoker()

In [8]:
# Learner class per seat: position i is used for the i-th entry of g.agents.
agent_classes = [CounterFactualRegret] * 2

# Reset the game before reading g.agents (same ordering as before).
g.reset()

# Map every agent identifier to its own learner instance bound to the shared game.
my_agents = {
    agent: agent_classes[i](game=g, agent=agent)
    for i, agent in enumerate(g.agents)
}

In [9]:
# Train each agent in turn, then print the policy of every node stored in its
# node_dict, ordered by node key.
for agent in g.agents:
    learner = my_agents[agent]
    print('Training agent ' + agent)
    learner.train(100000)
    print('Agent ' + agent + ' policies:')
    policies = OrderedDict(
        (key, learner.node_dict[key].policy())
        for key in sorted(learner.node_dict.keys())
    )
    print(policies)
    print('')

Training agent agent_0
Agent agent_0 policies:
OrderedDict([('0', array([1.49772346e-05, 9.99985023e-01])), ('0b', array([1.50652325e-05, 9.99984935e-01])), ('0p', array([0.00224472, 0.99775528])), ('0pb', array([0.5, 0.5])), ('1', array([1.51080224e-05, 9.99984892e-01])), ('1b', array([9.99984959e-01, 1.50407605e-05])), ('1p', array([0.00221099, 0.99778901])), ('1pb', array([0.5, 0.5])), ('2', array([0.99636091, 0.00363909])), ('2b', array([9.99985105e-01, 1.48951382e-05])), ('2p', array([9.99985105e-01, 1.48951382e-05])), ('2pb', array([7.48525087e-06, 9.99992515e-01]))])

Training agent agent_1
Agent agent_1 policies:
OrderedDict([('0', array([1.50852316e-05, 9.99984915e-01])), ('0b', array([0.5, 0.5])), ('0p', array([1.49790294e-05, 9.99985021e-01])), ('0pb', array([0.5, 0.5])), ('1', array([9.99984990e-01, 1.50096061e-05])), ('1b', array([1.50294577e-05, 9.99984971e-01])), ('1p', array([1.50294577e-05, 9.99984971e-01])), ('1pb', array([9.99992495e-01, 7.50491572e-06])), ('2', arra

In [10]:
# Play `niter` complete games with the trained agents and report the average
# reward per game for each agent.
niter = 2000
cum_rewards = dict.fromkeys(g.agents, 0.)
for _ in range(niter):
    g.reset()
    # The agent currently selected by the game picks an action until the
    # game reports that it is done.
    while not g.done():
        a = my_agents[g.agent_selection].action()
        g.step(action=a)
    # Add this game's rewards to the running totals.
    for agent in g.agents:
        cum_rewards[agent] += g.rewards[agent]
print('Average rewards:', {agent: cum_rewards[agent] / niter for agent in g.agents})


Average rewards: {'agent_0': -0.1195, 'agent_1': 0.1195}


In [11]:
# Banner line introducing the per-node policy checks printed by the cells below.
print('Check learned policies against theoretical policies:')

Check learned policies against theoretical policies:


In [12]:
# Entry 1 of agent 0's policy at node '0' (reported below as the probability
# of betting with hand J and an empty history). JX_b is reused by later cells.
first_agent = my_agents[g.agents[0]]
JX_b = first_agent.node_dict['0'].policy()[1]
print(f'Agent: 0 - Hand: J_ - History: [] - Probability of betting: {JX_b}')


Agent: 0 - Hand: J_ - History: [] - Probability of betting: 0.9999850227653966


In [13]:
# Entry 1 of agent 0's policy at node '1pb' (hand Q, history pb), compared
# with the reference value JX_b + 1/3; JX_b must already be defined.
first_agent = my_agents[g.agents[0]]
QX_pb_b = first_agent.node_dict['1pb'].policy()[1]
QX_pb_theoretic = JX_b + 1/3
QX_pb_diff = abs(QX_pb_b - QX_pb_theoretic)
print(f'Agent: 0 - Hand: Q_ - History: pb - Probability of betting: {QX_pb_b} - Theoretic value: {QX_pb_theoretic} -  Difference: {QX_pb_diff}')


Agent: 0 - Hand: Q_ - History: pb - Probability of betting: 0.5 - Theoretic value: 1.3333183560987298 -  Difference: 0.8333183560987298


In [14]:
# Entry 1 of agent 0's policy at node '2' (hand K, empty history), compared
# with the reference value 3 * JX_b; JX_b must already be defined.
first_agent = my_agents[g.agents[0]]
KX_b = first_agent.node_dict['2'].policy()[1]
KX_theoretic = 3 * JX_b
KX_diff = abs(KX_b - KX_theoretic)
print(f'Agent: 0 - Hand: K_ - History: [] - Probability of betting: {KX_b} - Theoretic value: {KX_theoretic} -  Difference: {KX_diff}')


Agent: 0 - Hand: K_ - History: [] - Probability of betting: 0.0036390860847953827 - Theoretic value: 2.99995506829619 -  Difference: 2.9963159822113945


In [15]:
# Entry 1 of the policy stored in agent 0's node_dict for node '0p'
# (hand J, history p), compared with the reference value 1/3.
first_agent = my_agents[g.agents[0]]
XJ_p_b = first_agent.node_dict['0p'].policy()[1]
XJ_p_theoretic = 1/3
XJ_p_diff = abs(XJ_p_b - XJ_p_theoretic)
print(f'Agent: 0 - Hand: _J - History: p - Probability of betting: {XJ_p_b} - Theoretic value: {XJ_p_theoretic} -  Difference: {XJ_p_diff}')

Agent: 0 - Hand: _J - History: p - Probability of betting: 0.997755280363976 - Theoretic value: 0.3333333333333333 -  Difference: 0.6644219470306427


In [16]:
# Entry 1 of the policy stored in agent 0's node_dict for node '1b'
# (hand Q, history b), compared with the reference value 1/3.
first_agent = my_agents[g.agents[0]]
XQ_b_b = first_agent.node_dict['1b'].policy()[1]
XQ_b_theoretic = 1/3
XQ_b_diff = abs(XQ_b_b - XQ_b_theoretic)
print(f'Agent: 0 - Hand: _Q - History: b - Probability of betting: {XQ_b_b} - Theoretic value: {XQ_b_theoretic} -  Difference: {XQ_b_diff}')

Agent: 0 - Hand: _Q - History: b - Probability of betting: 1.5040760460848901e-05 - Theoretic value: 0.3333333333333333 -  Difference: 0.3333182925728725
