# Playing Simple Games with Neural Nets

In this notebook, we implement equilibria learning viea self play for simple games such as Battle of the Sexes and Matching Pennies.

In [2]:
import os
import sys
root_path = os.path.abspath(os.path.join('..'))
if root_path not in sys.path:
    sys.path.append(root_path)
    
import torch
from bnelearn.strategy import MatrixGameStrategy
from bnelearn.bidder import Bidder, Player, MatrixGamePlayer
from bnelearn.mechanism import PrisonersDilemma, BattleOfTheSexes, MatchingPennies
from bnelearn.optimizer import ES
from bnelearn.environment import Environment, AuctionEnvironment, MatrixGameEnvironment

In [3]:
from tensorboardX import SummaryWriter
import numpy as np
import matplotlib.pyplot as plt

In [4]:
torch.cuda.is_available()

True

## Symmetric Game: Prisoners' Dilemma

In [4]:
experiment_name = 1
logdir = os.path.join(root_path, 'notebooks', 'pd', str(experiment_name))

In [5]:
## Experiment setup
n_players = 2

## Environment settings
#training batch size
batch_size = 64
input_length = 1


# optimization params
epoch = 25
learning_rate = 1
lr_decay = False
lr_decay_every = 1000
lr_decay_factor = 0.8

sigma = 5 #ES noise parameter
n_perturbations = 8


In [6]:
# Wrapper transforming a strategy to bidder, used by the optimizer
# this is a dummy, valuation doesn't matter
def strat_to_player(strategy, batch_size, player_position=None):
    return MatrixGamePlayer(strategy, batch_size = batch_size, n_players=2, player_position=player_position)

In [15]:
model = MatrixGameStrategy(n_actions=2).cuda()

In [16]:
game = PrisonersDilemma()

In [17]:
env = AuctionEnvironment(game, 
                 agents=[],
                 max_env_size =1,
                 n_players=2,
                 batch_size=batch_size,
                 strategy_to_bidder_closure=strat_to_player)

In [18]:
optimizer = ES(model=model, environment = env, lr = learning_rate, sigma=sigma, n_perturbations=n_perturbations)

In [19]:
list(env.agents)[0].player_position

0

In [20]:
def log_hyperparams(writer):
    writer.add_scalar('hyperparams/batch_size', batch_size)
    writer.add_scalar('hyperparams/learning_rate', learning_rate)
    writer.add_scalar('hyperparams/sigma', sigma)
    writer.add_scalar('hyperparams/n_perturbations', n_perturbations)    

Training

In [21]:
torch.cuda.empty_cache()
writer = SummaryWriter(log_dir=logdir)
log_hyperparams(writer)

for e in range(epoch+1):    
    
    # lr decay?
    if lr_decay and e % lr_decay_every == 0 and e > 0:
        learning_rate = learning_rate * lr_decay_factor
        for param_group in optimizer.param_groups:
            param_group['lr'] = learning_rate
        writer.add_scalar('hyperparams/learning_rate', learning_rate, e)
        
    # always: do optimizer step
    utility = -optimizer.step()
    writer.add_scalar('eval/utility', utility, e) 
    writer.add_scalar('eval/prob_action_0', model.distribution.probs[0], e)    
    #print(list(model.named_parameters()))
    print(e)

torch.cuda.empty_cache()
writer.close()

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25


In [22]:
player = strat_to_player(model, 10)

In [23]:
player

<bnelearn.bidder.MatrixGamePlayer at 0x1e203a49fd0>

In [34]:
player.get_action().float().mean()

tensor(1., device='cuda:0')

## Assymmetric Games, BoS and Matching Pennies

In [6]:
## Experiment setup
n_players = 2

## Environment settings
#training batch size
batch_size = 2**10
input_length = 1


# optimization params
epoch = 100
learning_rate = 1
lr_decay = False
lr_decay_every = 1000
lr_decay_factor = 0.8

sigma = 5 #ES noise parameter
n_perturbations = 8

game = MatchingPennies()
directory_name = 'matching_pennies'
experiment_name = '04-01-1'
logdir = os.path.join(root_path, 'notebooks', directory_name, str(experiment_name))

In [7]:
# Wrapper transforming a strategy to bidder, used by the optimizer
# this is a dummy, valuation doesn't matter
def strat_to_player(strategy, batch_size, player_position=None):
    return MatrixGamePlayer(strategy, batch_size = batch_size, n_players=2, player_position=player_position)

In [8]:
model1 = MatrixGameStrategy(n_actions=2).cuda()
model2 = MatrixGameStrategy(n_actions=2).cuda()

In [9]:
env = MatrixGameEnvironment(game, agents=[model1, model2],
                 n_players=2,
                 batch_size=batch_size,
                 strategy_to_player_closure=strat_to_player,
                 env_type = 'fixed'
                 )

In [10]:
optimizer1 = ES(model=model1, environment = env, lr = learning_rate, sigma=sigma, n_perturbations=n_perturbations, env_type='fixed', player_position=0)
optimizer2 = ES(model=model2, environment = env, lr = learning_rate, sigma=sigma, n_perturbations=n_perturbations, env_type='fixed', player_position=1)
optimizers = [optimizer1, optimizer2]

In [11]:
def log_hyperparams(writer):
    writer.add_scalar('hyperparams/batch_size', batch_size)
    writer.add_scalar('hyperparams/learning_rate', learning_rate)
    writer.add_scalar('hyperparams/sigma', sigma)
    writer.add_scalar('hyperparams/n_perturbations', n_perturbations)    

# 

In [12]:
model1.distribution.probs

tensor([0.2785, 0.7215])

In [13]:
model2.distribution.probs

tensor([0.4916, 0.5084])

In [14]:
torch.cuda.empty_cache()
writer = SummaryWriter(log_dir=logdir)
log_hyperparams(writer)

for e in range(epoch+1):    
    
    # lr decay?
    if lr_decay and e % lr_decay_every == 0 and e > 0:
        learning_rate = learning_rate * lr_decay_factor
        writer.add_scalar('hyperparams/learning_rate', learning_rate, e)
        for optimizer in optimizers:
            for param_group in optimizer.param_groups:
                param_group['lr'] = learning_rate
        
        
    # always: do optimizer step
    utility1 = -optimizer1.step()
    writer.add_scalar('eval/p1_utility', utility1, e) 
    writer.add_scalar('eval/p1_prob_action_0', model1.distribution.probs[0], e)
    
    utility2 =  -optimizer2.step()
    writer.add_scalar('eval/p2_utility', utility2, e)
    writer.add_scalar('eval/p2_prob_action_0', model2.distribution.probs[0], e)
    #print(list(model.named_parameters()))
    print(e)
        
torch.cuda.empty_cache()
writer.close()

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100


In [21]:
rowP, colP = env.agents

In [22]:
rowP

<bnelearn.bidder.MatrixGamePlayer at 0x2462394aa90>

In [23]:
colP = MatrixGamePlayer(batch_size=batch_size, n_players=2, player_position=1,cuda=True, strategy=model)

In [24]:
env.agents

[<bnelearn.bidder.MatrixGamePlayer at 0x2462394aa90>,
 <bnelearn.bidder.MatrixGamePlayer at 0x2462394add8>]

In [25]:
rowP.strategy

MatrixGameStrategy(
  (logits): Linear(in_features=1, out_features=2, bias=True)
)

In [26]:
optimizer1.environment

<bnelearn.environment.MatrixGameEnvironment at 0x2462394ac88>

In [27]:
next(env._generate_agent_actions(exclude = set([rowP.player_position])))

(1, tensor([[0],
         [0],
         [1],
         [1],
         [0]], device='cuda:0'))

In [84]:
pop = (optimizer1._perturb_model(optimizer1.model) for _ in range(n_perturbations))

In [36]:
from bnelearn.strategy import Strategy

In [37]:
pmodel

MatrixGameStrategy(
  (logits): Linear(in_features=1, out_features=2, bias=True)
)

In [38]:
if isinstance(pmodel, Strategy):
            pmodel: MatrixGamePlayer = env._strategy_to_player(
                    pmodel,
                    batch_size=env.batch_size,
                    player_position=rowP.player_position
                    )

In [45]:
action_profile

tensor([[1, 1],
        [1, 0],
        [1, 0],
        [1, 1],
        [1, 0]])

In [58]:
pop

<generator object <genexpr> at 0x00000246239E5E58>

In [62]:
m, e = next(pop)

In [64]:
e

tensor([ 9.8027,  2.7784, -2.4508, -6.0138], device='cuda:0')

In [76]:
reward = env.get_reward(m, rowP.player_position)

In [77]:
reward.view(5) == reward.view(-1)

tensor([1, 1, 1, 1, 1], device='cuda:0', dtype=torch.uint8)

In [85]:
rewards, epsilons = (torch.cat(tensors).view(n_perturbations, -1)
                             for tensors in zip(*(
                                (env.get_reward(model, rowP.player_position),
                                 epsilon)
                                for (model, epsilon) in pop
                                ))
                            )

In [87]:
rewards

tensor([[3., 0., 3., 3., 0.],
        [3., 3., 3., 3., 0.],
        [0., 2., 0., 2., 2.],
        [2., 0., 0., 2., 2.],
        [0., 0., 0., 0., 2.],
        [0., 0., 0., 3., 0.],
        [0., 0., 0., 2., 0.],
        [3., 0., 0., 2., 2.]], device='cuda:0')

In [91]:
epsilons

tensor([[-0.7815, -2.4193,  1.0875, -3.7693],
        [-6.6288, -7.0637,  3.5438, -6.4348],
        [-1.2483, -5.8030, -5.6775,  4.0233],
        [-9.6593,  3.0905,  8.4659, -0.3457],
        [ 0.4876,  3.0859, -3.7280, -0.0278],
        [-0.1170, -8.5489,  1.6273,  4.7993],
        [-0.1089, -1.1818, -5.6371,  4.1358],
        [ 1.3979,  2.0456, -6.7191, -4.2421]], device='cuda:0')

In [89]:
weighted_noise = (rewards * epsilons).sum(dim=0)

RuntimeError: The size of tensor a (5) must match the size of tensor b (4) at non-singleton dimension 1

In [61]:
rowP.get_utility(*outcome)

tensor([3., 0., 0., 3., 0.], device='cuda:0')

In [58]:
outcome = env.game.play(action_profile.view(batch_size, 2, -1))

In [59]:
type(outcome)

tuple

In [53]:
a,p = (alloc, pay)

In [54]:
rowP.get_utility(alloc, pay)

tensor([3., 0., 0., 3., 0.], device='cuda:0')

In [49]:
-pay[:, 0]

tensor([3., 0., 0., 3., 0.], device='cuda:0')