# Playing Simple Games with Neural Nets

In this notebook, we implement equilibrium learning via self-play for simple games such as Battle of the Sexes and Matching Pennies.

In [1]:
import os, sys, time
# Put the parent of the current working directory on sys.path (once),
# presumably so that the `bnelearn` imports below resolve without installing the package.
root_path = os.path.abspath(os.path.join('..'))
if root_path not in sys.path:
    sys.path.append(root_path)
    
import torch
from bnelearn.strategy import MatrixGameStrategy
from bnelearn.bidder import Bidder, Player, MatrixGamePlayer
from bnelearn.mechanism import PrisonersDilemma, BattleOfTheSexes, MatchingPennies, RockPaperScissors
from bnelearn.optimizer import ES
from bnelearn.environment import Environment, AuctionEnvironment, MatrixGameEnvironment

In [2]:
from torch.utils.tensorboard import SummaryWriter
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Sanity check: later cells move the models to the GPU with `.cuda()`, so this should be True.
torch.cuda.is_available()

True

## Symmetric Game: Prisoners' Dilemma

In [4]:
# Name the run by its start time (minute resolution), so runs started in different
# minutes get separate TensorBoard log directories under <root>/notebooks/matrix/pd/.
# NOTE(review): the saved output of the `logdir` cell below shows a path without the
# 'matrix' component, so that output predates this line — re-run to refresh it.
run_name = time.strftime('%Y-%m-%d %a %H:%M')
logdir = os.path.join(root_path, 'notebooks', 'matrix', 'pd', run_name)

In [5]:
logdir

'/home/heidekrueger/bnelearn/notebooks/pd/2019-07-14 Sun 14:46'

In [6]:
## Experiment setup
# number of players in the matrix game
n_players = 2

## Environment settings
# training batch size (passed to the players and to the environment as batch_size)
batch_size = 64
# NOTE(review): input_length is not referenced by any other cell shown in this notebook — confirm it is still needed
input_length = 1


# optimization params
# the training loop iterates over range(epoch + 1)
epoch = 25
learning_rate = 1
# when lr_decay is True, the training loop multiplies learning_rate by
# lr_decay_factor every lr_decay_every iterations (never at iteration 0)
lr_decay = False
lr_decay_every = 1000
lr_decay_factor = 0.8

sigma = 5  # ES noise parameter
n_perturbations = 8  # passed to ES as n_perturbations


In [7]:
# Factory handed to the environment as `strategy_to_player_closure` (and, per the
# original note, used by the optimizer): wraps a strategy in a player object.
def strat_to_player(strategy, batch_size, player_position=None):
    """Build a `MatrixGamePlayer` that plays `strategy`.

    Args:
        strategy: the strategy (model) the player should follow.
        batch_size: forwarded to `MatrixGamePlayer` as `batch_size`.
        player_position: forwarded to `MatrixGamePlayer`; defaults to None.

    Returns:
        The newly constructed `MatrixGamePlayer`.
    """
    player_options = {'batch_size': batch_size, 'player_position': player_position}
    return MatrixGamePlayer(strategy, **player_options)

In [8]:
# One strategy over two actions, moved to the GPU. The same `model` is wrapped for
# every player when the environment is built below (self-play).
model = MatrixGameStrategy(n_actions=2).cuda()

In [9]:
# The game played in this section; passed to MatrixGameEnvironment below.
game = PrisonersDilemma()

In [15]:
# Self-play environment: every seat is filled by a player wrapping the same shared `model`.
# `n_players` is passed through from the setup cell rather than repeating the literal 2,
# so the agent list and the environment cannot disagree about the player count.
env = MatrixGameEnvironment(
    game,
    agents=[strat_to_player(model, batch_size, i) for i in range(n_players)],
    n_players=n_players,
    batch_size=batch_size,
    strategy_to_player_closure=strat_to_player,
)

In [16]:
# ES optimizer for `model`, evaluated in `env`, configured from the setup cell.
optimizer = ES(
    model=model,
    environment=env,
    lr=learning_rate,
    sigma=sigma,
    n_perturbations=n_perturbations,
)

In [7]:
def log_hyperparams(writer):
    """Record the current hyperparameters as scalars under the 'hyperparams/' tags.

    Reads the notebook-level variables `batch_size`, `learning_rate`, `sigma`
    and `n_perturbations` at call time.

    Args:
        writer: object exposing `add_scalar(tag, value)`, e.g. a SummaryWriter.
    """
    for tag, value in (
        ('hyperparams/batch_size', batch_size),
        ('hyperparams/learning_rate', learning_rate),
        ('hyperparams/sigma', sigma),
        ('hyperparams/n_perturbations', n_perturbations),
    ):
        writer.add_scalar(tag, value)

### Training

In [18]:
# Train the shared model and stream metrics to TensorBoard.
with SummaryWriter(log_dir=logdir, flush_secs=30) as writer:
    torch.cuda.empty_cache()
    log_hyperparams(writer)

    for e in range(epoch + 1):
        # Optional step-wise decay: every `lr_decay_every` iterations (never at 0)
        # shrink the learning rate and push the new value into the optimizer.
        decay_now = lr_decay and e % lr_decay_every == 0 and e > 0
        if decay_now:
            learning_rate *= lr_decay_factor
            for param_group in optimizer.param_groups:
                param_group['lr'] = learning_rate
            writer.add_scalar('hyperparams/learning_rate', learning_rate, e)

        # One optimizer step per iteration; its return value is negated to obtain the utility.
        utility = -optimizer.step()
        writer.add_scalar('eval/utility', utility, e)
        prob_action_0 = model.distribution.probs[0]
        writer.add_scalar('eval/prob_action_0', prob_action_0, e)
        print(e)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25


In [19]:
# Wrap the trained model in a fresh player with a small batch (10) to inspect its actions.
player = strat_to_player(model, 10)

In [20]:
player

<bnelearn.bidder.MatrixGamePlayer at 0x7f83e0032978>

In [21]:
player.get_action().float().mean()

tensor(1., device='cuda:0')

## Asymmetric Games, BoS and Matching Pennies

In [30]:
## Experiment setup
# number of players in the matrix game
n_players = 2

## Environment settings
# training batch size (passed to the environment as batch_size)
batch_size = 2**10
# NOTE(review): input_length is not referenced by any other cell shown in this notebook — confirm it is still needed
input_length = 1


# optimization params
# the training loop iterates over range(epoch + 1)
epoch = 1000
learning_rate = 1
# when lr_decay is True, the training loop multiplies learning_rate by
# lr_decay_factor every lr_decay_every iterations (never at iteration 0)
lr_decay = False
lr_decay_every = 100
lr_decay_factor = 0.8

sigma = 5  # ES noise parameter
n_perturbations = 10  # passed to ES as n_perturbations

# Games this section can run: short name -> (mechanism class, number of actions).
# The short name is also used as the TensorBoard log sub-directory.
GAMES = {
    'matching_pennies': (MatchingPennies, 2),
    'bos': (BattleOfTheSexes, 2),
    'rps': (RockPaperScissors, 3),
}

# Pick exactly one game here. `game`, `game_name` and `n_actions` are all derived
# from this single choice, so they cannot get out of sync with each other.
game_name = 'rps'
game_class, n_actions = GAMES[game_name]
game = game_class()

# Log directory: <root>/notebooks/matrix/<game_name>/<start time of this run>.
run_name = time.strftime('%Y-%m-%d %a %H:%M')
logdir = os.path.join(root_path, 'notebooks', 'matrix', game_name, run_name)
print(logdir)

/home/heidekrueger/bnelearn/notebooks/matrix/rps/2019-07-14 Sun 15:09


In [31]:
# Factory handed to the environment as `strategy_to_player_closure` (and, per the
# original note, used by the optimizer): wraps a strategy in a player object.
def strat_to_player(strategy, batch_size, player_position=None):
    """Build a `MatrixGamePlayer` that plays `strategy`.

    Args:
        strategy: the strategy (model) the player should follow.
        batch_size: forwarded to `MatrixGamePlayer` as `batch_size`.
        player_position: forwarded to `MatrixGamePlayer`; defaults to None.

    Returns:
        The newly constructed `MatrixGamePlayer`.
    """
    player_options = {'batch_size': batch_size, 'player_position': player_position}
    return MatrixGamePlayer(strategy, **player_options)

In [32]:
# Two separately constructed strategies (one per player), each moved to the GPU.
model1, model2 = (MatrixGameStrategy(n_actions=n_actions).cuda() for _ in range(2))

In [33]:
# Environment for the two-model setting. The raw strategies are passed as agents here;
# presumably the environment wraps them into players via `strategy_to_player_closure`
# (later cells call `env.agents[0].get_action()`) — TODO confirm against MatrixGameEnvironment.
# `n_players` is passed through from the setup cell rather than repeating the literal 2.
env = MatrixGameEnvironment(
    game,
    agents=[model1, model2],
    n_players=n_players,
    batch_size=batch_size,
    strategy_to_player_closure=strat_to_player,
)

In [34]:
# One ES optimizer per model. Both share the same environment and ES settings and
# differ only in the model they update and the player position they are given.
shared_es_kwargs = dict(
    environment=env,
    lr=learning_rate,
    sigma=sigma,
    n_perturbations=n_perturbations,
)
optimizer1 = ES(model=model1, strat_to_player_kwargs={'player_position': 0}, **shared_es_kwargs)
optimizer2 = ES(model=model2, strat_to_player_kwargs={'player_position': 1}, **shared_es_kwargs)
optimizers = [optimizer1, optimizer2]

In [35]:
def log_hyperparams(writer):
    """Record the current hyperparameters as scalars under the 'hyperparams/' tags.

    Reads the notebook-level variables `batch_size`, `learning_rate`, `sigma`
    and `n_perturbations` at call time.

    Args:
        writer: object exposing `add_scalar(tag, value)`, e.g. a SummaryWriter.
    """
    for tag, value in (
        ('hyperparams/batch_size', batch_size),
        ('hyperparams/learning_rate', learning_rate),
        ('hyperparams/sigma', sigma),
        ('hyperparams/n_perturbations', n_perturbations),
    ):
        writer.add_scalar(tag, value)

In [44]:
model1.distribution.probs

tensor([1.5551e-06, 2.1746e-02, 9.7825e-01], device='cuda:0')

In [45]:
model2.distribution.probs

tensor([2.1298e-09, 7.0145e-05, 9.9993e-01], device='cuda:0')

True

In [47]:
env.agents[0].get_action()

tensor([[2],
        [2],
        [2],
        ...,
        [2],
        [2],
        [2]], device='cuda:0')

In [38]:
env.agents[0].get_action().view(-1).cpu().numpy()

array([2, 0, 1, ..., 2, 1, 0])

In [None]:
# Running means of each player's utility over all iterations so far.
hist_utility_1 = 0
hist_utility_2 = 0
with SummaryWriter(log_dir=logdir) as writer:
    torch.cuda.empty_cache()
    log_hyperparams(writer)

    for e in range(epoch + 1):

        # Optional step-wise learning-rate decay, applied to both optimizers.
        if lr_decay and e % lr_decay_every == 0 and e > 0:
            learning_rate = learning_rate * lr_decay_factor
            writer.add_scalar('hyperparams/learning_rate', learning_rate, e)
            # Loop variable is `opt`, not `optimizer`, so the notebook-level
            # `optimizer` from the Prisoners' Dilemma section is not overwritten.
            for opt in optimizers:
                for param_group in opt.param_groups:
                    param_group['lr'] = learning_rate

        # The two optimizers step one after the other, player 1 first.
        # Each step's return value is negated to obtain that player's utility.
        utility1 = -optimizer1.step()
        utility2 = -optimizer2.step()

        # Incremental mean over e + 1 samples. The accumulators start at 0, so at
        # e == 0 this reduces to the first utility and no special case is needed.
        hist_utility_1 = (e * hist_utility_1 + utility1) / (e + 1)
        hist_utility_2 = (e * hist_utility_2 + utility2) / (e + 1)

        # Histogram of a freshly sampled batch of player 1's actions.
        writer.add_histogram('eval/p1_action_distribution', env.agents[0].get_action().view(-1).cpu().numpy(), e)

        writer.add_scalar('eval/p1_utility', utility1, e)
        writer.add_scalar('eval/p1_historic_utility', hist_utility_1, e)
        writer.add_scalar('eval/p1_prob_action_0', model1.distribution.probs[0], e)

        writer.add_scalar('eval/p2_utility', utility2, e)
        writer.add_scalar('eval/p2_historic_utility', hist_utility_2, e)
        writer.add_scalar('eval/p2_prob_action_0', model2.distribution.probs[0], e)

        # Progress marker every 50 iterations.
        if e % 50 == 0:
            print(e)

In [42]:
(utility1.item(), utility2.item())

(-0.0166015625, 0.0283203125)

In [43]:
(hist_utility_1.item(), hist_utility_2.item())

(-0.0055627841502428055, 0.06153518706560135)