# Playing Simple Games with Neural Nets

In this notebook, we implement equilibrium learning via self-play for simple games such as Battle of the Sexes and Matching Pennies.

In [None]:
import os, sys, time
# Resolve the parent directory of the current working directory and put it on
# sys.path (only once) -- presumably so that the `bnelearn` package imported
# below can be found when the notebook is run from its own folder.
root_path = os.path.abspath(os.path.join('..'))
if root_path not in sys.path:
    sys.path.append(root_path)
    
import torch
from bnelearn.strategy import MatrixGameStrategy
from bnelearn.bidder import MatrixGamePlayer
from bnelearn.mechanism import PrisonersDilemma, BattleOfTheSexes, MatchingPennies, RockPaperScissors
from bnelearn.learner import ESPGLearner as ES
from bnelearn.environment import Environment, AuctionEnvironment, MatrixGameEnvironment

In [None]:
from torch.utils.tensorboard import SummaryWriter
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Show whether a CUDA device is visible; the model cells below call .cuda().
torch.cuda.is_available()

## Symmetric Game: Prisoners' Dilemma

In [None]:
# Timestamped run name (minute resolution) and the log directory for the
# Prisoner's Dilemma experiment: <root>/notebooks/matrix/pd/<run_name>.
# NOTE(review): the run name contains ':' and spaces, which may not be valid in
# directory names on every platform -- confirm for the target OS.
run_name = time.strftime('%Y-%m-%d %a %H:%M')
logdir = os.path.join(root_path, 'notebooks', 'matrix', 'pd', run_name)

In [None]:
# Display the log directory chosen for this run.
logdir

In [None]:
## Experiment setup
n_players = 2

## Environment settings
batch_size = 64  # training batch size
input_length = 1

## Optimization params
epoch = 25  # the training loop iterates over range(epoch + 1)
# Initial learning rate. (A dead `learning_rate = 1.0` that was immediately
# overwritten has been removed; the effective value is unchanged.)
learning_rate = 1
lr_decay = False  # when True, learning_rate is scaled by lr_decay_factor ...
lr_decay_every = 1000  # ... every this many epochs
lr_decay_factor = 0.8
optimizer_hyperparams = {'lr': learning_rate}

## Evolution-strategy (ES) params
sigma = 5  # ES noise parameter
n_perturbations = 8  # handed to the learner as 'population_size'

learner_hyperparams = {
    'sigma': sigma,
    'population_size': n_perturbations,
    'scale_sigma_by_model_size': False,
}



In [None]:
def strat_to_player(strategy, batch_size, player_position=None):
    """Wrap ``strategy`` in a ``MatrixGamePlayer``.

    This function is handed to the environment as its
    ``strategy_to_player_closure``. It is a thin dummy wrapper: matrix-game
    players have no valuation that would matter here.

    Args:
        strategy: the strategy object the player should act with.
        batch_size: forwarded to ``MatrixGamePlayer`` as ``batch_size``.
        player_position: forwarded to ``MatrixGamePlayer``; defaults to None.

    Returns:
        The newly created ``MatrixGamePlayer``.
    """
    player = MatrixGamePlayer(
        strategy,
        batch_size=batch_size,
        player_position=player_position,
    )
    return player

In [None]:
# Single two-action strategy, moved to the GPU; it is shared by both players of
# the symmetric game (see the environment cell below).
model = MatrixGameStrategy(n_actions=2).cuda()

In [None]:
# The game mechanism played in this section.
game = PrisonersDilemma()

In [None]:
# Self-play environment for the symmetric game: every seat is filled by a
# player wrapping the same shared `model`. The player count is taken from
# `n_players` (rather than a hard-coded 2) so it always matches the number of
# agents built in the list comprehension.
env = MatrixGameEnvironment(
    game,
    agents=[strat_to_player(model, batch_size, i) for i in range(n_players)],
    n_players=n_players,
    batch_size=batch_size,
    strategy_to_player_closure=strat_to_player,
)

In [None]:
# ES learner that updates the shared `model` against `env`, using plain SGD
# with the learning rate configured in `optimizer_hyperparams`.
learner = ES(
    model=model,
    environment=env,
    hyperparams=learner_hyperparams,
    optimizer_type=torch.optim.SGD,
    optimizer_hyperparams=optimizer_hyperparams,
)

In [None]:
def log_hyperparams(writer):
    """Write the current module-level hyperparameters to ``writer``.

    Each value is logged once via ``writer.add_scalar`` (no explicit step)
    under the ``hyperparams/`` prefix.

    Args:
        writer: object exposing ``add_scalar(tag, value)``, e.g. a
            ``SummaryWriter``.
    """
    tagged_values = (
        ('hyperparams/batch_size', batch_size),
        ('hyperparams/learning_rate', learning_rate),
        ('hyperparams/sigma', sigma),
        ('hyperparams/n_perturbations', n_perturbations),
    )
    for tag, value in tagged_values:
        writer.add_scalar(tag, value)

Training

In [None]:
# Train the shared model for `epoch + 1` iterations and stream the metrics to
# TensorBoard (flushed every 30 seconds).
with SummaryWriter(log_dir=logdir, flush_secs=30) as writer:
    torch.cuda.empty_cache()
    log_hyperparams(writer)

    for e in range(0, epoch + 1):

        # Optional step decay: never at e == 0, then every `lr_decay_every`
        # epochs. The new rate is logged and pushed into the optimizer.
        if lr_decay and e % lr_decay_every == 0 and e > 0:
            learning_rate *= lr_decay_factor
            writer.add_scalar('hyperparams/learning_rate', learning_rate, e)
            for group in learner.optimizer.param_groups:
                group['lr'] = learning_rate

        # One optimizer step per epoch; the learner's return value is negated
        # before being logged as utility.
        utility = -learner.update_strategy_and_evaluate_utility()

        writer.add_scalar('eval/utility', utility, e)
        writer.add_scalar('eval/prob_action_0', model.distribution.probs[0], e)

        print(e)

In [None]:
# Wrap the trained model in a player with batch size 10 for a quick manual check.
player = strat_to_player(model, 10)

In [None]:
# Display the player object.
player

In [None]:
# Mean of one batch of sampled actions, cast to float.
# NOTE(review): if actions are encoded as 0/1 indices this is the empirical
# frequency of action 1 -- confirm the encoding.
player.get_action().float().mean()

## Asymmetric Games: BoS and Matching Pennies

In [None]:
# --- Experiment setup ---
n_players = 2

# --- Environment settings ---
batch_size = 1 << 10  # training batch size (1024)
input_length = 1

# --- Optimization parameters ---
epoch = 1000  # the training loop iterates over range(epoch + 1)
learning_rate = 1
# Step decay of the learning rate: switched off here; when enabled the rate is
# multiplied by `lr_decay_factor` every `lr_decay_every` epochs.
lr_decay, lr_decay_every, lr_decay_factor = False, 100, 0.8
optimizer_hyperparams = dict(lr=learning_rate)

# --- Evolution-strategy (ES) parameters ---
sigma = 5  # ES noise parameter
n_perturbations = 8  # handed to the learner as 'population_size'
learner_hyperparams = dict(
    sigma=sigma,
    population_size=n_perturbations,
    scale_sigma_by_model_size=False,
)

# Game selection. The three option groups below run top to bottom, so the LAST
# (game, game_name, n_actions) triple is the one that takes effect. To play an
# earlier game, comment out the groups after it.
#
# Every option sets `game_name`, because the log directory below is built from
# it. The Matching Pennies option used to set only `directory_name`, which made
# `game_name` undefined when that option was selected.
game = MatchingPennies()
game_name = 'matching_pennies'
directory_name = game_name  # old variable name, kept in case another cell reads it
n_actions = 2

game = BattleOfTheSexes()
game_name = 'bos'
n_actions = 2

game = RockPaperScissors()
game_name = 'rps'
n_actions = 3

# One log directory per game and run: <root>/notebooks/matrix/<game_name>/<run_name>.
run_name = time.strftime('%Y-%m-%d %a %H:%M')
logdir = os.path.join(root_path, 'notebooks', 'matrix', game_name, run_name)
print(logdir)

In [None]:
def strat_to_player(strategy, batch_size, player_position=None):
    """Wrap ``strategy`` in a ``MatrixGamePlayer``.

    This function is handed to the environment as its
    ``strategy_to_player_closure``. It is a thin dummy wrapper: matrix-game
    players have no valuation that would matter here.

    Args:
        strategy: the strategy object the player should act with.
        batch_size: forwarded to ``MatrixGamePlayer`` as ``batch_size``.
        player_position: forwarded to ``MatrixGamePlayer``; defaults to None.

    Returns:
        The newly created ``MatrixGamePlayer``.
    """
    player = MatrixGamePlayer(
        strategy,
        batch_size=batch_size,
        player_position=player_position,
    )
    return player

In [None]:
# One independent strategy per player, each moved to the GPU.
model1, model2 = (
    MatrixGameStrategy(n_actions=n_actions).cuda() for _ in range(2)
)

In [None]:
# Two-player environment for the selected game. Unlike the symmetric section,
# the agents are passed as raw strategies here.
# NOTE(review): presumably the environment turns them into players through
# `strategy_to_player_closure` -- confirm against MatrixGameEnvironment.
env = MatrixGameEnvironment(game, agents=[model1, model2],
                 n_players=2,
                 batch_size=batch_size,
                 strategy_to_player_closure=strat_to_player
                 )

In [None]:
# One ES learner per player. Both share the environment and hyperparameter
# dicts and differ only in the model they train and in the `player_position`
# passed on via `strat_to_player_kwargs`.
learners = [
    ES(
        model=strategy,
        environment=env,
        hyperparams=learner_hyperparams,
        optimizer_type=torch.optim.SGD,
        optimizer_hyperparams=optimizer_hyperparams,
        strat_to_player_kwargs={'player_position': position},
    )
    for position, strategy in enumerate((model1, model2))
]
learner1, learner2 = learners

In [None]:
def log_hyperparams(writer):
    """Write the current module-level hyperparameters to ``writer``.

    Each value is logged once via ``writer.add_scalar`` (no explicit step)
    under the ``hyperparams/`` prefix.

    Args:
        writer: object exposing ``add_scalar(tag, value)``, e.g. a
            ``SummaryWriter``.
    """
    tagged_values = (
        ('hyperparams/batch_size', batch_size),
        ('hyperparams/learning_rate', learning_rate),
        ('hyperparams/sigma', sigma),
        ('hyperparams/n_perturbations', n_perturbations),
    )
    for tag, value in tagged_values:
        writer.add_scalar(tag, value)

In [None]:
# Action probabilities currently held by player 1's strategy.
model1.distribution.probs

In [None]:
# Action probabilities currently held by player 2's strategy.
model2.distribution.probs

In [None]:
# Running means of each player's utility over all epochs processed so far.
hist_utility_1 = 0
hist_utility_2 = 0
with SummaryWriter(log_dir=logdir) as writer:
    torch.cuda.empty_cache()
    log_hyperparams(writer)

    for e in range(epoch + 1):

        # Optional step decay: never at e == 0, then every `lr_decay_every`
        # epochs. The new rate is pushed into the optimizer of BOTH learners.
        # (This branch used to iterate over an undefined name `optimizers` and
        # raised NameError as soon as lr_decay was enabled.)
        if lr_decay and e % lr_decay_every == 0 and e > 0:
            learning_rate = learning_rate * lr_decay_factor
            writer.add_scalar('hyperparams/learning_rate', learning_rate, e)
            for decayed_learner in learners:
                for param_group in decayed_learner.optimizer.param_groups:
                    param_group['lr'] = learning_rate

        # One update per player and epoch, player 1 first; each learner's
        # return value is negated before being logged as utility.
        utility1 = -learner1.update_strategy_and_evaluate_utility()
        utility2 = -learner2.update_strategy_and_evaluate_utility()

        # Incremental mean over epochs 0..e. At e == 0 this reduces to the
        # current utility because both accumulators start at 0, so no special
        # case is needed.
        hist_utility_1 = (e * hist_utility_1 + utility1) / (e + 1)
        hist_utility_2 = (e * hist_utility_2 + utility2) / (e + 1)

        # Histogram of one batch of actions sampled by the first agent.
        writer.add_histogram(
            'eval/p1_action_distribution',
            env.agents[0].get_action().view(-1).cpu().numpy(),
            e,
        )

        writer.add_scalar('eval_player_1/utility', utility1, e)
        writer.add_scalar('eval_player_1/historic_utility', hist_utility_1, e)
        writer.add_scalar('eval_player_1/prob_action_0', model1.distribution.probs[0], e)

        writer.add_scalar('eval_player_2/utility', utility2, e)
        writer.add_scalar('eval_player_2/historic_utility', hist_utility_2, e)
        writer.add_scalar('eval_player_2/prob_action_0', model2.distribution.probs[0], e)

        # Progress indicator every 50 epochs.
        if e % 50 == 0:
            print(e)

In [None]:
# Utilities of both players from the last training epoch, as Python scalars.
(utility1.item(), utility2.item())

In [None]:
# Running-mean utilities of both players over the whole training run.
(hist_utility_1.item(), hist_utility_2.item())