In [1]:
import gc
import dill
import numpy as np
import pandas as pd
import axelrod as axl
from time import time
from pprint import pprint
import matplotlib.pyplot as plt

# Display settings for the whole notebook:
# numpy arrays are printed with 3 digits of precision,
np.set_printoptions(precision=3)
# and pandas floats are rendered with a thousands separator and 2 decimals.
pd.options.display.float_format = "{:,.2f}".format

# the following imports are from network / axl_utils folder
import network
from network.dqn import DQN
from network.replay import ReplayMemory
from axl_utils.nnplayer import NNplayer, State
from axl_utils.game import *

In [2]:
# Shorthands for the two possible actions (cooperate / defect).
C = axl.Action.C
D = axl.Action.D

# config game rules
# doubled game length for some extra complexity
# NOTE(review): the reason for the extra "+ 1" turn is not visible in this notebook
# -- confirm against axl_utils (State / set_match).
GAME_LEN = 20 + 1
# Prisoner's-dilemma payoffs: r = reward for mutual cooperation, s = sucker's payoff,
# t = temptation to defect, p = punishment for mutual defection.
GAME = axl.Game(r=3, s=0, t=5, p=1)
# set_match / set_play are not defined in this notebook; presumably they come from the
# star import of axl_utils.game. `play(player_a, player_b, show=...)` is the helper
# used below to run a single match under these rules.
Match = set_match(game=GAME, turns=GAME_LEN)
play = set_play(Match)

### This stage uses the Axelrod tournament machinery (`axl.tournament`) to let our DQN play against multiple strategies

Detailed documentation and more strategies can be found here: https://axelrod.readthedocs.io/en/stable/_modules/index.html

In [3]:
from axl_utils.tournament import Tournament

In [4]:
# Opponent pool for a deliberately mixed environment. The intent is to
#   * reward provocative defection  -> when facing the cooperative strategies,
#   * reward retaliatory defection  -> when facing the provocative strategies,
#   * punish excessive defection    -> when facing the retaliative strategies.
cooperative = (axl.Cooperator(), axl.TitFor2Tats())
provocative = (axl.Prober(), axl.Prober4(), axl.RemorsefulProber())
retaliative = (axl.TitForTat(), axl.Grudger(), axl.Punisher())
# Noise strategies are currently switched off; candidates were
# axl.Random() and axl.Alternator().
noise = ()

# Flatten the groups into a single list, keeping the group order.
players = list(cooperative + provocative + retaliative + noise)

# Baseline tournament among the hand-written strategies only (no DQN yet).
tournament = Tournament(players, game=GAME, turns=GAME_LEN)
results = tournament.play()

Playing matches: 100%|████████████████████████████████████████████████████████████████| 36/36 [00:00<00:00, 196.26it/s]
Analysing: 100%|██████████████████████████████████████████████████████████████████████| 25/25 [00:00<00:00, 142.26it/s]


In [5]:
# Tournament result table.
# The meaning of each column is explained here:
# https://axelrod.readthedocs.io/en/stable/tutorials/advanced/tournament_results.html
#
# Median_score is based on the normalised score, i.e. the average payoff per turn;
# e.g. if two players cooperate for the whole game it should be 3.00 for both.
summary = results.summarise()
headers = "Rank,Name,Median_score,Cooperation_rating,Wins,Initial_C_rate,CC_rate,CD_rate,DC_rate,DD_rate,CC_to_C_rate,CD_to_C_rate,DC_to_C_rate,DD_to_C_rate".split(',')
pd.DataFrame([list(row) for row in summary], columns=headers)

Unnamed: 0,Rank,Name,Median_score,Cooperation_rating,Wins,Initial_C_rate,CC_rate,CD_rate,DC_rate,DD_rate,CC_to_C_rate,CD_to_C_rate,DC_to_C_rate,DD_to_C_rate
0,0,Tit For Tat,2.82,0.88,0.0,1.0,0.78,0.1,0.09,0.03,1.0,0.0,1.0,0.0
1,1,Grudger,2.61,0.68,2.0,1.0,0.66,0.02,0.08,0.24,1.0,0.0,0.0,0.0
2,2,Remorseful Prober: 0.1,2.6,0.67,3.0,1.0,0.52,0.15,0.18,0.16,0.89,0.0,0.84,0.66
3,3,Punisher,2.56,0.67,2.0,1.0,0.65,0.02,0.08,0.26,1.0,0.0,0.0,0.0
4,4,Tit For 2 Tats,2.5,0.87,0.0,1.0,0.77,0.1,0.02,0.12,1.0,0.78,1.0,0.0
5,5,Prober 4,2.41,0.5,4.0,1.0,0.23,0.27,0.3,0.2,0.35,0.34,0.76,0.7
6,6,Prober,2.38,0.36,2.0,0.0,0.23,0.13,0.27,0.37,0.8,0.54,0.77,0.0
7,7,Cooperator,2.36,1.0,0.0,1.0,0.78,0.22,0.0,0.0,1.0,1.0,0.0,0.0


In [9]:
# Q-network: a flatten step followed by a 5-layer MLP with ReLU activations.
# The first linear layer takes GAME_LEN * 2 inputs and the last one emits 2 values
# (presumably one Q-value per action, C and D -- confirm against network.dqn).
dqn = DQN(
    [
        network.Flatten_layer(),
        network.Linear_layer(GAME_LEN * 2, 300),
        network.Activation_layer('ReLU'),
        network.Linear_layer(300, 150),
        network.Activation_layer('ReLU'),
        network.Linear_layer(150, 80),
        network.Activation_layer('ReLU'),
        network.Linear_layer(80, 40),
        network.Activation_layer('ReLU'),
        network.Linear_layer(40, 2),
    ],
    # presumably a replay buffer holding up to 8000 transitions -- TODO confirm
    ReplayMemory(8000),
    gamma=0.9,
    greedy=0.2,
)

# Wrap the network in a player object; later cells reach the network via p1.network.
p1 = NNplayer(dqn, State(GAME_LEN))

# Drop the notebook-level name so the network is only referenced through p1.
del dqn
gc.collect()

# Hyper-parameters handed to p1.train(); keys are consumed by the network package.
param = {
    "lr": 1e-4,
    'batch': 32,
    "momentum": 0.9,
    "mode": "train",
    "eps": 1e-16,
    "beta": (0.9, 0.999),
    "epoch": 0,
    'optimizer': 'adam',
    't': 1,
    'clip': 1.0,
    'regularizer': ('l2', 1e-3),
    "loss_fn": "mse",
}

In [10]:
from random import shuffle     

def train_against(trainee, trainers, iterations=40):
    """Let `trainee` play `iterations` matches against every trainer, in random order.

    Parameters
    ----------
    trainee
        Player handed to `play` as the first participant (the NNplayer in this notebook).
    trainers
        Iterable of opponents. It is NOT modified: the visiting order is shuffled
        on a private copy, so a shared list such as `players` keeps its order and
        tuples / other non-list iterables are accepted as well.
    iterations : int
        Number of consecutive matches played against each trainer.

    Returns
    -------
    None. The matches are played for their side effects only.
    """
    # Shuffle a copy: random.shuffle works in place and would otherwise reorder
    # the caller's list (and fail outright on an immutable tuple).
    schedule = list(trainers)
    shuffle(schedule)
    for trainer in schedule:
        for _ in range(iterations):
            play(trainee, trainer, show=False)

In [11]:
# Pre-fill the replay memory with one full round of matches before any training.
train_against(p1, players)
len(p1.network.memory)      # 21(turns) * 8(# of players) * 40(iterations) = 6720

6720

In [None]:
# train loop as usual
# Each of the 10 rounds: train on the current replay memory (first argument 30 is
# presumably the number of epochs -- confirm against NNplayer.train), then play a fresh
# round of matches against every opponent, and report the loss and the wall-clock
# time of the round.
for _ in range(10):
    start = time()
    p1.train(30, param)
    
    train_against(p1, players)
    
    print(f'loss: {p1.network.loss},            time: +{time()-start:.2f} sec')

In [None]:
p1.plot()
# However, it seems the policy net did not get enough training before the target net
# was updated, causing the same divergence we saw in stage 1, section 2.

# This is evident from the cyclic, overall upward trend in the loss:

In [12]:
# Start over: reinitialise the network (it prints "Network reinitialized.") and retry
# with the same hyper-parameters as before except for a lower learning rate
# (1e-4 -> 7e-5).
p1.network.reset()
param = {"lr": 7e-5, 'batch': 32, "momentum": 0.9, "mode": "train", "eps": 1e-16, "beta":(0.9, 0.999), 
         "epoch": 0, 'optimizer': 'adam', 't': 1, 'clip': 1.0, 'regularizer': ('l2', 1e-3), "loss_fn":"mse"}

Network reinitialized.


In [None]:
# time to take a nap
# Long training run (80 rounds, roughly two minutes each according to the logged output).
# Every round trains the network, then plays a new batch of matches against the pool.
# Every second round the DQN is additionally evaluated in a full tournament and both
# the accumulated tournament tables and the player itself are backed up to disk.
import os  # only needed here, to prepare the backup directory

# Make sure the backup directory exists BEFORE training starts; otherwise the first
# open(..., "wb") below fails only after a whole training round has been spent.
os.makedirs('data/s2', exist_ok=True)

ls = []        # one tournament summary DataFrame per evaluated round
loss_ls = []   # summed network loss at each evaluated round
for i in range(80):

    start = time()
    # presumably stops early once the loss target is reached (the log shows
    # "terminated at N epochs") -- confirm against NNplayer.train
    p1.train(250, param, loss_targ=0.03)

    train_against(p1, players)

    print(f'loss: {p1.network.loss},            time: +{time()-start:.2f} sec')

    # test DQN in tournament
    if i % 2 == 0:
        loss_ls.append(np.sum(p1.network.loss))
        # NOTE(review): `with p1:` presumably switches the player to evaluation mode
        # for the duration of the tournament -- confirm against NNplayer.__enter__.
        with p1:
            p1.network.verbosity = False
            tournament = Tournament([p1, *players], game=GAME, turns=GAME_LEN)
            results = tournament.play()
            summary = results.summarise()
            ls.append(pd.DataFrame(map(list, summary), columns=headers))

        # backing up
        # The full list of tables is rewritten each time; the player gets one
        # snapshot file per evaluated round.
        with open('data/s2/tn_results.pkl', "wb") as file:
            dill.dump(ls, file)

        with open(f'data/s2/p1_{i}.pkl', "wb") as file:
            dill.dump(p1, file)

terminated at 87 epochs
loss: [0.288 0.318],            time: +109.86 sec


Playing matches: 100%|████████████████████████████████████████████████████████████████| 45/45 [00:00<00:00, 134.33it/s]
Analysing: 100%|██████████████████████████████████████████████████████████████████████| 25/25 [00:00<00:00, 142.86it/s]


terminated at 84 epochs
loss: [0.195 0.293],            time: +114.98 sec


In [None]:
# Plot the player's training history after the long run
# (presumably the loss curve, as in the earlier plot cell).
p1.plot()

In [None]:
# Same plot restricted to a sub-range; presumably min_ran / max_ran bound the
# plotted x-range (500 to 800) -- confirm against NNplayer.plot.
p1.plot(min_ran=500, max_ran=800)

In [None]:
# Play one match of the trained DQN against Remorseful Prober, with `show` left at
# its default. `with p1:` is the same context used around the evaluation tournaments
# (presumably evaluation mode -- confirm against NNplayer).
with p1:
    play(p1, axl.RemorsefulProber())

In [None]:
# Index every recorded tournament table by strategy name so that rows can be
# looked up with .loc[...] in the cells below.
# A plain loop replaces the throwaway list comprehension, and the guard makes the
# cell safe to re-run: once 'Name' has become the index it is no longer a column,
# and calling set_index('Name') again would raise KeyError.
for frame in ls:
    if 'Name' in frame.columns:
        frame.set_index('Name', inplace=True)

In [None]:
# Inspect the third recorded tournament table. Tables are recorded on every second
# training round (i = 0, 2, 4, ...), so index 2 corresponds to round i = 4.
ls[2]

In [None]:
# Evolution of the DQN's own tournament metrics: one row per recorded tournament,
# one column per metric. Relies on the tables in `ls` being indexed by 'Name'.
things = ['Rank', 'Median_score', 'Cooperation_rating', 'Initial_C_rate', 'CC_to_C_rate', 'CD_to_C_rate']
pd.DataFrame(
    [[table.loc['NNplayer', metric] for metric in things] for table in ls],
    columns=things,
)