In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import network

import gc
import copy
import random
import pickle
import numpy as np
import axelrod as axl
from itertools import permutations
from collections import namedtuple, deque

In [4]:
Config = {}
GAME_LEN = 20
C = axl.Action.C
D = axl.Action.D
GAME = axl.Game(r=30, s=0, t=50, p=10)

In [5]:
def Match(players, turns=GAME_LEN, reset=False):
    return axl.Match(players, turns=turns, reset=reset, game=GAME)

In [6]:
players = (axl.Alternator(), axl.Random())
game = Match(players, turns=GAME_LEN)
actions = game.play()
scores = game.scores()

In [7]:
Transition = namedtuple('Transition', 
                        ('state', 'action', 'next_state', 'reward'))
#tran_history = []
class ReplayMemory(object):

    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)

    def push(self, *args):
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)
    
    def __repr__(self):
        if len(self) >= 100:
            out = list(self.memory)[:100]
        else:
            out = self.memory
        return str(out).replace("), ", "),\n")
    
    def save(self, path):
        with open(path, "wb") as file:
            pickle.dump(self, file)
    
    def load(self, path, mode='overwrite'):
        with open(path, "rb") as file:
            if mode == 'overwrite':
                self = pickle.load(file)
            elif mode == 'add':
                for i in pickle.load(file).memory:
                    self.memory.append(i)
memory = ReplayMemory(2000)

In [8]:
def extract_states(history, size, memory, N=-1, mode='int'):
    temp = deque([N for i in range(size)], maxlen=size)
    memory.append(list(temp))
    for state in history:
        temp.append(state)
        memory.append(list(temp))

In [9]:
def extract_transitions(actions, scores, size, memory):
    """
    Extract transitions from a game, and push them into a given replay memory,
    player should be in the 1st position of tuples,
    
    Arguments:
    -------
    (list) actions: action history of the game e.g. [(C,C), (D,C), ...]
    (list) scores: score history of the game e.g. [(3,3), (5,0), ...]
    (Maybe int) size: desired player memory size, could be 'all'
    (ReplayMemory) memory: replay memory to save the transitions, must support a push(*args) method
    """
    # format inputs
    assert len(actions) == len(scores), "Length not matching!"
    actions, scores = map(lambda x: list(list(zip(*x))[0])+[0], (actions, scores))  # extract column then pad for iterator

    # extract states from history
    states = []
    extract_states(scores, size, states)
    
    # save transitions(state, action, next_state, reward) into replay memory
    iterator = iter(zip(states, actions, scores))
    s, a, r = next(iterator)
    while True:
        try:
            s_, a_, r_ = next(iterator)
            memory.push(s, a, s_, r)
            s, a, r = (s_, a_, r_)
        except StopIteration:
            break

In [10]:
memory = ReplayMemory(1000)
extract_transitions(actions, scores, GAME_LEN, memory)

In [11]:
memory.sample(10)

[Transition(state=[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 10, 30, 10, 30, 10, 30, 50], action=C, next_state=[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 10, 30, 10, 30, 10, 30, 50, 30], reward=30),
 Transition(state=[-1, -1, -1, -1, 0, 10, 30, 10, 30, 10, 30, 50, 30, 50, 0, 10, 30, 50, 0, 50], action=C, next_state=[-1, -1, -1, 0, 10, 30, 10, 30, 10, 30, 50, 30, 50, 0, 10, 30, 50, 0, 50, 30], reward=30),
 Transition(state=[-1, 0, 10, 30, 10, 30, 10, 30, 50, 30, 50, 0, 10, 30, 50, 0, 50, 30, 50, 0], action=D, next_state=[0, 10, 30, 10, 30, 10, 30, 50, 30, 50, 0, 10, 30, 50, 0, 50, 30, 50, 0, 50], reward=50),
 Transition(state=[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 10, 30, 10, 30, 10, 30, 50, 30], action=D, next_state=[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 10, 30, 10, 30, 10, 30, 50, 30, 50], reward=50),
 Transition(state=[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 10], action=C, next_state=[-1, -1, -1, -1, -1, -1, -1, -1, -1, -

In [12]:
def collect_exp(players, memory):
    old = len(memory)
    for pair in players:
        game = axl.Match(pair, turns=GAME_LEN)
        actions = game.play()
        scores = game.scores()
        extract_transitions(actions, scores, GAME_LEN, memory)
    new = len(memory)
    print(f"Collected {new-old} experience.")
players = permutations([axl.TitForTat(), axl.TitForTat(), axl.Random(), axl.Alternator()], 2)
collect_exp(players, memory)

Collected 240 experience.


In [13]:
class NNplayer(axl.Player):
    """
    """
    
    # These are various properties for the strategy
    name = 'NNplayer'
    classifier = {
        'memory_depth': 1,  # Four-Vector = (1.,0.,1.,0.)
        'stochastic': False,
        'inspects_source': False,
        'manipulates_source': False,
        'manipulates_state': False
    }   
    
    def __init__(self, network, memory_depth, greedy=0.2, gamma=0.999, capacity=1000):
        super().__init__()
        
        self.memory_depth = memory_depth
        self.decision = (axl.Action.C, axl.Action.D)
        self.replay = ReplayMemory(capacity=capacity)
        self.scores = deque([-1 for _ in range(memory_depth)], maxlen=memory_depth)
        self.policy_net, self.target_net = (copy.deepcopy(network) for _ in range(2))
        
        self.loss = None
        self.gamma = gamma
        self.greedy = greedy

    def strategy(self, opponent):
        """Make decision"""
        if random.random() < self.greedy:
            return random.choice(self.decision)
        else:
            Q_values = self.policy_net(np.array(self.scores))
            return self.decision[np.argmax(Q_values)]
    
    
    # TODO: upgrade to "record"? which writes interaction into the replay memory
    def score(self, play, coplay):
        """Numerical representation of history, readable by neural networks"""
        new = axl.interaction_utils.compute_scores([(play, coplay)])[0][0]
        self.scores.append(new)
        
    # overwrite update_history to update self.score
    def update_history(self, play, coplay):
        self.history.append(play, coplay)
        self.score(play, coplay)
        
        
    def update_network(self):
        """Overwrite the freezed target network with policy network"""
        self.target_net = copy.deepcopy(self.policy_net)
        
    def train(self, opponents):
        pass
    
    def learn(self, epoch, param, batch_size=32, verbosity=0):
        assert len(self.replay) >= batch_size
        self.policy_net.set_loss_func('mse')
        for _ in range(epoch):
            
            param['epoch'] += 1
            param['mode'] = 'train'
            
            # get batch
            batch = Transition(*zip(*memory.sample(batch_size)))
            state_batch = np.array(batch.state)
            action_batch = batch.action
            action_batch = np.array([[True, False] if a==C else [False, True] for a in action_batch])
            next_batch = np.array(batch.next_state)
            reward_batch = np.array(batch.reward, ndmin=2).T
            
            # calculate q values
            # Q value = value of current state = value of most suitable action
            Q_values = self.policy_net(state_batch, param=param) * action_batch
            
            # E(Q value of next state) = reward + value of most suitable action next state
            Q_values_ = np.max(self.target_net(next_batch), axis=1, keepdims=True)
            E_values = self.gamma*Q_values_ + reward_batch
            
            # feedback
            loss, _ = self.policy_net.calc_loss(E_values, Q_values)
            loss = loss * action_batch
            if verbosity:
                print(Q_values)
                print(loss)
            if not self.loss:
                self.loss = np.mean(np.max(np.abs(loss),axis=1))
            else:
                self.loss = 0.9*self.loss + 0.1*np.mean(np.max(np.abs(loss),axis=1))  # track training loss
            self.policy_net.backprop(loss, param)
    
    def push(self, memory):
        """Push iterable containing transition tuples into replay memory"""
        try:
            list(memory)
        except TypeError:
            memory = memory.memory
        for m in memory:
            try:
                assert len(m.state) == self.memory_depth
                self.replay.push(*m)
            except AssertionError:
                print("Invalid memory found /r.", end='')

In [14]:
nn = network.NeuralNetwork([
#                     network.Maxout_layer(GAME_LEN, 20),
#                     network.Maxout_layer(20, 100),
#                     network.BatchNorm_layer(100),
#                     network.Maxout_layer(100, 60),
#                     network.Maxout_layer(60, 40),
#                     network.BatchNorm_layer(40),
#                     network.Maxout_layer(40, 20),
#                     network.Maxout_layer(20, 2),
    
    
                    network.Linear_layer(GAME_LEN, 100),
                    network.Activation_layer('RELU'),
                    network.Linear_layer(100, 200),
                    network.BatchNorm_layer(200),
                    network.Activation_layer('RELU'),
                    network.Linear_layer(200, 40),
                    network.Activation_layer('RELU'),
                    network.Linear_layer(40, 2),
                    ])
# define output[0] to be Cooperation, output[1] to be Defection

p1 = NNplayer(nn, GAME_LEN, capacity=10000)
del nn
gc.collect()

0

In [15]:
# TODO: print loss for C & D average

In [16]:
param = {"lr": 1e-6, 'batch': 8, "momentum": 0.9, "mode": "train", "eps": 1e-9, "beta":(0.9, 0.999), 
         "epoch": 0, 'method': 'adam', 't': 1, 'clip': 1.0, 'decay': 0.0}

In [17]:
p1.greedy=0.4
for i in range(60):
    players = (p1, axl.TitForTat())
    game = Match(players)
    actions = game.play()
    scores = game.scores()
    extract_transitions(actions, scores, GAME_LEN, p1.replay)

In [21]:
p1.greedy=0.2
for i in range(100):
    players = (p1, axl.TitForTat())
    game = Match(players)
    actions = game.play()
    scores = game.scores()
    extract_transitions(actions, scores, GAME_LEN, p1.replay)
    p1.learn(10, param)
    if i % 10 == 0:
        p1.update_network()
        print(p1.loss)
p1.loss

4.244093057247601
4.703479826296893
4.790464131069809
4.990202168775884
4.102807704549662
4.5950234952840985
4.737518985229754
4.545680049022509
4.869107754974345
4.462338206418834


4.117449220033134

In [22]:
p1.learn(1, param, verbosity=1)

[[ 0.00000000e+00  5.22239113e-01]
 [-1.62342497e-01  0.00000000e+00]
 [-0.00000000e+00 -1.85272498e-01]
 [-7.69207129e-04  0.00000000e+00]
 [ 0.00000000e+00 -3.56134132e-02]
 [ 1.23445978e-01  0.00000000e+00]
 [ 5.72031932e-01 -0.00000000e+00]
 [ 0.00000000e+00 -2.46484458e-01]
 [ 0.00000000e+00  2.48520855e-01]
 [ 7.02107792e-01 -0.00000000e+00]
 [ 0.00000000e+00  7.94651317e-01]
 [ 3.14100400e-01 -0.00000000e+00]
 [ 0.00000000e+00  9.99720787e-01]
 [ 2.39615331e-01  0.00000000e+00]
 [ 0.00000000e+00  9.30020454e-01]
 [ 1.68644398e-01 -0.00000000e+00]
 [ 4.16447564e-01  0.00000000e+00]
 [ 0.00000000e+00 -2.20769845e+00]
 [ 1.02865888e-01  0.00000000e+00]
 [ 0.00000000e+00  6.83557874e-01]
 [ 0.00000000e+00  1.75767487e-01]
 [ 0.00000000e+00 -6.76483659e-03]
 [ 0.00000000e+00  2.85876820e-01]
 [-1.65889243e-01  0.00000000e+00]
 [-0.00000000e+00  3.78532925e-01]
 [ 5.24271193e+00 -0.00000000e+00]
 [ 0.00000000e+00  7.16554038e-03]
 [ 0.00000000e+00 -7.68013467e-01]
 [ 0.00000000e+00 -2

In [23]:
p1.greedy=0.0
players = (p1, axl.TitForTat())
game = axl.Match(players, turns=GAME_LEN, reset=False, game=axl.Game(r=40, s=0, t=50, p=10))
actions = game.play()
scores = game.scores()
scores

[(40, 40),
 (40, 40),
 (40, 40),
 (40, 40),
 (40, 40),
 (40, 40),
 (40, 40),
 (40, 40),
 (40, 40),
 (40, 40),
 (40, 40),
 (40, 40),
 (40, 40),
 (40, 40),
 (40, 40),
 (40, 40),
 (40, 40),
 (40, 40),
 (40, 40),
 (40, 40)]

In [302]:
p1.policy_net.print_parameters()

--0--
Printing linear layer:
[[-1.95141390e-01  9.53464595e-02  2.51756278e-01  5.12651106e-02
   2.54650467e-01  2.05659944e-02  5.50238638e-01  8.47223199e-01
   2.68310008e-02  8.63060386e-01  6.08925382e-01 -3.68388129e-01
   1.81448266e-01 -4.04916945e-01  2.64355364e-01  6.25382692e-01
  -1.90646399e-01 -9.16520687e-02 -1.20444476e-01 -1.88434330e-01]
 [-6.89798836e-01 -2.16644595e-01 -2.51612231e-01 -2.92077912e-01
  -1.66352001e-01  4.53775772e-01 -2.70139450e-01 -1.06430434e-01
   1.83764854e-01 -1.58231066e-01  6.39343501e-02 -1.60170407e-02
  -1.10936185e-02 -9.02873237e-02  1.40963878e-01 -3.12888707e-01
   7.98499120e-03  2.52925406e-01  1.34420986e-02 -8.22310215e-03]
 [-3.68527464e-01  5.17412711e-01 -5.08604249e-01 -2.67867122e-02
  -6.23998774e-02  2.26461307e-03 -1.46674331e-01  2.20555148e-01
   4.24733065e-02  3.98596645e-01 -6.55088981e-02  7.08675202e-03
  -7.69013257e-01  1.55104626e-01 -1.38788964e-01  3.23839312e-01
  -2.91661261e-01  1.22358746e-01 -5.27288121

In [155]:
p1.target_net.print_parameters()

--0--
Printing maxout layer:
[[-3.91837235e-01  4.72144521e-01  4.99170780e-01  4.65936340e-02
  -1.85803379e-02  3.72416182e-01  6.61156080e-03 -9.70861277e-03
  -5.26724674e-01  1.59084013e-01  1.35568608e-01  1.70972642e-01
   2.66555069e-01 -1.55496213e-01  1.26146371e-01  9.47187250e-02
  -1.51734380e-01 -1.39895725e-01  3.60824000e-01  1.59700977e-01]
 [-4.86758373e-01  2.69431942e-01  2.83903706e-01  2.88289275e-01
  -7.67658977e-02 -3.13852535e-01  3.80370005e-03 -5.33409652e-02
  -2.38423391e-01  2.58851776e-02  2.76243736e-01 -6.21189647e-02
  -1.12881747e-01  1.35663290e-01 -6.63628311e-01  1.41213103e-01
  -1.71691637e-01 -4.25978905e-02  7.95732238e-01  3.35474227e-01]
 [ 6.69334762e-01  9.20967429e-02  2.15931886e-01 -1.04982583e-01
   8.93622742e-02  3.20478906e-01 -1.25073441e-02 -4.10038743e-01
   1.57474206e-01 -2.14819703e-01 -5.74065078e-01 -6.00851232e-01
   4.70738098e-01 -4.04504081e-03 -4.78967771e-03  9.49447484e-01
   1.06943738e-01 -6.14129056e-02 -1.92640348

In [21]:
# TODO: is the last turn really learned?

In [1]:
p1.greedy=0
players = (p1, axl.Alternator())
game = Match(players, turns=GAME_LEN, reset=False)
actions = game.play()
scores = game.scores()
scores

NameError: name 'p1' is not defined

In [118]:
p1.policy_net.print_parameters()

--0--
Printing linear layer
Max = 1.0761803247258823
FCL weights = [[-5.91230449e-01  1.30806603e-01  5.22843177e-01 -1.89551261e-01
  -2.84842719e-01 -8.52874821e-01 -1.78228349e-01 -3.50343045e-01
  -7.16789739e-01 -5.89564775e-01]
 [-8.92461569e-01 -4.52931417e-02  2.98452901e-02 -8.92556479e-01
   7.06759007e-01 -1.62308846e+00  3.70349520e-01  3.27778536e-01
  -9.88005025e-01 -2.44394329e-01]
 [-6.85830301e-01  7.21733484e-01  5.10629924e-01 -5.13788180e-01
   7.13238645e-01 -2.93536433e-01 -1.03972222e+00 -8.82711097e-01
  -2.59519939e-01  1.57993294e-01]
 [-5.70045554e-01  1.43849846e-01  3.77743142e-02 -8.95129636e-02
  -7.21751859e-02 -6.30331044e-01 -1.29424620e+00 -5.61499266e-01
  -3.04436022e-01  5.29803415e-01]
 [ 1.33239655e-01  6.32592031e-01  2.50251668e-01  4.78015532e-01
   1.06507917e+00 -1.39949867e+00 -1.19579222e-01 -7.03688206e-01
  -1.42120792e+00 -1.72995831e-01]
 [ 4.64225251e-05  4.45412905e-01  5.96225420e-01  3.12574636e-01
   3.13986240e-01 -1.66539622e-0