In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from libs.ANN import *
import gc
import copy
import random
import pickle
import numpy as np
import axelrod as axl
from itertools import permutations
from collections import namedtuple, deque

Cupy: Training on GPU.


In [3]:
# NOTE(review): Config is not read anywhere in the visible notebook -- confirm it is still needed.
Config = {}
GAME_LEN = 20  # turns per match; also used below as the state-window size and the network input width
# Short aliases for the two axelrod actions.
C = axl.Action.C
D = axl.Action.D

In [4]:
def Match(players, turns=GAME_LEN, reset=False):
    """Build an ``axl.Match`` for ``players`` with notebook-wide defaults.

    All three arguments are forwarded unchanged to ``axl.Match``.  The default
    for ``turns`` is bound to the value of ``GAME_LEN`` at the moment this cell
    runs, so re-run the cell after changing ``GAME_LEN``.
    """
    return axl.Match(players, turns=turns, reset=reset)

In [5]:
# Demo: play one match between two built-in strategies and keep its per-turn
# actions and scores; they are fed to extract_transitions in a later cell.
players = (axl.Alternator(), axl.Random())
game = Match(players, turns=GAME_LEN)
actions = game.play()
scores = game.scores()

In [6]:
# One step of experience: the state before acting, the action taken,
# the state afterwards, and the reward received for that action.
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))


class ReplayMemory(object):
    """Fixed-capacity FIFO buffer of ``Transition`` tuples.

    Storage is a ``deque`` with ``maxlen=capacity``: once the buffer is full,
    every new transition silently evicts the oldest one.
    """

    def __init__(self, capacity):
        """Create an empty buffer holding at most ``capacity`` transitions."""
        self.memory = deque([], maxlen=capacity)

    def push(self, *args):
        """Store one transition; ``args`` are (state, action, next_state, reward)."""
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored transitions picked at random.

        Raises:
            ValueError: (from ``random.sample``) if fewer than ``batch_size``
                transitions are stored.
        """
        return random.sample(self.memory, batch_size)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

    def __repr__(self):
        """Show at most the first 100 transitions, one per line.

        The preview is always rendered as a plain list so the format does not
        depend on how many items are stored.
        """
        preview = list(self.memory)[:100]
        return str(preview).replace("), ", "),\n")

    def save(self, path):
        """Pickle this whole object (buffer and capacity) to ``path``."""
        with open(path, "wb") as file:
            pickle.dump(self, file)

    def load(self, path, mode='overwrite'):
        """Load transitions from a file previously written by ``save``.

        Arguments:
        -------
        (str) path: file to read
        (str) mode: 'overwrite' replaces this buffer (contents and capacity)
                    with the stored one; 'add' appends the stored transitions
                    to this buffer, subject to this buffer's own capacity

        Raises:
            ValueError: if ``mode`` is neither 'overwrite' nor 'add'.
        """
        if mode not in ('overwrite', 'add'):
            raise ValueError(f"Unknown mode {mode!r}; expected 'overwrite' or 'add'.")

        # SECURITY: pickle.load can execute arbitrary code -- only load files
        # that this notebook (or another trusted source) has written.
        with open(path, "rb") as file:
            loaded = pickle.load(file)

        if mode == 'overwrite':
            # Rebinding ``self`` would only change a local name; replace the
            # underlying storage so the instance really changes.
            self.memory = loaded.memory
        else:
            self.memory.extend(loaded.memory)


memory = ReplayMemory(2000)

In [7]:
def extract_states(history, size, memory, N=-1, mode='int'):
    """
    Append every sliding-window snapshot of ``history`` to ``memory``.

    The window starts out as ``size`` copies of the filler ``N``; that initial
    window is recorded first, then one more snapshot is recorded after each
    element of ``history`` has been shifted in, so ``len(history) + 1``
    snapshots are appended in total.

    Arguments:
    -------
    (iterable) history: values to slide over, oldest first
    (int) size: window length
    (list) memory: output container, must support append(); mutated in place
    N: filler used for window slots that have no value yet
    mode: currently unused
    """
    window = deque([N] * size, maxlen=size)
    memory.append(list(window))
    for observation in history:
        window.append(observation)
        # copy the window: the deque itself keeps changing
        memory.append(list(window))

In [8]:
def extract_transitions(actions, scores, size, memory):
    """
    Extract transitions from a game, and push them into a given replay memory,
    player should be in the 1st position of tuples.

    A state is the window of the player's own last ``size`` scores (slots with
    no score yet hold the filler used by extract_states).  For turn ``i`` the
    pushed transition is
    (window before turn i, action at turn i, window after turn i, score at turn i),
    so a game of ``n`` turns yields ``n`` transitions.  An empty game pushes nothing.

    Arguments:
    -------
    (list) actions: action history of the game e.g. [(C,C), (D,C), ...]
    (list) scores: score history of the game e.g. [(3,3), (5,0), ...]
    (int) size: desired player memory size (length of the score window)
    (ReplayMemory) memory: replay memory to save the transitions, must support a push(*args) method
    """
    assert len(actions) == len(scores), "Length not matching!"
    if len(actions) == 0:
        # nothing was played, so there is nothing to record
        return

    # keep only the learner's column (index 0 of every per-turn tuple)
    own_actions = [turn[0] for turn in actions]
    own_scores = [turn[0] for turn in scores]

    # states[i] is the window before turn i; states[i + 1] the window after it
    states = []
    extract_states(own_scores, size, states)

    # save transitions(state, action, next_state, reward) into replay memory
    for state, action, next_state, reward in zip(states, own_actions, states[1:], own_scores):
        memory.push(state, action, next_state, reward)

In [9]:
# Rebind `memory` to a fresh 1000-slot buffer and store the demo match's transitions in it.
memory = ReplayMemory(1000)
extract_transitions(actions, scores, GAME_LEN, memory)

In [10]:
memory.sample(10)

[Transition(state=[-1, -1, -1, -1, -1, 0, 1, 3, 5, 0, 5, 3, 5, 0, 1, 3, 5, 0, 5, 3], action=D, next_state=[-1, -1, -1, -1, 0, 1, 3, 5, 0, 5, 3, 5, 0, 1, 3, 5, 0, 5, 3, 5], reward=5),
 Transition(state=[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 3, 5, 0, 5, 3, 5, 0], action=D, next_state=[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 3, 5, 0, 5, 3, 5, 0, 1], reward=1),
 Transition(state=[-1, -1, 0, 1, 3, 5, 0, 5, 3, 5, 0, 1, 3, 5, 0, 5, 3, 5, 3, 5], action=C, next_state=[-1, 0, 1, 3, 5, 0, 5, 3, 5, 0, 1, 3, 5, 0, 5, 3, 5, 3, 5, 3], reward=3),
 Transition(state=[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1], action=C, next_state=[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 3], reward=3),
 Transition(state=[-1, 0, 1, 3, 5, 0, 5, 3, 5, 0, 1, 3, 5, 0, 5, 3, 5, 3, 5, 3], action=D, next_state=[0, 1, 3, 5, 0, 5, 3, 5, 0, 1, 3, 5, 0, 5, 3, 5, 3, 5, 3, 5], reward=5),
 Transition(state=[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0,

In [11]:
def collect_exp(players, memory):
    """
    Play one GAME_LEN-turn match per pairing and store the resulting transitions.

    Arguments:
    -------
    (iterable) players: pairs of axelrod players; the first player of each
                        pair is the one whose experience is recorded
    (ReplayMemory) memory: replay memory receiving the transitions

    Prints how many entries the memory gained.
    """
    size_before = len(memory)
    for pairing in players:
        match = axl.Match(pairing, turns=GAME_LEN)
        moves = match.play()
        payoffs = match.scores()
        extract_transitions(moves, payoffs, GAME_LEN, memory)
    gained = len(memory) - size_before
    print(f"Collected {gained} experience.")
# Every ordered pairing of the four instances, so each one also appears in the
# first seat, which is the seat extract_transitions records.
players = permutations([axl.TitForTat(), axl.TitForTat(), axl.Random(), axl.Alternator()], 2)
collect_exp(players, memory)

Collected 240 experience.


In [12]:
class NNplayer(axl.Player):
    """
    Axelrod player whose moves come from a Q-network with epsilon-greedy exploration.

    The network is fed the player's own last ``memory_depth`` per-turn scores
    (slots not filled yet hold -1); output index 0 is read as the value of
    cooperating and index 1 as the value of defecting.

    NOTE(review): this class never clears ``self.scores`` itself, so the score
    window carries over between matches unless the player is re-initialised
    elsewhere -- confirm this matches the states stored in the replay memory.
    """

    # These are various properties for the strategy
    name = 'NNplayer'
    # NOTE(review): 'memory_depth' and 'stochastic' below do not reflect the
    # constructor's ``memory_depth`` / ``greedy`` arguments -- confirm whether
    # anything downstream relies on these classifier values.
    classifier = {
        'memory_depth': 1,  # Four-Vector = (1.,0.,1.,0.)
        'stochastic': False,
        'inspects_source': False,
        'manipulates_source': False,
        'manipulates_state': False
    }

    def __init__(self, network, memory_depth, greedy=0.2, gamma=0.999, capacity=1000):
        """
        Arguments:
        -------
        network: callable model; it is deep-copied twice (policy and target
                 network), the object passed in is not kept
        (int) memory_depth: length of the score window fed to the network
        (float) greedy: probability of playing a uniformly random action
        (float) gamma: discount factor applied to the next-state value
        (int) capacity: size of the player's own replay memory
        """
        super().__init__()

        self.memory_depth = memory_depth
        self.decision = (axl.Action.C, axl.Action.D)  # index 0 -> C, index 1 -> D
        self.replay = ReplayMemory(capacity=capacity)
        self.scores = deque([-1 for _ in range(memory_depth)], maxlen=memory_depth)
        self.policy_net = copy.deepcopy(network)
        self.target_net = copy.deepcopy(network)

        self.loss = None  # running average of the training loss, None until the first learn step
        self.gamma = gamma
        self.greedy = greedy

    def strategy(self, opponent):
        """Make decision: random with probability ``greedy``, otherwise the arg-max of the Q-values."""
        if random.random() < self.greedy:
            return random.choice(self.decision)
        Q_values = self.policy_net(np.array(self.scores))
        return self.decision[int(np.argmax(Q_values))]

    # TODO: upgrade to "record"? which writes interaction into the replay memory
    def score(self, play, coplay):
        """Numerical representation of history, readable by neural networks"""
        # No game is passed to compute_scores here, so a custom game given to
        # a Match is not reflected in this window.
        new = axl.interaction_utils.compute_scores([(play, coplay)])[0][0]
        self.scores.append(new)

    # overwrite update_history to update self.score
    def update_history(self, play, coplay):
        """Record the turn in the history and shift its score into the window."""
        self.history.append(play, coplay)
        self.score(play, coplay)

    def update_network(self):
        """Overwrite the freezed target network with policy network"""
        self.target_net = copy.deepcopy(self.policy_net)

    def train(self, opponents):
        """Placeholder, not implemented yet."""
        pass

    def learn(self, epoch, param, batch_size=32, verbosity=0):
        """
        Run ``epoch`` Q-learning updates on batches drawn from the player's own replay memory.

        Arguments:
        -------
        (int) epoch: number of update steps to run
        (dict) param: settings forwarded to ``policy_net.backprop``; its
                      'epoch' entry is incremented once per step
        (int) batch_size: transitions sampled per step
        (int) verbosity: if truthy, print the masked Q-values and loss of each step

        NOTE(review): transitions carry no terminal flag, so the last turn of a
        match also bootstraps from ``target_net`` -- confirm this is intended.
        """
        assert len(self.replay) >= batch_size, "Not enough transitions in replay memory!"
        cooperate = self.decision[0]
        for _ in range(epoch):

            param['epoch'] += 1

            # get batch from the player's own replay memory, not the notebook-global one
            batch = Transition(*zip(*self.replay.sample(batch_size)))
            state_batch = np.array(batch.state)
            # one-hot mask of the action actually taken: column 0 = C, column 1 = D
            action_batch = np.array([[True, False] if a == cooperate else [False, True]
                                     for a in batch.action])
            next_batch = np.array(batch.next_state)
            reward_batch = np.array(batch.reward, ndmin=2).T

            # calculate q values
            # Q value = value of current state = value of most suitable action
            # (the entry of the action not taken is zeroed by the mask)
            Q_values = self.policy_net(state_batch) * action_batch

            # E(Q value of next state) = reward + value of most suitable action next state
            Q_values_ = np.max(self.target_net(next_batch), axis=1, keepdims=True)
            E_values = self.gamma*Q_values_ + reward_batch

            # feedback; mask again so only the taken action receives a signal
            loss, _ = self.policy_net.calculate_loss(E_values, Q_values, function='mse')
            loss = loss * action_batch
            if verbosity:
                print(Q_values)
                print(loss)

            # track training loss as an exponential moving average
            step_loss = np.mean(np.max(np.abs(loss), axis=1))
            if self.loss is None:
                self.loss = step_loss
            else:
                self.loss = 0.9*self.loss + 0.1*step_loss
            self.policy_net.backprop(loss, param)

    def push(self, memory):
        """
        Push iterable containing transition tuples into replay memory.

        ``memory`` may be any iterable of transitions (one-shot iterators
        included) or a non-iterable object exposing them as ``.memory``.
        Transitions whose state length differs from ``memory_depth`` are
        skipped and reported.
        """
        try:
            transitions = list(memory)
        except TypeError:
            # not iterable (e.g. a ReplayMemory): use its underlying container
            transitions = list(memory.memory)
        for m in transitions:
            if len(m.state) == self.memory_depth:
                self.replay.push(*m)
            else:
                print("Invalid memory found.\r", end='')

In [13]:
# Q-network: a stack of linear layers with ReLU in between, going from
# GAME_LEN values (the score window) down to 2 values (one per action).
# presumably Linear_layer takes (inputs, outputs) -- verify against libs.ANN
nn = NeuralNetwork([
                    Linear_layer(GAME_LEN, 200, bias=True),
                    Activation_layer(function='ReLU'),
    
                    Linear_layer(200, 400, bias=True),
                    Activation_layer(function='ReLU'),
                    
                    Linear_layer(400, 200, bias=True),
                    Activation_layer(function='ReLU'),
    
                    Linear_layer(200, 100, bias=True),
                    Activation_layer(function='ReLU'),
    
                    Linear_layer(100, 2, bias=True),
                    ])
# define output[0] to be Cooperation, output[1] to be Defection

# NNplayer deep-copies the network into its policy and target nets, so the
# original object can be released afterwards.
p1 = NNplayer(nn, GAME_LEN, capacity=10000)
del nn
gc.collect()

0

In [14]:
# TODO: print loss for C & D average

In [15]:
p1.push(memory)
len(p1.replay)

260

In [16]:
# Settings handed to NNplayer.learn, which forwards them to policy_net.backprop;
# learn() increments "epoch" in place on every update step.
# NOTE(review): the meaning of the remaining keys is defined in libs.ANN -- check there before tuning.
param = {"lr": 1e-6, 'batch': 4, "momentum": 0.9, "mode": "train", "eps": 1e-9, "beta":(0.9, 0.999), 
         "epoch": 0, 'method': 'adam', 't': 1, 'clip': 1.0, 'decay': 0.0}

In [17]:
# Training loop: play a match against TitForTat, store its transitions in the
# player's own replay memory, run 5 learning steps, and every 200 matches copy
# the policy network into the target network and print the running loss.
# NOTE(review): the recorded output of this cell is a TypeError (cupy ndarray
# expected, numpy ndarray received).  Possibly `import numpy as np` rebinds an
# `np` exported by `from libs.ANN import *` -- verify.
p1.greedy=0.2
for i in range(9999):
    players = (p1, axl.TitForTat())
    game = Match(players)
    actions = game.play()
    scores = game.scores()
    extract_transitions(actions, scores, GAME_LEN, p1.replay)
    p1.learn(5, param)
    if i % 200 == 0:
        p1.update_network()
        print(p1.loss)
p1.loss

TypeError: Argument 'a' has incorrect type (expected cupy._core.core.ndarray, got numpy.ndarray)

In [69]:
p1.learn(1, param, verbosity=1)

[[ 0.         23.4767086 ]
 [14.40801484  0.        ]
 [ 0.         16.57150439]
 [21.19420027  0.        ]
 [21.5669712   0.        ]
 [ 0.         22.88600695]
 [20.44823759  0.        ]
 [17.80735026  0.        ]
 [16.9279997   0.        ]
 [ 0.         12.10672274]
 [17.01465345  0.        ]
 [11.94413267  0.        ]
 [ 0.         25.00985019]
 [ 0.         18.03364684]
 [ 0.         21.22551763]
 [ 0.         12.79137567]
 [ 0.         11.72848785]
 [14.75389676  0.        ]
 [ 0.         12.27380474]
 [ 0.         12.10672274]
 [10.73668107  0.        ]
 [ 0.         21.0423499 ]
 [ 0.         12.61267528]
 [ 0.          9.62433421]
 [22.34253955  0.        ]
 [11.61323487  0.        ]
 [ 0.         10.80533972]
 [ 0.         16.878488  ]
 [11.85661041  0.        ]
 [12.28996184  0.        ]
 [ 0.         21.22551763]
 [ 0.         11.26917103]]
[[ -0.          -3.31185149]
 [ -1.21647855  -0.        ]
 [ -0.          -1.05834341]
 [ -1.31275318  -0.        ]
 [ -5.00295227  -0.

In [70]:
# Evaluation against TitForTat with exploration switched off (greedy=0.0 means
# strategy() never takes the random branch).
# NOTE(review): this match uses a custom axl.Game, while NNplayer.score calls
# compute_scores without a game argument -- the window fed to the network may
# then use different payoffs than the scores shown here; verify.
p1.greedy=0.0
players = (p1, axl.TitForTat())
game = axl.Match(players, turns=GAME_LEN, reset=False, game=axl.Game(r=4, s=0, t=5, p=1))
actions = game.play()
scores = game.scores()
scores

[(5, 0),
 (0, 5),
 (5, 0),
 (0, 5),
 (5, 0),
 (0, 5),
 (5, 0),
 (0, 5),
 (5, 0),
 (1, 1)]

In [34]:
p1.policy_net.print_parameters()

--0--
Printing linear layer
Max = 1.4055903381726378
FCL weights = [[ 1.80893608e-01  6.82322991e-02 -5.98150253e-01 ... -3.71833862e-01
   2.20432766e-01  3.27887453e-01]
 [ 1.63384834e-01 -4.29744093e-01 -1.00282097e-01 ...  9.12175102e-02
  -5.59584902e-01  7.35143780e-01]
 [ 1.01549210e+00 -8.47192202e-02  3.88212756e-01 ...  5.18049800e-01
  -4.80221003e-01  7.91120005e-02]
 ...
 [ 6.62276360e-02 -9.14697388e-01  5.62560933e-01 ... -2.77756175e-01
  -1.10324597e+00 -4.12563678e-01]
 [-1.21096320e-01 -4.69497035e-02 -7.28023899e-02 ...  9.41591040e-01
  -6.83053207e-01 -1.55190332e-01]
 [ 6.74694055e-03  1.06110329e-03  9.81709994e-03 ...  2.01315898e-02
   5.59505182e-03  3.95958527e-03]]
FCL momentum = (array([[-0.00176495, -0.02999401, -0.04241723, ...,  0.02420237,
        -0.00033245, -0.02198007],
       [ 0.01502445, -0.02999401, -0.04241723, ...,  0.00478297,
        -0.00033245, -0.02198007],
       [ 0.01417938, -0.02999401, -0.10282682, ..., -0.02068624,
        -0.0017 

In [155]:
p1.target_net.print_parameters()

--0--
Printing linear layer
Max = 1.3435114635779408
FCL weights = [[ 3.49278552e-02  8.27888759e-02  5.11822242e-01  1.15018269e-01
   4.42317248e-01 -4.34851924e-01 -2.35020202e-02  2.44945238e-01
   2.18949325e-01  1.14945714e-01 -9.99653795e-01 -5.35724814e-01
   1.27183517e-01  3.92521505e-01  2.75279490e-01 -4.00022299e-01
  -3.43932372e-01  2.79327804e-01 -1.61402491e-01 -3.63428838e-01
   2.47348078e-03  6.42733718e-01 -8.65095120e-01 -4.39959571e-01
   4.86571017e-01  1.44602290e-01 -1.00079045e-01  3.79473571e-01
   6.79754093e-01 -1.62143779e-01 -2.26536121e-01  8.15768731e-01
  -4.58184639e-02 -2.39929658e-01 -2.98778574e-01  1.31087420e-01
  -1.26767318e+00  3.20515752e-01 -1.69396831e-01 -6.24863903e-01
  -6.80475702e-01 -2.16198960e-01 -3.98922550e-01  5.75088462e-01
   1.07602525e-01  2.17154971e-01 -8.53844718e-01  6.82222070e-01
   1.17733727e-01 -1.27475073e-02]
 [-5.60279106e-01  1.39025536e-01  1.12869521e+00 -2.97743012e-02
   7.76324815e-01  8.70179813e-01  2.967

In [21]:
# TODO: is the last turn really learned?

In [35]:
# Evaluation against Alternator with exploration switched off; the last
# expression displays the per-turn (player, opponent) scores.
p1.greedy=0
players = (p1, axl.Alternator())
game = axl.Match(players, turns=GAME_LEN, reset=False)
actions = game.play()
scores = game.scores()
scores

[(3, 3),
 (1, 1),
 (5, 0),
 (0, 5),
 (5, 0),
 (1, 1),
 (5, 0),
 (1, 1),
 (5, 0),
 (1, 1)]

In [118]:
p1.policy_net.print_parameters()

--0--
Printing linear layer
Max = 1.0761803247258823
FCL weights = [[-5.91230449e-01  1.30806603e-01  5.22843177e-01 -1.89551261e-01
  -2.84842719e-01 -8.52874821e-01 -1.78228349e-01 -3.50343045e-01
  -7.16789739e-01 -5.89564775e-01]
 [-8.92461569e-01 -4.52931417e-02  2.98452901e-02 -8.92556479e-01
   7.06759007e-01 -1.62308846e+00  3.70349520e-01  3.27778536e-01
  -9.88005025e-01 -2.44394329e-01]
 [-6.85830301e-01  7.21733484e-01  5.10629924e-01 -5.13788180e-01
   7.13238645e-01 -2.93536433e-01 -1.03972222e+00 -8.82711097e-01
  -2.59519939e-01  1.57993294e-01]
 [-5.70045554e-01  1.43849846e-01  3.77743142e-02 -8.95129636e-02
  -7.21751859e-02 -6.30331044e-01 -1.29424620e+00 -5.61499266e-01
  -3.04436022e-01  5.29803415e-01]
 [ 1.33239655e-01  6.32592031e-01  2.50251668e-01  4.78015532e-01
   1.06507917e+00 -1.39949867e+00 -1.19579222e-01 -7.03688206e-01
  -1.42120792e+00 -1.72995831e-01]
 [ 4.64225251e-05  4.45412905e-01  5.96225420e-01  3.12574636e-01
   3.13986240e-01 -1.66539622e-0