In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
import network

from network.dqn import DQN

import gc
import copy
import random

import numpy as np
import axelrod as axl
from pprint import pprint
from itertools import permutations
from collections import namedtuple, deque

from network.replay import ReplayMemory

In [213]:
Config = {}
GAME_LEN = 20 + 1
C = axl.Action.C
D = axl.Action.D
GAME = axl.Game(r=4, s=0, t=5, p=1)

def Match(players, turns=GAME_LEN, reset=True):
    return axl.Match(players, turns=turns, reset=reset, game=GAME)

In [214]:
players = (axl.Alternator(), axl.Random())
game = Match(players)
game.play()

[(C, D),
 (D, D),
 (C, C),
 (D, D),
 (C, D),
 (D, C),
 (C, D),
 (D, C),
 (C, D),
 (D, C),
 (C, C),
 (D, D),
 (C, D),
 (D, D),
 (C, D),
 (D, C),
 (C, D),
 (D, C),
 (C, D),
 (D, C),
 (C, D)]

In [218]:
# possible to change the way this class behaves to redefine input structure
class State():
    def __init__(self, depth):
        self.depth = depth
        self.reset()
        
    def reset(self):
        self.state = [deque([-1 for _ in range(self.depth)], maxlen=self.depth) for _ in range(2)]
    
    def __repr__(self):
        return str(s.state).replace("),", "),\n")
    
    def values(self):
        return np.array(self.state, ndmin=3)
    
    def push(self, *args):
        play, coplay = map(self.encode, args)
        self.state[0].append(play)
        self.state[1].append(coplay)
        return self.values()
    
    @staticmethod
    def encode(play):
        if play == axl.Action.C:
            return 1
        else:
            return 0
    
    
s = State(GAME_LEN)
print(s.values())

s.push(C, D)
print(s)

[[[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
  [-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]]]
[deque([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1], maxlen=21),
 deque([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0], maxlen=21)]


In [219]:
def extract(game, memory, depth=GAME_LEN):
    """
    extract transitions,
    game = axl.Match object, with a finished game,
    memory = ReplayMemory
    """
    actions = game.result
    rewards = game.scores()
    state = State(depth)
    
    s = state.values()
    iterator = iter(zip(actions, rewards))
    while True:
        a_, r_ = next(iterator)
        s_ = state.push(*a_)

        memory.push(s, a_[0], s_, r_[0])
        s, a, r = (s_, a_, r_)

        # hardcoding the last state
        if s[0,0,1] != -1:
            a_, r_ = next(iterator)
            s_ = state.push(*a_)

            memory.push(s, a_[0], s_, np.NaN)

            break

In [220]:
memory = ReplayMemory(1000)
print(len(memory))
extract(game, memory, GAME_LEN)
print(len(memory))

0
21


In [222]:
memory.sample(1)

[Transition(state=array([[[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
           1,  0,  1,  0,  1],
         [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
           0,  0,  1,  0,  0]]]), action=D, next_state=array([[[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  1,
           0,  1,  0,  1,  0],
         [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0,
           0,  1,  0,  0,  1]]]), reward=5)]

In [223]:
def collect_exp(players, memory):
    old = len(memory)
    for pair in players:
        game = Match(pair, turns=GAME_LEN)
        game.play()
        extract(game, memory, GAME_LEN)
    new = len(memory)
    print(f"Collected {new-old} experience.")

players = permutations([axl.TitForTat(), axl.TitForTat(), axl.Random(), axl.Alternator()], 2)
collect_exp(players, memory)

Collected 252 experience.


In [224]:
Transition = namedtuple('Transition', 
                        ('state', 'action', 'next_state', 'reward'))

class NNplayer(axl.Player):
    """
    """
    
    # These are various properties for the strategy
    name = 'NNplayer'
    classifier = {
        'memory_depth': 1,
        'stochastic': False,
        'inspects_source': False,
        'manipulates_source': False,
        'manipulates_state': False
    }
    
    decision = (axl.Action.C, axl.Action.D)
    
    def __init__(self, network, memory, greedy=0.2, gamma=0.999):
        super().__init__()
        
        self.network = network
        self.memory  = memory
        self.state   = State(GAME_LEN)
        
        self.greedy  = greedy
        self.gamma   = gamma
        
        self.verbosity = False
        
    def reset(self):
        self.state.reset()
        
    def strategy(self, opponent):
        """Make decision"""
        
        # make random choice to explore
        if random.random() < self.greedy:
            return random.choice(self.decision)
        
        # or query the network to exploit
        else:
            d = self.network.query(self.state.values())
            if self.verbosity:
                print(d)
            return self.decision[np.argmax(d)]
    
    # overwrite update_history to update self state
    def update_history(self, *args):
        self.history.append(*args)
        self.update_state(*args)
        
    def update_state(self, play, coplay):
        """update current game state & record transition into replay memory"""
        s  = self.state.values()
        s_ = self.state.push(play, coplay)
        
        # hardcoding reward as usual
        r  = axl.interaction_utils.compute_scores([(play, coplay)])[0][0] if s[0,0,1]==-1 else np.NaN
        self.memory.push(s, play, s_, r)
        
    def train(self, epoch, param):
        
        length = len(self.memory)
        for _ in range(epoch):
            # organize data
            ts = Transition(*zip(*self.memory.sample(length)))
            ss  = np.vstack(ts.state)
            ss_ = np.vstack(ts.next_state)
            ats = np.array([[True, False] if a==C else [False, True] for a in ts.action])
            rs  = np.array(ts.reward, ndmin=2).T
            
            # pass to network
            self.network.learn((ss, ss_, ats, rs), param, self.gamma)
        self.network.update_target()
        
    # test mode using "with" statement
    def __enter__(self, *args):
        self.verbosity = True
        self.temp = self.greedy
        self.greedy = 0.0
        return self
    
    def __exit__(self, *args):
        self.verbosity = False
        self.greedy = self.temp
        return self

In [225]:
dqn = DQN([
#                     network.Flatten_layer(),
#                     network.Maxout_layer(GAME_LEN*2, 100),
#                     network.BatchNorm_layer(100),
#                     network.Maxout_layer(100, 60),
#                     network.Maxout_layer(60, 40),
#                     network.BatchNorm_layer(40),
#                     network.Maxout_layer(40, 20),
#                     network.Maxout_layer(20, 2),
    
                    network.Flatten_layer(),
                    network.Linear_layer(GAME_LEN*2, 100, bias=0.01),
                    network.Activation_layer('ReLU'),
                    network.Linear_layer(100, 200),
                    #network.BatchNorm_layer(200),
                    network.Activation_layer('ReLU'),
                    network.Linear_layer(200, 40),
                    network.Activation_layer('ReLU'),
                    network.Linear_layer(40, 2),
                    ])
p1 = NNplayer(dqn, ReplayMemory(2000), gamma=0.9)
del dqn
gc.collect()

20

In [226]:
param = {"lr": 5e-6, 'batch': 8, "momentum": 0.9, "mode": "train", "eps": 1e-9, "beta":(0.9, 0.999), 
         "epoch": 0, 'optimizer': 'adam', 't': 1, 'clip': 1.0, 'decay': 0.0}

In [227]:
with p1 as p2:
    players = (p1, axl.TitForTat())
    game = Match(players)
    game.play()
    
# initial Q_values:

[[-2.15839544 -0.63134039]]
[[-2.20140684 -0.61281268]]
[[-1.95826912 -0.54869973]]
[[-1.73254646 -0.42215007]]
[[-1.77692714 -0.13169554]]
[[-1.7300834  -0.40677383]]
[[-1.01277347 -0.19425612]]
[[-1.87723868 -0.96056988]]
[[-1.72453557 -0.62143577]]
[[-1.4315013  -0.56012413]]
[[-1.40251719 -0.67922301]]
[[-1.25287719 -0.78480538]]
[[-0.98834927 -0.84048339]]
[[-0.92582841 -0.59044783]]
[[-1.16066467 -0.62300721]]
[[-1.14478918 -0.75706331]]
[[-1.29498066 -1.04698608]]
[[-0.91053581 -0.86901141]]
[[-0.7721985   0.17022472]]
[[-0.01150818  0.01266829]]
[[-0.84896151 -0.22016632]]


In [228]:
p1.greedy=0.4
for i in range(60):
    players = (p1, axl.TitForTat())
    game = Match(players)
    game.play()

In [229]:
for _ in range(100):
    p1.train(80, param)
    print(p1.network.loss)

0.7653187889862075
0.3268771638668332
0.2082846102993478
0.17437418301697774
0.13374101481635842
0.11184518217963321
0.08340376431184916
0.07303728846324087
0.054813628574762636
0.048513694357636264


KeyboardInterrupt: 

In [230]:
with p1 as p2:
    players = (p1, axl.TitForTat())
    game = Match(players)
    actions = game.play()
    scores = game.scores()
    print(scores[:-1])  # the last turn is not really learnt so I dropped it

[[20.09014751 19.28965886]]
[[20.23459352 19.46024863]]
[[20.2706671  19.95350587]]
[[20.5579992 20.1707804]]
[[20.5406272  20.32908767]]
[[20.62109221 20.15570134]]
[[21.10817875 20.52151663]]
[[21.13247951 20.63932907]]
[[21.45053748 20.06086086]]
[[20.0347515 18.8306824]]
[[18.92366368 18.46645275]]
[[17.2196847  16.12150315]]
[[15.80411222 15.24189632]]
[[14.50016065 13.63532737]]
[[11.9805107 11.5934096]]
[[11.0990306 10.2942951]]
[[7.68895671 7.16165604]]
[[6.4290081  6.05348552]]
[[3.90968133 3.35911109]]
[[1.32830735 2.42515624]]
[[-0.83179959 -0.33905694]]
[(4, 4), (4, 4), (4, 4), (4, 4), (4, 4), (4, 4), (4, 4), (4, 4), (4, 4), (4, 4), (4, 4), (4, 4), (4, 4), (4, 4), (4, 4), (4, 4), (4, 4), (4, 4), (4, 4), (5, 0)]


In [89]:
p1.network.policy_net.print_parameters()

--0--
Printing flatten layer:
{'shape': (1, 2, 20), 'type': 'flatten'}
--1--
Printing linear layer:
{'bias': 0.01,
 'input': array([[0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1.]]),
 'input_nodes': 40,
 'm1': array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.00397783,  0.00508548,  0.00669037, ...,  0.00025475,
         0.00098729,  0.00273878],
       [ 0.00627632, -0.00503945, -0.00333259, ...,  0.00136106,
        -0.00206947, -0.00177802],
       ...,
       [ 0.01836234,  0.0248466 , -0.01717732, ...,  0.00129491,
         0.00098057, -0.00137838],
       [-0.01894563, -0.00393382,  0.02109498, ..., -0.00029959,
        -0.0021889 ,  0.01191326],
       [ 0.02926639, -0.05856798, -0.05899248, ...,  0.01081515,
        -0.01749852, -0.02630802]]),
 'm2': array([[0.00000000e+00, 0.00000000e+

In [58]:
p1.target_net.print_parameters()

--0--
Printing linear layer:
{'bias': 0,
 'input': array([[-1, -1, -1, -1, -1, -1, -1, -1,  3,  3,  3,  3,  3,  3,  3,  3,
         3,  3,  3,  3],
       [-1, -1,  0,  1,  3,  1,  0,  1,  0,  5,  0,  5,  3,  1,  0,  5,
         0,  5,  0,  1],
       [-1, -1, -1, -1, -1, -1,  3,  5,  0,  5,  0,  5,  0,  5,  0,  5,
         0,  5,  0,  5],
       [-1, -1, -1, -1, -1, -1, -1, -1,  0,  1,  1,  1,  1,  1,  5,  3,
         3,  0,  5,  0],
       [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0,
        50, 30, 50, 30],
       [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1,  3],
       [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1,  3,  3,  3],
       [-1, -1, -1, -1, -1, -1,  0,  1,  3,  1,  0,  1,  0,  5,  0,  5,
         3,  1,  0,  5],
       [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  3,  3,  3,
         3,  3,  3,  3],
       [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    

In [59]:
# TODO: is the last turn really learned?

In [90]:
p1.greedy=0
players = (p1, axl.Alternator())
game = Match(players, turns=GAME_LEN, reset=False)
actions = game.play()
scores = game.scores()
scores

[[68.92941711 59.48685255]]
[[68.92941711 59.48685255]]
[[65.89070999 61.13863025]]
[[65.31874218 57.28523968]]
[[69.77882117 65.38549201]]
[[62.79936975 56.27064137]]
[[64.11538669 62.25264337]]
[[64.66145896 58.80339487]]
[[61.20871222 58.73393978]]
[[63.5015154  59.00924082]]
[[54.62994074 51.31145423]]
[[63.04157207 57.50988618]]
[[60.07373227 55.86161692]]
[[79.04826681 72.43658128]]
[[61.65190939 57.09217373]]
[[70.6464599  63.12759169]]
[[55.56285422 53.55205595]]
[[68.08562421 60.1045108 ]]
[[69.55669565 64.45191093]]
[[75.23313356 66.69023319]]


[(4, 4),
 (0, 5),
 (4, 4),
 (0, 5),
 (4, 4),
 (0, 5),
 (4, 4),
 (0, 5),
 (4, 4),
 (0, 5),
 (4, 4),
 (0, 5),
 (4, 4),
 (0, 5),
 (4, 4),
 (0, 5),
 (4, 4),
 (0, 5),
 (4, 4),
 (0, 5)]

In [33]:


Transition = namedtuple('Transition', 
                        ('state', 'action', 'next_state', 'reward'))

class DQN():
    
    def __init__(self, layers):
        
        # define networks
        self.policy_net = NeuralNetwork(layers)
        self.target_net = deepcopy(self.policy_net)
        self.loss = None
    
    def query(self, state):
        """make decision from given state"""
        #return self.policy_net(state, mode='classification')
        d = self.policy_net(state, mode='rgr')
        print(d)
        return np.argmax(d, axis=1)
    
    def update_target(self):
        self.target_net = deepcopy(self.policy_net)
        
        
        
    def learn(self, memory, param, gamma):
        
        
        length = len(memory)
        batch_size = param['batch']
        sections = length // batch_size
        
        param['epoch'] += 1
        param['mode'] = 'train'
        self.policy_net.set_loss_func('mse')
        
        # get training data
        ts = Transition(*zip(*memory.sample(length)))
        ss  = np.vstack(ts.state)
        ss_ = np.vstack(ts.next_state)
        ats = np.array([[True, False] if a==C else [False, True] for a in ts.action])
        rs  = np.array(ts.reward, ndmin=2).T
        
        # split into batches
        ss, ss_, ats, rs = map(lambda x: np.array_split(x, sections), (ss, ss_, ats, rs))
        
        # train
        for s, s_, at, r in zip(ss, ss_, ats, rs):
            
            # value of current state
            Q_values = self.policy_net(s, param=param, mode='rg') * at
            
            # value of next state
            Q_values_ = np.max(self.target_net(s_, mode='rg'), axis=1, keepdims=True)
            
            # expected value of current state
            E_values = gamma*Q_values_ + r
            
            # feedback
            loss, _ = self.policy_net.loss_fn(E_values, Q_values)
            loss = loss * at  # relocate loss to action taken
            self.policy_net.backprop(loss, param)
            
            # track training loss
            if not self.loss:
                self.loss = np.mean(np.max(np.abs(loss),axis=1))
            else:
                self.loss = 0.9*self.loss + 0.1*np.mean(np.max(np.abs(loss),axis=1))