In [None]:
# Actor Critic A2C?

In [None]:
# TODO: remember to test whether sharing the first layers helps improve performance

In [1]:
%load_ext autoreload
%autoreload 2

import gc
import copy
import random
import numpy as np
import axelrod as axl
from time import time
from pprint import pprint
from itertools import permutations
from collections import namedtuple, deque

import network
from axl_utils.nnplayer import State
from axl_utils.game import set_match, set_play

In [2]:
# Short aliases for the two axelrod actions.
C, D = axl.Action.C, axl.Action.D

# --- game rules -------------------------------------------------------------
# NOTE(review): the length is spelled 20 + 1; the recorded output of a game
# lists 20 score pairs, so the extra turn is presumably consumed by the
# helpers in axl_utils -- confirm there.
GAME_LEN = 20 + 1
GAME = axl.Game(r=3, s=0, t=5, p=1)

# Bind the rules into a match factory, then wrap that factory in `play`.
Match = set_match(game=GAME, turns=GAME_LEN)
play = set_play(Match)

# Smoke test: one game between two stock axelrod strategies.
game = play(axl.Prober4(), axl.TitForTat())

[(3, 3), (3, 3), (5, 0), (0, 5), (5, 0), (1, 1), (1, 1), (0, 5), (3, 3), (5, 0), (0, 5), (5, 0), (0, 5), (3, 3), (5, 0), (0, 5), (5, 0), (1, 1), (0, 5), (5, 0)]
Player 1 score = 50
Player 2 score = 45


In [3]:
class NNplayer(axl.Player):
    """Axelrod player whose moves are chosen by a wrapped learning agent.

    Parameters
    ----------
    network : object
        The decision maker. This class calls ``query(state_values)`` (must
        return an index into ``decision``), ``reset_state()``, ``learn(...)``,
        ``plot(...)`` and ``test_mode(bool)`` on it, and appends each turn's
        reward to its ``rewards`` list.
    state : object
        Game-state tracker. This class calls ``reset()``, ``values()`` and
        ``push(play, coplay)`` on it.
    gamma : float
        Discount factor; only read by ``on_policy_train``.
    mode : str
        ``"dense"`` is stored as 1, anything else as 0. Only the
        commented-out reward shaping in ``update_state`` reads it.
    N : int
        Stored as ``self.N``; only the commented-out sparse-reward code in
        ``update_state`` reads it.

    The instance is also a context manager: inside ``with player:`` the
    wrapped network is switched to test mode and switched back on exit.
    """

    name = 'NNplayer'
    # NOTE(review): 'stochastic' is False although an agent such as A2C
    # samples its actions; confirm whether axelrod relies on this flag.
    classifier = {
        'memory_depth': -1,
        'stochastic': False,
        'inspects_source': False,
        'manipulates_source': False,
        'manipulates_state': False
    }

    # network output index -> axelrod action
    decision = (axl.Action.C, axl.Action.D)

    def __init__(self, network, state, gamma=0.999, mode="dense", N=-1):
        super().__init__()

        self.network = network
        self.state   = state

        self.gamma   = gamma

        self.mode = 1 if mode=="dense" else 0
        # Honour the caller's N (it used to be overwritten with -1).
        self.N = N
        self.reset()

    def reset(self):
        """Clear the game state, the reward accumulator and the network's episode buffers."""
        self.state.reset()
        self.reward = 0
        self.network.reset_state()

    def strategy(self, opponent):
        """Query the network to make decision"""
        idx = self.network.query(self.state.values())
        return self.decision[idx]

    # overwrite update_history to update self state
    # this function is automatically called by axelrod library
    def update_history(self, *args):
        """Record the turn in the axelrod history, then update our own state."""
        self.history.append(*args)
        self.update_state(*args)

    def update_state(self, play, coplay):
        """Advance the game state and append this turn's reward to ``network.rewards``."""
        s  = self.state.values()
        s_ = self.state.push(play, coplay)

        # reward: our own score for this single turn, as computed by axelrod
        # NOTE(review): compute_scores is called without a game argument, so it
        # presumably uses axelrod's default payoffs rather than GAME -- confirm.
        r  = axl.interaction_utils.compute_scores([(play, coplay)])[0][0]

        self.network.rewards.append(r)
#         # dense reward
#         if self.mode:
#             r  = r if s[0,0,1]==-1 else np.NaN  # set last turn reward to NaN
#             self.memory.push(s, play, s_, r)

#         # sparse reward
#         else:
#             if s[0,0,1]==self.N:
#                 self.memory.push(s, play, s_, 0)
#                 self.reward += r
#             else:
#                 self.memory.push(s, play, s_, r+self.reward)
#                 self.reward = 0

    def on_policy_train(self, epoch, param):
        """Train for ``epoch`` passes over the replay memory, then sync the target network.

        NOTE(review): this method reads ``self.memory`` and the ``Transition``
        namedtuple, neither of which is defined in the visible notebook, and
        calls ``network.learn`` / ``network.update_target`` with a signature
        that the A2C class does not offer. It looks like a leftover from a
        replay-memory based agent -- confirm before using it.
        """
        param['t'] = 1
        length = len(self.memory)
        for _ in range(epoch):
            # organize data
            ts = Transition(*zip(*self.memory.sample(length)))
            ss  = np.vstack(ts.state)
            ss_ = np.vstack(ts.next_state)
            # one-hot encode the chosen action: [C, D]
            ats = np.array([[True, False] if a==axl.Action.C else [False, True] for a in ts.action])
            rs  = np.array(ts.reward, ndmin=2).T

            # pass to network
            self.network.learn((ss, ss_, ats, rs), param, self.gamma)

        self.network.update_target()
        self.loss = self.network.loss

    def off_policy_train(self, param):
        """Run one learning step on the wrapped network using ``param``."""
        self.network.learn(param)

    def plot(self, **kwargs):
        """Forward to the wrapped network's ``plot``."""
        self.network.plot(**kwargs)

    # test mode using "with" statement
    def __enter__(self, *args):
        self.network.test_mode(True)
        # return the player so that `with player as p:` binds it
        return self

    def __exit__(self, *args):
        self.network.test_mode(False)

In [71]:
class A2C():
    """Actor-critic agent built from two callable networks.

    Parameters
    ----------
    actor : callable
        ``actor(state)[0]`` must yield the action probabilities. It must also
        provide ``set_optimizer(param)``, ``set_loss_func(name)`` and
        ``backprop(loss, param)``.
    critic : callable
        ``critic(state)`` must yield the state value. It must provide the same
        three methods plus ``loss_fn(target, prediction)``, whose first
        element is used as the loss.
    gamma : float
        Discount factor used by ``learn`` when accumulating returns.

    During an episode ``query`` appends ``(action, log_prob, value)`` to
    ``self.actions`` (outside test mode only), and the environment side is
    expected to append one reward per step to ``self.rewards``. ``learn`` then
    consumes both lists; ``reset_state`` empties them.
    """

    def __init__(self, actor, critic, gamma=0.9):

        self.actor  = actor
        self.critic = critic
        self.gamma = gamma
        self.test = False

        self.reset_state()

    def reset_state(self):
        """Forget everything recorded for the current episode."""
        self.actions = []  # [(action index, log_prob(chosen action), state_value)]
        self.rewards = []  # [reward from env]

    def forward(self, state):
        """Return ``(action probabilities, state value)`` for ``state``."""

        # probability of actions :: 1x[n actions] array
        probs = self.actor(state)[0]

        # state value :: 1x1 array
        value = self.critic(state)

        return probs, value

    def __call__(self, *args):
        return self.forward(*args)

    def query(self, state):
        """Choose an action index for ``state``.

        In test mode the probabilities are printed and the most likely action
        is returned; nothing is recorded. Otherwise an action is sampled from
        the probabilities and ``(action, log_prob, value)`` is recorded.
        """

        probs, value = self.forward(state)

        if self.test:
            print(probs)
            return probs.argmax()

        # sample action by inverting the cumulative distribution
        cum_probs = np.cumsum(probs)
        above = cum_probs > np.random.uniform()
        if above.any():
            action = above.argmax()  # Int index of action
        else:
            # Rounding can leave cum_probs[-1] marginally below 1, so the draw
            # may exceed every entry. argmax of an all-False mask would then
            # silently pick action 0; the draw belongs to the last action.
            action = len(cum_probs) - 1

        # save
        self.actions.append((action, np.log(probs[action]),value))  # (Int, 1x1 array, 1x1 array)

        return action

    def learn(self, param):
        """Update actor and critic from the recorded episode.

        ``param`` is handed unchanged to the networks' ``set_optimizer`` and
        ``backprop``. If no action or no reward has been recorded, the
        networks are configured but no backprop step is taken.
        """

        self.actor.set_optimizer(param)
        self.critic.set_optimizer(param)
        self.actor.set_loss_func('mse')
        self.critic.set_loss_func('mse')

        # nothing recorded (e.g. only test-mode games so far): nothing to learn
        if not self.rewards or not self.actions:
            return

        # cumulative discounted reward a.k.a "true" value,
        # accumulated back-to-front and reversed once at the end
        returns = []
        cum_r = 0
        for R in reversed(self.rewards):
            cum_r = R + self.gamma * cum_r
            returns.append(cum_r)
        returns = np.array(returns[::-1], dtype=float)

        # standardize for better convergence; the epsilon keeps the division
        # finite when every return is identical (std == 0)
        returns = (returns - returns.mean()) / (returns.std() + np.finfo(float).eps)

        # calculate losses
        # NOTE(review): the advantage mixes standardized returns with the raw
        # critic output -- confirm the two are meant to be on the same scale.
        policy_losses = []
        value_losses = []
        for (action, log_prob, value), R in zip(self.actions, returns):

            advantage = R - value
            policy = np.array([1, 0]) if action==0 else np.array([0,1])  # HARDCODED FOR NOW

            # record losses
            # NOTE(review): DOUBLE CHECK THIS -- the usual policy-gradient loss
            # is -log_prob * advantage; whether the sign is right here depends
            # on what actor.backprop expects. Also, since the sum below has no
            # axis, the one-hot factor does not change the summed scalar.
            policy_losses.append(log_prob * advantage * policy)
            value_losses.append(self.critic.loss_fn(R, value)[0])

        # sum all losses then feedback to networks
        policy_loss = np.array(np.sum(policy_losses))
        value_loss  = np.array(np.sum(value_losses))
        print(policy_loss, value_loss)
        self.actor.backprop(policy_loss, param)
        self.critic.backprop(value_loss, param)

    def test_mode(self, on):
        """Switch greedy/printing test mode on (truthy ``on``) or off."""
        if on:
            self.test = True
        else:
            self.test = False

In [72]:
# Discount factor. A2C.learn is what actually discounts the rewards, so it is
# passed to the agent explicitly instead of relying on A2C's default happening
# to equal the value given to NNplayer.
GAMMA = 0.9

# First dense layer. The SAME object is placed in both networks below, so the
# actor and the critic share it.
layer1 = network.Linear_layer(GAME_LEN*2, 100)

# Policy network: ends in a softmax over the 2 actions.
actor = network.NeuralNetwork([
                    network.Flatten_layer(),
                    layer1,
                    network.Activation_layer('ReLU'),
                    network.Linear_layer(100, 40),
                    network.Activation_layer('ReLU'),
                    network.Linear_layer(40, 2),
                    network.Activation_layer('Softmax')
                    ])
# Value network: ends in a single linear output.
critic = network.NeuralNetwork([
                    network.Flatten_layer(),
                    layer1,
                    network.Activation_layer('ReLU'),
                    network.Linear_layer(100, 200),
                    network.Activation_layer('ReLU'),
                    network.Linear_layer(200, 1),
                    ])

p2 = NNplayer(A2C(actor, critic, gamma=GAMMA), State(GAME_LEN), gamma=GAMMA)

# Optimizer settings handed to the networks on every learning step.
param = {"lr": 3e-3, 'batch': 16, "momentum": 0.9, "mode": "train", "eps": 1e-16, "beta":(0.9, 0.999), 
         "epoch": 0, 'optimizer': 'adam', 't': 1, 'clip': 1.0, 'decay': 0.0}

# Drop the notebook-level names only; the networks stay alive through p2.network.
del actor, critic
gc.collect()

12

In [73]:
# Play one game in test mode: inside the `with` block the agent prints the
# action probabilities on every turn and picks the most likely action
# instead of sampling.
with p2:
    play(p2, axl.TitForTat())

[0.8344772 0.1655228]
[0.5544495 0.4455505]
[0.48705723 0.51294277]
[0.68575345 0.31424655]
[0.63331363 0.36668637]
[0.6344221 0.3655779]
[0.91774032 0.08225968]
[0.95531407 0.04468593]
[0.9805263 0.0194737]
[0.9299767 0.0700233]
[0.82775507 0.17224493]
[0.8015561 0.1984439]
[0.89944398 0.10055602]
[0.94244982 0.05755018]
[0.84641468 0.15358532]
[0.97470409 0.02529591]
[0.90152824 0.09847176]
[0.90965053 0.09034947]
[0.87449496 0.12550504]
[0.95341573 0.04658427]
[0.97940664 0.02059336]
[(3, 3), (3, 3), (5, 0), (0, 5), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3)]
Player 1 score = 59
Player 2 score = 59


In [74]:
# run 1000 training episodes, each one game against Tit-for-Tat
for i_episode in range(1000):

    play(p2, axl.TitForTat())

    # perform backprop on the episode just played (delegates to network.learn)
    p2.off_policy_train(param)

[(3, 3), (5, 0), (1, 1), (0, 5), (3, 3), (3, 3), (3, 3), (5, 0), (0, 5), (3, 3), (5, 0), (1, 1), (0, 5), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3)]
Player 1 score = 53
Player 2 score = 53
6.747923726738426 13.51691878217234
[(3, 3), (5, 0), (1, 1), (0, 5), (3, 3), (5, 0), (0, 5), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (5, 0), (1, 1), (1, 1)]
Player 1 score = 54
Player 2 score = 49
12.63662399508062 7.53630932721261
[(3, 3), (5, 0), (1, 1), (0, 5), (3, 3), (5, 0), (0, 5), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (5, 0), (0, 5), (5, 0)]
Player 1 score = 57
Player 2 score = 52
3.206259381597075 -0.866318700776024
[(3, 3), (5, 0), (0, 5), (3, 3), (5, 0), (0, 5), (3, 3), (3, 3), (3, 3), (3, 3), (5, 0), (0, 5), (3, 3), (5, 0), (0, 5), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3)]
Player 1 score = 56
Player 2 score = 56
-0.3928272743493975 -1.9389853410789393
[(5, 0), (1, 1), (1, 1), (0, 5), (3, 3), (3, 3),

In [75]:
# Play one more game in test mode (greedy action, probabilities printed each
# turn) to inspect the policy after the training loop.
with p2:
    play(p2, axl.TitForTat())

[0.99679989 0.00320011]
[0.97196063 0.02803937]
[0.98731041 0.01268959]
[0.99203572 0.00796428]
[0.98384696 0.01615304]
[0.9667151 0.0332849]
[0.99251998 0.00748002]
[0.99572982 0.00427018]
[0.99693811 0.00306189]
[0.98232043 0.01767957]
[0.95140288 0.04859712]
[0.93834758 0.06165242]
[0.93403716 0.06596284]
[0.91919962 0.08080038]
[0.91743873 0.08256127]
[0.80846431 0.19153569]
[0.81173434 0.18826566]
[0.85124434 0.14875566]
[0.9421524 0.0578476]
[0.93707951 0.06292049]
[0.88830328 0.11169672]
[(3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3)]
Player 1 score = 60
Player 2 score = 60
