In [None]:
# An attempt at Advantage Actor-Critic (A2C).
# Not part of the project, as I haven't figured out the correct backpropagation yet.

# code modified from
# https://github.com/pytorch/examples/blob/master/reinforcement_learning/actor_critic.py

In [None]:
# TODO
# - implement it
# - test whether sharing the first layers helps improve performance

In [1]:
import gc
import random
import numpy as np
import pandas as pd
import axelrod as axl
from time import time
from pprint import pprint
import matplotlib.pyplot as plt
from itertools import permutations
from collections import namedtuple, deque

np.set_printoptions(precision=3)
pd.options.display.float_format = "{:,.2f}".format

import network
from axl_utils import NNplayer, State, set_match, set_play

In [2]:
C = axl.Action.C
D = axl.Action.D

# Seed the global RNGs so that a Restart & Run All reproduces the same run.
# The agent samples its actions through np.random, so without a seed every
# execution of the notebook yields different games and losses.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# config game rules
# NOTE(review): the sample output below shows 20 played turns for GAME_LEN = 21;
# presumably set_match reserves the extra slot -- confirm in axl_utils.
GAME_LEN = 20 + 1
GAME = axl.Game(r=3, s=0, t=5, p=1)
Match = set_match(game=GAME, turns=GAME_LEN)
play = set_play(Match)

# sanity check: one game between two built-in strategies
game = play(axl.Prober4(), axl.TitForTat())

[(3, 3), (3, 3), (5, 0), (0, 5), (5, 0), (1, 1), (1, 1), (0, 5), (3, 3), (5, 0), (0, 5), (5, 0), (0, 5), (3, 3), (5, 0), (0, 5), (5, 0), (1, 1), (0, 5), (5, 0)]
Player 1 score = 50
Player 2 score = 45


In [85]:
# One recorded step of a game: the state before the move, the action taken,
# the state after the move and the reward received for it.
Transition = namedtuple('Transition', ['state', 'action', 'next_state', 'reward'])

class NNplayer(axl.Player):
    """Axelrod player whose moves are chosen by a learning network.

    Every turn the player asks ``network.query`` for an action index, and after
    every turn it records one ``Transition``.  When the last turn is detected,
    the recorded data is handed to the network: whole transitions for an
    off-policy learner, rewards only (followed by ``network.train()``) for an
    on-policy learner.

    Note: this definition shadows the ``NNplayer`` imported from ``axl_utils``.

    Parameters
    ----------
    network : object
        Learner; this class calls its ``query``, ``push``, ``train``,
        ``reset_state``, ``plot`` and ``test_mode`` methods and sets its
        ``greedy`` attribute.
    state : object
        Game-state tracker; this class calls ``reset()``, ``values()`` and
        ``push(play, coplay)`` on it and reads its ``N`` attribute.
    reward : str
        ``"dense"`` records the per-turn score; any other value records a
        sparse reward (0 every turn, the accumulated score on the last turn).
    policy : str
        ``"off"`` selects the off-policy (replay memory) path; any other value
        selects the on-policy path.
    name : str
        Display name returned by ``str(player)``.
    """

    name = 'NNplayer'
    # NOTE(review): 'stochastic' is False although an on-policy network may
    # sample its actions -- confirm whether axelrod relies on this flag here.
    classifier = {
        'memory_depth': -1,
        'stochastic': False,
        'inspects_source': False,
        'manipulates_source': False,
        'manipulates_state': False
    }

    # maps the action index returned by the network to an axelrod action
    decision = (axl.Action.C, axl.Action.D)

    def __init__(self, network, state, reward='dense', policy='off', name='DQN'):
        super().__init__()

        self.name    = name
        self.state   = state
        self.network = network

        self.policy_mode = policy == "off"      # off-policy = True, on-policy = False
        self.reward_mode = reward == "dense"    # dense reward = True, sparse reward = False
        self.N = self.state.N                   # how a not-yet-happened turn is encoded
        self.reset()

    def __str__(self):
        return self.name

    # the following 3 functions override the original implementation in the axelrod library
    # they are automatically called by axl during each game
    def reset(self):
        """Reset the attributes to start a new game"""
        self.reward = 0
        self.state.reset()
        self.transitions = []
        self.network.reset_state()
        self._history = axl.history.History()

    def strategy(self, opponent):
        """Query the network (each turn) to make decision"""
        idx = self.network.query(self.state.values())
        return self.decision[idx]

    # overwrite update_history to update our state
    def update_history(self, *args):
        """Append the turn to the history, then update the game state."""
        self.history.append(*args)
        self.update_state(*args)
    # --------------------------------------------------------------------------------

    def update_state(self, play, coplay):
        """Update current game state & record the transition for training

        Parameters
        ----------
        play : axl.Action
            our action from last turn, (C or D)
        coplay : axl.Action
            the opponent's action from last turn, (C or D)
        """

        # update game state
        s  = self.state.values()
        s_ = self.state.push(play, coplay)
        # NOTE(review): presumably slot [0,0,1] stops holding the "not yet
        # happened" marker only once the final turn has been pushed -- confirm
        # against the State implementation.
        last_turn = s[0,0,1]!=self.N

        # compute reward (our own score for this single turn)
        # NOTE(review): no game is passed, so axelrod's default payoffs are
        # presumably used; they should equal GAME's r=3, s=0, t=5, p=1 -- confirm.
        r  = axl.interaction_utils.compute_scores([(play, coplay)])[0][0]

        # rewrite action as a one-hot mask over (C, D)
        action = [True, False] if play==axl.Action.C else [False, True]

        # dense reward
        if self.reward_mode:
            # set last turn reward to NaN (off-policy only);
            # np.nan is used because the np.NaN alias was removed in NumPy 2.0
            r  = r if (not last_turn or not self.policy_mode) else np.nan
            transition = Transition(s, action, s_, r)

        # sparse reward: 0 on every turn, the whole accumulated score on the last one
        else:
            if not last_turn:
                transition = Transition(s, action, s_, 0)
                self.reward += r
            else:
                transition = Transition(s, action, s_, r+self.reward)
                self.reward = 0

        # record transitions for training
        self.transitions.append(transition)

        # last turn operations
        if last_turn:
            self.end_episode()

    def end_episode(self):
        """Hand the recorded transitions of the finished game to the network."""
        # for off-policy learner,
        # push all transitions into replay memory
        if self.policy_mode:
            for t in self.transitions:
                self.network.push(t)
            self.transitions = []

        # for on-policy learner,
        # push all rewards,
        # then call train function
        else:
            for t in self.transitions:
                self.network.push(t.reward)
            self.transitions = []
            self.network.train()


    def train(self, *args, **kwargs):
        """Forward all arguments to the network's train function"""
        self.network.train(*args, **kwargs)

    def plot(self, **kwargs):
        """Let the network plot its training loss"""
        self.network.plot(**kwargs)

    # test mode using "with" statement
    def __enter__(self, *args):
        """Switch the network to test mode; returns the player for ``with ... as``."""
        self.network.test_mode(True)
        return self

    def __exit__(self, *args):
        """Switch the network back to training mode; exceptions are not suppressed."""
        self.network.test_mode(False)

    def set_greedy(self, value):
        """Set the ``greedy`` attribute of the network"""
        self.network.greedy = value

In [112]:
class A2C():
    """Advantage Actor-Critic learner built from two separate networks.

    Parameters
    ----------
    actor : callable
        ``actor(state)[0]`` must give the action probabilities as a 1-D array;
        ``set_up(param)`` and ``backprop(loss, param)`` are called on it.
    critic : callable
        ``critic(state)`` must give the state value; ``set_up(param)``,
        ``loss_fn(target, prediction)`` and ``backprop(loss, param)`` are
        called on it.
    param : dict
        Hyperparameters handed to ``set_up`` and ``backprop``.
    gamma : float
        Discount factor for the cumulative reward.

    NOTE(review): the original author flags the policy-loss formulation and the
    backpropagation as unverified; that part is kept as it was.
    """

    def __init__(self, actor, critic, param, gamma=0.9):

        self.actor  = actor
        self.critic = critic
        self.param  = param
        self.gamma  = gamma
        self._test_mode   = False
        self._n_actions   = 2    # size of the action set, refreshed by every query()

        self.reset_state()

    def set_param(self, param):
        """Replace the hyperparameters used by train()"""
        self.param = param

    def reset_state(self):
        """Forget everything recorded during the current episode"""
        self.saved_actions = []  # [(action index, log_prob(chosen action), state_value)]
        self.saved_rewards = []  # [reward from environment]

    def push(self, reward):
        """Push one reward value into memory"""
        self.saved_rewards.append(reward)

    def forward(self, state):
        """Return (action probabilities, state value) for the given state"""

        # probability of actions :: 1x[n actions] array
        probs = self.actor(state)[0]

        # state value :: 1x1 array
        value = self.critic(state)

        return probs, value

    def query(self, state):
        """Query for action, chosen action & its probability is saved for training"""
        probs, value = self.forward(state)

        # for test mode, output probabilities then choose action deterministically
        if self._test_mode:
            print(probs)
            return probs.argmax()

        # otherwise choose action with given probability
        self._n_actions = len(probs)

        # inverse-CDF sampling: first index whose cumulative probability exceeds
        # the draw. The index is clamped to the last action so that a rounding
        # shortfall in sum(probs) cannot silently select action 0.
        cum_probs = np.cumsum(probs)
        draw = np.random.uniform()
        action = min(int(np.searchsorted(cum_probs, draw, side='right')), len(probs) - 1)

        # save
        self.saved_actions.append((action, np.log(probs[action]), value))  # (Int, float, 1x1 array)

        return action

    def train(self):
        """
        Train the network, should be called after each game,
        hyperparameters should be given before calling this function (via set_param)

        The recorded actions and rewards are cleared afterwards, so calling it
        again without playing another game does nothing.
        """

        # avoid training under test mode
        if self._test_mode:
            return

        assert self.param, """No hyperparameters given"""

        # nothing recorded, nothing to learn from
        if not self.saved_rewards or not self.saved_actions:
            self.reset_state()
            return

        # set up optimizers etc.
        self.actor.set_up(self.param)
        self.critic.set_up(self.param)

        # cumulative discounted reward i.e. "true" value
        # (accumulated back to front, then flipped into chronological order)
        returns = []
        cum_r = 0
        for R in reversed(self.saved_rewards):
            cum_r = R + self.gamma * cum_r
            returns.append(cum_r)
        returns = np.array(returns[::-1], dtype=float)

        # standardize for better convergence (?);
        # eps keeps the division finite when every return is identical
        eps = np.finfo(np.float64).eps
        returns = (returns - returns.mean()) / (returns.std() + eps)

        # calculate losses
        policy_losses = []
        value_losses = []
        for (action, log_prob, value), R in zip(self.saved_actions, returns):

            advantage = R - value

            # one-hot mask of the chosen action, sized to the actor's output
            policy = np.zeros(self._n_actions)
            policy[action] = 1

            # record losses
            policy_losses.append(-log_prob * advantage * policy)  # DOUBLE CHECK THIS !!
            value_losses.append(self.critic.loss_fn(R, value)[0])

        # sum all losses then feedback to networks
        policy_loss = np.array(np.sum(policy_losses))
        value_loss  = np.array(np.sum(value_losses))
        print(policy_loss, value_loss)

        # use the instance's hyperparameters, not a notebook-level global
        self.actor.backprop(policy_loss, self.param)
        self.critic.backprop(value_loss, self.param)

        # the episode has been consumed
        self.reset_state()

    def test_mode(self, on):
        """Turn test mode on or off (no sampling and no training while on)"""
        self._test_mode = bool(on)

    def plot(self):
        """Placeholder, nothing is plotted yet"""
        pass

    def __call__(self, *args):
        return self.forward(*args)

In [117]:
# The first two layers are created once; the same layer instances are then
# passed to both the actor and the critic (see the layer-sharing TODO above).
layer1 = [
    network.Flatten_layer(),
    network.Linear_layer(GAME_LEN*2, 100),
]

# actor: state -> probabilities over the 2 actions
actor = network.NeuralNetwork([
    *layer1,
    network.Activation_layer('ReLU'),
    network.Linear_layer(100, 40),
    network.Activation_layer('ReLU'),
    network.Linear_layer(40, 2),
    network.Activation_layer('Softmax'),
])

# critic: state -> single state value
critic = network.NeuralNetwork([
    *layer1,
    network.Activation_layer('ReLU'),
    network.Linear_layer(100, 200),
    network.Activation_layer('ReLU'),
    network.Linear_layer(200, 1),
])

# hyperparameters handed to the networks through A2C
param = {
    "lr": 3e-4,
    'batch': 16,
    "mode": "train",
    "eps": 1e-16,
    "epoch": 0,
    't': 1,
    'clip': 1.0,
    'optimizer': ('Adam', 0.9, 0.999),
    'regularizer': ('l2', 1e-3),
    "loss_fn": "mse",
}

# on-policy player driven by the actor-critic pair
nn = A2C(actor, critic, param)
p1 = NNplayer(nn, State(GAME_LEN, C=1, D=0.1, N=-1), policy='on')

# the networks stay reachable through nn; drop the notebook-level names
del actor, critic
gc.collect()

286

In [118]:
# Play one game with the untrained agent in test mode: NNplayer.__enter__ turns
# the network's test mode on, so A2C prints the action probabilities every turn,
# picks the most probable action and skips training at the end of the game.
with p1:
    play(p1, axl.TitForTat())

[0.413 0.587]
[0.694 0.306]
[0.691 0.309]
[0.828 0.172]
[0.539 0.461]
[0.734 0.266]
[0.673 0.327]
[0.717 0.283]
[0.844 0.156]
[0.66 0.34]
[0.435 0.565]
[0.533 0.467]
[0.36 0.64]
[0.525 0.475]
[0.696 0.304]
[0.741 0.259]
[0.597 0.403]
[0.605 0.395]
[0.563 0.437]
[0.349 0.651]
[0.385 0.615]
[(5, 0), (0, 5), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (5, 0), (0, 5), (5, 0), (0, 5), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (5, 0)]
Player 1 score = 59
Player 2 score = 54


In [119]:
ITERATIONS = 200

# hand the hyperparameters to the learner before the first game
p1.network.set_param(param)

# Each iteration is one game against tit-for-tat; the player triggers the
# network's training step itself once the game's last turn has been recorded.
for episode in range(ITERATIONS):
    play(p1, axl.TitForTat(), show=False)

21.804322081133353 -36.47068513584241
18.935814427500613 -36.839830718004414
21.352525542048227 -35.63169402155203
20.26440537014677 -30.887934358727843
18.149912908503925 -31.43293475059233
18.31648091649007 -24.23260301801937
15.342729581553114 -20.529193402639986
18.26090666020042 -20.693005108347247
12.231516073153804 -21.77742418454487
16.00303338297415 -20.451026369039823
12.410729819385272 -22.246446323744866
11.432368839854883 -19.450764471951736
6.340955305858999 -15.050833139775794
5.071459277518772 -10.802054152694344
12.364893810675376 -11.85774469068356
9.667424341730877 -10.03950918740788
7.147654587092153 -6.8487236908196
4.253945452028107 -6.2020223516701725
4.318201324109573 -4.792881476390072
-1.0407349559420291 -3.6184486771103206
3.5486527795522 -1.6919818401037698
-7.357898658034882 1.8852506561321283
-4.994549608146037 2.6478093226319097
-0.7971395153697464 4.063475689019488
0.05534673327173223 3.9445196249885086
-4.684959978153041 5.769817763417757
1.623308911000

In [122]:
# Play one game in test mode after training.
# NOTE(review): `p2` is not defined in any cell shown here (execution counts
# jump from 119 to 122), so this fails on a fresh Restart & Run All --
# presumably it refers to the trained player; confirm and define it explicitly.
with p2:
    play(p2, axl.TitForTat())

[0.615 0.385]
[0.702 0.298]
[0.734 0.266]
[0.746 0.254]
[0.8 0.2]
[0.839 0.161]
[0.957 0.043]
[0.943 0.057]
[0.955 0.045]
[0.965 0.035]
[0.967 0.033]
[0.971 0.029]
[0.984 0.016]
[0.981 0.019]
[0.991 0.009]
[0.988 0.012]
[0.986 0.014]
[0.966 0.034]
[0.989 0.011]
[0.986 0.014]
[0.984 0.016]
[(3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3)]
Player 1 score = 60
Player 2 score = 60


In [123]:
# Dump the parameters of every layer of the actor network for inspection.
# NOTE(review): depends on `p2`, which no visible cell defines; the full dump is
# also very long -- consider printing a single layer or a summary instead.
p2.network.actor.print_parameters()

--0--
Printing flatten layer:
{'freeze': False, 'shape': (1, 2, 21), 'type': 'flatten'}
--1--
Printing linear layer:
{'bias': 0,
 'freeze': False,
 'input': array([[0.1, 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. ,
        1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 0.1, 1. , 1. , 1. , 1. ,
        1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. ,
        1. , 1. , 1. , 1. ]]),
 'input_nodes': 42,
 'm1': array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]]),
 'm2': array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]]),
 'optimizer': <network.layers.layer.Optimizer object at 0x000001E4BFD1E040>