In [1]:
import numpy as np
from random import random, choice

from matplotlib import cm
from time import sleep
from colosseumrl.envs.tron import TronGridEnvironment, TronRender, TronRllibEnvironment

import gym
from gym import Env
from gym.spaces import Dict, Discrete, Box

import ray
from ray import tune
from ray.rllib.agents.ppo import PPOTrainer, DEFAULT_CONFIG
from ray.rllib.agents.dqn import DQNTrainer, DEFAULT_CONFIG

from ray.rllib.models.preprocessors import Preprocessor
from ray.rllib.models import ModelCatalog

# Fixed seed so that numpy-driven randomness is repeatable between runs.
# Only numpy's global RNG is seeded here.
SEED = 1517
np.random.seed(SEED)

# Training on better agents
#### Now that we have mastered playing against our hand-crafted agents, how do we go beyond them to reach some sort of optimum?

We use a common technique in reinforcement learning known as self-play. Here, we allow the opponents to update alongside us, but with a delay. Once we begin defeating our current opponents a certain percentage of the time, we update their weights with our own. This encourages the policy to continually improve because it has to defeat its previous iteration.

## A more advanced pre-processor
For self-play to work, we need to make sure that the opponents see the exact same configuration of the board that player 0 sees. Otherwise the policies will be very confused and try to make player 0 win even when they're supposed to be opponents!

In [2]:
class TronExtractBoard(Preprocessor):
    """Preprocessor that turns a Tron observation dict into a two-channel image.

    The output has shape ``(board_size + 2, board_size + 2, 2)``:

    * channel 0 is the board with every value greater than 1 replaced by -1
      and a one-cell border of -1 added around it;
    * channel 1 is zero everywhere except at the head positions, where it
      holds ``1 + direction``, with the same -1 border.

    The observation is rotated so that every player sees the board from the
    same point of view, which is what lets one network play any seat.
    """
    def _init_shape(self, obs_space, options):
        """Return the output shape for the observation space being wrapped.

        The board size is read from ``obs_space`` itself rather than from a
        notebook-level ``env`` variable, so the class does not depend on a
        global that is only created in a later cell.
        """
        board_size = obs_space['board'].shape[0]
        # +2 on each spatial axis accounts for the one-cell wall padding.
        return (board_size + 2, board_size + 2, 2)
    
    def transform(self, observation):
        """Infer the viewing player's seat from the board and build the image.

        Raises:
            KeyError: if the probed cell holds a value other than 1, 2, 3 or 4.
        """
        # Pretty hacky way to get the current player number
        # Requires having exactly 4 players
        board = observation['board']
        hor_offset = board.shape[0] // 2 + 2
        # The value found at this fixed cell near the top edge is mapped to
        # a rotation count below.
        # NOTE(review): presumably this cell is always a player's starting
        # position and the board labels are relative to the viewer - confirm.
        top_player = board[1, hor_offset]
        player_number = {1: 0, 4: 1, 3: 2, 2: 3}[top_player]

        return self._transform(observation, player_number)

    def _transform(self, observation, rotate: int = 0):
        """Build the (board, heads) image, rotated by ``rotate`` quarter turns.

        Args:
            observation: dict with 'board', 'heads' and 'directions' entries.
                'heads' is used as flat indices into the board.
            rotate: number of 90-degree rotations passed to ``np.rot90``;
                head directions are shifted by the same amount modulo 4.

        Returns:
            Array of shape ``(H + 2, W + 2, 2)`` with board and heads stacked
            on the last axis.
        """
        # Work on a copy so the caller's observation is not modified.
        board = observation['board'].copy()
        
        # Make all enemies look the same
        board[board > 1] = -1
        
        # Mark where all of the player heads are
        heads = np.zeros_like(board)
        
        if (rotate != 0):
            # Directions are re-expressed in the rotated frame before the
            # arrays themselves are rotated.
            heads.ravel()[observation['heads']] += 1 + ((observation['directions'] - rotate) % 4)
            
            board = np.rot90(board, k=rotate)
            heads = np.rot90(heads, k=rotate)
            
        else:
            heads.ravel()[observation['heads']] += 1 + observation['directions']
            
        # Pad the outsides so that we know where the wall is
        board = np.pad(board, 1, 'constant', constant_values=-1)
        heads = np.pad(heads, 1, 'constant', constant_values=-1)
        
        # Combine together
        board = np.expand_dims(board, -1)
        heads = np.expand_dims(heads, -1)
        
        return np.concatenate([board, heads], axis=-1)

In [3]:
class TeamTron(TronRllibEnvironment):
    """Tron RLlib environment whose ``step`` currently forwards to the parent unchanged."""
    def step(self, action_dict):
        """Run one environment step via the parent class.

        Returns the parent's four results as a tuple, in the same order:
        observations, rewards, done flags and info.
        """
        obs, rewards, dones, infos = super().step(action_dict)
        return obs, rewards, dones, infos

In [4]:
# def test(render, env, trainer, frame_time = 0.1):
#     policy = trainer.get_policy("training_policy")
#     policy.cur_epsilon_value = 0
#     render.close()
#     observation = env.reset()
#     done = False
#     action = None
#     reward = None
#     cumulative_reward = 0

#     while not done:
#         action = trainer.compute_action(observation, prev_action=action, prev_reward=reward, policy_id='training_policy')

#         observation, reward, done, results = env.step(action)
#         cumulative_reward += reward
#         render.render(env.env.state)

#         sleep(frame_time)
#     print(state)
#     render.render(env.env.state)
#     return cumulative_reward

In [5]:
# Copy the current training policy's weights into the opponent policy
def synchronize_policies(trainer):
    """Overwrite the "opponent_policy" weights with those of "training_policy".

    Args:
        trainer: object exposing ``get_policy(name)``; the returned policies
            must provide ``get_weights()`` and ``set_weights(weights)``.
    """
    source = trainer.get_policy("training_policy")
    target = trainer.get_policy("opponent_policy")
    latest_weights = source.get_weights()
    target.set_weights(latest_weights)
    
# A callback that records a normalized final reward after each episode.
# The mean of this metric is used to decide when to update the opponents.
def on_episode_end(info):
    """Store agent "0"'s rescaled last reward as the 'final_reward' custom metric.

    The last reward ``r`` is mapped to ``(r + 1) / 11``, so 10 becomes 1 and
    -1 becomes 0. A reward history with at most one entry is scored as 0.

    Args:
        info: dict whose "episode" entry exposes ``_agent_reward_history``
            and ``custom_metrics``.
    """
    episode = info["episode"]
    history = episode._agent_reward_history["0"]
    final_reward = (history[-1] + 1) / 11 if len(history) > 1 else 0
    episode.custom_metrics['final_reward'] = final_reward

In [6]:
# Initialize training environment
# Shut down any existing Ray session before starting a new one
# (presumably so this cell can be re-run in a live kernel).
ray.shutdown()
ray.init()

# def environment_creater(params=None):
#     return TronRllibEnvironment(board_size=21, num_players=4)
    
def team_environment_creater(params=None):
    """Create a TeamTron environment with a 21-cell board and 4 players.

    ``params`` is accepted so the function can be used as an environment
    factory, but it is not used.
    """
    team_env = TeamTron(board_size=21, num_players=4)
    return team_env
    
# A local environment instance, used below only to read the observation and
# action spaces for the policy definitions.
env = team_environment_creater()

#tune.register_env("tron_multi_player", environment_creater)
tune.register_env("tron_team", team_environment_creater)
ModelCatalog.register_custom_preprocessor("tron_prep", TronExtractBoard)

# Configure Deep Q Learning for multi-agent training
# DEFAULT_CONFIG is the DQN one here: the DQN import at the top of the
# notebook comes after, and therefore shadows, the PPO import of the same name.
config = DEFAULT_CONFIG.copy()
config['num_workers'] = 4
config["timesteps_per_iteration"] = 128
config['target_network_update_freq'] = 256
config['buffer_size'] = 10_000
config['schedule_max_timesteps'] = 100_000
config['exploration_fraction'] = 0.9
config['compress_observations'] = False
config['num_envs_per_worker'] = 1
config['train_batch_size'] = 256
config['n_step'] = 2
config['callbacks'] = { 
        "on_episode_end": on_episode_end,
    }

# All of the models will use the same network as before
agent_config = {
    "model": {
        "vf_share_layers": True,
        "conv_filters": [(64, 5, 2), (128, 5, 2), (256, 5, 2)],
        "fcnet_hiddens": [128],
        "custom_preprocessor": 'tron_prep'
    }
}

# Agent "0" is the learner; every other agent plays the frozen opponent policy.
# Only the training policy is optimized.
config['multiagent'] = {
        "policies_to_train": ["training_policy"],
        "policy_mapping_fn": lambda x: "training_policy" if x == "0" else "opponent_policy",
        "policies": {"training_policy": (None, env.observation_space, env.action_space, agent_config),
                     "opponent_policy": (None, env.observation_space, env.action_space, agent_config)}
}
       
trainer = DQNTrainer(config, "tron_team")
num_epoch = 2

# Opponents are refreshed once the mean 'final_reward' metric exceeds this value.
OPPONENT_UPDATE_THRESHOLD = 0.6

for epoch in range(num_epoch):
    print("Training iteration: {}".format(epoch), end='')
    res = trainer.train()

    # Both metrics can be missing when an iteration reports no finished
    # episodes, so look them up defensively instead of raising KeyError.
    mean_reward = res['policy_reward_mean'].get('training_policy')
    print(f", Average reward: {mean_reward}")
    
    final_reward_mean = res['custom_metrics'].get('final_reward_mean')
    if final_reward_mean is not None and final_reward_mean > OPPONENT_UPDATE_THRESHOLD:
        print("Updating opponents")
        synchronize_policies(trainer)

checkpoint = trainer.save()

2020-03-09 18:02:10,481	INFO resource_spec.py:212 -- Starting Ray with 4.0 GiB memory available for workers and up to 2.01 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-03-09 18:02:11,005	INFO trainer.py:377 -- Tip: set 'eager': true or the --eager flag to enable TensorFlow eager execution
2020-03-09 18:02:11,052	INFO trainer.py:524 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.




  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


, Average reward: 13.666666666666666
Training iteration: 1



, Average reward: 11.590909090909092


In [7]:

def determine_winner(rewardDict):
    """Return the first key whose reward equals 10, or 'none' if there is none.

    Args:
        rewardDict: mapping from player id to that player's reward.
    """
    winners = (player for player, reward in rewardDict.items() if reward == 10)
    return next(winners, 'none')

In [8]:
def test(render, env, trainer, frame_time = 0.1):
    """Play and render one game in which every player uses the training policy.

    Args:
        render: renderer exposing ``close()`` and ``render(state)``.
        env: multi-agent environment exposing ``reset()``, ``step(action_dict)``
            and a ``state`` attribute.
        trainer: trainer exposing ``get_policy`` and ``compute_action``.
        frame_time: seconds to sleep after each rendered step.

    Returns:
        The sum of the rewards reported for agent "0" over the episode.
        Steps where the reward dict has no entry for "0" contribute 0.
    """
    policy = trainer.get_policy("training_policy")
    # NOTE(review): presumably this turns off epsilon-greedy exploration
    # for evaluation - confirm against the policy implementation.
    policy.cur_epsilon_value = 0
    render.close()
    obsDict = env.reset()
    doneDict = {'__all__':False}
    # Both dicts persist across steps so that each player's previous action
    # and reward can be fed back into compute_action.
    actionDict = {}
    rewardDict = {}
    cumulative_reward = 0

    while not doneDict['__all__']:
        for player, obs in obsDict.items():
            actionDict[player] = trainer.compute_action(obs, prev_action=actionDict.get(player, None), prev_reward=rewardDict.get(player, None), policy_id='training_policy')

        obsDict, rewardDict, doneDict, results = env.step(actionDict)
        # Track the learner's (agent "0") return; previously this total was
        # never updated and the function always returned 0.
        cumulative_reward += rewardDict.get("0", 0)
        render.render(env.state)

        sleep(frame_time)
    
    #print(doneDict)
    print("winner: ", determine_winner(rewardDict))
    
    # Draw the final board once more after the game has ended.
    render.render(env.state)
    #render.close()
    return cumulative_reward

In [9]:
# Replay five rendered games using the checkpoint saved after training.
# NOTE(review): 21 and 4 presumably mirror the board_size / num_players used
# to build `env` - confirm against TronRender's signature.
render = TronRender(21, 4)
# Re-seed numpy before evaluation.
np.random.seed(SEED)
trainer.restore(checkpoint)
for _ in range(5):
    test(render, env, trainer, frame_time=0.3)

2020-03-09 18:02:28,344	INFO trainable.py:416 -- Restored on 192.168.24.54 from checkpoint: /Users/MasterKashani/ray_results/DQN_tron_team_2020-03-09_18-02-117laqf1s6/checkpoint_2/checkpoint-2
2020-03-09 18:02:28,345	INFO trainable.py:423 -- Current state after restoring: {'_iteration': 2, '_timesteps_total': 288, '_time_total': 11.431609153747559, '_episodes_total': 22}


{'0': True, '1': True, '2': array([2]), '3': True, '__all__': True}
winner:  2
{'0': True, '1': True, '2': True, '3': array([3]), '__all__': True}
winner:  3
{'0': True, '1': True, '2': True, '3': array([3]), '__all__': True}
winner:  3
{'0': True, '1': array([1]), '2': True, '3': True, '__all__': True}
winner:  1
{'0': True, '1': True, '2': True, '3': array([3]), '__all__': True}
winner:  3
