In [1]:
# Option: presumably meant to be True when resuming from a saved checkpoint.
# NOTE(review): in the cells visible here this flag only changes
# `num_envs_per_worker` (1 when True, otherwise 4); no code loads a checkpoint
# based on it. Confirm the intended behaviour.
LOAD_FROM_CHECKPOINT = False

In [2]:
import numpy as np
from random import random, choice

from matplotlib import cm
from time import sleep
from colosseumrl.envs.tron import TronGridEnvironment, TronRender, TronRllibEnvironment

import gym
from gym import Env
from gym.spaces import Dict, Discrete, Box

import ray
from ray import tune
from ray.rllib.agents.ppo import PPOTrainer, DEFAULT_CONFIG
from ray.rllib.agents.dqn import DQNTrainer, DEFAULT_CONFIG

from ray.rllib.models.preprocessors import Preprocessor
from ray.rllib.models import ModelCatalog

import matplotlib.pyplot as plt

# Seed both random number generators this notebook draws from. NumPy's global
# generator is seeded as before; the standard-library generator is seeded as well
# because SimpleAvoidAgent uses `random()` / `choice()` from the `random` module,
# which `np.random.seed` does not affect.
from random import seed as seed_stdlib_random

SEED = 1517
np.random.seed(SEED)
seed_stdlib_random(SEED)

## Training an Agent

##### Thinking of a more intelligent agent is pretty hard, so let's make machine learning find one for us! First, let's train an agent to defeat our own hand-written attempt. We will use RLlib to train an agent with Deep Q-Learning.

## Our manual agent again

In [3]:
class SimpleAvoidAgent:
    """Scripted Tron player that keeps going straight while the next cell is empty.

    With probability ``noise`` a uniformly random action from ``{0, 1, 2}`` is
    returned instead.  Otherwise the agent returns 0 if the cell ahead is empty,
    and if not it tries one randomly chosen side (action 1 or 2) and falls back to
    the other side when that cell is occupied too.
    """
    def __init__(self, noise=0.1):
        # Probability threshold compared against `random()` on every call.
        self.noise = noise

    def __call__(self, env, observation):
        """Pick an action for the player stored at index 0 of the observation.

        ``env`` must provide ``next_cell(x, y, direction, board_size)`` returning the
        ``(x, y)`` of the neighbouring cell; ``observation`` must contain the keys
        ``'board'``, ``'heads'`` and ``'directions'``.
        """
        # Occasionally act at random to add variation.
        act_randomly = random() <= self.noise
        if act_randomly:
            return choice([0, 1, 2])

        grid = observation['board']
        head_index = observation['heads'][0]
        heading = observation['directions'][0]

        # The head is a flat index into the square board.
        size = grid.shape[0]
        row, col = divmod(head_index, size)

        def is_empty(facing):
            # A cell counts as free when the board holds 0 there.
            cell_x, cell_y = env.next_cell(col, row, facing, size)
            return grid[cell_y, cell_x] == 0

        # Straight ahead is preferred whenever it is open.
        if is_empty(heading):
            return 0

        # Try one randomly chosen side; if it is occupied, take the other one.
        turn, preferred, fallback = choice([(1, 1, 2), (-1, 2, 1)])
        return preferred if is_empty((heading + turn) % 4) else fallback

## Single Player Tron
##### We create a simpler variant of Tron featuring only one actively participating agent. This simplifies the RL task to training an agent to play against a fixed set of opponents. We can think of this as embedding our manual agents within the environment.

In [4]:
class SinglePlayer(gym.Env):
    """Single-agent view of a multi-player Tron environment.

    The caller controls ``active_player`` through ``step``; every other player
    listed in ``env.players`` is controlled by the scripted ``agents``, which are
    handed out round-robin.
    """
    def __init__(self, env, active_player = '0', agents = SimpleAvoidAgent()):       
        # A lone agent is wrapped so the rest of the class can always index a list.
        self.agents = agents if isinstance(agents, list) else [agents]
        self.active_player = active_player
        self.env = env

        # Spaces are taken from the wrapped environment unchanged.
        self.observation_space = env.observation_space
        self.action_space = env.action_space

        # Most recent per-player observations; stays None until reset() is called.
        self.observations = None

        # Not read anywhere in this class.
        self.weiner = 5

    def reset(self):
        """Reset the wrapped environment and return the active player's observation."""
        self.observations = self.env.reset()
        return self.observations[self.active_player]

    def step(self, action, agents = None):
        """Advance one step: ``action`` for the active player, scripted moves for the rest.

        ``agents`` optionally overrides the opponents stored on the instance for
        this call only.  Returns the active player's observation, reward and done
        flag, plus the ``info`` object exactly as the wrapped environment returned it.
        """
        opponents = self.agents if agents is None else agents
        opponent_count = len(opponents)

        joint_action = {}
        cursor = 0
        for player in map(str, self.env.players):
            if player == self.active_player:
                joint_action[player] = action
                continue

            # Opponents act on the observation stored by the previous reset()/step().
            joint_action[player] = opponents[cursor](self.env.env, self.observations[player])
            cursor = (cursor + 1) % opponent_count

        self.observations, rewards, dones, info = self.env.step(joint_action)

        me = self.active_player
        return self.observations[me], rewards[me], dones[me], info

## Observation Preprocessing
##### Often the original form of the observation is not ideal as neural network input. We therefore pre-process the observation to extract the key pieces of information, so that the network can more easily learn a value or policy function.

In [5]:
class TronExtractBoard(Preprocessor):
    """Turn a Tron observation dict into a 2-channel array for the network.

    Channel 0 is the board with every cell value greater than 1 replaced by -1, so
    all enemies look the same.  Channel 1 is zero everywhere except at the head
    positions, which hold ``1 + direction``.  Both channels are padded with a
    one-cell border of -1 so the network can see where the wall is.
    """
    def _init_shape(self, obs_space, options):
        """Return the output shape ``(board_size + 2, board_size + 2, 2)``.

        The board size is read from ``obs_space`` rather than from a notebook-level
        ``env`` variable, so the preprocessor does not depend on cell execution
        order or on which process constructs it.
        """
        # If a wrapped space is handed over, use the env's own space it carries
        # (RLlib exposes that as `original_space`); otherwise use it directly.
        space = getattr(obs_space, 'original_space', obs_space)
        board_size = space['board'].shape[0]
        return (board_size + 2, board_size + 2, 2)

    def transform(self, observation):
        """Transform one observation, or a ``{player: observation}`` mapping.

        A dict containing the key ``'board'`` is treated as a single observation.
        Anything else is treated as a mapping of player id to observation, and
        each entry is transformed with ``rotate=int(player)``.
        """
        if 'board' in observation:
            return self._transform(observation)
        else:
            return {player: self._transform(obs, int(player)) for player, obs in observation.items()}

    def _transform(self, observation, rotate: int = 0):
        """Build the padded ``(board, heads)`` array for one observation.

        ``rotate`` is the number of quarter turns applied with ``np.rot90``; when it
        is non-zero the stored directions are shifted by ``-rotate`` modulo 4 first.
        """
        board = observation['board'].copy()

        # Make all enemies look the same
        board[board > 1] = -1

        # Mark where all of the player heads are; the value encodes the direction.
        heads = np.zeros_like(board)
        directions = observation['directions']
        if rotate != 0:
            # NOTE(review): assumes subtracting `rotate` matches the rotation sense
            # of np.rot90 for this environment's direction encoding; confirm.
            directions = (directions - rotate) % 4
        # `heads` is freshly allocated, so ravel() is a view and the write lands in it.
        heads.ravel()[observation['heads']] += 1 + directions

        if rotate != 0:
            board = np.rot90(board, k=rotate)
            heads = np.rot90(heads, k=rotate)

        # Pad the outsides so that we know where the wall is
        board = np.pad(board, 1, 'constant', constant_values=-1)
        heads = np.pad(heads, 1, 'constant', constant_values=-1)

        # Stack the two planes along a new trailing channel axis.
        return np.concatenate([board[..., np.newaxis], heads[..., np.newaxis]], axis=-1)

In [6]:
def test(render, env, trainer, frame_time = 0.1):
    policy = trainer.get_policy()
    policy.cur_epsilon_value = 0
    render.close()
    state = env.reset()
    done = False
    action = None
    reward = None
    cumulative_reward = 0

    while not done:
        action = trainer.compute_action(state, prev_action=action, prev_reward=reward)

        state, reward, done, results = env.step(action)
        cumulative_reward += reward
        render.render(env.env.state)

        sleep(frame_time)
    
        #print(env.env.players)
    #print(env.env.state)
    render.render(env.env.state)
    
    print(reward)
    if reward == 10:
        return True 
#     if 1 in env.env.state[-1]:
#         print("AI Lost")
#     else:
#         print("AI Won")

    #print(rewards)
    
    return False

In [None]:
# Initialize training environment
ray.shutdown()
ray.init()

# Game and evaluation settings, named once so the environment, the renderer and
# the win-rate computation cannot drift apart.
BOARD_SIZE = 13
NUM_PLAYERS = 4
EVAL_INTERVAL = 100   # evaluate every this many training iterations
EVAL_GAMES = 20       # rendered games played per evaluation

def environment_creater(params=None):
    """Build the single-player Tron environment used for training and evaluation.

    ``params`` is accepted so the function can be registered as an environment
    factory, but it is not used.
    """
    agent = SimpleAvoidAgent(noise=0.05)
    return SinglePlayer(TronRllibEnvironment(board_size=BOARD_SIZE, num_players=NUM_PLAYERS), agents=agent)

env = environment_creater()
tune.register_env("tron_single_player", environment_creater)
ModelCatalog.register_custom_preprocessor("tron_prep", TronExtractBoard)

# Configure Deep Q Learning with reasonable values.
# `DEFAULT_CONFIG` is imported from both the PPO and the DQN module at the top of
# the notebook; the DQN import comes last, so this is the DQN default config.
config = DEFAULT_CONFIG.copy()
config['num_workers'] = 4
config['num_gpus'] = 0
config["timesteps_per_iteration"] = 128
config['target_network_update_freq'] = 256
config['buffer_size'] = 10_000
config['schedule_max_timesteps'] = 100_000
config['exploration_fraction'] = 0.9
config['compress_observations'] = False
config['num_envs_per_worker'] = 1 if LOAD_FROM_CHECKPOINT else 4
config['train_batch_size'] = 256
config['n_step'] = 2
config['seed'] = SEED

# We will use a simple convolution network with 3 layers as our feature extractor.
# `DEFAULT_CONFIG.copy()` is shallow, so the nested model dict is still shared with
# the library default; build a fresh dict instead of mutating it in place.
config['model'] = dict(
    config['model'],
    vf_share_layers=True,
    conv_filters=[(64, 5, 2), (128, 3, 2), (256, 3, 2)],
    fcnet_hiddens=[256],
    custom_preprocessor='tron_prep',
)

# Begin training or evaluation
trainer = DQNTrainer(config, "tron_single_player")

winarray = []          # one win rate (wins / EVAL_GAMES) per evaluation round
avg_reward_array = []  # `episode_reward_mean` of every training iteration

num_epoch = 500
for epoch in range(num_epoch + 1):
    print("Training iteration: {}".format(epoch))
    res = trainer.train()
    avg_reward_array.append(res['episode_reward_mean'])

    if epoch % EVAL_INTERVAL == 0:
        checkpoint = trainer.save()
        render = TronRender(BOARD_SIZE, NUM_PLAYERS)
        # NOTE(review): this reseeds NumPy only; SimpleAvoidAgent draws from the
        # standard-library `random` module, which is not reseeded here.
        np.random.seed(SEED)
        # NOTE(review): restoring the checkpoint that was just saved looks
        # redundant; kept because its intent is not visible from this cell.
        trainer.restore(checkpoint)
        win = sum(1 for _ in range(EVAL_GAMES) if test(render, env, trainer))
        winarray.append(win / EVAL_GAMES)

2020-03-11 20:26:59,307	INFO resource_spec.py:212 -- Starting Ray with 4.79 GiB memory available for workers and up to 2.42 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-03-11 20:26:59,816	INFO trainer.py:377 -- Tip: set 'eager': true or the --eager flag to enable TensorFlow eager execution
2020-03-11 20:26:59,857	INFO trainer.py:524 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


Training iteration: 0


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
2020-03-11 20:27:08,358	INFO trainable.py:416 -- Restored on 192.168.24.54 from checkpoint: /Users/MasterKashani/ray_results/DQN_tron_single_player_2020-03-11_20-26-59lw04y8sz/checkpoint_1/checkpoint-1
2020-03-11 20:27:08,359	INFO trainable.py:423 -- Current state after restoring: {'_iteration': 1, '_timesteps_total': 128, '_time_total': 5.814941167831421, '_episodes_total': 15}


-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
Training iteration: 1
Training iteration: 2
Training iteration: 3
Training iteration: 4
Training iteration: 5
Training iteration: 6
Training iteration: 7
Training iteration: 8
Training iteration: 9
Training iteration: 10
Training iteration: 11
Training iteration: 12
Training iteration: 13
Training iteration: 14
Training iteration: 15
Training iteration: 16
Training iteration: 17
Training iteration: 18
Training iteration: 19
Training iteration: 20
Training iteration: 21
Training iteration: 22
Training iteration: 23
Training iteration: 24
Training iteration: 25
Training iteration: 26
Training iteration: 27
Training iteration: 28
Training iteration: 29
Training iteration: 30
Training iteration: 31
Training iteration: 32
Training iteration: 33
Training iteration: 34
Training iteration: 35
Training iteration: 36
Training iteration: 37
Training iteration: 38
Training iteration: 39
Training iteration: 40
Training iteration: 41
Traini

2020-03-11 20:29:12,449	INFO trainable.py:416 -- Restored on 192.168.24.54 from checkpoint: /Users/MasterKashani/ray_results/DQN_tron_single_player_2020-03-11_20-26-59lw04y8sz/checkpoint_101/checkpoint-101
2020-03-11 20:29:12,450	INFO trainable.py:423 -- Current state after restoring: {'_iteration': 101, '_timesteps_total': 42496, '_time_total': 112.77161884307861, '_episodes_total': 5249}


-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
Training iteration: 101
Training iteration: 102
Training iteration: 103
Training iteration: 104
Training iteration: 105
Training iteration: 106
Training iteration: 107
Training iteration: 108
Training iteration: 109
Training iteration: 110
Training iteration: 111
Training iteration: 112
Training iteration: 113
Training iteration: 114
Training iteration: 115
Training iteration: 116
Training iteration: 117
Training iteration: 118
Training iteration: 119
Training iteration: 120
Training iteration: 121
Training iteration: 122
Training iteration: 123
Training iteration: 124
Training iteration: 125
Training iteration: 126
Training iteration: 127
Training iteration: 128
Training iteration: 129
Training iteration: 130
Training iteration: 131
Training iteration: 132
Training iteration: 133
Training iteration: 134
Training iteration: 135
Training iteration: 136
Training iteration: 137
Training iteration: 138
Training iteration: 139
Trai

2020-03-11 20:31:24,770	INFO trainable.py:416 -- Restored on 192.168.24.54 from checkpoint: /Users/MasterKashani/ray_results/DQN_tron_single_player_2020-03-11_20-26-59lw04y8sz/checkpoint_201/checkpoint-201
2020-03-11 20:31:24,771	INFO trainable.py:423 -- Current state after restoring: {'_iteration': 201, '_timesteps_total': 86272, '_time_total': 223.79837131500244, '_episodes_total': 9109}


-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
Training iteration: 201
Training iteration: 202
Training iteration: 203
Training iteration: 204
Training iteration: 205
Training iteration: 206
Training iteration: 207


In [None]:
import matplotlib.pyplot as plt  # also imported in the first cell; kept so this cell runs on its own

# `winarray` holds one value per evaluation round (every 100th training
# iteration). Each value is wins / 20, i.e. a fraction in [0, 1], not a percentage.
fig, ax = plt.subplots()
ax.plot(winarray, marker='o')  # markers keep a single evaluation point visible
ax.set_ylim(0, 1)
ax.set_xlabel('Evaluation round (one every 100 training iterations)')
ax.set_ylabel('Win rate (fraction of 20 games)')
ax.set_title('Evaluation win rate of the trained agent')
plt.show()

In [None]:
# One point per training iteration: the `episode_reward_mean` value appended
# after each `trainer.train()` call.
fig, ax = plt.subplots()
ax.plot(avg_reward_array)
ax.set_xlabel('Epoch')
ax.set_ylabel('Average Reward')
ax.set_title('Mean episode reward during training')
plt.show()

In [None]:
# checkpoint = trainer.save()
# render = TronRender(13, 4)
# np.random.seed(SEED)
# trainer.restore(checkpoint)
# for _ in range(10):
#     test(render, env, trainer, frame_time = 0.4)