In [None]:
import torch
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
from unityagents import UnityEnvironment
from agents.utils import OUNoise

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
# Start the Unity Tennis environment from the executable at this relative path.
env = UnityEnvironment(file_name="Tennis_Windows_x86_64/Tennis.exe")
# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# Reset once (train_mode=False) only to read the observation and action sizes.
env_info = env.reset(train_mode=False)[brain_name]
states = env_info.vector_observations
action_size = brain.vector_action_space_size
# states is indexed per agent in the cells below, so the second axis is the
# length of one agent's observation vector.
state_size = states.shape[1]

## Analysis
The agents should be the same given the symmetrical property of the environment. Therefore, the policy of an agent trained on the left should remain valid when the same agent is placed on the right. However, its input may need to be tweaked. The first step is therefore to check whether the observations are already symmetrical. **That should reduce the training time by half**.

Here are the assumptions:
* The observation state of each agent has a length of 24
* The observation state contains values about:
    * The position and velocity of the ball
    * The position and velocity of the agent
    * The position and velocity of the opponent
* The horizontal components of the state could be multiplied by -1 to mirror the board about a vertical axis:
    * horizontal position and velocity of the ball
    * horizontal position and velocity of the agent
    * horizontal position and velocity of the opponent

### Analysis of the different components of the observations and actions
To find the relation between the observed states, let us send the same action to both agents and print the states observed by the agents at each time step.

In [None]:
# reset the environment
env_info = env.reset(train_mode=False)[brain_name]
states = env_info.vector_observations

# Send the same constant action to both agents for 50 steps and print the full
# observation vector of each agent so the two can be compared line by line.
for i in range(50):
    actions = np.array([[0.5, 0],[0.5, 0]]) # same fixed action for each agent: [0.5, 0]
    env_info = env.step(actions)[brain_name]           # send all actions to the environment
    next_states = env_info.vector_observations         # get next state (for each agent)
    print("{}\tAgent 1\t{}".format(i, next_states[0]))
    print("{}\tAgent 2\t{}".format(i, next_states[1]))
    states = next_states                               # roll over states to next time step

According to the previous result and the observation of the rendered screen:
* the 1st element of the action is defining the "force" applied by one agent to reach the net
    * with a positive value, an agent is moving to the net
    * with a negative value, the agent is moving away from the net
* the 2nd element of the action is defining the jumping "force" and it is discrete
    * with zero, the agent stays on the ground
    * with one, the agent is jumping
* only indices 4, 12, and 20 of the states observed by the agents are symmetrical about 0

In [None]:
# reset the environment and observe the symmetrical values while moving only one agent
env_info = env.reset(train_mode=False)[brain_name]
states = env_info.vector_observations

# For 10 steps only the agent with index 0 receives a non-zero action; the
# printout is restricted to observation indices 4, 12 and 20 of each agent.
for i in range(10):
    actions = np.array([[0.5, 0],[0, 0]])                # agent 0 gets [0.5, 0], agent 1 gets [0, 0]
    env_info = env.step(actions)[brain_name]           # send all actions to the environment
    next_states = env_info.vector_observations         # get next state (for each agent)
    print("{}\tAgent 1\t{}\t{}\t{}".format(i, next_states[0][4], next_states[0][12], next_states[0][20]))
    print("{}\tAgent 2\t{}\t{}\t{}".format(i, next_states[1][4], next_states[1][12], next_states[1][20]))
    states = next_states                               # roll over states to next time step

According to the previous result and the observation of the rendered screen:
* the 1st element of the action is defining the "force" applied by one agent to reach the net
    * with a positive value, an agent is moving to the net
    * with a negative value, the agent is moving away from the net
* the 2nd element of the action is defining the jumping "force" and it is discrete
    * with zero, the agent stays on the ground
    * with one, the agent is jumping
* only indices 4, 12, and 20 of the states observed by the agents are symmetrical about 0
* the agent on the right is the agent with index 0
* indices 4, 12, and 20 hold the x position of the ball, from the point of view of the agent, at 3 successive timesteps

*Conclusion*:
The observation states describe the environment from the point of view of the agent, not of the viewer. Therefore, a single agent can be trained using the observations of both players.

## Training
After that confirmation, we start the training

In [None]:
def plot_result(scores):
    """Plot the score obtained at each episode on a new figure.

    Args:
        scores: sequence of per-episode scores; element k is drawn at x = k.

    Returns:
        None. The figure is displayed through ``plt.show()``.
    """
    # Draw on the Axes returned by subplots() instead of creating an Axes and
    # then plotting through pyplot's implicit "current axes".
    _, ax = plt.subplots()
    ax.plot(np.arange(len(scores)), scores)
    ax.set_ylabel('Score')
    ax.set_xlabel('Episode #')
    plt.show()

In [None]:
def train(agent, n_episodes=500, train_mode=True):
    """Train a single shared agent by self-play, one update per player per episode.

    Relies on the notebook-level ``env`` and ``brain_name``. Each episode is
    played until any player reports done; the same ``agent`` picks the action
    of both players from their own observation. After the episode,
    ``agent.learn`` is called once with each player's trajectory.

    Args:
        agent: object providing ``act(state)`` -> (actions, log_probs),
            ``estimate(full_state)``, ``learn(...)`` and ``save(path)``.
        n_episodes: maximum number of episodes to play.
        train_mode: forwarded to ``env.reset``.

    Returns:
        List with one score per episode, the score being the larger of the two
        players' summed rewards. Training stops early once the mean of the
        last 100 scores reaches 0.5.
    """
    n_agents = 2
    scores = []                        # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    for i_episode in range(1, n_episodes+1):
        env_info = env.reset(train_mode=train_mode)[brain_name]
        states = env_info.vector_observations
        scores_one_episode = np.zeros(n_agents)

        # One list per player for every recorded quantity.
        trajectories_states = [[] for _ in range(n_agents)]
        trajectories_next_states = [[] for _ in range(n_agents)]
        trajectories_full_states = [[] for _ in range(n_agents)]
        trajectories_actions = [[] for _ in range(n_agents)]
        trajectories_log_probs = [[] for _ in range(n_agents)]
        trajectories_values = [[] for _ in range(n_agents)]
        trajectories_rewards = [[] for _ in range(n_agents)]
        trajectories_dones = [[] for _ in range(n_agents)]

        while True:
            # TODO: Call Act only once to produce two actions
            concatenated_actions = np.zeros((n_agents, 2))
            for i in range(n_agents):
                actions, log_probs = agent.act(states[i])
                concatenated_actions[i] += actions[0]
                trajectories_states[i].append(states[i])
                trajectories_actions[i].append(actions[0])
                trajectories_log_probs[i].append(log_probs[0])

            for i in range(n_agents):
                # The value estimate takes the joint information, ordered from
                # player i's point of view:
                #   own state, other state, own action, other action.
                # The other player's state must be included, otherwise the
                # vector has length state + 2*action instead of
                # 2*(state + action).
                other = (i + 1) % 2
                full_state = np.concatenate((trajectories_states[i][-1],
                                             trajectories_states[other][-1],
                                             trajectories_actions[i][-1],
                                             trajectories_actions[other][-1]))
                trajectories_full_states[i].append(full_state)
                trajectories_values[i].append(agent.estimate(full_state)[0])

            # Only the actions sent to the environment are clipped; the stored
            # actions stay as produced by the agent.
            env_info = env.step(np.clip(concatenated_actions, -1, 1))[brain_name]
            next_states = env_info.vector_observations   # next state (for each agent)
            rewards = env_info.rewards                   # reward (for each agent)
            dones = env_info.local_done                  # episode finished flags
            scores_one_episode += rewards

            for i in range(n_agents):
                trajectories_next_states[i].append(next_states[i])
                trajectories_rewards[i].append(rewards[i])
                trajectories_dones[i].append(dones[i])

            states = next_states                         # roll over states to next time step
            if np.any(dones):                            # exit loop if episode finished
                break

        # The critic update needs the value of the following step. Values
        # depend on the states and actions of both players, so they are taken
        # from the recorded values shifted by one step; the step after the
        # last one gets 0.
        trajectories_next_values = []
        for j in range(n_agents):
            next_values = np.roll(np.array(trajectories_values[j]), -1, axis=0)
            next_values[-1] = 0
            trajectories_next_values.append(next_values)

        for i in range(n_agents):
            agent.learn(trajectories_states[i],
                        trajectories_actions[i],
                        trajectories_log_probs[i],
                        trajectories_values[i],
                        trajectories_rewards[i],
                        trajectories_next_states[i],
                        trajectories_dones[i],
                        trajectories_full_states[i],
                        trajectories_next_values[i])

        score = np.max(scores_one_episode)
        scores.append(score)
        scores_window.append(score)
        mean_100 = np.mean(scores_window)

        if i_episode % 50 == 0:
            print('\rEpisode {}\tAverage Score: {:.3f}\tMax Score: {:.3f}\tLast Score: {:.3f}'.
                      format(i_episode,
                             mean_100,
                             np.max(scores_window),
                             scores_window[-1]))
            agent.save("eps_{}_avg_{}.pth".format(i_episode, mean_100))

        if len(scores_window) >= 100 and mean_100 >= 0.5:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.3f}'.format(i_episode, mean_100))
            agent.save("final.pth")
            break
    return scores

In [None]:
from agents.ppo import PPO
from agents.model_ppo import Gaussian
import random
import torch.nn.functional as F
from tensorboardX import SummaryWriter

# Training runs on the CPU. To use a GPU when available, replace with:
# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device = "cpu"

# Seed every random number generator used in this notebook.
seed = 257
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)

logger = SummaryWriter(log_dir="./logs")

# Close the writer in every case: a failed or manually interrupted training
# run would otherwise leave it open.
try:
    # NOTE(review): the third argument is presumably the critic input size
    # (states and actions of both players) - confirm against agents.model_ppo.
    network = Gaussian(state_size, action_size, (state_size + action_size)*2 , activation = F.leaky_relu).to(device)
    agent = PPO(network, device, logger,
                LR=1e-3,
                WEIGHT_DECAY=1e-4,
                GRADIENT_CLIP=5,
                EPOCHS=4,
                BATCH_SIZE=32,
                GAMMA=0.99,
                GAE_TAU=0.95,
                CLIP_EPSILON=0.1)
    scores = train(agent, n_episodes=2000, train_mode=True)
finally:
    logger.close()

plot_result(scores)

In [None]:
# Close the SummaryWriter currently bound to `logger`.
# NOTE(review): the training cell already calls logger.close(); presumably this
# extra cell is kept for runs that were interrupted part-way - confirm.
logger.close()

In [None]:
# Export the computation graphs of the policy and of the value estimate to TensorBoard.
env_info = env.reset(train_mode=True)[brain_name]
states = env_info.vector_observations
states_1 = torch.tensor(states[0]).float().to(agent.device)
states_2 = torch.tensor(states[1]).float().to(agent.device)

# Example inputs only; no gradients are needed to build them.
with torch.no_grad():
    actions_1, _ = agent.network(states_1)
    actions_2, _ = agent.network(states_2)
    full_state = torch.cat((states_1, states_2, actions_1, actions_2))
    # Checks that estimate() accepts the joint state/action vector.
    value = agent.network.estimate(full_state)


class _EstimateGraph(torch.nn.Module):
    """Expose ``network.estimate`` as ``forward`` so that add_graph can trace it."""

    def __init__(self, network):
        super().__init__()
        self.network = network

    def forward(self, full_state):
        return self.network.estimate(full_state)


logger = SummaryWriter(log_dir="./logs/graphs")
try:
    # add_graph calls the module on the example input. The network itself is
    # called with one player's state (as above), not with an action.
    logger.add_graph(agent.network, states_1)
    # The joint vector is the input of estimate(), not of the network call.
    # NOTE(review): assumes estimate() returns a tensor when given a tensor - confirm.
    logger.add_graph(_EstimateGraph(agent.network), full_state)
finally:
    logger.close()