In [1]:
import torch
import time
import os
import numpy as np

from pathlib import Path
from torch.autograd import Variable
from tensorboardX import SummaryWriter

from unityagents import UnityEnvironment
from model import Agentv3
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Launch the Unity Tennis environment from the local Windows build.
tennis_build = './Tennis_Windows_x86_64/Tennis.exe'
env = UnityEnvironment(file_name=tennis_build)

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: TennisBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 8
        Number of stacked Vector Observation: 3
        Vector Action space type: continuous
        Vector Action space size (per agent): 2
        Vector Action descriptions: , 


In [3]:
# Use the first brain the environment exposes.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# Start a fresh episode in training mode and keep its initial observations.
env_info = env.reset(train_mode=True)[brain_name]
states = env_info.vector_observations

# Problem dimensions: agent count, per-agent action width, per-agent state width.
num_agents = len(env_info.agents)
action_size = brain.vector_action_space_size
state_size = states.shape[1]

# Report what was found.
print('Number of agents:', num_agents)
print('Size of each action:', action_size)
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

Number of agents: 2
Size of each action: 2
There are 2 agents. Each observes a state with length: 24
The state for the first agent looks like: [ 0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.         -6.65278625 -1.5
 -0.          0.          6.83172083  6.         -0.          0.        ]


In [4]:
# --- Training schedule ---
number_of_episodes = 5000   # number of training episodes
episode_length = 1          # forwarded to the agents as 'max_episode_len'
batch_size = 128

# --- Exploration noise ---
# Amplitude of the OU noise and the factor meant to shrink it slowly towards 0.
# NOTE(review): neither value is referenced again in the cells shown here.
noise = 1
noise_reduction = 0.9999

# --- Agent / optimiser settings ---
random_seed = 15
hidden_dim_actor = 128
hidden_dim_critic = 128
gamma = 0.99
tau = 0.001
lr_actor = 0.0001
lr_critic = 0.0001

# Prefer the first CUDA device when one is available, otherwise fall back to CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Keyword arguments shared by every Agentv3 instance.
# state_size is widened by one because each observation gets an extra
# agent-identity flag (+1 / -1) appended before it is handed to an agent.
init_dict = dict(
    state_size=state_size + 1,
    action_size=action_size,
    hidden_dim_actor=hidden_dim_actor,
    hidden_dim_critic=hidden_dim_critic,
    tau=tau,
    random_seed=random_seed,
    num_agents=num_agents,
    gamma=gamma,
    lr_actor=lr_actor,
    lr_critic=lr_critic,
    batch_size=batch_size,
    max_episode_len=episode_length,
)


In [5]:
# Name of the current run; edit this to point at a different run directory.
curr_run = 'run7'
model_dir = Path('./models')
run_dir = model_dir / curr_run
log_dir = run_dir / 'logs'

# Make sure the run directory (checkpoints) and its log sub-directory exist.
# exist_ok=True keeps this cell safe to re-run, unlike a bare os.makedirs
# which raises once the directory is already there.
log_dir.mkdir(parents=True, exist_ok=True)

# TensorBoard writer for this run.
logger = SummaryWriter(str(log_dir))

In [6]:

# Build two agents from the same hyperparameters.
agent1 = Agentv3(**init_dict)
agent2 = Agentv3(**init_dict)

# Parameter sharing: agent2 reuses agent1's critic/actor networks and their
# optimizers, so both agents read and update the very same objects.
agent2.critic_local = agent1.critic_local
agent2.critic_target = agent1.critic_target
agent2.critic_optimizer = agent1.critic_optimizer

agent2.actor_local = agent1.actor_local
agent2.actor_target = agent1.actor_target
agent2.actor_optimizer = agent1.actor_optimizer

# Checkpoint files to restore from.
# NOTE(review): these use the suffixes '_12' / '_22', whereas the training
# cell writes '_1' / '_2' -- confirm these are the intended files.
agent_0_weights = run_dir / 'checkpoint_actor_12.pth'
agent_0_critic_weights = run_dir / 'checkpoint_critic_12.pth'
agent_0_actor_target = run_dir / 'checkpoint_actor_target_12.pth'
agent_0_critic_target = run_dir / 'checkpoint_critic_target_12.pth'

agent_1_weights = run_dir / 'checkpoint_actor_22.pth'
agent_1_critic_weights = run_dir / 'checkpoint_critic_22.pth'
agent_1_actor_target = run_dir / 'checkpoint_actor_target_22.pth'
agent_1_critic_target = run_dir / 'checkpoint_critic_target_22.pth'


def _restore(network, checkpoint_path):
    """Load a saved state dict into ``network``.

    ``map_location=device`` remaps the stored tensors onto the device chosen
    in the configuration cell, so a checkpoint written on a GPU machine can
    still be loaded on a CPU-only one.
    """
    network.load_state_dict(torch.load(checkpoint_path, map_location=device))


_restore(agent1.actor_local, agent_0_weights)
_restore(agent1.critic_local, agent_0_critic_weights)
_restore(agent1.actor_target, agent_0_actor_target)
_restore(agent1.critic_target, agent_0_critic_target)

# Agent 2's checkpoints go through agent2 (the target networks were previously
# loaded through agent1 by mistake). Because the networks are shared (see
# above), these loads overwrite the four loads just performed.
_restore(agent2.actor_local, agent_1_weights)
_restore(agent2.critic_local, agent_1_critic_weights)
_restore(agent2.actor_target, agent_1_actor_target)
_restore(agent2.critic_target, agent_1_critic_target)



In [7]:
#maddpg = Maddpgv2.init_from_save(run_dir / 'model2.pth')

In [8]:
def split_state(state):
    """Return the two per-agent observations, each tagged with an identity flag.

    Row 0 of ``state`` gets ``1`` appended and row 1 gets ``-1`` appended.

    Returns:
        tuple: ``(state1, state2)``, each one element longer than its source row.
    """
    tagged = [
        np.concatenate([row, [flag]])
        for row, flag in ((state[0], 1), (state[1], -1))
    ]
    return tagged[0], tagged[1]

In [9]:

# Per-episode bookkeeping.
scores1 = []       # undiscounted episode return of agent 1
scores2 = []       # undiscounted episode return of agent 2
mean_scores = []   # per-episode mean of the two returns
t_step = 0         # global environment-step counter, passed to agent.step


def _save_checkpoints(agent, suffix):
    """Write the four networks of ``agent`` to ``run_dir`` using ``suffix`` in the file names."""
    torch.save(agent.actor_local.state_dict(), run_dir / 'checkpoint_actor_{}.pth'.format(suffix))
    torch.save(agent.critic_local.state_dict(), run_dir / 'checkpoint_critic_{}.pth'.format(suffix))
    torch.save(agent.actor_target.state_dict(), run_dir / 'checkpoint_actor_target_{}.pth'.format(suffix))
    torch.save(agent.critic_target.state_dict(), run_dir / 'checkpoint_critic_target_{}.pth'.format(suffix))


for i_episode in range(0, number_of_episodes + 1):
    env_info = env.reset(train_mode=True)[brain_name]
    state = env_info.vector_observations
    agent1.reset()
    agent2.reset()
    score1 = 0
    score2 = 0

    while True:
        # Tag each agent's observation with its identity flag (+1 / -1) and
        # give it a leading batch dimension before asking for an action.
        obs1, obs2 = split_state(state)
        state1 = obs1.reshape((1, -1))
        state2 = obs2.reshape((1, -1))
        action1 = agent1.act(state1)
        action2 = agent2.act(state2)

        env_info = env.step([action1, action2])[brain_name]
        next_state = env_info.vector_observations
        reward = env_info.rewards
        done = env_info.local_done

        # Next observations are tagged the same way (kept 1-D, as before).
        next_state1, next_state2 = split_state(next_state)

        # Both agents learn from the mean of the two rewards.
        shared_reward = np.mean(reward)
        agent1.step(state1, action1, shared_reward, next_state1, done[0], t_step)
        agent2.step(state2, action2, shared_reward, next_state2, done[1], t_step)

        state = next_state
        score1 += reward[0]
        score2 += reward[1]
        t_step += 1
        if np.any(done):
            break

    scores1.append(score1)
    scores2.append(score2)
    mean_scores.append(np.mean([score1, score2]))

    # Running averages over (at most) the last 100 episodes.
    mean1 = np.mean(scores1[-100:])
    mean2 = np.mean(scores2[-100:])
    mean_score = np.mean(mean_scores[-100:])

    progress = '\rEpisode {}\t Mean Score for Agent 1: {:.3f}, Mean Score for Agent 2: {:.3f}, Mean Score of both Agents: {:.3f}'.format(
        i_episode, mean1, mean2, mean_score)
    print(progress, end="")

    # NOTE(review): the threshold is checked even when fewer than 100 episodes
    # have been averaged -- confirm that an early stop is acceptable.
    solved = mean_score > 1

    if i_episode % 100 == 0:
        # Keep one progress line per 100 episodes in the output.
        print(progress)

    # Checkpoint every 100 episodes, and also right before stopping so the
    # weights that reached the threshold are not lost.
    if i_episode % 100 == 0 or solved:
        _save_checkpoints(agent1, 1)
        _save_checkpoints(agent2, 2)

    if solved:
        break

# Plot the per-episode mean score. This must be the list `mean_scores`;
# the scalar running average `mean_score` has no len() and cannot be plotted
# against an episode axis.
scores = mean_scores

fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(np.arange(1, len(scores) + 1), scores, 'r-')
ax.set_ylabel('Score')
ax.set_xlabel('Episode #')
plt.show()

Episode 0	 Mean Score for Agent 1: 0.400, Mean Score for Agent 2: 0.390, Mean Score of both Agents: 0.395
Episode 81	 Mean Score for Agent 1: 0.001, Mean Score for Agent 2: -0.001, Mean Score of both Agents: -0.000

KeyboardInterrupt: 

In [None]:
# NOTE(review): leftover scratch arithmetic; its purpose is not evident from
# this notebook -- confirm what it was sizing, or delete the cell.
256+(2*2)