In [1]:
import torch
import time
import os
import numpy as np

from pathlib import Path
from torch.autograd import Variable
from tensorboardX import SummaryWriter

from model import MADDPG

In [2]:
from unityagents import UnityEnvironment

In [3]:
# Launch the Tennis simulator from the Windows build that sits next to this notebook.
tennis_binary = './Tennis_Windows_x86_64/Tennis.exe'
env = UnityEnvironment(file_name=tennis_binary)

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: TennisBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 8
        Number of stacked Vector Observation: 3
        Vector Action space type: continuous
        Vector Action space size (per agent): 2
        Vector Action descriptions: , 


In [4]:
# The first registered brain is the one this notebook controls.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# Start a fresh episode in training mode and keep this brain's slice of the result.
env_info = env.reset(train_mode=True)[brain_name]

# Dimensions the rest of the notebook relies on.
num_agents = len(env_info.agents)              # how many agents the environment reports
action_size = brain.vector_action_space_size   # length of one agent's action vector
states = env_info.vector_observations          # one observation row per agent
state_size = states.shape[1]                   # length of one agent's observation

# Summarise what was found.
print('Number of agents:', num_agents)
print('Size of each action:', action_size)
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

Number of agents: 2
Size of each action: 2
There are 2 agents. Each observes a state with length: 24
The state for the first agent looks like: [ 0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.         -6.65278625 -1.5
 -0.          0.          6.83172083  6.         -0.          0.        ]


In [None]:
# Name of the current experiment; edit this to write into a different run folder.
curr_run = 'run6'
model_dir = Path('./models')
run_dir = model_dir / curr_run
log_dir = run_dir / 'logs'

# Create run_dir and log_dir explicitly instead of relying on the summary
# writer to do it. parents=True also creates ./models and run_dir, and
# exist_ok=True keeps this cell safe to re-run for an existing run.
log_dir.mkdir(parents=True, exist_ok=True)

# TensorBoard writer used by the training cell for logging.
logger = SummaryWriter(str(log_dir))

In [None]:

# ---- Training schedule ----
number_of_episodes = 10000   # number of episodes the training loop iterates over
episode_length = 50          # forwarded to the agents as 'max_episode_len'
batch_size = 124             # forwarded to the agents as 'batch_size'

# ---- Exploration noise ----
# Starting amplitude of the OU noise and the factor meant to shrink it towards 0.
# NOTE(review): neither value is referenced by the cells visible in this
# notebook — confirm where (or whether) they are consumed.
noise = 1
noise_reduction = 0.9999

# ---- Network and optimiser settings ----
hidden_dim_actor = hidden_dim_critic = 164
gamma = 0.99
tau = 0.001
lr_actor = lr_critic = 0.001

# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Settings bundled under 'agent_init_params' for the MADDPG constructor.
agent_init_params = dict(
    num_in_pol=state_size,
    num_out_pol=action_size,
    num_in_critic=state_size,
    hidden_dim_actor=hidden_dim_actor,
    hidden_dim_critic=hidden_dim_critic,
    tau=tau,
    gamma=gamma,
    lr_actor=lr_actor,
    lr_critic=lr_critic,
    batch_size=batch_size,
    max_episode_len=episode_length,
)

# Keyword arguments later unpacked into MADDPG(**init_dict).
init_dict = dict(
    alg_types='MADDPG',
    agent_init_params=agent_init_params,
    num_agents=num_agents,
    discrete_action=False,
)



In [None]:
#maddpg = MADDPG.init_from_save(run_dir / 'model4.pth')

In [None]:

# Train two MADDPG agents on the Tennis environment.
#
# Reads:  init_dict, device, env, brain_name, number_of_episodes, run_dir,
#         log_dir, logger (all defined in earlier cells).
# Writes: maddpg, scores1, scores2, mean_scores, t_step, plus a checkpoint
#         ('model4.pth') every 100 episodes and a final one ('model3.pth').

# Build a fresh learner from the configuration assembled earlier.
maddpg = MADDPG(**init_dict)

# Per-episode score history for each agent and for their average.
scores1 = []
scores2 = []
mean_scores = []

# NOTE(review): both prep calls are issued back to back on the same device;
# what each one does is defined in model.py — confirm both are required.
maddpg.prep_rollouts(device=device)
maddpg.prep_training(device=device)
t_step = 0  # environment-step counter shared across all episodes

progress_template = ('\rEpisode {}\t Mean Score for Agent 1: {:.3f}, '
                     'Mean Score for Agent 2: {:.3f}, Mean Score of both Agents: {:.3f}')

for i_episode in range(number_of_episodes):
    env_info = env.reset(train_mode=True)[brain_name]
    state = env_info.vector_observations
    maddpg.reset_noise()
    score1 = 0
    score2 = 0

    while True:
        state1 = state[0]
        state2 = state[1]

        # state[None, ...] adds a leading batch axis before calling the actor.
        # .cpu() is a no-op for CPU tensors and is required before .numpy()
        # if the actor output lives on the GPU (device may be "cuda:0").
        action1 = maddpg.agents[0].act(state1[None, ...]).data.cpu().numpy()
        action2 = maddpg.agents[1].act(state2[None, ...]).data.cpu().numpy()

        env_info = env.step([action1, action2])[brain_name]
        next_state = env_info.vector_observations
        reward = env_info.rewards
        done = env_info.local_done

        next_state1 = next_state[0]
        next_state2 = next_state[1]

        # Each agent gets its own index, observation and action, together with
        # the reward and done values of BOTH agents (the full env_info fields).
        maddpg.agents[0].step(0, state1, action1, reward, next_state1, done, t_step)
        maddpg.agents[1].step(1, state2, action2, reward, next_state2, done, t_step)

        state = next_state
        score1 += reward[0]
        score2 += reward[1]
        t_step += 1

        # The episode ends as soon as either agent reports done.
        if np.any(done):
            break

    scores1.append(score1)
    scores2.append(score2)
    mean_scores.append(np.mean([score1, score2]))

    # Rolling averages over (at most) the last 100 episodes.
    mean1 = np.mean(scores1[-100:])
    mean2 = np.mean(scores2[-100:])
    mean_score = np.mean(mean_scores[-100:])

    # Record the rolling mean under one fixed tag, indexed by episode, so the
    # TensorBoard log actually contains a training curve.
    logger.add_scalar('mean_episode_rewards', mean_score, i_episode)

    # Overwrite the progress line in place; every 200th episode terminate it
    # with a newline so that line stays in the output as a history entry.
    keep_line = i_episode % 200 == 0
    print(progress_template.format(i_episode, mean1, mean2, mean_score),
          end="\n" if keep_line else "")

    # Periodic checkpoint.
    if i_episode % 100 == 0:
        maddpg.save(run_dir / 'model4.pth')

# Final checkpoint and log export once all episodes have run.
maddpg.save(run_dir / 'model3.pth')
logger.export_scalars_to_json(str(log_dir / 'summary.json'))
logger.close()
    
    

        

Episode 0	 Mean Score for Agent 1: 0.000, Mean Score for Agent 2: -0.010, Mean Score of both Agents: -0.005
Episode 200	 Mean Score for Agent 1: -0.005, Mean Score for Agent 2: -0.005, Mean Score of both Agents: -0.005
Episode 400	 Mean Score for Agent 1: -0.006, Mean Score for Agent 2: -0.004, Mean Score of both Agents: -0.005
Episode 600	 Mean Score for Agent 1: -0.003, Mean Score for Agent 2: -0.005, Mean Score of both Agents: -0.004
Episode 800	 Mean Score for Agent 1: -0.005, Mean Score for Agent 2: -0.005, Mean Score of both Agents: -0.005
Episode 1000	 Mean Score for Agent 1: -0.005, Mean Score for Agent 2: -0.005, Mean Score of both Agents: -0.005
Episode 1200	 Mean Score for Agent 1: 0.004, Mean Score for Agent 2: -0.006, Mean Score of both Agents: -0.001
Episode 1400	 Mean Score for Agent 1: -0.004, Mean Score for Agent 2: -0.005, Mean Score of both Agents: -0.004
Episode 1600	 Mean Score for Agent 1: -0.004, Mean Score for Agent 2: -0.005, Mean Score of both Agents: -0.004
E

In [None]:
# Scratch check: turn a list of done flags into a float tensor and invert it.
a = [False] * 5
b = torch.from_numpy(np.array(a, dtype=np.uint8)).to(torch.float32)
for mask in (b, 1 - b):
    print(mask)

In [None]:
# Draw one batch from the first agent's replay buffer and split it into its five fields.
sampled_batch = maddpg.agents[0].replay_buffer.sample()
states, actions, rewards, next_states, dones = sampled_batch

In [None]:
# Show the shape of every field of the sampled replay batch.
print(states.shape)
print(actions.shape)
print(next_states.shape)

# Use the sampled `dones` batch here; `done` (singular) is the per-step value
# left over from the training loop, not part of the sample.
# NOTE(review): assumes `dones` exposes .shape like the other fields — confirm.
print(dones.shape)

print(rewards.shape)

In [None]:
# Reshape the sampled batch: observations become rows of width 24*2 and
# actions become (rows, 2, 2). 24 and 2 match the per-agent observation
# length and the agent/action counts printed by the environment cell.
joint_obs_shape = (-1, 24 * 2)
obs = states.reshape(joint_obs_shape)
acs = actions.reshape((-1, 2, 2))
next_obs = next_states.reshape(joint_obs_shape)
print(obs.shape, acs.shape, next_obs.shape, sep="\n")


In [None]:
# Convert the first entry of the sampled done flags to plain integers.
result = list(map(int, dones[0]))
print(result)

In [None]:
# Number of agent objects held by the MADDPG instance.
len(maddpg.agents)

In [None]:
# Scratch check: stacking two identical 2x2 blocks gives a (2, 2, 2) array.
pair = [[1, 2], [3, 4]]
a = np.array([pair, pair])
print(a.shape)

In [None]:
# Column 1 of the sampled rewards batch, for every row.
# NOTE(review): presumably the second agent's rewards — confirm against the replay buffer layout.
rewards[:,1]

In [None]:
# bool has no `toInt` attribute; int() is the way to convert a bool to an integer.
int(True)