In [1]:
import torch
import time
import os
import numpy as np

from pathlib import Path
from torch.autograd import Variable
from tensorboardX import SummaryWriter

from model import MADDPG

In [2]:
from unityagents import UnityEnvironment

In [3]:
# Location of the Tennis build, relative to the notebook's working directory.
tennis_build = './Tennis_Windows_x86_64/Tennis.exe'
env = UnityEnvironment(file_name=tennis_build)

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: TennisBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 8
        Number of stacked Vector Observation: 3
        Vector Action space type: continuous
        Vector Action space size (per agent): 2
        Vector Action descriptions: , 


In [4]:
# The first brain listed by the environment is treated as the default one.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# Reset in training mode and keep only the info belonging to that brain.
env_info = env.reset(train_mode=True)[brain_name]

# Collect the problem dimensions up front so later cells can reuse them.
num_agents = len(env_info.agents)                # number of agents
action_size = brain.vector_action_space_size     # size of each action
states = env_info.vector_observations            # one row per agent
state_size = states.shape[1]                     # length of one observation row

# Report what was found.
print('Number of agents:', num_agents)
print('Size of each action:', action_size)
print(
    'There are {} agents. Each observes a state with length: {}'.format(
        states.shape[0], state_size
    )
)
print('The state for the first agent looks like:', states[0])

Number of agents: 2
Size of each action: 2
There are 2 agents. Each observes a state with length: 24
The state for the first agent looks like: [ 0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.         -6.65278625 -1.5
 -0.          0.          6.83172083  6.         -0.          0.        ]


In [None]:
# Name of the current run -- edit this to keep the artefacts of different runs apart.
curr_run = 'run1'
model_dir = Path('./models')
run_dir = model_dir / curr_run
log_dir = run_dir / 'logs'

# log_dir is nested inside run_dir, so a single call creates the whole chain.
# exist_ok=True keeps the cell re-runnable: the previous os.makedirs(run_dir)
# raised FileExistsError as soon as the run folder was already present.
# NOTE(review): re-using an existing curr_run means new event files land next to
# the old ones in the same log folder -- change curr_run if that is not wanted.
log_dir.mkdir(parents=True, exist_ok=True)

# SummaryWriter is handed the log directory as a plain string.
logger = SummaryWriter(str(log_dir))

In [5]:

# --- Training schedule -------------------------------------------------------
# Deliberately tiny for a quick run; raise number_of_episodes (say 30000)
# to experiment properly.
number_of_episodes, episode_length, batchsize = 10, 100, 128

# --- Exploration -------------------------------------------------------------
# Amplitude of the OU noise and the factor meant to shrink it slowly towards 0.
noise, noise_reduction = 1, 0.9999

# --- Network sizes and optimisation -----------------------------------------
hidden_dim_actor, hidden_dim_critic = 120, 64
gamma, tau = 0.95, 0.02
lr_actor, lr_critic = 1.0e-4, 1.0e-3

# Prefer the first CUDA device when one is available, otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Per-agent construction parameters; the sizes come from the environment
# inspection cell (state_size / action_size).
agent_init_params = dict(
    num_in_pol=state_size,
    num_out_pol=action_size,
    num_in_critic=state_size,
    hidden_dim_actor=hidden_dim_actor,
    hidden_dim_critic=hidden_dim_critic,
    lr_actor=lr_actor,
    lr_critic=lr_critic,
)

# Keyword arguments later unpacked into MADDPG(**init_dict).
init_dict = dict(
    gamma=gamma,
    tau=tau,
    alg_types='MADDPG',
    agent_init_params=agent_init_params,
    num_agents=num_agents,
    discrete_action=False,
)



In [14]:

# Build the multi-agent learner from the configuration assembled earlier.
maddpg = MADDPG(**init_dict)

# Score containers; nothing in this cell fills them yet.
scores1 = []
scores2 = []
mean_scores = []
maddpg.prep_rollouts(device=device)

# Smoke test: a single episode of at most three environment steps, printing
# what the environment returns after each step.
for i_episode in range(0, 1):
    env_info = env.reset(train_mode=True)[brain_name]
    state = env_info.vector_observations
    maddpg.reset_noise()
    score1 = 0
    score2 = 0
    i = 0
    while True:
        # Per-agent observation with an extra +1 / -1 flag appended, shaped as a
        # single row. NOTE(review): not consumed by the step below -- confirm
        # whether the model is meant to receive these instead of `state`.
        state1 = np.concatenate([state[0], [1]]).reshape((1,state.shape[1]+1))
        state2 = np.concatenate([state[1], [-1]]).reshape((1,state.shape[1]+1))

        torch_actions = maddpg.step(state)
        # Convert actions to numpy arrays. detach().cpu() makes this work when
        # the actions live on the GPU as well (a plain .numpy() raises for CUDA
        # tensors) and is a no-op for CPU tensors.
        agent_actions = [ac.detach().cpu().numpy() for ac in torch_actions]

        env_info = env.step(agent_actions)[brain_name]
        next_state = env_info.vector_observations
        reward = env_info.rewards
        done = env_info.local_done

        print(next_state)
        print(reward)

        # Advance the rollout: without this every step would act on the
        # observation returned by the initial reset.
        state = next_state

        i+=1
        # Stop after three steps, or earlier once any agent reports the
        # episode as finished.
        if i > 2 or any(done):
            break
        
        
        
            
        
    
    
    

2
[array([[ 0.03169067, -0.10323034]], dtype=float32), array([[ 0.15344222, -0.03064415]], dtype=float32)]
2
[array([[ 0.13277301, -0.23154151]], dtype=float32), array([[0.3483399 , 0.06052719]], dtype=float32)]
2
[array([[-0.07500847,  0.06182102]], dtype=float32), array([[-0.1567443 ,  0.05771503]], dtype=float32)]


In [None]:
# Display the observation array currently bound to `state` (set in the rollout cell).
state

In [None]:
# Debug aid: walk agents and observation rows in lock-step and show, for each
# pair, the agent object, its raw observation, the observation's shape, and the
# observation with a leading axis added.
for a, obs in zip(maddpg.agents, state):
    print(a, obs, obs.shape, obs[np.newaxis, ...], sep='\n')
    #print(Variable(torch.Tensor(obs),requires_grad=False))

In [None]:
# First row of `state` with a trailing 1 appended, shown as a single-row 2-D array.
np.concatenate((state[0], [1]))[np.newaxis, :]

In [None]:
 # One non-differentiable tensor per agent index, each built from a column slice
 # of `state` stacked vertically.
 # NOTE(review): state[:, k] selects column k across all rows -- confirm this is
 # the intended per-agent slice for this environment's observation layout.
 [
     Variable(torch.Tensor(np.vstack(state[:, agent_idx])), requires_grad=False)
     for agent_idx in range(maddpg.nagents)
 ]

In [None]:
# Probe: what np.vstack produces when given the first column of `state`.
np.vstack(state[:, 0])

In [None]:
# Display `state` again for comparison with the experiments above.
state