# Collaboration and Competition

---

In this notebook, I present my MADDPG-based solution to the third project, Collaboration and Competition, of the [Deep Reinforcement Learning Nanodegree](https://www.udacity.com/course/deep-reinforcement-learning-nanodegree--nd893) program.

### Start the Environment

We begin by importing the necessary packages and setting the path of the Unity Environment.

In [1]:
from unityagents import UnityEnvironment
import numpy as np

# Launch the Tennis environment from its Windows build. The path is a
# Windows-style path relative to the working directory; on another platform,
# point file_name at the matching build instead.
env = UnityEnvironment(file_name=r".\Tennis_Windows_x86_64\Tennis.exe")

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: TennisBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 8
        Number of stacked Vector Observation: 3
        Vector Action space type: continuous
        Vector Action space size (per agent): 2
        Vector Action descriptions: , 


### Set the Environment Brain

In [2]:
# Use the first brain the environment exposes as the default brain.
# `brain_name` is the key used to index the results of env.reset()/env.step();
# `brain` is the matching entry of env.brains (its action space size is read later).
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

### Examine the State and Action Spaces


In [3]:
# Start a fresh episode in training mode and keep only this brain's info.
env_info = env.reset(train_mode=True)[brain_name]

# Observations: the first axis is reported as the agent count, the second as the state length.
states = env_info.vector_observations
state_size = states.shape[1]

# Agent count and per-agent action dimension.
num_agents = len(env_info.agents)
action_size = brain.vector_action_space_size

# Report everything in one place.
print('Number of agents:', num_agents)
print('Size of each action:', action_size)
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

Number of agents: 2
Size of each action: 2
There are 2 agents. Each observes a state with length: 24
The state for the first agent looks like: [ 0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.         -6.65278625 -1.5
 -0.          0.          6.83172083  6.         -0.          0.        ]


### Training the agents



In [4]:
from collections import deque
import matplotlib.pyplot as plt
from maddpg_agent import MADDPGAgent
import torch
import time

%matplotlib inline

# Build the multi-agent learner from the state and action sizes measured above.
# NOTE(review): random_seed=1 presumably seeds the agent's RNGs for reproducibility — confirm in maddpg_agent.
agent = MADDPGAgent(state_size, action_size, random_seed=1)

Device used:  cpu


In [5]:
def maddpg(n_episodes=3000, max_t=1000, print_every=100, success_criteria_count=100, target_score=0.5):
    """Train the global ``agent`` in the global ``env`` until the task is solved.

    The score of an episode is the maximum of the per-agent cumulative rewards.
    Training stops early once the mean episode score over a *full* window of
    ``success_criteria_count`` episodes reaches ``target_score``; at that point
    the local actor and critic weights of every sub-agent are saved to
    ``checkpoint_actor_<k>_maddpg.pth`` / ``checkpoint_critic_<k>_maddpg.pth``
    (``k`` counted from 1).

    Args:
        n_episodes: maximum number of training episodes.
        max_t: maximum number of environment steps per episode.
        print_every: a progress line is printed every ``print_every`` episodes.
        success_criteria_count: number of most recent episodes in the rolling mean.
        target_score: rolling-mean score at which the environment counts as solved.

    Returns:
        A tuple ``(scores, agent_scores, agent_mean_scores_100eps)``:
        per-episode arrays of per-agent scores, per-episode max-over-agents
        scores, and the rolling mean of the latter after each episode.
    """
    scores_deque = deque(maxlen=success_criteria_count)
    scores = []
    agent_scores = []
    agent_mean_scores_100eps = []

    for i_episode in range(1, n_episodes + 1):
        start_t = time.time()
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations
        agent.reset()
        score = np.zeros((num_agents,))
        for t in range(max_t):
            action = agent.act(state)
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations
            reward = env_info.rewards
            done = env_info.local_done
            agent.step(state, action, reward, next_state, done, t)
            state = next_state
            score += reward
            # The episode ends as soon as any agent reports done.
            if np.any(done):
                break

        episode_score = np.max(score)
        scores_deque.append(episode_score)
        scores.append(score)
        agent_scores.append(episode_score)
        agent_mean_scores_100eps.append(np.mean(scores_deque))

        end_t = time.time()

        # Only a full window counts: otherwise one lucky early episode would
        # satisfy the criterion before enough episodes have been averaged.
        window_full = len(scores_deque) == scores_deque.maxlen
        if window_full and agent_mean_scores_100eps[-1] >= target_score:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.5f}'.format(i_episode, agent_mean_scores_100eps[-1]), end="")
            # Same files as before: actor then critic, sub-agents numbered from 1.
            for agent_no, sub_agent in enumerate(agent.maddpg_agent, start=1):
                torch.save(sub_agent.actor_local.state_dict(), 'checkpoint_actor_{}_maddpg.pth'.format(agent_no))
                torch.save(sub_agent.critic_local.state_dict(), 'checkpoint_critic_{}_maddpg.pth'.format(agent_no))
            break
        if i_episode % print_every == 0:
            print('Episode {:03d}:\tAgents Mean Score over last {:d} episodes: {:.5f} (time per eps:{:.1f} secs)'.format(i_episode, success_criteria_count, agent_mean_scores_100eps[-1], end_t - start_t), end="\n")
    return scores, agent_scores, agent_mean_scores_100eps

# Train with the default settings and keep the three score histories
# (per-agent scores, per-episode max score, rolling mean) for later use.
scores, agent_scores, agent_mean_scores_100eps = maddpg()


Episode 100:	Agents Mean Score over last 100 episodes: 0.00290 (time per eps:0.3 secs)
Episode 200:	Agents Mean Score over last 100 episodes: 0.00090 (time per eps:0.3 secs)
Episode 300:	Agents Mean Score over last 100 episodes: 0.01670 (time per eps:0.3 secs)
Episode 400:	Agents Mean Score over last 100 episodes: 0.01450 (time per eps:0.6 secs)
Episode 500:	Agents Mean Score over last 100 episodes: 0.01280 (time per eps:0.6 secs)
Episode 600:	Agents Mean Score over last 100 episodes: 0.01300 (time per eps:0.3 secs)
Episode 700:	Agents Mean Score over last 100 episodes: 0.02460 (time per eps:0.3 secs)
Episode 800:	Agents Mean Score over last 100 episodes: 0.01560 (time per eps:0.3 secs)
Episode 900:	Agents Mean Score over last 100 episodes: 0.03350 (time per eps:0.2 secs)
Episode 1000:	Agents Mean Score over last 100 episodes: 0.05620 (time per eps:0.3 secs)
Episode 1100:	Agents Mean Score over last 100 episodes: 0.06040 (time per eps:0.3 secs)
Episode 1200:	Agents Mean Score over last

KeyboardInterrupt: 

In [None]:
# Summarise the first agent's local actor by parameter shape rather than
# dumping every weight tensor into the cell output.
[tuple(p.shape) for p in agent.maddpg_agent[0].actor_local.parameters()]

In [None]:
# Shape of the parameter at index 5 of the first agent's local actor.
list(agent.maddpg_agent[0].actor_local.parameters())[5].shape

In [6]:
# Save the current local actor and critic weights of both agents,
# in the order actor 1, critic 1, actor 2, critic 2.
first_agent, second_agent = agent.maddpg_agent[0], agent.maddpg_agent[1]
checkpoints = [
    (first_agent.actor_local, 'checkpoint_actor_1_maddpg.pth'),
    (first_agent.critic_local, 'checkpoint_critic_1_maddpg.pth'),
    (second_agent.actor_local, 'checkpoint_actor_2_maddpg.pth'),
    (second_agent.critic_local, 'checkpoint_critic_2_maddpg.pth'),
]
for network, filename in checkpoints:
    torch.save(network.state_dict(), filename)

In [None]:
# Episodes are numbered from 1 in the training loop, so both figures share
# the same 1-indexed x-axis.
episodes = np.arange(1, len(scores) + 1)

# plot the scores of every agent
fig, ax = plt.subplots()
ax.plot(episodes, scores)
ax.set_title('Score of Every Agent per Episode')
ax.set_ylabel('Score')
ax.set_xlabel('Episode #')
plt.show()

# plot the per-episode score (max over agents) and its rolling average
fig, ax = plt.subplots()
ax.plot(episodes, agent_scores, label='Score of Each Episode')
ax.plot(episodes, agent_mean_scores_100eps, c='r', label='Average Score over 100 Episodes')
ax.set_title('Episode Score and Rolling Average')
ax.set_ylabel('Score')
ax.set_xlabel('Episode #')
ax.legend(loc='lower right')
plt.show()

### Trained Agent in Action

In [8]:
# To evaluate the saved checkpoints instead of the in-memory weights, uncomment:
#agent.maddpg_agent[0].actor_local.load_state_dict(torch.load('checkpoint_actor_1_maddpg.pth'))
#agent.maddpg_agent[1].actor_local.load_state_dict(torch.load('checkpoint_actor_2_maddpg.pth'))

for i in range(1, 6):                                      # play game for 5 episodes
    env_info = env.reset(train_mode=False)[brain_name]     # reset the environment
    states = env_info.vector_observations                  # get the current state (for each agent)
    # Deliberately not named `scores`: that name holds the training history
    # returned by maddpg() and must survive this cell.
    episode_scores = np.zeros(num_agents)                  # initialize the score (for each agent)
    while True:
        actions = agent.act(states)                        # select an action (for each agent)
        # NOTE(review): clipping stays disabled as in the original; presumably
        # agent.act already bounds the actions — confirm.
        #actions = np.clip(actions, -1, 1)                  # all actions between -1 and 1
        env_info = env.step(actions)[brain_name]           # send all actions to the environment
        next_states = env_info.vector_observations         # get next state (for each agent)
        rewards = env_info.rewards                         # get reward (for each agent)
        dones = env_info.local_done                        # see if episode finished
        episode_scores += rewards                          # update the score (for each agent)
        states = next_states                               # roll over states to next time step
        if np.any(dones):                                  # exit loop if episode finished
            break
    print('Score (max over agents) from episode {}: {}'.format(i, np.max(episode_scores)))

Score (max over agents) from episode 1: 2.0000000298023224
Score (max over agents) from episode 2: 0.800000011920929
Score (max over agents) from episode 3: 0.5000000074505806
Score (max over agents) from episode 4: 0.30000000447034836
Score (max over agents) from episode 5: 0.6000000089406967


### When finished, you can close the environment.

In [None]:
# Reset the environment, then close it.
# NOTE(review): the reset immediately before close looks unnecessary — confirm before removing.
env.reset()
env.close()