In [None]:
!python --version # should say 3.7.16

Python 3.7.16


In [2]:
import os
from agent import Agent, recursive_obs_dict_to_spaces_dict
from action_net import ActionNet
os.chdir("..")
from rice import Rice

import torch
from collections import deque
import numpy as np

from tqdm import tqdm

from typing import List

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Single environment instance, built with Rice's default constructor arguments.
# create_agents(), reinforce() and evaluate_agents() all read it as a global.
env = Rice()

In [4]:
env.num_agents # this environment is configured with only 4 agents

4

In [5]:
# Dict keyed by agent id; each entry is a MultiDiscrete space with
# 11 components of 10 levels each (see the output below).
env.action_space

{0: MultiDiscrete([10 10 10 10 10 10 10 10 10 10 10]),
 1: MultiDiscrete([10 10 10 10 10 10 10 10 10 10 10]),
 2: MultiDiscrete([10 10 10 10 10 10 10 10 10 10 10]),
 3: MultiDiscrete([10 10 10 10 10 10 10 10 10 10 10])}

In [6]:
# Used below as the number of env.step() calls made per rollout.
env.episode_length

20

In [7]:
def create_agents() -> List[Agent]:
    """Build one ``Agent`` per entry of a fresh environment observation.

    Resets the module-level ``env`` and, for every key of the returned
    observation dict, constructs an ``Agent`` from that key's own observation
    (converted with ``recursive_obs_dict_to_spaces_dict``) and that key's own
    entry of ``env.action_space``.

    Returns:
        The agents, in the iteration order of the observation dict. Each
        agent's ``id`` is the observation key it was built from.
    """
    initial_observation = env.reset()
    # Index by ``key`` instead of a hard-coded 0: every agent is then built
    # from its own spaces, and ids that do not start at 0 no longer fail.
    return [
        Agent(
            recursive_obs_dict_to_spaces_dict(initial_observation[key]),
            env.action_space[key],
            id = key
        )
        for key in initial_observation
    ]

In [8]:
# Freshly initialised agents; reinforce() later optimises their nets[0] parameters in place.
agents = create_agents()

In [9]:
# Adapted from https://huggingface.co/deep-rl-course/unit4/hands-on?fw=pt
def _discounted_returns(step_rewards, gamma):
    """Return ``[G_0, ..., G_{T-1}]`` where ``G_t = step_rewards[t] + gamma * G_{t+1}`` and ``G_T = 0``."""
    returns = []
    running = 0
    # Walk the episode backwards so each return reuses the one after it.
    for step_reward in reversed(step_rewards):
        running = gamma*running + step_reward
        returns.append(running)
    returns.reverse()
    return returns


def reinforce(agents : List[Agent], 
              n_training_episodes : int, 
              gamma : float,
              lr : float = .0005) -> None:
    """Train each agent independently with the REINFORCE policy-gradient rule.

    For every training episode the module-level ``env`` is reset and rolled
    out for ``env.episode_length`` steps, with all agents acting at each step.
    Afterwards, each agent's discounted returns are computed and standardised,
    and one Adam step is taken on the loss ``sum_t(-log_prob_t * return_t)``.

    Args:
        agents: Agents to train. Each must expose ``id``, ``nets`` and
            ``act(0, observation) -> (action, log_prob)``. The index ``0``
            is passed through unchanged from the original code
            (NOTE(review): presumably selects the agent's first network;
            confirm against ``agent.py``).
        n_training_episodes: Number of episodes to roll out and learn from.
        gamma: Discount factor applied to future rewards.
        lr: Learning rate of the per-agent Adam optimizers. The default is
            the value that was previously hard-coded.

    Returns:
        None. The parameters of ``agent.nets[0]`` are updated in place.
    """
    optimizers = {
        agent.id : torch.optim.Adam(agent.nets[0].parameters(), lr=lr)
        for agent in agents
    }

    # Loop-invariant: keeps the standardisation finite when all returns are equal.
    eps = np.finfo(np.float32).eps.item()

    for _episode in tqdm(range(1, n_training_episodes+1)):
        saved_log_probs = {agent.id : [] for agent in agents}
        rewards = {agent.id : [] for agent in agents}
        state = env.reset()

        # Generate a whole episode
        for _step in range(env.episode_length):
            collective_action = {}

            for agent in agents:
                action, log_prob = agent.act(0, state[agent.id])
                saved_log_probs[agent.id].append(log_prob)
                collective_action[agent.id] = np.array(action)

            state, reward, _done, _info = env.step(collective_action)

            for agent in agents:
                rewards[agent.id].append(reward[agent.id])

        # Calculate loss and update weights, one agent at a time
        for agent in agents:
            returns = torch.tensor(_discounted_returns(rewards[agent.id], gamma))

            # Standardize returns. With a single step the sample std is NaN,
            # which would turn the loss and then the weights into NaN, so
            # the raw return is used in that case.
            if returns.numel() > 1:
                returns = (returns - returns.mean()) / (returns.std() + eps)

            loss = torch.cat([
                -log_prob * disc_return
                for log_prob, disc_return in zip(saved_log_probs[agent.id], returns)
            ]).sum()

            optimizers[agent.id].zero_grad()
            loss.backward()
            optimizers[agent.id].step()

In [10]:
# gamma = 1. leaves future rewards undiscounted; the run below took about 3m40s for 500 episodes.
reinforce(agents, n_training_episodes = 500, gamma = 1.)

100%|██████████████████████████████████████████████████████| 500/500 [03:40<00:00,  2.27it/s]


In [None]:
def evaluate_agents(agents : List[Agent]) -> dict:
    """Roll the given agents through one episode and return the final global state.

    The module-level ``env`` is reset, then stepped ``env.episode_length``
    times. At every step each agent picks an action via
    ``agent.act(0, observation)``; the log-probability it also returns is
    discarded, and no learning takes place here.

    Args:
        agents: Agents to run. Each is looked up in the observation dict by
            its ``id``.

    Returns:
        ``env.global_state`` as it stands after the last step.
    """
    observations = env.reset()
    for _step in range(env.episode_length):
        joint_action = {}

        for participant in agents:
            chosen, _log_prob = participant.act(0, observations[participant.id])
            joint_action[participant.id] = np.array(chosen)

        # Only the next observations are needed; rewards are read from the
        # environment's global state once the rollout is over.
        observations, _reward, _done, _info = env.step(joint_action)
    return env.global_state

In [None]:
def baseline() -> dict:
    """Evaluate a freshly created, never-trained set of agents for comparison.

    Returns:
        Whatever ``evaluate_agents`` returns for agents newly built by
        ``create_agents()``.
    """
    untrained_agents = create_agents()
    return evaluate_agents(untrained_agents)

In [None]:
# Run one evaluation episode with the trained agents and keep only the
# "value" entry of the environment's "reward_all_regions" record.
training_rewards = evaluate_agents(agents)["reward_all_regions"]["value"]

In [None]:
# Same quantity for freshly created, untrained agents, to compare against training_rewards.
baseline_rewards = baseline()["reward_all_regions"]["value"]

In [None]:
training_rewards # indexed [i, j]: i = timestep, j = agent id

In [None]:
baseline_rewards # presumably the same [timestep, agent id] layout as training_rewards; confirm