In [None]:
# default_exp loops

# loops

> This module includes useful interaction loops for different types of RL agents. It will be updated over time.

In [None]:
#hide
from nbdev import *

In [None]:
%nbdev_export
import gym
import numpy as np
from rl_bolts import buffers, env_wrappers, neuralnets
import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
%nbdev_export
def polgrad_interaction_loop(
    env: gym.Env,
    agent: nn.Module,
    buffer: buffers.PGBuffer,
    num_interactions: int = 4000,
    horizon: int = 1000
):
    """
    Interaction loop for actor-critic policy gradient agent.

    This loop does not handle converting between PyTorch Tensors and NumPy arrays. So either your env should first be wrapped
    in `ToTorchWrapper` or your agent should accept and return NumPy arrays.

    At every step the agent's `step(obs)` is called and must return `(action, logp, value)`; the transition is
    stored in the buffer with `buffer.store(obs, action, reward, value, logp)`. Whenever a trajectory is cut
    (environment reports `done`, the `horizon` is reached, or the last interaction of the epoch is collected),
    `buffer.finish_path(last_val)` is called and the environment is reset.

    `last_val` is `agent.value_f(obs)` of the observation following the cut when the trajectory was truncated
    (horizon reached, or the epoch ended while the episode was still running) and `0` when the environment
    itself reported a terminal state.

    Args:
    - env (gym.Env): Environment to run in.
    - agent (nn.Module): Agent to run within the environment, generates actions, values, and logprobs at each step.
      Must expose `step(obs)` and `value_f(obs)`.
    - buffer (rl_bolts.buffers.PGBuffer-like): Buffer object with same API and function signatures as the PGBuffer.
    - num_interactions (int): How many interactions to collect in the environment.
    - horizon (int): Maximum allowed episode length.

    Returns:
    - buffer (rl_bolts.buffers.PGBuffer-like): Buffer filled with interactions.
    - infos (dict): Dictionary of reward and episode length statistics. Only episodes that ended (done or horizon
      reached) are counted; if no episode ended during the loop every statistic is NaN.
    - env_infos (list of dicts): List of all info dicts from the environment.
    """

    env_infos = []

    # Per-episode return / length of every episode that finished during this loop.
    rets = []
    lens = []

    # Running return / length of the episode currently in progress.
    ret = 0
    length = 0

    obs = env.reset()

    for i in range(num_interactions):
        action, logp, value = agent.step(obs)

        next_obs, reward, done, env_info = env.step(action)
        env_infos.append(env_info)

        buffer.store(obs, action, reward, value, logp)

        ret += reward
        length += 1

        obs = next_obs

        timeup = length == horizon
        over = done or timeup
        epoch_ended = i == num_interactions - 1

        if over or epoch_ended:
            # Bootstrap the value of the next observation only when the trajectory was
            # truncated: the horizon was hit, or the epoch ran out mid-episode. A genuine
            # terminal state has no future reward, so its value is 0 even when it happens
            # to coincide with the last interaction of the epoch.
            truncated = timeup or not done
            if truncated:
                with torch.no_grad():
                    last_val = agent.value_f(obs)
            else:
                last_val = 0

            buffer.finish_path(last_val)

            # A trajectory cut only by the end of the epoch is not a full episode, so it
            # is left out of the statistics.
            if over:
                rets.append(ret)
                lens.append(length)

            obs, ret, length = env.reset(), 0, 0

    if rets:
        infos = {
            "MeanEpReturn": np.mean(rets),
            "StdEpReturn": np.std(rets),
            "MaxEpReturn": np.max(rets),
            "MinEpReturn": np.min(rets),
            "MeanEpLength": np.mean(lens),
            "StdEpLength": np.std(lens)
        }
    else:
        # No episode finished (e.g. num_interactions shorter than one episode):
        # np.max / np.min raise on empty input, so report NaN instead of crashing
        # and discarding the filled buffer.
        infos = {
            key: float("nan")
            for key in (
                "MeanEpReturn",
                "StdEpReturn",
                "MaxEpReturn",
                "MinEpReturn",
                "MeanEpLength",
                "StdEpLength",
            )
        }

    return buffer, infos, env_infos

In [None]:
show_doc(polgrad_interaction_loop)

<h4 id="polgrad_interaction_loop" class="doc_header"><code>polgrad_interaction_loop</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h4>

> <code>polgrad_interaction_loop</code>(**`env`**:`Env`, **`agent`**:`Module`, **`buffer`**:[`PGBuffer`](/rl_bolts/buffers#PGBuffer), **`num_interactions`**:`int`=*`4000`*, **`horizon`**:`int`=*`1000`*)

Interaction loop for actor-critic policy gradient agent.

This loop does not handle converting between PyTorch Tensors and NumPy arrays. So either your env should first be wrapped
in [`ToTorchWrapper`](/rl_bolts/env_wrappers#ToTorchWrapper) or your agent should accept and return NumPy arrays.

Args:
- env (gym.Env): Environment to run in. 
- agent (nn.Module): Agent to run within the environment, generates actions, values, and logprobs at each step.
- buffer (rl_bolts.buffers.PGBuffer-like): Buffer object with same API and function signatures as the PGBuffer.
- num_interactions (int): How many interactions to collect in the environment.
- horizon (int): Maximum allowed episode length.

Returns:
- buffer (rl_bolts.buffers.PGBuffer-like): Buffer filled with interactions.
- infos (dict): Dictionary of reward and episode length statistics.
- env_infos (list of dicts): List of all info dicts from the environment.

Here we demonstrate hypothetical usage of the interaction loop.

In [None]:
# Build CartPole and wrap it so data is converted to/from torch.Tensors.
env = env_wrappers.ToTorchWrapper(gym.make("CartPole-v1"))

# Actor-critic agent, built from the observation size and the action space.
agent = neuralnets.ActorCritic(env.observation_space.shape[0], env.action_space)

# Empty buffer; the third argument (4000) equals the loop's default
# `num_interactions` -- presumably the buffer capacity, confirm against PGBuffer.
buf = buffers.PGBuffer(env.observation_space.shape, env.action_space.shape, 4000)

# Run the loop with its default settings; this fills the buffer.
full_buf, infos, env_infos = polgrad_interaction_loop(env, agent, buf)

# Print the loop statistics, one per line.
for k, v in infos.items():
    print(f"{k}: {v}")

MeanEpReturn: 25.477707006369428
StdEpReturn: 14.071059873100223
MaxEpReturn: 100.0
MinEpReturn: 9.0
MeanEpLength: 25.477707006369428
StdEpLength: 14.071059873100223


In [None]:
#hide
notebook2script()

Converted 00_utils.ipynb.
Converted 01_datasets.ipynb.
Converted 02_buffers.ipynb.
Converted 03_neuralnets.ipynb.
Converted 04_losses.ipynb.
Converted 05_env_wrappers.ipynb.
Converted 06_loops.ipynb.
Converted 07_algorithms.ipynb.
Converted index.ipynb.
