In [1]:
### Feel free to disable GPU if you don't have one
import torch

# Prefer the first CUDA device when PyTorch can see one; otherwise fall back to the CPU.
# Every later cell that moves tensors or modules reads this `device`.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

import gym
import numpy as np

cuda:0


# Behavioral Cloning

## Rollout Storage

In [2]:
### Make a data structure to load and store Rollouts.
### At the very least, you'll need observations and actions.

class RolloutStorage:
    """
    In-memory container for rollouts used in behavioral cloning.

    Observations and actions are loaded from ``.npy`` files into the
    ``obs`` and ``action`` attributes and served as row-aligned
    mini-batches by ``get_batch``.
    """
    def __init__(self):
        # Both stay None until the matching load_* method is called.
        self.obs = None
        self.action = None

    def load_actions(self, file_name):
        """Load the action array from the ``.npy`` file at ``file_name``."""
        self.action = np.load(file_name)

    def load_obs(self, file_name):
        """Load the observation array from the ``.npy`` file at ``file_name``."""
        self.obs = np.load(file_name)

    def get_batch(self, batch_size):
        """
        Yield consecutive ``(obs, action)`` slices of at most ``batch_size`` rows.

        Batches come out in storage order (no shuffling). The last batch is
        shorter when the number of rows is not a multiple of ``batch_size``;
        an empty batch is never yielded.

        :param batch_size: (int) maximum number of rows per batch, must be > 0
        :raises ValueError: if ``batch_size`` is not positive (raised when
            iteration starts, since this is a generator)
        """
        if batch_size <= 0:
            # A non-positive step would never advance past the end of the data.
            raise ValueError("batch_size must be a positive integer")

        num_rows = len(self.obs)
        # range() stops strictly before num_rows, so no empty trailing slice.
        for start in range(0, num_rows, batch_size):
            stop = start + batch_size
            yield self.obs[start:stop], self.action[start:stop]

In [3]:
# Build the rollout container, then fill it from the two .npy files.
# The two loads are independent of each other, so their order does not matter.
storage = RolloutStorage()
storage.load_obs("../data/rollouts/states_500.npy")
storage.load_actions("../data/rollouts/actions_500.npy")

In [4]:
storage.obs.shape, storage.action.shape

((500000, 24), (500000, 6))

In [5]:
storage.action.min()

-0.99993634

## BC Network

In [9]:
### Define a network that you'll train through behavioral cloning (supervised learning)

import torch
import torch.nn as nn
import torch.nn.functional as F

# BCNetwork with continuous action space
class BCNetwork(nn.Module):
    """
    MLP policy for behavioral cloning with a continuous action space.

    Two ReLU hidden layers feed a linear head whose output is squashed by
    ``tanh``, so every predicted action component lies in [-1, 1].

    :param obs_dim: (int) size of a single flat observation vector
    :param action_dim: (int) size of a single flat action vector
    :param hidden_dim: (int) width of both hidden layers
    """
    def __init__(self, obs_dim, action_dim, hidden_dim):
        # assumes that observation and action are one-dimensional
        super().__init__()
        self.obs_dim = obs_dim
        self.action_dim = action_dim
        self.fc1 = nn.Linear(self.obs_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)

        self.mean_linear = nn.Linear(hidden_dim, self.action_dim)
        # Not used by forward(); kept so the parameter set and state_dict
        # keys stay the same as before.
        self.log_std_linear = nn.Linear(hidden_dim, self.action_dim)

    def forward(self, obs):
        """
        Predict actions for a batch of observations.

        :param obs: (torch.Tensor) float tensor whose last dimension is ``obs_dim``
        :return: (torch.Tensor) tensor whose last dimension is ``action_dim``,
            with values in [-1, 1], on the same device as the network weights
        """
        # Follow the module's own weights rather than a notebook-level global,
        # so the network works wherever it has been moved with .to(...).
        obs = obs.to(self.fc1.weight.device)
        hidden = F.relu(self.fc1(obs))
        hidden = F.relu(self.fc2(hidden))
        return torch.tanh(self.mean_linear(hidden))

## Training Procedures

In [None]:
### Train the network

# initialize the network (obs/action widths come from the loaded rollouts)
network = BCNetwork(storage.obs.shape[1], storage.action.shape[1], 1024).to(device)

# define the optimizer
optimizer = torch.optim.Adam(network.parameters(), lr=3e-4)

# define the loss function: mean squared error between predicted and stored actions
loss_fn = nn.MSELoss()

# define the number of epochs
num_epochs = 1000

# define the batch size
batch_size = 1024

# number of full batches per epoch; a trailing partial batch is not consumed
num_batches = len(storage.obs) // batch_size
print(f"num batches = {num_batches}")

# note: you can keep the obs and action completely in memory
# (the training loop is set up for that). make sure that:
# 1. the types are correct
# 2. the shapes match

# train the network
for epoch in range(num_epochs):
    # fresh generator each epoch so batches restart from the first row
    gen = storage.get_batch(batch_size)
    # accumulate loss
    epoch_loss = 0

    for batch in range(num_batches):
        batch_obs, batch_action = next(gen)
        # Convert to float32 directly on the training device so the loss is
        # computed there; no per-step copy of predictions back to the host.
        batch_obs = torch.as_tensor(batch_obs, dtype=torch.float32, device=device)
        batch_action = torch.as_tensor(batch_action, dtype=torch.float32, device=device)

        # forward pass; despite the name these are tanh-squashed action
        # predictions. No squeeze: a dimensionless squeeze would drop a
        # size-1 batch or action dimension and mis-broadcast in the loss.
        logits = network(batch_obs)

        # compute the loss
        loss = loss_fn(logits, batch_action)

        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # accumulate loss
        epoch_loss += loss.item()
    # print the mean batch loss on epochs 1, 11, 21, ...
    if epoch % 10 == 1:
        print("Epoch: {}, Loss: {}".format(epoch, epoch_loss / num_batches))

num batches = 488
Epoch: 1, Loss: 0.2810342876759709
Epoch: 11, Loss: 0.2582145448713029
Epoch: 21, Loss: 0.2545624172589818
Epoch: 31, Loss: 0.25208555693264867
Epoch: 41, Loss: 0.24991686052841242
Epoch: 51, Loss: 0.2478536468182431
Epoch: 61, Loss: 0.2458549211809381
Epoch: 71, Loss: 0.24391384911341746
Epoch: 81, Loss: 0.24204727753874708
Epoch: 91, Loss: 0.24024596236279752
Epoch: 101, Loss: 0.2385489151003908
Epoch: 111, Loss: 0.23689354913400823
Epoch: 121, Loss: 0.23532821989206018
Epoch: 131, Loss: 0.23385423974546252
Epoch: 141, Loss: 0.23243369803321165
Epoch: 151, Loss: 0.23104420356208183
Epoch: 161, Loss: 0.22972029743746655
Epoch: 171, Loss: 0.22849659493467847
Epoch: 181, Loss: 0.22732383678438234
Epoch: 191, Loss: 0.22616824535194968
Epoch: 201, Loss: 0.22506856362594932
Epoch: 211, Loss: 0.22399559714755074
Epoch: 221, Loss: 0.22313213617098135
Epoch: 231, Loss: 0.2221553444740225
Epoch: 241, Loss: 0.22133675932151373
Epoch: 251, Loss: 0.22031775788694133
Epoch: 261, 

# BC Model Eval

In [None]:
### run the trained network on the environment, based on the evaluate function but using network instead of model
def evaluate_network(network, num_episodes=100, environment=None):
    """
    Evaluate a policy network by rolling it out in an environment.

    :param network: (nn.Module) maps a float32 observation batch to an action batch
    :param num_episodes: (int) number of episodes to evaluate it
    :param environment: (optional) object with gym-style ``reset()`` and
        ``step()``; when omitted, the notebook-level ``env`` is used
    :return: (float) Mean of the per-episode reward sums over ``num_episodes`` episodes
    """
    eval_env = environment if environment is not None else env

    # Feed observations on whatever device the network's weights live on.
    first_param = next(iter(network.parameters()), None)
    net_device = first_param.device if first_param is not None else torch.device("cpu")

    all_episode_rewards = []
    for _ in range(num_episodes):
        episode_rewards = []
        done = False
        obs = eval_env.reset()
        while not done:
            # Add a leading batch dimension: the network is trained on batches.
            obs_batch = torch.as_tensor(
                np.asarray(obs), dtype=torch.float32, device=net_device
            ).unsqueeze(0)
            # Inference only: skip autograd and hand the environment a plain
            # CPU numpy array instead of a grad-tracking (possibly CUDA) tensor.
            with torch.no_grad():
                action = network(obs_batch).cpu().numpy()
            # NOTE(review): the action keeps its leading batch dimension and is
            # wrapped in a list, as before -- confirm this matches what `env.step`
            # expects (vectorized env vs. plain gym env).
            obs, reward, done, info = eval_env.step([action])
            episode_rewards.append(reward)

        all_episode_rewards.append(sum(episode_rewards))

    mean_episode_reward = np.mean(all_episode_rewards)
    print("Mean reward:", mean_episode_reward, "Num episodes:", num_episodes)

    return mean_episode_reward

evaluate_network(network)