In [1]:
### Feel free to disable GPU if you don't have one
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
torch.cuda.is_available()

import gym
import numpy as np

cuda:0


# Behavioral Cloning

## Rollout Storage

In [15]:
### Make a data structure to load and store Rollouts.
### At the very least, you'll need observations and actions.

class RolloutStorage:
    """
    An object for easy load and get batch
    from the action and observations
    """
    def __init__(self):
        self.obs = None
        self.action = None

    def load_actions(self, file_name):
        self.obs = np.load(file_name)
    
    def load_obs(self, file_name):
        self.action = np.load(file_name)

    def get_batch(self, batch_size):
        curr = 0
        while len(self.obs) >= curr:
            yield self.obs[curr:curr+batch_size], self.action[curr:curr+batch_size]
            curr += batch_size

In [18]:
storage = RolloutStorage()
storage.load_actions("../data/rollouts/actions_500.npy")
storage.load_obs("../data/rollouts/states_500.npy")

In [19]:
storage.obs.shape, storage.action.shape

((500000, 6), (500000, 24))

In [24]:
storage.action.min()

-55.91955

## BC Network

In [36]:
### Define a network that you'll train through behavioral cloning (supervised learning)

import torch
import torch.nn as nn
import torch.nn.functional as F

# BCNetwork with continuous action space
class BCNetwork(nn.Module):
    def __init__(self, obs_dim, action_dim):
        # assumes that observation and action are one-dimensional
        super(BCNetwork, self).__init__()
        self.obs_dim = obs_dim
        self.action_dim = action_dim

        self.fc1 = nn.Linear(self.obs_dim, 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, self.action_dim)

    def forward(self, obs):
        x = F.relu(self.fc1(obs.to(device)))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

## Trainning Procedures

In [43]:
### Train the network

# initialize the network
network = BCNetwork(storage.obs.shape[1], storage.action.shape[1]).to(device)

# define the optimizer
optimizer = torch.optim.Adam(network.parameters(), lr=1e-1)

# define the loss function
loss_fn = F.mse_loss

# define the number of epochs
num_epochs = 50

# define the batch size
batch_size = 1024

# define the number of batches
num_batches = len(storage.obs) // batch_size

# note: you can keep the obs and action completely in memory 
# (the training loop is set up for that). make sure that:
# 1. the types are correct
# 2. the shapes match

# train the network
for epoch in range(num_epochs):
    gen = storage.get_batch(batch_size)
    # accumulate loss
    epoch_loss = 0

    for batch in range(num_batches):
        # get the batch somehow. you can either write a method 
        # into the storage class or just directly access the 
        # values in it
        
        batch_obs, batch_action = next(gen)
        batch_obs, batch_action = torch.FloatTensor(batch_obs), torch.FloatTensor(batch_action)
        # forward pass
        logits = network(batch_obs.to(device))
        # need to squeeze out the extra dimension
        logits = torch.squeeze(logits).cpu()
    
        # compute the loss
        loss = loss_fn(logits, batch_action)

        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # accumulate loss
        epoch_loss += loss.item()
    # print the loss
    print("Epoch: {}, Loss: {}".format(epoch, epoch_loss / num_batches))

Epoch: 0, Loss: 14.617555717952916
Epoch: 1, Loss: 13.69321315992074
Epoch: 2, Loss: 13.688372975490132
Epoch: 3, Loss: 13.683413810417301
Epoch: 4, Loss: 13.678872022472444
Epoch: 5, Loss: 13.679166696110709
Epoch: 6, Loss: 13.678646288934301
Epoch: 7, Loss: 13.680067464953563
Epoch: 8, Loss: 13.678012163912664
Epoch: 9, Loss: 13.681565276912002
Epoch: 10, Loss: 13.679486513137817
Epoch: 11, Loss: 13.679545586226416
Epoch: 12, Loss: 13.678526188506455
Epoch: 13, Loss: 13.678175556855123
Epoch: 14, Loss: 13.678426633115675
Epoch: 15, Loss: 13.677955904944998
Epoch: 16, Loss: 13.678476722514043
Epoch: 17, Loss: 13.680178952998803
Epoch: 18, Loss: 13.680200029592045
Epoch: 19, Loss: 13.679176709691031
Epoch: 20, Loss: 13.680018133804447
Epoch: 21, Loss: 13.681466550123496
Epoch: 22, Loss: 13.681161376296497
Epoch: 23, Loss: 13.681681742433641
Epoch: 24, Loss: 13.681253937424206
Epoch: 25, Loss: 13.680865604369366
Epoch: 26, Loss: 13.681230388703893
Epoch: 27, Loss: 13.682417767946838
Epo

# BC Model Eval

In [None]:
### run the trained network on the environment, based on the evaluate function but using network instead of model
def evaluate_network(network, num_episodes=100):
    """
    Evaluate a RL agent
    :param model: (BaseRLModel object) the RL Agent
    :param num_episodes: (int) number of episodes to evaluate it
    :return: (float) Mean reward for the last num_episodes
    """
    all_episode_rewards = []
    for i in range(num_episodes):
        episode_rewards = []
        done = False
        obs = env.reset()
        while not done:
            # _states are only useful when using LSTM policies
            # need to add the additional dimenstion becuase of the 
            # single batch training
            action = network(torch.tensor([obs], dtype=torch.float32))
            obs, reward, done, info = env.step([action])
            episode_rewards.append(reward)

        all_episode_rewards.append(sum(episode_rewards))

    mean_episode_reward = np.mean(all_episode_rewards)
    print("Mean reward:", mean_episode_reward, "Num episodes:", num_episodes)

    return mean_episode_reward

evaluate_network(network)