In [1]:
### Feel free to disable GPU if you don't have one
import torch

# Use the first CUDA device when PyTorch can see one; otherwise run on the CPU.
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(device)
torch.cuda.is_available()

import gym
import numpy as np

cuda:0


True

# Behavioral Cloning

## Rollout Storage

In [None]:
### Make a data structure to load and store Rollouts.
### At the very least, you'll need observations and actions.

class RolloutStorage:
    """
    In-memory container for rollouts used in behavioral cloning.

    Observations and actions are loaded from ``.npy`` files into two arrays
    that are sliced in parallel along their first axis, so ``obs[i]`` is
    paired with ``action[i]``.
    """
    def __init__(self):
        # Both attributes stay None until the matching load_* method is called.
        self.obs = None
        self.action = None

    def load_actions(self, file_name):
        """
        Load the action array from disk.

        :param file_name: path (or file object) accepted by ``np.load``
        """
        self.action = np.load(file_name)

    def load_observations(self, file_name):
        """
        Load the observation array from disk.

        :param file_name: path (or file object) accepted by ``np.load``
        """
        self.obs = np.load(file_name)

    def get_batch(self, batch_size):
        """
        Yield consecutive ``(obs, action)`` slices in storage order.

        Every batch has ``batch_size`` rows except possibly the last one,
        which holds the remainder. No empty batch is ever yielded.

        :param batch_size: (int) maximum number of rows per batch, must be > 0
        :return: generator of ``(obs_batch, action_batch)`` tuples
        :raises ValueError: if ``batch_size`` is not positive or if the
            observations/actions have not been loaded yet (raised when the
            generator is first advanced)
        """
        if batch_size <= 0:
            # A non-positive step would never advance through the data.
            raise ValueError("batch_size must be a positive integer")
        if self.obs is None or self.action is None:
            raise ValueError("load observations and actions before batching")

        # range() stops strictly before len(self.obs), so the final slice is
        # the (possibly shorter) remainder rather than an empty array.
        for start in range(0, len(self.obs), batch_size):
            stop = start + batch_size
            yield self.obs[start:stop], self.action[start:stop]

## BC Network

In [None]:
### Define a network that you'll train through behavioral cloning (supervised learning)

import torch
import torch.nn as nn
import torch.nn.functional as F

# BCNetwork but with a discrete action space
class BCNetworkDiscrete(nn.Module):
    """
    Fully connected policy network for a discrete action space.

    Two 64-unit ReLU hidden layers feed a linear layer with one output per
    action; ``forward`` turns those outputs into a probability distribution
    over actions.

    :param obs_dim: (int) size of the last axis of the observation tensor
    :param action_dim: (int) number of discrete actions
    """
    def __init__(self, obs_dim, action_dim):
        # assumes that observation and action are one-dimensional
        super(BCNetworkDiscrete, self).__init__()
        self.obs_dim = obs_dim
        self.action_dim = action_dim

        self.fc1 = nn.Linear(self.obs_dim, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, self.action_dim)

    def forward(self, obs):
        """
        Compute action probabilities for a batch of observations.

        :param obs: float tensor whose last axis has size ``obs_dim``; any
            number of leading axes is allowed
        :return: tensor with the same leading axes as ``obs`` and a last axis
            of size ``action_dim`` that sums to 1
        """
        hidden = F.relu(self.fc1(obs))
        hidden = F.relu(self.fc2(hidden))
        scores = self.fc3(hidden)
        # Normalize over the action axis. Using dim=-1 (instead of a fixed
        # dim=2) keeps rank-3 inputs working and also accepts plain
        # (batch, obs_dim) inputs.
        return F.softmax(scores, dim=-1)

## Training Procedures

In [None]:
### Train the network

# initialize the network
network = BCNetworkDiscrete(env.observation_space.shape[0], env.action_space.n)

# define the optimizer
optimizer = torch.optim.Adam(network.parameters(), lr=1e-3)

# define the loss function
# The network's forward pass already applies a softmax, so the loss is the
# negative log-likelihood of the log-probabilities. nn.CrossEntropyLoss
# expects raw logits and would apply a second softmax on top.
loss_fn = nn.NLLLoss()

# define the number of epochs
num_epochs = 10

# define the batch size
batch_size = 32

# define the number of batches (a trailing partial batch is not used)
num_batches = len(storage.obs) // batch_size

# note: you can keep the obs and action completely in memory 
# (the training loop is set up for that). make sure that:
# 1. the types are correct
# 2. the shapes match

# train the network
for epoch in range(num_epochs):
    # fresh generator every epoch so each pass starts from the first row
    gen = storage.get_batch(batch_size)
    # accumulate loss
    epoch_loss = 0.0

    for _ in range(num_batches):
        batch_obs, batch_action = next(gen)
        batch_obs = torch.as_tensor(batch_obs, dtype=torch.float32)
        # NLLLoss needs int64 class indices of shape (batch,).
        # assumes the stored actions are integer action indices, one per
        # observation - TODO confirm against the rollout files
        batch_action = torch.as_tensor(batch_action, dtype=torch.long).reshape(-1)

        # forward pass: probabilities over actions
        probs = network(batch_obs)
        # Collapse any extra singleton axes to (batch, num_actions). Unlike
        # torch.squeeze, this keeps the batch axis when the batch has one row.
        probs = probs.reshape(-1, probs.shape[-1])

        # compute the loss; the clamp avoids log(0) = -inf
        log_probs = torch.log(probs.clamp_min(1e-8))
        loss = loss_fn(log_probs, batch_action)

        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # accumulate loss
        epoch_loss += loss.item()
    # print the mean loss for this epoch (guarded against num_batches == 0)
    print("Epoch: {}, Loss: {}".format(epoch, epoch_loss / max(num_batches, 1)))

# BC Model Eval

In [None]:
### run the trained network on the environment, based on the evaluate function but using network instead of model
def evaluate_network(network, num_episodes=100, env=None):
    """
    Evaluate a policy network by rolling it out greedily in an environment.

    :param network: callable that maps a float32 observation tensor to
        per-action scores; the index of the largest score is used as the action
    :param num_episodes: (int) number of episodes to evaluate it
    :param env: environment with ``reset()`` and ``step(actions)`` where
        ``step`` takes a list of actions and returns
        ``(obs, reward, done, info)``. When None, the environment is taken
        from the notebook-level ``model`` via ``model.get_env()``.
    :return: (float) Mean reward for the last num_episodes
    """
    # This function will only work for a single Environment
    if env is None:
        # Backward-compatible default: relies on a global `model` being
        # defined earlier in the notebook.
        env = model.get_env()

    all_episode_rewards = []
    for _ in range(num_episodes):
        episode_rewards = []
        done = False
        obs = env.reset()
        while not done:
            # need to add the additional dimension because of the
            # single batch training
            obs_tensor = torch.as_tensor(np.asarray(obs), dtype=torch.float32).unsqueeze(0)
            # no gradients are needed for action selection
            with torch.no_grad():
                action = network(obs_tensor).argmax().item()
            # here, action, rewards and dones are arrays
            # because we are using vectorized env
            obs, reward, done, info = env.step([action])
            episode_rewards.append(reward)

        all_episode_rewards.append(sum(episode_rewards))

    mean_episode_reward = np.mean(all_episode_rewards)
    print("Mean reward:", mean_episode_reward, "Num episodes:", num_episodes)

    return mean_episode_reward

# Roll out `network` for the default number of episodes (num_episodes=100)
# and report the mean episode reward.
evaluate_network(network)