In [13]:
import os
import time
import numpy as np
import gym
from collections import OrderedDict
import pickle
import torch
import torch.nn.functional as F

%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [2]:
# ------------------------------------------------------------------
# Experiment configuration.
# Everything a reader might want to tune for the behaviour-cloning run
# lives in `params`; the subset the agent needs is mirrored into
# `agent_params` and attached back under params['agent_params'].
# ------------------------------------------------------------------

logdir = "./logs/demo2"

params = dict(
    # expert policy / demonstrations
    expert_policy_file="cs285/policies/experts/Humanoid.pkl",
    expert_data="cs285/expert_data/expert_data_Humanoid-v2.pkl",
    # environment and experiment naming
    env_name="Humanoid-v2",
    exp_name="test_bc_humanoid",
    do_dagger=False,
    # training schedule and batch sizes
    num_agent_train_steps_per_iter=1000,
    n_iter=1,
    batch_size=1000,
    eval_batch_size=5000,
    train_batch_size=100,
    # network shape and optimiser
    n_layers=2,
    size=64,
    learning_rate=5e-3,
    # logging
    video_log_freq=5,
    scalar_log_freq=1,
    # hardware
    use_gpu=True,
    which_gpu=0,
    # misc
    max_replay_buffer_size=1000000,
    seed=1,
    logdir=logdir,
)

# Agent-level settings are copied (not aliased) out of the global params.
agent_params = {
    key: params[key]
    for key in ('n_layers', 'size', 'learning_rate', 'max_replay_buffer_size')
}

params['agent_params'] = agent_params

In [3]:
# Seed every source of randomness used in this notebook, not just the env:
# the replay buffer samples with np.random and the network weights are
# initialised from torch's global generator.
seed = params['seed']
np.random.seed(seed)
torch.manual_seed(seed)

# Make the gym environment
env = gym.make(params['env_name'])
env.seed(seed)

# Maximum length for episodes
params['ep_len'] = env.spec.max_episode_steps

# Is the action space discrete or continuous?
discrete = isinstance(env.action_space, gym.spaces.Discrete)
params['agent_params']['discrete'] = discrete

# Observation and action sizes
# (a discrete space exposes its size as `.n`, a continuous one via `.shape`)
ob_dim = env.observation_space.shape[0]
ac_dim = env.action_space.n if discrete else env.action_space.shape[0]
params['agent_params']['ac_dim'] = ac_dim
params['agent_params']['ob_dim'] = ob_dim


In [4]:
# Smoke test: reset the env, take a single step and display the shape of the
# observation that comes back.
# NOTE(review): the action passed here is the bare scalar 1 rather than a
# vector of length ac_dim -- presumably the env broadcasts it; confirm, or
# use env.action_space.sample() instead.
env.reset()
env.step(1)[0].shape

(376,)

In [5]:
## A collections.deque might be a better fit for this buffer.
## This class organises the replay buffer of (state, action, reward, next state) transitions.

class ReplayBuffer(object):
    """Bounded store of rollouts and their flattened transition arrays.

    Every rollout (a dict with the keys "observation", "action", "reward",
    "next_observation" and "terminal") is kept in `self.paths`. In addition the
    per-step arrays of all rollouts are concatenated into `obs`, `acs`, `rews`,
    `next_obs` and `terminals`, each truncated to the most recent `max_size`
    entries.
    """

    def __init__(self, max_size=1000000):
        """
        :param max_size: maximum number of transitions kept in the component arrays
        """
        self.max_size = max_size

        # store each rollout
        self.paths = []

        # store (concatenated) component arrays from each rollout;
        # they stay None until the first call to add_rollouts
        self.obs = None
        self.acs = None
        self.rews = None
        self.next_obs = None
        self.terminals = None

    def __len__(self):
        """Number of transitions currently stored (0 while the buffer is empty)."""
        # Compare against None explicitly: the truth value of a multi-element
        # numpy array is ambiguous and raises ValueError.
        if self.obs is None:
            return 0
        return self.obs.shape[0]

    def add_rollouts(self, paths, concat_rew=True):
        """Append rollouts to the buffer, keeping only the newest `max_size` transitions.

        :param paths: list of rollout dicts (see class docstring)
        :param concat_rew: if True rewards are concatenated into one flat array,
            otherwise they are kept as a list with one entry per rollout
        """
        # Nothing to add; np.concatenate would raise on an empty list.
        if len(paths) == 0:
            return

        # add new rollouts into our list of rollouts
        for path in paths:
            self.paths.append(path)

        # convert new rollouts into their component arrays, and append them onto our arrays
        observations, actions, rewards, next_observations, terminals = self.convert_listofrollouts(paths, concat_rew)

        if self.obs is None:
            self.obs = observations[-self.max_size:]
            self.acs = actions[-self.max_size:]
            self.rews = rewards[-self.max_size:]
            self.next_obs = next_observations[-self.max_size:]
            self.terminals = terminals[-self.max_size:]
        else:
            self.obs = np.concatenate([self.obs, observations])[-self.max_size:]
            self.acs = np.concatenate([self.acs, actions])[-self.max_size:]
            if concat_rew:
                self.rews = np.concatenate([self.rews, rewards])[-self.max_size:]
            else:
                # rewards are kept per rollout, so extend the python list
                if isinstance(rewards, list):
                    self.rews += rewards
                else:
                    self.rews.append(rewards)
                self.rews = self.rews[-self.max_size:]
            self.next_obs = np.concatenate([self.next_obs, next_observations])[-self.max_size:]
            self.terminals = np.concatenate([self.terminals, terminals])[-self.max_size:]

    def convert_listofrollouts(self, paths, concat_rew=True):
        """
            Take a list of rollout dictionaries
            and return separate arrays,
            where each array is a concatenation of that array from across the rollouts.

            With concat_rew=False the rewards are returned as a list holding
            one reward array per rollout instead of one flat array.
        """
        observations = np.concatenate([path["observation"] for path in paths])
        actions = np.concatenate([path["action"] for path in paths])
        if concat_rew:
            rewards = np.concatenate([path["reward"] for path in paths])
        else:
            rewards = [path["reward"] for path in paths]
        next_observations = np.concatenate([path["next_observation"] for path in paths])
        terminals = np.concatenate([path["terminal"] for path in paths])
        return observations, actions, rewards, next_observations, terminals

    def sample_random_data(self, batch_size):
        """Return `batch_size` transitions drawn uniformly without replacement.

        The same random indices are applied to all five component arrays, so
        row i of every returned array belongs to the same transition. If the
        buffer holds fewer than `batch_size` transitions, all of them are
        returned.

        :return: (obs, acs, rews, next_obs, terminals)
        """
        assert self.obs.shape[0] == self.acs.shape[0] == self.rews.shape[0] == self.next_obs.shape[0] == self.terminals.shape[0]

        # One shuffled index array, sliced once, shared by every component.
        indices = np.random.permutation(self.obs.shape[0])[:batch_size]
        return (
            self.obs[indices],
            self.acs[indices],
            self.rews[indices],
            self.next_obs[indices],
            self.terminals[indices],
        )

    def sample_recent_data(self, batch_size=1):
        """Return the `batch_size` most recently added transitions."""
        return self.obs[-batch_size:], self.acs[-batch_size:], self.rews[-batch_size:], self.next_obs[-batch_size:], self.terminals[-batch_size:]

In [6]:
# Stand-alone buffer used by the exploratory cells below, sized from the agent config.
replay_buffer = ReplayBuffer(agent_params['max_replay_buffer_size'])

In [7]:
# Load the expert rollouts from disk, push them into the replay buffer and
# draw a small random batch to inspect.
# NOTE: pickle.load can execute arbitrary code -- only open expert-data files
# that come from a trusted source.
print(params['expert_data'])
with open(params['expert_data'], "rb") as f:
    loaded_paths = pickle.load(f)

replay_buffer.add_rollouts(loaded_paths)
# sam_* are reused by the forward-pass / loss cells further down.
sam_obs, sam_act, sam_reward, sam_nextobs, sam_terminal = replay_buffer.sample_random_data(10)

# Display the batch shape: (batch_size, ob_dim).
sam_obs.shape

cs285/expert_data/expert_data_Humanoid-v2.pkl


(10, 376)

In [8]:
# Pull the agent hyper-parameters out of the config dict into plain names
# so the model-building cells below can use them directly.
ac_dim, ob_dim, n_layers, size, discrete, learning_rate = (
    agent_params[key]
    for key in ('ac_dim', 'ob_dim', 'n_layers', 'size', 'discrete', 'learning_rate')
)


In [15]:
# Sequential formulation of the policy network: two hidden ReLU layers of
# width `size` mapping an observation to an action vector.
# NOTE(review): `model` is rebound to a `Net` instance a few lines further
# down in this cell, so this Sequential object is never actually used.
model = torch.nn.Sequential(
    torch.nn.Linear(ob_dim, size),
    torch.nn.ReLU(),
    torch.nn.Linear(size, size),
    torch.nn.ReLU(),
    torch.nn.Linear(size, ac_dim)
    )

class Net(torch.nn.Module):
    """Fully connected policy network with two hidden ReLU layers.

    Maps a batch of observations with `ob_dim` features to `ac_dim` outputs;
    both hidden layers have `size` units and the output layer is linear.
    """

    def __init__(self, ob_dim, size, ac_dim):
        super().__init__()
        linear = torch.nn.Linear
        # Layers are registered in forward order: fc1 -> fc2 -> fc3.
        self.fc1 = linear(ob_dim, size)
        self.fc2 = linear(size, size)
        self.fc3 = linear(size, ac_dim)

    def forward(self, x):
        """Return the un-activated output of the last layer for input `x`."""
        hidden = F.relu(self.fc1(x))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)
    
# The model actually used from here on (replaces the Sequential one above).
model = Net(ob_dim, size, ac_dim)

# Squared error summed (not averaged) over every element of the batch, so the
# loss magnitude grows with the batch size.
loss_fn = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
####################
# Training loop
####################



In [None]:
##################
# Policy
##################

class Policy(object):
    """Supervised (behaviour-cloning) trainer wrapped around a torch model.

    Bundles the model with its loss function, optimizer and a private replay
    buffer, and runs the collect -> store -> train loop.

    `get_action` and `update` are left abstract. Note that collecting fresh
    rollouts (any epoch after the first) needs `get_action`, so a subclass
    must implement it before training for more than one epoch.
    """

    def __init__(self, model, env, params, loss_fn, optimizer):
        """
        :param model: torch module mapping observations to actions
        :param env: environment the policy acts in
        :param params: config dict; must contain 'max_replay_buffer_size', and
            training_loop additionally reads 'expert_data', 'batch_size',
            'num_agent_train_steps_per_iter' and 'train_batch_size'
        :param loss_fn: callable(prediction, target) returning a scalar tensor
        :param optimizer: torch optimizer over the model's parameters
        """
        super(Policy, self).__init__()
        self.model = model
        self.env = env
        # Kept on the instance so training_loop does not depend on a notebook global.
        self.params = params
        self.loss_fn = loss_fn
        self.optimizer = optimizer
        self.replay_buffer = ReplayBuffer(params['max_replay_buffer_size'])

    def get_action(self, obs):
        """Return the action(s) for `obs`; to be provided by subclasses."""
        raise NotImplementedError

    def update(self, obs, acs):
        """Update the policy from (obs, acs) pairs; to be provided by subclasses."""
        raise NotImplementedError

    def training_loop(self, epochs=10, writer=None, log_every=1000):
        """Alternate between collecting rollouts and fitting the model to them.

        :param epochs: number of collect/train iterations
        :param writer: optional object with an `add_scalar(tag, value, step)`
            method (e.g. a tensorboard SummaryWriter); nothing is logged when None
        :param log_every: number of train steps averaged into each logged loss value
        :return: total number of environment steps collected
        """
        params = self.params
        steps_per_epoch = params['num_agent_train_steps_per_iter']

        running_loss = 0.0
        total_envsteps = 0
        for epoch in range(epochs):
            # Collect trajectories. Only the first two returned items are used
            # so that an extra trailing item (e.g. video paths) is tolerated.
            collected = collect_training_trajectories(epoch,
                                            params['expert_data'], self,
                                            params['batch_size'])
            paths, envsteps_this_batch = collected[0], collected[1]

            total_envsteps += envsteps_this_batch

            # Add paths to this policy's own replay buffer
            self.replay_buffer.add_rollouts(paths)

            # Train agent
            for train_step in range(steps_per_epoch):
                ob_batch, ac_batch, _, _, _ = self.replay_buffer.sample_random_data(
                                                    params['train_batch_size'])

                loss = self.train_agent(ob_batch, ac_batch)
                running_loss += loss.item()

                if train_step % log_every == log_every - 1:    # every log_every mini-batches...
                    if writer is not None:
                        # ...log the mean loss over that window
                        writer.add_scalar('training loss',
                                running_loss / log_every,
                                epoch * steps_per_epoch + train_step)

                    running_loss = 0.0

        return total_envsteps

    def train_agent(self, ob_batch, ac_batch):
        """Run one optimisation step on a batch and return the loss tensor.

        :param ob_batch: batch of observations (array-like)
        :param ac_batch: matching batch of target actions (array-like)
        """
        # zero the parameter gradients
        self.optimizer.zero_grad()

        # forward + backward + optimize
        # (batches come out of the replay buffer as numpy arrays, so both
        # inputs are converted to tensors first)
        outputs = self.model(torch.Tensor(ob_batch))
        loss = self.loss_fn(outputs, torch.Tensor(ac_batch))
        loss.backward()
        self.optimizer.step()

        return loss


In [16]:
# Forward pass of the sampled observation batch through the current model.
pred_act = model(torch.Tensor(sam_obs))

In [17]:
# Shape of the sampled expert actions: (batch_size, ac_dim).
sam_act.shape

(10, 17)

In [9]:
from torch.utils.tensorboard import SummaryWriter

# Tensorboard writer; event files go to `logdir`, the same directory the
# %tensorboard magic is pointed at.
writer = SummaryWriter(logdir)

In [21]:
# Record the model graph, using the sampled observation batch as example
# input, then close the writer.
writer.add_graph(model, torch.Tensor(sam_obs))
writer.close()

In [28]:
# One manual optimisation step on the sampled batch: loss between predicted
# and expert actions, clear old gradients, backpropagate, update the weights.
# NOTE(review): `pred_act` comes from the forward-pass cell above; presumably
# that cell has to be re-run before executing this one a second time, since
# backward() normally releases the graph -- confirm.
loss = loss_fn(pred_act, torch.Tensor(sam_act))
optimizer.zero_grad()
loss.backward()
optimizer.step()

In [None]:
def sample_trajectory(env, policy, max_path_length, render=False, render_mode=('rgb_array')):
    """Roll `policy` out in `env` for a single episode.

    The episode stops as soon as the env reports `done` or `max_path_length`
    steps have been taken, whichever comes first.

    :param env: environment exposing reset() / step() (and render() when rendering)
    :param policy: object whose get_action(ob) returns a batch of actions;
        only the first entry is used
    :param max_path_length: cap on the number of steps in the rollout
    :param render: whether to render while rolling out
    :param render_mode: checked for 'rgb_array' (frames are collected) and
        'human' (on-screen rendering, throttled by the simulation timestep)
    :return: the result of Path(obs, image_obs, acs, rewards, next_obs, terminals)
        (`Path` is not defined in this notebook -- presumably a helper that
        packs the lists into a rollout dict; TODO confirm)
    """
    # per-step records of the rollout
    obs, image_obs, acs = [], [], []
    rewards, next_obs, terminals = [], [], []

    # start a fresh episode
    ob = env.reset()
    steps = 0
    rollout_done = False

    while not rollout_done:

        # optionally grab a frame / draw the current state
        if render:
            if 'rgb_array' in render_mode:
                if hasattr(env, 'sim'):
                    frame = env.sim.render(camera_name='track', height=500, width=500)[::-1]
                else:
                    frame = env.render(mode=render_mode)
                image_obs.append(frame)
            if 'human' in render_mode:
                env.render(mode=render_mode)
                time.sleep(env.model.opt.timestep)

        # act on the latest observation
        obs.append(ob)
        ac = policy.get_action(ob)[0]
        acs.append(ac)

        # advance the environment and record the outcome
        ob, rew, done, _ = env.step(ac)
        steps += 1
        next_obs.append(ob)
        rewards.append(rew)

        # the rollout is over on env termination or when the step cap is hit
        rollout_done = (steps == max_path_length) or done
        terminals.append(rollout_done)

    return Path(obs, image_obs, acs, rewards, next_obs, terminals)


def sample_trajectories(env, policy, min_timesteps_per_batch, max_path_length, render=False, render_mode=('rgb_array')):
    """
        Keep sampling rollouts until at least min_timesteps_per_batch
        environment steps have been gathered.

        Each rollout comes from sample_trajectory and its length is measured
        with get_pathlength.

        :return: (list of rollouts, total number of timesteps they contain)
    """
    collected_paths = []
    steps_so_far = 0
    while True:
        if steps_so_far >= min_timesteps_per_batch:
            break
        rollout = sample_trajectory(env, policy, max_path_length, render, render_mode)
        collected_paths.append(rollout)
        steps_so_far += get_pathlength(rollout)

    return collected_paths, steps_so_far



def collect_training_trajectories(itr, load_initial_expertdata, collect_policy, batch_size):
    """
    Get the rollouts to train on for iteration `itr`.

    On the first iteration (itr == 0) the expert rollouts are loaded from disk;
    on every later iteration fresh rollouts are sampled with `collect_policy`
    using the notebook-level `env` and `params['ep_len']`.

    :param itr: index of the current training iteration
    :param load_initial_expertdata:  path to expert data pkl file
    :param collect_policy:  the current policy using which we collect data
    :param batch_size:  the number of transitions we collect
    :return:
        paths: a list trajectories
        envsteps_this_batch: the sum over the numbers of environment steps in paths
            (0 when the paths were loaded from the expert data file)
    """

    if itr == 0:
        print(load_initial_expertdata)
        # NOTE: pickle.load can execute arbitrary code; only load expert data
        # from a trusted source.
        with open(load_initial_expertdata, "rb") as f:
            loaded_paths = pickle.load(f)
        # Same (paths, envsteps) shape as the sampling branch below, so callers
        # can unpack the result identically on every iteration.
        return loaded_paths, 0

    print("\nCollecting data to be used for training...")
    # This is a free function, so the episode length comes from the
    # notebook-level `params` dict rather than from an instance attribute.
    paths, envsteps_this_batch = sample_trajectories(env, collect_policy, batch_size, params['ep_len'])

    return paths, envsteps_this_batch

In [23]:
%tensorboard --logdir $logdir

In [20]:
#############
## AGENT
#############

class BaseAgent(object):
    """Abstract agent interface.

    Concrete agents override `train`, `add_to_replay_buffer` and `sample`;
    the versions here only raise NotImplementedError.
    """

    def __init__(self, **kwargs):
        # Forward any keyword arguments up the MRO.
        super().__init__(**kwargs)

    def train(self):
        """Run a training update; must be overridden."""
        raise NotImplementedError

    def add_to_replay_buffer(self, paths):
        """Store the given rollouts; must be overridden."""
        raise NotImplementedError

    def sample(self, batch_size):
        """Return `batch_size` stored transitions; must be overridden."""
        raise NotImplementedError
        
        
class BCAgent(BaseAgent):
    """Behaviour-cloning agent: a supervised actor plus a replay buffer.

    The actor is an `MLPPolicySL` built from the sizes and learning rate in
    `agent_params`; the buffer holds the rollouts the actor is trained on.
    """

    def __init__(self, env, agent_params):
        """
        :param env: environment the agent acts in
        :param agent_params: dict with 'ac_dim', 'ob_dim', 'n_layers', 'size',
            'discrete', 'learning_rate' and 'max_replay_buffer_size'
        """
        super(BCAgent, self).__init__()

        self.env = env
        self.agent_params = agent_params
        cfg = self.agent_params

        # actor/policy
        self.actor = MLPPolicySL(
            cfg['ac_dim'],
            cfg['ob_dim'],
            cfg['n_layers'],
            cfg['size'],
            discrete=cfg['discrete'],
            learning_rate=cfg['learning_rate'],
        )

        # replay buffer
        self.replay_buffer = ReplayBuffer(cfg['max_replay_buffer_size'])

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        """Update the actor on observations and their action labels.

        Rewards, next observations and terminals are accepted but unused.
        """
        self.actor.update(ob_no, ac_na)

    def add_to_replay_buffer(self, paths):
        """Store the given rollouts in the agent's replay buffer."""
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        """Return `batch_size` random transitions from the replay buffer."""
        return self.replay_buffer.sample_random_data(batch_size)

(1, 376)