In [1]:
import os
import time
import numpy as np
import gym
from collections import OrderedDict
import pickle
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

%load_ext tensorboard
%matplotlib notebook

In [2]:
############################################
# Experiment configuration (edit values here)
############################################

# Directory the TensorBoard writer logs into.
logdir = "./logs/demo4"

# Top-level run settings, keyed by the names the rest of the notebook looks up.
params = dict(
    expert_policy_file="cs285/policies/experts/Humanoid.pkl",
    expert_data="cs285/expert_data/expert_data_Humanoid-v2.pkl",
    env_name="Humanoid-v2",
    exp_name="test_bc_humanoid",
    do_dagger=False,
    num_agent_train_steps_per_iter=1000,
    n_iter=1,
    batch_size=1000,
    eval_batch_size=5000,
    train_batch_size=100,
    n_layers=2,
    size=64,
    learning_rate=5e-3,
    video_log_freq=5,
    scalar_log_freq=1,
    use_gpu=True,
    which_gpu=0,
    max_replay_buffer_size=1000000,
    seed=1,
    logdir=logdir,
)

# Subset of the run settings that describes the agent. The same dict object is
# stored under params['agent_params'], so entries added later through either
# name are visible through both.
agent_params = {
    name: params[name]
    for name in ('n_layers', 'size', 'learning_rate', 'max_replay_buffer_size')
}
params['agent_params'] = agent_params

# Limits for rollout videos saved to TensorBoard.
MAX_NVIDEO = 2
MAX_VIDEO_LEN = 40

# Logging switches.
log_video = True
log_metric = True

In [3]:
# Build the gym environment and seed every source of randomness the notebook
# uses, so that a Restart & Run All reproduces the same numbers.
seed = params['seed']
env = gym.make(params['env_name'])
env.seed(seed)
np.random.seed(seed)     # replay-buffer sampling draws from NumPy's global RNG
torch.manual_seed(seed)  # network weight initialisation draws from torch's RNG

# Maximum length for episodes (taken from the environment spec)
params['ep_len'] = env.spec.max_episode_steps

# Is the action space discrete or continuous?
discrete = isinstance(env.action_space, gym.spaces.Discrete)
params['agent_params']['discrete'] = discrete

# Observation and action sizes
ob_dim = env.observation_space.shape[0]
ac_dim = env.action_space.n if discrete else env.action_space.shape[0]
params['agent_params']['ac_dim'] = ac_dim
params['agent_params']['ob_dim'] = ob_dim

# Frame rate advertised by the wrapped environment's metadata (for saved videos)
fps = env.env.metadata['video.frames_per_second']

In [4]:
# Sanity check: take one step and display the observation shape. The action is
# drawn from the env's own action space rather than passing a bare scalar,
# which is not a valid action for a multi-dimensional continuous space.
env.reset()
env.step(env.action_space.sample())[0].shape

(376,)

In [4]:
## Replay buffer that stores (obs, action, reward, next_obs, terminal) transitions.
## NOTE: a collections.deque might be a better backing store.

class ReplayBuffer(object):
    """Store rollouts and their concatenated per-transition component arrays.

    Each of ``obs``, ``acs``, ``rews``, ``next_obs`` and ``terminals`` keeps at
    most the ``max_size`` most recent entries. ``paths`` keeps every rollout
    ever added and is not trimmed.
    """

    def __init__(self, max_size=1000000):
        """Create an empty buffer that retains at most ``max_size`` transitions."""
        self.max_size = max_size

        # every rollout dictionary that was ever added
        self.paths = []

        # concatenated component arrays; None until the first add_rollouts call
        self.obs = None
        self.acs = None
        self.rews = None
        self.next_obs = None
        self.terminals = None

    def __len__(self):
        """Return the number of stored transitions (0 while the buffer is empty)."""
        # Compare against None explicitly: truth-testing an ndarray with more
        # than one element raises ValueError.
        if self.obs is None:
            return 0
        return self.obs.shape[0]

    def add_rollouts(self, paths, concat_rew=True):
        """Append rollouts and trim each component to the newest ``max_size`` entries.

        :param paths: list of rollout dictionaries with the keys "observation",
            "action", "reward", "next_observation" and "terminal"
        :param concat_rew: when True rewards are concatenated into one array;
            when False they are kept as a list with one entry per rollout
        """
        # keep the raw rollouts
        self.paths.extend(paths)

        # convert the new rollouts into component arrays
        observations, actions, rewards, next_observations, terminals = \
            self.convert_listofrollouts(paths, concat_rew)

        if self.obs is None:
            # first insertion: take the new data as-is (trimmed)
            self.obs = observations[-self.max_size:]
            self.acs = actions[-self.max_size:]
            self.rews = rewards[-self.max_size:]
            self.next_obs = next_observations[-self.max_size:]
            self.terminals = terminals[-self.max_size:]
        else:
            self.obs = np.concatenate([self.obs, observations])[-self.max_size:]
            self.acs = np.concatenate([self.acs, actions])[-self.max_size:]
            if concat_rew:
                self.rews = np.concatenate([self.rews, rewards])[-self.max_size:]
            else:
                # rewards are kept per rollout in a plain list
                if isinstance(rewards, list):
                    self.rews += rewards
                else:
                    self.rews.append(rewards)
                self.rews = self.rews[-self.max_size:]
            self.next_obs = np.concatenate([self.next_obs, next_observations])[-self.max_size:]
            self.terminals = np.concatenate([self.terminals, terminals])[-self.max_size:]

    def convert_listofrollouts(self, paths, concat_rew=True):
        """
            Take a list of rollout dictionaries
            and return separate arrays,
            where each array is a concatenation of that array from across the rollouts
            (rewards stay a per-rollout list when concat_rew is False)
        """
        observations = np.concatenate([path["observation"] for path in paths])
        actions = np.concatenate([path["action"] for path in paths])
        if concat_rew:
            rewards = np.concatenate([path["reward"] for path in paths])
        else:
            rewards = [path["reward"] for path in paths]
        next_observations = np.concatenate([path["next_observation"] for path in paths])
        terminals = np.concatenate([path["terminal"] for path in paths])
        return observations, actions, rewards, next_observations, terminals

    ########################################
    ########################################

    def sample_random_data(self, batch_size):
        """Return ``batch_size`` transitions drawn uniformly without replacement.

        The same indices are used for all five component arrays, so row ``i``
        of every returned array belongs to the same transition. If fewer than
        ``batch_size`` transitions are stored, all of them are returned.
        """
        assert self.obs.shape[0] == self.acs.shape[0] == self.rews.shape[0] == self.next_obs.shape[0] == self.terminals.shape[0]

        # one index array, computed once and shared by every component
        indices = np.random.permutation(self.obs.shape[0])[:batch_size]
        return (self.obs[indices], self.acs[indices], self.rews[indices],
                self.next_obs[indices], self.terminals[indices])

    def sample_recent_data(self, batch_size=1):
        """Return the ``batch_size`` most recently added transitions."""
        return self.obs[-batch_size:], self.acs[-batch_size:], self.rews[-batch_size:], self.next_obs[-batch_size:], self.terminals[-batch_size:]

In [5]:
# Stand-alone buffer for the exploratory cells below, sized from the agent config.
replay_buffer = ReplayBuffer(max_size=agent_params['max_replay_buffer_size'])

In [6]:
# Load the expert rollouts, push them into the buffer and draw a small sample.
# NOTE(review): pickle.load executes arbitrary code from the file; only open
# expert data that comes from a trusted source.
print(params['expert_data'])
with open(params['expert_data'], "rb") as expert_data_file:
    loaded_paths = pickle.load(expert_data_file)

replay_buffer.add_rollouts(loaded_paths)

# 10 random transitions, unpacked into their five components
sample_batch = replay_buffer.sample_random_data(10)
sam_obs, sam_act, sam_reward, sam_nextobs, sam_terminal = sample_batch

sam_obs.shape

cs285/expert_data/expert_data_Humanoid-v2.pkl


(10, 376)

In [7]:
# Pull the agent hyper-parameters out of the config dict into plain names.
ac_dim, ob_dim, n_layers, size, discrete, learning_rate = (
    agent_params[name]
    for name in ('ac_dim', 'ob_dim', 'n_layers', 'size', 'discrete', 'learning_rate')
)


In [8]:
# An equivalent torch.nn.Sequential MLP (Linear -> ReLU -> Linear -> ReLU ->
# Linear) used to be built here and bound to `model`. It was dead code: the
# name `model` is rebound to a `Net` instance further down in this cell before
# anything reads it, so the construction has been removed.

class Net(torch.nn.Module):
    """Fully connected network with two ReLU hidden layers and a linear output."""

    def __init__(self, ob_dim, size, ac_dim):
        """Build the three affine layers.

        :param ob_dim: width of the input
        :param size: width of both hidden layers
        :param ac_dim: width of the output
        """
        super().__init__()
        self.fc1 = torch.nn.Linear(ob_dim, size)
        self.fc2 = torch.nn.Linear(size, size)
        self.fc3 = torch.nn.Linear(size, ac_dim)

    def forward(self, x):
        """Return the un-activated output of the last layer for input ``x``."""
        hidden = torch.relu(self.fc1(x))
        hidden = torch.relu(self.fc2(hidden))
        return self.fc3(hidden)
    
# The network that is actually trained, with its loss and optimiser.
model = Net(ob_dim=ob_dim, size=size, ac_dim=ac_dim)

# Squared error summed (not averaged) over every element of the batch.
loss_fn = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

In [None]:
def perform_logging(self, itr, paths, eval_policy, train_video_paths):
    """Collect evaluation rollouts and log videos and scalar metrics.

    This is a free function that takes the trainer object explicitly as
    ``self``. It reads ``self.env``, ``self.params``, ``self.log_video``,
    ``self.log_metrics``, ``self.logger``, ``self.fps``, ``self.total_envsteps``
    and ``self.start_time``, and writes ``self.initial_return`` when ``itr == 0``.

    NOTE(review): ``sample_n_trajectories`` is not defined in the visible part
    of this notebook, and the config cell defines ``log_metric`` (singular)
    while this function reads ``self.log_metrics`` — confirm both.

    :param itr: iteration index used as the logging step
    :param paths: training rollouts; each must provide a "reward" array
    :param eval_policy: policy used to collect the evaluation rollouts
    :param train_video_paths: training rollouts to save as videos, or None to
        skip video logging
    """

    # collect eval trajectories, for logging
    print("\nCollecting data for eval...")
    eval_paths, eval_envsteps_this_batch = sample_trajectories(
        self.env, eval_policy, self.params['eval_batch_size'], self.params['ep_len'])

    # save eval rollouts as videos in tensorboard event file
    # (identity check: `!= None` compares elementwise if an ndarray is passed)
    if self.log_video and train_video_paths is not None:
        print('\nCollecting video rollouts eval')
        eval_video_paths = sample_n_trajectories(self.env, eval_policy, MAX_NVIDEO, MAX_VIDEO_LEN, True)

        # save train/eval videos
        print('\nSaving train rollouts as videos...')
        self.logger.log_paths_as_videos(train_video_paths, itr, fps=self.fps,
                                        max_videos_to_save=MAX_NVIDEO,
                                        video_title='train_rollouts')
        self.logger.log_paths_as_videos(eval_video_paths, itr, fps=self.fps,
                                        max_videos_to_save=MAX_NVIDEO,
                                        video_title='eval_rollouts')

    # save eval metrics
    if self.log_metrics:
        # returns, for logging
        train_returns = [path["reward"].sum() for path in paths]
        eval_returns = [eval_path["reward"].sum() for eval_path in eval_paths]

        # episode lengths, for logging
        train_ep_lens = [len(path["reward"]) for path in paths]
        eval_ep_lens = [len(eval_path["reward"]) for eval_path in eval_paths]

        # decide what to log (insertion order is the print/log order)
        logs = OrderedDict()
        logs["Eval_AverageReturn"] = np.mean(eval_returns)
        logs["Eval_StdReturn"] = np.std(eval_returns)
        logs["Eval_MaxReturn"] = np.max(eval_returns)
        logs["Eval_MinReturn"] = np.min(eval_returns)
        logs["Eval_AverageEpLen"] = np.mean(eval_ep_lens)

        logs["Train_AverageReturn"] = np.mean(train_returns)
        logs["Train_StdReturn"] = np.std(train_returns)
        logs["Train_MaxReturn"] = np.max(train_returns)
        logs["Train_MinReturn"] = np.min(train_returns)
        logs["Train_AverageEpLen"] = np.mean(train_ep_lens)

        logs["Train_EnvstepsSoFar"] = self.total_envsteps
        logs["TimeSinceStart"] = time.time() - self.start_time

        # remember the first iteration's average return as the baseline;
        # later iterations read the value stored here
        if itr == 0:
            self.initial_return = np.mean(train_returns)
        logs["Initial_DataCollection_AverageReturn"] = self.initial_return

        # perform the logging
        for key, value in logs.items():
            print('{} : {}'.format(key, value))
            self.logger.log_scalar(value, key, itr)
        print('Done logging...\n\n')

        self.logger.flush()


In [9]:
# Load the pickled expert policy parameters. Despite its name,
# `expert_policy_file` holds the unpickled object, not a file handle.
# NOTE(review): pickle.load executes arbitrary code from the file; only open
# policy files that come from a trusted source.
with open(params['expert_policy_file'], "rb") as policy_file_handle:
    expert_policy_file = pickle.load(policy_file_handle)

NameError: name 'ExpertPolicy' is not defined

In [50]:
# Manual walk through the expert's hidden layers for a single sampled
# observation (the same steps ExpertPolicy.get_action performs before its
# output layer).
standardizer = expertpolicy.policy_params['obsnorm']['Standardizer']
obsnorm_mean = standardizer['mean_1_D']
obsnorm_meansq = standardizer['meansq_1_D']
# std = sqrt(E[x^2] - E[x]^2), clamped at 0 before the square root
obsnorm_stdev = np.sqrt(np.maximum(0, obsnorm_meansq - np.square(obsnorm_mean)))
print('obs', obsnorm_mean.shape, obsnorm_stdev.shape)

# standardise one observation, shaped as a single-row batch
single_obs = sam_obs[1].reshape(1, -1)
curr_activations_bd = torch.Tensor((single_obs - obsnorm_mean) / (obsnorm_stdev + 1e-6))

# apply the hidden layers in sorted-name order: tanh(x @ W + b)
layer_params = expertpolicy.policy_params['hidden']['FeedforwardNet']
for layer_name in sorted(layer_params):
    affine = layer_params[layer_name]['AffineLayer']
    W = torch.Tensor(affine['W'])
    b = torch.Tensor(affine['b'])
    curr_activations_bd = torch.tanh(torch.mm(curr_activations_bd, W) + b)


obs (1, 376) (1, 376)


In [76]:
# Query the expert for a single sampled observation; the cell displays the returned action tensor.
expertpolicy.get_action(sam_obs[0])

tensor([[-0.2668, -2.1136,  1.1157,  1.0326,  0.6725,  4.7755,  4.1224,  1.3461,
          1.9001, -1.9799,  0.4929,  0.7351,  1.1593,  0.1483,  1.7030,  1.7201,
         -0.5640]])

In [11]:
##################
# Policy
##################

class BasePolicy:
    """Minimal policy interface: map observations to actions."""

    def get_action(self, obs):
        """Return the action(s) for ``obs``; subclasses must override."""
        raise NotImplementedError


class Policy(BasePolicy):
    """Learner that fits ``model`` to the actions stored in its replay buffer.

    The model, loss, optimiser and settings passed to the constructor are the
    ones used by every method (rather than whatever notebook globals of the
    same name currently hold).
    """

    def __init__(self, model, env, params, loss_fn, optimizer):
        """
        :param model: torch module mapping observations to actions
        :param env: gym environment (stored; not used by the methods here)
        :param params: settings dict; must provide 'max_replay_buffer_size',
            'expert_data', 'batch_size', 'num_agent_train_steps_per_iter'
            and 'train_batch_size'
        :param loss_fn: callable ``loss_fn(predicted, target)`` returning a scalar tensor
        :param optimizer: torch optimiser over the model's parameters
        """
        super().__init__()
        self.model = model
        self.env = env
        self.params = params
        self.loss_fn = loss_fn
        self.optimizer = optimizer
        self.replay_buffer = ReplayBuffer(params['max_replay_buffer_size'])
        # environment steps collected so far by training_loop
        self.total_envsteps = 0

    def get_action(self, obs):
        """Return the model's output tensor for ``obs``."""
        return self.model(torch.Tensor(obs))

    def training_loop(self, epochs=10, log_video=False, relabel_with_expert=False,
                      expert_policy=None):
        """Alternate between collecting rollouts and fitting the model.

        :param epochs: number of collect/train iterations
        :param log_video: when True, call perform_logging after each iteration
        :param relabel_with_expert: when True, replace the actions of rollouts
            collected from iteration 1 onwards with ``expert_policy``'s actions
        :param expert_policy: object with ``get_action(observations)``; required
            when ``relabel_with_expert`` is True and more than one epoch runs
        :raises ValueError: if relabelling is requested without an expert policy
        """
        self.total_envsteps = 0
        for epoch in range(epochs):
            # Collect trajectories (iteration 0 loads the expert data file)
            paths, envsteps_this_batch = collect_training_trajectories(
                epoch, self.params['expert_data'], self.model, self.params['batch_size'])

            self.total_envsteps += envsteps_this_batch

            # relabel the collected obs with actions from a provided expert policy
            if relabel_with_expert and epoch >= 1:
                if expert_policy is None:
                    raise ValueError(
                        "relabel_with_expert=True requires an expert_policy argument")
                paths = self.do_relabel_with_expert(expert_policy, paths)

            # Add paths to replay buffer
            self.replay_buffer.add_rollouts(paths)

            # Train agent
            self.train_agent(epoch)

            # logging
            if log_video:
                # FIXME(review): this call cannot succeed as written —
                # `train_video_paths` is not defined here and perform_logging
                # takes (self, itr, paths, eval_policy, train_video_paths).
                # Left unchanged because this class does not carry the
                # attributes that function reads; needs a design decision.
                perform_logging(epoch, paths, train_video_paths)

    def train_agent(self, epoch):
        """Run the configured number of gradient steps on random buffer batches.

        The running loss is written to the notebook-level TensorBoard ``writer``
        every 1000 steps.
        """
        n_steps = self.params['num_agent_train_steps_per_iter']
        batch_size = self.params['train_batch_size']
        log_every = 1000  # mini-batches between TensorBoard points

        running_loss = 0.0
        for train_step in range(n_steps):
            ob_batch, ac_batch, re_batch, next_ob_batch, terminal_batch = \
                self.replay_buffer.sample_random_data(batch_size)

            # zero the parameter gradients
            self.optimizer.zero_grad()

            # forward + backward + optimize
            outputs = self.model(torch.Tensor(ob_batch))
            loss = self.loss_fn(outputs, torch.Tensor(ac_batch))
            loss.backward()
            self.optimizer.step()
            running_loss += loss.item()

            if train_step % log_every == log_every - 1:
                # global step counts gradient steps across epochs, so points
                # from different epochs never share an x-coordinate
                writer.add_scalar('training loss', running_loss / log_every,
                                  epoch * n_steps + train_step)
                running_loss = 0.0

    def do_relabel_with_expert(self, expert_policy, paths):
        """Replace each path's "action" with the expert's action for its observations.

        Paths are modified in place and also returned. Tensor outputs from the
        expert are converted to NumPy arrays so they can be concatenated with
        the rest of the replay buffer.
        """
        print("\nRelabelling collected observations with labels from an expert policy...")

        for path in paths:
            expert_actions = expert_policy.get_action(path["observation"])
            if torch.is_tensor(expert_actions):
                expert_actions = expert_actions.detach().cpu().numpy()
            path["action"] = expert_actions

        return paths


class ExpertPolicy(BasePolicy):
    """Feed-forward expert policy evaluated from a dictionary of stored parameters."""

    def __init__(self, expert_policy_file):
        """
        :param expert_policy_file: loaded parameter dictionary with the keys
            'nonlin_type' and 'GaussianPolicy'
        """
        super().__init__()

        # stored for reference; get_action always applies tanh
        self.nonlin_type = expert_policy_file['nonlin_type']
        self.policy_params = expert_policy_file['GaussianPolicy']
        assert set(self.policy_params.keys()) == {'logstdevs_1_Da', 'hidden', 'obsnorm', 'out'}

    def get_action(self, obs):
        """Return the output-layer activations for ``obs`` as a 2-D tensor.

        ``obs`` may be a single observation or a batch; it is standardised with
        the stored mean / mean-square, passed through the hidden layers in
        sorted-name order with tanh, then through the linear output layer.
        """
        standardizer = self.policy_params['obsnorm']['Standardizer']
        obsnorm_mean = standardizer['mean_1_D']
        obsnorm_meansq = standardizer['meansq_1_D']
        # std = sqrt(E[x^2] - E[x]^2), clamped at 0 before the square root
        obsnorm_stdev = np.sqrt(np.maximum(0, obsnorm_meansq - np.square(obsnorm_mean)))

        # torch.mm needs a matrix, so a single observation becomes one row
        normalized = np.atleast_2d((obs - obsnorm_mean) / (obsnorm_stdev + 1e-6))
        curr_activations_bd = torch.Tensor(normalized)

        layer_params = self.policy_params['hidden']['FeedforwardNet']
        for layer_name in sorted(layer_params.keys()):
            affine = layer_params[layer_name]['AffineLayer']
            W = torch.Tensor(affine['W'])
            b = torch.Tensor(affine['b'])
            curr_activations_bd = torch.tanh(torch.mm(curr_activations_bd, W) + b)

        out_affine = self.policy_params['out']['AffineLayer']
        W = torch.Tensor(out_affine['W'])
        b = torch.Tensor(out_affine['b'])
        output_bo = torch.mm(curr_activations_bd, W) + b

        return output_bo

In [12]:
# Wrap the loaded expert parameters in an ExpertPolicy so they can be queried via get_action.
expertpolicy = ExpertPolicy(expert_policy_file)

In [12]:
# Forward pass of the (untrained) model on the sampled observations.
pred_act = model(torch.Tensor(sam_obs))

In [13]:
# Shape check: one row per sampled observation, one column per action dimension.
pred_act.size()

torch.Size([10, 17])

In [13]:
# NOTE(review): this import would normally live in the imports cell at the top of the notebook.
from torch.utils.tensorboard import SummaryWriter

# TensorBoard writer for `logdir`; Policy.train_agent logs through this module-level name.
writer = SummaryWriter(logdir)

In [56]:
# Record the model graph in TensorBoard, using the sampled observations as example input.
writer.add_graph(model, torch.Tensor(sam_obs))
# NOTE(review): `writer` is closed here but used again by Policy.train_agent — confirm it reopens as expected.
writer.close()

In [16]:
# One manual optimisation step on the sampled batch.
# NOTE: `pred_act` comes from an earlier cell; rerun that cell before rerunning this one.
loss = loss_fn(pred_act, torch.Tensor(sam_act))
optimizer.zero_grad()
loss.backward()
optimizer.step()

In [19]:
# Display the loss from the manual step as a NumPy scalar array.
loss.detach().numpy()

array(1686.8428, dtype=float32)

In [14]:
def Path(obs, image_obs, acs, rewards, next_obs, terminals):
    """
        Take info (separate sequences) from a single rollout
        and return it in a single dictionary of NumPy arrays.

        Every entry is float32 except "image_obs", which is uint8. An empty
        ``image_obs`` (list, tuple or array) becomes an empty array; otherwise
        the frames are stacked along a new leading axis.
    """
    # Test emptiness by length: comparing against [] is only correct for lists
    # (an empty tuple would compare unequal and then fail inside np.stack).
    if len(image_obs) > 0:
        image_obs = np.stack(image_obs, axis=0)
    return {"observation" : np.array(obs, dtype=np.float32),
            "image_obs" : np.array(image_obs, dtype=np.uint8),
            "reward" : np.array(rewards, dtype=np.float32),
            "action" : np.array(acs, dtype=np.float32),
            "next_observation": np.array(next_obs, dtype=np.float32),
            "terminal": np.array(terminals, dtype=np.float32)}

def get_pathlength(path):
    """Return the number of timesteps in a rollout, i.e. the length of its "reward" entry."""
    rewards = path["reward"]
    return len(rewards)

def sample_trajectory(env, model, max_path_length):
    """Roll out ``model`` in ``env`` for one episode and return it as a Path dict.

    The rollout ends when the environment reports ``done`` or after
    ``max_path_length`` steps, whichever comes first.

    :param env: environment providing ``reset()`` and ``step(action)``
    :param model: callable mapping an observation tensor to an action tensor
    :param max_path_length: maximum number of steps in the rollout
    """
    # initialize env for the beginning of a new rollout
    ob = env.reset()

    # per-step records (image_obs is never filled in by this function)
    obs, acs, rewards, next_obs, terminals, image_obs = [], [], [], [], [], []
    steps = 0
    while True:
        # use the most recent ob to decide what to do
        obs.append(ob)
        # inference only: skip building an autograd graph on every env step
        with torch.no_grad():
            ac = model(torch.Tensor(ob)).detach().numpy()
        acs.append(ac)

        # take that action and record results
        ob, rew, done, _ = env.step(ac)

        # record result of taking that action
        steps += 1
        next_obs.append(ob)
        rewards.append(rew)

        # the rollout ends on `done` or when the step budget is used up
        rollout_done = (steps == max_path_length) or done
        terminals.append(rollout_done)

        if rollout_done:
            break

    return Path(obs, image_obs, acs, rewards, next_obs, terminals)


def sample_trajectories(env, policy, min_timesteps_per_batch, max_path_length):
    """
        Keep sampling rollouts until at least min_timesteps_per_batch steps
        have been gathered.

        Each rollout comes from sample_trajectory and is measured with
        get_pathlength. Returns (list of rollouts, total number of steps).
    """
    collected = []
    steps_so_far = 0
    while True:
        if steps_so_far >= min_timesteps_per_batch:
            break
        rollout = sample_trajectory(env, policy, max_path_length)
        collected.append(rollout)
        steps_so_far = steps_so_far + get_pathlength(rollout)

    return collected, steps_so_far



def collect_training_trajectories(itr, load_initial_expertdata, model, batch_size):
    """
    Get the rollouts to train on for one iteration.

    Iteration 0 loads the pickled expert rollouts; every later iteration rolls
    out `model` in the notebook-level `env`, using params['ep_len'] as the
    episode length limit.

    :param itr: iteration index
    :param load_initial_expertdata:  path to expert data pkl file
    :param model:  the current model with which data is collected
    :param batch_size:  the minimum number of transitions to collect
    :return:
        paths: a list of trajectories
        envsteps_this_batch: the number of environment steps in paths
            (0 for the loaded expert data)
    """
    if itr != 0:
        print("\nCollecting data to be used for training...")
        paths, envsteps_this_batch = sample_trajectories(env, model, batch_size, params['ep_len'])
        return paths, envsteps_this_batch

    # First iteration: bootstrap from the expert rollouts on disk.
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # load expert data that comes from a trusted source.
    print(load_initial_expertdata)
    with open(load_initial_expertdata, "rb") as expert_file:
        loaded_paths = pickle.load(expert_file)
    return loaded_paths, 0

In [16]:
# itr=1 skips the expert-data branch and rolls out the current model; the result is a (paths, envsteps) tuple.
trajectories = collect_training_trajectories(1, params['expert_data'], model, params['batch_size'])


Collecting data to be used for training...


In [17]:
# Observation array shape of the second collected rollout (trajectories[0] is the list of paths).
trajectories[0][1]['observation'].shape

(27, 376)

In [15]:
# Build the learner from the model, environment, settings, loss and optimiser defined above.
policy = Policy(model, env, params, loss_fn, optimizer)

In [16]:
# Run 10 collect/train iterations; iteration 0 trains on the loaded expert data.
policy.training_loop(10)

cs285/expert_data/expert_data_Humanoid-v2.pkl

Collecting data to be used for training...

Collecting data to be used for training...

Collecting data to be used for training...

Collecting data to be used for training...

Collecting data to be used for training...

Collecting data to be used for training...

Collecting data to be used for training...

Collecting data to be used for training...

Collecting data to be used for training...


In [17]:
# Open TensorBoard inline on the run's log directory.
%tensorboard --logdir $logdir

In [18]:
# Render the environment's current state (the recorded output shows a glfw window being created).
env.render()

Creating window glfw
