In [1]:
import os
import time
import numpy as np
import gym
from collections import OrderedDict
import pickle
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

from torch.utils.tensorboard import SummaryWriter

%load_ext tensorboard
%matplotlib notebook

In [2]:
# ----------------------------------------------------------------
# Experiment configuration: every tunable value lives in this cell.
# ----------------------------------------------------------------

params = dict(
    env_name="CartPole-v0",
    exp_name="sb_no_rtg_dsa",
    num_agent_train_steps_per_iter=1,
    n_iter=10,
    reward_to_go=True,
    nn_baseline=True,
    dont_standardize_advantages=True,
    batch_size=1000,
    eval_batch_size=500,
    train_batch_size=1000,
    discount=1.0,
    n_layers=2,
    size=64,
    learning_rate=5e-3,
    video_log_freq=5,
    scalar_log_freq=1,
    use_gpu=True,
    which_gpu=0,
    max_replay_buffer_size=1000000,
    seed=1,
)

# The agent only needs a subset of the configuration; copy those entries into
# their own dict and nest it under params so both views stay reachable.
agent_params = {
    key: params[key]
    for key in ("n_layers", "size", "learning_rate", "max_replay_buffer_size")
}
params["agent_params"] = agent_params

# Limits for rollout videos written to tensorboard
MAX_NVIDEO, MAX_VIDEO_LEN = 2, 40

# Logging switches
log_video = True
log_metric = True


In [3]:
# Seed every random number generator this notebook relies on, then build the
# gym environment. Seeding only the env would leave the torch RNG (network
# initialisation, action sampling) and the numpy RNG (buffer shuffling)
# unseeded, so two runs with the same params['seed'] would still differ.
seed = params['seed']
np.random.seed(seed)
torch.manual_seed(seed)

env = gym.make(params['env_name'])
env.seed(seed)

# Maximum length for episodes
params['ep_len'] = env.spec.max_episode_steps

# Is the action space discrete (as opposed to continuous)?
discrete = isinstance(env.action_space, gym.spaces.Discrete)
params['agent_params']['discrete'] = discrete

# Observation and action sizes: number of actions for a discrete space,
# action vector length otherwise.
ob_dim = env.observation_space.shape[0]
ac_dim = env.action_space.n if discrete else env.action_space.shape[0]
params['agent_params']['ac_dim'] = ac_dim
params['agent_params']['ob_dim'] = ob_dim

# Frame rate advertised by the wrapped environment's metadata
fps = env.env.metadata['video.frames_per_second']

In [4]:
# Replay buffer for (state, action, reward, next state, terminal) data.
# Rollouts are kept in a plain list; a collections.deque would be a possible
# alternative.

class ReplayBuffer:
    """Store rollouts both as whole paths and as flat per-transition arrays.

    Each rollout ("path") is a dict with the keys ``"observation"``,
    ``"action"``, ``"next_observation"``, ``"terminal"`` and ``"reward"``,
    each mapping to an array with one entry per step.

    Attributes:
        max_size: number of most recent entries kept in each flat array.
        paths: list of every rollout dict added so far (never truncated).
        obs, acs, next_obs, terminals, concatenated_rews: flat arrays built
            by concatenating the matching field of every rollout; ``None``
            until the first rollout is added.
        unconcatenated_rews: list with one reward array per rollout;
            ``None`` until the first rollout is added.
    """

    def __init__(self, max_size=1000000):
        self.max_size = max_size
        self.paths = []
        self.obs = None
        self.acs = None
        self.concatenated_rews = None
        self.unconcatenated_rews = None
        self.next_obs = None
        self.terminals = None

    def __len__(self):
        """Return the number of transitions held in the flat arrays."""
        # Compare against None explicitly: the truth value of a multi-element
        # ndarray is ambiguous and raises ValueError.
        return 0 if self.obs is None else self.obs.shape[0]

    def _append_recent(self, existing, new):
        """Concatenate ``new`` after ``existing`` and keep the last ``max_size`` rows."""
        merged = new if existing is None else np.concatenate([existing, new])
        return merged[-self.max_size:]

    def add_rollouts(self, paths):
        """Add rollout dicts to the buffer and refresh the flat arrays.

        Args:
            paths: iterable of rollout dicts. An empty iterable is a no-op
                (np.concatenate cannot handle an empty list).
        """
        paths = list(paths)
        if not paths:
            return

        # add new rollouts into our list of rollouts
        self.paths.extend(paths)

        # convert new rollouts into their component arrays, and append them onto our arrays
        (observations, actions, next_observations, terminals,
         concatenated_rews, unconcatenated_rews) = self.convert_listofrollouts(paths)

        self.obs = self._append_recent(self.obs, observations)
        self.acs = self._append_recent(self.acs, actions)
        self.next_obs = self._append_recent(self.next_obs, next_observations)
        self.terminals = self._append_recent(self.terminals, terminals)
        self.concatenated_rews = self._append_recent(self.concatenated_rews, concatenated_rews)

        # Per-rollout rewards stay a list of arrays; apply the same
        # "keep the last max_size entries" rule on every call, not just the first.
        previous_rews = self.unconcatenated_rews or []
        self.unconcatenated_rews = (previous_rews + unconcatenated_rews)[-self.max_size:]

    def convert_listofrollouts(self, paths):
        """
            Take a list of rollout dictionaries
            and return separate arrays,
            where each array is a concatenation of that array from across the rollouts.

            Returns (observations, actions, next_observations, terminals,
            concatenated_rewards, unconcatenated_rewards); the last item is a
            list holding one reward array per rollout.
        """
        observations = np.concatenate([path["observation"] for path in paths])
        actions = np.concatenate([path["action"] for path in paths])
        next_observations = np.concatenate([path["next_observation"] for path in paths])
        terminals = np.concatenate([path["terminal"] for path in paths])
        concatenated_rewards = np.concatenate([path["reward"] for path in paths])
        unconcatenated_rewards = [path["reward"] for path in paths]
        return observations, actions, next_observations, terminals, concatenated_rewards, unconcatenated_rewards

    def sample_random_rollouts(self, num_rollouts):
        """Return up to ``num_rollouts`` distinct rollouts chosen at random, as a list."""
        rand_indices = np.random.permutation(len(self.paths))[:num_rollouts]
        # self.paths is a Python list, so it cannot be indexed with an index array.
        return [self.paths[index] for index in rand_indices]

    def sample_recent_rollouts(self, num_rollouts=1):
        """Return the ``num_rollouts`` most recently added rollouts."""
        return self.paths[-num_rollouts:]

    ########################################
    ########################################

    def sample_random_data(self, batch_size):
        """Return up to ``batch_size`` random transitions as
        (obs, acs, rewards, next_obs, terminals)."""
        assert self.obs.shape[0] == self.acs.shape[0] == self.concatenated_rews.shape[0] == self.next_obs.shape[0] == self.terminals.shape[0]
        rand_indices = np.random.permutation(self.obs.shape[0])[:batch_size]
        return self.obs[rand_indices], self.acs[rand_indices], self.concatenated_rews[rand_indices], self.next_obs[rand_indices], self.terminals[rand_indices]

    def sample_recent_data(self, batch_size=1, concat_rew=True):
        """Return the most recent data as (obs, acs, rewards, next_obs, terminals).

        Args:
            batch_size: number of transitions wanted.
            concat_rew: when True, slice exactly the last ``batch_size``
                transitions from the flat arrays. When False, return whole
                rollouts, newest first, until they cover at least
                ``batch_size`` transitions (or the buffer runs out); rewards
                are then a list with one array per returned rollout.

        Raises:
            ValueError: if ``concat_rew`` is False and no rollout is selected
                (empty buffer or non-positive ``batch_size``).
        """
        if concat_rew:
            return self.obs[-batch_size:], self.acs[-batch_size:], self.concatenated_rews[-batch_size:], self.next_obs[-batch_size:], self.terminals[-batch_size:]

        num_recent_rollouts_to_return = 0
        num_datapoints_so_far = 0
        # Iterating over the list bounds the walk by the number of stored
        # rollouts, so a too-large batch_size returns everything instead of
        # indexing past the start of self.paths.
        for recent_rollout in reversed(self.paths):
            if num_datapoints_so_far >= batch_size:
                break
            num_recent_rollouts_to_return += 1
            num_datapoints_so_far += len(recent_rollout["reward"])

        if num_recent_rollouts_to_return == 0:
            raise ValueError("sample_recent_data selected no rollouts: "
                             "the buffer is empty or batch_size is not positive")

        rollouts_to_return = self.paths[-num_recent_rollouts_to_return:]
        observations, actions, next_observations, terminals, concatenated_rews, unconcatenated_rews = self.convert_listofrollouts(rollouts_to_return)
        return observations, actions, unconcatenated_rews, next_observations, terminals

In [5]:
# Module-level buffer used by the exploratory cells of this notebook.
replay_buffer = ReplayBuffer(max_size=agent_params['max_replay_buffer_size'])

In [6]:
# Pull the network hyper-parameters out of agent_params into plain names.
ac_dim, ob_dim, n_layers, size, discrete, learning_rate = (
    agent_params[key]
    for key in ('ac_dim', 'ob_dim', 'n_layers', 'size', 'discrete', 'learning_rate')
)


In [7]:
class Net(torch.nn.Module):
    """Fully connected network: two hidden ReLU layers and a linear output layer.

    Args:
        ob_dim: number of input features.
        size: width of both hidden layers.
        ac_dim: number of output units.
    """

    def __init__(self, ob_dim, size, ac_dim):
        super().__init__()
        self.fc1 = torch.nn.Linear(ob_dim, size)
        self.fc2 = torch.nn.Linear(size, size)
        self.fc3 = torch.nn.Linear(size, ac_dim)

    def forward(self, x):
        """Return the output layer's raw values for ``x`` (no softmax is applied)."""
        hidden = x
        for hidden_layer in (self.fc1, self.fc2):
            hidden = F.relu(hidden_layer(hidden))
        return self.fc3(hidden)
    
# Sum-reduced MSE loss object; none of the visible training code calls it.
loss_fn = torch.nn.MSELoss(reduction='sum')

# Policy network sized from the environment, plus a module-level Adam optimizer
# over its parameters.
model = Net(ob_dim=ob_dim, size=size, ac_dim=ac_dim)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [8]:
# Smoke test: run three consecutive observations (after reset, after action 1,
# after action 0) through the network as one batch.
model(torch.Tensor([env.reset(), env.step(1)[0], env.step(0)[0]]))

tensor([[-0.0609,  0.1366],
        [-0.0638,  0.1331],
        [-0.0605,  0.1363]], grad_fn=<AddmmBackward>)

In [40]:
# Index of the largest network output for a single freshly reset observation.
model(torch.Tensor([env.reset()])).argmax()

tensor(0)

In [8]:
##################
# Policy
##################

class BasePolicy:
    """Minimal interface that every policy has to provide."""

    def get_action(self, obs):
        """Return the policy's output for ``obs``; subclasses must override this."""
        raise NotImplementedError


class Policy(BasePolicy):
    """Policy-gradient learner built around a categorical (discrete-action) network.

    Args:
        model: torch module mapping observations to per-action logits.
        env: environment handle, stored on the instance.
        params: configuration dict; must contain ``max_replay_buffer_size``
            and, for training, ``batch_size``, ``train_batch_size`` and
            ``num_agent_train_steps_per_iter``.
        gamma: discount factor used when turning rewards into Q-values.
        learning_rate: step size of the Adam optimizer.
    """

    def __init__(self, model, env, params, gamma, learning_rate):
        super().__init__()
        self.modelx = model
        self.optimizer = torch.optim.Adam(self.modelx.parameters(), lr=learning_rate)
        self.env = env
        self.gamma = gamma
        # Keep the configuration that was passed in so the training methods do
        # not have to rely on a global `params` being defined.
        self.params = params
        # NOTE(review): these three switches are hard-coded and do not read the
        # similarly named entries of `params` - confirm that is intended.
        self.nn_baseline = False
        self.reward_to_go = False
        self.standardize_advantages = True
        self.replay_buffer = ReplayBuffer(params['max_replay_buffer_size'])

    def get_action(self, obs):
        """Return the network output (raw logits) for ``obs``."""
        return self.modelx(torch.Tensor(obs))

    def training_loop(self, epochs=10, log_video=False, relabel_with_expert=False):
        """Alternate between collecting rollouts and one round of training.

        Args:
            epochs: number of collect/train iterations.
            log_video: accepted for interface compatibility; currently unused.
            relabel_with_expert: accepted for interface compatibility;
                currently unused.
        """
        logdir = "./logs/demo_" + time.strftime("%d-%m-%Y_%H-%M-%S")
        writer = SummaryWriter(logdir)
        try:
            for epoch in range(epochs):
                # Collect trajectories with the current network
                paths, _ = collect_training_trajectories(
                    epoch, self.modelx, self.params['batch_size'])

                # Add paths to replay buffer
                self.replay_buffer.add_rollouts(paths)

                # Train agent
                self.train_agent(epoch, writer)
        finally:
            # Flush pending events and release the log file even if training fails.
            writer.close()

    def train_agent(self, epoch, writer):
        """Run the configured number of policy-gradient steps on recent data.

        Args:
            epoch: value used as the step of the logged scalar.
            writer: object with an ``add_scalar(tag, value, step)`` method.
        """
        for _ in range(self.params['num_agent_train_steps_per_iter']):
            # Sample whole recent rollouts; rewards come back as one array per rollout.
            ob_batch, ac_batch, re_batch, next_ob_batch, terminal_batch = self.replay_buffer.sample_recent_data(
                self.params['train_batch_size'], False)

            q_values = self.calculate_q_vals(re_batch)
            advantage_values = self.estimate_advantage(ob_batch, q_values)
            # Built from numpy, so the advantages carry no gradient and act as
            # constant weights in the loss.
            adv_n = torch.as_tensor(advantage_values, dtype=torch.float32)

            # zero the parameter gradients
            self.optimizer.zero_grad()

            # forward + backward + optimize
            logits = self.modelx(torch.as_tensor(ob_batch, dtype=torch.float32))
            model_dist = torch.distributions.Categorical(logits=logits)
            # Actions are stored as floats; Categorical works on class indices.
            logprob_n = model_dist.log_prob(torch.as_tensor(ac_batch, dtype=torch.long))
            loss = (-logprob_n * adv_n).sum()

            loss.backward()
            self.optimizer.step()

            # Mean undiscounted return over the rollouts in this batch
            train_returns = np.mean([reward.sum() for reward in re_batch])
            writer.add_scalar('Average Rewards', train_returns, epoch)

    def calculate_q_vals(self, rews_list):
        """Turn a list of per-rollout reward arrays into one flat array of Q estimates."""
        # Case 1: trajectory-based PG
        if not self.reward_to_go:
            q_values = np.concatenate([self._discounted_return(r) for r in rews_list])

        # Case 2: reward-to-go PG
        else:
            q_values = np.concatenate([self._discounted_cumsum(r) for r in rews_list])

        return q_values

    def estimate_advantage(self, obs, q_values):
        """Return advantages for ``q_values``, optionally baseline-corrected and standardized.

        Raises:
            NotImplementedError: if ``nn_baseline`` is enabled but the instance
                has no ``actor`` providing ``run_baseline_prediction``.
        """
        if self.nn_baseline:
            actor = getattr(self, "actor", None)
            if actor is None:
                raise NotImplementedError(
                    "nn_baseline=True needs an `actor` attribute with a "
                    "run_baseline_prediction(obs) method")
            b_n_unnormalized = actor.run_baseline_prediction(obs)
            # Rescale the baseline output to the mean/std of the Q-values
            b_n = b_n_unnormalized * np.std(q_values) + np.mean(q_values)
            adv_n = q_values - b_n

        # Else, just set the advantage to [Q]
        else:
            adv_n = q_values.copy()

        # Normalize the resulting advantages
        if self.standardize_advantages:
            adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8)

        return adv_n

    def _discounted_return(self, rewards):
        """Return an array of length T in which every entry is the full
        discounted return sum_{t'=0}^{T-1} gamma^(t') * r_{t'}."""
        rewards = np.asarray(rewards)
        num_steps = len(rewards)

        # gamma^(t') for t' = 0 .. T-1
        discounts = np.power(self.gamma, np.arange(num_steps))
        sum_of_discounted_rewards = np.sum(discounts * rewards)

        # One copy of the scalar per step of the rollout
        return sum_of_discounted_rewards * np.ones(num_steps)

    def _discounted_cumsum(self, rewards):
        """Return an array of length T whose entry t is the discounted
        reward-to-go sum_{t'=t}^{T-1} gamma^(t'-t) * r_{t'}."""
        num_steps = len(rewards)
        list_of_discounted_cumsums = np.zeros(num_steps, dtype=np.float64)

        # Walk backwards using G_t = r_t + gamma * G_{t+1}: one pass instead of
        # re-summing the tail of the rollout for every start index.
        running_sum = 0.0
        for time_index in range(num_steps - 1, -1, -1):
            # float() keeps the accumulation in double precision even when the
            # rewards are stored as float32.
            running_sum = float(rewards[time_index]) + self.gamma * running_sum
            list_of_discounted_cumsums[time_index] = running_sum
        return list_of_discounted_cumsums

In [9]:
def Path(obs, image_obs, acs, rewards, next_obs, terminals):
    """
        Take info (separate arrays) from a single rollout
        and return it in a single dictionary.

        Every field is converted to a float32 array, except "image_obs",
        which becomes uint8 (and is stacked along a new first axis when any
        frames were supplied).
    """
    # Use the length rather than `image_obs != []`: the length check works the
    # same for lists and arrays, whereas comparing an ndarray with [] does not
    # give a plain boolean.
    if len(image_obs) > 0:
        image_obs = np.stack(image_obs, axis=0)
    return {"observation" : np.array(obs, dtype=np.float32),
            "image_obs" : np.array(image_obs, dtype=np.uint8),
            "reward" : np.array(rewards, dtype=np.float32),
            "action" : np.array(acs, dtype=np.float32),
            "next_observation": np.array(next_obs, dtype=np.float32),
            "terminal": np.array(terminals, dtype=np.float32)}

def get_pathlength(path):
    """Return the number of steps in a rollout, measured by its "reward" entry."""
    step_rewards = path["reward"]
    return len(step_rewards)

def sample_trajectory(env, modelx, max_path_length):
    """Roll out one episode and return it as a Path dictionary.

    At every step the current observation is fed to ``modelx``, the output is
    treated as the logits of a categorical distribution, and one action is
    sampled from it. The rollout stops when the environment reports done or
    after ``max_path_length`` steps.
    """
    obs, acs, rewards, next_obs, terminals, image_obs = [], [], [], [], [], []

    # start a fresh episode
    current_ob = env.reset()
    num_steps = 0
    finished = False

    while not finished:
        # the observation the action is chosen from
        obs.append(current_ob)

        # sampling needs no gradient tracking
        with torch.no_grad():
            logits = modelx(torch.Tensor(current_ob))
            action_dist = torch.distributions.Categorical(logits=logits)
            action = action_dist.sample().detach().numpy()
        acs.append(action)

        # advance the environment and record the outcome
        current_ob, reward, env_done, _ = env.step(action)
        num_steps += 1
        next_obs.append(current_ob)
        rewards.append(reward)

        # the rollout ends on the step limit or when the environment says so
        finished = (num_steps == max_path_length) or env_done
        terminals.append(finished)

    return Path(obs, image_obs, acs, rewards, next_obs, terminals)


def sample_trajectories(env, policy, min_timesteps_per_batch, max_path_length):
    """
        Keep sampling rollouts until together they contain at least
        min_timesteps_per_batch steps.

        `policy` is forwarded unchanged to sample_trajectory as its model
        argument. Returns (list of rollouts, total number of steps collected).
    """
    collected_paths = []
    collected_steps = 0
    while collected_steps < min_timesteps_per_batch:
        rollout = sample_trajectory(env, policy, max_path_length)
        collected_steps += get_pathlength(rollout)
        collected_paths.append(rollout)

    return collected_paths, collected_steps



def collect_training_trajectories(itr, model, batch_size):
    """Collect at least `batch_size` steps of rollouts with `model`.

    Uses the module-level `env` and the episode length stored in
    params['ep_len']; `itr` is accepted but not used.
    Returns (paths, number of environment steps collected).
    """
    max_episode_length = params['ep_len']
    return sample_trajectories(env, model, batch_size, max_episode_length)

In [15]:
# Fetch the newest whole rollouts covering at least 20 transitions from the
# module-level buffer; with concat_rew=False, re_batch is a list with one reward
# array per rollout.
# NOTE(review): the buffer must already contain rollouts; the add_rollouts call
# appears in a later cell of this notebook, so this depends on run order.
ob_batch, ac_batch, re_batch, next_ob_batch, terminal_batch = replay_buffer.sample_recent_data(20, False)

In [38]:
# Scratch check of the policy-gradient ingredients on the sampled batch.
#sam_obs = np.array([-0.03387007,  0.1474191 ,  0.03877906, -0.28876239])
# Network outputs for every observation, used as categorical logits
output = model(torch.Tensor(ob_batch))
m = torch.distributions.Categorical(logits=output)
# One freshly sampled action per observation
sam_ac = m.sample()
# Log-probability of the actions stored in the batch
logprob_n = m.log_prob(torch.Tensor(ac_batch))

In [17]:
output.shape

torch.Size([27, 2])

In [18]:
logprob_n.shape

torch.Size([27])

In [19]:
sam_ac.shape

torch.Size([27])

In [46]:
# Uses the module-level helpers calculate_q_vals / estimate_advantage (not the
# Policy methods). Their definitions appear in a later cell of this notebook,
# so that cell has to be executed before this one.
q_values = calculate_q_vals(re_batch)
advantage_values = estimate_advantage(ob_batch, q_values)
# Advantages as a torch tensor so they can be multiplied with logprob_n
adv_n = torch.Tensor(advantage_values)

In [47]:
adv_n

tensor([9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9.,
        9., 9., 9., 9., 9., 9., 9., 9., 9.])

In [48]:
(logprob_n * adv_n)

tensor([-5.9170, -6.0416, -6.1232, -6.0825, -5.8365, -5.5870, -5.3580, -5.1370,
        -4.9167, -5.8745, -5.9961, -6.0797, -6.0588, -5.8322, -5.5854, -5.3609,
        -5.1358, -4.9160, -5.8945, -6.0259, -6.1043, -6.0390, -5.8074, -5.5726,
        -5.3471, -5.1177, -4.8906], grad_fn=<MulBackward0>)

In [37]:
m.log_prob(torch.Tensor(ac_batch))

tensor([-0.6574, -0.6713, -0.6804, -0.6758, -0.6485, -0.6208, -0.5953, -0.5708,
        -0.5463, -0.6527, -0.6662, -0.6755, -0.6732, -0.6480, -0.6206, -0.5957,
        -0.5706, -0.5462, -0.6549, -0.6695, -0.6783, -0.6710, -0.6453, -0.6192,
        -0.5941, -0.5686, -0.5434], grad_fn=<SqueezeBackward1>)

In [29]:
model(torch.Tensor([ob_batch[1]])).argmax().numpy()

array(1)

In [34]:
            
#want to learn which tensor is backprop and which does not
#loss = loss_fn(logprob_n, torch.Tensor(advantage_values))

#loss
#env.reset()
#next_state, reward, _, _ = env.step(sam_ac)

In [40]:
m.sample()

tensor([1, 1, 0,  ..., 1, 0, 0])

In [45]:
def calculate_q_vals(rews_list):
    """Trajectory-based Q estimates: compute the per-step return array of every
    rollout in `rews_list` and join them into one flat array."""
    per_rollout_returns = [_discounted_return(rollout_rewards) for rollout_rewards in rews_list]
    return np.concatenate(per_rollout_returns)

def estimate_advantage(obs, q_values):
    """Return a copy of `q_values` as the advantages.

    No baseline is subtracted and no standardization is applied; `obs` is
    accepted but not used.
    """
    advantages = q_values.copy()
    return advantages

def _discounted_return(rewards):
    gamma = 1.0
    # 1) create a list of indices (t'): from 0 to T-1
    indices = np.arange(len(rewards))

    # 2) create a list where the entry at each index (t') is gamma^(t')
    discounts = np.power(gamma, indices)

    # 3) create a list where the entry at each index (t') is gamma^(t') * r_{t'}
    discounted_rewards = discounts * np.array(rewards)

    # 4) calculate a scalar: sum_{t'=0}^{T-1} gamma^(t') * r_{t'}
    sum_of_discounted_rewards = np.sum(discounted_rewards)

    # 5) create a list of length T-1, where each entry t contains that scalar
    list_of_discounted_returns = sum_of_discounted_rewards * np.ones(indices.shape[0])

    return list_of_discounted_returns

In [15]:
# Collect one batch of rollouts (at least params['batch_size'] steps) with the
# current network; the step count is discarded.
sam_paths, _ = collect_training_trajectories(1, model, params['batch_size'])

In [16]:
len(sam_paths)

16

In [17]:
# Store the sampled rollouts in the module-level buffer used by the scratch cells.
replay_buffer.add_rollouts(sam_paths)

In [18]:
replay_buffer.sample_recent_data(10, False)

(array([[-0.02140454,  0.0325623 ,  0.01619541,  0.04024277],
        [-0.02075329, -0.16278811,  0.01700026,  0.33799118],
        [-0.02400906,  0.03208786,  0.02376008,  0.05071731],
        [-0.0233673 ,  0.2268612 ,  0.02477443, -0.23437543],
        [-0.01883007,  0.03139419,  0.02008692,  0.06601804],
        [-0.01820219, -0.1640099 ,  0.02140728,  0.3649702 ],
        [-0.02148239,  0.03080138,  0.02870669,  0.07911351],
        [-0.02086636,  0.2255003 ,  0.03028896, -0.20437594],
        [-0.01635635,  0.02995859,  0.02620144,  0.09770569],
        [-0.01575718, -0.1655289 ,  0.02815555,  0.39853862],
        [-0.01906776,  0.02918255,  0.03612632,  0.11486384],
        [-0.01848411,  0.22376874,  0.0384236 , -0.16620617],
        [-0.01400874,  0.02811844,  0.03509948,  0.13834643],
        [-0.01344637, -0.1674882 ,  0.03786641,  0.4418928 ],
        [-0.01679613,  0.02707802,  0.04670426,  0.16138273],
        [-0.01625457,  0.22150132,  0.04993192, -0.11620813],
        

In [10]:
# Build the learner; the fourth argument is the discount factor gamma.
# NOTE(review): gamma is 0.9 here while params['discount'] is 1.0 - confirm
# which value is intended.
policy = Policy(model, env, params, 0.9, learning_rate)

In [11]:
# Run 100 collect/train iterations.
# NOTE(review): params['n_iter'] is 10 and is not used here - confirm which
# iteration count is intended.
policy.training_loop(100)