In [29]:
import math
import random 

import gym
import numpy as np
        
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Normal,Beta
from sklearn import preprocessing

from IPython.display import clear_output
import matplotlib.pyplot as plt
%matplotlib inline

# Single seed value passed to every RNG-seeding call in this notebook
# (np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all, random.seed).
seed_number = 2019

# Use CUDA

In [28]:
# Seed every random number generator the notebook draws from, so that a
# fresh kernel reproduces the same rollouts.
random.seed(seed_number)         # Python `random`: used by RandomAgent and the sub-goal sampling
np.random.seed(seed_number)      # NumPy: used by BitFlip.reset and the PPO mini-batch sampling
torch.manual_seed(seed_number)   # PyTorch CPU generator

# Ask cuDNN for deterministic kernels and disable its auto-tuner, which may
# otherwise pick different algorithms from run to run.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

use_cuda = torch.cuda.is_available()

if use_cuda:
    torch.cuda.manual_seed_all(seed_number)
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Neural Network

In [27]:
def init_weights(m):
    """Initialise an ``nn.Linear`` layer in place; any other module is left untouched.

    Weights are drawn from the uniform distribution U(0, 1) and every bias
    entry is set to 0.1.  The function has the one-argument shape expected
    by ``nn.Module.apply``.

    Args:
        m: a module visited by ``nn.Module.apply``.
    """
    if isinstance(m, nn.Linear):
        # Alternative that was tried previously: nn.init.normal_(m.weight, mean=0., std=0.1)
        # `nn.init.uniform` (no trailing underscore) is a deprecated alias;
        # `uniform_` is the supported in-place initialiser.
        nn.init.uniform_(m.weight, a=0, b=1)
        nn.init.constant_(m.bias, 0.1)
        

class ActorCritic(nn.Module):
    """Actor-critic network with two independent one-hidden-layer MLP heads.

    * ``critic``: Linear(num_inputs, hidden_size) -> ReLU -> Linear(hidden_size, 1)
    * ``actor``:  Linear(num_inputs, hidden_size) -> ReLU -> Linear(hidden_size, num_outputs)

    All linear layers are initialised through ``init_weights``.

    Args:
        num_inputs: size of the last dimension of the input tensor.
        num_outputs: number of discrete actions (size of the actor output).
        hidden_size: width of the hidden layer in both heads.
        std: accepted for interface compatibility; not used by this class.
    """
    def __init__(self, num_inputs, num_outputs, hidden_size, std=0.0):
        super(ActorCritic, self).__init__()

        self.critic = nn.Sequential(
            nn.Linear(num_inputs, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1)
        )

        self.actor = nn.Sequential(
            nn.Linear(num_inputs, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, num_outputs),
        )
        self.apply(init_weights)

    def forward(self, x):
        """Return ``(log_probs, value)`` for input ``x``.

        ``log_probs`` is the log-softmax of the actor logits taken over the
        last (action) dimension; ``value`` is the raw critic output.
        """
        value = self.critic(x)
        # Explicit dim=-1 matches what the implicit choice resolved to for
        # 1-D and 2-D inputs, without the deprecation warning and without
        # building a new LogSoftmax module on every call.
        act = F.log_softmax(self.actor(x), dim=-1)
        return act, value

In [26]:
def plot(frame_idx, rewards):
    """Redraw the reward curve in the current output cell.

    Clears the previous cell output, then plots ``rewards`` in the first of
    three horizontal subplot slots of a 20x5 figure.  The title shows
    ``frame_idx`` and the most recent reward.

    Args:
        frame_idx: value displayed as the frame counter in the title.
        rewards: non-empty sequence of rewards to plot.
    """
    clear_output(wait=True)

    plt.figure(figsize=(20, 5))
    plt.subplot(1, 3, 1)

    latest_reward = rewards[-1]
    heading = 'frame %s. reward: %s' % (frame_idx, latest_reward)
    plt.title(heading)

    plt.plot(rewards)
    plt.show()
    
def test_env(model, goal, vis=False):
    """Run one greedy evaluation episode on the global ``env`` and return its total reward.

    The policy input is the current state concatenated with ``goal``; the
    action taken is the arg-max of the actor's log-probabilities.

    NOTE(review): the goal drawn by ``env.reset()`` is discarded here, while
    ``env.step`` still computes reward and termination against the
    environment's own goal, not against ``goal``. Confirm this is intended.

    Args:
        model: callable returning ``(log_probs, value)`` for a batched input.
        goal: 1-D array appended to the state before it is fed to ``model``.
        vis: when true, call ``env.render()`` before every step.

    Returns:
        Sum of the rewards collected until the environment reports ``done``.
    """
    state, _ = env.reset()  # reset() returns (state, goal)
    done = False
    total_reward = 0
    # Evaluation needs no gradients; skipping the autograd graph saves time and memory.
    with torch.no_grad():
        while not done:
            if vis:
                env.render()
            state_goal = np.concatenate((state, goal), 0)
            state_goal = torch.FloatTensor(state_goal).unsqueeze(0).to(device)
            act, _ = model(state_goal)
            # arg-max directly on the log-probabilities: exponentiating first
            # can underflow to all zeros and make the arg-max meaningless.
            action = np.argmax(act.detach().cpu().numpy()[0])
            state, reward, done, _ = env.step(action)
            total_reward += reward
    return total_reward

# Hindsight GAE

In [24]:
def hindsight_gae(next_value, rewards, masks, values, gamma=0.95, lamda=0.95):
    """Compute GAE-based return targets for a batch of episodes.

    For every episode the recursion runs backward in time:

        delta_t = r_t + gamma * V_{t+1} * mask_t - V_t
        gae_t   = delta_t + gamma * lamda * mask_t * gae_{t+1}
        ret_t   = gae_t + V_t

    Args:
        next_value: indexable by episode; ``next_value[e]`` is a 1-element
            tensor holding the bootstrap value that follows the last step
            of episode ``e``.
        rewards: list (per episode) of per-step rewards.
        masks: list (per episode) of per-step masks (0 where the episode
            terminated at that step, 1 otherwise).
        values: list (per episode) of 1-D tensors of per-step value
            estimates.  Not modified.
        gamma: discount factor.
        lamda: GAE smoothing coefficient.

    Returns:
        A flat list of 0-d tensors, one per step, ordered episode by episode
        and chronologically within each episode, so that it lines up with
        ``torch.cat(values)``.
    """
    returns = []
    for ep_idx in range(len(rewards)):
        # Append the bootstrap value locally instead of mutating the caller's list.
        ep_values = torch.cat((values[ep_idx], next_value[ep_idx]), 0)
        ep_rewards = rewards[ep_idx]
        ep_masks = masks[ep_idx]

        gae = 0
        ep_returns = []
        # GAE accumulates *future* TD errors, so walk the episode backward.
        for step in reversed(range(len(ep_rewards))):
            delta = ep_rewards[step] - ep_values[step] + gamma * ep_values[step + 1] * ep_masks[step]
            gae = delta + gamma * lamda * ep_masks[step] * gae
            ep_returns.append(gae + ep_values[step])
        ep_returns.reverse()  # back to chronological order
        returns.extend(ep_returns)
    return returns

# Importance Hindsight GAE

In [25]:
def is_hindsight_gae(rewards, current_logprobs, desired_logprobs, masks, values, gamma = 0.5, lamda = 0.95):
    """Importance-weighted variant of the hindsight GAE return targets.

    For each episode two quantities that do not depend on the current step
    are computed once:

    * ``is_weight_ratio``: product over steps 1..T-1 of
      ``clip(exp(current_logprob - desired_logprob), -1, 1)``;
    * ``discounted_diff``: sum over steps 1..T-1 of
      ``(gamma**(t+1) - gamma**t) * r_t``.

    Then, walking the episode backward in time:

        temp  = discounted_diff - gamma**(t+1) * r_t
        delta = r_t + is_weight_ratio * temp
        gae   = delta + gamma * lamda * mask_t * gae
        ret_t = gae + V_t

    NOTE(review): ``exp(...)`` is always positive, so the lower clip bound of
    -1 never binds; only the upper bound of 1 has an effect.

    Args:
        rewards: list (per episode) of per-step rewards.
        current_logprobs: per episode, per step log-probabilities under the
            goal the rollout was conditioned on.
        desired_logprobs: per episode, per step log-probabilities under the
            desired goal.
        masks: list (per episode) of per-step masks.
        values: per episode, per step value estimates.
        gamma: discount factor.
        lamda: smoothing coefficient of the recursion.

    Returns:
        A flat list of tensors on the global ``device``, one per step,
        ordered episode by episode and chronologically within each episode.
    """
    lambda_ret = 1
    returns = []
    for ep_idx in range(len(rewards)):
        ep_rewards = rewards[ep_idx]
        n_steps = len(ep_rewards)

        # Both accumulators below are independent of the current step, so
        # they are computed once per episode instead of once per step.
        is_weight_ratio = 1
        for t in range(1, n_steps):
            ratio = np.exp(current_logprobs[ep_idx][t] - desired_logprobs[ep_idx][t])
            clipped_ratio = lambda_ret * np.clip(ratio, a_min = -1, a_max = 1)
            is_weight_ratio = is_weight_ratio * clipped_ratio

        discounted_diff = 0
        for t in range(1, n_steps):
            discounted_diff = discounted_diff + ((gamma ** (t + 1)) * ep_rewards[t] -
                                                 (gamma ** t) * ep_rewards[t])

        gae = 0  # named `gae` so it does not shadow the hindsight_gae function
        ep_returns = []
        # Accumulate from the last step to the first, as in standard GAE.
        for step in reversed(range(n_steps)):
            temp = discounted_diff - (gamma ** (step + 1)) * ep_rewards[step]
            delta = ep_rewards[step] + is_weight_ratio * temp
            gae = delta + gamma * lamda * masks[ep_idx][step] * gae
            ep_returns.append(torch.FloatTensor([gae]).to(device) + values[ep_idx][step])
        ep_returns.reverse()  # chronological order, aligned with `values`
        returns.extend(ep_returns)

    return returns

# Proximal Policy Optimization Algorithm

In [9]:
def ppo_iter(mini_batch_size, states, actions, log_probs, returns, advantage):
    """Yield random mini-batches for one PPO epoch.

    ``actions``, ``log_probs``, ``returns`` and ``advantage`` are first
    viewed as column vectors.  ``states.size(0) // mini_batch_size`` batches
    are produced; the rows of each batch are drawn uniformly *with
    replacement* via ``np.random.randint``.

    Yields:
        ``(states, actions, log_probs, returns, advantage)`` restricted to
        the sampled rows.
    """
    total_rows = states.size(0)
    # Reshape every per-step quantity to shape (N, 1).
    columns = [
        tensor.view(len(tensor), 1)
        for tensor in (actions, log_probs, returns, advantage)
    ]

    batches_left = total_rows // mini_batch_size
    while batches_left > 0:
        picks = np.random.randint(0, total_rows, mini_batch_size)
        yield (states[picks, :],) + tuple(col[picks, :] for col in columns)
        batches_left -= 1
        
def ppo_update(ppo_epochs, mini_batch_size, states, actions, log_probs, returns, advantages, clip_param=0.2):
    """Run the clipped-surrogate PPO update on the global ``model``.

    For ``ppo_epochs`` passes, mini-batches from ``ppo_iter`` are used to
    minimise ``0.5 * critic_loss + actor_loss`` with the global
    ``optimizer``; gradients are clipped to a norm of 5.  The mean actor and
    critic losses over all mini-batches are appended to the global lists
    ``mean_actor_loss_list`` and ``mean_critic_loss_list``.

    Args:
        ppo_epochs: number of passes over the rollout data.
        mini_batch_size: rows per mini-batch.
        states: 2-D tensor of policy inputs, one row per step.
        actions: per-step indices of the actions taken during the rollout.
        log_probs: per-step log-probabilities of those actions under the
            policy that collected the data.
        returns: per-step return targets for the critic.
        advantages: per-step advantage estimates.
        clip_param: PPO clipping range epsilon.
    """
    print ('update ppo')
    actor_loss_list = []
    critic_loss_list = []
    clip = 5  # maximum gradient norm
    for _ in range(ppo_epochs):
        for state, action, old_log_probs, return_, advantage in ppo_iter(mini_batch_size, states, actions, log_probs, returns, advantages):
            act, value = model(state)

            # Log-probability of the action that was actually taken, read
            # straight from the network output so the actor receives gradients.
            new_log_probs = act.gather(1, action.long())
            ratio = (new_log_probs - old_log_probs).exp()

            surr1 = ratio * advantage
            surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantage
            actor_loss = -torch.min(surr1, surr2).mean()
            # Regress against the mini-batch targets, not the full `returns` tensor.
            critic_loss = (return_ - value).pow(2).mean()

            actor_loss_list.append(actor_loss.item())
            critic_loss_list.append(critic_loss.item())

            loss = 0.5 * critic_loss + actor_loss

            optimizer.zero_grad()
            loss.backward()

            # clip gradient to prevent gradient exploding
            nn.utils.clip_grad_norm_(model.parameters(), clip)

            optimizer.step()

    mean_actor_loss_list.append(np.mean(actor_loss_list))
    mean_critic_loss_list.append(np.mean(critic_loss_list))

# Create Environment

In [10]:
class BitFlip():
    """Bit-flipping environment: flip one bit per step until the state equals the goal.

    Args:
        n: number of bits in the state and in the goal.
        reward_type: ``'sparse'`` gives 0 when the state equals the goal and
            -1 otherwise; any other value gives minus the number of
            mismatching bits.
        max_steps: an episode ends after this many steps even if the goal
            was not reached.
    """
    def __init__(self, n, reward_type, max_steps):
        self.number_of_bits = n # number of bits
        self.reward_type = reward_type
        self.max_steps = max_steps

    def reset(self):
        """Start a new episode and return copies of ``(state, goal)``, both random bit vectors."""
        self.n_steps = 0
        self.goal = np.random.randint(2, size=(self.number_of_bits)) # a random sequence of 0's and 1's
        self.state = np.random.randint(2, size=(self.number_of_bits))
        return np.array(self.state), np.array(self.goal)

    def step(self, action):
        """Flip bit ``action`` and return ``(state_copy, reward, done, info)``.

        Raises:
            ValueError: if ``action`` is not in ``[0, number_of_bits)``.
                The state and step counter are left untouched.
        """
        # Reject negative indices too: they would otherwise wrap around and
        # silently flip a bit counted from the end.
        if not 0 <= action < self.number_of_bits:
            raise ValueError('Action out of range')
        self.n_steps += 1
        self.state[action] = 1-self.state[action] # flip this bit
        reached = np.array_equal(self.state, self.goal)
        done = reached or (self.max_steps <= self.n_steps)
        if self.reward_type == 'sparse':
            reward = 0 if reached else -1
        else:
            reward = -np.sum(np.square(self.state-self.goal))
        return np.array(self.state), reward, done, {}

    def render(self):
        """Print the current state and goal on one carriage-return-prefixed line."""
        print("\rstate :", np.array_str(self.state), ", goal :", np.array_str(self.goal), end=' '*10)

    def get_state_dim(self):
        """Return the length of the state vector."""
        state_dim = self.number_of_bits
        return state_dim

    def get_action_dim(self):
        """Return the number of discrete actions (one per bit)."""
        action_dim = self.number_of_bits # since it is determining which bit to change
        return action_dim
    
# Problem size: number of bits in the state and goal vectors.
BITS_NUMBER = 8
# Per-episode step budget, set equal to the number of bits.
MAX_STEPS = BITS_NUMBER
# Module-level environment instance, created with the 'sparse' reward type.
env = BitFlip(BITS_NUMBER, 'sparse', MAX_STEPS)

# Initial Goal Distribution

In [11]:
class RandomAgent(object):
    """Agent that picks a uniformly random discrete action.

    Args:
        action_space: number of available actions; actions are the integers
            ``0 .. action_space - 1``.
    """
    def __init__(self, action_space):
        self.action_space = action_space

    def act(self, observation, reward, done):
        """Return a random action index; all three arguments are ignored."""
        # Use the size this agent was built with instead of a module-level constant.
        return random.randint(0, self.action_space - 1)
    
agent = RandomAgent(env.get_action_dim())

episode_count = 100
reward = 0
done = False
initial_subgoals = []

# Roll out random episodes and keep the state each one ends in; these final
# states form the pool from which the first sub-goal is drawn.
for i in range(episode_count):
    state, _ = env.reset()  # reset() returns (state, goal); only the state is tracked here
    while True:
        action = agent.act(state, reward, done)
        state, reward, done, _ = env.step(action)
        if done:
            break
    initial_subgoals.append(state)

# Re-seed right before the draw so the selected index does not depend on how
# many random numbers the rollouts above consumed.
random.seed(seed_number)
initial_subgoal = initial_subgoals[random.randint(0, len(initial_subgoals)-1)]
print ('Initial subgoal sampled is: ', initial_subgoal)

Initial subgoal sampled is:  [0 1 1 0 1 0 1 0]


# Training Agent

In [12]:
def to_one_hot(index, dim):
    """One-hot encode a single index or a sequence of indices.

    Args:
        index: a Python or NumPy integer, or a sequence / 1-D array of
            integers.
        dim: length of each one-hot vector.

    Returns:
        A float array of shape ``(dim,)`` for a scalar index, or
        ``(len(index), dim)`` for a sequence.
    """
    # `np.int` was removed in NumPy 1.24; `np.integer` covers every NumPy
    # integer width and `int` covers plain Python integers.
    if isinstance(index, (int, np.integer)):
        one_hot = np.zeros(dim)
        one_hot[index] = 1.
    else:
        one_hot = np.zeros((len(index), dim))
        one_hot[np.arange(len(index)), index] = 1.
    return one_hot

In [None]:
# State and action sizes are read from the environment.
num_inputs  = env.get_state_dim()
num_outputs = env.get_action_dim()

#Hyper params:
hidden_size      = 256   # width of the hidden layer in both network heads
lr               = 3e-4  # Adam learning rate
num_restarts     = 16    # environment episodes collected per training iteration
mini_batch_size  = 16    # rows per PPO mini-batch
ppo_epochs       = 4     # PPO passes over each collected batch

# The network input is the state concatenated with a goal, hence 2*num_inputs.
model = ActorCritic(2*num_inputs, num_outputs, hidden_size).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)

ep_max     = 20  # maximum number of training iterations
ep_count   = 0   # training iterations completed so far
eval_num   = 1   # evaluate every `eval_num` iterations
num_steps  = 0   # total environment steps taken during training

# Histories filled in by the training loop and by ppo_update.
test_rewards = []
mean_actor_loss_list = []
mean_critic_loss_list = []
best_return = -99999  # sentinel lower than any reachable evaluation reward

# Model-based parameter
# NOTE(review): not read anywhere in the visible code.
use_modelbased = False

early_stop = False      # set by the training loop once evaluation is good enough
zero_reward_count = 0   # number of training steps whose reward was exactly 0
done = False

# Main training loop. Each iteration:
#   1. collects `num_restarts` greedy episodes conditioned on the current sub-goal,
#   2. picks the sub-goal for the next iteration from the states reached,
#   3. evaluates the policy,
#   4. computes return targets with hindsight_gae and runs a PPO update.
while ep_count < ep_max and not early_stop:
    # The first iteration uses the sampled initial sub-goal; later iterations
    # use `temp`, which the sub-goal selection loop further down assigns.
    if ep_count == 0: 
        goal = initial_subgoal
    else: 
        goal = temp
                
#     print ('goal: ', goal)
#     print ('desired_goal: ', ep_desired_goal)
    
    # Per-iteration buffers; each gets one entry per collected episode.
    ep_log_probs = []
    ep_log_probs_desired = []
    ep_values    = []
    ep_states    = []
    ep_actions   = []
    ep_rewards   = []
    ep_masks     = []
    ep_state_goals = []
    ep_next_state_goals = []
    ep_state_desired_goals = []
    subgoals = []
    desired_goals = []
    for _ in range(num_restarts):
        done = False
        # reset() returns (state, goal); the environment's goal is kept as the desired goal.
        state = env.reset()
        desired_goal = state[1]
        state = state[0]
        # print ('initial_state: ', state)
        # print ('desired_goal: ', desired_goal)
        
        # Per-episode buffers.
        log_probs = []
        log_probs_desired = []
        values    = []
        states    = []
        actions   = []
        rewards   = []
        masks     = []
        state_goals = []
        next_state_goals = []
        state_desired_goals = []
        
        
        desired_goals.append(desired_goal)
        while not done:
            # for subgoal
            state_goal = np.concatenate((state,goal),0)
            state_desired_goal = np.concatenate((state, desired_goal), 0)
            
            state_goal = torch.FloatTensor(state_goal).to(device)
            state_desired_goal = torch.FloatTensor(state_desired_goal).to(device)
            
            # for subgoal
            # The behaviour policy is greedy: the arg-max action under the sub-goal input is executed.
            act, value = model(state_goal)
            action = np.argmax(act.data.cpu().numpy())
            one_hot_action = to_one_hot(action, BITS_NUMBER)
            log_prob = np.sum(act.data.cpu().numpy() * one_hot_action)
            next_state, reward, done, _ = env.step(action)
            
            # for desired goal
            act_desired, value_desired = model(state_desired_goal)
            action_desired = np.argmax(act_desired.data.cpu().numpy())
            one_hot_action_desired = to_one_hot(action_desired, BITS_NUMBER)
            # The mask used here is `one_hot_action` (the executed action), so this is the
            # log-probability of the executed action under the desired-goal input.
            # NOTE(review): `action_desired`, `one_hot_action_desired` and `value_desired`
            # are computed but not read afterwards - confirm that is intended.
            log_prob_desired = np.sum(act_desired.data.cpu().numpy() * one_hot_action)
            if reward == 0:
                zero_reward_count += 1 
            
            log_probs.append(log_prob)
            log_probs_desired.append(log_prob_desired)
            values.append(value)
            rewards.append(reward)
            masks.append(1-done)  # 0 on the step that ended the episode, 1 otherwise

            state_goal = list(state_goal.data.cpu().numpy())
            
            states.append(state)
            state_goals.append(state_goal)
            actions.append(action)
            # `temp` holds the state *before* this step; after the episode ends it is the
            # state from which the final action was taken.
            temp = state
            state = next_state
            
            # for subgoal
            next_state_goal = np.concatenate((next_state,goal),0)
            next_state_goals.append(next_state_goal)
            
            num_steps += 1
        subgoals.append(temp)
        ep_states.append(states)
        ep_actions.append(torch.FloatTensor(actions).to(device))
        ep_state_goals.append(torch.FloatTensor(state_goals).to(device))
        ep_values.append(torch.FloatTensor(values).to(device))
        # Shallow copy of the list, refreshed after every episode; this copy is what
        # gets passed to hindsight_gae below.
        ep_values_copy = ep_values.copy()
        ep_rewards.append(rewards)
        ep_masks.append(masks)
        ep_next_state_goals.append(next_state_goals)
        ep_log_probs.append(torch.FloatTensor(log_probs).to(device))
        ep_log_probs_desired.append(log_probs_desired)
        
    # Choose the sub-goal for the next iteration (stored in `temp`): the first recorded
    # sub-goal that equals its episode's desired goal wins; otherwise `temp` ends up as
    # the random pick made on the last pass through the loop.
    # NOTE(review): a fresh random index is drawn on every non-matching pass, not once.
    for i in range(len(subgoals)):
        if np.array_equal(subgoals[i], desired_goals[i]):
            temp = subgoals[i]
            break
        else:
            index = random.randint(0,len(subgoals)-1)
            temp = subgoals[index]

    ep_count += 1
    if ep_count % eval_num == 0:
        # Mean total reward over 10 greedy evaluation episodes with the current sub-goal.
        test_reward = np.mean([test_env(model, goal, False) for _ in range(10)])
        if test_reward >= best_return:
            best_return = test_reward
        if test_reward >= -3: 
            early_stop = True
        # NOTE(review): -8.0 is hard-coded; presumably it is -MAX_STEPS, i.e. every
        # evaluation episode failed - confirm, and keep in sync with MAX_STEPS.
        if test_reward != -8.0:
            test_rewards.append(test_reward)

        print ('\nepisode {0} mean_rewards: {1}'.format(ep_count,test_reward))
            
    # Fraction of all training steps so far whose reward was 0.
    zero_freq = float(zero_reward_count) / num_steps
    other_freq = 1 - zero_freq  # not read afterwards in this cell
    
    # Bootstrap values: evaluate the critic on the last (next_state, goal) of every episode.
    next_state_goals = []
    for step in range(len(ep_next_state_goals)):
        next_state_goals.append(ep_next_state_goals[step][-1])
    next_state_goals = torch.FloatTensor(next_state_goals).to(device)
    _, next_value = model(next_state_goals)

    current_logprobs     = ep_log_probs 
    desired_logprobs     = ep_log_probs_desired
    
#     returns      = is_hindsight_gae(ep_rewards, current_logprobs, desired_logprobs, ep_masks, ep_values)
    returns = torch.FloatTensor(hindsight_gae(next_value, ep_rewards, ep_masks, ep_values_copy)).to(device)
#     returns      = torch.cat(returns).detach()
    # Flatten the per-episode tensors into one tensor per quantity.
    ep_actions   = torch.cat(ep_actions).detach()
    ep_values    = torch.cat(ep_values).detach()
    ep_log_probs  = torch.cat(ep_log_probs).detach()
    ep_state_goals  = torch.cat(ep_state_goals).detach()
    # Element-wise difference: `returns` must be in the same step order as `ep_values`.
    advantage    = returns - ep_values

    ppo_update(ppo_epochs, mini_batch_size, ep_state_goals, ep_actions, ep_log_probs, returns, advantage)

    # ppo_update appends one entry per call, so index ep_count-1 is this iteration's loss.
    print ('\nactor_loss: {0:.3f}, critic_loss: {1:.3f}, best_return: {2:.3f}'
           .format(mean_actor_loss_list[ep_count-1], mean_critic_loss_list[ep_count-1], best_return))
    print ('====================================================================')
# `zero_freq` is only assigned inside the loop, so these prints need at least one iteration.
print ('zero_reward_count: ', zero_reward_count)
print ('zero_freq: ', zero_freq)

# Save file

In [None]:
import os
import pickle

# Output file -> object to pickle.
results_to_save = [
    ('./Test Reward Plot/test_rewards_hppo_bitflipping8', test_rewards),
    ('./Loss Plot/mean_actor_loss_hppo_bitflipping8', mean_actor_loss_list),
    ('./Loss Plot/mean_critic_loss_hppo_bitflipping8', mean_critic_loss_list),
]

for out_path, payload in results_to_save:
    # Create the target folder on first use; open() alone raises
    # FileNotFoundError when the directory does not exist yet.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with open(out_path, 'wb') as fp:
        pickle.dump(payload, fp)