In [2]:
from comet_ml import Experiment
import torch
import torch.autograd as autograd
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Normal
import mujoco_py
import os
import gym
import ipdb
import numpy as np
import random

# Upper / lower bounds used to clamp the policy's predicted log standard deviation.
LOG_SIG_MAX = 2
LOG_SIG_MIN = -20
# NOTE(review): not referenced by any visible cell; presumably a numerical-stability constant — confirm before removing.
epsilon = 1e-6

In [3]:
class Policy(nn.Module):
    """Single-hidden-layer network producing the parameters of a diagonal Gaussian.

    One shared ReLU layer feeds two linear heads: one for the mean and one
    for the log standard deviation of each action component.
    """

    def __init__(self, num_inputs, hidden_size, action_space):
        """
        num_inputs:   size of the input (state) vector.
        hidden_size:  width of the shared hidden layer.
        action_space: object whose ``shape[0]`` gives the number of action components.
        """
        super(Policy, self).__init__()

        self.action_space = action_space
        action_dim = action_space.shape[0]

        # Layers are created in a fixed order so parameter initialisation is reproducible.
        self.linear = nn.Linear(num_inputs, hidden_size)
        self.mean = nn.Linear(hidden_size, action_dim)
        self.log_std = nn.Linear(hidden_size, action_dim)

    def forward(self, inputs):
        """Return ``(mean, std)`` of the action distribution for ``inputs``."""
        hidden = F.relu(self.linear(inputs))

        # Keep the log-std inside [LOG_SIG_MIN, LOG_SIG_MAX] before exponentiating.
        bounded_log_std = self.log_std(hidden).clamp(min=LOG_SIG_MIN, max=LOG_SIG_MAX)

        return self.mean(hidden), bounded_log_std.exp()

In [4]:
class ValueNetwork(nn.Module):
    """Two-hidden-layer MLP mapping a state to a single scalar value estimate."""

    def __init__(self, num_inputs, hidden_dim):
        """
        num_inputs: size of the input (state) vector.
        hidden_dim: width of both hidden layers.
        """
        super(ValueNetwork, self).__init__()

        self.linear1 = nn.Linear(num_inputs, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, hidden_dim)
        self.linear3 = nn.Linear(hidden_dim, 1)

    def forward(self, state):
        """Return the value estimate; the last dimension of the output has size 1."""
        hidden = self.linear1(state)
        hidden = F.relu(hidden)
        hidden = self.linear2(hidden)
        hidden = F.relu(hidden)
        # No activation on the output layer: the value is unbounded.
        return self.linear3(hidden)

In [5]:
class REINFORCE:
    """Monte-Carlo policy-gradient agent with a Gaussian policy and optional value baseline.

    ``train`` performs the plain REINFORCE update from log-probabilities and
    rewards; ``train2`` performs the update from a full trajectory and, when a
    baseline was requested, also fits the value network and weights the
    log-probabilities by the advantage (return minus value estimate).
    """

    def __init__(self, num_inputs, hidden_size, action_space, lr_pi = 1e-2,\
                 gamma = 0.99, baseline = False, lr_vf = 1e-2):
        """
        num_inputs:   size of the state vector.
        hidden_size:  hidden width used for the policy (and the value network).
        action_space: action-space object forwarded to ``Policy``.
        lr_pi:        Adam learning rate for the policy.
        gamma:        discount factor used for the rewards-to-go.
        baseline:     if truthy, create a value network and its optimizer.
        lr_vf:        Adam learning rate for the value network.
        """
        self.action_space = action_space
        self.policy = Policy(num_inputs, hidden_size, action_space)
        self.policy_optimizer = optim.Adam(self.policy.parameters(), lr = lr_pi)
        self.gamma = gamma
        self.baseline = bool(baseline)

        # The value network only exists when a baseline was requested.
        if self.baseline:
            self.value_function = ValueNetwork(num_inputs, hidden_size)
            self.value_optim = optim.Adam(self.value_function.parameters(), lr = lr_vf)

    def select_action(self, state):
        """Sample an action for a single state.

        state: numpy array holding one observation (a batch axis is added here).

        Returns ``(action, ln_prob, mean, std)`` where ``action`` is a numpy
        array squashed into [-1, 1] by ``tanh``, ``ln_prob`` is the summed
        log-probability of the *unsquashed* Gaussian sample, and ``mean`` /
        ``std`` are the distribution parameters produced by the policy.
        """
        state = torch.from_numpy(state).float().unsqueeze(0)

        mean, std = self.policy(state)
        normal = Normal(mean, std)

        # sample() does not propagate gradients, so the action can be turned
        # into a numpy array below; ln_prob still depends on mean and std.
        action = normal.sample()
        ln_prob = normal.log_prob(action).sum()

        # Squash the sampled action into [-1, 1] and drop the batch axis.
        action = torch.tanh(action).numpy()

        return action[0], ln_prob, mean, std

    def _discounted_returns(self, rewards):
        """Return the discounted rewards-to-go for ``rewards`` as a float32 tensor."""
        returns = []
        running = 0.0
        for reward in reversed(rewards):
            running = reward + self.gamma * running
            returns.append(running)
        returns.reverse()
        # An explicit dtype keeps the returns compatible with the float32
        # networks even when the environment hands back numpy float64 rewards.
        return torch.tensor(returns, dtype=torch.float32)

    def _policy_gradient_step(self, log_probs, weights):
        """Minimise ``sum(-log_prob * weight)`` with one optimizer step; return the loss."""
        policy_loss = torch.stack(
            [-log_prob * weight for log_prob, weight in zip(log_probs, weights)]
        ).sum()

        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()

        return policy_loss

    def train(self, log_probs, rewards):
        """Plain REINFORCE update (no baseline).

        log_probs: per-step log-probability tensors returned by ``select_action``.
        rewards:   per-step rewards of the same episode.

        Returns the policy loss tensor that was back-propagated.
        """
        returns = self._discounted_returns(rewards)
        return self._policy_gradient_step(log_probs, returns)

    def train2(self, trajectory):
        """Update from one episode, using the value network as a baseline when available.

        trajectory: sequence of steps, each
            ``[state, action, ln_prob, reward, next_state, done]``.

        With a baseline, the value network is regressed onto the rewards-to-go
        and the policy is updated with the advantage. Without one, this falls
        back to the plain REINFORCE update. Returns ``None``.
        """
        states = [step[0] for step in trajectory]
        log_probs = [step[2] for step in trajectory]
        rewards = [step[3] for step in trajectory]

        # Discount with the configured gamma rather than a hard-coded constant.
        returns = self._discounted_returns(rewards)

        if not self.baseline:
            self._policy_gradient_step(log_probs, returns)
            return

        # Evaluate all states in one batch; reshape(-1) keeps a 1-D tensor even
        # for single-step episodes so its shape always matches `returns`.
        state_batch = torch.as_tensor(np.asarray(states), dtype=torch.float32)
        value_estimates = self.value_function(state_batch).reshape(-1)

        # The advantage is a fixed weight for the policy update: detach it so
        # the policy loss never back-propagates into the value network.
        advantage = (returns - value_estimates).detach()

        v_loss = F.mse_loss(value_estimates, returns)

        self.value_optim.zero_grad()
        v_loss.backward()
        self.value_optim.step()

        self._policy_gradient_step(log_probs, advantage)


# Create env

In [6]:
# Environment under study; uncomment one of the alternatives to try another task.
env = gym.make("MountainCarContinuous-v0")
# env = gym.make("LunarLanderContinuous-v2")
# env = gym.make("Pendulum-v0")
# env = gym.make("InvertedPendulum-v1")

# Size of the observation vector.
state_dim = env.observation_space.shape[0]
# NOTE: despite its name, action_dim holds the action-space object itself.
action_dim = env.action_space
# Per-component action bounds.
min_action = env.action_space.low
max_action = env.action_space.high

print(
    "number of actions:{0}, dim of states: {1}, max_action:{2}, min_action: {3}".format(
        action_dim, state_dim, max_action, min_action
    )
)

[2019-04-08 14:25:13,707] Making new env: MountainCarContinuous-v0


number of actions:Box(1,), dim of states: 2, max_action:[1.], min_action: [-1.]


  result = entry_point.load(False)


In [7]:
# # Create comet an experiment
# experiment = Experiment(api_key=os.environ["COMET_API_KEY"],  # read the key from the environment; never commit credentials
#                         project_name="florl", workspace="nadeem-ward")

# experiment.set_name("FLORL")

# Methods to evaluate policy

In [8]:
def evaluate_policy(policy, eval_episodes = 10, eval_env = None):
    '''
        Run the policy for ``eval_episodes`` episodes and report the average
        (undiscounted) episode reward.

        policy:        object exposing ``select_action(state)`` that returns a
                       4-tuple whose first element is the action.
        eval_episodes: number of episodes to average over (must be positive).
        eval_env:      environment to evaluate in; defaults to the notebook's
                       global ``env``.

        Prints the average reward and also returns it.
        Raises ValueError if ``eval_episodes`` is not positive.
    '''
    if eval_episodes <= 0:
        raise ValueError("eval_episodes must be positive, got {0}".format(eval_episodes))

    if eval_env is None:
        eval_env = env

    total_reward = 0.0
    # No parameters are updated during evaluation, so skip autograd bookkeeping.
    with torch.no_grad():
        for _ in range(eval_episodes):
            obs = eval_env.reset()
            done = False

            while not done:
                action, _log_prob, _mean, _std = policy.select_action(np.array(obs))
                obs, reward, done, _info = eval_env.step(action)
                total_reward += reward

    avg_reward = total_reward / eval_episodes

    print("the average reward is: {0}".format(avg_reward))
    return avg_reward


In [9]:
def render_policy(policy, render_env = None):
    '''
        Render one episode of the policy acting in the environment.

        policy:     object exposing ``select_action(state)`` that returns a
                    4-tuple whose first element is the action.
        render_env: environment to render; defaults to the notebook's global
                    ``env``.

        The environment is closed when the episode ends, including when
        rendering or stepping raises.
    '''
    if render_env is None:
        render_env = env

    try:
        obs = render_env.reset()
        done = False

        while not done:
            render_env.render()
            action, _, _, _ = policy.select_action(np.array(obs))
            obs, reward, done, _ = render_env.step(action)
    finally:
        # Always release the viewer, even if the episode was interrupted.
        render_env.close()

# Putting everything together: run training episodes and update the parameters

In [11]:
# Agent with a learned value baseline; the same hidden width is used for both networks.
hidden_size = 256
policy = REINFORCE(state_dim, hidden_size, action_dim, baseline=True)

max_episodes = 100
total_episodes = 0
while total_episodes < max_episodes:

    # Fresh buffers for this episode. log_probs / rewards are only needed by
    # the baseline-free alternative: policy.train(log_probs, rewards).
    trajectory, log_probs, rewards = [], [], []

    obs = env.reset()
    done = False
    while not done:
        action, ln_prob, mean, std = policy.select_action(np.array(obs))
        next_state, reward, done, _ = env.step(action)

        trajectory.append([np.array(obs), action, ln_prob, reward, next_state, done])
        log_probs.append(ln_prob)
        rewards.append(reward)

        obs = next_state

    total_episodes += 1
    print("At episode:{0}".format(total_episodes))

    # One update from the episode that just finished.
    policy.train2(trajectory)

At episode:1
At episode:2
At episode:3
At episode:4
At episode:5
At episode:6
At episode:7
At episode:8
At episode:9
At episode:10
At episode:11
At episode:12
At episode:13
At episode:14
At episode:15
At episode:16
At episode:17
At episode:18
At episode:19
At episode:20
At episode:21
At episode:22
At episode:23
At episode:24
At episode:25
At episode:26
At episode:27
At episode:28
At episode:29
At episode:30
At episode:31
At episode:32
At episode:33
At episode:34
At episode:35
At episode:36
At episode:37
At episode:38
At episode:39
At episode:40
At episode:41
At episode:42
At episode:43
At episode:44
At episode:45
At episode:46
At episode:47
At episode:48
At episode:49
At episode:50
At episode:51
At episode:52
At episode:53
At episode:54
At episode:55
At episode:56
At episode:57
At episode:58
At episode:59
At episode:60
At episode:61
At episode:62
At episode:63
At episode:64
At episode:65
At episode:66
At episode:67
At episode:68
At episode:69
At episode:70
At episode:71
At episode:72
A

In [12]:
# Report the trained agent's average reward over the default number of evaluation episodes.
evaluate_policy(policy)

the average reward is: -13.787747109819424


In [13]:
# Watch one rendered episode of the trained agent.
render_policy(policy)

In [None]:
# fixing problem with type 

# Lunar Lander NEEDS actions between -1,1

# action needs to be numpy array

In [None]:
# Collect a single episode with a freshly initialised agent so the update
# can be stepped through by hand.
max_episodes = 1
total_episodes = 0

policy = REINFORCE(state_dim, hidden_size, action_dim, baseline=True)

while total_episodes < max_episodes:

    trajectory = []

    obs = env.reset()
    done = False
    while not done:
        # Sample an action, apply it, and record the full transition.
        action, ln_prob, mean, std = policy.select_action(np.array(obs))
        next_state, reward, done, _ = env.step(action)
        trajectory.append([np.array(obs), action, ln_prob, reward, next_state, done])

        obs = next_state

    total_episodes += 1



In [None]:
# Manual walkthrough of the baseline update for the trajectory collected in
# the previous cell: rewards-to-go -> value regression -> advantage -> policy loss.

# Each trajectory step is [state, action, ln_prob, reward, next_state, done].
states = [step[0] for step in trajectory]
actions = [step[1] for step in trajectory]
log_probs = [step[2] for step in trajectory]
rewards = [step[3] for step in trajectory]

# ---- rewards-to-go, discounted with the agent's own gamma ----
R = 0.0
returns = []
for r in reversed(rewards):
    R = r + policy.gamma * R
    returns.append(R)
returns.reverse()

# Explicit float32 so the targets match the value network's output dtype.
returns = torch.tensor(returns, dtype=torch.float32)

# ---- value-function regression ----
# Evaluate every state in one batch; reshape(-1) yields one estimate per step.
state_batch = torch.as_tensor(np.asarray(states), dtype=torch.float32)
value_estimates = policy.value_function(state_batch).reshape(-1)

v_loss = F.mse_loss(value_estimates, returns)

policy.value_optim.zero_grad()
v_loss.backward()
policy.value_optim.step()

# ---- advantage and policy loss ----
# Detach so the policy loss does not carry gradients into the value network.
advantage = (returns - value_estimates).detach()

policy_loss = torch.stack(
    [-log_prob * adv for log_prob, adv in zip(log_probs, advantage)]
).sum()

In [None]:
# Display the scalar policy loss computed in the previous cell.
policy_loss