In [20]:
from comet_ml import Experiment
import torch
import torch.autograd as autograd
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Normal
import mujoco_py
import os
import gym
import ipdb
import numpy as np
import random

# Bounds applied to the policy's predicted log standard deviation
# (Policy.forward clamps log_std into [LOG_SIG_MIN, LOG_SIG_MAX] before exponentiating).
LOG_SIG_MAX = 2
LOG_SIG_MIN = -20
# NOTE(review): not referenced anywhere in the visible part of this notebook;
# presumably a numerical-stability constant -- confirm it is still needed.
epsilon = 1e-6

In [47]:
class Policy(nn.Module):
    """Single-hidden-layer Gaussian policy network.

    A shared ReLU hidden layer feeds two linear heads: one produces the mean
    of the action distribution, the other its log standard deviation.
    """

    def __init__(self, num_inputs, hidden_size, action_space):
        """Build the network.

        num_inputs:   size of the state vector fed to the first layer.
        hidden_size:  width of the hidden layer.
        action_space: object whose ``shape[0]`` gives the number of action
                      components (stored on the module as ``action_space``).
        """
        super(Policy, self).__init__()

        self.action_space = action_space
        action_count = action_space.shape[0]

        # Layers are created in the order linear -> mean -> log_std.
        self.linear = nn.Linear(num_inputs, hidden_size)
        self.mean = nn.Linear(hidden_size, action_count)
        self.log_std = nn.Linear(hidden_size, action_count)

    def forward(self, inputs):
        """Return ``(mean, std)`` of the action distribution for ``inputs``."""
        hidden = F.relu(self.linear(inputs))

        mean = self.mean(hidden)

        # Keep the log std inside [LOG_SIG_MIN, LOG_SIG_MAX] before
        # exponentiating so the resulting std stays in a bounded range.
        bounded_log_std = self.log_std(hidden).clamp(min=LOG_SIG_MIN, max=LOG_SIG_MAX)

        return mean, bounded_log_std.exp()
        
class REINFORCE:
    """Monte-Carlo policy-gradient (REINFORCE) agent around a Gaussian ``Policy``.

    The agent owns the policy network and an Adam optimizer over its
    parameters. ``select_action`` samples an action for one state and
    ``train`` performs one gradient step from a finished episode.
    """

    def __init__(self, num_inputs, hidden_size, action_space, lr = 1e-2, gamma = 0.99):
        """Create the policy network and its optimizer.

        num_inputs:   size of the state vector.
        hidden_size:  width of the policy's hidden layer.
        action_space: action-space object forwarded to ``Policy`` (which reads
                      ``action_space.shape[0]``).
        lr:           Adam learning rate.
        gamma:        discount factor used when computing returns in ``train``.
        """
        self.action_space = action_space
        self.policy = Policy(num_inputs, hidden_size, action_space)
        self.optimizer = optim.Adam(self.policy.parameters(), lr = lr)
        self.gamma = gamma

    def select_action(self, state):
        """Sample an action for a single state.

        state: NumPy array for one observation (assumed 1-D -- a batch
               dimension of size 1 is added here).

        Returns ``(action, ln_prob, mean, std)`` where
          * ``action`` is a NumPy array: the Gaussian sample squashed by
            ``tanh`` into [-1, 1], with the batch dimension removed;
          * ``ln_prob`` is a 0-dim tensor: the Normal log-density of the
            *pre-tanh* sample summed over action components (no tanh
            change-of-variables term is applied);
          * ``mean`` / ``std`` are the policy outputs, still carrying the
            batch dimension.
        """
        # Tensor of shape (1, state_dim) so the policy sees a batch of one.
        state = torch.from_numpy(state).float().unsqueeze(0)

        mean, std = self.policy(state)
        normal = Normal(mean, std)

        # `sample()` is not reparameterised: the draw itself carries no
        # gradient, only its log-probability does (through mean and std).
        action = normal.sample()
        ln_prob = normal.log_prob(action).sum()

        # Squash into [-1, 1] and hand the environment a NumPy array.
        action = torch.tanh(action).numpy()

        return action[0], ln_prob, mean, std

    def train(self, log_probs, rewards):
        """Run one REINFORCE update from a single finished episode.

        log_probs: sequence of log-probability tensors, one per step, as
                   returned by ``select_action``.
        rewards:   sequence of scalar rewards, one per step, same length.

        The loss is ``sum_t -log_prob_t * G_t`` where
        ``G_t = r_t + gamma * G_{t+1}`` is the discounted return from step t.

        Returns the loss as a tensor detached from the autograd graph.

        Raises ValueError if the two sequences differ in length or are empty.
        """
        if len(log_probs) != len(rewards):
            # zip() would silently drop the surplus steps and train on a
            # misaligned trajectory.
            raise ValueError(
                "log_probs and rewards must have the same length "
                "(got {0} and {1})".format(len(log_probs), len(rewards))
            )
        if len(rewards) == 0:
            raise ValueError("cannot train on an empty trajectory")

        # Discounted returns, accumulated backwards in O(n) and then reversed
        # (repeated insert(0, ...) is quadratic in the episode length).
        running_return = 0
        returns = []
        for r in rewards[::-1]:
            running_return = r + self.gamma * running_return
            returns.append(running_return)
        returns.reverse()

        # Pin the dtype so NumPy float64 rewards do not promote the loss away
        # from the float32 log-probabilities produced by select_action.
        returns = torch.tensor(returns, dtype=torch.float32)

        policy_loss = torch.stack(
            [-log_prob * R for log_prob, R in zip(log_probs, returns)]
        ).sum()

        self.optimizer.zero_grad()
        policy_loss.backward()
        self.optimizer.step()

        # Detach so callers that keep the value for logging do not also keep
        # the episode's autograd graph alive.
        return policy_loss.detach()
    
#     def train(self, trajectory):                                                                                                                                                                            
                                                                                                                                                                                                            
#         '''                                                                                                                                                                                                 
#         trajectory: a list of the form                                                                                                                                                                     
#         [(lnP(a_t|s_t), r(s_t,a_t) ),(lnP(a_{t+1}|s_{t+1}), r(s_{t+1},a_{t+1}))]                                                                                                                            
                                                                                                                                                                                                            
#         Train the model by summing lnP(a_t|s_t)*r(s_t,a_t)                                                                                                                                                  
#         '''                                                                                                                                                                                                 
                                                                                                                                                                                                            
#         loss = 0                                                                                                                                                                                        
#         for step in trajectory:                                                                                                                                                                             
#             # look at one step                                                                                                                                                                              
#             ln_prob, reward = step 
#             loss = loss - ln_prob * reward 
                                                                                                                                          
#         self.optimizer.zero_grad()                                                                                                                                                                          
#         loss.backward()                                                                                                                                                                                     
#         self.optimizer.step()
        
#         return loss


# Create env

In [48]:
# Other environments that were tried (uncomment one to switch):
#env = gym.make("MountainCarContinuous-v0") #
#env = gym.make("Pendulum-v0")
#env = gym.make("InvertedPendulum-v1")
env = gym.make("LunarLanderContinuous-v2")

# NOTE: despite its name, `action_dim` holds the action *space* object itself,
# not an integer count; it is passed as `action_space` to REINFORCE later on.
action_dim = env.action_space
state_dim = env.observation_space.shape[0]
min_action, max_action = action_dim.low, action_dim.high

print(
    "number of actions:{0}, dim of states: {1}, max_action:{2}, min_action: {3}".format(
        action_dim, state_dim, max_action, min_action
    )
)

[2019-04-07 22:14:07,686] Making new env: LunarLanderContinuous-v2


number of actions:Box(2,), dim of states: 8, max_action:[1. 1.], min_action: [-1. -1.]


In [49]:
# # Create a comet experiment (read the API key from the environment; never commit it)
# experiment = Experiment(api_key=os.environ["COMET_API_KEY"],
#                         project_name="florl", workspace="nadeem-ward")

# experiment.set_name("FLORL")

# Methods to evaluate policy

In [50]:
def evaluate_policy(policy, eval_episodes = 10):
    '''
        Run the agent in the global `env` and report its average episode reward.

        policy:        agent exposing `select_action(state)` that returns a
                       4-tuple whose first element is the action to execute.
        eval_episodes: number of complete episodes to average over (must be > 0).

        Prints the average total reward per episode and returns it.

        Raises ValueError if `eval_episodes` is not positive.
    '''
    if eval_episodes <= 0:
        raise ValueError("eval_episodes must be a positive integer")

    total_reward = 0.0

    # Evaluation never back-propagates, so do not build autograd graphs.
    with torch.no_grad():
        for _ in range(eval_episodes):
            obs = env.reset()
            done = False

            while not done:
                action, log_prob, mean, std = policy.select_action(np.array(obs))
                obs, reward, done, _ = env.step(action)

                total_reward += reward

    avg_reward = total_reward / eval_episodes

    print("the average reward is: {0}".format(avg_reward))
    return avg_reward


In [51]:
def render_policy(policy):
    '''
        Render one episode of the agent acting in the global `env`.

        policy: agent exposing `select_action(state)` that returns a 4-tuple
                whose first element is the action to execute.

        The environment is closed when the episode ends, and also when the
        rollout is interrupted or raises, so the render window is not leaked.
    '''
    try:
        obs = env.reset()
        done = False

        # Rendering never back-propagates, so do not build autograd graphs.
        with torch.no_grad():
            while not done:
                env.render()
                action, _, _, _ = policy.select_action(np.array(obs))

                obs, reward, done, _ = env.step(action)
    finally:
        env.close()

# Putting everything together: run episodes and update the policy parameters

In [58]:
hidden_size = 256
# `action_dim` is the env's action-space object, which REINFORCE expects as
# its `action_space` argument.
policy = REINFORCE(state_dim, hidden_size, action_dim)

max_episodes = 1000
total_episodes = 0
for total_episodes in range(1, max_episodes + 1):

    # Roll out one complete episode, recording the log-probability of each
    # sampled action and the reward it earned.
    obs, done = env.reset(), False
    log_probs, rewards = [], []

    while not done:
        action, ln_prob, mean, std = policy.select_action(np.array(obs))
        next_state, reward, done, _ = env.step(action)

        log_probs.append(ln_prob)
        rewards.append(reward)

        obs = next_state

    print("At episode:{0}".format(total_episodes))

    # One policy-gradient step per finished episode.
    value_loss = policy.train(log_probs, rewards)
    #experiment.log_metric("Loss Value", value_loss, step = total_episodes)

At episode:1
At episode:2
At episode:3
At episode:4
At episode:5
At episode:6
At episode:7
At episode:8
At episode:9
At episode:10
At episode:11
At episode:12
At episode:13
At episode:14
At episode:15
At episode:16
At episode:17
At episode:18
At episode:19
At episode:20
At episode:21
At episode:22
At episode:23
At episode:24
At episode:25
At episode:26
At episode:27
At episode:28
At episode:29
At episode:30
At episode:31
At episode:32
At episode:33
At episode:34
At episode:35
At episode:36
At episode:37
At episode:38
At episode:39
At episode:40
At episode:41
At episode:42
At episode:43
At episode:44
At episode:45
At episode:46
At episode:47
At episode:48
At episode:49
At episode:50
At episode:51
At episode:52
At episode:53
At episode:54
At episode:55
At episode:56
At episode:57
At episode:58
At episode:59
At episode:60
At episode:61
At episode:62
At episode:63
At episode:64
At episode:65
At episode:66
At episode:67
At episode:68
At episode:69
At episode:70
At episode:71
At episode:72
A

At episode:556
At episode:557
At episode:558
At episode:559
At episode:560
At episode:561
At episode:562
At episode:563
At episode:564
At episode:565
At episode:566
At episode:567
At episode:568
At episode:569
At episode:570
At episode:571
At episode:572
At episode:573
At episode:574
At episode:575
At episode:576
At episode:577
At episode:578
At episode:579
At episode:580
At episode:581
At episode:582
At episode:583
At episode:584
At episode:585
At episode:586
At episode:587
At episode:588
At episode:589
At episode:590
At episode:591
At episode:592
At episode:593
At episode:594
At episode:595
At episode:596
At episode:597
At episode:598
At episode:599
At episode:600
At episode:601
At episode:602
At episode:603
At episode:604
At episode:605
At episode:606
At episode:607
At episode:608
At episode:609
At episode:610
At episode:611
At episode:612
At episode:613
At episode:614
At episode:615
At episode:616
At episode:617
At episode:618
At episode:619
At episode:620
At episode:621
At episode

In [65]:
# Print the trained agent's average episode reward (default: 10 episodes).
evaluate_policy(policy)

the average reward is: -409.4966535017377


In [66]:
# Watch the trained agent for one rendered episode.
render_policy(policy)

In [61]:
# fixing problem with type 

# Lunar Lander NEEDS actions between -1,1

# action needs to be numpy array