In [6]:
import gym
import time
import torch
import random
import numpy as np
import torch.nn as nn
import matplotlib.pyplot as plt

In [7]:
# Alternative task kept for reference: gym.make('HalfCheetah-v1')
env = gym.make('InvertedPendulum-v1')

# Report the size of the first axis of the observation and action spaces.
print(
    'Observation Dimension:', env.observation_space.shape[0],
    '| Action Dimension:', env.action_space.shape[0],
)

[2020-03-30 19:03:34,047] Making new env: InvertedPendulum-v1


Observation Dimension: 4 | Action Dimension: 1


In [8]:
class Config():
    """Hyper-parameter container that also records and plots episode rewards.

    Args:
        epsilon_start: Initial value of the ``eps`` attribute copied by the networks.
        epsilon_end: Lower bound that ``eps`` is clamped to when it decays.
        decay: Multiplicative factor applied to ``eps`` on every ``act`` call.
        batch_size: Stored on the networks; not otherwise used in this notebook.
        batch_length: Number of episodes collected before each actor update.
        observation_dim: Input width of the networks.
        action_dim: Output width of the networks.
        N_episodes: Number of outer training iterations (actor updates).
        discount: Discount factor used when turning rewards into returns.
        learning_rate: Step size handed to the optimizer.

    Attributes:
        device: CUDA device when available, otherwise CPU.
        rewards: Episode returns appended via ``append_reward``.
    """

    def __init__(self, epsilon_start, epsilon_end, decay, batch_size, batch_length, observation_dim, action_dim, N_episodes, discount, learning_rate):
        self.epsilon_start = epsilon_start
        self.epsilon_end = epsilon_end
        self.decay = decay
        self.batch_size = batch_size
        self.batch_length = batch_length
        self.observation_dim = observation_dim
        self.action_dim = action_dim
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.N_episodes = N_episodes
        self.discount = discount
        self.learning_rate = learning_rate

        self.rewards = []

    def append_reward(self, episodic_reward):
        """Record the total reward of one finished episode."""
        self.rewards.append(episodic_reward)

    def plot(self):
        """Redraw the reward curve, replacing the cell's previous output.

        The title shows the mean of the last ten recorded rewards (``nan`` when
        nothing has been recorded yet).
        """
        # Imported locally so the method does not depend on a later notebook
        # cell having been executed before the first call.
        from IPython.display import clear_output

        clear_output(True)

        recent = self.rewards[-10:]
        # np.mean([]) also yields nan but emits a RuntimeWarning; skip the call.
        recent_mean = np.mean(recent) if recent else float('nan')

        plt.figure(figsize=(20,5))
        plt.subplot(131)
        plt.title('Reward: %s' % (recent_mean))
        plt.plot(self.rewards)
        plt.show()

In [9]:
# One hidden layer function approximator
class Actor(nn.Module):
    """Gaussian policy for continuous actions.

    A one-hidden-layer network produces the action mean; a learnable,
    state-independent ``log_std`` parameter provides the standard deviation.

    Args:
        config: Object exposing ``epsilon_start``, ``epsilon_end``, ``decay``,
            ``device``, ``batch_size``, ``observation_dim`` and ``action_dim``.
        std: Initial value of every entry of ``log_std`` (0.0 means a standard
            deviation of 1).
    """

    def __init__(self, config, std=0.0):
        super(Actor, self).__init__()
        # Epsilon bookkeeping is decayed in ``act`` but not used to pick actions.
        self.eps = config.epsilon_start
        self.eps_end = config.epsilon_end
        self.eps_decay = config.decay
        self.device = config.device

        self.criterion = torch.nn.MSELoss()
        self.batch_size = config.batch_size

        self.block = nn.Sequential(
            nn.Linear(config.observation_dim, 16),
            nn.ReLU(),
            nn.Linear(16, config.action_dim),
        )

        # Shape (1, action_dim): one log standard deviation per action dimension.
        self.log_std = nn.Parameter(torch.ones(1, config.action_dim) * std)

    def forward(self, ip):
        """Return the action mean for observation tensor ``ip``."""
        return self.block(ip)

    def act(self, observation):
        """Sample an action for a single observation.

        Args:
            observation: One un-batched observation (array-like of length
                ``observation_dim``).

        Returns:
            tuple: ``(action, log_prob)`` where ``action`` is a NumPy array of
            shape ``(1, action_dim)`` and ``log_prob`` is a tensor of the same
            shape holding the per-dimension log-probabilities (still attached
            to the autograd graph).
        """
        self.eps = max(self.eps*self.eps_decay, self.eps_end)
        observation = torch.as_tensor(observation, dtype=torch.float32, device=self.device)

        # Add the batch axis expected by the distribution parameters.
        action_mean = self(observation).unsqueeze(dim=0)

        # Checked against log_std rather than a hard-coded (1, 1) so that
        # environments with more than one action dimension work too.
        assert action_mean.shape == self.log_std.shape, (
            'expected action mean of shape %s, got %s'
            % (tuple(self.log_std.shape), tuple(action_mean.shape))
        )

        std = self.log_std.exp().expand_as(action_mean)
        dist = torch.distributions.Normal(action_mean, std)
        actions = dist.sample()
        log_probs = dist.log_prob(actions)

        return actions.detach().cpu().numpy(), log_probs

In [10]:
# One hidden layer function approximator
class Critic(nn.Module):
    """Two-hidden-layer MLP (64 and 32 units) with an epsilon-greedy ``act`` helper.

    The output layer has ``config.action_dim`` units.

    NOTE(review): ``act`` reads the module-level ``env`` for random actions and
    takes ``max(dim=1)`` after ``squeeze()``, which needs an output that is
    still at least 2-D at that point - confirm the intended observation shape.
    """

    def __init__(self, config):
        super().__init__()
        self.eps = config.epsilon_start
        self.eps_end = config.epsilon_end
        self.eps_decay = config.decay
        self.device = config.device

        self.criterion = torch.nn.MSELoss()
        self.batch_size = config.batch_size

        # Layers are created in network order: observation -> 64 -> 32 -> output.
        layers = [
            nn.Linear(config.observation_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, config.action_dim),
        ]
        self.block = nn.Sequential(*layers)

    def forward(self, ip):
        """Run ``ip`` through the MLP and return its output."""
        return self.block(ip)

    def act(self, observation):
        """Decay epsilon, then return a random action or the arg-max index.

        With probability ``eps`` the action is drawn from ``env.action_space``;
        otherwise the index of the largest network output along dim 1 is
        returned as a NumPy array.
        """
        decayed = self.eps * self.eps_decay
        self.eps = self.eps_end if self.eps_end > decayed else decayed

        obs_tensor = torch.FloatTensor(observation).to(self.device)

        explore = random.random() < self.eps
        if not explore:
            outputs = self.forward(obs_tensor).squeeze()
            _, best_index = outputs.max(dim=1)   # second item is the index, first the value
            return best_index.cpu().detach().numpy()
        return env.action_space.sample()

In [11]:
# Hyper-parameters for this run; network sizes follow the environment's spaces.
config = Config(
    epsilon_start=0.99,
    epsilon_end=0.1,
    decay=0.9999,
    batch_size=64,
    batch_length=100,
    observation_dim=env.observation_space.shape[0],
    action_dim=env.action_space.shape[0],
    N_episodes=1000,
    discount=0.99,
    learning_rate=3e-2,
)

# Build both networks and move them to the device chosen by the config.
actor = Actor(config).to(config.device)
critic = Critic(config).to(config.device)

In [12]:
from IPython.display import clear_output
%matplotlib inline

In [13]:
class TrajectoryBuffer():
    """Stores, per episode, the step log-probabilities and discounted returns.

    Args:
        config: Object exposing ``discount``, the factor applied per time step.

    Attributes:
        log_probs: One entry per pushed episode, stored exactly as passed in.
        returns: One list per pushed episode holding the reward-to-go of every step.
    """

    def __init__(self, config):
        self.log_probs = []
        self.returns = []
        self.discount = config.discount

    def clear(self):
        """Drop every stored episode."""
        self.log_probs = []
        self.returns = []

    def push(self, log_probs, rewards):
        """Convert one episode's rewards to discounted returns and store them.

        Args:
            log_probs: Per-step log-probabilities of the episode (stored as is).
            rewards: Per-step rewards; any sequence is accepted and it is not
                modified.

        The return stored for step ``t`` is ``sum_k discount**(k - t) * rewards[k]``
        over ``k >= t``.
        """
        returns = []
        running = 0.0
        # Walk backwards so each return reuses the one that follows it: O(n).
        for reward in reversed(list(rewards)):
            running = reward + self.discount * running
            returns.append(running)
        returns.reverse()

        self.log_probs.append(log_probs)
        self.returns.append(returns)
        

In [14]:
def update_actor():
    """Run one policy-gradient step on everything stored in ``trajectory_buffer``.

    The loss is the negative sum over all stored steps of
    ``return * log_prob``, divided by the number of stored trajectories.
    Uses the module-level ``trajectory_buffer`` and ``optimizer``; the buffer
    is cleared afterwards. Does nothing but clear the buffer when it holds no
    steps.
    """
    n_trajectories = len(trajectory_buffer.log_probs)

    trajectory_terms = []
    for log_probs, returns in zip(trajectory_buffer.log_probs, trajectory_buffer.returns):
        steps = list(zip(log_probs, returns))
        if not steps:
            continue
        # Sum each step's log-probabilities so the term stays a scalar even
        # when the action has several dimensions.
        step_log_probs = torch.stack([log_prob.sum() for log_prob, _ in steps])
        step_returns = torch.tensor(
            [float(return_) for _, return_ in steps],
            dtype=step_log_probs.dtype,
            device=step_log_probs.device,
        )
        trajectory_terms.append((step_returns * step_log_probs).sum())

    if not trajectory_terms:
        # Nothing to differentiate; avoid calling backward() on a plain number.
        trajectory_buffer.clear()
        return

    optimizer.zero_grad()
    loss = -torch.stack(trajectory_terms).sum() / n_trajectories
    loss.backward()
    optimizer.step()

    trajectory_buffer.clear()

In [15]:
# Best single-episode return so far; the training loop checkpoints when it is beaten.
last_highest_rew = 0

# Adam over the actor's parameters only, with the configured step size.
optimizer = torch.optim.Adam(params=actor.parameters(), lr=config.learning_rate)

# Holds the episodes collected between two actor updates.
trajectory_buffer = TrajectoryBuffer(config=config)


In [None]:
# Episode return above which the success message is printed and the current
# batch of episodes is cut short.
SOLVED_RETURN = 200
# Episodes are rendered during every RENDER_EVERY-th outer iteration.
RENDER_EVERY = 5

for i in range(config.N_episodes):

    # Collect up to `batch_length` episodes, then do a single actor update.
    for _episode in range(config.batch_length):

        episodic_rewards, log_probs, done = [], [], False
        state = env.reset()

        while not done:

            action, log_prob = actor.act(state)

            next_state, reward, done, _info = env.step(action)

            episodic_rewards.append(reward)
            log_probs.append(log_prob)
            if i % RENDER_EVERY == 0:
                env.render()

            state = next_state

        trajectory_buffer.push(log_probs, episodic_rewards)
        episode_return = np.sum(episodic_rewards)

        # Record the return before the early exit below so that the best
        # episodes are not missing from the reward history.
        config.append_reward(episode_return)

        # Checkpoint whenever a new best episode return is reached.
        if episode_return > last_highest_rew:
            torch.save(actor.state_dict(), 'InvertedPendulumActor.pt')
            last_highest_rew = episode_return
            print('Saved Model')

        if episode_return > SOLVED_RETURN:
            print('Wohoo!! Agent Trained')
            # NOTE(review): this only leaves the inner loop; the outer loop
            # keeps training. Confirm whether training should stop here.
            break

        config.plot()

    update_actor()
    print('##############UPDATED_ACTOR############')
    time.sleep(2)


In [17]:
# Rebuild the policy and restore the best checkpoint written by the training loop.
actor_test = Actor(config).to(config.device)
# map_location lets a checkpoint saved on GPU be loaded on a CPU-only machine.
actor_test.load_state_dict(torch.load('InvertedPendulumActor.pt', map_location=config.device))
# Alternative: copy the in-memory weights instead of reading the checkpoint.
# actor_test.load_state_dict(actor.state_dict())
actor_test.eval()

Actor(
  (criterion): MSELoss()
  (block): Sequential(
    (0): Linear(in_features=4, out_features=16, bias=True)
    (1): ReLU()
    (2): Linear(in_features=16, out_features=1, bias=True)
  )
)

<h2> Testing policy</h2>

In [26]:
import time


# Roll out one rendered episode with the restored policy and report its return.
state = env.reset()
total_reward = 0
done = False

try:
    # No gradients are needed for evaluation, so skip building the autograd graph.
    with torch.no_grad():
        while not done:

            action, log_prob = actor_test.act(state)

            next_state, reward, done, info = env.step(action)
            total_reward += reward

            env.render()
            time.sleep(0.001)

            state = next_state

    print('Total collected reward:', total_reward)
finally:
    # Close the environment even when the rollout is interrupted or fails.
    env.close()

Total collected reward: 1000.0


<h2> Test Ground</h2>

In [19]:
# Best single-episode return seen by the training loop (the value that gates checkpointing).
print(last_highest_rew)

1000.0
