In [1]:
import random

import gym
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical, Normal
from torch.autograd import Variable

In [16]:
class VanillaPolicyNetwork(nn.Module):
    """Categorical policy MLP: observation in, action probabilities out.

    The model is two ReLU hidden layers of width ``hidden_size`` followed by
    a linear layer with ``action_space`` outputs and a softmax over the last
    dimension.
    """

    def __init__(self, obs_space, action_space, hidden_size=128):
        super().__init__()

        # Layer order (and therefore the ``network.<idx>`` parameter names)
        # is: Linear, ReLU, Linear, ReLU, Linear, Softmax.
        layers = [
            nn.Linear(obs_space, hidden_size),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_size, action_space),
            nn.Softmax(dim=-1),
        ]
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Return the softmax-normalised action probabilities for ``x``."""
        probabilities = self.network(x)
        return probabilities

class ValueNetwork(nn.Module):
    """State-value MLP: observation in, one unbounded scalar out.

    Two ReLU hidden layers of width ``hidden_size`` feed a final linear layer
    with a single output and no activation.
    """

    def __init__(self, obs_space, hidden_size=128):
        super().__init__()

        input_layer = nn.Linear(obs_space, hidden_size)
        hidden_layer = nn.Linear(hidden_size, hidden_size)
        output_layer = nn.Linear(hidden_size, 1)

        # Positions 0, 2 and 4 hold the linear layers, as in the policy MLP.
        self.network = nn.Sequential(
            input_layer,
            nn.ReLU(inplace=True),
            hidden_layer,
            nn.ReLU(inplace=True),
            output_layer,
        )

    def forward(self, x):
        """Return the value estimate; the last dimension of the output is 1."""
        estimate = self.network(x)
        return estimate
    
class GaussianPolicy(nn.Module):
    """Diagonal-Gaussian policy head for continuous actions.

    A shared two-layer ReLU trunk feeds two linear heads: one producing the
    mean and one producing the log standard deviation of each action
    dimension. The log-std is clamped before exponentiation.

    Args:
        obs_space: Size of the observation vector.
        action_space: Number of action dimensions.
        hidden_size: Width of both hidden layers.
        log_std_min: Lower clamp applied to the predicted log-std.
        log_std_max: Upper clamp applied to the predicted log-std.

    Raises:
        ValueError: If ``log_std_min`` is greater than ``log_std_max``.
    """

    def __init__(self, obs_space, action_space, hidden_size=128,
                 log_std_min=-2, log_std_max=20):
        super().__init__()

        if log_std_min > log_std_max:
            raise ValueError("log_std_min must not exceed log_std_max")

        # NOTE(review): the defaults keep the original bounds (-2, 20), which
        # allow std up to exp(20). Many implementations use (-20, 2) instead;
        # confirm which was intended before changing the defaults.
        self.log_std_min = log_std_min
        self.log_std_max = log_std_max

        self.network = nn.Sequential(
            nn.Linear(obs_space, hidden_size),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(inplace=True),
        )

        self.mean = nn.Linear(hidden_size, action_space)
        self.log_std = nn.Linear(hidden_size, action_space)

    def forward(self, x):
        """Return ``(mean, std)`` of the action distribution for ``x``.

        ``std`` is ``exp(clamp(log_std, log_std_min, log_std_max))`` and is
        therefore strictly positive.
        """
        features = self.network(x)

        mean = self.mean(features)
        log_std = torch.clamp(
            self.log_std(features),
            min=self.log_std_min,
            max=self.log_std_max,
        )

        return mean, log_std.exp()

In [17]:
# Task selection; the options considered are MountainCar-v0 and Acrobot-v1.
env_name = 'MountainCar-v0'
env = gym.make(env_name)

# Length of the observation vector and number of discrete actions.
obs_space, action_space = env.observation_space.shape[0], env.action_space.n

In [18]:
# Learning rate keyed on the task: 1e-3 for MountainCar-v0, 5e-2 otherwise.
lr = 1e-3 if env_name == "MountainCar-v0" else 5e-2

In [40]:
# def run_episode(env, env_name, policy_network):
#     rewards, actions, states = [], [], []
#     action_probs = []

#     S = env.reset()

#     done = False
#     episode_len = 0
#     policy_network.eval()
#     while not done:

#         A_prob = policy_network(torch.tensor(S)[None])
#         distribution = Categorical(A_prob)
#         A = distribution.sample()

#         S_, R, done, _ = env.step(A.item())
#         episode_len += 1

#         if env_name == "MountainCar-v0" and S_[0] > -0.2:
#             R = 1

#         states.append(S)
#         actions.append(A)
#         rewards.append(R)

#         action_probs.append(A_prob[0, A.item()])

#         S = S_
#         if done:
#             break
#     env.close()
#     return rewards, actions, states, action_probs

In [41]:
# def calculate_discounted_rewards(rewards, gamma=0.8):
#     G = []
#     R = 0
#     for r in rewards[::-1]:
#         R = r + gamma * R
#         G.insert(0, R)
#     G = torch.tensor(G)
#     G = (G - G.mean()) / G.std()
    
#     return G

### Vanilla Policy Gradient

In [None]:
# Vanilla policy gradient (REINFORCE) on a discrete-action task.
# Tasks this cell is written for: MountainCar-v0, Acrobot-v1
env_name = 'Acrobot-v1'
env = gym.make(env_name)

obs_space = env.observation_space.shape[0]
action_space = env.action_space.n

MAX_EPISODES = 1000
lr_policy = 1.e-3
gamma = 0.5
RETURN_EPS = 1e-8  # keeps the return normalisation finite when std is ~0

random.seed(4)
torch.manual_seed(4)
np.random.seed(4)

policy_network = VanillaPolicyNetwork(obs_space, action_space, hidden_size=512)

policy_optimizer = torch.optim.Adam(policy_network.parameters(), lr=lr_policy)

# Undiscounted return of every finished episode, as plain floats.
returns = []

policy_network.train()

for episode in range(MAX_EPISODES):

    rewards, actions, states = [], [], []
    log_probs = []

    S = env.reset()

    done = False
    while not done:

        # Cast explicitly: the observation dtype is decided by the
        # environment, while the network weights are float32.
        obs = torch.as_tensor(S, dtype=torch.float32).unsqueeze(0)
        dist = Categorical(policy_network(obs))
        action = dist.sample()

        S_, R, done, _ = env.step(action.item())

        # Reward shaping used only for MountainCar-v0.
        if env_name == "MountainCar-v0" and S_[0] > -0.2:
            R = 1

        states.append(S)
        actions.append(action)
        rewards.append(R)
        log_probs.append(dist.log_prob(action).sum())

        S = S_

    # Discounted return G_t for every step, accumulated back to front.
    G = []
    running_return = 0.0
    for r in reversed(rewards):
        running_return = r + gamma * running_return
        G.append(running_return)
    G.reverse()
    G = torch.tensor(G, dtype=torch.float32)

    # Standardise the returns. A single-step episode has no defined std, so
    # it is left unnormalised rather than turned into NaN.
    if G.numel() > 1:
        G = (G - G.mean()) / (G.std() + RETURN_EPS)

    # REINFORCE objective: -sum_t log pi(a_t | s_t) * G_t
    policy_loss = -(torch.stack(log_probs) * G).sum()

    policy_optimizer.zero_grad()
    policy_loss.backward()
    policy_optimizer.step()

    returns.append(float(sum(rewards)))

    if (episode + 1) % 50 == 0:
        # The printed value is the mean episode return over all episodes so far.
        print(f"Episode: {episode+1:4d}, Avg. Rewards: {np.mean(returns):.4f}")

# Release the environment once, after training has finished.
env.close()

Episode:   50, Avg. Rewards: -500.0000
Episode:  100, Avg. Rewards: -356.6200
Episode:  150, Avg. Rewards: -296.0067
Episode:  200, Avg. Rewards: -268.3200
Episode:  250, Avg. Rewards: -289.0120
Episode:  300, Avg. Rewards: -324.1767


In [None]:
# # MountainCar-v0, Acrobot-v1
# env_name = 'Acrobot-v1'
# env = gym.make(env_name)

# obs_space = env.observation_space.shape[0]
# action_space = env.action_space.n

# MAX_EPISODES = 1000
# lr = 1e-4
# gamma = 0.5

# random.seed(4)
# torch.manual_seed(4)
# np.random.seed(4)

# policy_network = VanillaPolicyNetwork(obs_space, action_space, hidden_size=512)

# optimizer = torch.optim.AdamW(policy_network.parameters(), lr=lr)

# returns = []
# for episode in range(MAX_EPISODES):

#     rewards, actions, states, action_probs = run_episode(env, env_name, policy_network)
            
#     policy_network.train()

#     Q_t = calculate_discounted_rewards(rewards, gamma)
    
#     rewards = torch.tensor(rewards)
    
#     states = torch.tensor(states).float()
#     actions = torch.tensor(actions)
    
#     probabilities = policy_network(states)
#     distribution = Categorical(probabilities)
    
#     loss = torch.sum(-distribution.log_prob(actions) * Q_t)
    
#     optimizer.zero_grad()
#     loss.backward()
#     optimizer.step()
    
#     avg_rewards = torch.sum(rewards)
#     returns.append(avg_rewards)
    
#     if (episode+1) % 50 == 0:
#         # print(states.shape, actions.shape, rewards.shape, Q_t.shape)
#         print(f"Episode: {episode+1:4d}, Avg. Rewards: {np.mean(returns):.4f}")

### Policy Gradient with Baselines

In [None]:
# Policy gradient with mean-subtracted returns on a discrete-action task.
# Tasks this cell is written for: MountainCar-v0, Acrobot-v1
env_name = 'MountainCar-v0'
env = gym.make(env_name)

obs_space = env.observation_space.shape[0]
action_space = env.action_space.n

MAX_EPISODES = 1000
lr_policy = 1.e-3
gamma = 0.5
RETURN_EPS = 1e-8  # keeps the return normalisation finite when std is ~0

random.seed(4)
torch.manual_seed(4)
np.random.seed(4)

policy_network = VanillaPolicyNetwork(obs_space, action_space, hidden_size=512)

policy_optimizer = torch.optim.Adam(policy_network.parameters(), lr=lr_policy)

# Undiscounted return of every finished episode, as plain floats.
returns = []

policy_network.train()

for episode in range(MAX_EPISODES):

    rewards, actions, states = [], [], []
    log_probs = []

    S = env.reset()

    done = False
    while not done:

        # Cast explicitly: the observation dtype is decided by the
        # environment, while the network weights are float32.
        obs = torch.as_tensor(S, dtype=torch.float32).unsqueeze(0)
        dist = Categorical(policy_network(obs))
        action = dist.sample()

        S_, R, done, _ = env.step(action.item())

        # Reward shaping used only for MountainCar-v0.
        if env_name == "MountainCar-v0" and S_[0] > -0.2:
            R = 1

        states.append(S)
        actions.append(action)
        rewards.append(R)
        log_probs.append(dist.log_prob(action).sum())

        S = S_

    # Discounted return G_t for every step, accumulated back to front.
    G = []
    running_return = 0.0
    for r in reversed(rewards):
        running_return = r + gamma * running_return
        G.append(running_return)
    G.reverse()
    G = torch.tensor(G, dtype=torch.float32)

    # Subtracting the per-episode mean return acts as a constant baseline;
    # no learned value baseline is used in this cell. A single-step episode
    # has no defined std, so it is left unnormalised rather than made NaN.
    if G.numel() > 1:
        G = (G - G.mean()) / (G.std() + RETURN_EPS)

    # Policy-gradient objective: -sum_t log pi(a_t | s_t) * (normalised G_t)
    policy_loss = -(torch.stack(log_probs) * G).sum()

    policy_optimizer.zero_grad()
    policy_loss.backward()
    policy_optimizer.step()

    returns.append(float(sum(rewards)))

    if (episode + 1) % 50 == 0:
        # The printed value is the mean episode return over all episodes so far.
        print(f"Episode: {episode+1:4d}, Avg. Rewards: {np.mean(returns):.4f}")

# Release the environment once, after training has finished.
env.close()

In [None]:
# # MountainCar-v0, Acrobot-v1
# env_name = 'Acrobot-v1'
# env = gym.make(env_name)

# obs_space = env.observation_space.shape[0]
# action_space = env.action_space.n

# MAX_EPISODES = 1000
# lr = 1e-4
# gamma = 0.5

# random.seed(4)
# torch.manual_seed(4)
# np.random.seed(4)

# policy_network = VanillaPolicyNetwork(obs_space, action_space, hidden_size=512)

# optimizer = torch.optim.AdamW(policy_network.parameters(), lr=lr)

# returns = []
# Q_t_avg = []
# for episode in range(MAX_EPISODES):

#     rewards, actions, states, action_probs = run_episode(env, env_name, policy_network)
            
#     policy_network.train()
    
#     Q_t = calculate_discounted_rewards(rewards, gamma)
    
#     rewards = torch.tensor(rewards)
    
#     states = torch.tensor(states).float()
#     actions = torch.tensor(actions)
    
#     probabilities = policy_network(states)
#     distribution = Categorical(probabilities)
    
#     loss = torch.sum(-distribution.log_prob(actions) * Q_t)
    
#     optimizer.zero_grad()
#     loss.backward()
#     optimizer.step()
    
#     avg_rewards = torch.sum(rewards)
#     # print(avg_rewards.shape)
#     returns.append(avg_rewards)
    
#     if (episode+1) % 50 == 0:
        
#         # print(states.shape, actions.shape, rewards.shape, Q_t.shape)
#         print(f"Episode: {episode+1:4d}, Avg. Rewards: {np.mean(returns):.4f}")

### Actor-Critic Method (single-optimizer draft, commented out)

In [64]:
# # MountainCar-v0, Acrobot-v1
# env_name = 'MountainCar-v0'
# env = gym.make(env_name)

# obs_space = env.observation_space.shape[0]
# action_space = env.action_space.n

# MAX_EPISODES = 1000
# lr = 1e-3
# gamma = 0.5

# random.seed(4)
# torch.manual_seed(4)
# np.random.seed(4)

# policy_network = VanillaPolicyNetwork(obs_space, action_space, hidden_size=512)
# value_network = ValueNetwork(obs_space, hidden_size=512)

# params = list(policy_network.parameters()) + list(value_network.parameters())
# optimizer = torch.optim.AdamW(policy_network.parameters(), lr=lr)

# returns = []
# Q_t_avg = []
# for episode in range(MAX_EPISODES):

#     rewards, actions, states, action_probs = run_episode(env, env_name, policy_network)
            
#     policy_network.train()
#     value_network.train()
    
#     Q_t = calculate_discounted_rewards(rewards, gamma)
    
#     rewards = torch.tensor(rewards)
    
#     states = torch.tensor(states).float()
#     actions = torch.tensor(actions)
#     action_probs = torch.tensor(action_probs)
    
#     probabilities = policy_network(states)
#     state_values = value_network(states)
    
#     distribution = Categorical(probabilities)
    
#     loss = torch.sum(-distribution.log_prob(actions) * (Q_t - state_values.detach().squeeze(1))) + F.smooth_l1_loss(state_values.squeeze(1), Q_t)
    
#     optimizer.zero_grad()
#     loss.backward()
#     optimizer.step()
    
#     avg_rewards = torch.sum(rewards)
#     returns.append(avg_rewards)
    
#     if (episode + 1) % 50 == 0:
        
#         # print(states.shape, actions.shape, rewards.shape, Q_t.shape)
#         print(f"Episode: {episode+1:4d}, Avg. Rewards: {np.mean(returns):.4f}")

### Actor-Critic Methods

In [53]:
# Monte-Carlo actor-critic on a discrete-action task: the critic regresses
# the discounted return and the actor is weighted by (return - value).
# Tasks this cell is written for: MountainCar-v0, Acrobot-v1
env_name = 'Acrobot-v1'
env = gym.make(env_name)

obs_space = env.observation_space.shape[0]
action_space = env.action_space.n

MAX_EPISODES = 1000
lr_policy = 1e-3
lr_value = 1e-3
gamma = 0.5

random.seed(4)
torch.manual_seed(4)
np.random.seed(4)

policy_network = VanillaPolicyNetwork(obs_space, action_space, hidden_size=512)
value_network = ValueNetwork(obs_space, hidden_size=512)

policy_optimizer = torch.optim.Adam(policy_network.parameters(), lr=lr_policy)
value_optimizer = torch.optim.Adam(value_network.parameters(), lr=lr_value)

# Undiscounted return of every finished episode, as plain floats.
returns = []

policy_network.train()
value_network.train()

for episode in range(MAX_EPISODES):

    rewards, actions, states = [], [], []
    log_probs = []

    S = env.reset()

    done = False
    while not done:

        # Cast explicitly: the observation dtype is decided by the
        # environment, while the network weights are float32.
        obs = torch.as_tensor(S, dtype=torch.float32).unsqueeze(0)
        dist = Categorical(policy_network(obs))
        action = dist.sample()

        S_, R, done, _ = env.step(action.item())

        # Reward shaping used only for MountainCar-v0.
        if env_name == "MountainCar-v0" and S_[0] > -0.2:
            R = 1

        states.append(S)
        actions.append(action)
        rewards.append(R)
        log_probs.append(dist.log_prob(action).sum())

        S = S_

    # Evaluate the critic on the whole trajectory in one batch. squeeze(-1)
    # drops only the output dimension, so a one-step episode stays 1-D.
    state_batch = torch.as_tensor(np.asarray(states), dtype=torch.float32)
    value_estimates = value_network(state_batch).squeeze(-1)

    # Discounted return G_t for every step, accumulated back to front.
    G = []
    running_return = 0.0
    for r in reversed(rewards):
        running_return = r + gamma * running_return
        G.append(running_return)
    G.reverse()
    G = torch.tensor(G, dtype=torch.float32)

    # The advantage is a fixed weight for the actor: detach it so the policy
    # loss does not back-propagate into the critic.
    advantage = (G - value_estimates).detach()

    value_loss = F.mse_loss(value_estimates, G)
    value_optimizer.zero_grad()
    value_loss.backward()
    value_optimizer.step()

    policy_loss = -(torch.stack(log_probs) * advantage).sum()
    policy_optimizer.zero_grad()
    policy_loss.backward()
    policy_optimizer.step()

    returns.append(float(sum(rewards)))

    if (episode + 1) % 50 == 0:
        # The printed value is the mean episode return over all episodes so far.
        print(f"Episode: {episode+1:4d}, Avg. Rewards: {np.mean(returns):.4f}")

# Release the environment once, after training has finished.
env.close()

Episode:   50, Avg. Rewards: -279.0800
Episode:  100, Avg. Rewards: -222.6600
Episode:  150, Avg. Rewards: -199.4333
Episode:  200, Avg. Rewards: -186.5600
Episode:  250, Avg. Rewards: -185.7840
Episode:  300, Avg. Rewards: -184.9667
Episode:  350, Avg. Rewards: -186.9057
Episode:  400, Avg. Rewards: -183.3150
Episode:  450, Avg. Rewards: -180.0911
Episode:  500, Avg. Rewards: -178.7140
Episode:  550, Avg. Rewards: -180.2746
Episode:  600, Avg. Rewards: -185.9017
Episode:  650, Avg. Rewards: -191.3738
Episode:  700, Avg. Rewards: -190.5771
Episode:  750, Avg. Rewards: -187.8427
Episode:  800, Avg. Rewards: -186.6400
Episode:  850, Avg. Rewards: -184.3577
Episode:  900, Avg. Rewards: -182.0700
Episode:  950, Avg. Rewards: -180.2579
Episode: 1000, Avg. Rewards: -179.2480


### Gaussian Policy

In [65]:
# Monte-Carlo actor-critic with a Gaussian policy on a continuous-action task.
env_name = 'MountainCarContinuous-v0'
env = gym.make(env_name)

obs_space = env.observation_space.shape[0]
action_space = env.action_space

MAX_EPISODES = 1000
lr_policy = 1e-3
lr_value = 1e-3
gamma = 0.5

random.seed(4)
torch.manual_seed(4)
np.random.seed(4)

# Number of action dimensions, read from the action space's shape.
num_outputs = action_space.shape[0]

gaussian_policy = GaussianPolicy(obs_space, num_outputs, hidden_size=512)
value_network = ValueNetwork(obs_space, hidden_size=512)

policy_optimizer = torch.optim.Adam(gaussian_policy.parameters(), lr=lr_policy)
value_optimizer = torch.optim.Adam(value_network.parameters(), lr=lr_value)

# Undiscounted return of every finished episode, as plain floats.
returns = []

gaussian_policy.train()
value_network.train()

for episode in range(MAX_EPISODES):

    rewards, actions, states = [], [], []
    log_probs = []

    S = env.reset()

    done = False
    while not done:

        # Cast explicitly: the observation dtype is decided by the
        # environment, while the network weights are float32.
        obs = torch.as_tensor(S, dtype=torch.float32).unsqueeze(0)
        mu, std = gaussian_policy(obs)
        dist = Normal(mu, std)
        raw_action = dist.sample()

        # The log-probability is taken of the unsquashed sample; tanh is then
        # applied only to produce the action handed to the environment.
        log_prob = dist.log_prob(raw_action).sum()
        action = torch.tanh(raw_action).squeeze(0).numpy()

        S_, R, done, _ = env.step(action)

        states.append(S)
        actions.append(action)
        rewards.append(R)
        log_probs.append(log_prob)

        S = S_

    # Evaluate the critic on the whole trajectory in one batch. squeeze(-1)
    # drops only the output dimension, so a one-step episode stays 1-D.
    state_batch = torch.as_tensor(np.asarray(states), dtype=torch.float32)
    value_estimates = value_network(state_batch).squeeze(-1)

    # Discounted return G_t for every step, accumulated back to front.
    G = []
    running_return = 0.0
    for r in reversed(rewards):
        running_return = r + gamma * running_return
        G.append(running_return)
    G.reverse()
    # Pin float32 so the targets match the critic's output dtype.
    G = torch.tensor(G, dtype=torch.float32)

    # The advantage is a fixed weight for the actor: detach it so the policy
    # loss does not back-propagate into the critic.
    advantage = (G - value_estimates).detach()

    v_loss = F.mse_loss(value_estimates, G)
    value_optimizer.zero_grad()
    v_loss.backward()
    value_optimizer.step()

    policy_loss = -(torch.stack(log_probs) * advantage).sum()
    policy_optimizer.zero_grad()
    policy_loss.backward()
    policy_optimizer.step()

    returns.append(float(sum(rewards)))

    if (episode + 1) % 50 == 0:
        # The printed value is the mean episode return over all episodes so far.
        print(f"Episode: {episode+1:4d}, Avg. Rewards: {np.mean(returns):.4f}")

# Release the environment once, after training has finished.
env.close()

Episode:   50, Avg. Rewards: -4.3610
Episode:  100, Avg. Rewards: -3.1189
Episode:  150, Avg. Rewards: -2.6732
Episode:  200, Avg. Rewards: -2.4493
Episode:  250, Avg. Rewards: -2.3181
Episode:  300, Avg. Rewards: -2.2327
Episode:  350, Avg. Rewards: -2.1646
Episode:  400, Avg. Rewards: -2.1164
Episode:  450, Avg. Rewards: -2.0841
Episode:  500, Avg. Rewards: -2.0598
Episode:  550, Avg. Rewards: -2.0340
Episode:  600, Avg. Rewards: -2.0133
Episode:  650, Avg. Rewards: -1.9962
Episode:  700, Avg. Rewards: -1.9817
Episode:  750, Avg. Rewards: -1.9698
Episode:  800, Avg. Rewards: -1.9577
Episode:  850, Avg. Rewards: -1.9458
Episode:  900, Avg. Rewards: -1.9355
Episode:  950, Avg. Rewards: -1.9273
Episode: 1000, Avg. Rewards: -1.9195
