In [1]:
import torch  
import gym
import numpy as np  
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import random

In [2]:
class OUActionNoise():
    def __init__(self, mu, sigma=0.15, theta=0.2, dt=1e-2, x0=None):
        self.theta = theta
        self.mu = mu
        self.sigma = sigma
        self.dt = dt
        self.x0 = x0
        self.reset()

    def __call__(self):
        x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + \
                self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape)
        self.x_prev = x

        return x

    def reset(self):
        self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu)

In [3]:
class ReplayBuffer():
    def __init__(self, max_size, input_shape, n_actions):
        self.mem_size = max_size
        self.mem_cntr = 0
        self.state_memory = np.zeros((self.mem_size, input_shape))
        self.new_state_memory = np.zeros((self.mem_size, input_shape))
        self.action_memory = np.zeros((self.mem_size, n_actions))
        self.reward_memory = np.zeros(self.mem_size)
        self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool)

    def store_transition(self, state, action, reward, state_, done):
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state
        self.action_memory[index] = action
        self.reward_memory[index] = reward
        self.new_state_memory[index] = state_
        self.terminal_memory[index] = done

        self.mem_cntr += 1

    def sample_buffer(self, batch_size):
        max_mem = min(self.mem_cntr, self.mem_size)

        batch = np.random.choice(max_mem, batch_size)

        states = self.state_memory[batch]
        actions = self.action_memory[batch]
        rewards = self.reward_memory[batch]
        states_ = self.new_state_memory[batch]
        dones = self.terminal_memory[batch]

        return states, actions, rewards, states_, dones

In [4]:
class CriticNetwork(nn.Module):
    def __init__(self, input_dims, fc1_dims, fc2_dims, n_actions, name):
        super().__init__()
        self.input_dims = input_dims
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims
        self.n_actions = n_actions
        self.name = name

        self.fc1 = nn.Linear(self.input_dims, self.fc1_dims)
        self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims)
        
        self.bn1 = nn.LayerNorm(self.fc1_dims)
        self.bn2 = nn.LayerNorm(self.fc2_dims)

        self.action_value = nn.Linear(self.n_actions, self.fc2_dims)
        self.q = nn.Linear(self.fc2_dims, 1)
        
        f1 = 1./np.sqrt(self.fc1.weight.data.size()[0])
        self.fc1.weight.data.uniform_(-f1, f1)
        self.fc1.bias.data.uniform_(-f1, f1)

        f2 = 1./np.sqrt(self.fc2.weight.data.size()[0])
        self.fc2.weight.data.uniform_(-f2, f2)
        self.fc2.bias.data.uniform_(-f2, f2)

        f3 = 0.003
        self.q.weight.data.uniform_(-f3, f3)
        self.q.bias.data.uniform_(-f3, f3)

        f4 = 1./np.sqrt(self.action_value.weight.data.size()[0])
        self.action_value.weight.data.uniform_(-f4, f4)
        self.action_value.bias.data.uniform_(-f4, f4)
        
        self.optimizer = optim.Adam(self.parameters(), lr=1e-3, weight_decay=0.01)
        
        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cuda:1')

        self.to(self.device)

    def forward(self, state, action):
        state_value = self.fc1(state)
        state_value = self.bn1(state_value)
        state_value = F.relu(state_value)
        state_value = self.fc2(state_value)
        state_value = self.bn2(state_value)
        action_value = self.action_value(action)
        state_action_value = F.relu(torch.add(state_value, action_value))
        state_action_value = self.q(state_action_value)
        return state_action_value

In [5]:
class ActorNetwork(nn.Module):
    def __init__(self, input_dims, fc1_dims, fc2_dims, n_actions, name):
        super().__init__()
        self.input_dims = input_dims
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims
        self.n_actions = n_actions
        self.name = name

        self.fc1 = nn.Linear(self.input_dims, self.fc1_dims)
        self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims)
        
        self.bn1 = nn.LayerNorm(self.fc1_dims)
        self.bn2 = nn.LayerNorm(self.fc2_dims)
        
        self.mu = nn.Linear(self.fc2_dims, self.n_actions)
        
        f2 = 1./np.sqrt(self.fc2.weight.data.size()[0])
        self.fc2.weight.data.uniform_(-f2, f2)
        self.fc2.bias.data.uniform_(-f2, f2)

        f1 = 1./np.sqrt(self.fc1.weight.data.size()[0])
        self.fc1.weight.data.uniform_(-f1, f1)
        self.fc1.bias.data.uniform_(-f1, f1)

        f3 = 0.003
        self.mu.weight.data.uniform_(-f3, f3)
        self.mu.bias.data.uniform_(-f3, f3)
        
        self.optimizer = optim.Adam(self.parameters(), lr=1e-4)
        
        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cuda:1')

        self.to(self.device)
        
    def forward(self, state):
        x = self.fc1(state)
        x = self.bn1(x)
        x = F.relu(x)
        x = self.fc2(x)
        x = self.bn2(x)
        x = F.relu(x)
        x = torch.tanh(self.mu(x))
        return x

In [6]:
def remember(state, action, reward, state_, done):
        memory.store_transition(state, action, reward, state_, done)
def update_network_parameters():
        tau = 1e-3

        actor_params = actor.named_parameters()
        critic_params = critic.named_parameters()
        target_actor_params = target_actor.named_parameters()
        target_critic_params = target_critic.named_parameters()

        critic_state_dict = dict(critic_params)
        actor_state_dict = dict(actor_params)
        target_critic_state_dict = dict(target_critic_params)
        target_actor_state_dict = dict(target_actor_params)

        for name in critic_state_dict:
            critic_state_dict[name] = tau*critic_state_dict[name].clone() + \
                                (1-tau)*target_critic_state_dict[name].clone()

        for name in actor_state_dict:
             actor_state_dict[name] = tau*actor_state_dict[name].clone() + \
                                 (1-tau)*target_actor_state_dict[name].clone()

        target_critic.load_state_dict(critic_state_dict)
        target_actor.load_state_dict(actor_state_dict)

In [7]:
# Constants
gamma=0.99
max_episodes = 40
batch_size = 64

# env_name='Pendulum-v0'
env_name='MountainCarContinuous-v0'
env = gym.make(env_name)
obs_dim = env.observation_space.shape[0]
n_actions = env.action_space.shape[0]
memory = ReplayBuffer(1000000, obs_dim, n_actions)
noise = OUActionNoise(mu=np.zeros(n_actions))

In [8]:
actor = ActorNetwork(input_dims=obs_dim, fc1_dims=400, fc2_dims=300, n_actions=n_actions, name='actor')
target_actor = ActorNetwork(input_dims=obs_dim, fc1_dims=400, fc2_dims=300, n_actions=n_actions, name='target_actor')
critic = CriticNetwork(input_dims=obs_dim, fc1_dims=400, fc2_dims=300, n_actions=n_actions, name='critic')
target_critic = CriticNetwork(input_dims=obs_dim, fc1_dims=400, fc2_dims=300, n_actions=n_actions, name='target_critic')

In [9]:
episode_a_loss = 0
episode_c_loss = 0
score_history = []
for episode in range(max_episodes):
    score = 0
    noise.reset()
    state = env.reset()
    done = False
    while not done:
        actor.eval()
        state = torch.as_tensor(state, dtype=torch.float32).to(actor.device)
        mu = actor.forward(state).to(actor.device)
        act = mu + torch.tensor(noise(), dtype=torch.float32).to(actor.device)
        actor.train()
        act = act.cpu().detach().numpy()
        state_, rew, done, _ = env.step(act)
        state_ = torch.tensor(state_, dtype=torch.float32).to(critic.device)
        remember(state.cpu(), act, rew, state_.cpu(), done)
        score += rew
        state = state_
#         env.render()
        if memory.mem_cntr > batch_size:
            states, actions, rewards, states_, dones = memory.sample_buffer(batch_size)
            states = torch.tensor(states, dtype=torch.float32).to(critic.device)
            actions = torch.tensor(actions, dtype=torch.float32).to(critic.device)
            rewards = torch.tensor(rewards, dtype=torch.float32).to(critic.device)
            states_ = torch.tensor(states_, dtype=torch.float32).to(critic.device)
            dones = torch.tensor(dones).to(critic.device)

            target_actions = target_actor.forward(states_)
            critic_value_ = target_critic.forward(states_, target_actions)
            critic_value = critic.forward(states, actions)
            
            critic_value_[done] = 0.0
            critic_value_ = critic_value_.flatten()
            
            target = rewards + gamma*critic_value_
            target = torch.as_tensor(target).to(critic.device)
            target = target.reshape(batch_size, 1)
            
            critic.optimizer.zero_grad()
            critic_loss = F.mse_loss(target, critic_value)
            episode_a_loss += critic_loss
            critic_loss.backward()
            critic.optimizer.step()

            actor.optimizer.zero_grad()
            actor_loss = -critic.forward(states, actor.forward(states))
            actor_loss = torch.mean(actor_loss)
            episode_a_loss += actor_loss
            actor_loss.backward()
            actor.optimizer.step()

            update_network_parameters()
    print('Episode: '+ str(episode) + ' - Score: ' + str(score))
    score_history.append(score)
# env.close()

Episode: 0 - Score: -34.01929184002758
Episode: 1 - Score: -50.47529714816169
Episode: 2 - Score: 47.6678103130207
Episode: 3 - Score: -109.77356505947476
Episode: 4 - Score: -106.50983763829035
Episode: 5 - Score: -81.4234257424921
Episode: 6 - Score: -123.72865452207374
Episode: 7 - Score: -80.85320906799558
Episode: 8 - Score: -66.95820715454404
Episode: 9 - Score: -48.94736289281819
Episode: 10 - Score: -19.235767279554405
Episode: 11 - Score: -10.38710593426718
Episode: 12 - Score: -14.601215833948828
Episode: 13 - Score: -41.42660461963508
Episode: 14 - Score: -3.0047616509549324
Episode: 15 - Score: -7.121531804895552
Episode: 16 - Score: -5.5176203741876195
Episode: 17 - Score: -6.1132833743710275
Episode: 18 - Score: -9.435618272883271
Episode: 19 - Score: -4.119656312910854
Episode: 20 - Score: -4.147068845883336
Episode: 21 - Score: -1.7771852888080664
Episode: 22 - Score: -3.4341158106455882
Episode: 23 - Score: -20.51398636209836
Episode: 24 - Score: -12.711145216778949
Ep