In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim
import numpy as np
import gym
from torch.autograd import Variable
import copy

In [10]:
M = 100
critic_learning_rate = 0.001
actor_learning_rate = 0.001
noise = 0.01
batch_size = 50
gamma = 0.9

In [11]:
class Step():
    def __init__(self,state,action,reward,next_state):
        self.state = state
        self.action = action
        self.reward = reward
        self.next_state = next_state

In [12]:
class Critic(nn.Module):
    def __init__(self):
        super(Critic, self).__init__()
        self.layer1 = nn.Linear(4,64)
        self.layer2 = nn.Linear(64,128)
        self.layer3 = nn.Linear(128,1)
    
    def forward(self,x):
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        x = F.tanh(self.layer3(x))
        return x
        
class Actor(nn.Module):
    def __init__(self):
        super(Actor, self).__init__()
        self.layer1 = nn.Linear(3,64)
        self.layer2 = nn.Linear(64,128)
        self.layer3 = nn.Linear(128,1)
        
    def forward(self,x):
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        x = F.tanh(self.layer3(x))
        return x

In [13]:
critic = Critic()
actor = Actor()

target_critic = copy.deepcopy(critic)
target_actor = copy.deepcopy(actor)

critic_optimizer = torch.optim.Adam(critic.parameters(), lr = critic_learning_rate)
actor_optimizer = torch.optim.Adam(actor.parameters(), lr = actor_learning_rate)

In [14]:
env = gym.make('Pendulum-v0')
episode = []

In [15]:
for _ in range(M):
    
    total_reward = 0
    
    state = env.reset()
    state = torch.from_numpy(state).float()
    
    terminal = False
    while not terminal:
        
        action = actor(state) * 2
        action = action.item()
        action += (noise * np.random.randn())
        
        next_state, reward, terminal, _ = env.step([action])
        
        total_reward += reward
        
        transition = Step(state, action, reward, next_state)
        episode.append(transition)
        
        #env.render()
        
        if batch_size < len(episode):
            
            batch = np.random.choice(episode,batch_size)
            batch_matrix = []
            
            for step in batch:
                temp_list = []
                temp_list.append(step.state)
                temp_list.append(step.action)
                temp_list.append(step.reward)
                temp_list.append(step.next_state)
                batch_matrix.append(temp_list)
            
            batch_matrix = np.array(batch_matrix)
            
            states = batch_matrix[:,0]
            
            actions = torch.from_numpy(np.array(batch_matrix[:,1],dtype = np.float16))

            rewards = torch.from_numpy(np.array(batch_matrix[:,2],dtype = np.float16))

            next_states = torch.Tensor((batch_matrix[:,3]).tolist())

            target_policy_output = target_actor(next_states)
            
            target_critic_input = torch.hstack([next_states,target_policy_output])

            y = rewards + (gamma * torch.flatten(target_critic(target_critic_input)))
            
            for i in range(batch_size):
                critic_optimizer.zero_grad()
                state_tensor = torch.Tensor(states[i])
                critic_input = torch.hstack([state_tensor,actions[i]])
                critic_output = critic(critic_input)

                loss = (y[i]  - critic_output) ** 2

                loss.backward(retain_graph=True)
                critic_optimizer.step()
                
            for i in range(batch_size):
                actor_optimizer.zero_grad()
                state_tensor = torch.Tensor(states[i])
                critic_input = torch.hstack([state_tensor,actor(state_tensor)])
                critic_output = critic(critic_input)

                loss = -critic_output

                loss.backward(retain_graph=True)
                actor_optimizer.step()
            
            with torch.no_grad():
                for index,parameter in enumerate(target_critic.parameters()):
                    critic_parameter = list(critic.parameters())[index]
                    new_value = (parameter * (1-critic_learning_rate)) + (critic_parameter * critic_learning_rate)
                    parameter.copy_(new_value)
                    
            with torch.no_grad():
                for index,parameter in enumerate(target_actor.parameters()):
                    actor_parameter = list(actor.parameters())[index]
                    new_value = (parameter * (1-actor_learning_rate)) + (actor_parameter * actor_learning_rate)
                    parameter.copy_(new_value)
                    
    print(total_reward)



-1364.1002169630556
-1044.8631553452415
-1312.0436620120527
-1456.609074734895
-1762.4644271461775
-1438.8476619189394
-1637.500229171146
-1445.2422131926196
-952.6326292462105
-1172.002343613598
-1216.0872663647488
-1356.8513818948572
-1189.4348941489018


KeyboardInterrupt: 