In [56]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim
import numpy as np
import gym
from torch.autograd import Variable
import copy
import time

In [57]:
M = 1000
critic_learning_rate = 0.1
actor_learning_rate = 0.1
noise = 0.05
batch_size = 50
gamma = 0.9

In [58]:
class Step():
    def __init__(self,state,action,reward,next_state):
        self.state = state
        self.action = action
        self.reward = reward
        self.next_state = next_state

In [59]:
class Critic(nn.Module):
    def __init__(self):
        super(Critic, self).__init__()
        self.layer1 = nn.Linear(4,64)
        self.layer2 = nn.Linear(64,64)
        self.layer3 = nn.Linear(64,1)
    
    def forward(self,x):
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        x = self.layer3(x)
        return x
        
class Actor(nn.Module):
    def __init__(self):
        super(Actor, self).__init__()
        self.layer1 = nn.Linear(3,64)
        self.layer2 = nn.Linear(64,64)
        self.layer3 = nn.Linear(64,1)
        
    def forward(self,x):
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        x = F.tanh(self.layer3(x))
        return x

In [60]:
critic = Critic()
actor = Actor()

target_critic = copy.deepcopy(critic)
target_actor = copy.deepcopy(actor)

critic_optimizer = torch.optim.Adam(critic.parameters(), lr = critic_learning_rate)
actor_optimizer = torch.optim.Adam(actor.parameters(), lr = actor_learning_rate)

In [61]:
t = time.time()
env = gym.make('Pendulum-v0')
episode = []

for n in range(M):
    
    total_reward = 0
    
    state = env.reset()
    state = torch.from_numpy(state).float()
    
    terminal = False
    while not terminal:
        
        #print(f"Start {time.time() - t}")
        
        action = actor(state) * 2
        
        #print(action)
        
        action = action.item()
        action += (noise * np.random.randn())
        
        next_state, reward, terminal, _ = env.step([action])
        
        total_reward += reward
        
        transition = Step(state, action, reward, next_state)
        episode.append(transition)
        
        env.render()
        
        #print(f"Before batch {time.time() - t}")
        
        if batch_size < len(episode):
            
            batch = np.random.choice(episode,batch_size)
            batch_matrix = []
            
            for step in batch:
                temp_list = []
                temp_list.append(step.state)
                temp_list.append(step.action)
                temp_list.append(step.reward)
                temp_list.append(step.next_state)
                batch_matrix.append(temp_list)
            
            #print(f"After batch {time.time() - t}")
            
            batch_matrix = np.array(batch_matrix)
            
            states = batch_matrix[:,0]
            
            actions = torch.from_numpy(np.array(batch_matrix[:,1],dtype = np.float16))

            rewards = torch.from_numpy(np.array(batch_matrix[:,2],dtype = np.float16))

            next_states = torch.Tensor((batch_matrix[:,3]).tolist())

            target_policy_output = target_actor(next_states)
            
            target_critic_input = torch.hstack([next_states,target_policy_output])

            y = rewards + (gamma * torch.flatten(target_critic(target_critic_input)))

            for i in range(batch_size):
                critic_optimizer.zero_grad()
                state_tensor = torch.Tensor(states[i])
                critic_input = torch.hstack([state_tensor,actions[i]])
                critic_output = critic(critic_input)

                loss = (y[i]  - critic_output) ** 2

                loss.backward(retain_graph=True)
                critic_optimizer.step()
            
            #print(f"After opt1 {time.time() - t}")
            
            #print(f"Before: {list(actor.parameters())[0][0]}")
            qvalues = []
            
            for i in range(batch_size):
                actor_optimizer.zero_grad()
                state_tensor = torch.Tensor(states[i])
                critic_input = torch.hstack([state_tensor,actor(state_tensor)])
                #print(critic_input)
                critic_output = critic(critic_input)
                #print(critic_output)
                qvalues.append(critic_output)
            
            #print(qvalues)
            qvalues = torch.cat(qvalues,0)
            loss = -qvalues.mean()
            #print(f"Loss: {loss}")
            #print(loss)
            loss.backward(retain_graph=True)
            actor_optimizer.step()
            
            #print(f"After: {list(actor.parameters())[0][0]}")
            
            #print(f"After opt2 {time.time() - t}")
            
            with torch.no_grad():
                for index,parameter in enumerate(target_critic.parameters()):
                    critic_parameter = list(critic.parameters())[index]
                    new_value = (parameter * (1-critic_learning_rate)) + (critic_parameter * critic_learning_rate)
                    parameter.copy_(new_value)
            
            #print(f"Before: {list(target_actor.parameters())[0][0]}")
            with torch.no_grad():
                for index,parameter in enumerate(target_actor.parameters()):
                    actor_parameter = list(actor.parameters())[index]
                    new_value = (parameter * (1-actor_learning_rate)) + (actor_parameter * actor_learning_rate)
                    parameter.copy_(new_value)
            #print(f"After: {list(target_actor.parameters())[0][0]}")
            
        state = torch.from_numpy(next_state).float()
        
    print(f"Episode {n+1}: {total_reward}")



Episode 1: -1346.1186047657757
Episode 2: -1411.9254885761488
Episode 3: -1654.0849243393898
Episode 4: -1482.8070471245276


KeyboardInterrupt: 

In [None]:
actor(torch.Tensor([1,1,1]))

In [66]:
actor(torch.Tensor([2,1,2]))

tensor([-1.], grad_fn=<TanhBackward0>)

In [68]:
actor(torch.Tensor([0,0,0]))

tensor([1.], grad_fn=<TanhBackward0>)