In [18]:
from datetime import datetime
from collections import deque
import os
import random
import gym
import torch
from torch.distributions import Categorical
from torch.nn import Module, Linear
import torch.nn.functional as F
import numpy as np


class QNetwork(Module):
    """Critic network estimating the action value Q(s, a).

    The state and action are concatenated along the last dimension and
    passed through two ReLU hidden layers (48 and 64 units) to a single
    scalar output.

    Args:
        state_dim: Number of state features (default 3).
        action_dim: Number of action components (default 1).
    """

    def __init__(self, state_dim=3, action_dim=1):
        super().__init__()
        self.fc = Linear(state_dim + action_dim, 48)
        self.fcQ1 = Linear(48, 64)
        self.fcQ2 = Linear(64, 1)

    def forward(self, x, a):
        """Return Q(x, a).

        Args:
            x: State tensor of shape ``(state_dim,)`` or ``(batch, state_dim)``.
            a: Action tensor of shape ``(action_dim,)`` or
                ``(batch, action_dim)``.

        Returns:
            Tensor of shape ``(1,)`` or ``(batch, 1)`` with the Q estimate.
        """
        # The action must NOT be detached here: the actor is trained by
        # back-propagating -Q(s, pi(s)) through this input.  Callers that
        # want a constant action (e.g. when building TD targets) should wrap
        # the call in torch.no_grad() or pass a tensor without grad history.
        # Concatenating on the last dimension handles both single samples
        # and batches.
        x = torch.cat([x, a], dim=-1)

        x = F.relu(self.fc(x))
        x = F.relu(self.fcQ1(x))
        return self.fcQ2(x)


class PolicyNetwork(torch.nn.Module):
    """Deterministic policy: one bias-free linear map from 3 inputs to 1 output."""

    def __init__(self):
        super().__init__()
        # The only learnable parameter is this layer's (1, 3) weight matrix.
        self.fc = torch.nn.Linear(in_features=3, out_features=1, bias=False)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        return self.fc(x)
    
    
    

# network and optimizer
# Critic Q(s, a) with its Adam optimizer.
Q = QNetwork()
Q_optimizer = torch.optim.Adam(Q.parameters(), lr=0.0005)

# Actor pi(s) with its Adam optimizer.
pi = PolicyNetwork()
Actor_optimizer = torch.optim.Adam(pi.parameters(), lr=0.0005)

# target network
# Start the target critic as an exact copy of Q; otherwise the first TD
# targets are produced by an unrelated, randomly initialized network.
Q_target = QNetwork()
Q_target.load_state_dict(Q.state_dict())

history = deque(maxlen=5000)  # replay buffer (oldest transitions are dropped first)
discount = 0.99  # discount factor gamma

def update_Q(batch_size=32):
    """Run one gradient step on the critic ``Q`` from replayed transitions.

    Samples up to ``batch_size`` transitions from ``history`` and minimizes
    the mean squared TD error, where the target is ``reward`` for terminal
    transitions and ``reward + discount * Q_target(s', pi(s'))`` otherwise.

    Args:
        batch_size: Maximum number of transitions to sample (default 32).

    Does nothing when the replay buffer is empty.
    """
    if not history:
        # Nothing to learn from yet; also avoids dividing by zero below.
        return

    batch = random.sample(history, min(batch_size, len(history)))

    loss = 0
    for state, action, state_next, reward, done in batch:
        # Targets are treated as constants: no gradient flows into the
        # target critic or the actor from the critic loss.
        with torch.no_grad():
            # float() keeps the arithmetic in torch even if the environment
            # hands back a NumPy scalar reward.
            target = float(reward)
            if not done:
                target = target + discount * Q_target(state_next, pi(state_next))
        loss = loss + (target - Q(state, action)) ** 2

    loss = loss / len(batch)
    Q_optimizer.zero_grad()
    loss.backward()
    Q_optimizer.step()

def update_pi(batch_size=32):
    """Run one gradient step on the actor ``pi`` from replayed states.

    Samples up to ``batch_size`` transitions from ``history`` and minimizes
    the mean of ``-Q(s, pi(s))`` over their states, i.e. pushes the policy
    toward actions the critic scores higher.

    Args:
        batch_size: Maximum number of transitions to sample (default 32).

    Does nothing when the replay buffer is empty.

    NOTE: the actor only receives a gradient if ``Q``'s forward pass lets
    gradients flow through its action input.  The backward pass here also
    accumulates gradients on ``Q``'s parameters; only ``Actor_optimizer``
    steps, and ``update_Q`` zeroes ``Q``'s gradients before its own step.
    """
    if not history:
        # Nothing to learn from yet; also avoids dividing by zero below.
        return

    batch = random.sample(history, min(batch_size, len(history)))

    loss = 0
    # Only the stored state is needed; the rest of the transition is ignored.
    for state, *_ in batch:
        loss = loss - Q(state, pi(state))

    loss = loss / len(batch)
    Actor_optimizer.zero_grad()
    loss.backward()
    Actor_optimizer.step()
    

# gym environment
env = gym.make("Pendulum-v0")
max_time_steps = 1000

# Action bounds reported by the environment; noisy actions are clipped to
# these so that training and evaluation use the same valid range.
action_low = env.action_space.low
action_high = env.action_space.high

# Standard deviation of the zero-mean Gaussian exploration noise (tunable).
exploration_std = 0.2

# Weight given to the online critic in the per-step target update.
# NOTE(review): 0.8 makes the target track Q almost immediately; soft-update
# rates are usually far smaller -- confirm this value is intended.
soft_update_rate = 0.8

# for updating target network
# NOTE: these two values are not read by the loop below (the target is
# soft-updated every step); they are kept in case other code refers to them.
target_interval = 1000
target_counter = 0

# training
for episode in range(20000):
    # sum of accumulated rewards
    rewards = 0

    # get initial observation
    observation = env.reset()
    state = torch.tensor(observation, dtype=torch.float32)

    # loop until an episode ends
    for t in range(1, max_time_steps + 1):
        # Uncomment to display the current environment.
        # env.render()

        # Deterministic policy output; no graph is needed for acting.
        with torch.no_grad():
            action = pi(state).numpy()

        # Explore with zero-mean Gaussian noise.  (np.random.normal(3) would
        # draw from N(3, 1), which after clipping pins the action at the
        # upper bound almost every step.)
        noise = np.random.normal(0.0, exploration_std, size=action.shape)
        action = np.clip(action + noise, action_low, action_high).astype(np.float32)

        observation_next, reward, done, info = env.step(action)
        state_next = torch.tensor(observation_next, dtype=torch.float32)
        action = torch.tensor(action)

        # collect reward
        rewards = rewards + reward

        # collect a transition
        history.append([state, action, state_next, reward, done])

        update_Q()
        update_pi()

        # Soft update: blend the online critic's weights into the target.
        with torch.no_grad():
            for target_param, param in zip(Q_target.parameters(), Q.parameters()):
                target_param.copy_(
                    param * soft_update_rate + target_param * (1.0 - soft_update_rate)
                )

        if done:
            break

        # pass observation to the next step
        observation = observation_next
        state = state_next

    print('episode: {}, reward: {:.1f}'.format(episode, rewards))

env.close()


# TEST
# Roll out the learned policy without exploration noise for ten rendered
# episodes.
# The training environment above has already been closed, so open a new one.
env = gym.make("Pendulum-v0")
episode = 0
state = env.reset()
while episode < 10:  # episode loop
    env.render()
    state = torch.tensor(state, dtype=torch.float32)

    # Evaluate the policy without building a graph and hand the environment
    # a plain NumPy action (a tensor that requires grad cannot be converted).
    with torch.no_grad():
        action = pi(state).numpy()
    action = np.clip(action, env.action_space.low, env.action_space.high)

    next_state, reward, done, info = env.step(action)  # take the policy's action
    state = next_state

    if done:
        episode = episode + 1
        state = env.reset()
env.close()




episode: 0, reward: -1285.0
episode: 1, reward: -949.2
episode: 2, reward: -1402.2


KeyboardInterrupt: 