In [19]:
import numpy as np
import torch


class ReplayBuffer(object):
    """Fixed-capacity circular store of (state, action, next_state, reward, not_done) rows.

    Rows live in pre-allocated NumPy arrays. Once ``max_size`` rows have been
    written, new rows overwrite the oldest ones.
    """

    def __init__(self, state_dim, action_dim, max_size=int(1e6), device=None):
        """Allocate the storage arrays.

        Args:
            state_dim: Length of one state vector.
            action_dim: Length of one action vector.
            max_size: Maximum number of transitions kept at once.
            device: Torch device sampled batches are placed on. ``None`` (the
                default) keeps the previous behaviour: CUDA when available,
                otherwise CPU.
        """
        self.max_size = max_size
        self.ptr = 0   # row the next push() writes to
        self.size = 0  # number of valid rows, capped at max_size
        # sample() always hands out float32 tensors, so float64 storage only
        # doubled the memory footprint without adding usable precision.
        self.state = np.zeros((max_size, state_dim), dtype=np.float32)
        self.action = np.zeros((max_size, action_dim), dtype=np.float32)
        self.next_state = np.zeros((max_size, state_dim), dtype=np.float32)
        self.reward = np.zeros((max_size, 1), dtype=np.float32)
        self.not_done = np.zeros((max_size, 1), dtype=np.float32)
        if device is None:
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.device = device

    def push(self, state, action, next_state, reward, done):
        """Write one transition at the current pointer and advance it (wrapping around)."""
        self.state[self.ptr] = state
        self.action[self.ptr] = action
        self.next_state[self.ptr] = next_state
        self.reward[self.ptr] = reward
        # Stored as (1 - done) so it can be used directly as a multiplicative mask.
        self.not_done[self.ptr] = 1. - done
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample(self, batch_size):
        """Draw ``batch_size`` rows uniformly at random, with replacement.

        Returns:
            Tuple ``(state, action, next_state, reward, not_done)`` of float32
            tensors on ``self.device``; each has ``batch_size`` rows.

        Raises:
            ValueError: If nothing has been pushed yet.
        """
        if self.size == 0:
            raise ValueError("Cannot sample from an empty ReplayBuffer")
        ind = np.random.randint(0, self.size, size=batch_size)
        columns = (self.state, self.action, self.next_state, self.reward, self.not_done)
        return tuple(
            torch.as_tensor(column[ind], dtype=torch.float32, device=self.device)
            for column in columns
        )

In [20]:
# Smoke check that the buffer can be constructed. A tiny capacity is used so this
# throwaway instance does not allocate the default million-row arrays.
ReplayBuffer(3, 4, max_size=16)

<__main__.ReplayBuffer at 0x1ca7b030fa0>

In [21]:
import copy
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

In [22]:
class Actor(nn.Module):
    """Policy network mapping a batch of states to actions scaled by ``max_action``."""

    def __init__(self, input_dim, output_dim, max_action):
        """Build two 256-unit hidden layers and the action output layer.

        Args:
            input_dim: Number of input features per state.
            output_dim: Number of action components produced.
            max_action: Scale applied to the tanh-squashed output.
        """
        super().__init__()
        self.max_action = max_action

        self.l1 = nn.Linear(input_dim, 256)
        self.l2 = nn.Linear(256, 256)
        self.l3 = nn.Linear(256, output_dim)

    def forward(self, state):
        """Return ``max_action * tanh(l3(relu(l2(relu(l1(state))))))``."""
        hidden = torch.relu(self.l1(state))
        hidden = torch.relu(self.l2(hidden))
        squashed = torch.tanh(self.l3(hidden))
        return self.max_action * squashed

In [23]:
class Critic(nn.Module):
    """Pair of independent Q-networks that score concatenated (state, action) inputs."""

    def __init__(self, input_dim, output_dim):
        """Create both Q heads.

        Args:
            input_dim: Number of state features.
            output_dim: Number of action components.
        """
        super().__init__()
        joint_dim = input_dim + output_dim

        # First Q-network: layers l1 -> l2 -> l3
        self.l1 = nn.Linear(joint_dim, 256)
        self.l2 = nn.Linear(256, 256)
        self.l3 = nn.Linear(256, 1)

        # Second Q-network: layers l4 -> l5 -> l6
        self.l4 = nn.Linear(joint_dim, 256)
        self.l5 = nn.Linear(256, 256)
        self.l6 = nn.Linear(256, 1)

    @staticmethod
    def _run_head(features, first, second, last):
        """Apply ``first`` and ``second`` with ReLU after each, then ``last`` with no activation."""
        hidden = torch.relu(first(features))
        hidden = torch.relu(second(hidden))
        return last(hidden)

    def forward(self, state, action):
        """Return the ``(q1, q2)`` estimates for the given state/action batch."""
        joint = torch.cat((state, action), dim=1)
        first_q = self._run_head(joint, self.l1, self.l2, self.l3)
        second_q = self._run_head(joint, self.l4, self.l5, self.l6)
        return first_q, second_q

    def Q1(self, state, action):
        """Return only the first Q-network's estimate."""
        joint = torch.cat((state, action), dim=1)
        return self._run_head(joint, self.l1, self.l2, self.l3)
    

In [24]:
class TD3(object):
    """TD3 agent: an actor, a twin critic, target copies of both, and a replay buffer."""

    def __init__(self, input_dim, output_dim, max_action, cfg):
        """Build networks, optimizers and the replay buffer.

        Args:
            input_dim: Number of state features.
            output_dim: Number of action components.
            max_action: Bound used to scale actor output and clamp target actions.
            cfg: Object exposing ``gamma``, ``lr``, ``policy_noise``, ``noise_clip``,
                ``policy_freq``, ``batch_size`` and ``device``.
        """
        self.max_action = max_action
        self.gamma = cfg.gamma
        # NOTE: despite the name, `lr` is the soft-update coefficient for the target
        # networks (see _soft_update). Both optimizers use a fixed step size of 3e-4.
        self.lr = cfg.lr
        self.policy_noise = cfg.policy_noise
        self.noise_clip = cfg.noise_clip
        self.policy_freq = cfg.policy_freq
        self.batch_size =  cfg.batch_size 
        self.device = cfg.device
        self.total_it = 0  # number of update() calls so far
        
        self.actor = Actor(input_dim, output_dim, max_action).to(self.device)
        self.actor_target = copy.deepcopy(self.actor)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=3e-4)

        self.critic = Critic(input_dim, output_dim).to(self.device)
        self.critic_target = copy.deepcopy(self.critic)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=3e-4)

        self.memory = ReplayBuffer(input_dim, output_dim)
        # The buffer chooses its own default device. Pin it to the agent's device so
        # sampled batches and the networks always live in the same place (otherwise
        # cfg.device="cpu" on a CUDA machine fails with a device mismatch in update()).
        self.memory.device = self.device
        
    def choose_action(self, state):
        """Return the actor's action for one state as a flat NumPy array.

        Args:
            state: NumPy array holding a single state; it is reshaped to one row.
        """
        state = torch.FloatTensor(state.reshape(1, -1)).to(self.device)
        # Inference only: skip building an autograd graph.
        with torch.no_grad():
            return self.actor(state).cpu().numpy().flatten()

    def _soft_update(self, net, target_net):
        """Blend ``net``'s parameters into ``target_net`` with weight ``self.lr``."""
        with torch.no_grad():
            for param, target_param in zip(net.parameters(), target_net.parameters()):
                target_param.copy_(self.lr * param + (1 - self.lr) * target_param)
    
    def update(self):
        """Run one training step on a batch sampled from ``self.memory``.

        The critic is optimized on every call. The actor and both target networks
        are updated only on calls where ``total_it`` is a multiple of ``policy_freq``.
        """
        self.total_it += 1

        # Sample replay buffer 
        state, action, next_state, reward, not_done = self.memory.sample(self.batch_size)

        with torch.no_grad():
            # Select action according to the target policy and add clipped noise
            noise = (
                torch.randn_like(action) * self.policy_noise
            ).clamp(-self.noise_clip, self.noise_clip)

            next_action = (
                self.actor_target(next_state) + noise
            ).clamp(-self.max_action, self.max_action)

            # Compute the target Q value from the smaller of the two target estimates
            target_Q1, target_Q2 = self.critic_target(next_state, next_action)
            target_Q = torch.min(target_Q1, target_Q2)
            target_Q = reward + not_done * self.gamma * target_Q

        # Get current Q estimates
        current_Q1, current_Q2 = self.critic(state, action)

        # Compute critic loss
        critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

        # Optimize the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Delayed policy updates
        if self.total_it % self.policy_freq == 0:

            # Compute actor loss
            actor_loss = -self.critic.Q1(state, self.actor(state)).mean()
            
            # Optimize the actor 
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # Update the frozen target models
            self._soft_update(self.critic, self.critic_target)
            self._soft_update(self.actor, self.actor_target)



In [29]:
import sys,os
# A notebook has no real __file__, so the literal string '__file__' is resolved
# against the working directory; its dirname is the directory the kernel runs in.
curr_path = os.path.dirname(os.path.realpath('__file__'))
parent_path = os.path.dirname(curr_path)
# Put the PARENT directory on sys.path (presumably where the `common` package lives —
# verify against the repo layout). Guarded so re-running the cell adds no duplicates.
if parent_path not in sys.path:
    sys.path.append(parent_path)

In [30]:
import torch
import gym
import numpy as np
import datetime

from common.utils import save_results,make_dir,plot_rewards

# Timestamp of this run (YYYYMMDD-HHMMSS), used to build per-run output paths.
curr_time = f"{datetime.datetime.now():%Y%m%d-%H%M%S}"

  if LooseVersion(mpl.__version__) >= "3.0":
  other = LooseVersion(other)


In [63]:
class TD3Config:
    """Hyper-parameters and output locations for a single TD3 run."""

    def __init__(self) -> None:
        self.algo_name = 'TD3'
        self.env_name = 'Pendulum-v1'  # alternative: 'HalfCheetah-v2'
        self.seed = 0

        # Everything produced by this run goes under one timestamped directory.
        run_dir = f"{curr_path}/results/{self.env_name}/{curr_time}"
        self.result_path = f"{run_dir}/results/"  # where results are saved
        self.model_path = f"{run_dir}/models/"  # where models are saved

        # Schedule, counted in environment time steps
        self.start_timestep = 25000.0  # steps taken with random actions before the policy is used
        self.eval_freq = 5000.0  # steps between evaluations
        self.max_timestep = 4_000_000  # total steps to run the environment

        # Learning hyper-parameters
        self.expl_noise = 0.1  # std of Gaussian exploration noise
        self.batch_size = 256  # batch size for both actor and critic
        self.gamma = 0.99  # discount factor
        self.lr = 0.0005  # target network update rate
        self.policy_noise = 0.2  # noise added to the target policy during the critic update
        self.noise_clip = 0.5  # range the target policy noise is clipped to
        self.policy_freq = 2  # critic updates per delayed policy update

        use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if use_cuda else "cpu")
        self.save = False

In [64]:
def eval(env,agent, seed, eval_episodes=10):
    """Run the agent without exploration noise on a fresh environment and report the mean return.

    NOTE: this name shadows Python's built-in ``eval``; it is kept because
    other cells call it by this name.

    Args:
        env: Environment id passed to ``gym.make``.
        agent: Object exposing ``choose_action(state)``.
        seed: Base seed; the evaluation environment is seeded with ``seed + 100``.
        eval_episodes: Number of episodes to average over.

    Returns:
        Sum of all rewards collected divided by ``eval_episodes``.
    """
    eval_env = gym.make(env)
    total_reward = 0.
    try:
        eval_env.seed(seed + 100)
        for _ in range(eval_episodes):
            state, done = eval_env.reset(), False
            while not done:
                action = agent.choose_action(np.array(state))
                state, reward, done, _ = eval_env.step(action)
                total_reward += reward
    finally:
        # A new environment is created on every call; release it even if an
        # episode raises, so repeated evaluations do not leak environments.
        eval_env.close()
    avg_reward = total_reward / eval_episodes
    print("---------------------------------------")
    print(f"Evaluation over {eval_episodes} episodes: {avg_reward:.3f}")
    print("---------------------------------------")
    return avg_reward

In [74]:
def train(cfg,env,agent):
    """Interact with ``env`` for ``cfg.max_timestep`` steps, training ``agent`` along the way.

    Actions are sampled uniformly from the action space for the first
    ``cfg.start_timestep`` steps; afterwards the agent's action plus Gaussian
    noise is used and ``agent.update()`` is called once per step.

    Args:
        cfg: Config exposing ``env_name``, ``seed``, ``max_timestep``,
            ``start_timestep``, ``expl_noise`` and ``eval_freq``.
        env: Environment using the 4-tuple ``step`` API and exposing
            ``_max_episode_steps``.
        agent: Agent exposing ``choose_action``, ``update`` and ``memory.push``.

    Returns:
        ``(rewards, ma_rewards)``: per-episode returns and their exponential
        moving average (weight 0.1 on the newest episode).
    """
    # Read the action-space facts from the environment itself rather than from
    # notebook globals, so the function works regardless of cell execution order.
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # Evaluate untrained policy. Evaluation results are collected here but are not
    # part of the return value (the two-element return is kept for existing callers).
    evaluations = [eval(cfg.env_name,agent, cfg.seed)]
    state, done = env.reset(), False
    ep_reward = 0
    ep_timesteps = 0
    episode_num = 0
    rewards = []
    ma_rewards = [] # moving-average reward
    for t in range(int(cfg.max_timestep)):
        ep_timesteps += 1
        # Select action randomly or according to policy
        if t < cfg.start_timestep:
            action = env.action_space.sample()
        else:
            action = (
                agent.choose_action(np.array(state))
                + np.random.normal(0, max_action * cfg.expl_noise, size=action_dim)
            ).clip(-max_action, max_action)
        # Perform action
        next_state, reward, done, _ = env.step(action) 
        # An episode that ends only because the step limit was reached is stored as
        # not-done, so the critic target still bootstraps from next_state.
        done_bool = float(done) if ep_timesteps < env._max_episode_steps else 0
        # Store data in replay buffer
        agent.memory.push(state, action, next_state, reward, done_bool)
        state = next_state
        ep_reward += reward
        # Train agent after collecting sufficient data
        if t >= cfg.start_timestep:
            agent.update()
        if done: 
            # +1 to account for 0 indexing. +0 on ep_timesteps since it will increment +1 even if done=True
            print(f"Episode:{episode_num+1}, Episode T:{ep_timesteps}, Reward:{ep_reward:.3f}")
            # Reset environment
            state, done = env.reset(), False
            rewards.append(ep_reward)
            # Update the moving-average reward
            if ma_rewards:
                ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
            else:
                ma_rewards.append(ep_reward) 
            ep_reward = 0
            ep_timesteps = 0
            episode_num += 1 
        # Evaluate episode
        if (t + 1) % cfg.eval_freq == 0:
            evaluations.append(eval(cfg.env_name,agent, cfg.seed))
    return rewards, ma_rewards

In [75]:
cfg = TD3Config()
env = gym.make(cfg.env_name)

# Seed every source of randomness used during training. The environment and its
# action space have their own generators, so seeding only torch and NumPy leaves
# the initial states and the random warm-up actions unreproducible.
env.seed(cfg.seed)
env.action_space.seed(cfg.seed)
torch.manual_seed(cfg.seed)
np.random.seed(cfg.seed)

state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])
agent = TD3(state_dim, action_dim, max_action, cfg)
rewards, ma_rewards = train(cfg, env, agent)

---------------------------------------
Evaluation over 10 episodes: -1570.625
---------------------------------------
Episode:1, Episode T:200, Reward:-1748.539
Episode:2, Episode T:200, Reward:-1343.161
Episode:3, Episode T:200, Reward:-1698.254
Episode:4, Episode T:200, Reward:-1407.550
Episode:5, Episode T:200, Reward:-870.398
Episode:6, Episode T:200, Reward:-1362.595
Episode:7, Episode T:200, Reward:-1213.569
Episode:8, Episode T:200, Reward:-862.322
Episode:9, Episode T:200, Reward:-1197.198
Episode:10, Episode T:200, Reward:-1514.470
Episode:11, Episode T:200, Reward:-1357.127
Episode:12, Episode T:200, Reward:-1252.634
Episode:13, Episode T:200, Reward:-1535.197
Episode:14, Episode T:200, Reward:-1193.046
Episode:15, Episode T:200, Reward:-969.986
Episode:16, Episode T:200, Reward:-888.127
Episode:17, Episode T:200, Reward:-1561.163
Episode:18, Episode T:200, Reward:-942.650
Episode:19, Episode T:200, Reward:-896.034
Episode:20, Episode T:200, Reward:-1716.117
Episode:21, Epis

Episode:168, Episode T:200, Reward:-1422.101
Episode:169, Episode T:200, Reward:-4.938
Episode:170, Episode T:200, Reward:-1353.578
Episode:171, Episode T:200, Reward:-1366.534
Episode:172, Episode T:200, Reward:-1436.047
Episode:173, Episode T:200, Reward:-1491.715
Episode:174, Episode T:200, Reward:-1416.136
Episode:175, Episode T:200, Reward:-1524.792
---------------------------------------
Evaluation over 10 episodes: -1453.684
---------------------------------------
Episode:176, Episode T:200, Reward:-1471.792
Episode:177, Episode T:200, Reward:-1460.349
Episode:178, Episode T:200, Reward:-1302.874
Episode:179, Episode T:200, Reward:-1521.424
Episode:180, Episode T:200, Reward:-1526.669
Episode:181, Episode T:200, Reward:-1376.724
Episode:182, Episode T:200, Reward:-1321.212
Episode:183, Episode T:200, Reward:-2.306
Episode:184, Episode T:200, Reward:-1346.477
Episode:185, Episode T:200, Reward:-1.928
Episode:186, Episode T:200, Reward:-1504.059
Episode:187, Episode T:200, Reward:

KeyboardInterrupt: 

In [71]:
# Display the environment's observation space (bounds, shape and dtype appear in the cell output).
env.observation_space

Box([-1. -1. -8.], [1. 1. 8.], (3,), float32)

In [70]:
# Display the environment's action space (bounds, shape and dtype appear in the cell output).
env.action_space

Box(-2.0, 2.0, (1,), float32)