In [1]:
import gym
from utils import *

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import math

class NoiseLinear(nn.Linear):
    """Linear layer whose parameters are perturbed by learnable Gaussian noise.

    On every forward pass fresh standard-normal noise is drawn into the
    ``eps_weight`` / ``eps_bias`` buffers, and the layer computes
    ``F.linear(x, weight + sigma_weight * eps_weight, bias + sigma_bias * eps_bias)``.
    The ``sigma_*`` tensors are trainable parameters; the ``eps_*`` tensors are
    buffers (saved in the state dict, not optimised).

    Args:
        in_: number of input features.
        out_: number of output features.
        val: initial value of every entry of ``sigma_weight`` / ``sigma_bias``.
        bias: if False the layer has no bias and no bias noise.
    """

    def __init__(self, in_, out_, val = 0.017, bias = True):
        # nn.Linear.__init__ already calls the overridden reset_parameters() once.
        super(NoiseLinear, self).__init__(in_,out_,bias)
        self.sigma_weight = nn.Parameter(torch.full((out_, in_), val))
        self.register_buffer("eps_weight", torch.zeros(out_, in_))
        if bias:
            self.sigma_bias = nn.Parameter(torch.full((out_,), val))
            self.register_buffer("eps_bias", torch.zeros(out_))
        # Kept from the original: re-draws weight/bias a second time, which also
        # keeps seeded initialisation identical to previous runs.
        self.reset_parameters()

    def reset_parameters(self):
        """Initialise weight (and bias, if present) uniformly in +-sqrt(1 / in_features)."""
        std = math.sqrt(1 / self.in_features)
        self.weight.data.uniform_(-std, std)
        # With bias=False, nn.Linear registers ``bias`` as None; touching
        # ``.data`` on it used to make construction raise AttributeError.
        if self.bias is not None:
            self.bias.data.uniform_(-std, std)

    def forward(self, x):
        """Apply the layer with freshly sampled noise on the weight and bias."""
        # In-place resampling: the buffers always hold the most recent noise draw.
        self.eps_weight.normal_()
        bias = self.bias
        if bias is not None:
            self.eps_bias.normal_()
            bias = bias + self.sigma_bias * self.eps_bias.data
        return F.linear(x, self.weight + self.sigma_weight * self.eps_weight, bias)

class Actor(nn.Module):
    """Policy network mapping an observation to an action vector in [-1, 1].

    Three ``NoiseLinear`` layers (widths ``hidden``, ``int(hidden / 2)``,
    ``out_``) with ReLU between them and a final Tanh.

    Args:
        in_: size of the observation vector.
        out_: size of the action vector.
        hidden: width of the first hidden layer; the second is half of it.
    """

    def __init__(self, in_, out_, hidden=512):
        super().__init__()
        half = int(hidden / 2)
        layers = [
            NoiseLinear(in_, hidden),
            nn.ReLU(),
            NoiseLinear(hidden, half),
            nn.ReLU(),
            NoiseLinear(half, out_),
            nn.Tanh(),
        ]
        self.net = nn.Sequential(*layers)

    def get_action(self, act_v):
        """Return ``act_v`` unchanged, or with probability 0.05 a random action.

        The random action is drawn from a zero-mean normal whose scale is the
        largest absolute component of ``act_v``, with one sample per component.

        NOTE(review): ``random`` and ``np`` are not imported explicitly in this
        notebook; presumably they arrive via ``from utils import *`` — confirm.
        """
        explore = random.random() < 0.05
        if not explore:
            return act_v
        spread = np.abs(act_v).max()
        return np.random.normal(scale=spread, size=len(act_v))

    def forward(self, x):
        """Run the observation batch ``x`` through the network."""
        return self.net(x)
        
class Critic(nn.Module):
    """Q-network: scores an (observation, action) pair with a single value.

    The observation first goes through a Linear+ReLU encoder (``value``); its
    output is concatenated with the action along dim 1 and fed to a two-layer
    head (``value_out``) that produces one scalar per row.

    Args:
        in_: size of the observation vector.
        out_: size of the action vector.
        hidden: width of the observation encoder; the head's hidden layer is
            ``int(hidden / 2)`` wide.
    """

    def __init__(self, in_, out_, hidden=512):
        super().__init__()
        half = int(hidden / 2)

        # Observation encoder.
        encoder_layers = [nn.Linear(in_, hidden), nn.ReLU()]
        self.value = nn.Sequential(*encoder_layers)

        # Head over [encoded observation, action].
        head_layers = [
            nn.Linear(hidden + out_, half),
            nn.ReLU(),
            nn.Linear(half, 1),
        ]
        self.value_out = nn.Sequential(*head_layers)

    def forward(self, obs, act):
        """Return the value estimate for each (obs, act) row; output has one column."""
        obs_features = self.value(obs)
        joined = torch.cat((obs_features, act), dim=1)
        return self.value_out(joined)

In [3]:
# --- Environment -----------------------------------------------------------
env = gym.make("Pendulum-v0")
act_n = env.action_space.shape[0]
obs_n = env.observation_space.shape[0]

# --- Optimisation hyper-parameters ----------------------------------------
ACTOR_LR = 0.00002
CRITIC_LR = 0.0001
GAMMA = 0.99   # discount factor
TAU = 0.05     # soft-update rate for the target networks

# Use the GPU when one is available, otherwise fall back to the CPU instead of
# crashing inside ``.cuda()``.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- Networks: each online network gets a target copy with the same weights -
actor = Actor(obs_n, act_n).to(DEVICE)
actor_tgt = Actor(obs_n, act_n).to(DEVICE)
actor_tgt.load_state_dict(actor.state_dict())
actor_opt = optim.Adam(actor.parameters(), ACTOR_LR)

critic = Critic(obs_n, act_n).to(DEVICE)
critic_tgt = Critic(obs_n, act_n).to(DEVICE)
critic_tgt.load_state_dict(critic.state_dict())
critic_opt = optim.Adam(critic.parameters(), CRITIC_LR)

# --- Replay buffer sizes ---------------------------------------------------
ST_SIZE = 100000   # replay capacity
ST_INIT = 10000    # transitions collected before training starts
BATCH = 512

# NoiseMaker / Agent / Replay come from ``utils`` (star import); their
# argument semantics are not visible in this notebook.
noise = NoiseMaker(act_n, "ou", decay = True)
noise.param["decay"] = ST_SIZE

# NOTE(review): the meaning of the positional arguments 50 and 1 is defined by
# utils.Agent — confirm and consider naming them.
agent = Agent(env, actor, noise, 50, 1)
agent.set_n_step(2, GAMMA)

storage = Replay(ST_SIZE, prio=True)

In [10]:
EPOCH = 5000

# Build every batch on the device the networks already live on, so the loop
# does not depend on a GPU being present.
device = next(critic.parameters()).device

for epoch in range(EPOCH):
    # Keeps the step count printed below well-defined (0) when an episode
    # yields no transitions, instead of raising NameError / reusing a stale i.
    i = -1
    for i, step in enumerate(agent.episode(epoch)):
        storage.push(step)
        # Only collect experience until the replay buffer is warmed up.
        if len(storage) < ST_INIT:
            continue

        sample, indices, weights = storage.sample(BATCH)
        weights_ = torch.FloatTensor(weights).unsqueeze(1).to(device)
        obs, act_v, act, next_obs, rew, done, etc, unroll_n = list(zip(*sample))

        # The former ``act_`` LongTensor conversion was never used and has
        # been dropped.
        # NOTE(review): the critic is trained on ``act_v`` rather than ``act``;
        # confirm against utils.Agent which of the two was actually executed
        # in the environment.
        obs_ = torch.FloatTensor(obs).to(device)
        act_v_ = torch.FloatTensor(act_v).to(device)
        next_obs_ = torch.FloatTensor(next_obs).to(device)
        rew_ = torch.FloatTensor(rew).unsqueeze(1).to(device)
        done_ = torch.BoolTensor(done).to(device)
        unroll_n_ = torch.FloatTensor(unroll_n).unsqueeze(1).to(device)

        # ---- Critic update ----
        critic_opt.zero_grad()

        q_pred = critic(obs_, act_v_)

        # The bootstrap target is a constant w.r.t. the optimised parameters:
        # compute it without building an autograd graph, which also makes the
        # in-place masking of terminal states safe.
        with torch.no_grad():
            next_action_v = actor_tgt(next_obs_)
            q_next = critic_tgt(next_obs_, next_action_v)
            q_next[done_] = 0
            # n-step return: discount by GAMMA ** (number of unrolled steps).
            q_target = rew_ + (GAMMA**unroll_n_) * q_next

        # Per-sample squared TD error scaled by the replay sampling weights.
        critic_loss = weights_ * (q_pred - q_target) ** 2
        critic_loss_mean = critic_loss.mean()
        critic_loss_mean.backward()

        critic_opt.step()

        # ---- Actor update: maximise the critic's value of the actor's action ----
        actor_opt.zero_grad()

        actor_loss = -weights_ * critic(obs_, actor(obs_))
        actor_loss_mean = actor_loss.mean()
        actor_loss_mean.backward()

        actor_opt.step()

        # ---- Soft (Polyak) update of the target networks ----
        with torch.no_grad():
            for off, tgt in zip(actor.parameters(), actor_tgt.parameters()):
                tgt.copy_(off * TAU + tgt * (1 - TAU))
            for off, tgt in zip(critic.parameters(), critic_tgt.parameters()):
                tgt.copy_(off * TAU + tgt * (1 - TAU))

        # New priorities are the (weighted) per-sample critic losses.
        storage.update_priorities(indices, critic_loss.sum(1).detach().cpu().numpy())

    print(epoch, i+1)

0 200
1 200


KeyboardInterrupt: 

In [12]:
# Inspect the noise generator's ``count`` attribute (defined by utils.NoiseMaker,
# not shown here; presumably the number of noise samples drawn so far — confirm).
noise.count

48537