# DDPG

## Environment - Pendulum
---
* action space : torque [-2 ~ 2] (continuous action)
* state space : cos, sin, angular speed [-1 ~ 1, -1 ~ 1, -8 ~ 8]
* reward : -(theta^2 + .1 * theta_dot^2 + .001 * torque^2) — 즉 cost 에 음의 부호를 붙인 값 (항상 0 이하)
* terminal state 는 따로 없음 (Infinite Horizon). 200 step 을 넘으면 강제로 종료함.
> cost 최소화 하는 문제. cost 식을 보면 결국 최소의 힘으로 막대를 거꾸로 세우는 문제

---

* 기본적으로 actor critic 이므로 네트워크 최소 2 개
* continuous action 이므로 Q 는 action, state 모두 입력받아야 함
* value 가 과대평가되는 상황을 피할 수 없음
* 그래도 버퍼를 사용하는 off policy 알고리즘
    * sample 효율 높음

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import gym
import random
import collections
import numpy as np

In [2]:
# Create the Pendulum-v0 environment; the training loop below resets, steps,
# optionally renders, and finally closes this single instance.
env = gym.make('Pendulum-v0')

In [3]:
# Hyperparameters shared by the cells below.
ALPHA_P = 0.0005   # learning rate passed to the policy (actor) Adam optimizer
ALPHA_V = 0.001    # learning rate passed to the value (critic) Adam optimizer
GAMMA = 0.95       # discount factor applied to the bootstrapped target in train()
EPSILON = 1        # NOTE(review): not referenced anywhere in the visible code
TOTAL_EP = 1000    # bound the episode counter is compared against in the main loop
TAU = 0.05         # interpolation weight used by soft_update() for target networks

In [4]:
class DDPG_mu(nn.Module):
    """Deterministic policy (actor) network: state -> bounded action.

    A three-layer MLP whose output is squashed with ``tanh`` and scaled by
    ``max_action``, so every action component lies in
    ``[-max_action, max_action]``.

    Args:
        state_dim: Number of input state features (default 3).
        action_dim: Number of output action components (default 1).
        max_action: Magnitude bound applied to the tanh output (default 2).
    """

    def __init__(self, state_dim=3, action_dim=1, max_action=2.0):
        super().__init__()
        self.max_action = max_action
        # Attribute names are unchanged so existing state_dicts keep loading.
        self.mu_1 = nn.Linear(state_dim, 128)
        self.mu_2 = nn.Linear(128, 64)
        self.mu_3 = nn.Linear(64, action_dim)

    def mu(self, x):
        """Return the action for state tensor ``x`` (last dim = state_dim)."""
        x = torch.relu(self.mu_1(x))
        x = torch.relu(self.mu_2(x))
        # tanh bounds the output to [-1, 1]; scaling maps it to the action range.
        return torch.tanh(self.mu_3(x)) * self.max_action

    def forward(self, x):
        """Alias of :meth:`mu` so the module can be called as ``net(x)``."""
        return self.mu(x)

class DDPG_value(nn.Module):
    """Action-value (critic) network: (state, action) -> scalar Q estimate.

    State and action are embedded by separate linear layers, concatenated
    along the feature axis, and reduced to one value per row.

    Args:
        state_dim: Number of input state features (default 3).
        action_dim: Number of input action components (default 1).
    """

    def __init__(self, state_dim=3, action_dim=1):
        super().__init__()
        # Attribute names are unchanged so existing state_dicts keep loading.
        self.v_s = nn.Linear(state_dim, 64)
        self.v_a = nn.Linear(action_dim, 64)
        self.q = nn.Linear(128, 32)
        self.v_out = nn.Linear(32, 1)

    def value(self, x, a):
        """Return Q(x, a).

        Both inputs must be 2-D (rows are samples) because the embeddings
        are concatenated along ``dim=1``.
        """
        state_feat = torch.relu(self.v_s(x))
        action_feat = torch.relu(self.v_a(a))
        joint = torch.cat([state_feat, action_feat], dim=1)
        hidden = torch.relu(self.q(joint))
        # No activation on the output: Q values are unbounded.
        return self.v_out(hidden)

    def forward(self, x, a):
        """Alias of :meth:`value` so the module can be called as ``net(x, a)``."""
        return self.value(x, a)

In [5]:
class OrnsteinUhlenbeckNoise:
    """Stateful noise source following a discretised Ornstein-Uhlenbeck update.

    Each call moves the internal value a small step towards ``mu`` and adds a
    scaled Gaussian sample, so consecutive outputs depend on each other.
    """

    def __init__(self, mu):
        """Store the mean ``mu`` (a numpy array) and start the process at zero."""
        self.theta = 0.1
        self.dt = 0.01
        self.sigma = 0.1
        self.mu = mu
        self.x_prev = np.zeros_like(self.mu)

    def __call__(self):
        """Advance the process one step and return the new value.

        This method runs when the instance is invoked like a function.
        """
        # Pull towards the mean, proportional to the current offset.
        drift = self.theta * (self.mu - self.x_prev) * self.dt
        # Fresh Gaussian sample with the same shape as ``mu``.
        diffusion = self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape)
        self.x_prev = self.x_prev + drift + diffusion
        return self.x_prev

In [6]:
def train(mu, mu_targ, v, v_targ, memory, opt_mu, opt_v, batch_size=32, gamma=None):
    """Run one DDPG update of the critic and then the actor on a random batch.

    Args:
        mu: Actor being trained; must expose ``mu(state)``.
        mu_targ: Target actor used only to build the bootstrap target.
        v: Critic being trained; must expose ``value(state, action)``.
        v_targ: Target critic used only to build the bootstrap target.
        memory: Sequence of ``(state, action, reward, next_state, done)``
            transitions with at least ``batch_size`` entries.
        opt_mu: Optimizer holding the actor's parameters.
        opt_v: Optimizer holding the critic's parameters.
        batch_size: Number of transitions sampled per update (default 32).
        gamma: Discount factor; defaults to the module-level ``GAMMA``.

    Raises:
        ValueError: From ``random.sample`` when ``memory`` holds fewer than
            ``batch_size`` transitions.
    """
    if gamma is None:
        gamma = GAMMA

    batch = random.sample(memory, batch_size)
    # The stored done flag is intentionally not used to mask the bootstrap
    # term: the task has no true terminal state, episodes are only cut off
    # by the step limit.
    states, actions, rewards, next_states, _ = zip(*batch)

    # Build every tensor as float32 explicitly; actions arrive as numpy
    # float64 scalars and would otherwise become a double tensor that does
    # not match the networks' float32 weights.
    s = torch.as_tensor(np.asarray(states, dtype=np.float32))
    a = torch.as_tensor(np.asarray(actions, dtype=np.float32)).reshape(batch_size, -1)
    r = torch.as_tensor(np.asarray(rewards, dtype=np.float32)).reshape(batch_size, -1)
    s2 = torch.as_tensor(np.asarray(next_states, dtype=np.float32))

    # Critic update: regress Q(s, a) towards the one-step bootstrap target.
    # The target comes from the frozen target networks, so no graph is needed.
    with torch.no_grad():
        targ = r + gamma * v_targ.value(s2, mu_targ.mu(s2))
    loss_v = F.smooth_l1_loss(v.value(s, a), targ)
    opt_v.zero_grad()
    loss_v.backward()
    opt_v.step()

    # Actor update: ascend the critic's estimate of Q(s, mu(s)).
    # Gradients this leaves on the critic are cleared by the next
    # opt_v.zero_grad() call.
    loss_p = -v.value(s, mu.mu(s)).mean()
    opt_mu.zero_grad()
    loss_p.backward()
    opt_mu.step()


In [7]:
def soft_update(net, net_target, tau=None):
    """Move ``net_target``'s parameters a fraction ``tau`` towards ``net``'s.

    Each target parameter becomes ``(1 - tau) * target + tau * source``,
    updated in place. ``net`` is left unchanged.

    Args:
        net: Source network whose parameters are being tracked.
        net_target: Target network updated in place.
        tau: Interpolation weight in [0, 1]; defaults to the module-level
            ``TAU``.
    """
    if tau is None:
        tau = TAU
    # no_grad keeps the in-place update out of autograd without touching .data.
    with torch.no_grad():
        for param_target, param in zip(net_target.parameters(), net.parameters()):
            param_target.copy_(param_target * (1.0 - tau) + param * tau)

In [8]:
# Create the learning networks and their target networks.
pi, pi_t = DDPG_mu(), DDPG_mu()
v, v_t = DDPG_value(), DDPG_value()
# One-dimensional noise process; its sample is added to the policy output below.
noise = OrnsteinUhlenbeckNoise(np.zeros(1))

# Start each target network as an exact copy of its learning network.
pi_t.load_state_dict(pi.state_dict())
v_t.load_state_dict(v.state_dict())

# Replay buffer: once 50000 transitions are stored, the oldest are dropped.
buffer = collections.deque(maxlen=50000)
# Episode counter; it starts at 1 and the loop runs while ep < TOTAL_EP.
ep = 1
opt_p = optim.Adam(pi.parameters(), ALPHA_P)
opt_v = optim.Adam(v.parameters(), ALPHA_V)

# Rendering stays off until the score check below flips it; it is never reset.
render = False
# Sum of per-step env returns, cleared each time it is printed.
score = 0

while(ep < TOTAL_EP):
    done = False
    state = env.reset()
    # NOTE(review): step is incremented below but never read in this cell.
    step = 1
    
    while(not done):
        # Deterministic policy output plus exploration noise.
        # NOTE(review): the noisy action is not clipped here — presumably the
        # environment clamps the torque itself; confirm.
        action = pi.mu(torch.from_numpy(state).float()).item() + noise()[0]
        if(render):
            env.render()
        # The variable is named cost, but it holds whatever env.step returns
        # as its second element.
        state_next, cost, done, _ = env.step([action])
        step += 1
        score += cost
        # The stored value is scaled by 1/100; the raw value feeds score.
        buffer.append((state, action, cost/100.0, state_next, done))
        state = state_next
        if(done):
            ep += 1
            # score holds everything accumulated since the last reset, so this
            # compares a sum over one or more episodes, not a per-episode value.
            if(score > -500):
                render = True
            # Every 10th episode: print the accumulated score divided by 10,
            # then start a new accumulation window.
            if(ep % 10 == 0):
                print(ep, int(score/10))
                score = 0
            # Learning happens only at episode end, once more than 2000
            # transitions are stored: 10 gradient updates, each followed by a
            # soft update of both target networks.
            if(len(buffer) > 2000):
                for i in range(10):
                    train(pi, pi_t, v, v_t, buffer, opt_p, opt_v)
                    soft_update(pi, pi_t)
                    soft_update(v, v_t)
env.close()

-0.28112750345607157
-0.3321373460133576
-0.31588476632169477
-0.30057169281288987
-0.26106000991335737
-0.23601168646414034
-0.22654434078635913
-0.23664118809103166
-0.21914835551066522
-0.1761006426915913
-0.16040274568777174
-0.14539367274462917
-0.1459490855718849
-0.14436023161496186
-0.18771153710546837
-0.22124085886820244
-0.2223344384995398
-0.22846069421814652
-0.22396543007817213
-0.1920231951544154
-0.1640181691622588
-0.14440081212349853
-0.13735712793139443
-0.1430025309326089
-0.1767117147923253
-0.14924600621923595
-0.06622040173569806
0.06843686279521294
0.15247903039088712
0.21619248276814013
0.3081447622144808
0.40615347444112565
0.4613345247676466
0.5244791446792333
0.5770267858546471
0.6295655807760551
0.6281582200209883
0.5792430702765446
0.5093734962511056
0.4281606752931988
0.35588638057794275
0.2565698621562738
0.1454587587430493
0.03581501095796387
-0.05333341993560534
-0.09641645562872286
-0.12594542404027392
-0.21715723353552055
-0.341697616421493
-0.312305

KeyboardInterrupt: 