# PPO - continuous action space
> Continuous action spaces use normal/gaussian distribution. In this case the model output is the mean+std which define the normal distribution to use for the action selection. The action is sampled from this distribution. The probability is in this case the probabilty of the actions value under the given normal distribution (I don't know the math for that but you can look it up).
---
* 연속액션 환경에서는 정규분포로 action 을 선택

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
import gym
import random
import collections

In [2]:
env = gym.make('Pendulum-v0')

In [41]:
# hyper parameters
ALPHA = .0005
EPSILON = 1
T = 10 # T step 만큼 데이터 쌓고 학습할 것
LAMBDA = .95
K = 3
GAMMA = .99
e = .3

In [42]:
class PPO(nn.Module):
    def __init__(self):
        super(PPO, self).__init__()
        self.fc1 = nn.Linear(3, 128)
        self.fc_pi = nn.Linear(128, 32)
        self.fc_pi2 = nn.Linear(32, 1)
        self.fc_v = nn.Linear(128, 32)
        self.fc_v_2 = nn.Linear(32, 1)
        self.optimizer = optim.Adam(self.parameters(), ALPHA)
    
    # 출력이 Normal 분포 -> sampling 을 해서 실수값 뽑아서 사용해야함!
    def pi(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc_pi(x))
        prob = torch.tanh(self.fc_pi2(x))
        prob = torch.distributions.normal.Normal(prob, .1) # 평균 : prob, 분산 : .01
        return prob
    
    def v(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc_v(x))
        x = self.fc_v_2(x)
        return x

In [43]:
def train_net(net, data, optimizer):
    s, a, r, s2, d, prob = batch_factory(data)

    # epoch K 만큼
    for i in range(K):
        td_target = r + GAMMA * net.v(s2)
        delta = td_target - net.v(s)
        delta = delta.detach().numpy() # 1 step advantage
        advantage_lst = []
        advantage = 0.0

        # GAE 계산
        for delta_t in delta[::-1]:
            advantage = GAMMA * LAMBDA * advantage + delta_t[0]
            advantage_lst.append([advantage])
        advantage_lst.reverse()
        advantage = torch.tensor(advantage_lst, dtype=torch.float)

        pi_a = net.pi(s).sample()
        ratio = torch.exp(torch.log(pi_a) - torch.log(prob))

        surr1 = ratio * advantage
        surr2 = torch.clamp(ratio, 1 - e, 1 + e) * advantage
        loss = -torch.min(surr1, surr2) + F.smooth_l1_loss(net.v(s) , td_target.detach())

        optimizer.zero_grad()
        loss.mean().backward()
        optimizer.step()

In [44]:
def batch_factory(memory):
    s_, a_, r_, s2_, d_, prob_ = [], [], [], [], [], []
    for s, a, r, s2, d, p in memory:
        s_.append(s)
        a_.append([a])
        r = -100 if d else r
        r_.append([r])
        s2_.append(s2)
        d = 0 if d else 1
        d_.append([d])
        prob_.append([p])
        
    s_ = torch.tensor(s_, dtype=torch.float)
    a_ = torch.tensor(a_)
    r_ = torch.tensor(r_, dtype=torch.float)
    s2_ = torch.tensor(s2_, dtype=torch.float)
    d_ = torch.tensor(d_, dtype=torch.float)
    prob_ = torch.tensor(prob_)
    
    return s_, a_, r_, s2_, d_, prob_


In [None]:
net = PPO()
ep = 1
total_ep = 10000
gamma = .95
total_reward = 0
data = []
epsilon = .1
optimizer = optim.Adam(net.parameters(), ALPHA)

while(ep < total_ep):
    done = False
    state = env.reset()
    while(not done):
        # T step 움직인 후 clipping - T 가 너무 크면 불안정??
        for t in range(T):
            prob = net.pi(torch.from_numpy(state).float())
            action = env.action_space.sample()
            #print(action)
            state_next, reward, done, _ = env.step(action)
            total_reward += reward
            data.append((state, action, reward/100.0, state_next, done, action))
            state = state_next
            if(done):
                break

        train_net(net, data, optimizer)
        data = []
        
    ep += 1
    if(ep%10 == 0):
        print(ep, total_reward/10.0)
        total_reward = 0

10 -1157.1429691764401
20 -1233.3929367235355
30 -1314.6250742529203
40 -1330.8527113213102
50 -1248.5087452695675
60 -1251.7373624157722
70 -1071.319313571381
80 -1410.9258096711542
90 -1268.5580070981673
100 -1227.7331624886344
110 -1414.0550779485536
120 -1041.6981542676601
130 -1231.6959874054576
140 -1187.0722833527623
150 -1203.4940719713588
160 -1169.2936722129054
170 -1335.7373703491457
180 -1198.369998682712
190 -1361.2995551870442
200 -1161.136702768257
210 -1170.2848164965103
220 -1180.19540880028
230 -1186.4087012620325
240 -1192.7455058068442
250 -1143.3313112258625
260 -1193.0499145575227
270 -1280.1364039316813
280 -1305.2485586330693
290 -1351.7704155629951
300 -1295.0466268261966
310 -1281.4252999164719
320 -1303.6796514760933
330 -1350.3125163941284
340 -1261.575993741809
350 -1193.768350708924
360 -1340.740599517588
370 -1415.4995044968775
380 -1253.3477943511089
390 -1106.0109162408958
400 -1317.6243168363167
410 -1212.0637208937123
420 -1390.9861850336215
430 -1112