# PPO - continuous action space
> Continuous action spaces use normal/gaussian distribution. In this case the model output is the mean+std which define the normal distribution to use for the action selection. The action is sampled from this distribution. The probability is in this case the probabilty of the actions value under the given normal distribution (I don't know the math for that but you can look it up).
---
* 연속액션 환경에서는 정규분포로 action 을 선택

In [153]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
import gym
import random
import collections

In [154]:
env = gym.make('Pendulum-v0')

In [166]:
# hyper parameters
ALPHA = .01
EPSILON = 1
T = 20 # T step 만큼 데이터 쌓고 학습할 것
LAMBDA = .95
K = 3
GAMMA = .99
e = .1

In [167]:
class PPO(nn.Module):
    def __init__(self):
        super(PPO, self).__init__()
        self.fc1 = nn.Linear(3, 128)
        self.fc_pi = nn.Linear(128, 1)
        self.fc_v = nn.Linear(128,1)
        self.optimizer = optim.Adam(self.parameters(), ALPHA)
    
    # 출력이 Normal 분포 -> sampling 을 해서 실수값 뽑아서 사용해야함!
    def pi(self, x):
        x = torch.relu(self.fc1(x))
        prob = torch.tanh(self.fc_pi(x))
        prob = torch.distributions.normal.Normal(prob, 1) # 평균 : prob, 분산 : 1
        return prob
    
    def v(self, x):
        x = torch.relu(self.fc1(x))
        x = self.fc_v(x)
        return x

In [168]:
def train_net(net, data, optimizer):
    s, a, r, s2, d, prob = batch_factory(data)

    # epoch K 만큼
    for i in range(K):
        td_target = r + GAMMA * net.v(s2)
        delta = td_target - net.v(s)
        delta = delta.detach().numpy() # 1 step advantage
        advantage_lst = []
        advantage = 0.0

        # GAE 계산
        for delta_t in delta[::-1]:
            advantage = GAMMA * LAMBDA * advantage + delta_t[0]
            advantage_lst.append([advantage])
        advantage_lst.reverse()
        advantage = torch.tensor(advantage_lst, dtype=torch.float)

        pi_a = net.pi(s).sample()
        ratio = torch.exp(torch.log(pi_a) - torch.log(prob))

        surr1 = ratio * advantage
        surr2 = torch.clamp(ratio, 1 - e, 1 + e) * advantage
        loss = -torch.min(surr1, surr2) + F.smooth_l1_loss(net.v(s) , td_target.detach())

        optimizer.zero_grad()
        loss.mean().backward()
        optimizer.step()

In [169]:
def batch_factory(memory):
    s_, a_, r_, s2_, d_, prob_ = [], [], [], [], [], []
    for s, a, r, s2, d, p in memory:
        s_.append(s)
        a_.append([a])
        r = -100 if d else r
        r_.append([r])
        s2_.append(s2)
        d = 0 if d else 1
        d_.append([d])
        prob_.append([p])
        
    s_ = torch.tensor(s_, dtype=torch.float)
    a_ = torch.tensor(a_)
    r_ = torch.tensor(r_, dtype=torch.float)
    s2_ = torch.tensor(s2_, dtype=torch.float)
    d_ = torch.tensor(d_, dtype=torch.float)
    prob_ = torch.tensor(prob_)
    
    return s_, a_, r_, s2_, d_, prob_


In [170]:
net = PPO()
ep = 1
total_ep = 10000
gamma = .95
total_reward = 0
data = []
optimizer = optim.Adam(net.parameters(), ALPHA)

while(ep < total_ep):
    done = False
    state = env.reset()
    while(not done):
        # T step 움직인 후 clipping - T 가 너무 크면 불안정??
        for t in range(T):
            prob = net.pi(torch.from_numpy(state).float())
            action = prob.sample()
            state_next, reward, done, _ = env.step(action)
            total_reward += reward
            data.append((state, action, reward/100.0, state_next, done, action))
            state = state_next
            if(done):
                break

        train_net(net, data, optimizer)
        data = []
        
    ep += 1
    if(ep%10 == 0):
        print(ep, total_reward/10.0)
        total_reward = 0

10 tensor(-1319.4460)
20 tensor(-1195.6279)
30 tensor(-1167.4396)
40 tensor(-1447.1493)
50 tensor(-1342.1516)
60 tensor(-1231.2814)
70 tensor(-1246.1453)
80 tensor(-1281.2484)
90 tensor(-1427.9601)
100 tensor(-1276.8274)
110 tensor(-1307.1760)
120 tensor(-1471.2776)
130 tensor(-1424.7491)
140 tensor(-1510.1278)
150 tensor(-1430.8926)
160 tensor(-1389.5225)
170 tensor(-1313.4579)
180 tensor(-1290.9614)
190 tensor(-1297.7971)
200 tensor(-1384.6165)
210 tensor(-1334.6996)
220 tensor(-1339.9568)
230 tensor(-1177.8865)
240 tensor(-1376.0032)
250 tensor(-1285.7224)
260 tensor(-1153.7280)
270 tensor(-1212.8689)
280 tensor(-1200.8904)
290 tensor(-1317.1940)
300 tensor(-1345.5668)
310 tensor(-1195.6144)
320 tensor(-1193.9432)
330 tensor(-1368.7186)
340 tensor(-1225.2917)
350 tensor(-1176.4589)
360 tensor(-1396.6317)
370 tensor(-1289.9927)
380 tensor(-1177.1929)
390 tensor(-1272.0933)
400 tensor(-1205.3234)
410 tensor(-1229.7969)
420 tensor(-1117.5101)
430 tensor(-1177.4915)
440 tensor(-1210.740

KeyboardInterrupt: 