In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
import gym
import random
import collections

In [3]:
env = gym.make('CartPole-v1')

In [149]:
# hyper parameters
ALPHA = .001
EPSILON = 1
T = 20 # T step 만큼 데이터 쌓고 학습할 것
LAMBDA = .95
K = 3
GAMMA = .99
e = .05

In [150]:
class PPO(nn.Module):
    def __init__(self):
        super(PPO, self).__init__()
        self.fc1 = nn.Linear(4, 256)
        self.fc_pi = nn.Linear(256, 2)
        self.fc_v = nn.Linear(256,1)
        self.optimizer = optim.Adam(self.parameters(), ALPHA)
    
    def pi(self, x, softmax_dim=0):
        x = torch.relu(self.fc1(x))
        prob = torch.softmax(self.fc_pi(x), dim = softmax_dim) # batch 처리 (학습할 떈 1 차원)
        return prob
    
    def v(self, x):
        x = torch.relu(self.fc1(x))
        x = self.fc_v(x)
        return x

In [151]:
def train_net(net, data, optimizer):
    s, a, r, s2, d, prob = batch_factory(data)

    # epoch K 만큼
    for i in range(K):
        td_target = r + GAMMA * net.v(s2)
        delta = td_target - net.v(s)
        delta = delta.detach().numpy() # 1 step advantage
        advantage_lst = []
        advantage = 0.0

        # GAE 계산
        for delta_t in delta[::-1]:
            advantage = GAMMA * LAMBDA * advantage + delta_t[0]
            advantage_lst.append([advantage])
        advantage_lst.reverse()
        advantage = torch.tensor(advantage_lst, dtype=torch.float)

        pi = net.pi(s, softmax_dim=1)
        pi_a = pi.gather(1, a)
        ratio = torch.exp(torch.log(pi_a) - torch.log(prob))

        surr1 = ratio * advantage
        surr2 = torch.clamp(ratio, 1 - e, 1 + e) * advantage
        loss = -torch.min(surr1, surr2) + F.smooth_l1_loss(net.v(s) , td_target.detach())

        optimizer.zero_grad()
        loss.mean().backward()
        optimizer.step()

In [152]:
def batch_factory(memory):
    s_, a_, r_, s2_, d_, prob_ = [], [], [], [], [], []
    for s, a, r, s2, d, p in memory:
        s_.append(s)
        a_.append([a])
        r = -100 if d else r
        r_.append([r])
        s2_.append(s2)
        d = 0 if d else 1
        d_.append([d])
        prob_.append([p])
        
    s_ = torch.tensor(s_, dtype=torch.float)
    a_ = torch.tensor(a_)
    r_ = torch.tensor(r_, dtype=torch.float)
    s2_ = torch.tensor(s2_, dtype=torch.float)
    d_ = torch.tensor(d_, dtype=torch.float)
    prob_ = torch.tensor(prob_)
    
    return s_, a_, r_, s2_, d_, prob_


In [153]:
net = PPO()
ep = 1
total_ep = 10000
gamma = .95
total_reward = 0
data = []
optimizer = optim.Adam(net.parameters(), ALPHA)

while(ep < total_ep):
    done = False
    state = env.reset()
    while(not done):
        # T step 움직인 후 clipping - T 가 너무 크면 불안정??
        for t in range(T):
            prob = net.pi(torch.from_numpy(state).float())
            action = Categorical(prob).sample().item()
            state_next, reward, done, _ = env.step(action)
            total_reward += reward
            data.append((state, action, reward/100.0, state_next, done, prob[action].item()))
            state = state_next
            if(done):
                break

        train_net(net, data, optimizer)
        data = []
        
    ep += 1
    if(ep%10 == 0):
        print(ep, total_reward/10.0)
        total_reward = 0

10 20.6
20 24.8
30 39.5
40 21.8
50 40.9
60 43.5
70 53.0
80 64.7
90 72.3
100 76.2
110 86.3
120 127.8
130 66.1
140 85.6
150 128.2
160 219.4
170 147.1
180 231.1
190 248.9
200 187.8
210 534.2
220 397.8
230 173.2
240 223.2
250 305.4
260 147.8
270 212.4
280 92.2
290 143.3
300 204.0
310 171.8
320 48.8
330 24.0
340 23.0
350 25.8
360 123.2
370 132.0
380 185.9
390 122.1
400 174.8
410 186.7
420 137.8
430 112.5
440 230.0
450 420.6
460 269.9
470 195.0
480 596.1
490 1374.5
500 497.0
510 523.8
520 1053.8
530 108.9
540 212.4
550 1097.7
560 538.2
570 874.5
580 1787.5


KeyboardInterrupt: 