# Vanilla PG
---
* Monte-Carlo 방식 업데이트이므로 variance 가 높다
---
* torch 의 tensor 는 numpy array 와 유사하지만 GPU 에서 돌아갈 수 있음.

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
from torch.autograd import variable
import numpy as np
import gym
import random

In [2]:
env = gym.make('CartPole-v1')

In [17]:
EPISODE = 1000
EPSILON = 1
RATE = .001
GAMMA = .95

In [18]:
class Policy(nn.Module):
    def __init__(self):
        super(Policy, self).__init__()
        self.data = []
        self.fc1 = nn.Linear(4, 64)
        self.outlayer = nn.Linear(64, 2)
        self.optimizer = optim.Adam(self.parameters(), RATE)
        
    def pi(self, x):
        x = F.relu(self.fc1(x))
        x = F.softmax(self.outlayer(x), dim=0)
        return x
    
    def train(self):
        discounted = 0
        for r, log_p in self.data[::-1]:
            discounted = r + GAMMA * discounted
            # loss 정의
            loss = - log_p * discounted
            # 그래디언트 계산
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
        self.data = []

In [19]:
state = env.reset()
ep = 1
net = Policy()

while(ep < EPISODE):
    done = False
    total_reward = 0
    
    while(not done):
        #env.render()

        # policy run
        action_prob = net.pi(torch.from_numpy(state).float())
        m = Categorical(action_prob)
        a = m.sample()

        # action 선택
        if(EPSILON < random.randrange(0,1)):
            action = env.action_space.sample()
        else:
            action = a.item()

        # step
        state_next, reward, done, _ = env.step(action)

        # reward 추가
        total_reward += reward
        
        if(done):
            reward = -100

        # data 추가
        net.data.append((reward, torch.log(action_prob[a])))

        # state 갱신
        state = state_next

        # end episode
        if(done):
            print(total_reward)
            total_reward = 0
            ep += 1
            
            # env 초기화
            state = env.reset()
            EPSILON = 1 / (ep / 100 + 1)
            
            # 학습
            net.train()
env.close()

34.0
21.0
12.0
14.0
10.0
8.0
12.0
19.0
30.0
21.0
18.0
62.0
39.0
27.0
19.0
23.0
24.0
16.0
12.0
17.0
15.0
20.0
9.0
41.0
19.0
18.0
24.0
10.0
57.0
11.0
26.0
44.0
31.0
16.0
9.0
26.0
38.0
19.0
18.0
19.0
18.0
27.0
32.0
25.0
37.0
23.0
15.0
39.0
40.0
20.0
21.0
20.0
26.0
33.0
36.0
45.0
30.0
18.0
43.0
13.0
49.0
23.0
43.0
26.0
27.0
25.0
20.0
28.0
15.0
9.0
9.0
25.0
88.0
19.0
39.0
51.0
24.0
49.0
32.0
20.0
34.0
21.0
30.0
66.0
17.0
40.0
11.0
23.0
9.0
28.0
49.0
40.0
46.0
26.0
35.0
20.0
14.0
32.0
23.0
31.0
27.0
72.0
29.0
99.0
78.0
40.0
108.0
88.0
43.0
57.0
46.0
126.0
157.0
70.0
160.0
132.0
81.0
56.0
27.0
90.0
204.0
106.0
145.0
151.0
34.0
103.0
447.0
146.0
96.0
156.0
79.0
49.0
104.0
75.0
240.0
551.0
135.0
112.0
315.0
676.0
75.0
1238.0
137.0
160.0
521.0
440.0
404.0
121.0
593.0
141.0
217.0
388.0
300.0
605.0
570.0
105.0
133.0
307.0
101.0
491.0
127.0
569.0
294.0
594.0
158.0
164.0
640.0
98.0
302.0
2000.0
480.0
447.0
1796.0
216.0
241.0
1179.0
272.0
186.0
273.0
276.0
264.0
203.0
208.0
398.0
237.0
306.0
267.0
27

KeyboardInterrupt: 