# Lunar Lander (discrete mode)
## A2C
---
* state : (pos.x - VIEWPORT_W/SCALE/2) / (VIEWPORT_W/SCALE/2),
            (pos.y - (self.helipad_y+LEG_DOWN/SCALE)) / (VIEWPORT_H/SCALE/2),
            vel.x*(VIEWPORT_W/SCALE/2)/FPS,
            vel.y*(VIEWPORT_H/SCALE/2)/FPS,
            self.lander.angle,
            20.0*self.lander.angularVelocity/FPS,
            1.0 if self.legs[0].ground_contact else 0.0,
            1.0 if self.legs[1].ground_contact else 0.0
            ]
* action : No, 우측 엔진, 메인 엔진, 좌측 엔진
* reward : 메인 엔진 : -0.3, 측면 엔진 : -0.03, 다리 하나 착지에 + 10, 충돌하면 -100, 목표에 착지하면 +100, 착륙선 상태에 대한 보상
* terminal : 다리가 아닌 몸체가 충돌하거나 착지하면 종료, 200 점 달성이 목표
* 연료는 무한
* 최대한 정자세로 최소한의 연료로 착지하는 방향으로 학습
* 눈으로 확인해서 적당히 되면 학습된거임

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.nn.init as init
from torch.distributions import Categorical
import gym
import random
import collections

env = gym.make('LunarLander-v2')

# hyper parameters
EPSILON = 1
ALPHA = .001
GAMMA = .95

# actor
class Actor(nn.Module):
    def __init__(self):
        super(Actor, self).__init__()
        self.fc_1 = nn.Linear(8, 32)
        self.fc_pi = nn.Linear(32, 4)
    
    # policy 는 discrete action 의 확률이므로 softmax 통과
    def pi(self, x):
        x = torch.relu(self.fc_1(x))
        x = torch.softmax(self.fc_pi(x), dim=0)
        return x


# critic
class Critic(nn.Module):
    def __init__(self):
            super(Critic, self).__init__()
            self.fc_1 = nn.Linear(8, 32)
            self.fc_v = nn.Linear(32, 1)
    
    # value 는 advantage 계산을 위한 것이므로 0차원 스칼라
    def v(self, x):
        x = torch.relu(self.fc_1(x))
        x = self.fc_v(x)
        return x

            
def train(critic, critic_t, data, optim_a, optim_v):
    batch = random.sample(data, 64)
    s_, a_, r_, s2_, l_p_ = [], [], [], [], []
    for tr in batch:
        s, a, r, s2, l_p = tr
        s_.append(s)
        a_.append([a])
        r_.append([r])
        s2_.append(s2)
        l_p_.append([l_p])
    s_ = torch.tensor(s_, dtype=torch.float)
    a_ = torch.tensor(a_, dtype=torch.float)
    r_ = torch.tensor(r_, dtype=torch.float)
    s2_ = torch.tensor(s2_, dtype=torch.float)
    l_p_ = torch.tensor(l_p_, dtype=torch.float)
    
    v_target = r_ + GAMMA * critic_t.v(s2_)
    advantage = v_target - critic.v(s_)
     
    log_p = l_p_
    loss = F.mse_loss(v_target.detach(), critic.v(s2_) -log_p * advantage.detach())
    optim_a.zero_grad()
    optim_v.zero_grad()
    loss.mean().backward()
    optim_a.step()
    optim_v.step()
    
ep = 1
total_ep = 1000
actor, actor_t, critic, critic_t = Actor(), Actor(), Critic(), Critic()
optimizer_a = optim.Adam(actor.parameters(), ALPHA)
optimizer_c = optim.Adam(critic.parameters(), ALPHA)
critic_t.load_state_dict(critic.state_dict())
buffer = collections.deque(maxlen = 50000)

while(ep < total_ep):
    done = False
    state = env.reset()
    total_reward = 0
    step = 0
    while(not done):
        if(ep > 400):
            env.render()
        prob = actor.pi(torch.from_numpy(state).float())
        m = Categorical(prob)
        
        if(random.random() < EPSILON):
            action = env.action_space.sample()
        else:
            action = m.sample().item()
        
        state_next, reward, done, _ = env.step(action)
        total_reward += reward
        #print(reward)
        log_prob = torch.log(prob[action])
        transition = (state, action, reward, state_next, log_prob)
        buffer.append(transition)
        if(len(buffer) > 2000):
            train(critic, critic_t, buffer, optimizer_a, optimizer_c)
        state = state_next
        step += 1
        
        if(done):
            ep += 1
            EPSILON = 1 / ((ep / 100) + 2)
            print(ep, int(total_reward), step)
            if(ep % 10 == 0):
                critic_t.load_state_dict(critic.state_dict())

2 -114 57
3 -210 115
4 -192 114
5 -259 128
6 -224 97
7 -119 130
8 -46 91
9 -263 118
10 -76 69
11 -350 77
12 -273 84
13 -95 86
14 -154 118
15 -318 112
16 -299 101
17 -20 110
18 -29 96
19 -58 83
20 -215 113
21 -199 126
22 -123 108
23 -381 87
24 -68 75
25 -416 120
26 -197 120
27 -371 120
28 -107 98
29 -223 98
30 -103 84
31 -340 74
32 -240 80
33 -30 69
34 -149 138
35 -67 67
36 -147 72
37 -56 105
38 -81 90
39 -289 95
40 -126 98
41 -237 116
42 -94 96
43 -168 72
44 -421 87
45 -250 84
46 -122 132
47 -382 134
48 -102 95
49 -94 72
50 -131 87
51 -105 75
52 -139 111
53 -268 112
54 -170 73
55 -120 88
56 -398 95
57 -252 107
58 -74 107


KeyboardInterrupt: 

# PPO
---
* https://www.kaggle.com/thimac/ppo-lunar-lander-reinforcement-learning

In [48]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.nn.init as init
from torch.distributions import Categorical
import gym
import random

env = gym.make('LunarLander-v2')

# hyper parameters
ALPHA = .001
EPSILON = 1
T = 10 # T step 만큼 데이터 쌓고 학습할 것
LAMBDA = .95
K = 5
GAMMA = .99
e = .2

class PPO_p(nn.Module):
    def __init__(self):
        super(PPO_p, self).__init__()
        self.fc1 = nn.Linear(8, 128)
        self.fc2 = nn.Linear(128, 256)
        self.fc_pi = nn.Linear(256, 4)
        self.optimizer = optim.Adam(self.parameters(), ALPHA)
    
    def pi(self, x, softmax_dim=0):
        x = torch.selu(self.fc1(x))
        x = torch.selu(self.fc2(x))
        prob = nn.LogSoftmax(self.fc_pi(x)) # batch 처리 (학습할 떈 1 차원)
        return prob


class PPO_v(nn.Module):
    def __init__(self):
        super(PPO_v, self).__init__()
        self.fc1 = nn.Linear(8, 128)
        self.fc2 = nn.Linear(128, 256)
        self.fc_v = nn.Linear(256, 1)
        self.optimizer = optim.Adam(self.parameters(), ALPHA)

    def v(self, x):
        x = torch.selu(self.fc1(x))
        x = torch.selu(self.fc2(x))
        x = self.fc_v(x)
        return x


def train_net(pi, v, data, optimizer_p, optimizer_v):
    s, a, r, s2, d, prob = batch_factory(data)

    # epoch K 만큼
    for i in range(K):
        td_target = r + GAMMA * net.v(s2)
        delta = td_target - net.v(s)
        delta = delta.detach().numpy() # 1 step advantage
        advantage_lst = []
        advantage = 0.0

        # GAE 계산
        for delta_t in delta[::-1]:
            advantage = GAMMA * LAMBDA * advantage + delta_t[0]
            advantage_lst.append([advantage])
        advantage_lst.reverse()
        advantage = torch.tensor(advantage_lst, dtype=torch.float)

        p = pi.pi(s, softmax_dim=1)
        p_a = p.gather(1, a)
        ratio = torch.exp(torch.log(p_a) - torch.log(prob))

        surr1 = ratio * advantage
        surr2 = torch.clamp(ratio, 1 - e, 1 + e) * advantage
        loss = -torch.min(surr1, surr2) + F.smooth_l1_loss(v.v(s) , td_target.detach())

        optimizer_p.zero_grad()
        optimizer_v.zero_grad()
        loss.mean().backward()
        optimizer_p.step()
        optimizer_v.step()

def batch_factory(memory):
    s_, a_, r_, s2_, d_, prob_ = [], [], [], [], [], []
    for s, a, r, s2, d, p in memory:
        s_.append(s)
        a_.append([a])
        r = -100 if d else r
        r_.append([r])
        s2_.append(s2)
        d = 0 if d else 1
        d_.append([d])
        prob_.append([p])
        
    s_ = torch.tensor(s_, dtype=torch.float)
    a_ = torch.tensor(a_)
    r_ = torch.tensor(r_, dtype=torch.float)
    s2_ = torch.tensor(s2_, dtype=torch.float)
    d_ = torch.tensor(d_, dtype=torch.float)
    prob_ = torch.tensor(prob_)
    
    return s_, a_, r_, s2_, d_, prob_

pi = PPO_p()
v = PPO_v()
ep = 1
total_ep = 10000
gamma = .95
total_reward = 0
data = []
optimizer_p = optim.Adam(pi.parameters(), ALPHA)
optimizer_v = optim.Adam(v.parameters(), ALPHA)

while(ep < total_ep):
    done = False
    state = env.reset()
    step = 0
    
    while(not done):
        # T step 움직인 후 clipping - T 가 너무 크면 불안정??
        for t in range(T):
            if(ep > 1000):
                env.render()
            step += 1
            prob = pi.pi(torch.from_numpy(state).float())
            prob = Categorical(prob.squeeze(0).cpu())
            if(random.random() < EPSILON):
                action = env.action_space.sample()
            else:
                action = Categorical(prob).sample().cpu()
            state_next, reward, done, _ = env.step(action)
            total_reward += reward
            data.append((state, action, reward/100.0, state_next, done, prob[action].item()))
            state = state_next
            if(done):
                break

        train_net(pi, v, data, optimizer_p, optimizer_v)
        data = []
        
    ep += 1
    EPSILON = 1 / (ep/50 + 2)
    if(ep%1 == 0):
        print(ep, int(total_reward), step)
        total_reward = 0

AttributeError: 'LogSoftmax' object has no attribute 'squeeze'

# DQN

In [49]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from torch.distributions import Categorical
import gym
import random
import collections

In [50]:
env = gym.make('LunarLander-v2')

In [51]:
# hyper parameters
EPSILON = 1
EPISODE = 2000
GAMMA = .98
ALPHA = .001
Q_TARG_PERIOD = 10

In [52]:
class DQN_Net(nn.Module):
    def __init__(self):
        super(DQN_Net, self).__init__()
        self.fc1 = nn.Linear(8, 16)
        self.fc2 = nn.Linear(16, 8)
        self.fc3 = nn.Linear(8, 4)
        
    def Q(self, x):
        x = torch.sigmoid(self.fc1(x))
        x = torch.tanh(self.fc2(x))
        #x = F.softmax(self.fc3(x), dim=0)
        #x = F.softmax(self.fc3(x), dim=0)
        x = self.fc3(x)
        return x

In [53]:
def train(q, q_tar, optimizer):
    
    # buffer 에서 랜덤으로 데이터 뽑기
    batch = random.sample(buffer, 32)
    s_buf, a_buf, r_buf, s2_buf, d_buf = [], [], [], [], []
    
    # 학습리스트에 데이터 할당
    for transition in batch:
        s, a, r, s2, d = transition
        s_buf.append(s)
        a_buf.append([a])
        r_buf.append([r])
        s2_buf.append(s2)
        d_buf.append([d])
    
    s_buf = torch.tensor(s_buf, dtype=torch.float)
    a_buf = torch.tensor(a_buf)
    r_buf = torch.tensor(r_buf)
    s2_buf = torch.tensor(s2_buf, dtype=torch.float)
    d_buf = torch.tensor(d_buf)

    # Q 계산
    Q = q.Q(s_buf)
    Q = Q.gather(1, a_buf) # a_buf 를 index 로 취급하여 Q 의 값을 추려낸다.
    
    # target Q 계산
    max_Q = q_tar.Q(s2_buf).max(1)[0].unsqueeze(1) # 차원 줄이거나 늘리기. view 함수도 차원변환함
    Q_targ = r_buf + GAMMA * max_Q
    
    # double DQN 업데이트
    #a = q_net.Q(s2_buf).max(1)[0].unsqueeze(1)
    #double_q = q_tar.Q(s2_buf).gather(1, a_buf)
    #Q_targ = r_buf + GAMMA * double_q
    
    # loss
    # mse 대신 huber loss 사용 - 덜 민감해서 급격한 변화 방지
    # It is less sensitive to outliers than the MSELoss and in some cases prevents exploding gradients 
    loss = F.smooth_l1_loss(Q, Q_targ)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

In [54]:
buffer = collections.deque(maxlen = 50000)
ep = 1

# network 생성
q_net = DQN_Net()
q_targ = DQN_Net()

# target net 에 train net
q_targ.load_state_dict(q_net.state_dict())

# env 초기화
state = env.reset()

# optimizer
optimizer = optim.Adam(q_net.parameters(), ALPHA)

while(ep < EPISODE):
    done = False
    total_reward = 0
    step = 0
    #EPSILON = max(0.01, 0.08 - 0.01*(ep/200))
    
    while(not done):
        if(ep > 100):
            env.render()
            
        # Q value 뽑기
        Q_value = q_net.Q(torch.from_numpy(state).float())

        # action 선택
        if(random.random() < EPSILON):
            action = env.action_space.sample()
        else:
            action = Q_value.argmax().item()

        # step 진행
        state_next, reward, done, _ = env.step(action)
        
        # reward 합산
        total_reward += reward
        
        # buffer 에 data stack
        buffer.append((state, action, reward, state_next, done))
        
        # state 갱신
        state = state_next
        step += 1
        
        # 학습
        if(len(buffer) > 2000):
            train(q_net, q_targ, optimizer)
        
        if(done):
            
            # periodical Update target net
            if ep % Q_TARG_PERIOD == 0:
                q_targ.load_state_dict(q_net.state_dict())
                
            print(ep, int(total_reward), step)
            ep += 1
            EPSILON = 1 / ((ep / 100) + 1)
            state = env.reset()

1 -139 104
2 -92 73
3 -114 139
4 -305 99
5 -135 62
6 -41 139
7 -85 89
8 -377 101
9 -251 104
10 -447 101
11 -201 141
12 -146 102
13 -108 121
14 -157 66
15 -255 113
16 -177 109
17 -82 68
18 -303 103
19 -169 86
20 -260 145
21 -451 125
22 -470 86
23 -100 124
24 -106 94
25 -318 163
26 -24 101
27 -40 110
28 -284 110
29 -114 117
30 -343 105
31 -105 165
32 -56 85
33 -435 168
34 -155 127
35 -138 70
36 -491 122
37 -175 126
38 -322 86
39 -130 180
40 -165 129
41 -143 96
42 -47 105
43 -85 147
44 -124 78
45 -82 96
46 -106 113
47 -83 159
48 -6 168
49 -105 185
50 -200 106
51 -108 145
52 -95 114
53 -92 137
54 -147 106
55 -148 165
56 -148 111
57 -189 114
58 -18 158
59 -119 82
60 -221 161
61 -123 120
62 -63 187
63 -102 117
64 -82 110
65 -106 123
66 -143 176
67 -114 149
68 -124 174
69 -197 143
70 -44 92
71 -95 82
72 -125 125
73 37 80
74 -103 170
75 -76 105
76 -80 92
77 -40 1000
78 -15 205
79 -195 178
80 0 133
81 -121 141
82 -67 98
83 -72 129
84 -122 116
85 -53 153
86 -29 1000
87 -17 264
88 -257 216
89 -18

KeyboardInterrupt: 

In [22]:
from __future__ import print_function

import sys, gym, time

#
# Test yourself as a learning agent! Pass environment name as a command-line argument, for example:
#
# python keyboard_agent.py SpaceInvadersNoFrameskip-v4
#

env = gym.make('LunarLander-v2')

if not hasattr(env.action_space, 'n'):
    raise Exception('Keyboard agent only supports discrete action spaces')
ACTIONS = env.action_space.n
SKIP_CONTROL = 0    # Use previous control decision SKIP_CONTROL times, that's how you
                    # can test what skip is still usable.

human_agent_action = 0
human_wants_restart = False
human_sets_pause = False

def key_press(key, mod):
    global human_agent_action, human_wants_restart, human_sets_pause
    if key==0xff0d: human_wants_restart = True
    if key==32: human_sets_pause = not human_sets_pause
    a = int( key - ord('0') )
    if a <= 0 or a >= ACTIONS: return
    human_agent_action = a

def key_release(key, mod):
    global human_agent_action
    a = int( key - ord('0') )
    if a <= 0 or a >= ACTIONS: return
    if human_agent_action == a:
        human_agent_action = 0

env.render()
env.unwrapped.viewer.window.on_key_press = key_press
env.unwrapped.viewer.window.on_key_release = key_release

def rollout(env):
    global human_agent_action, human_wants_restart, human_sets_pause
    human_wants_restart = False
    obser = env.reset()
    skip = 0
    total_reward = 0
    total_timesteps = 0
    while 1:
        if not skip:
            #print("taking action {}".format(human_agent_action))
            a = human_agent_action
            total_timesteps += 1
            skip = SKIP_CONTROL
        else:
            skip -= 1

        obser, r, done, info = env.step(a)
        if r != 0:
            print("reward %0.3f" % r)
        total_reward += r
        window_still_open = env.render()
        if window_still_open==False: return False
        if done: break
        if human_wants_restart: break
        while human_sets_pause:
            env.render()
            time.sleep(0.1)
        time.sleep(0.1)
    print("timesteps %i reward %0.2f" % (total_timesteps, total_reward))

print("ACTIONS={}".format(ACTIONS))
print("Press keys 1 2 3 ... to take actions 1 2 3 ...")
print("No keys pressed is taking action 0")

while 1:
    window_still_open = rollout(env)
    if window_still_open==False: break

ACTIONS=4
Press keys 1 2 3 ... to take actions 1 2 3 ...
No keys pressed is taking action 0
reward -1.862
reward -1.912
reward -1.856
reward -1.799
reward -1.742
reward -1.685
reward -1.627
reward -1.569
reward -1.511
reward -1.452
reward -1.394
reward -1.335
reward -1.276
reward -1.217
reward -1.157
reward -1.098
reward -1.039
reward -0.979
reward -0.920
reward -0.860
reward -0.801
reward -0.741
reward -0.681
reward -0.622
reward -0.655
reward -0.500
reward -0.440
reward -0.380
reward -0.321
reward -0.261
reward -0.201
reward -0.142
reward -0.082
reward -0.022
reward 0.037
reward 0.097
reward 0.157
reward 0.216
reward 0.276
reward 0.335
reward 0.395
reward 0.454
reward 0.514
reward 0.573
reward 0.632
reward 0.691
reward 0.749
reward 0.807
reward 0.864
reward 0.921
reward 0.975
reward 1.027
reward 1.075
reward 1.112
reward 1.127
reward 1.075
reward 0.728
reward 8.593
reward -100.000
timesteps 59 reward -110.71
reward -0.509
reward -0.591
reward -0.609
reward -0.631
reward -0.656
reward