In [2]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Normal
from tqdm import tqdm  # tqdm 임포트
import matplotlib.pyplot as plt
from torch.utils.tensorboard import SummaryWriter
import custom_pendulum

# 환경 설정
# env = gym.make('Pendulum-v1')
env = custom_pendulum.PendulumEnv()

# 하이퍼파라미터
learning_rate = 0.0003
gamma = 0.99
lmbda = 0.95
eps_clip = 0.2
K_epoch = 10
T_horizon = 2048
seq_length=16
batch_size=256
minibatch_size=32
entropy_coef = 0.01
critic_coef = 0.5

# PPO 에이전트의 구성 요소
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]

Box([-1. -1. -8.], [1. 1. 8.], (3,), float32)
Box(-2.0, 2.0, (1,), float32)
3
1


In [3]:
# 정책 네트워크
class Actor(nn.Module):
    def __init__(self):
        super(Actor, self).__init__()
        self.fc1 = nn.Linear(state_dim, 64)
        self.fc2 = nn.Linear(64, 128)
        self.fc3 = nn.Linear(128, 64)
        self.mu_head = nn.Linear(64, action_dim)
        self.sigma_head = nn.Linear(64, action_dim)
        self.sigma_head_act = nn.Sigmoid()  # nn.Softplus를 사용하여 sigma가 항상 양수임을 보장
        
        
    def forward(self, x):
        x = torch.tanh(self.fc1(x))
        x = torch.tanh(self.fc2(x))
        x = torch.tanh(self.fc3(x))
        mu = torch.tanh(self.mu_head(x))*2
        sigma = self.sigma_head_act(self.sigma_head(x)) + 1e-5  # 0 이상의 값 보장
        return mu, sigma

# 가치 네트워크
class Critic(nn.Module):
    def __init__(self):
        super(Critic, self).__init__()
        self.fc1 = nn.Linear(state_dim, 64)
        self.fc2 = nn.Linear(64, 64)
        self.v_head = nn.Linear(64, 1)
        
    def forward(self, x):
        x = torch.tanh(self.fc1(x))
        x = torch.tanh(self.fc2(x))
        value = self.v_head(x)
        return value


In [7]:
import pdb
writer = SummaryWriter()

class PPO:
    def __init__(self):
        self.data = []
        self.actor = Actor()
        self.critic = Critic()
        self.optimizer = optim.Adam([
            {'params': self.actor.parameters(), 'lr': learning_rate},
            {'params': self.critic.parameters(), 'lr': learning_rate}
        ])
        self.loss_fn = nn.MSELoss()
        
    def get_action(self, state):
        state = torch.from_numpy(state).float()
        mu, sigma = self.actor(state)
        dist = Normal(mu, sigma)
        action = dist.sample()
        log_prob = dist.log_prob(action)
        
#         mu, sigma = self.actor(mini_state)
#         dist = Normal(mu, sigma)
#         log_prob = dist.log_prob(mini_action).sum(1, keepdim=True)
        
#         print('size : ', log_prob.shape, action.shape, log_prob, log_prob.sum())
    
    
        return action.numpy(), dist.log_prob(action).sum().item()

    def put_data(self, transition):
        self.data.append(transition)

        
    def make_batch(self):
        state_lst, action_lst, reward_lst, next_state_lst, log_prob_lst = [], [], [], [], []
        for transition in self.data:
            state, action, reward, log_prob, next_state = transition

            state_lst.append(state)
            action_lst.append([action])
            reward_lst.append([reward])
            next_state_lst.append(next_state)
            log_prob_lst.append([log_prob])
        
        # 데이터 텐서로 변환
        state = torch.tensor(state_lst, dtype=torch.float)
        action = torch.tensor(action_lst, dtype=torch.float)
        reward = torch.tensor(reward_lst, dtype=torch.float)
        next_state = torch.tensor(next_state_lst, dtype=torch.float)
        log_prob = torch.tensor(log_prob_lst, dtype=torch.float)
        
        self.data = []
        return state, action, reward, next_state, log_prob
    
    def mini_batch():
        pass
    
    def compute_gae(self, rewards, values, next_values, masks, gamma=gamma, lmbda=lmbda):
        deltas = rewards + gamma * next_values * masks - values
        gae = 0
        advantages = []
        for delta in deltas.flip(0):
            gae = delta + gamma * lmbda * gae
            advantages.insert(0, gae)
        return torch.tensor(advantages, dtype=torch.float)
    
    def train(self):
        state, action, reward, next_state, old_log_prob = self.make_batch()
        
        for i in range(K_epoch):
            td_target = reward + gamma * self.critic(next_state)
            advantage = self.compute_gae(
                reward,
                self.critic(state).detach(),
                self.critic(next_state).detach(),
                torch.ones(reward.size()),
                gamma,
                lmbda)
            
            for _ in range(len(state) // minibatch_size):
                indices = np.random.choice(len(state), minibatch_size, replace=False)
                mini_state = state[indices]
                mini_action = action[indices].squeeze(1) # 32,1
                mini_reward = reward[indices]
                mini_next_state = next_state[indices]
                mini_old_log_prob = old_log_prob[indices]
                mini_advantage = advantage[indices]

                mu, sigma = self.actor(mini_state)
                dist = Normal(mu, sigma)
                
                # mini_action 32 1
                # dist.log_prob(mini_action) 32 32 1
                
                log_prob = dist.log_prob(mini_action).sum(1, keepdim=True) # keepdim 제거 
                entropy = dist.entropy().sum(1, keepdim=True) #언제 높아지고 언제 낮아지는지 keep????
                ratio = torch.exp(log_prob) / torch.exp(mini_old_log_prob) #?? / current log prob
                ###########################################################################3
#                 print(mini_action.shape, log_prob.shape, dist.log_prob(mini_action).shape, dist.log_prob(mini_action))
#                 print(ratio)
        
                surr1 = ratio * mini_advantage
                surr2 = torch.clamp(ratio, 1 - eps_clip, 1 + eps_clip) * mini_advantage


                actor_loss = -torch.min(surr1, surr2).mean()
                critic_loss = critic_coef * self.loss_fn(self.critic(mini_state), td_target[indices].detach())
                entropy_loss = - entropy_coef * entropy.mean() #분포가 넓어질때 엔트로피 로스가 작아짐

                
                
                loss = actor_loss + critic_loss + entropy_loss

                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

            # 텐서보드에 손실 기록
            writer.add_scalar('Loss/Actor', actor_loss.item(), n_epi)
            writer.add_scalar('Loss/Critic', critic_loss.item(), n_epi)
            writer.add_scalar('Loss/Total', loss.item(), n_epi)
                
    def load(self, filepath):
        self.actor.load_state_dict(torch.load(filepath + '_actor.pth'))
        self.critic.load_state_dict(torch.load(filepath + '_critic.pth'))
        
    def save(self, filepath):
        torch.save(self.actor.state_dict(), filepath + '_actor.pth')
        torch.save(self.critic.state_dict(), filepath + '_critic.pth')

In [8]:
# PPO 학습 과정
agent = PPO()

In [14]:
score = 0.0
print_interval = 20

quit = False
result = []

for n_epi in range(1000): #에피소드 1000번
    state = env.reset()
    state = np.array(state[0])  # 상태를 numpy 배열로 변환
    
    score = 0.0
    end = 0
    tensorboard_score = 0
    
    done = False
    while not done:
        action, log_prob = agent.get_action(state) # 현재의 log_prob
        next_state, reward, _, _, _ = env.step(action)
        next_state = np.array(next_state)  # 상태를 numpy 배열로 변환

        score += reward
        tensorboard_score += reward

        agent.put_data((state, action, reward, log_prob, next_state))

        if len(agent.data) == batch_size:
            agent.train()
            writer.add_scalar('tensorboard reward', tensorboard_score, n_epi)
            tensorboard_score = 0
            

        if end == 256:
            done = True

        if done:
#                 print("Done")
            break

        state = next_state
        end += 1

        
    if n_epi % print_interval == 0 and n_epi != 0:
        print(f"# of episode: {n_epi}, avg score: {score/print_interval:.1f}")    
        


# of episode: 20, avg score: -57.6
# of episode: 40, avg score: -50.7
# of episode: 60, avg score: -51.1
# of episode: 80, avg score: -56.3
# of episode: 100, avg score: -57.8
# of episode: 120, avg score: -65.0
# of episode: 140, avg score: -70.6
# of episode: 160, avg score: -70.6
# of episode: 180, avg score: -64.8
# of episode: 200, avg score: -63.0
# of episode: 220, avg score: -57.7
# of episode: 240, avg score: -67.1
# of episode: 260, avg score: -70.1
# of episode: 280, avg score: -70.8
# of episode: 300, avg score: -61.7
# of episode: 320, avg score: -64.5
# of episode: 340, avg score: -65.8
# of episode: 360, avg score: -55.5
# of episode: 380, avg score: -61.6
# of episode: 400, avg score: -69.3
# of episode: 420, avg score: -66.2
# of episode: 440, avg score: -57.3
# of episode: 460, avg score: -62.9
# of episode: 480, avg score: -67.2
# of episode: 500, avg score: -64.6
# of episode: 520, avg score: -72.6
# of episode: 540, avg score: -51.5
# of episode: 560, avg score: -6

In [15]:
# 학습된 모델 저장
agent.save("ppo_pendulum")

In [16]:
# 학습된 에이전트 평가 및 시각화
agent.load("ppo_pendulum")

In [17]:
# 학습된 에이전트 평가 및 시각화
# 평가를 위해 렌더링 모드가 있는 환경 재설정
eval_env = gym.make('Pendulum-v1', render_mode="human")

state = eval_env.reset()
state = np.array(state[0])  # 상태를 numpy 배열로 변환
done = False
trajectory = []

while not done:
    action, _ = agent.get_action(state)
    next_state, reward, done, fail, _ = eval_env.step(action)
    next_state = np.array(next_state)  # 상태를 numpy 배열로 변환
    trajectory.append(state)
    state = next_state
    print(action)
    eval_env.render()

#     if fail:
#         print("fail")
#         break

eval_env.close()

# 궤적 시각화
trajectory = np.array(trajectory)
plt.plot(trajectory[:, 0], trajectory[:, 1])
plt.title('Pendulum-v1 PPO Trajectory')
plt.show()

[-1.5809444]
[-2.000668]
[-1.474394]
[-1.973661]
[-1.5502007]
[-1.4855342]
[-1.7503097]
[-1.1369228]
[-2.2757058]
[-1.2824223]
[-1.7419736]
[-2.4585457]
[-1.5408239]
[-1.53026]
[-0.3890519]
[2.1143818]
[2.1019888]
[1.3477745]
[1.1083484]
[1.3409894]
[2.8087068]
[2.253179]
[2.5683048]
[2.0904136]
[2.2935486]
[1.7455451]
[1.9091164]
[1.4730238]
[1.9482578]
[1.2684625]
[1.9276619]
[0.5066744]
[1.1174095]
[1.338372]
[-1.0357442]
[-0.8160814]
[-2.5210736]
[-1.5359735]
[-1.7933702]
[-0.6402426]
[1.0555128]
[1.7974039]
[1.7039341]
[1.1221318]
[2.7237794]
[1.6341909]
[-1.5570531]
[-1.8394971]
[-2.1220922]
[-1.7845372]
[-2.0700815]
[-1.5096667]
[-1.3107231]
[-2.1766503]
[-1.3378929]
[-0.50112593]
[-1.0315297]
[0.94448817]
[1.0309856]
[1.7477765]
[0.7975221]
[2.7099786]
[1.0029564]
[1.4371756]
[0.21700323]
[1.2675436]
[-0.06633523]
[-0.20488906]
[-0.71793896]
[0.44634083]
[-0.78254235]
[0.70895356]
[1.2265961]
[1.2671231]
[2.7584426]
[0.9816216]
[0.23158228]
[2.223296]
[1.0318795]
[1.3116115]
[0

[0.983265]
[-0.840754]
[0.27587992]
[0.7607546]
[0.45815438]
[0.27059418]
[0.20719059]
[0.18995357]
[0.65141934]
[0.4797486]
[-0.49078715]
[0.13051355]
[-0.70101404]
[-0.47638184]
[-0.84849286]
[-0.68282235]
[0.1334768]
[-1.1120094]
[0.64842594]
[-0.24595696]
[-0.6706083]
[-0.14699343]
[-0.28576145]
[0.25094086]
[1.0598662]
[0.94295686]
[1.5328748]
[1.5604954]
[2.1364956]
[2.4958045]
[1.5372145]
[1.3035785]
[-0.6329403]
[-1.6964469]
[-1.635607]
[-1.9427292]
[-0.12846637]
[0.32154852]
[1.3468819]
[0.8770488]
[1.459707]
[-0.07832814]
[-0.1648762]
[1.2948282]
[1.1304394]
[0.3673279]
[0.69166696]
[-0.569504]
[0.7120954]
[-0.12419567]
[1.2304611]
[-0.27378696]
[0.18374291]
[-0.05513193]
[-0.07212584]
[-0.6387523]
[-0.05198127]
[-0.41471225]
[-0.30179888]
[-0.06184655]
[-0.6060208]
[-1.8792641]
[-0.09123969]
[-1.2525792]
[-1.528054]
[-0.58556205]
[-0.06047837]
[1.1374087]
[0.77924293]
[0.8148649]
[1.135391]
[1.4025698]
[1.6375513]
[1.091222]
[1.8864248]
[1.3580915]
[-0.8570105]
[-1.6117755]


KeyboardInterrupt: 

In [18]:
eval_env.close()