#### Note: 1. DiscreteREINFORCE.ipynb, 2. ContinousREINFORCE.ipynb, 3. DiscreteREINFORCE_with_Baseline.ipynb 와 겹치는 변수 및 함수는 설명이 기재되어있지 않습니다.
#### 따라서 설명이 있는 부분만 보시면, 1번, 2번, 3번 알고리즘과의 차이를 확인할 수 있습니다.

#### Actor-Critic Method (REINFORCE with Critic)은 Sutton 책 13장 13.5절의 pseudo code를 참조하시면 됩니다.

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchsummary import summary

import gym
import os 
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import clear_output

# for using sampling with gradient-tracking when selecting an action
from torch.distributions import Categorical

In [2]:
class Policy(nn.Module):
    """Two-layer MLP that maps a state to a categorical action distribution.

    Args:
        state_dim: size of the state vector fed to the first linear layer.
        action_dim: number of discrete actions (size of the output layer).
        hidden: width of the single hidden layer.
    """

    def __init__(self, state_dim, action_dim, hidden):
        super().__init__()
        self.fc1 = nn.Linear(state_dim, hidden)
        self.fc2 = nn.Linear(hidden, action_dim)

    def forward(self, state):
        """Return action probabilities for ``state``.

        The softmax is taken over the last axis, so both a batched input of
        shape ``(batch, state_dim)`` and a single unbatched state of shape
        ``(state_dim,)`` produce a valid probability distribution.
        """
        out = F.relu(self.fc1(state))
        # dim=-1 equals dim=1 for 2-D input, but does not fail on 1-D input.
        return F.softmax(self.fc2(out), dim=-1)

In [3]:
class Critic(nn.Module):
    """State-value network used as the critic of the Actor-Critic method.

    In REINFORCE with baseline, a function approximator (neural net) is
    introduced to reduce the variance of the accumulated return. A critic,
    in contrast, is a function approximator used to estimate the state value
    in a bootstrapping scheme.

    This notebook applies one-step bootstrapping, also known as TD(0).
    See Sutton & Barto, chapter 13, section 13.5 for details: of the two
    pseudo codes given there, this notebook implements "One-step
    Actor-Critic" (not "Actor-Critic with Eligibility Traces").

    Args:
        state_dim: size of the state vector.
        action_dim: accepted for signature parity with ``Policy``; unused.
        hidden: width of the single hidden layer.
    """

    def __init__(self, state_dim, action_dim, hidden):
        super().__init__()
        # Layer creation order is kept (fc1, then fc2) so seeded
        # initialisation is reproducible.
        self.fc1 = nn.Linear(state_dim, hidden)
        self.fc2 = nn.Linear(hidden, 1)

    def forward(self, state):
        """Map a state to its estimated value.

        Input: state
        Output: value (one scalar per input row)
        The architecture is the same as the baseline network.
        """
        hidden_act = F.relu(self.fc1(state))
        value = self.fc2(hidden_act)
        return value

In [4]:
class Agent():
    """One-step Actor-Critic agent for environments with discrete actions.

    The agent owns a policy network (actor) and a state-value network
    (critic), and updates both after every environment step using the
    one-step TD error (Sutton & Barto, section 13.5, "One-step Actor-Critic").

    Args:
        env: gym-style environment; ``reset()`` must return a state and
            ``step(action)`` must return ``(next_state, reward, done, info)``.
        n_epi: number of training episodes.
        max_steps: maximum number of steps per episode.
        gamma: discount factor.
        plot_freq: plot the training status every ``plot_freq`` episodes.
        state_dim: size of the state vector.
        action_dim: number of discrete actions.
        hidden: hidden layer width of both networks.
        policy_learning_rate: Adam learning rate of the policy.
        critic_learning_rate: Adam learning rate of the critic.
        device: torch device on which both networks live.
        save_mode: whether checkpoints of the policy are written.
        model_name: path prefix of the checkpoint files.
        saving_start_epi: checkpoints are only considered after this episode.
    """

    def __init__(self, env, n_epi, max_steps, gamma, plot_freq, state_dim, action_dim, hidden, policy_learning_rate, critic_learning_rate, device, save_mode, model_name, saving_start_epi):
        # environment parameter
        self.gamma = gamma
        self.env = env

        # training parameter
        self.device = device
        self.n_epi = n_epi
        self.max_steps = max_steps
        self.plot_freq = plot_freq
        self.frame_cnt = 0

        # network parameter
        self.p_lr = policy_learning_rate
        self.c_lr = critic_learning_rate
        self.policy = Policy(state_dim, action_dim, hidden).to(self.device)
        # Same network as the baseline variant; only the attribute name differs.
        self.critic = Critic(state_dim, action_dim, hidden).to(self.device)
        self.policy_optimizer = optim.Adam(self.policy.parameters(), lr=self.p_lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=self.c_lr)

        # log parameter
        self.avg = []
        self.scores = []
        self.p_losses = []
        self.c_losses = []
        self.save_mode = save_mode
        self.model_name = model_name
        self.saving_start = saving_start_epi

    def select_action(self, state):
        """Sample an action from the current policy.

        Returns:
            A tuple ``(action, log_prob, prob)``: the sampled action as a
            Python int, its log-probability (still attached to the policy
            graph so it can be back-propagated), and the full probability
            vector as a numpy array.
        """
        state_tensor = torch.FloatTensor(state).view(1, -1).to(self.device)
        prob = self.policy(state_tensor)
        cate_dist = Categorical(prob)
        action = cate_dist.sample()
        return action.item(), cate_dist.log_prob(action), prob.detach().cpu().numpy()

    def train(self):
        """Run ``n_epi`` training episodes, updating both networks every step."""
        for i_episode in range(1, self.n_epi + 1):
            score = 0
            n_step = 0
            p_loss_epi = 0
            c_loss_epi = 0

            state = self.env.reset()
            for _step in range(self.max_steps):
                action, log_prob, _prob = self.select_action(state)
                # Use the agent's own environment, not a module-level global.
                next_state, reward, done, _info = self.env.step(action)

                # The arguments are what one-step bootstrapping needs.
                # `done` masks out the next-state value on the terminal
                # transition, where it must not contribute to the target.
                # NOTE(review): if the env reports done on a time-limit
                # truncation, the bootstrap term is dropped there too.
                p_loss, c_loss = self._update_networks(
                    state, next_state, reward, int(done), log_prob, n_step)
                p_loss_epi += p_loss
                c_loss_epi += c_loss

                state = next_state
                score += reward
                n_step += 1
                self.frame_cnt += 1
                if done:
                    break

            self.scores.append(score)
            self.p_losses.append(p_loss_epi)
            self.c_losses.append(c_loss_epi)
            if i_episode % self.plot_freq == 0:
                self._plot_status(i_episode, self.p_losses, self.c_losses, self.scores)

            # Must run before the current average is appended to self.avg,
            # so the new average is compared against previous ones only.
            if self.save_mode and i_episode > self.saving_start:
                self._save_model()
            self.avg.append(np.mean(self.scores[-10:]))

    def test(self, model_path):
        """Load a saved policy checkpoint and run one rendered episode.

        Args:
            model_path: path of a ``.pt`` file holding a policy state dict.

        Returns:
            The list of frames returned by ``env.render(mode="rgb_array")``.
        """
        # map_location lets a checkpoint saved on a GPU load on this device.
        self.policy.load_state_dict(torch.load(model_path, map_location=self.device))
        self.policy.eval()

        state = self.env.reset()
        done = False
        score = 0
        accum_frames = []
        while not done:
            accum_frames.append(self.env.render(mode="rgb_array"))
            # No gradients are needed for evaluation.
            with torch.no_grad():
                action, _log_prob, _prob = self.select_action(state)
            state, reward, done, _info = self.env.step(action)
            score += reward

        print("score: ", score)
        self.env.close()

        return accum_frames

    def _update_networks(self, state, next_state, reward, done, log_prob, n_step):
        """Apply one one-step Actor-Critic update to the policy and the critic.

        Corresponds to "One-step Actor-Critic" in Sutton & Barto, section 13.5.

        Args:
            state: state in which the action was taken.
            next_state: state reached after the action.
            reward: reward received for the transition.
            done: 1 if the transition ended the episode, else 0.
            log_prob: log-probability of the taken action (with grad).
            n_step: index of the step within the episode; the policy loss is
                scaled by ``gamma ** n_step``.

        Returns:
            ``[policy_loss, critic_loss]`` as Python floats.
        """
        state_t = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        next_state_t = torch.FloatTensor(next_state).unsqueeze(0).to(self.device)

        critic_value = self.critic(state_t)
        # The bootstrap target is a constant w.r.t. the parameters.
        with torch.no_grad():
            next_value = self.critic(next_state_t)
        target_value = reward + (1 - done) * self.gamma * next_value
        delta = (target_value - critic_value).detach()

        p_loss = -log_prob * delta * (self.gamma ** n_step)
        # Semi-gradient TD(0): the gradient of this loss is -delta * grad(v).
        c_loss = -critic_value * delta

        self.policy_optimizer.zero_grad()
        p_loss.backward()
        self.policy_optimizer.step()

        self.critic_optimizer.zero_grad()
        c_loss.backward()
        self.critic_optimizer.step()

        return [p_loss.item(), c_loss.item()]

    def _save_model(self):
        """Save the policy when the mean of the last 10 scores is a new best."""
        last_mean = np.mean(self.scores[-10:])
        # `default` keeps this from raising while no average is recorded yet.
        best_so_far = max(self.avg, default=float("-inf"))
        if best_so_far < last_mean:
            torch.save(self.policy.state_dict(), self.model_name+f'Score_{round(last_mean, 3)}.pt')

    def _plot_status(self, i_episode, p_losses, c_losses, score_hist):
        """Plot scores, policy losses and critic losses in three stacked panels."""
        subplot_params = [
            (311, f"Scores in episode_{i_episode}", score_hist),
            (312, f"Policy loss in episode:{i_episode}", p_losses),
            (313, f"Critic loss in episode:{i_episode}", c_losses),
        ]

        clear_output(True)
        plt.figure(figsize=(20, 20), facecolor='w')
        for loc, title, values in subplot_params:
            plt.subplot(loc)
            plt.title(f'Frame:{self.frame_cnt} '+title)
            plt.plot(values)
        plt.show()

In [None]:
# Prefer the second GPU when the machine has one; otherwise fall back to the
# first GPU, and finally to the CPU. Requesting "cuda:1" unconditionally fails
# on single-GPU machines even though CUDA is available.
if torch.cuda.device_count() > 1:
    device = torch.device("cuda:1")
elif torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

env_name_list = ["CartPole-v0", "MountainCar-v0", "LunarLander-v2"]
env_name = env_name_list[0]
env = gym.make(env_name)

# Checkpoints are only considered after this many episodes.
saving_start_epi = 100

# Folder where models are saved; save_mode decides whether to save at all.
save_mode = True
model_save_folder = './model_save'
# exist_ok avoids the check-then-create race of os.path.exists + os.mkdir.
os.makedirs(model_save_folder, exist_ok=True)
model_name = f"./{model_save_folder}/Discrete_Actor_Critic_{env_name}_"

state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

print("Environment: ", env_name)
print("State Dimension:", state_dim, " Action Dimension:", action_dim)

# training schedule
n_epi = 100000
max_steps = 1000
gamma = 0.99
plot_freq = 10

# network size and learning rates
hidden = 32
P_learning_rate = 0.001
C_learning_rate = 0.01

In [6]:
# Build the agent; keyword arguments make each of the 14 settings explicit.
agent = Agent(
    env=env,
    n_epi=n_epi,
    max_steps=max_steps,
    gamma=gamma,
    plot_freq=plot_freq,
    state_dim=state_dim,
    action_dim=action_dim,
    hidden=hidden,
    policy_learning_rate=P_learning_rate,
    critic_learning_rate=C_learning_rate,
    device=device,
    save_mode=save_mode,
    model_name=model_name,
    saving_start_epi=saving_start_epi,
)

In [None]:
# Run the full training loop (n_epi episodes); plots refresh every plot_freq episodes.
agent.train() 

In [None]:
# Load a checkpoint written during training and roll out one rendered episode.
# NOTE(review): the score in the file name is hard-coded; adjust it to a
# checkpoint that actually exists in the save folder.
model_path = f"{model_name}Score_200.0.pt"
frames = agent.test(model_path)

import imageio
from IPython.display import Video

# Encode the collected frames as a video and display it inline.
video_path = './test1.mp4'
imageio.mimwrite(video_path, frames, fps=30)
Video(video_path, width=480, height=360)

#### 하단의 그래프는 모두 CartPole 환경에서의 결과입니다. hyperparameter에 따라 결과가 많이 다르다는 것을 확인할 수 있습니다.
#### 그리고 REINFORCE보다 학습이 더 빠르지 않다는 것을 보실 수 있는데요, 최적의 hyperparameter를 찾기가 쉽지 않았습니다.

    n_epi = 100000 
    max_steps = 1000
    gamma = 0.99
    plot_freq = 10

    hidden = 32
    policy_learning_rate = 0.0001
    critic_learning_rate = 0.003

![image.png](attachment:image.png)

    n_epi = 100000 
    max_steps = 1000
    gamma = 0.99
    plot_freq = 10

    hidden = 128
    policy_learning_rate = 0.001
    critic_learning_rate = 0.01

![image.png](attachment:image.png)

    n_epi = 100000 
    max_steps = 1000
    gamma = 0.99
    plot_freq = 10

    hidden = 128
    policy_learning_rate = 0.0001
    critic_learning_rate = 0.0001

![image.png](attachment:image.png)

    n_epi = 100000 
    max_steps = 1000
    gamma = 0.99
    plot_freq = 10

    hidden = 64
    policy_learning_rate = 0.0001
    critic_learning_rate = 0.001

![image.png](attachment:image.png)