## Initial Setting

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.distributions as distributions

import matplotlib.pyplot as plt
import numpy as np
import gym

# Base seed shared by every random number generator used in this notebook.
SEED = 1234

# Two separate CartPole instances: one for collecting training episodes and
# one for evaluating the policy.
train_env = gym.make('CartPole-v1')
test_env = gym.make('CartPole-v1')

# The evaluation environment is seeded differently (SEED + 1) from the
# training environment.
train_env.seed(SEED);
test_env.seed(SEED + 1);

# Seed the global NumPy and PyTorch generators as well.
np.random.seed(SEED);
torch.manual_seed(SEED);

## Vanilla Multi-Layer Perceptron

In [5]:
class MLP(nn.Module):
    """Fully connected network with one hidden layer and dropout.

    Args:
        input_dim: size of each input feature vector.
        hidden_dim: number of units in the hidden layer.
        output_dim: number of values produced per input.
        dropout: drop probability handed to ``nn.Dropout``.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, dropout = 0.5):
        super().__init__()

        # Submodules are registered in the order fc_1, fc_2, dropout.
        self.fc_1 = nn.Linear(input_dim, hidden_dim)
        self.fc_2 = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return ``fc_2(relu(dropout(fc_1(x))))``; no softmax is applied here."""
        hidden = F.relu(self.dropout(self.fc_1(x)))
        return self.fc_2(hidden)


In [6]:
# Width of the hidden layer of the policy network.
HIDDEN_DIM = 128

# Input and output sizes are read from the training environment's
# observation and action spaces.
INPUT_DIM = train_env.observation_space.shape[0]
OUTPUT_DIM = train_env.action_space.n

policy = MLP(input_dim = INPUT_DIM, hidden_dim = HIDDEN_DIM, output_dim = OUTPUT_DIM)


In [7]:
def init_weights(m):
    """Initialise a linear layer with Xavier-normal weights and a zero bias.

    Meant to be passed to ``module.apply``. Only modules whose type is
    exactly ``nn.Linear`` are modified; every other module is left untouched.

    Args:
        m: the module to (possibly) initialise in place.
    """
    if type(m) is nn.Linear:
        # Xavier (Glorot) normal initialisation of the weight matrix.
        nn.init.xavier_normal_(m.weight)
        # A layer created with ``bias=False`` has ``m.bias is None``; skip it
        # instead of failing on the missing tensor.
        if m.bias is not None:
            nn.init.zeros_(m.bias)

In [8]:
# Run init_weights on the policy and each of its submodules; only nn.Linear
# layers are changed by it.
policy.apply(init_weights)

MLP(
  (fc_1): Linear(in_features=4, out_features=128, bias=True)
  (fc_2): Linear(in_features=128, out_features=2, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [9]:
# Step size used by the Adam optimizer that updates the policy parameters.
LEARNING_RATE = 0.01

optimizer = optim.Adam(params = policy.parameters(), lr = LEARNING_RATE)

### train

In [11]:
def train(env, policy, optimizer, discount_factor):
    """Run one training episode and apply one policy update.

    Actions are sampled from the softmax of the policy's outputs. When the
    episode ends, the collected rewards are turned into returns by
    ``calculate_returns`` and handed, together with the log-probabilities of
    the sampled actions, to ``update_policy``.

    Args:
        env: environment exposing ``reset()`` and a ``step(action)`` that
            returns a 4-tuple ``(state, reward, done, info)``.
        policy: module mapping a batched state tensor to action scores.
        optimizer: optimizer forwarded to ``update_policy``.
        discount_factor: discount forwarded to ``calculate_returns``.

    Returns:
        ``(loss, episode_reward)``: the value returned by ``update_policy``
        and the sum of all rewards collected during the episode.
    """
    # Put the policy in training mode before collecting the episode.
    policy.train()

    step_log_probs = []
    rewards = []

    observation = env.reset()

    while True:
        # Add a leading batch dimension of size 1.
        state = torch.FloatTensor(observation).unsqueeze(0)

        probs = F.softmax(policy(state), dim = -1)

        # Sample the action according to the predicted probabilities.
        dist = distributions.Categorical(probs)
        action = dist.sample()

        # Keep the log-probability of the sampled action; the policy update
        # differentiates through it.
        step_log_probs.append(dist.log_prob(action))

        observation, reward, done, _ = env.step(action.item())
        rewards.append(reward)

        if done:
            break

    log_prob_actions = torch.cat(step_log_probs)

    # Discounted return for every step of the episode.
    returns = calculate_returns(rewards, discount_factor)

    # One gradient update from this episode.
    loss = update_policy(returns, log_prob_actions, optimizer)

    return loss, sum(rewards)

In [12]:
def calculate_returns(rewards, discount_factor, normalize = True, eps = 1e-8):
    """Compute the discounted return for every step of an episode.

    For step ``t`` the return is
    ``rewards[t] + discount_factor * rewards[t + 1] + discount_factor**2 * ...``.

    Args:
        rewards: sequence of per-step rewards (ints or floats), in episode order.
        discount_factor: multiplier applied to the return of the following step.
        normalize: if true, and the episode has more than one step, shift the
            returns to zero mean and scale them by their standard deviation.
        eps: small constant added to the standard deviation so that constant
            returns do not cause a division by zero.

    Returns:
        1-D floating-point tensor with one return per reward. A single-step
        episode is returned unnormalised, because its sample standard
        deviation is undefined (NaN).
    """
    returns = []
    running = 0

    # Walk the episode backwards, appending in O(1), then restore episode order.
    for reward in reversed(rewards):
        running = reward + running * discount_factor
        returns.append(running)
    returns.reverse()

    # Force a floating dtype: integer rewards would otherwise give a Long
    # tensor, on which mean()/std() are not defined.
    returns = torch.tensor(returns, dtype = torch.get_default_dtype())

    if normalize and returns.numel() > 1:
        returns = (returns - returns.mean()) / (returns.std() + eps)

    return returns

In [13]:
def update_policy(returns, log_prob_actions, optimizer):
    """Perform one optimizer step on the policy-gradient loss.

    The loss is ``-(returns * log_prob_actions).sum()``, with ``returns``
    detached so that it acts as a constant weight.

    Args:
        returns: tensor of per-step returns.
        log_prob_actions: tensor of log-probabilities of the actions taken;
            gradients flow through this tensor.
        optimizer: optimizer holding the parameters to update.

    Returns:
        The loss value as a Python float.
    """
    # No gradient should flow through the returns.
    weights = returns.detach()

    objective = (weights * log_prob_actions).sum()
    loss = -objective

    # Clear gradients left over from the previous update.
    optimizer.zero_grad()

    # Populate .grad on every parameter that requires gradients.
    loss.backward()

    # Apply a single optimizer step.
    optimizer.step()

    return loss.item()

In [15]:
def evaluate(env, policy):
    """Run one episode, always taking the highest-probability action.

    Args:
        env: environment exposing ``reset()`` and a ``step(action)`` that
            returns a 4-tuple ``(state, reward, done, info)``.
        policy: module mapping a batched state tensor to action scores.

    Returns:
        The sum of the rewards collected during the episode.
    """
    # Switch the policy to evaluation mode.
    policy.eval()

    total_reward = 0
    observation = env.reset()

    while True:
        # Add a leading batch dimension of size 1.
        state = torch.FloatTensor(observation).unsqueeze(0)

        # No gradients are needed when only selecting actions.
        with torch.no_grad():
            probs = F.softmax(policy(state), dim = -1)

        # Pick the most probable action instead of sampling.
        greedy_action = torch.argmax(probs, dim = -1)

        observation, reward, done, _ = env.step(greedy_action.item())
        total_reward += reward

        if done:
            break

    return total_reward

In [17]:
MAX_EPISODES = 500       # maximum number of training episodes
DISCOUNT_FACTOR = 0.99   # discount passed to train()
N_TRIALS = 25            # number of most recent episodes averaged for the running means
REWARD_THRESHOLD = 475   # stop once the mean test reward reaches this value
PRINT_EVERY = 10         # print progress every this many episodes

train_rewards = []
test_rewards = []

# Episodes are numbered from 1; the upper bound is MAX_EPISODES + 1 so that
# exactly MAX_EPISODES episodes run (range() excludes its stop value).
for episode in range(1, MAX_EPISODES + 1):

    # One training episode with a policy update, then one greedy test episode.
    loss, train_reward = train(train_env, policy, optimizer, DISCOUNT_FACTOR)

    test_reward = evaluate(test_env, policy)

    train_rewards.append(train_reward)
    test_rewards.append(test_reward)

    # Running means over the last N_TRIALS episodes (fewer at the start).
    mean_train_rewards = np.mean(train_rewards[-N_TRIALS:])
    mean_test_rewards = np.mean(test_rewards[-N_TRIALS:])

    if episode % PRINT_EVERY == 0:

        print(f'| Episode: {episode:3} | Mean Train Rewards: {mean_train_rewards:5.1f} | Mean Test Rewards: {mean_test_rewards:5.1f} |')

    # Stop early once the running mean test reward reaches the threshold.
    if mean_test_rewards >= REWARD_THRESHOLD:

        print(f'Reached reward threshold in {episode} episodes')

        break

| Episode:  10 | Mean Train Rewards:  26.3 | Mean Test Rewards:  12.5 |
| Episode:  20 | Mean Train Rewards:  35.3 | Mean Test Rewards:  39.0 |
| Episode:  30 | Mean Train Rewards:  41.8 | Mean Test Rewards:  48.2 |
| Episode:  40 | Mean Train Rewards:  56.0 | Mean Test Rewards:  75.0 |
| Episode:  50 | Mean Train Rewards:  63.2 | Mean Test Rewards:  97.9 |
| Episode:  60 | Mean Train Rewards:  70.5 | Mean Test Rewards: 161.2 |
| Episode:  70 | Mean Train Rewards:  76.3 | Mean Test Rewards: 202.8 |
| Episode:  80 | Mean Train Rewards:  80.9 | Mean Test Rewards: 229.1 |
| Episode:  90 | Mean Train Rewards:  97.1 | Mean Test Rewards: 293.7 |
| Episode: 100 | Mean Train Rewards: 122.2 | Mean Test Rewards: 238.6 |
| Episode: 110 | Mean Train Rewards: 175.4 | Mean Test Rewards: 291.9 |
| Episode: 120 | Mean Train Rewards: 253.7 | Mean Test Rewards: 361.5 |
| Episode: 130 | Mean Train Rewards: 347.1 | Mean Test Rewards: 444.6 |
| Episode: 140 | Mean Train Rewards: 271.5 | Mean Test Rewards: 