# REINFORCE - Monte Carlo Policy Gradient Method for Cartpole

REINFORCE is a on-policy monte carlo method. It let the agent complete the rollout and use the accumulated reward to update the policy. The basic idea in the vanilla version of REINFORCE is to reduce the probability of actions that are involved in a bad rollout, and increase the probablity of actions in a good roll out. So over time, the good action will be separated from the bad action.

Sum of reward of a roll out $G = r_1 + \_gamma * r_2 + \_gamma^2 * r_3 + ... + \_gamma^{n-1} * r_n$
Then calculate the gradient for the probability of the action. If G is good, increase the probablility of the action, otherwise decrease it. 

It may seem obvious that equally assign blame to all actions in a bad roll out is not a good idea. There is an improved version of REINFORCE that try to estimate a baseline and assign blame / praise to action by comparing the reward to the base line.

Baseline $A = G - Baseline$  
Now update the action base on A instead of G.

The intuition with this is sometimes the rewards are bound to be bad after certain point and nothing the agent can do to improve that. For example in pong, when the ball pass the bar. There is nothing the agent can save that.

The usual choise of a baseline is a value function approximation. And it can be learnt by using the monte carlo reward collected by the policy. Then train a neural network with supervised learning.

In [1]:
import os
import gym
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.distributions import Categorical, Normal

In [2]:
use_cuda = torch.cuda.is_available()
FloatTensor = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if use_cuda else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if use_cuda else torch.ByteTensor

In [3]:
def save_torch_model(model, filename):
  if not os.path.exists(os.path.dirname(filename)):
    os.makedirs(os.path.dirname(filename))
  torch.save(model.state_dict(), filename)

def load_torch_model(model, filename):
  model.load_state_dict(torch.load(filename))


In [4]:
class PolicyNet_discret(nn.Module):
  def __init__(self, input_size, output_size):
    super(PolicyNet_discret,self).__init__()
    self.l1_linear = nn.Linear(input_size, 256)
    self.l2_linear = nn.Linear(256, 128)
    self.l3_linear = nn.Linear(128, output_size)
    nn.init.kaiming_normal_(self.l1_linear.weight)
    nn.init.kaiming_normal_(self.l2_linear.weight)
    self.l3_linear.weight.data.zero_()
    
  def forward(self,x):
    out = F.relu(self.l1_linear(x))
    out = F.relu(self.l2_linear(out))
    out = F.softmax(self.l3_linear(out),dim=0)
    return out

In [5]:
class ValueNet(nn.Module):
  def __init__(self, input_size):
    super(ValueNet,self).__init__()
    self.l1_linear = nn.Linear(input_size, 256)
    self.l2_linear = nn.Linear(256, 128)
    self.l3_linear = nn.Linear(128, 1)
    nn.init.kaiming_normal_(self.l1_linear.weight)
    nn.init.kaiming_normal_(self.l2_linear.weight)
    self.l3_linear.weight.data.zero_()

  def forward(self, x):
    out = F.relu(self.l1_linear(x))
    out = F.relu(self.l2_linear(out))
    out = self.l3_linear(out)
    return out    

In [76]:
class REINFORCE_wBaseline():
  def __init__(self, env, steps_in_state = 2):
    self.steps_in_state = steps_in_state
    self.policy = PolicyNet_discret(env.observation_space.shape[0] * steps_in_state,env.action_space.n)
    self.value = ValueNet(env.observation_space.shape[0] * steps_in_state)
    self.env = env
    self._gamma = 0.96
    
  def predict_value(self, state):
    return self.value(state)
  
  def predict_action(self, state):
    return self.policy(state)
  
  def pick_action(self, state):
    probs = self.predict_action(state)
    action_dist = Categorical(probs)
    action = action_dist.sample()
    action = action.item()
    return (action, action_dist.log_prob(FloatTensor([action])))
  
  def update_policy_and_value(self, episode):
    (states, actions, rewards, log_probs) = zip(*episode)
    
    MC_rewards = []
    R = 0
    for r in rewards[::-1]:
      R = r + self._gamma * R
      MC_rewards.insert(0, R)
      
    value_prediction = self.value(torch.stack(states))
    value_loss = F.mse_loss(value_prediction, FloatTensor(MC_rewards).view(-1,1))
    
    policy_loss = []
    for (log_prob, reward, baseline) in zip(log_probs, MC_rewards, value_prediction.view(-1).tolist()):
      policy_loss.append(-log_prob*(reward - baseline))
    policy_loss = torch.stack(policy_loss).sum()
    
    self.value_optimizer.zero_grad()
    value_loss.backward()
    self.value_optimizer.step()
    
    self.policy_optimizer.zero_grad()
    policy_loss.backward()
    self.policy_optimizer.step()

  def train(self, env, update_limit=1000, samples_per_update=200, lr=1e-3, lr_policy=None, lr_value=None, checkpoint=100):
    lr_policy = lr if lr_policy == None else lr_policy
    lr_value = lr if lr_value == None else lr_value
    self.policy_optimizer = torch.optim.Adam(self.policy.parameters(), lr=lr_policy, weight_decay=1e-3)
    self.value_optimizer = torch.optim.Adam(self.value.parameters(), lr=lr_value, weight_decay=1e-3)
    best_score = -99999
    running_score = None
    samples = []
    samples_per_update = samples_per_update
    update_count = 0
    while update_count < update_limit:
      s0 = env.reset()
      seq = [s0] * self.steps_in_state
      state = FloatTensor(seq).view(-1)
      episode_ended = False
      score = 0
      while not episode_ended:
        (action, log_prob) =  self.pick_action(state)
        (s1, reward, episode_ended, info) = env.step(action)
        samples.append((state, action, reward, log_prob))
        seq = seq[1:]
        seq.append(s1)
        next_state = FloatTensor(seq).view(-1)
        if episode_ended:
          ended = 1
        else:
          ended = 0
        s0 = s1
        state = next_state
        score += reward
        
      if running_score == None:
        running_score = score
      else:
        running_score = running_score * 0.9 + score * 0.1
        
      if len(samples) > samples_per_update:
        if (update_count + 1) % checkpoint == 0:
          is_best = False
          if running_score > best_score:
            is_best = True
            save_torch_model(self.policy, 'model/reinforce_cartpole_policy_best.pth')
            best_score = running_score
          save_torch_model(self.policy,'model/reinforce_cartpole_policy_iter_%d.pth' %(update_count+1))
          print('%d: running_score:%.2f, is_best:%s' %(update_count+1, running_score, is_best))
        update_count += 1
        self.update_policy_and_value(samples)
        samples = []
        

In [77]:
env = gym.make('CartPole-v0')
agent = REINFORCE_wBaseline(env)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [78]:
agent.train(env, update_limit=2000, samples_per_update=100, lr=1e-4, checkpoint=200)

200: running_score:24.56, is_best:True
400: running_score:25.27, is_best:True
600: running_score:25.59, is_best:True
800: running_score:25.76, is_best:True
1000: running_score:37.80, is_best:True
1200: running_score:48.21, is_best:True
1400: running_score:104.48, is_best:True
1600: running_score:154.74, is_best:True
1800: running_score:176.12, is_best:True
2000: running_score:195.02, is_best:True


In [79]:
class REINFORCE_wBaseline_MC():
  def __init__(self, env, steps_in_state = 2):
    self.steps_in_state = steps_in_state
    self.policy = PolicyNet_discret(env.observation_space.shape[0] * steps_in_state,env.action_space.n)
    self.value = ValueNet(env.observation_space.shape[0] * steps_in_state)
    self.env = env
    self._gamma = 0.96
    
  def predict_value(self, state):
    return self.value(state)
  
  def predict_action(self, state):
    return self.policy(state)
  
  def pick_action(self, state):
    probs = self.predict_action(state)
    action_dist = Categorical(probs)
    action = action_dist.sample()
    action = action.item()
    return (action, action_dist.log_prob(FloatTensor([action])))
  
  def update_policy_and_value(self, episode):
    (states, actions, rewards, log_probs) = zip(*episode)
    
    MC_rewards = []
    R = 0
    for r in rewards[::-1]:
      R = r + self._gamma * R
      MC_rewards.insert(0, R)
      
    value_prediction = self.value(torch.stack(states))
    value_loss = F.mse_loss(value_prediction, FloatTensor(MC_rewards).view(-1,1))
    
    policy_loss = []
    for (log_prob, reward, baseline) in zip(log_probs, MC_rewards, value_prediction.view(-1).tolist()):
      policy_loss.append(-log_prob*(reward - baseline))
    policy_loss = torch.stack(policy_loss).sum()
    
    self.value_optimizer.zero_grad()
    value_loss.backward()
    self.value_optimizer.step()
    
    self.policy_optimizer.zero_grad()
    policy_loss.backward()
    self.policy_optimizer.step()

  def train(self, env, update_limit=1000, samples_per_update=200, lr=1e-3, lr_policy=None, lr_value=None, checkpoint=100):
    lr_policy = lr if lr_policy == None else lr_policy
    lr_value = lr if lr_value == None else lr_value
    self.policy_optimizer = torch.optim.Adam(self.policy.parameters(), lr=lr_policy, weight_decay=1e-3)
    self.value_optimizer = torch.optim.Adam(self.value.parameters(), lr=lr_value, weight_decay=1e-3)
    best_score = -99999
    running_score = None
    samples = []
    samples_per_update = samples_per_update
    update_count = 0
    while update_count < update_limit:
      s0 = env.reset()
      seq = [s0] * self.steps_in_state
      state = FloatTensor(seq).view(-1)
      episode_ended = False
      score = 0
      while not episode_ended:
        (action, log_prob) =  self.pick_action(state)
        (s1, reward, episode_ended, info) = env.step(action)
        if reward > 0:
          print('hit goal!!!')
        reward = abs(s1[0] - 0.5)
        samples.append((state, action, reward, log_prob))
        seq = seq[1:]
        seq.append(s1)
        next_state = FloatTensor(seq).view(-1)
        if episode_ended:
          ended = 1
        else:
          ended = 0
        s0 = s1
        state = next_state
        score += reward
        
      if running_score == None:
        running_score = score
      else:
        running_score = running_score * 0.9 + score * 0.1
        
      if len(samples) > samples_per_update:
        if (update_count + 1) % checkpoint == 0:
          is_best = False
          if running_score > best_score:
            is_best = True
            save_torch_model(self.policy, 'model/reinforce_cartpole_policy_best.pth')
            best_score = running_score
          save_torch_model(self.policy,'model/reinforce_cartpole_policy_iter_%d.pth' %(update_count+1))
          print('%d: running_score:%.2f, is_best:%s' %(update_count+1, running_score, is_best))
        update_count += 1
        self.update_policy_and_value(samples)
        samples = []
        

In [83]:
env = gym.make('MountainCar-v0')
agent = REINFORCE_wBaseline_MC(env)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [84]:
agent.train(env, update_limit=2000, samples_per_update=500, lr=1e-4, checkpoint=200)

200: running_score:203.67, is_best:True
400: running_score:207.20, is_best:True
600: running_score:227.50, is_best:True
800: running_score:232.21, is_best:True
1000: running_score:233.59, is_best:True
1200: running_score:234.64, is_best:True
1400: running_score:235.61, is_best:True
1600: running_score:235.61, is_best:False
1800: running_score:235.38, is_best:False
2000: running_score:235.31, is_best:False


In [None]:
env = gym.make('KungFuMaster-ram-v0')
env = gym.make('Pendulum-v0')
def avg_reward_random(env, iteration):
  total_reward = 0.0
  for i in range(iteration):
    env.reset()
    ended = False
    while not ended:
      (state, reward, ended, info) = env.step(env.action_space.sample())
      reward = abs(state[0] - 0.5)
      total_reward += reward
  print (total_reward / iteration)
avg_reward_random(env, 50)