[View in Colaboratory](https://colab.research.google.com/github/julianw/learn_ml/blob/master/pytorch_cartpole_sqn.ipynb)

## Shallow neural network for Q function approximation
The following 3 cells are intended to be used in the Google Colab environment (https://colab.research.google.com/), which allows free GPU usage.

In [0]:
#!kill -9 -1

In [0]:
#!pip3 install torch==0.4.0 torchvision==0.2.1 > /dev/null

In [0]:
#!pip3 install gym > /dev/null

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import numpy as np
import random
import os
import gym
from google.colab import files

In [0]:
# Pick the tensor constructors once, depending on whether CUDA is available,
# so the rest of the notebook can build tensors without re-checking the device.
use_cuda = torch.cuda.is_available()
if use_cuda:
  FloatTensor = torch.cuda.FloatTensor
  LongTensor = torch.cuda.LongTensor
  ByteTensor = torch.cuda.ByteTensor
else:
  FloatTensor = torch.FloatTensor
  LongTensor = torch.LongTensor
  ByteTensor = torch.ByteTensor

In [0]:
def download(filename):
  """Pass `filename` to `files.download` from `google.colab`.

  NOTE(review): presumably this makes the browser download the file from the
  Colab VM -- confirm against the Colab documentation.
  """
  files.download(filename)

def save_torch_model(model, filename):
  """Save `model.state_dict()` to `filename`, creating the parent directory if needed.

  Args:
    model: object exposing `state_dict()` (e.g. an `nn.Module`).
    filename: destination path; may be a bare file name with no directory part.
  """
  directory = os.path.dirname(filename)
  # dirname is '' for a bare file name and os.makedirs('') raises, so only
  # create a directory when the path actually contains one. exist_ok avoids
  # the check-then-create race of a separate os.path.exists test.
  if directory:
    os.makedirs(directory, exist_ok=True)
  torch.save(model.state_dict(), filename)

def load_torch_model(model, filename):
  """Load a state dict saved with `torch.save` from `filename` into `model`.

  Args:
    model: object exposing `load_state_dict()` (e.g. an `nn.Module`).
    filename: path of the saved state dict.
  """
  map_location = None
  if not torch.cuda.is_available():
    # A checkpoint written on a GPU machine stores CUDA tensors; without a
    # remap torch.load fails on a CPU-only machine. Returning the storage
    # unchanged keeps every tensor on the CPU.
    map_location = lambda storage, loc: storage
  model.load_state_dict(torch.load(filename, map_location=map_location))


## Replay memory 
Stores every step the agent takes as it performs actions in an environment (in this case, CartPole-v0).

In [0]:
class ReplayMemory():
  """Fixed-capacity store of transitions that overwrites the oldest entry when full.

  `transitions` grows until it holds `memory_size` items; from then on
  `loc_pointer` wraps around and new items replace existing ones in order.
  """

  def __init__(self, memory_size = 1000):
    self.memory_size = memory_size
    self.transitions = []
    self.loc_pointer = 0

  def clear(self):
    """Drop every stored transition and rewind the write position."""
    self.loc_pointer = 0
    self.transitions = []

  def add(self, step_tuple):
    """Store one transition: (state, action, reward, next_state, ended)."""
    slot = self.loc_pointer
    if slot < len(self.transitions):
      # buffer already has an entry here: overwrite it
      self.transitions[slot] = step_tuple
    else:
      # still filling up: the write position is the end of the list
      self.transitions.append(step_tuple)
    self.loc_pointer = slot + 1
    if self.loc_pointer >= self.memory_size:
      # wrap around to the start once capacity is reached
      self.loc_pointer = self.loc_pointer % self.memory_size

  def get_sample(self, batch_size):
    """Return `batch_size` distinct stored transitions chosen at random."""
    stored = self.transitions
    return random.sample(stored, batch_size)

## Define the neural network
Keep in mind that this NN is used for Q-value approximation. The range of its output values depends on the environment, so don't blindly add an activation function at the output layer.

In [0]:
class NN(nn.Module):
  """Bias-free two-layer network: input -> 128 ReLU units -> linear output.

  The output layer has no activation, so the outputs are unbounded.
  """

  def __init__(self, input_size, output_size):
    super(NN, self).__init__()
    hidden_units = 128
    # Each layer is created and then immediately re-initialised with Kaiming
    # normal weights; this order is kept so random numbers are drawn in the
    # same sequence as before.
    self.l1_linear = nn.Linear(input_size, hidden_units, bias=False)
    nn.init.kaiming_normal_(self.l1_linear.weight)
    self.l2_linear = nn.Linear(hidden_units, output_size, bias=False)
    nn.init.kaiming_normal_(self.l2_linear.weight)

  def forward(self, x):
    """Return the raw output-layer values for input batch `x`."""
    hidden = F.relu(self.l1_linear(x))
    return self.l2_linear(hidden)

In [0]:
class SQN():
  """Shallow Q-network agent: epsilon-greedy Q-learning with experience replay.

  The Q function is approximated by `NN`. Its input is the previous and the
  current observation concatenated (hence `state_size * 2` inputs) and it
  produces one Q value per action.
  """
  _epsilon = 0.2  # probability of taking a random action in pick_action
  _gamma = 0.9    # discount applied to the bootstrapped next-state value

  def __init__(self, env, state_size = None, output_size = None):
    """Build the Q network and an empty replay memory.

    Args:
      env: environment object; only stored on the instance here.
      state_size: length of a single observation vector.
      output_size: number of actions (size of the network output).
    """
    self.env = env
    self.output_size = output_size
    self.state_size = state_size
    # One buffer per agent. As a class attribute it was shared (and mutated)
    # by every SQN instance, mixing transitions from unrelated agents.
    self.replay_memory = ReplayMemory()
    self.Q = NN(state_size * 2, output_size)
    if use_cuda:
      self.Q.cuda()

  def epsilon(self):
    """Return the exploration probability used by pick_action."""
    return self._epsilon

  def predict(self, state):
    """Return the list of Q values (one per action) for a single `state`."""
    s = Variable(FloatTensor([state]))
    # inference only: no autograd graph is needed
    with torch.no_grad():
      action_value = self.Q(s)
    return action_value.data.tolist()[0]

  def pick_action(self, state):
    """Epsilon-greedy choice: random action with probability epsilon, else argmax Q."""
    if random.random() < self.epsilon():
      return random.randint(0, self.output_size - 1)
    action_value = self.predict(state)
    return action_value.index(max(action_value))

  def update_Q(self, batch):
    """Run one optimiser step on a batch of transitions.

    `batch` is a sequence of (state, action, reward, next_state, ended)
    tuples. `self.optimizer` is created by `train`, so `train` must have
    been entered before this method is called.
    """
    # Q learning, Q(s,a) = Q(s,a) + alpha * [reward + gamma * max(Q(s')) - Q(s,a)]
    # Target of the Q function is the one-step Bellman equation "reward + gamma * max(Q(s'))"
    # so error is [target - current estimation] = [reward + gamma * max(Q(s')) - Q(s,a)]

    (state, action, reward, next_state, ended) = tuple(zip(*batch))

    var_state = Variable(FloatTensor(state))
    var_action = Variable(LongTensor(action))
    var_ended = Variable(FloatTensor(ended))
    var_reward = Variable(FloatTensor(reward))
    var_next_state = Variable(FloatTensor(next_state))

    # current estimation, take the Q value of the action performed
    state_action_values = self.Q(var_state).gather(1, var_action.view(-1,1))

    # Target. Detached explicitly so no gradient flows through the bootstrap
    # term. If an episode ended at this step, only the reward is used as
    # there is no next state.
    next_state_values = self.Q(var_next_state).max(1)[0].detach()
    target_values = var_reward + (1 - var_ended) * self._gamma * next_state_values

    loss = F.mse_loss(state_action_values, target_values.view(-1,1))
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

  def train(self, env, episode, iter_per_episode = 100, batch_size = 32, lr=1e-3, max_steps = 200):
    """Play `episode` episodes in `env`, training the Q network after each one.

    Args:
      env: environment providing `reset()` and `step(action)`.
      episode: number of episodes to play.
      iter_per_episode: number of `update_Q` calls after each episode.
      batch_size: number of transitions sampled per update.
      lr: Adam learning rate.
      max_steps: an episode ending before this many steps is treated as a
        failure (terminal reward 0.0); otherwise the terminal reward is 1.0.
        The default of 200 matches the previously hard-coded limit.

    Every 100 episodes the network is saved under `model/`, and the
    "best" checkpoint is refreshed when that episode's step count beats the
    best seen at earlier checkpoints.
    """
    self.optimizer = torch.optim.Adam(self.Q.parameters(), lr=lr, weight_decay=1e-3)
    best_score = 0
    for i in range(episode):
      s0 = env.reset()
      # the network input is (previous observation, current observation)
      state = np.append(s0,s0)
      episode_ended = False
      step = 0
      while not episode_ended:
        action = self.pick_action(state)
        (s1, reward, episode_ended, info) = env.step(action)
        next_state = np.append(s0,s1)
        step += 1
        if episode_ended:
          ended = 1
          if step < max_steps:
            # pole tipped over
            reward = 0.0
          else:
            # environment terminated as max step reached
            reward = 1.0
        else:
          ended = 0
        self.replay_memory.add((state,action,reward,next_state,ended))
        s0 = s1
        state = next_state

      if (i + 1) % 100 == 0:
        if step > best_score:
          save_torch_model(self.Q, 'model/cartpole_sqn_best.pth')
          best_score = step
        save_torch_model(self.Q,'model/cartpole_sqn_iter_%d.pth' %(i+1))
        # longer the better, that means the agent can keep the pole up for a longer period
        print(i+1,': pole tip over at step:', step)

      # only start learning once more than one batch worth of transitions exists
      if len(self.replay_memory.transitions) > batch_size:
        for j in range(iter_per_episode):
          batch = self.replay_memory.get_sample(batch_size)
          self.update_Q(batch)


In [105]:
# Create the CartPole-v0 environment and an agent for it: each observation
# has 4 values and the network outputs 2 Q values (one per action).
env = gym.make('CartPole-v0')
agent = SQN(env, state_size=4, output_size=2)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [106]:
# Run 600 episodes; after each episode (once the replay memory holds more than
# 64 transitions) update the Q network 50 times, each on a batch of 64 sampled transitions.
agent.train(env, episode=600, iter_per_episode=50, batch_size=64, lr=5e-4)

100 : periods the agent can keep the pole up: 14
200 : periods the agent can keep the pole up: 45
300 : periods the agent can keep the pole up: 41
400 : periods the agent can keep the pole up: 164
500 : periods the agent can keep the pole up: 200
600 : periods the agent can keep the pole up: 155
