In [41]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import random

In [42]:
# Select the tensor constructors once, based on whether CUDA is available,
# so the rest of the notebook can build tensors on the right device.
use_cuda = torch.cuda.is_available()
if use_cuda:
  FloatTensor = torch.cuda.FloatTensor
  LongTensor = torch.cuda.LongTensor
  ByteTensor = torch.cuda.ByteTensor
else:
  FloatTensor = torch.FloatTensor
  LongTensor = torch.LongTensor
  ByteTensor = torch.ByteTensor

In [43]:
class TransitionHistory():
  """Fixed-capacity ring buffer of (s0, a0, r, s1) transitions.

  Once `max_count` transitions have been stored, each new transition
  overwrites the oldest slot.
  """

  def __init__(self, max_count = 1000):
    """Create an empty buffer holding at most `max_count` transitions."""
    self.transitions = []
    self.max_count = max_count
    self.loc_pointer = 0

  def clear(self):
    """Drop every stored transition and rewind the write position."""
    self.transitions = []
    self.loc_pointer = 0

  def add_transition(self, s0, a0, r, s1):
    """Store one transition at the current write position, then advance it."""
    slot = self.loc_pointer
    buffer = self.transitions
    # Grow the list by one slot while the buffer is still filling up.
    if slot >= len(buffer):
      buffer.append(None)
    buffer[slot] = (s0, a0, r, s1)
    self.loc_pointer = slot + 1
    # Wrap around once the write position reaches the capacity.
    if not self.loc_pointer < self.max_count:
      self.loc_pointer = self.loc_pointer % self.max_count

  def get_sample_transition(self, batch_size):
    """Return `batch_size` distinct stored transitions chosen at random."""
    return random.sample(self.transitions, batch_size)

In [44]:
class SNN(nn.Module):
  """Three-layer fully connected network: input -> 128 -> 64 -> output.

  The first hidden layer uses ReLU followed by dropout (p=0.5), the second
  uses ReLU, and the output layer goes through a sigmoid, so every output
  value lies in the open interval (0, 1).
  """

  def __init__(self, input_size, output_size):
    """Build the three linear layers.

    Args:
      input_size: number of input features per sample.
      output_size: number of output values per sample.
    """
    super(SNN,self).__init__()
    self.l1_linear = nn.Linear(input_size, 128)
    self.l2_linear = nn.Linear(128, 64)
    self.l3_linear = nn.Linear(64, output_size)

  def forward(self,x):
    """Run a forward pass and return the sigmoid-activated outputs."""
    # F.dropout defaults to training=True; pass the module flag so that
    # dropout is switched off after .eval() and outputs become deterministic.
    out = F.dropout(F.relu(self.l1_linear(x)), p=0.5, training=self.training)
    out = F.relu(self.l2_linear(out))
    # torch.sigmoid replaces the deprecated F.sigmoid.
    out = torch.sigmoid(self.l3_linear(out))
    return out

In [45]:
class SQN():
  """Epsilon-greedy Q-learning agent backed by an SNN and a replay buffer.

  A state handled by the agent is a pair [previous_observation,
  current_observation]. The network input is the concatenation of the
  previous observation, the current observation and their difference, which
  is why the network is built with state_size * 3 inputs.
  """
  _epsilon = 0.2  # probability of picking a random action
  _gamma = 0.9    # discount applied to the next state's best Q value

  def __init__(self, state_size, output_size, lr=1, momentum=0.9):
    """Create the Q network, its optimizer and a private replay buffer.

    Args:
      state_size: number of values in a single observation.
      output_size: number of discrete actions.
      lr: SGD learning rate (default keeps the original value of 1).
      momentum: SGD momentum (default keeps the original value of 0.9).
    """
    self.output_size = output_size
    self.state_size = state_size
    # Created per instance: a class-level buffer would be shared by every
    # agent, mixing transitions from unrelated agents and environments.
    self.transition_history = TransitionHistory()
    self.Q = SNN(state_size * 3, output_size)
    if use_cuda:
      self.Q.cuda()
    self.optimizer = torch.optim.SGD(self.Q.parameters(), lr=lr, momentum=momentum)

  @staticmethod
  def _features(state):
    """Flatten a [previous, current] pair into [previous, current, current - previous]."""
    previous, current = state[0], state[1]
    return np.append(previous, [current, current - previous])

  def epsilon(self):
    """Return the exploration probability used by pick_action."""
    return self._epsilon

  def predict(self, state):
    """Return the list of Q values, one per action, for a [previous, current] state."""
    s = Variable(FloatTensor([self._features(state)]))
    # Evaluate in eval mode so layers that honour the training flag do not
    # add noise to greedy action selection; restore the previous mode after.
    was_training = self.Q.training
    self.Q.eval()
    try:
      action_value = self.Q(s)
    finally:
      self.Q.train(was_training)
    return action_value.data.tolist()[0]

  def pick_action(self, state):
    """Pick a random action with probability epsilon(), else the highest-valued one."""
    if random.random() < self.epsilon():
      return random.randint(0, self.output_size -1)
    action_value = self.predict(state)
    return action_value.index(max(action_value))

  def update_Q(self, batch):
    """Run one optimizer step on a batch of (s0, a0, r, s1) transitions.

    A transition whose s1 is None is terminal and contributes only its reward
    to the target. An empty batch is ignored.
    """
    # Q learning, Q(s,a) = Q(s,a) + alpha * [reward + gamma * max(Q(s')) - Q(s,a)]
    if not batch:
      return

    (state, action, reward, next_state) = tuple(zip(*batch))
    state = [self._features(s) for s in state]
    non_final = [s is not None for s in next_state]

    state_batch = Variable(FloatTensor(state))
    action_batch = Variable(LongTensor(list(action)))
    reward_batch = Variable(FloatTensor(reward))
    state_action_values = self.Q(state_batch).gather(1, action_batch.view(-1,1))

    # Terminal transitions keep a next-state value of zero.
    next_state_values = Variable(torch.zeros(len(batch)).type(FloatTensor))
    # Skip the lookup when every transition is terminal: there would be no
    # next states to feed the network.
    if any(non_final):
      # Use the device-aware alias so the mask lives on the same device as
      # the values it indexes.
      non_final_mask = ByteTensor(non_final)
      non_final_next_states = Variable(
        FloatTensor([self._features(s) for s in next_state if s is not None]),
        volatile=True)
      next_state_values[non_final_mask] = self.Q(non_final_next_states).max(1)[0]

    expected_state_action_values = (next_state_values * self._gamma) + reward_batch
    # Re-wrap the raw data so the target is a constant with no graph history.
    expected_state_action_values = Variable(expected_state_action_values.view(-1,1).data)

    loss = F.smooth_l1_loss(state_action_values, expected_state_action_values)
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

  def train(self, env, episode, iter_per_episode = 100, batch_size = 32):
    """Play `episode` episodes in `env`, learning from replayed transitions.

    Args:
      env: environment exposing reset() and step(action) returning
        (observation, reward, done, info).
      episode: number of episodes to play.
      iter_per_episode: number of update_Q calls after each episode.
      batch_size: number of transitions sampled per update.

    Every 100 episodes the average length of the last 10 episodes is printed.
    """
    episode_length = []
    for i in range(episode):
      s0 = env.reset()
      # No previous observation exists at the start, so pair it with itself.
      s0 = [s0, s0]
      a0 = self.pick_action(s0)
      episode_ended = False
      step = 0
      while not episode_ended:
        (s1, reward, episode_ended, info) = env.step(a0)
        s1 = [s0[1],s1]
        step += 1
        if episode_ended:
          episode_length.append(step)
          # The final step is stored with reward -1 and no next state.
          reward = -1
          s1 = None
          a1 = None
        else:
          a1 = self.pick_action(s1)
        self.transition_history.add_transition(s0,a0,reward,s1)
        s0 = s1
        a0 = a1
      # Only learn once the buffer holds more than one batch of transitions.
      if len(self.transition_history.transitions) > batch_size:
        for j in range(iter_per_episode):
          batch = self.transition_history.get_sample_transition(batch_size)
          self.update_Q(batch)
      if (i + 1) % 100 == 0:
        episode_length = episode_length[-10:]
        print(i+1,': last 10 episode avg length:', sum(episode_length)/ 10)

In [46]:
# Create the CartPole-v1 environment and an agent built with
# state_size=4 and output_size=2.
env = gym.make('CartPole-v1')
agent = SQN(4,2)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [47]:
# Train for 1000 episodes with iter_per_episode=1000 replay updates after
# each episode; batch_size keeps its default of 32.
agent.train(env, 1000, 1000)

100 : last 10 episode avg length: 9.6
200 : last 10 episode avg length: 11.2
300 : last 10 episode avg length: 11.7
400 : last 10 episode avg length: 10.9
500 : last 10 episode avg length: 9.3
600 : last 10 episode avg length: 11.1


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/Users/ju/anaconda2/envs/pytorch36/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2910, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-47-04186ca7d761>", line 1, in <module>
    agent.train(env, 1000, 1000)
  File "<ipython-input-45-1b88c04ba0a3>", line 87, in train
    self.update_Q(batch)
  File "<ipython-input-45-1b88c04ba0a3>", line 46, in update_Q
    state_action_values = self.Q(state_batch).gather(1, action_batch.view(-1,1))
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/ju/anaconda2/envs/pytorch36/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 1828, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'KeyboardInterrupt' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (mo

KeyboardInterrupt: 

In [8]:
agent.transition_history.get_sample_transition(1)

[([array([-0.03419475, -0.03714792,  0.04386825,  0.02219254]),
   array([-0.03419475, -0.03714792,  0.04386825,  0.02219254])],
  1,
  1.0,
  [array([-0.03419475, -0.03714792,  0.04386825,  0.02219254]),
   array([-0.03493771,  0.15731836,  0.0443121 , -0.25633312])])]

In [79]:
agent.Q(Variable(FloatTensor([s])))

NameError: name 's' is not defined

In [117]:
agent.predict(Variable(FloatTensor([s])))

Variable containing:
 0.5310  0.4690
[torch.FloatTensor of size 1x2]

In [92]:
v = Variable(FloatTensor([0.1,0.2]))

In [106]:
type(v)

torch.autograd.variable.Variable

In [161]:
x = [0,1,2]
y = [3,4,5]

In [162]:
x.extend(y)

In [163]:
# x[3:] builds a new list, so this extends a temporary copy that is then
# discarded; x itself is left unchanged.
x[3:].extend(y)

In [164]:
x1 = x[3:]

In [165]:
x1.extend(y)
x1

[3, 4, 5, 3, 4, 5]

In [158]:
x2

In [149]:
a = env.action_space

In [150]:
a.n

2

In [198]:
# NOTE(review): this feeds 8 features, but SQN builds Q with state_size * 3
# inputs (12 for SQN(4,2)); the output shown below presumably comes from an
# earlier definition of the network — confirm by re-running on a fresh kernel.
p = agent.Q(Variable(FloatTensor([[0,1,2,3,4,5,6,7]])))

In [201]:
p.data.tolist()

[[0.5245333909988403, 0.47546666860580444]]