In [21]:
import math
import random
import time
from jupyterthemes import jtplot
jtplot.style()

import gym
gym.logger.set_level(40)
import numpy as np


import nnabla as nn
import nnabla.logger as logger
import nnabla.functions as F
import nnabla.parametric_functions as PF
import nnabla.solver as S
from nnabla.contrib.context import extension_context
from nnabla.monitor import Monitor, MonitorSeries, MonitorTimeElapsed

In [22]:
from IPython.display import clear_output
import matplotlib.pyplot as plt
%matplotlib inline

In [23]:
epsilon_start = 1.0
epsilon_final = 0.01
epsilon_decay = 500
epsilon_by_frame = lambda frame_idx: epsilon_final + (epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)

In [24]:
from collections import deque
class ReplayBuffer(object):
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)
    
    def push(self, state, action, reward, next_state, done):
        state      = np.expand_dims(state, 0)
        next_state = np.expand_dims(next_state, 0)
        self.buffer.append((state, action, reward, next_state, done))
    
    def sample(self, batch_size):
        state, action, reward, next_state, done = zip(*random.sample(self.buffer, batch_size))
        return np.concatenate(state), action, reward, np.concatenate(next_state), done
    
    def __len__(self):
        return len(self.buffer)

In [211]:
hidden_size = 128
max_frames = 2000
state_size = 1
batch_size = 32
replay_buffer_size = 1000
gamma = 0.99
learning_rate = 1e-3

In [212]:
class DQN:
    def __init__(self, num_states, num_actions):
        self.state_dim = num_states
        self.action_dim = num_actions 
        self.state = nn.Variable([state_size, self.state_dim])
    
    def forward(self, x):
        with nn.parameter_scope("DQN"):
            with nn.parameter_scope("affine1"):
                h = F.relu(PF.affine(x, hidden_size))
            with nn.parameter_scope("affine2"):
                h = F.relu(PF.affine(h, hidden_size))
            with nn.parameter_scope("affine3"):
                y = PF.affine(h, self.action_dim)
        return y
    
    def act(self, state, epsilon):
        if random.random() > epsilon:
            self.state.d = state
            q_value = self.forward(self.state)
            print(q_value.d)
            action  = np.argmax(q_value.d)
            print("action:",action)
        else:
            action = random.randrange(self.action_dim)
        return action
    


In [213]:
env = gym.make("CartPole-v0")
print("state_num ",env.observation_space.shape[0])
print("action_num ",env.action_space.n)

model = DQN(env.observation_space.shape[0], env.action_space.n)
replay_buffer = ReplayBuffer(replay_buffer_size)

DQN_solver = S.Adam(learning_rate)
with nn.parameter_scope("DQN"):
    DQN_solver.set_parameters(nn.get_parameters())

losses = []
overall_rewards = []
episode_reward = 0

state_num  4
action_num  2


In [214]:
def compute_td_loss(batch_size):
    state, action, reward, next_state, done = replay_buffer.sample(batch_size)
#    state      = Variable(torch.FloatTensor(np.float32(state)))
#    next_state = Variable(torch.FloatTensor(np.float32(next_state)), volatile=True)
#    action     = Variable(torch.LongTensor(action))
#    reward     = Variable(torch.FloatTensor(reward))
#    done       = Variable(torch.FloatTensor(done))
    state, action, reward, next_state, done = replay_buffer.sample(batch_size)
    State = nn.Variable.from_numpy_array(state)
    Next_State = nn.Variable.from_numpy_array(next_state)
    Action = nn.Variable(action)
    Reward = nn.Variable(reward)
    Done = nn.Variable(done)
#    print("action",action)
    
    q_values = model.forward(State)
#    print("q_values.d",q_values.d)
    q_values = np.array([q_values.d[i,action[i]] for i in range(batch_size)])
#    print("q_values",q_values)

    next_q_values = model.forward(Next_State)
#    print("next_q_values.d",next_q_values.d)
    next_q_values = next_q_values.d.max(axis=1)
#    print("next_q_values", next_q_values)
    reward = np.array(reward)
    done = np.array(done)
#    print("Reward",reward)
#    print("next_q_value",next_q_values)
    expected_q_value = reward + gamma * next_q_values * (1 - done)
#    print("type",type(q_values),type(expected_q_value))
    q_values = nn.Variable.from_numpy_array(q_values)
    expected_q_value = nn.Variable.from_numpy_array(expected_q_value)
#    print("type",type(q_values),type(expected_q_value))
    DQN_loss = F.squared_error(q_values, expected_q_value)
    
    DQN_solver.zero_grad()
    DQN_loss.forward()
    DQN_loss.backward()
    DQN_solver.update()
    
    return DQN_loss

In [215]:
state = env.reset()
for frame in range(1, max_frames):
    epsilon = epsilon_by_frame(frame)
    print(state,frame)
    action = model.act(state, epsilon)
    next_state, reward, done, _ = env.step(action)
    replay_buffer.push(state, action, reward, next_state, done)
    state = next_state
    episode_reward += reward   
    if done:
        print("done")
        state = env.reset()
        overall_rewards.append(episode_reward)
        episode_reward = 0
        
    if len(replay_buffer) > batch_size:
        loss = compute_td_loss(batch_size)
        losses.append(loss.data[0])
        
    if frame % 200 == 0:
        plot(frame, all_rewards, losses)


[ 0.005859   -0.01475264  0.03807998 -0.0389824 ] 1
[ 0.00556395  0.17980314  0.03730033 -0.3194117 ] 2
[ 0.00916001 -0.01582963  0.0309121  -0.01520289] 3
[ 0.00884342  0.17883567  0.03060804 -0.29797465] 4
[ 0.01242013 -0.01670892  0.02464855  0.00420221] 5
[ 0.01208596 -0.21217553  0.02473259  0.30455901] 6
[ 0.00784244 -0.40764105  0.03082377  0.60493827] 7
[-3.10376432e-04 -6.03180191e-01  4.29225369e-02  9.07168399e-01] 8
[-0.01237398 -0.40866481  0.0610659   0.62827941] 9
[-0.02054728 -0.60458347  0.07363149  0.93955184] 10
[-0.03263895 -0.80061661  0.09242253  1.25443291] 11
[-0.04865128 -0.60679171  0.11751119  0.99206996] 12
[-0.06078711 -0.41342148  0.13735259  0.73848381] 13
[-0.06905554 -0.22043656  0.15212226  0.49198756] 14
[-0.07346427 -0.41734004  0.16196201  0.82848357] 15
[-0.08181107 -0.61426169  0.17853169  1.16740925] 16
[-0.09409631 -0.42185396  0.20187987  0.93559492] 17
done
[-0.01409046  0.03988075 -0.03985705  0.03111267] 18
[-0.01329285  0.23555093 -0.039234



[-0.0723585  -0.16443244  0.044674    0.0981869 ] 68
[-0.07564715 -0.36016525  0.04663774  0.40462297] 69
[-0.08285046 -0.5559165   0.05473019  0.71163711] 70
[-0.09396879 -0.36159342  0.06896294  0.43667157] 71
[-0.10120066 -0.55762034  0.07769637  0.75027205] 72
[-0.11235306 -0.36365107  0.09270181  0.48301567] 73
[-0.11962608 -0.55995074  0.10236212  0.80341621] 74
[-0.1308251  -0.36637013  0.11843045  0.54460715] 75
[-0.1381525  -0.56293973  0.12932259  0.8721334 ] 76
[-0.1494113  -0.36979119  0.14676526  0.62274488] 77
[-0.15680712 -0.17699034  0.15922016  0.37964697] 78
[-0.16034693 -0.37397274  0.16681309  0.71799637] 79
[-0.16782638 -0.570962    0.18117302  1.05819456] 80
[-0.17924562 -0.76796093  0.20233691  1.40183048] 81
done
[-0.01337337  0.01679464 -0.03163923 -0.04130588] 82
[-0.01303748  0.21235567 -0.03246534 -0.34380099] 83
[-0.00879037  0.01771026 -0.03934136 -0.06152986] 84
[-0.00843616  0.21337354 -0.04057196 -0.36636107] 85
[[7.309675e-37 0.000000e+00]]
action: 0
[

NameError: name 'plot' is not defined

In [30]:
X = np.arange(24).reshape(12,2)

In [31]:
X

array([[ 0,  1],
       [ 2,  3],
       [ 4,  5],
       [ 6,  7],
       [ 8,  9],
       [10, 11],
       [12, 13],
       [14, 15],
       [16, 17],
       [18, 19],
       [20, 21],
       [22, 23]])

In [101]:
Y = np.array([0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1])

In [102]:
Y

array([0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1])

In [103]:
X[0,Y[0]]

0

In [104]:
X[1,Y[1]]

3

In [123]:
Z = np.array([X[i,Y[i]] for i in range(12)])
#for i in range(12):
#    print (X[i,Y[i]])
Z

array([ 0,  3,  4,  7,  8, 11, 12, 15, 16, 19, 20, 23])

In [92]:
X[i,Y[i]]
i=1
X[i,Y[i]]

3