In [2]:
import math
import random
import time
from jupyterthemes import jtplot
jtplot.style()

import gym
gym.logger.set_level(40)
import numpy as np


import nnabla as nn
import nnabla.logger as logger
import nnabla.functions as F
import nnabla.parametric_functions as PF
import nnabla.solver as S
from nnabla.contrib.context import extension_context
from nnabla.monitor import Monitor, MonitorSeries, MonitorTimeElapsed

2018-08-10 20:13:44,164 [nnabla][INFO]: Initializing CPU extension...


In [3]:
from IPython.display import clear_output
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
epsilon_start = 1.0
epsilon_final = 0.01
epsilon_decay = 500
epsilon_by_frame = lambda frame_idx: epsilon_final + (epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)

In [5]:
from collections import deque
class ReplayBuffer(object):
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)
    
    def push(self, state, action, reward, next_state, done):
        state      = np.expand_dims(state, 0)
        next_state = np.expand_dims(next_state, 0)
        self.buffer.append((state, action, reward, next_state, done))
    
    def sample(self, batch_size):
        state, action, reward, next_state, done = zip(*random.sample(self.buffer, batch_size))
        return np.concatenate(state), action, reward, np.concatenate(next_state), done
    
    def __len__(self):
        return len(self.buffer)

In [6]:
hidden_size = 128
max_frames = 20000
state_size = 1
batch_size = 32
replay_buffer_size = 1000
gamma = 0.99
learning_rate = 1e-3

In [7]:
class DQN:
    def __init__(self, num_states, num_actions):
        self.state_dim = num_states
        self.action_dim = num_actions 
        self.state = nn.Variable([batch_size, self.state_dim])
    
    def forward(self, x):
        with nn.parameter_scope("DQN"):
            with nn.parameter_scope("affine1"):
                h = F.relu(PF.affine(x, hidden_size))
            with nn.parameter_scope("affine2"):
                h = F.relu(PF.affine(h, hidden_size))
            with nn.parameter_scope("affine3"):
                y = PF.affine(h, self.action_dim)
        return y
    
    def act(self, state, epsilon):
        if random.random() > epsilon:
            self.state.d = state
            q_value = self.forward(self.state)
#            print(q_value.d)
            action  = np.argmax(q_value.d)
#            print("action:",action)
        else:
            action = random.randrange(self.action_dim)
        return action

In [8]:
env = gym.make("CartPole-v0")
print("state_num ",env.observation_space.shape[0])
print("action_num ",env.action_space.n)

model = DQN(env.observation_space.shape[0], env.action_space.n)
replay_buffer = ReplayBuffer(replay_buffer_size)

DQN_solver = S.Adam(learning_rate)
with nn.parameter_scope("DQN"):
    DQN_solver.set_parameters(nn.get_parameters())

losses = []
overall_rewards = []
episode_reward = 0

state_num  4
action_num  2


In [9]:
def compute_td_loss(batch_size):
    state, action, reward, next_state, done = replay_buffer.sample(batch_size)
    State = nn.Variable.from_numpy_array(state)
    Next_State = nn.Variable.from_numpy_array(next_state)
    Action = nn.Variable(action)
    Reward = nn.Variable(reward)
    Done = nn.Variable(done)
#    print("action",action)
    
    q_values = model.forward(State)
#    print("q_values.d",q_values.d)
    q_values = np.array([q_values.d[i,action[i]] for i in range(batch_size)])
#    print("q_values",q_values)

    next_q_values = model.forward(Next_State)
    print("next_q_values.d",next_q_values.d)
    next_q_values = next_q_values.d.max(axis=1)
#    print("next_q_values", next_q_values)
    reward = np.array(reward)
    done = np.array(done)
#    print("Reward",reward)
#    print("next_q_value",next_q_values)
    expected_q_value = reward + gamma * next_q_values * (1 - done)
#    print("type",type(q_values),type(expected_q_value))
    q_values = nn.Variable.from_numpy_array(q_values)
    expected_q_value = nn.Variable.from_numpy_array(expected_q_value)
#    print("type",type(q_values),type(expected_q_value))
    DQN_loss = F.squared_error(q_values, expected_q_value)
    
    DQN_solver.zero_grad()
    DQN_loss.forward()
    DQN_loss.backward()
    DQN_solver.update()
    return DQN_loss

In [10]:
def plot(frame, rewards, losses):
    clear_output(True)
    plt.figure(figsize=(20,5))
    plt.subplot(131)
    plt.title('frame %s. reward: %s' % (frame, np.mean(rewards[-10:])))
    plt.plot(rewards)
    plt.subplot(132)
    plt.title('loss')
#    plt.plot(losses)
    plt.show()

In [11]:
state = env.reset()
for frame in range(1, max_frames):
    epsilon = epsilon_by_frame(frame)
#    print(state,frame)
    action = model.act(state, epsilon)
    next_state, reward, done, _ = env.step(action)
    replay_buffer.push(state, action, reward, next_state, done)
    state = next_state
    episode_reward += reward   
    env.render()
    if done:
        print("done")
        state = env.reset()
        overall_rewards.append(episode_reward)
        episode_reward = 0
        
    if len(replay_buffer) > batch_size:
        loss = compute_td_loss(batch_size)
        print("loss",loss)
#        losses.append(loss.data[0])
#        print(loss.data[0])
    if frame % 200 == 0:
        plot(frame, overall_rewards, losses)


2018-08-10 20:13:47,616 [root][INFO]: Generating grammar tables from /usr/lib/python3.5/lib2to3/Grammar.txt
2018-08-10 20:13:47,644 [root][INFO]: Generating grammar tables from /usr/lib/python3.5/lib2to3/PatternGrammar.txt


done
done


AssertionError: 43 (<class 'numpy.int64'>) invalid

In [None]:
X = np.arange(24).reshape(12,2)

In [None]:
X

In [None]:
Y = np.array([0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1])

In [None]:
Y

In [None]:
X[0,Y[0]]

In [None]:
X[1,Y[1]]

In [None]:
Z = np.array([X[i,Y[i]] for i in range(12)])
#for i in range(12):
#    print (X[i,Y[i]])
Z

In [None]:
X[i,Y[i]]
i=1
X[i,Y[i]]