In [1]:
import math
import random

import gym
from envs import env_list

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Normal

import numpy as np

from ddpg.policynetwork import PolicyNetwork
from ddpg.replay_buffer import ReplayBuffer
from ddpg.ou_noise import OUNoise

from model import ModelOptimizer, Model

from IPython.display import clear_output
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import torch
import torch.autograd as autograd
from torch.autograd import Variable
from torch.autograd.gradcheck import zero_gradients

def compute_jacobian(inputs, output, create_graph=False):
    """
    Compute the per-sample Jacobian of ``output`` with respect to ``inputs``.

    One reverse-mode pass is run per output column. The passes go through
    ``torch.autograd.grad`` rather than ``output.backward``, so:

    * no ``.grad`` attribute is touched (neither on ``inputs`` nor on any other
      leaf in the graph, e.g. the parameters of the network that produced
      ``output``);
    * ``inputs`` may be a non-leaf tensor;
    * the ``zero_gradients`` helper, which no longer exists in recent PyTorch
      releases, is not needed.

    The result assumes samples in the batch are independent of each other
    (row ``b`` of ``output`` depends only on row ``b`` of ``inputs``).

    :param inputs: Batch X Size (e.g. Depth X Width X Height); must require grad
    :param output: Batch X Classes, computed from ``inputs``
    :param create_graph: if True the returned Jacobian stays attached to the
        autograd graph so it can be differentiated again
    :return: jacobian: Batch X Classes X Size
    """
    assert inputs.requires_grad

    num_classes = output.size()[1]

    if num_classes == 0:
        # Nothing to differentiate; keep the documented Batch X Classes X Size layout.
        empty_shape = (inputs.size(0), 0) + tuple(inputs.size()[1:])
        return torch.zeros(empty_shape, dtype=inputs.dtype, device=inputs.device)

    rows = []
    for i in range(num_classes):
        # One-hot selector over the output columns (same dtype/device as output).
        grad_output = torch.zeros_like(output)
        grad_output[:, i] = 1
        (grad_i,) = torch.autograd.grad(
            output,
            inputs,
            grad_outputs=grad_output,
            retain_graph=True,  # the graph is reused for every output column
            create_graph=create_graph,
            allow_unused=True,
        )
        if grad_i is None:
            # This output column does not depend on ``inputs`` at all.
            grad_i = torch.zeros_like(inputs)
        rows.append(grad_i)

    # Stack along dim 1 so the layout is Batch X Classes X Size.
    return torch.stack(rows, dim=1)

class DeterministicCtrl(object):
    """
    Receding-horizon controller that keeps a sequence of ``T - 1`` actions and
    improves it with one gradient step through ``model`` on every call.

    ``model`` must expose ``num_states``, ``num_actions`` and
    ``step(x, u) -> (next_state, reward)``. ``policy`` is stored but not used
    by the current implementation.
    """

    def __init__(self, model, policy, T=10, eps=1e-1):
        """
        :param model: dynamics/reward model (see class docstring)
        :param policy: policy network; kept for reference only
        :param T: planning horizon; the action sequence has ``T - 1`` rows
        :param eps: step size of the gradient update applied to the actions
        :raises ValueError: if ``T < 2`` (the action sequence would be empty)
        """
        if T < 2:
            raise ValueError("T must be at least 2, got {}".format(T))

        self.model = model
        self.policy = policy
        self.T = T
        self.eps = eps

        self.num_states = model.num_states
        self.num_actions = model.num_actions

        self.reset()

    def reset(self):
        """Reset the stored action sequence to all zeros."""
        self.u = torch.zeros(self.T - 1, self.num_actions, requires_grad=True)

    def __call__(self, state):
        """
        Shift the stored action sequence by one step, roll the model forward
        from ``state``, apply one gradient step to every action and return the
        first action.

        :param state: current observation, anything ``torch.FloatTensor`` accepts
        :return: NumPy array holding the first action of the updated sequence;
            it is a copy, so later calls do not modify it
        """
        with torch.no_grad():
            # Warm start: drop the action that was just executed and pad with zeros.
            self.u[:-1] = self.u[1:].clone()
            self.u[-1].zero_()

            # Roll the model forward, recording the state visited at each step.
            x_t = torch.FloatTensor(state).unsqueeze(0)
            x = []
            for t in range(self.T - 1):
                x.append(x_t.clone())
                x_t, _ = self.model.step(x_t, self.u[t].unsqueeze(0))

        # Re-evaluate the model on the whole trajectory at once, this time with
        # autograd enabled, to obtain the per-step derivatives.
        x = torch.cat(x)
        x.requires_grad = True

        pred_state, pred_rew = self.model.step(x, self.u)

        dfdx = compute_jacobian(x, pred_state)
        dfdu = compute_jacobian(self.u, pred_state)
        dldx = compute_jacobian(x, pred_rew)
        dldu = compute_jacobian(self.u, pred_rew)

        with torch.no_grad():
            # Backward sweep: propagate the costate ``rho`` from the end of the
            # horizon and step each action along the resulting direction.
            # NOTE(review): rho is updated before it is used for u[t], so the
            # action update sees the costate of step t rather than t + 1 --
            # confirm this is the intended discretization.
            rho = torch.zeros(1, self.num_states)
            for t in reversed(range(self.T - 1)):
                rho = dldx[t] + rho.mm(dfdx[t])
                self.u[t] = self.u[t] + self.eps * (dldu[t] + rho.mm(dfdu[t]))

        # Copy: ``.numpy()`` alone would share memory with ``self.u``, which is
        # shifted in place on the next call.
        return self.u[0].detach().numpy().copy()

In [3]:
# Build the pendulum task and read the observation / action sizes from its spaces.
env = env_list['PendulumEnv']()
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]

# Exploration noise is configured from the environment's action space.
ou_noise = OUNoise(env.action_space)

In [4]:
# Tunable settings for data collection and model fitting.
batch_size = 128
frame_skip = 2

# Networks: the policy handed to the controller and the learned model it plans with.
policy = PolicyNetwork(
    num_inputs=state_dim,
    num_actions=action_dim,
    hidden_size=128,
)
model = Model(state_dim, action_dim)

# Transition storage shared by data collection and the model optimizer.
replay_buffer = ReplayBuffer(10000)

# Controller built on the model (default horizon and step size).
ctrl = DeterministicCtrl(model, policy)

# Optimizer wrapper that fits the model from replayed transitions.
model_optim = ModelOptimizer(model, replay_buffer, lr=1e-3)

In [5]:
# Collect experience with the model-based controller and fit the model online.
for k in range(200):
    state = env.reset()
    ou_noise.reset()
    ctrl.reset()

    for t in range(200):
        # Controller output plus exploration noise.
        action = ou_noise.get_action(ctrl(state))

        # Repeat the action for `frame_skip` environment steps, but stop as soon
        # as the episode ends instead of stepping a finished environment.
        # NOTE(review): only the reward of the last sub-step is stored; confirm
        # whether rewards should be summed over the skipped frames.
        for _skip in range(frame_skip):
            next_state, reward, done, _info = env.step(action.copy())
            if done:
                break

        replay_buffer.push(state, action, reward, next_state, done)

        # Start fitting the model once more than one batch of data is available.
        if len(replay_buffer) > batch_size:
            model_optim.update_model(batch_size)
        env.render()

        state = next_state
        if done:
            break


KeyboardInterrupt: 

In [6]:
# Inspect the controller's stored action sequence as left by the interrupted run above.
ctrl.u

tensor([[ 0.6080],
        [ 0.4241],
        [ 0.2985],
        [ 0.1276],
        [-0.0007],
        [ 0.0101],
        [ 0.0597],
        [-0.0292],
        [-0.0062]], requires_grad=True)

In [32]:
# Scratch tensor for checking how in-place row shifts interact with autograd.
x = torch.randn(5, 2).requires_grad_()

In [33]:
# Show the freshly sampled tensor before it is modified.
x

tensor([[ 0.1072, -0.4898],
        [-0.0619, -2.1170],
        [-2.1056, -0.1492],
        [-0.2882, -0.9707],
        [ 2.0131,  0.5740]], requires_grad=True)

In [34]:
# Shift every row up by one, in place. This is the same update
# DeterministicCtrl.__call__ applies to its action sequence, except that here
# it runs outside torch.no_grad().
x[:-1] = x[1:].clone()

In [38]:
# Zero the last row in place, completing the shift.
x[-1].zero_()

tensor([0., 0.], grad_fn=<AsStridedBackward>)

In [36]:
# Show x after the shift. The recorded output carries a grad_fn, i.e. the
# in-place edits were tracked by autograd and x is no longer a leaf tensor.
x

tensor([[-0.0619, -2.1170],
        [-2.1056, -0.1492],
        [-0.2882, -0.9707],
        [ 2.0131,  0.5740],
        [ 0.0000,  0.0000]], grad_fn=<CopySlices>)