# DDPG
Deep Deterministic Policy Gradient
同时吸收了actor-critic单步更新和DQN价值导向的精华，合成为一个新算法。
 - deep: 借鉴value-based（DQN）的思想，即利用一个经验池和两个结构相同的深度神经网络来促进学习。
 - Deterministic Policy Gradient: 相对于Policy Gradient改变了动作输出的过程，从“生成动作分布并采样”改为在连续动作空间上直接输出一个确定的动作值。

## 1. env & utils

In [1]:
import gym
import numpy as np
import torch
from torch.autograd import Variable
import random
from collections import deque
import torch.nn as nn
from torch.optim import Adam
from torch.autograd import Variable
import matplotlib.pyplot as plt

torch.manual_seed(28)
%matplotlib inline

In [3]:
# Training budget: number of episodes and the cap on steps per episode
# (both are consumed by the training loop's two `range(...)` calls).
EPOCHS = 500
MAX_STEPS = 2000
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

env = gym.make("BipedalWalker-v3")    # action space exposes .shape and .high, i.e. a vector of bounded real values
# Sizes of the observation and action vectors, read from the environment.
S_DIM = env.observation_space.shape[0]
A_DIM = env.action_space.shape[0]
# Upper bound of the FIRST action component only.
# NOTE(review): assumes every action component shares this bound -- confirm.
A_MAX = env.action_space.high[0]

print(' State Dimensions : ', S_DIM)
print(' Action Dimensions : ', A_DIM)
print(' Action Max : ', A_MAX)

 State Dimensions :-  24
 Action Dimensions :-  4
 Action Max :-  1.0


In [None]:
import gc  # needed by gc.collect() below; not among the notebook's top-level imports

# Main training loop.
#
# Runs EPOCHS episodes of at most MAX_STEPS environment steps. On every step
# the agent picks an exploratory action, the transition is stored in the
# replay buffer `ram`, and `DDPG.learn()` performs one optimisation step.
#
# NOTE(review): `env`, `DDPG` and `ram` must be created in other cells before
# this one runs -- confirm the cell order.
# NOTE(review): this uses the classic gym API (`reset()` returns only the
# observation, `step()` returns a 4-tuple) -- confirm the installed version.
# NOTE(review): an earlier draft used the exploitation policy on every 5th
# episode for validation; that path is currently disabled.
for epoch in range(EPOCHS):
    obs = env.reset()
    for step in range(MAX_STEPS):

        # env.render()
        state = np.float32(obs)

        action = DDPG.get_exploration_action(state)

        obs_next, reward, done, info = env.step(action)

        if done:
            # The terminal transition is deliberately not stored: the buffer
            # keeps (s1, a, r, s2) tuples and there is no next state here.
            new_state = None
        else:
            new_state = np.float32(obs_next)
            # push this experience into the replay buffer
            ram.add(state, action, reward, new_state)

        obs = obs_next

        # perform one optimisation step
        DDPG.learn()
        if done:
            break

    # force a garbage-collection pass at the end of every episode
    gc.collect()


In [None]:
class Replay_buffer:
    """Fixed-capacity FIFO store of ``(s1, a, r, s2)`` transitions.

    Transitions are kept in a ``deque`` with ``maxlen=N``, so once ``N``
    items are stored every new transition evicts the oldest one.
    """

    def __init__(self, N):
        """Create an empty buffer that holds at most ``N`` transitions."""
        self.capacity = N
        # Number of transitions currently stored (never exceeds capacity).
        self.counter = 0
        self.buf = deque(maxlen=self.capacity)

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buf)

    def add(self, s1, a, r, s2):
        """Append one transition, evicting the oldest when the buffer is full.

        Args:
            s1: state before the action.
            a: action taken.
            r: reward received.
            s2: state after the action.
        """
        self.buf.append((s1, a, r, s2))
        # The deque enforces the capacity; mirror its length in `counter`.
        self.counter = len(self.buf)

    @staticmethod
    def _column(batch, index):
        """Stack field ``index`` of every transition into a float32 tensor."""
        # torch.FloatTensor(...) rejects generators, so build a real array
        # first; going through NumPy also avoids a slow per-element copy.
        stacked = np.asarray([t[index] for t in batch], dtype=np.float32)
        return torch.from_numpy(stacked)

    def sample(self, minibatch):
        """Draw a random batch of transitions without replacement.

        Args:
            minibatch: requested batch size; clipped to the number of
                stored transitions.

        Returns:
            Tuple ``(b_s1, b_a, b_r, b_s2)`` of float32 tensors whose first
            dimension is the number of sampled transitions.
        """
        batch_num = min(minibatch, len(self.buf))
        batch = random.sample(self.buf, batch_num)

        b_s1 = self._column(batch, 0)
        b_a = self._column(batch, 1)
        b_r = self._column(batch, 2)
        b_s2 = self._column(batch, 3)
        return b_s1, b_a, b_r, b_s2

class Critic(nn.Module):
    """Critic network: maps a batch of (state, action) pairs to one value each.

    The state and the action are encoded by separate branches (128 features
    each); the two encodings are concatenated and reduced to a single output.
    """

    def __init__(self, state_dim, action_dim):
        """Build the three sub-networks.

        Args:
            state_dim: width of the state vector.
            action_dim: width of the action vector.
        """
        super().__init__()

        self.state_dim, self.action_dim = state_dim, action_dim

        # State branch: state_dim -> 256 -> 128.
        self.feature_state = nn.Sequential(
            nn.Linear(state_dim, 256),
            nn.ReLU(inplace=True),
            nn.Linear(256, 128),
            nn.ReLU(inplace=True),
        )
        # Action branch: action_dim -> 128.
        self.feature_action = nn.Sequential(
            nn.Linear(action_dim, 128),
            nn.ReLU(inplace=True),
        )
        # Head over the concatenated 128 + 128 features: 256 -> 128 -> 1.
        self.value = nn.Sequential(
            nn.Linear(256, 128),
            nn.ReLU(inplace=True),
            nn.Linear(128, 1),
        )

    def forward(self, state, action):
        """Return the value estimate for each (state, action) row.

        Args:
            state: tensor of shape (batch, state_dim).
            action: tensor of shape (batch, action_dim).

        Returns:
            Tensor of shape (batch, 1).
        """
        # Join the two encodings along the feature axis.
        joint = torch.cat(
            (self.feature_state(state), self.feature_action(action)),
            dim=1,
        )
        return self.value(joint)


class Actor(nn.Module):
    """Actor network: maps a batch of states to bounded actions.

    The output passes through ``tanh`` and is multiplied by ``action_lim``,
    so every action component lies in ``[-action_lim, action_lim]``.
    """

    def __init__(self, state_dim, action_dim, action_lim):
        """Build the feature extractor and the action head.

        Args:
            state_dim: width of the state vector.
            action_dim: width of the action vector.
            action_lim: scale applied to the tanh output.
        """
        super().__init__()

        self.state_dim, self.action_dim = state_dim, action_dim
        self.action_lim = action_lim

        # Feature extractor: state_dim -> 256 -> 128 -> 64.
        self.feature = nn.Sequential(
            nn.Linear(state_dim, 256),
            nn.ReLU(inplace=True),
            nn.Linear(256, 128),
            nn.ReLU(inplace=True),
            nn.Linear(128, 64),
            nn.ReLU(inplace=True),
        )
        # Action head: 64 -> action_dim, squashed into (-1, 1).
        self.classify = nn.Sequential(
            nn.Linear(64, action_dim),
            nn.Tanh(),
        )

    def forward(self, state):
        """Return the action for each state row, scaled by ``action_lim``.

        Args:
            state: tensor whose last dimension is ``state_dim``.

        Returns:
            Tensor whose last dimension is ``action_dim``.
        """
        squashed = self.classify(self.feature(state))
        return squashed * self.action_lim
