# 基本自动求导（Automatic Differentiation，AD）

自动求导有两种形式，其分别为前向模式（Forward Mode）和反向模式（Reverse Mode）

* Forward Mode依靠**二元数**（Dual Number），在前向运算过程中同时计算函数值和导数值
* Reverse Mode分为两个步骤，第一步为前向运算，第二步为反向传播

参考论文：[Automatic Differentiation in Machine Learning: a Survey](https://arxiv.org/pdf/1502.05767.pdf)

参考实现：[autograd-by-borgwang](https://github.com/borgwang/toys/blob/master/ml-autograd/autograd.ipynb)

参考解析：

1. [神经网络自动求导的设计与实现](https://zhuanlan.zhihu.com/p/82582926)

2. [tensorflow的函数自动求导是如何实现的](https://www.zhihu.com/question/54554389)

在这里，由于是对神经网络的分析，因此主要讨论自动求导的**Reverse Mode**

## AD实现要点

对于AD，其在运算过程中需要记录如下数据

* 当前运算的运算结果
* 当前运算的梯度计算方法
* 当前运算直接依赖的前置运算

## AD过程：

假设已完成前向计算，并保留了上述需要的数据

* 从整个运算的结果出发，累加当前运算的梯度，并通过各依赖对应的梯度计算方法，将梯度传递给当前运算直接依赖的前置运算
* 递归上述步骤，直到所有计算结束

## 基于AD的神经网络优化流程
* **运算定义**：若要实现AD，每一个运算不仅要计算结果，还需要对运算的前置依赖关系以及梯度计算方法进行记录
* **反向传播**：在完成正向计算后，需要从最终结果出发进行反向传播，得到各参数的梯度
* **参数更新**：在完成反向传播后，依靠梯度计算结果更新参数

In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [None]:
def as_tensor(obj):
    """Return ``obj`` as a ``MyTensor``.

    An existing ``MyTensor`` is returned unchanged (same object); anything
    else is passed to the ``MyTensor`` constructor with default arguments.
    """
    return obj if isinstance(obj, MyTensor) else MyTensor(obj)


class MyTensor(object):
    """Minimal reverse-mode automatic-differentiation tensor backed by NumPy.

    Each tensor stores:

    * ``values``: the result of the operation that produced it,
    * ``grad``: the gradient accumulated during ``backward()``,
    * ``dependency``: a list of ``{"tensor": parent, "grad_func": fn}`` dicts,
      one per direct parent that requires a gradient. ``fn`` maps the gradient
      arriving at this tensor to the gradient to hand to ``parent``.

    Implemented operators: ``+``, ``-`` and ``@`` (normal, reflected and
    in-place forms) plus unary ``-``.
    """

    def __init__(self, values, requires_grad=False, dependency=None):
        """Create a tensor.

        Args:
            values: Anything accepted by ``np.array``.
            requires_grad: Whether gradients are accumulated for this tensor.
            dependency: Optional list of dependency dicts (see class docstring);
                defaults to a fresh empty list.
        """
        self._values = np.array(values)
        self._shape = self._values.shape

        self.grad = None
        self.requires_grad = requires_grad
        if self.requires_grad:
            self.zero_grad()

        self.dependency = list() if dependency is None else dependency

    def zero_grad(self):
        """Reset the accumulated gradient to zeros of this tensor's shape."""
        self.grad = np.zeros(self._shape)

    @property
    def values(self):
        """The underlying NumPy array."""
        return self._values

    @property
    def shape(self):
        """Shape of the underlying NumPy array."""
        return self._shape

    @values.setter
    def values(self, values):
        """Replace the stored array and invalidate the accumulated gradient."""
        self._values = np.array(values)
        # Keep the cached shape in sync: an in-place matmul can change it.
        self._shape = self._values.shape
        self.grad = None

    def backward(self, grad=None):
        """Accumulate ``grad`` into this tensor and propagate it to its parents.

        Args:
            grad: Gradient of the final result with respect to this tensor.
                Defaults to an array of ones with this tensor's shape.

        Raises:
            AssertionError: If this tensor does not require a gradient.
        """
        assert self.requires_grad, "Call backward() on a non-requires-grad tensor"

        if grad is None:
            # Seed with ones of the tensor's own shape so that shape-sensitive
            # gradient functions (e.g. matmul) receive a well-formed array.
            grad = np.ones(self._shape)
        grad = np.array(grad)

        # The values setter clears the gradient; re-create it lazily.
        if self.grad is None:
            self.zero_grad()

        # Accumulate the incoming gradient.
        self.grad += grad

        # Propagate to every direct parent.
        #
        # Only the gradient that just arrived is forwarded, NOT the accumulated
        # total. The traversal is depth-first: each call immediately recurses
        # into the parents, so every path from the output to a parent is
        # visited exactly once.
        #
        # Example: this node feeds two downstream nodes.
        #   1. downstream node 1 calls backward here -> self.grad += grad_1,
        #      then the parent receives grad_1;
        #   2. downstream node 2 calls backward here -> self.grad += grad_2,
        #      then the parent receives grad_2.
        # Forwarding the accumulated value instead would give the parent
        # grad_1 + (grad_1 + grad_2), counting the first path twice.
        for dep in self.dependency:
            grad_for_dep = dep["grad_func"](grad)
            dep["tensor"].backward(grad_for_dep)

    # ------------------------------------------------------------------
    # Operator overloads. Each one computes the forward value and records
    # how to compute the gradient for its operands.
    #
    # Binary operations implemented below:
    #   matrix multiplication: normal, reflected, in-place
    #   addition:              normal, reflected, in-place
    #   subtraction:           normal, reflected, in-place
    # Unary operations implemented below:
    #   negation
    #
    # Element-wise multiplication, sigmoid, ReLU and LeakyReLU appear in the
    # original design notes but are not implemented in this class.
    # ------------------------------------------------------------------

    def __add__(self, other):
        """Return ``self + other``."""
        return self._add(self, as_tensor(other))

    def __radd__(self, other):
        """Return ``other + self``."""
        return self._add(as_tensor(other), self)

    def __iadd__(self, other):
        """Perform ``self += other``.

        In-place addition does not create a new graph node, so no gradient
        information is recorded for it.
        """
        self.values = self.values + as_tensor(other).values
        return self

    def _add(self, tensor1, tensor2):
        """Compute ``c = a + b``.

        Gradients: ``Dc/Da = 1`` and ``Dc/Db = 1``.

        Raises:
            RuntimeError: If the operand shapes differ (no broadcasting).
        """
        if tensor1.shape != tensor2.shape:
            raise RuntimeError("Add expects each tensor to be equal size, but got {} at entry 0 and {} at entry 1".format(tensor1.shape, tensor2.shape))
        # Operate on the raw arrays; using the tensors themselves would
        # re-enter __add__ and recurse forever.
        _result = tensor1.values + tensor2.values

        def grad_func_tensor1(grad):
            return grad

        def grad_func_tensor2(grad):
            return grad

        return self.build_binary_ops_result_tensor(tensor1, tensor2, grad_func_tensor1, grad_func_tensor2, _result)

    def __sub__(self, other):
        """Return ``self - other``."""
        return self._sub(self, as_tensor(other))

    def __rsub__(self, other):
        """Return ``other - self``."""
        return self._sub(as_tensor(other), self)

    def __isub__(self, other):
        """Perform ``self -= other``.

        In-place subtraction does not create a new graph node, so no gradient
        information is recorded for it.
        """
        self.values = self.values - as_tensor(other).values
        return self

    def _sub(self, tensor1, tensor2):
        """Compute ``c = a - b``.

        Gradients: ``Dc/Da = 1`` and ``Dc/Db = -1``.

        Raises:
            RuntimeError: If the operand shapes differ (no broadcasting).
        """
        if tensor1.shape != tensor2.shape:
            raise RuntimeError("Sub expects each tensor to be equal size, but got {} at entry 0 and {} at entry 1".format(tensor1.shape, tensor2.shape))
        # Raw arrays, to avoid re-entering __sub__.
        _result = tensor1.values - tensor2.values

        def grad_func_tensor1(grad):
            return grad

        def grad_func_tensor2(grad):
            return -grad

        return self.build_binary_ops_result_tensor(tensor1, tensor2, grad_func_tensor1, grad_func_tensor2, _result)

    def __matmul__(self, other):
        """Return ``self @ other``."""
        return self._matmul(self, as_tensor(other))

    def __rmatmul__(self, other):
        """Return ``other @ self``."""
        return self._matmul(as_tensor(other), self)

    def __imatmul__(self, other):
        """Perform ``self @= other``.

        Like the other in-place operators, this does not create a new graph
        node and records no gradient information.
        """
        self.values = self.values @ as_tensor(other).values
        return self

    def _matmul(self, tensor1, tensor2):
        """Compute ``c = a @ b`` for 2-D matrices.

        Gradients: ``Dc/Da = grad @ b.T`` and ``Dc/Db = a.T @ grad``.

        Raises:
            RuntimeError: If either operand is not 2-D or the inner
                dimensions do not match.
        """
        shape1, shape2 = tensor1.shape, tensor2.shape
        # The gradient formulas below are only valid for 2-D operands whose
        # inner dimensions agree: (m, k) @ (k, n).
        if len(shape1) != 2 or len(shape2) != 2 or shape1[1] != shape2[0]:
            raise RuntimeError("RuntimeError: size mismatch, m1: {}, m2: {}".format(tensor1.shape, tensor2.shape))

        # Raw arrays, to avoid re-entering __matmul__.
        _result = tensor1.values @ tensor2.values

        def grad_func_tensor1(grad):
            return grad @ tensor2.values.T

        def grad_func_tensor2(grad):
            return tensor1.values.T @ grad

        return self.build_binary_ops_result_tensor(tensor1, tensor2, grad_func_tensor1, grad_func_tensor2, _result)

    def __neg__(self):
        """Return ``-self``."""
        return self._neg(self)

    def _neg(self, tensor):
        """Compute ``c = -a``.

        Gradient: ``Dc/Da = -1``.
        """
        # Raw array, to avoid re-entering __neg__.
        _result = -tensor.values

        def grad_func(grad):
            return -grad

        return self.build_unary_ops_result_tensor(tensor, grad_func, _result)

    @staticmethod
    def build_binary_ops_result_tensor(tensor1, tensor2, grad_func_tensor1, grad_func_tensor2, values):
        """Build the result tensor of a binary operation.

        The result requires a gradient if either operand does. Only operands
        that require a gradient are recorded as dependencies.
        """
        requires_grad = tensor1.requires_grad or tensor2.requires_grad

        dependency = list()
        if tensor1.requires_grad:
            dependency.append({"tensor": tensor1, "grad_func": grad_func_tensor1})
        if tensor2.requires_grad:
            dependency.append({"tensor": tensor2, "grad_func": grad_func_tensor2})

        return MyTensor(values, requires_grad, dependency)

    @staticmethod
    def build_unary_ops_result_tensor(tensor, grad_func, values):
        """Build the result tensor of a unary operation.

        The result requires a gradient exactly when the operand does, in
        which case the operand is recorded as its only dependency.
        """
        requires_grad = tensor.requires_grad
        dependency = list()

        if tensor.requires_grad:
            dependency.append({"tensor": tensor, "grad_func": grad_func})

        return MyTensor(values, requires_grad, dependency)
        
        

        

