# In [1]:
import numpy as np

def softmax(x):
    """
    Compute the softmax of x along axis 0.

    Arguments:
    x -- numpy array (or array-like) of scores. For a 2-D input each column
         is normalized independently of the others.

    Returns:
    numpy array with the same shape as x whose entries are non-negative and
    sum to 1 along axis 0.
    """
    x = np.asarray(x)
    # Subtract the maximum before exponentiating: every exponent is then <= 0,
    # so np.exp cannot overflow and its values stay in the range (0, 1].
    # The maximum is taken along axis 0 (not over the whole array) so that each
    # column keeps at least one term equal to exp(0) == 1; otherwise a column
    # far below the global maximum would underflow to all zeros and yield 0/0.
    shifted = x - np.max(x, axis=0, keepdims=True)
    e_x = np.exp(shifted)
    return e_x / e_x.sum(axis=0, keepdims=True)


def sigmoid(x):
    """
    Compute the logistic sigmoid 1 / (1 + exp(-x)), element-wise.

    Arguments:
    x -- scalar or numpy array.

    Returns:
    The sigmoid of x, with the same shape as x.
    """
    decay = np.exp(-x)
    denominator = 1 + decay
    return 1 / denominator


def initialize_adam(parameters):
    """
    Initializes v and s as two python dictionaries with:
                - keys: "dW1", "db1", ..., "dWL", "dbL"
                - values: numpy arrays of zeros of the same shape as the corresponding gradients/parameters.

    Arguments:
    parameters -- python dictionary containing your parameters.
                    parameters["W" + str(l)] = Wl
                    parameters["b" + str(l)] = bl

    Returns:
    v -- python dictionary that will contain the exponentially weighted average of the gradient.
                    v["dW" + str(l)] = zeros with the shape of parameters["W" + str(l)]
                    v["db" + str(l)] = zeros with the shape of parameters["b" + str(l)]
    s -- python dictionary that will contain the exponentially weighted average of the squared gradient.
                    s["dW" + str(l)] = zeros with the shape of parameters["W" + str(l)]
                    s["db" + str(l)] = zeros with the shape of parameters["b" + str(l)]

    """

    # Each layer contributes one "W" and one "b" entry.
    num_layers = len(parameters) // 2
    v = {}
    s = {}
    # Layers are numbered from 1, so the moment keys must use the same 1-based
    # index as the parameter keys ("W1" -> "dW1"), as the docstring states.
    for layer in range(1, num_layers + 1):
        idx = str(layer)
        w_shape = parameters["W" + idx].shape
        b_shape = parameters["b" + idx].shape
        v["dW" + idx] = np.zeros(w_shape)
        v["db" + idx] = np.zeros(b_shape)
        s["dW" + idx] = np.zeros(w_shape)
        s["db" + idx] = np.zeros(b_shape)

    return v, s

    