In [29]:
import numpy as np
def softmax(Z):
    """Softmax taken along axis 1 of ``Z``.

    The maximum along axis 1 is subtracted before exponentiating, so the
    largest exponent in every row is ``exp(0) = 1``. This leaves the result
    unchanged mathematically while avoiding overflow for large entries.

    Args:
        Z: array-like with at least two dimensions; axis 1 is normalised.

    Returns:
        Array with the same shape as ``Z`` whose entries sum to 1 along axis 1.
    """
    row_peak = np.max(Z, axis=1, keepdims=True)
    numerators = np.exp(Z - row_peak)
    denominators = np.sum(numerators, axis=1, keepdims=True)
    return numerators / denominators

def softmax_gradient(z,isF = False):
    """Jacobian ``diag(f) - f f^T`` of softmax for a single sample.

    Args:
        z: the softmax input, or the softmax output itself when ``isF`` is
            truthy. Must be a NumPy array (``.flatten()`` is called on it).
        isF: when truthy, ``z`` is used as-is instead of being passed through
            ``softmax``.

    Returns:
        A square matrix whose side is the total number of elements in ``z``.

    Note:
        ``np.outer`` and ``.flatten()`` both collapse the input to one long
        vector, so this is only a softmax Jacobian when ``z`` holds exactly
        one sample.
    """
    probs = z if isF else softmax(z)
    flat = probs.flatten()
    return np.diag(flat) - np.outer(probs, probs)

def softmax_backward(z,df,isF = False):
    """Multiply ``df`` by the softmax Jacobian of ``z``.

    Args:
        z: forwarded unchanged to ``softmax_gradient``.
        df: left operand of the matrix product with the Jacobian.
        isF: forwarded unchanged to ``softmax_gradient``.

    Returns:
        ``df @ softmax_gradient(z, isF)``.
    """
    return df @ softmax_gradient(z, isF)

# Single-sample check: one row of two scores.
x = np.array([[1, 2]])
print(softmax_gradient(x))
# 1-D vector that softmax_backward multiplies against the Jacobian of x.
df = np.array([1, 3])
print(softmax_backward(x,df))

[[ 0.19661193 -0.19661193]
 [-0.19661193  0.19661193]]
[-0.39322387  0.39322387]


In [31]:
def softmax_gradient(z,isF = False):
    """Softmax Jacobian(s) ``diag(f) - f f^T`` for one sample or a batch.

    Args:
        z: softmax input of shape (B, D), or the softmax output itself when
            ``isF`` is truthy (a 1-D vector is then accepted as one sample).
        isF: when truthy, ``z`` is taken to be the softmax output already and
            ``softmax`` is not applied.

    Returns:
        A (D, D) matrix when there is a single sample (1-D input or one row),
        otherwise a (B, D, D) array holding one Jacobian per row.
    """
    f = np.asarray(z) if isF else softmax(z)

    # Choose the branch from the input's own shape. The earlier test used
    # len(df), a name that is not a parameter of this function, so the result
    # depended on whatever global `df` happened to exist in the kernel.
    if f.ndim == 1 or f.shape[0] == 1:
        flat = f.flatten()
        return np.diag(flat) - np.outer(flat, flat)

    return np.array([np.diag(row.flatten()) - np.outer(row, row) for row in f])

# Two-row input: one Jacobian is expected per row.
x = np.array([[1, 2],[2, 5]])
print(softmax_gradient(x))

[[[ 0.19661193 -0.19661193]
  [-0.19661193  0.19661193]]

 [[ 0.04517666 -0.04517666]
  [-0.04517666  0.04517666]]]


In [35]:
def softmax_gradient(Z,isF = False):
    """Batched softmax Jacobians ``diag(F[b]) - F[b] F[b]^T``.

    Args:
        Z: (B, D) softmax input, or the (B, D) softmax output itself when
            ``isF`` is truthy.
        isF: when truthy, ``Z`` is used as the softmax output and ``softmax``
            is not applied.

    Returns:
        Array of shape (B, D, D); entry ``[b]`` is the Jacobian for row ``b``.
        An empty batch (B == 0) yields an empty (0, D, D) array.
    """
    F = np.asarray(Z) if isF else softmax(Z)
    n_classes = F.shape[1]
    # diag[b, j, k] = F[b, j] if j == k else 0. Built by broadcasting against
    # an identity matrix rather than a Python list of np.diag results, which
    # failed to broadcast against the einsum term when the batch was empty.
    diag = F[:, :, None] * np.eye(n_classes, dtype=F.dtype)
    outer = np.einsum('ij,ik->ijk', F, F)  # outer[b, j, k] = F[b, j] * F[b, k]
    return diag - outer

# `x` is not defined in this cell; it is expected to still be bound from an earlier cell.
print(softmax_gradient(x))

[[[ 0.19661193 -0.19661193]
  [-0.19661193  0.19661193]]

 [[ 0.04517666 -0.04517666]
  [-0.04517666  0.04517666]]]


In [37]:
def softmax_backward(Z,dF,isF = True):
    """Contract ``dF`` with the per-row Jacobians from ``softmax_gradient``.

    Args:
        Z: forwarded unchanged to ``softmax_gradient``.
        dF: array indexed ``[b, j]``, contracted over ``j`` with the Jacobians.
        isF: forwarded unchanged to ``softmax_gradient``. Note that the default
            here is ``True``.

    Returns:
        Array indexed ``[b, k]`` with ``sum_j dF[b, j] * J[b, j, k]``.
    """
    jacobians = softmax_gradient(Z, isF)
    # [B,D]*[B,D,D] -> [B,D]
    return np.einsum("bj, bjk -> bk", dF, jacobians)

df = np.array([[1, 3],[2, 4]])
# Exercise the softmax_backward defined in this cell. The previous call went to
# softmax_backward_2, which only exists if a later cell has already been run.
# `x` holds raw scores rather than softmax outputs, so isF=False is passed
# explicitly to have softmax applied first.
print(softmax_backward(x, df, isF=False))

[[-0.39322387  0.39322387]
 [-0.09035332  0.09035332]]


In [38]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Compare PyTorch's autograd result for softmax with the NumPy softmax_backward.
B,D= 1,3
a = torch.randn([B,D],requires_grad=True)
print("a",a)
# Name the softmax axis explicitly; calling F.softmax without `dim` relies on a
# deprecated implicit choice and emits a warning.
b = F.softmax(a, dim=1)
print("b",b)

# Random upstream gradient pushed back through the softmax.
db = torch.randn([B,D])
b.backward(db)
print("a.grad",a.grad)

# Convert both tensors to NumPy before calling the NumPy implementation.
# Previously the torch tensor `db` was passed and `db_` was left unused.
a_ = a.detach().numpy()
db_ = db.detach().numpy()
da = softmax_backward(a_,db_,False)
print("a.grad",da)


ModuleNotFoundError: No module named 'torch'

In [2]:


def softmax_gradient(z):
    """Per-row softmax Jacobians of ``z``.

    ``softmax`` is applied to ``z`` first; for each row ``p`` of the result the
    matrix ``diag(p) - p p^T`` is built.

    Args:
        z: input passed to ``softmax``.

    Returns:
        ``np.array`` stacking one Jacobian per row of ``softmax(z)``.
    """
    probs = softmax(z)
    per_row = [np.diag(p.flatten()) - np.outer(p, p) for p in probs]
    return np.array(per_row)

    
def softmax_backward(z,df):
    """Back-propagate ``df`` through a row-wise softmax of ``z``.

    Args:
        z: (B, D) softmax input.
        df: upstream gradient, either (B, D) or, for a single sample, a 1-D
            vector of length D.

    Returns:
        Array with the same shape as ``df``: row ``b`` is
        ``df[b] @ (diag(f[b]) - f[b] f[b]^T)`` with ``f = softmax(z)``.

    Raises:
        ValueError: if ``softmax(z)`` is not 2-D or ``df`` does not match its
            shape.
    """
    f = softmax(z)
    upstream = np.asarray(df)
    # Dispatch on dimensionality rather than len(df). A 1-D df has
    # len(df) == D, which used to fall into the per-row loop and fail on
    # `scalar @ matrix`.
    is_vector = upstream.ndim == 1
    upstream = np.atleast_2d(upstream)
    if f.ndim != 2 or upstream.shape != f.shape:
        raise ValueError(
            "df with shape %s does not match softmax output with shape %s"
            % (np.shape(df), f.shape)
        )

    rows = [g @ (np.diag(p) - np.outer(p, p)) for g, p in zip(upstream, f)]
    # The reshape keeps an empty batch as (0, D) instead of collapsing to (0,).
    grad = np.array(rows).reshape(upstream.shape)
    return grad[0] if is_vector else grad

# Two-row example run through softmax, its per-row Jacobians and the backward pass.
x = np.array([[1, 2],[2, 5]])
print(softmax(x))
print(softmax_gradient(x))
# One row of df per row of x.
df = np.array([[1, 3],[2, 4]])
print(softmax_backward(x,df))

[[0.26894142 0.73105858]
 [0.04742587 0.95257413]]
[[[ 0.19661193 -0.19661193]
  [-0.19661193  0.19661193]]

 [[ 0.04517666 -0.04517666]
  [-0.04517666  0.04517666]]]
[[-0.39322387  0.39322387]
 [-0.09035332  0.09035332]]


In [9]:
def out_product(F):
    """Negated outer product of every row of ``F`` with itself.

    Args:
        F: NumPy array; rows are taken along axis 0.

    Returns:
        ``np.array`` stacking ``-np.outer(F[r], F[r])`` for each row ``r``.
        Note the sign: the result is the negative of the outer product.
    """
    n_rows = F.shape[0]
    return np.array([-np.outer(F[r], F[r]) for r in range(n_rows)])
 
# Check that the explicit per-row loop and the einsum expression give the same array.
x = np.array([[1, 2],[2, 5]])
print(out_product(x))
# 'ij,ik->ijk' keeps the row index i and forms x[i, j] * x[i, k] for every j, k.
print( -np.einsum('ij,ik->ijk',x,x))

[[[ -1  -2]
  [ -2  -4]]

 [[ -4 -10]
  [-10 -25]]]
[[[ -1  -2]
  [ -2  -4]]

 [[ -4 -10]
  [-10 -25]]]


In [19]:
def softmax_backward_2(Z,dF,isF = True):
    """Back-propagate ``dF`` through a row-wise softmax, fully vectorised.

    Args:
        Z: (B, D) softmax output when ``isF`` is truthy (the default),
            otherwise the (B, D) softmax input.
        dF: (B, D) upstream gradient with respect to the softmax output.
        isF: when truthy, ``Z`` is used as the softmax output and ``softmax``
            is not applied.

    Returns:
        (B, D) array with ``sum_j dF[b, j] * (diag(F[b]) - F[b] F[b]^T)[j, k]``.
        An empty batch (B == 0) yields an empty (0, D) array.
    """
    F = np.asarray(Z) if isF else softmax(Z)
    n_classes = F.shape[1]
    # diag[b, j, k] = F[b, j] if j == k else 0. Broadcasting against an
    # identity matrix replaces the Python list of np.diag results, which could
    # not be subtracted from the einsum term when the batch was empty.
    diag = F[:, :, None] * np.eye(n_classes, dtype=F.dtype)
    jacobians = diag - np.einsum('ij,ik->ijk', F, F)
    return np.einsum("bj, bjk -> bk", dF, jacobians)  # [B,D]*[B,D,D] -> [B,D]

# `x` holds raw scores, not softmax outputs, so softmax must be applied first.
# With the isF=True default the scores themselves would be used as probabilities.
print(softmax_backward_2(x, df, isF=False))

[[-0.39322387  0.39322387]
 [-0.09035332  0.09035332]]


In [26]:
# Scratch check of NumPy's in-place addition. Note that this rebinds the name
# `x`, which other cells use for the softmax example input.
x = np.arange(6).reshape((2, 3))
y = np.arange(6).reshape((2, 3))
print("x:",x)
print("y:",y)
# `+=` on an ndarray stores the sum back into x's existing array; y is left unchanged.
x+=y
print("x:",x)

x: [[0 1 2]
 [3 4 5]]
y: [[0 1 2]
 [3 4 5]]
x: [[ 0  2  4]
 [ 6  8 10]]
