# Imports

In [1]:
import numpy as np
from numpy.linalg import det
import torch

# Gradient Comparisons

# Goal: the gradient with respect to $v_c$ of $-\log{\frac{\exp{u_o^T v_c}}{\sum\limits_w \exp{u_w^T v_c}}}$

In [2]:
# Toy problem: the rows of uw are the vectors u_w, row 1 plays the role of
# u_o, and vc is the vector v_c. The seed makes the draws repeatable.
torch.manual_seed(5)
uw=torch.randn(4,5, requires_grad=True)
uo=uw[1]
vc=torch.randn(5, requires_grad=True)

print(f'uw: {uw}')
print(f'uo: {uo}')
print(f'vc: {vc}')

# The loss is written term-for-term like the formula in the markdown cell;
# autograd then provides the reference gradient with respect to vc.
loss=-torch.log(torch.exp(uo@vc)/torch.sum(torch.exp(uw@vc)))
loss.backward()
print()
print(f'torch gradient: {vc.grad}')

# Keep the reference gradient as an array: the names uw/uo/vc are rebound to
# NumPy arrays just below, after which vc.grad is no longer reachable.
torch_grad=vc.grad.detach().numpy()

uw=uw.detach().numpy()
uo=uo.detach().numpy()
vc=vc.detach().numpy()

# Hand-derived gradient, built from named pieces:
#   -(u_o * sum_w e_w  -  sum_w u_w * e_w) / sum_w e_w,   e_w = exp(u_w . v_c)
e_wv=np.exp(uw@vc)

left=uo*np.sum(e_wv)
right=np.sum(uw*e_wv[:, None] ,axis=0)
bottom=np.sum(e_wv)

grad1=-(left-right)/bottom

# The same expression written in one line; the exponentials are evaluated once
# and reused instead of being recomputed for every occurrence.
e_vw=np.exp(vc@uw.T)
grad2=-((uo*np.sum(e_vw)-np.sum(uw*e_vw[:, None] ,axis=0))/np.sum(e_vw))

print(f'numpy gradient 1: {grad1}')
print(f'numpy gradient 2: {grad2}')

# Fail loudly if either hand-derived gradient drifts away from autograd
# (tolerances sized for float32 arithmetic).
np.testing.assert_allclose(grad1, torch_grad, rtol=1e-4, atol=1e-6)
np.testing.assert_allclose(grad2, torch_grad, rtol=1e-4, atol=1e-6)

uw: tensor([[ 1.8423,  0.5189, -1.7119, -1.7014, -0.1297],
        [-0.6018,  0.1450, -0.1498,  2.6146, -0.4340],
        [ 0.3523, -0.0646,  1.4829,  0.4940,  0.2492],
        [ 1.7470,  0.7448,  0.0317, -1.1724, -1.5069]], requires_grad=True)
uo: tensor([-0.6018,  0.1450, -0.1498,  2.6146, -0.4340],
       grad_fn=<SelectBackward0>)
vc: tensor([ 0.0571, -1.1894, -0.5659, -0.8327,  0.9014], requires_grad=True)

torch gradient: tensor([ 2.3247,  0.3454, -1.2724, -4.1145,  0.2599])
numpy gradient 1: [ 2.3246808   0.34541243 -1.2724466  -4.1145277   0.25991312]
numpy gradient 2: [ 2.3246808   0.34541243 -1.2724466  -4.1145277   0.25991312]


# Checking Basic Gradients

In [6]:
# Seed torch's RNG so the random vectors below are repeatable.
torch.manual_seed(5)

# Draw order is kept (uw first, then vc) so the seeded values do not change.
uw = torch.randn(4, 5, requires_grad=True)
vc = torch.randn(5, requires_grad=True)

# Row of uw that is used as u_o.
uo_idx = 1
uo = uw[uo_idx]

def softmax_np(w, v):
    """Return the softmax of the scores ``w @ v`` using NumPy.

    Parameters
    ----------
    w : array-like operand on the left of the ``@`` product.
    v : array-like operand on the right of the ``@`` product.

    Returns
    -------
    ``exp(w @ v) / sum(exp(w @ v))``, normalised over every score.

    Notes
    -----
    The largest score is subtracted before exponentiating. Softmax is
    unchanged by a constant shift of its inputs, so the result is the same,
    but ``np.exp`` can no longer overflow to ``inf`` and produce ``nan``.
    """
    scores = w @ v
    if np.size(scores) == 0:
        # Nothing to normalise, and an empty array has no maximum to subtract.
        return np.exp(scores)
    exp_shifted = np.exp(scores - np.max(scores))  # computed once, reused below
    return exp_shifted / np.sum(exp_shifted)

def softmax_torch(w, v):
    """Return the softmax of the scores ``w @ v`` using PyTorch.

    Parameters
    ----------
    w : tensor on the left of the ``@`` product.
    v : tensor on the right of the ``@`` product.

    Returns
    -------
    ``exp(w @ v) / sum(exp(w @ v))``, normalised over every score; the
    result stays attached to the autograd graph of ``w`` and ``v``.

    Notes
    -----
    The largest score is subtracted before exponentiating so ``torch.exp``
    cannot overflow to ``inf`` and produce ``nan``. Softmax is unchanged by a
    constant shift, so the shift is detached: it is treated as a constant and
    the gradients are those of the plain formula.
    """
    scores = w @ v
    if scores.numel() == 0:
        # Nothing to normalise, and an empty tensor has no maximum to subtract.
        return torch.exp(scores)
    exp_shifted = torch.exp(scores - scores.max().detach())
    return exp_shifted / torch.sum(exp_shifted)

# Loss: negative log of the softmax probability at position uo_idx.
p_o=softmax_torch(uw, vc)[uo_idx]
g=-torch.log(p_o)
print('res: ', g)

# Backpropagate and show the autograd gradient with respect to vc.
g.backward()
print('grad: ', vc.grad)

# Rebind both names to NumPy arrays detached from the autograd graph.
uw=uw.detach().numpy()
vc=vc.detach().numpy()

# NOTE(review): a print of a hand-derived `guess` was left commented out here;
# no such value is computed in this cell.

print()


res:  tensor(4.5714, grad_fn=<NegBackward0>)
grad:  tensor([ 2.3247,  0.3454, -1.2724, -4.1145,  0.2599])



# Naive Softmax Loss
$P(O=o|C=c)=\frac{\exp(u_o^T v_c)}{\sum\limits_{w \in Vocab}\exp(u_w^T v_c)}$  
$J_{\text{naive-softmax}}(v_c, o, U)=-\log P(O=o|C=c)$

# Cross-Entropy
$-\sum\limits_{w \in Vocab}y_w\log(\hat{y}_w)=-\log(\hat{y}_o)$

# Prove that the naive-softmax loss is the same as cross entropy loss between **$y$** and **$\hat{y}$** 

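Here $y$ is the one-hot vector for the true outside word $o$: $y_w = 1$ when $w = o$ and $y_w = 0$ for every other word. Every term of the sum on the left-hand side therefore vanishes except the one for $w = o$, which leaves $-\log(\hat{y}_o)$. Since $\hat{y}_o = P(O=o|C=c)$, this is exactly the naive-softmax loss $-\log P(O=o|C=c)$.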

In [34]:
def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``, evaluated with NumPy."""
    return 1/(1+np.exp(-x))

# Seed NumPy's global RNG so the vectors, and therefore the printed
# gradients, are the same on every run of this cell.
np.random.seed(5)
vc=np.random.randn(3)
uw=np.random.randn(7,3)
# Rows of uw used as negative samples; index 0 is listed twice, so that row
# contributes two terms to the loss.
sample_idx=[0, 0, 3, 2, 5]
neg_samples=uw[sample_idx]
print(sample_idx)
# Row of uw used as the positive (true outside) vector.
idx=1

# Torch copies of the same numbers so autograd can give a reference gradient.
t_vc=torch.tensor(vc, requires_grad=True)
t_uw=torch.tensor(uw, requires_grad=True)
t_neg_samples=torch.tensor(neg_samples, requires_grad=True)


# Negative-sampling loss: -log(sigmoid(u_o . v_c)) - sum_k log(sigmoid(-u_k . v_c))
t_ans=-torch.log(torch.sigmoid(t_uw[idx]@t_vc))-torch.sum(torch.log(torch.sigmoid(-t_neg_samples@t_vc)))
t_ans.backward()
print(t_vc.grad)

# Hand-derived gradient with respect to v_c:
#   (sigmoid(u_o . v_c) - 1) * u_o  -  sum_k (sigmoid(-u_k . v_c) - 1) * u_k
guess=(
    (sigmoid(uw[idx]@vc)-1)*uw[idx]+
    np.sum(-neg_samples*(sigmoid(-neg_samples@vc)-1)[:, None], axis=0)
)
print(guess)

# Fail loudly if the hand-derived gradient disagrees with autograd.
np.testing.assert_allclose(guess, t_vc.grad.numpy(), rtol=1e-6, atol=1e-8)

[0, 0, 3, 2, 5]
tensor([-3.5009,  0.3441, -1.2571], dtype=torch.float64)
[-3.50085339  0.34406705 -1.25712608]
