In [1]:
# Colab-only setup: mount Google Drive and copy the project folder into the runtime.
from google.colab import drive

# Mount Drive at /content/drive. force_remount=True is passed, presumably so an
# existing (possibly stale) mount is refreshed -- confirm against the Colab docs.
drive.mount('/content/drive', force_remount=True)

# Path of the project folder, relative to "My Drive".
FOLDERNAME = 'cs231n/Practice/'

assert FOLDERNAME is not None, "[!] Enter the foldername."

# The magics below are order-dependent: each one relies on the working directory
# left by the previous line.
# 1. Enter "My Drive" (the space in the path is backslash-escaped).
%cd /content/drive/My\ Drive
# 2. Copy the project folder two levels up (the recorded output shows this is /content).
%cp -r $FOLDERNAME ../../
# 3. Move up to that directory ...
%cd ../../
# 4. ... and into the copied folder, which becomes the working directory for the notebook.
%cd Practice


Mounted at /content/drive
/content/drive/My Drive
/content
/content/Practice


# Attention : step forward
Open the file `attention_layers.py`. This file implements the forward and backward passes for different types of layers that are commonly used in attention.

In [2]:

def rel_error(x, y):
    """Return the largest element-wise relative error between x and y.

    Each element's error is |x - y| / max(1e-8, |x| + |y|); the 1e-8 floor
    avoids division by zero when both entries are (near) zero.
    """
    abs_diff = np.abs(x - y)
    scale = np.maximum(1e-8, np.abs(x) + np.abs(y))
    return np.max(abs_diff / scale)

def softmax(a, axis = 0) :
    """Numerically stable softmax along `axis`.

    Inputs:
    - a : array-like of scores (logits)
    - axis : axis along which the outputs are normalised to sum to 1

    Returns an array of the same shape as `a`.
    """
    a = np.asarray(a)
    if a.size == 0:
        # Nothing to normalise; np.max would raise on an empty reduction.
        return np.exp(a)
    # Softmax is invariant to adding a constant along `axis`, so subtracting the
    # maximum leaves the result unchanged while keeping every exponent <= 0.
    # Without this shift, large logits overflow np.exp and produce inf / inf = nan.
    shifted = a - np.max(a, axis = axis, keepdims = True)
    exp_a = np.exp(shifted)
    return exp_a / np.sum(exp_a, axis = axis, keepdims = True)


def attention_step_forward(decoder_h, encoder_h, Wc, bc, Wy, by):
    """
    Forward pass of one dot-product attention step followed by an output softmax.

    N = batch size, T = number of encoder steps, D = hidden size.

    Inputs:
    - decoder_h : (N, D) decoder hidden state at time t
    - encoder_h : (N, T, D) encoder hidden states
    - Wc : (D, 2D)
    - bc : (D, )
    - Wy : (word_dim, D)
    - by : (word_dim, )
    Returns a tuple of:
    - score: softmax distribution over the vocabulary, of shape (N, word_dim)
    - cache: Tuple of values needed for the backward pass.
    """
    N, T, D = encoder_h.shape

    # Dot-product alignment score between the decoder state and every encoder state: (N, T).
    score_at = np.sum(encoder_h * decoder_h.reshape(N, 1, D), axis = 2)
    # Attention weights over the T encoder steps, kept as (N, T, 1) for broadcasting.
    prob_at = softmax(score_at, axis = 1).reshape(N, T, 1)
    # Attention-weighted sum of the encoder states: (N, D).
    attention_value = np.sum(prob_at * encoder_h, axis = 1)
    # Concatenate decoder state and attention value: (N, 2D).
    v_t = np.concatenate([decoder_h, attention_value], axis = 1)
    # Attentional hidden state: (N, D).
    s_t = np.tanh(v_t @ Wc.T + bc)
    # Output distribution over the vocabulary: (N, word_dim).
    score = softmax(s_t @ Wy.T + by, axis = 1)

    cache = (decoder_h, encoder_h, Wc, bc, Wy, by,
             score_at, prob_at, attention_value, v_t, s_t, score)

    return score, cache

def attention_step_backward(dloss, cache):
    """
    Backward pass for attention_step_forward.

    N = batch size, T = number of encoder steps, D = hidden size.

    Inputs:
    - dloss : (N, word_dim) upstream gradient with respect to the forward output `score`
    - cache : tuple produced by attention_step_forward
    Returns a tuple of:
    - ddh : (N, D) gradient with respect to decoder_h
    - deh : (N, T, D) gradient with respect to encoder_h
    - dWc : (D, 2D) gradient with respect to Wc
    - dbc : (D, ) gradient with respect to bc
    - dWy : (word_dim, D) gradient with respect to Wy
    - dby : (word_dim, ) gradient with respect to by
    """
    (decoder_h, encoder_h, Wc, bc, Wy, by,
     score_at, prob_at, attention_value, v_t, s_t, score) = cache

    N, T, D = encoder_h.shape

    # Softmax backward for: score = softmax(s_t @ Wy.T + by, axis=1)
    dscore = score * (dloss - np.sum(dloss * score, axis = 1, keepdims = True))

    # Affine backward for the output layer.
    dby = dscore.sum(axis = 0)
    dWy = dscore.T @ s_t
    ds_t = dscore @ Wy

    # tanh backward for: s_t = tanh(v_t @ Wc.T + bc)
    dpre_tanh = (1 - s_t**2) * ds_t
    dbc = dpre_tanh.sum(axis = 0)
    dWc = dpre_tanh.T @ v_t
    dv_t = dpre_tanh @ Wc

    # v_t = [decoder_h, attention_value]: split the gradient of the concatenation.
    ddh_direct = dv_t[:, :D]
    dav = dv_t[:, D:]

    # attention_value = sum_t prob_at * encoder_h
    prob = prob_at.reshape(N, T)  # cached as (N, T, 1) by the forward pass
    deh_context = dav[:, None, :] * prob[:, :, None]
    dprob = np.sum(dav[:, None, :] * encoder_h, axis = 2)

    # Softmax backward over the T encoder steps.
    dscore_at = prob * (dprob - np.sum(dprob * prob, axis = 1, keepdims = True))

    # score_at = sum_d encoder_h * decoder_h: both inputs receive a gradient.
    deh_score = dscore_at[:, :, None] * decoder_h[:, None, :]
    ddh_score = np.sum(dscore_at[:, :, None] * encoder_h, axis = 1)

    # decoder_h and encoder_h each feed two paths; sum their contributions.
    ddh = ddh_direct + ddh_score
    deh = deh_context + deh_score

    return ddh, deh, dWc, dbc, dWy, dby

In [9]:
from gradient_check import *
import numpy as np
import tensorflow as tf
# from attention_layers import attention_step_forward, attention_step_backward
# Problem sizes: batch size, encoder steps, hidden size, vocabulary size.
# These globals are reused by later cells.
N, T, D, word_dim = 3, 5, 6, 10

# Deterministic inputs built with linspace so the forward output can be
# compared against the hard-coded reference values below.
decoder_h = np.linspace(-0.5, 0.5, num=N*D).reshape(N, D)
encoder_h = np.linspace(-0.5, 0.5, num=N*T*D).reshape(N, T, D)
Wc = np.linspace(-0.1, 0.9, num=2*D**2).reshape(D, 2*D)
bc = np.linspace(-0.3, 0.6, num=D).reshape(D, )
Wy = np.linspace(-0.3, 0.7, num=word_dim * D).reshape(word_dim,  D)
by = np.linspace(-0.2, 0.4, num=word_dim).reshape(word_dim, )

# The first return value is the (N, word_dim) softmax output; the cache is not needed here.
next_h, _ = attention_step_forward(decoder_h, encoder_h, Wc, bc, Wy, by)

# Reference output, printed to 8 decimals (so agreement is limited to roughly 1e-7).
expected_next_h = np.array([[0.33540088, 0.22498186, 0.15091444, 0.10123113, 0.06790432,
        0.0455492 , 0.03055371, 0.02049497, 0.01374772, 0.00922176],
       [0.04405104, 0.05166217, 0.06058836, 0.0710568 , 0.08333399,
        0.09773243, 0.11461862, 0.13442241, 0.1576479 , 0.18488628],
       [0.00543037, 0.00870501, 0.01395434, 0.02236912, 0.03585822,
        0.05748156, 0.09214426, 0.14770937, 0.23678152, 0.37956623]])

print('next_h error: ', rel_error(expected_next_h, next_h))

next_h error:  3.0950742416212104e-07


In [10]:
# Display the forward output for a visual comparison with expected_next_h.
next_h

array([[0.33540088, 0.22498186, 0.15091444, 0.10123113, 0.06790432,
        0.0455492 , 0.03055371, 0.02049497, 0.01374772, 0.00922176],
       [0.04405104, 0.05166217, 0.06058836, 0.0710568 , 0.08333399,
        0.09773243, 0.11461862, 0.13442241, 0.1576479 , 0.18488628],
       [0.00543037, 0.00870501, 0.01395434, 0.02236912, 0.03585822,
        0.05748156, 0.09214426, 0.14770937, 0.23678152, 0.37956623]])

# Attention: step backward
Open the file `attention_layers.py`. This file implements the forward and backward passes for different types of layers that are commonly used in attention.

In [11]:
# Gradient check for attention_step_backward against numerical gradients.
# NOTE(review): no random seed is set, so the printed errors differ between runs.
decoder_h = np.random.randn(N, D)
encoder_h = np.random.randn(N, T, D)
Wc = np.random.randn(D, 2*D)
bc = np.random.randn(D, )
Wy = np.random.randn(word_dim, D)
by = np.random.randn(word_dim, )
# NOTE(review): sa is only referenced by the commented-out fsa / dsa_num lines below.
sa = np.random.randn(N, T)



# One closure per input. The lambda parameter shadows the same-named global, so
# only that argument is substituted while the others keep their global values.
fdh = lambda decoder_h: attention_step_forward(decoder_h, encoder_h, Wc, bc, Wy, by)[0]
feh = lambda encoder_h: attention_step_forward(decoder_h, encoder_h, Wc, bc, Wy, by)[0]
fWc = lambda Wc: attention_step_forward(decoder_h, encoder_h, Wc, bc, Wy, by)[0]
fbc = lambda bc: attention_step_forward(decoder_h, encoder_h, Wc, bc, Wy, by)[0]
fWy = lambda Wy: attention_step_forward(decoder_h, encoder_h, Wc, bc, Wy, by)[0]
fby = lambda by: attention_step_forward(decoder_h, encoder_h, Wc, bc, Wy, by)[0]

# fsa = lambda vt: attention_step_forward(decoder_h, encoder_h, Wc, bc, Wy, by, sa)[0]

# Analytic path: run the forward pass once and draw a random upstream gradient
# with the same shape as the output.
score, cache = attention_step_forward(decoder_h, encoder_h, Wc, bc, Wy, by)
dnext_h = np.random.randn(*score.shape)

# Numerical gradients. eval_numerical_gradient_array comes from gradient_check;
# presumably a finite-difference estimate weighted by dnext_h -- verify in that module.
ddh_num = eval_numerical_gradient_array(fdh, decoder_h, dnext_h)
deh_num = eval_numerical_gradient_array(feh, encoder_h, dnext_h)
dWc_num = eval_numerical_gradient_array(fWc, Wc, dnext_h)
dbc_num = eval_numerical_gradient_array(fbc, bc, dnext_h)
dWy_num = eval_numerical_gradient_array(fWy, Wy, dnext_h)
dby_num = eval_numerical_gradient_array(fby, by, dnext_h)
# dsa_num = eval_numerical_gradient_array(fsa, sa, dnext_h)


# Analytic gradients from the hand-written backward pass.
ddh, deh, dWc, dbc, dWy, dby = attention_step_backward(dnext_h, cache)

# Relative error between numerical and analytic gradients for each input.
print('dby error: ', rel_error(dby_num, dby))
print('dWy error: ', rel_error(dWy_num, dWy))
print('dbc error: ', rel_error(dbc_num, dbc))
print('dWc error: ', rel_error(dWc_num, dWc))
print('deh error: ', rel_error(deh_num, deh))
print('ddh error: ', rel_error(ddh_num, ddh))

dby error:  2.0356755939708203e-09
dWy error:  1.9415791826442853e-08
dbc error:  9.771022052567119e-11
dWc error:  5.715093390032916e-09
deh error:  5.044453092110602e-08
ddh error:  2.1306614439672336e-08


# Bahdanau Attention

In [12]:
def battention_step_forward(decoder_h, encoder_h, Wa, Wb, Wc, ba, bb, bc):
    """
    Forward pass of additive (Bahdanau-style) attention for one decoder step.

    N = batch size, T = number of encoder steps, D = hidden size, K = units.

    Inputs:
    - decoder_h : (N, D) decoder hidden state at time t
    - encoder_h : (N, T, D) encoder hidden states
    - Wa : (K, 1) projects the tanh features to a scalar score
    - Wb : (D, K) projects decoder_h
    - Wc : (D, K) projects encoder_h
    - ba : bias added to the scores; must broadcast against (N, T, 1)
    - bb : bias added to decoder_h @ Wb; must broadcast against (N, K)
    - bc : bias with K elements (reshaped to (1, 1, K) internally)
    Returns a tuple of:
    - context_vector: attention-weighted sum of encoder_h, of shape (N, D)
    - cache: Tuple of values needed for the backward pass.
    """
    N = encoder_h.shape[0]
    _, units = Wb.shape

    # tanh(decoder projection + encoder projection): (N, T, K).
    # The decoder term is reshaped to (N, 1, K) so it broadcasts over the T steps.
    score_tanh = np.tanh((decoder_h @ Wb + bb).reshape(N,1,units) + encoder_h @ Wc + bc.reshape(1,1,units))
    # Scalar alignment score per encoder step: (N, T, 1).
    score_at = score_tanh @ Wa + ba
    # Attention weights, normalised over the T axis: (N, T, 1).
    attention_value = softmax(score_at, axis = 1)
    # Weighted sum of the encoder states: (N, D).
    context_vector = np.sum(encoder_h * attention_value, axis = 1)

    # The biases are not cached: the backward pass does not need their values.
    cache = score_tanh, score_at, attention_value, decoder_h, encoder_h, Wa, Wb, Wc

    return context_vector, cache

def battention_step_backward(dloss, cache):
    """
    Backward pass for battention_step_forward.

    N = batch size, T = number of encoder steps, D = hidden size, K = units.

    Inputs:
    - dloss : (N, D) upstream gradient with respect to context_vector
    - cache : tuple produced by battention_step_forward
    Returns a tuple of:
    - dWa : (K, ) gradient with respect to Wa (flattened; Wa itself is (K, 1))
    - dWb : (D, K) gradient with respect to Wb
    - dWc : (D, K) gradient with respect to Wc
    - deh : (N, T, D) gradient with respect to encoder_h
    - ddh : (N, D) gradient with respect to decoder_h
    - dba : scalar gradient with respect to ba
    - dbb : (K, ) gradient with respect to bb
    - dbc : (K, ) gradient with respect to bc
    """
    score_tanh, score_at, attention_value, decoder_h, encoder_h, Wa, Wb, Wc = cache

    N, T, D = encoder_h.shape
    attn = attention_value.reshape(N, T)  # cached as (N, T, 1) by the forward pass

    # context_vector = sum_t attn * encoder_h
    deh_context = dloss.reshape(N, 1, D) * attn.reshape(N, T, 1)
    dattn = np.sum(encoder_h * dloss.reshape(N, 1, D), axis = 2)

    # Softmax backward over the T encoder steps.
    dat = attn * (dattn - np.sum(dattn * attn, axis = 1, keepdims = True))

    # score_at = score_tanh @ Wa + ba
    dWa = np.einsum('ntk,nt->k', score_tanh, dat)
    # Each row of dat sums to zero analytically (softmax is shift-invariant), so
    # dba is floating-point noise around 0; a relative-error check on it is not meaningful.
    dba = np.sum(dat)
    dscore_tanh = dat.reshape(N, T, 1) @ Wa.T

    # tanh backward.
    dpre_tanh = (1 - score_tanh**2) * dscore_tanh

    # The decoder projection was broadcast over T, so its gradient sums over T.
    ddh = np.einsum('ntk,dk->nd', dpre_tanh, Wb)
    deh = deh_context + np.einsum('ntk,dk->ntd', dpre_tanh, Wc)

    dWb = np.einsum('nd,ntk->dk', decoder_h, dpre_tanh)
    dWc = np.einsum('ntd,ntk->dk', encoder_h, dpre_tanh)
    # bb and bc are both added to the same pre-activation, so their gradients coincide.
    dbb = np.sum(dpre_tanh, axis = (0, 1))
    dbc = np.sum(dpre_tanh, axis = (0, 1))

    return dWa, dWb, dWc, deh, ddh, dba, dbb, dbc

In [19]:

# import 
class BahdanauAttention(tf.keras.Model):
  """Additive (Bahdanau-style) attention built from three Dense layers."""

  def __init__(self, units):
    super().__init__()
    # W1 projects the values, W2 projects the query, V maps the tanh features to one score.
    self.W1 = tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def call(self, values, query):
    """Return (context_vector, attention_weights); keys and values are the same tensor."""
    # query: (batch_size, hidden_size) -> (batch_size, 1, hidden_size), so that it
    # broadcasts over the time axis in the addition below.
    query_with_time_axis = tf.expand_dims(query, 1)

    # Tensor fed to V: (batch_size, max_length, units).
    projected = self.W1(values) + self.W2(query_with_time_axis)

    # V has a single output unit, so score is (batch_size, max_length, 1).
    score = self.V(tf.nn.tanh(projected))

    # Normalise over the time axis: (batch_size, max_length, 1).
    attention_weights = tf.nn.softmax(score, axis=1)

    # Weighted sum over time: (batch_size, hidden_size).
    context_vector = tf.reduce_sum(attention_weights * values, axis=1)

    return context_vector, attention_weights

In [20]:
# Width of the attention projection; later cells read this global.
units = 5
BahdanauAttention1 = BahdanauAttention(units)
# The seed is set after construction but before the first call.
# NOTE(review): presumably the Dense weights are created lazily on the first call,
# so this still precedes initialisation -- confirm.
tf.random.set_seed(1)
# First call on (values=encoder_h, query=decoder_h); the cell displays
# (context_vector, attention_weights).
BahdanauAttention1(encoder_h, decoder_h)

(<tf.Tensor: shape=(3, 6), dtype=float32, numpy=
 array([[ 0.09332979,  0.07215458,  0.09642259, -0.16156846,  0.7399138 ,
          0.01438008],
        [ 0.23674926,  0.36735293, -0.65598965,  0.46823952,  0.2048228 ,
         -0.05906077],
        [-0.54972124,  0.2614893 , -0.132893  , -0.5494666 ,  0.2640586 ,
          0.42872825]], dtype=float32)>,
 <tf.Tensor: shape=(3, 5, 1), dtype=float32, numpy=
 array([[[0.14924589],
         [0.18572585],
         [0.2236181 ],
         [0.24923243],
         [0.19217783]],
 
        [[0.16759665],
         [0.19448583],
         [0.24640141],
         [0.19926451],
         [0.19225161]],
 
        [[0.24428836],
         [0.21059395],
         [0.07079621],
         [0.25099948],
         [0.223322  ]]], dtype=float32)>)

In [21]:
    # - encoder_h : (N, T, D)
    # - decoder_h : (N, D) at time t
    # - Wa : (units, 1)  kernel of V
    # - Wb : (D, units)  kernel of W2 (applied to decoder_h)
    # - Wc : (D, units)  kernel of W1 (applied to encoder_h)
# Pull the Keras weights out as NumPy arrays. Mapping used here:
# weights[0:2] -> (Wc, bc), weights[2:4] -> (Wb, bb), weights[4:6] -> (Wa, ba).
# NOTE(review): this assumes .weights is ordered W1, W2, V (kernel then bias) -- confirm.
Wc, bc, Wb, bb, Wa, ba = BahdanauAttention1.weights[0].numpy(), BahdanauAttention1.weights[1].numpy(), BahdanauAttention1.weights[2].numpy(), BahdanauAttention1.weights[3].numpy(), BahdanauAttention1.weights[4].numpy(), BahdanauAttention1.weights[5].numpy()
# Promote to float64 so the finite-difference gradient check is not limited by float32 precision.
Wb, bb, Wc, bc, Wa, ba = np.array(Wb, dtype=np.float64), np.array(bb, dtype=np.float64), np.array(Wc, dtype=np.float64), np.array(bc, dtype=np.float64), np.array(Wa, dtype=np.float64), np.array(ba, dtype=np.float64)


# Make every bias a 2-D row vector.
ba, bb, bc = ba.reshape(1,-1), bb.reshape(1,-1), bc.reshape(1,-1)


# Fresh random inputs (no seed is set, so the errors below vary between runs).
decoder_h = np.random.randn(N, D)
# decoder_h = decoder_h.astype('float32')
encoder_h = np.random.randn(N, T, D)
# encoder_h = encoder_h.astype('float32')
# Wc = np.random.randn(D, units)
# Wb = np.random.randn(D, units)
# Wa = np.random.randn(units, 1)

# ba = np.random.randn(1, 1)
# bb = np.random.randn(1, units)
# bc = np.random.randn(1, units)
# One closure per input. The lambda parameter shadows the same-named global, so
# only that argument is substituted while the others keep their global values.
fdh = lambda decoder_h: battention_step_forward(decoder_h, encoder_h, Wa, Wb, Wc, ba, bb, bc)[0]
feh = lambda encoder_h: battention_step_forward(decoder_h, encoder_h, Wa, Wb, Wc, ba, bb, bc)[0]
fWc = lambda Wc: battention_step_forward(decoder_h, encoder_h, Wa, Wb, Wc, ba, bb, bc)[0]
fWb = lambda Wb: battention_step_forward(decoder_h, encoder_h, Wa, Wb, Wc, ba, bb, bc)[0]
fWa = lambda Wa: battention_step_forward(decoder_h, encoder_h, Wa, Wb, Wc, ba, bb, bc)[0]
fba = lambda ba: battention_step_forward(decoder_h, encoder_h, Wa, Wb, Wc, ba, bb, bc)[0]
fbb = lambda bb: battention_step_forward(decoder_h, encoder_h, Wa, Wb, Wc, ba, bb, bc)[0]
fbc = lambda bc: battention_step_forward(decoder_h, encoder_h, Wa, Wb, Wc, ba, bb, bc)[0]

# Here `score` holds the context vector returned by the forward pass; `cache` is
# reused by a later cell as well.
score, cache = battention_step_forward(decoder_h, encoder_h, Wa, Wb, Wc, ba, bb, bc)


# Random upstream gradient with the same shape as the forward output.
dnext_h = np.random.randn(*score.shape)
ddh_num = eval_numerical_gradient_array(fdh, decoder_h, dnext_h)
deh_num = eval_numerical_gradient_array(feh, encoder_h, dnext_h)
dWc_num = eval_numerical_gradient_array(fWc, Wc, dnext_h)
dWb_num = eval_numerical_gradient_array(fWb, Wb, dnext_h)
dWa_num = eval_numerical_gradient_array(fWa, Wa, dnext_h)
dba_num = eval_numerical_gradient_array(fba, ba, dnext_h)
dbb_num = eval_numerical_gradient_array(fbb, bb, dnext_h)
dbc_num = eval_numerical_gradient_array(fbc, bc, dnext_h)


dWa, dWb, dWc, deh, ddh, dba, dbb, dbc = battention_step_backward(dnext_h, cache)

print('ddh error: ', rel_error(ddh_num, ddh))
print('deh error: ', rel_error(deh_num, deh))
print('dWc error: ', rel_error(dWc_num, dWc))
print('dWb error: ', rel_error(dWb_num, dWb))
# The analytic dWa is 1-D; the numerical one is transposed before the comparison
# (presumably it has Wa's 2-D shape, so the transpose makes the two broadcast element-wise).
print('dWa error: ', rel_error(dWa_num.T, dWa))
# ba shifts every score by the same amount and softmax is shift-invariant, so the
# true gradient is zero; this line compares floating-point noise and its relative
# error is not meaningful.
print('dba error: ', rel_error(dba_num.T, dba))
print('dbb error: ', rel_error(dbb_num, dbb))
print('dbc error: ', rel_error(dbc_num, dbc))

float64
ddh error:  6.170281951912502e-10
deh error:  1.517665979660445e-09
dWc error:  6.699581029198585e-09
dWb error:  2.5840297177755672e-09
dWa error:  5.97365757206702e-11
dba error:  0.0016299368384956462
dbb error:  2.5028512899650587e-09
dbc error:  2.5028512899650587e-09


In [22]:
# Reference gradients from TensorFlow autodiff, for comparison with the NumPy backward pass.
BahdanauAttention1(encoder_h, decoder_h)
loss_function = tf.keras.losses.MeanSquaredError()

# Record the forward pass so the loss can be differentiated with respect to the model weights.
with tf.GradientTape() as tape:
  predictions, _ = BahdanauAttention1(tf.convert_to_tensor(encoder_h), tf.convert_to_tensor(decoder_h))
  # Mean squared error of the context vectors against an all-zero target of shape (N, D).
  loss = loss_function(np.zeros((N,D)),predictions)

# One gradient tensor per entry of BahdanauAttention1.weights, in the same order.
gradients = tape.gradient(loss, BahdanauAttention1.weights)

In [23]:
# Upstream gradient of the MSE loss with respect to the predictions. The target
# is all zeros and the loss averages over all N * D entries, so
# dloss/dpred = 2 * pred / (N * D). With N=3, D=6 this is the same 18 as before.
dnext_h = (2 * predictions).numpy() / (N * D)

# Numerical gradients of the NumPy forward pass under this upstream gradient.
ddh_num = eval_numerical_gradient_array(fdh, decoder_h, dnext_h)
deh_num = eval_numerical_gradient_array(feh, encoder_h, dnext_h)
dWc_num = eval_numerical_gradient_array(fWc, Wc, dnext_h)
dWb_num = eval_numerical_gradient_array(fWb, Wb, dnext_h)
dWa_num = eval_numerical_gradient_array(fWa, Wa, dnext_h)
dba_num = eval_numerical_gradient_array(fba, ba, dnext_h)
dbb_num = eval_numerical_gradient_array(fbb, bb, dnext_h)
dbc_num = eval_numerical_gradient_array(fbc, bc, dnext_h)


# `cache` comes from the earlier battention_step_forward call; it is still valid
# only while the inputs and weights have not been changed since that call.
dWa, dWb, dWc, deh, ddh, dba, dbb, dbc = battention_step_backward(dnext_h, cache)

print('ddh error: ', rel_error(ddh_num, ddh))
print('deh error: ', rel_error(deh_num, deh))
print('dWc error: ', rel_error(dWc_num, dWc))
print('dWb error: ', rel_error(dWb_num, dWb))
print('dWa error: ', rel_error(dWa_num.T, dWa))
# The true gradient with respect to ba is zero (softmax is shift-invariant), so
# this relative error compares floating-point noise.
print('dba error: ', rel_error(dba_num.T, dba))
print('dbb error: ', rel_error(dbb_num, dbb))
print('dbc error: ', rel_error(dbc_num, dbc))

float64
ddh error:  5.037884062004215e-09
deh error:  3.0489782942665384e-09
dWc error:  4.227825632401229e-08
dWb error:  1.5412739186585457e-08
dWa error:  2.0453212351221984e-10
dba error:  0.0005603483531667171
dbb error:  1.4289432147256335e-09
dbc error:  1.4289432147256335e-09


In [24]:
# TensorFlow autodiff gradients, in BahdanauAttention1.weights order.
print(gradients)

# Hand-derived NumPy gradients, one per line, for a side-by-side comparison.
print(dWc, dbc, dWb, dbb, dWa, dba, sep="\n")

[<tf.Tensor: shape=(6, 5), dtype=float32, numpy=
array([[-6.04876913e-02,  1.64003279e-02, -5.66726725e-04,
         5.71519835e-03, -1.12971077e-02],
       [-8.80641341e-02,  6.04876038e-03,  1.56979228e-03,
        -2.63011153e-03, -4.86020072e-05],
       [-2.70124935e-02,  1.00945309e-02, -2.99077859e-04,
         3.80559242e-03, -3.15385964e-03],
       [-5.29027060e-02, -2.64720041e-02, -4.42029850e-05,
        -3.15383309e-03,  1.98871060e-03],
       [ 5.63869886e-02,  2.80794278e-02,  4.37938422e-03,
        -7.49630621e-03,  1.28993765e-02],
       [ 3.55078913e-02,  1.82834901e-02,  4.76088887e-03,
        -1.06200045e-02,  1.54926116e-02]], dtype=float32)>, <tf.Tensor: shape=(5,), dtype=float32, numpy=
array([-0.04020319,  0.01320723,  0.00204353, -0.00243291,  0.00359423],
      dtype=float32)>, <tf.Tensor: shape=(6, 5), dtype=float32, numpy=
array([[ 5.7762682e-02, -1.1678785e-02, -2.8254007e-04, -6.3083065e-03,
         9.9645685e-03],
       [-2.4183257e-02, -4.0865964